// xref: /aosp_15_r20/external/libdav1d/src/arm/64/ipred16.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
// Fills a width x height block with the mid-grey DC value,
// 1 << (bitdepth - 1), computed here as (bitdepth_max + 1) >> 1.
function ipred_dc_128_16bpc_neon, export=1
        ldr             w8,  [sp]                // bitdepth_max: 9th argument, passed on the stack
        clz             w3,  w3
        movrel          x5,  ipred_dc_128_tbl
        sub             w3,  w3,  #25            // clz(width) - 25: width 64..4 -> table index 0..4
        ldrsw           x3,  [x5, w3, uxtw #2]   // load signed 32-bit table entry (offset from table base)
        dup             v0.8h,   w8
        add             x5,  x5,  x3
        add             x6,  x0,  x1             // x6 = second output row pointer
        lsl             x1,  x1,  #1             // x0/x6 each advance two rows per store
        urshr           v0.8h,   v0.8h,  #1      // (bitdepth_max + 1) >> 1 == 1 << (bitdepth - 1)
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
4:      // width == 4: 4 rows per iteration
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:      // width == 8
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b          // duplicate DC value to cover 16 pixels per row
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b          // v0..v3 all hold the DC value: 32 pixels per row
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        sub             x1,  x1,  #64            // compensate for the post-incremented 64-byte store per row
64:     // width == 64: each row is two 64-byte stores
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret
endfunc
111
// Relative jump table for ipred_dc_128_16bpc_neon, indexed by
// clz(width) - 25: entry 0 is width 64, entry 4 is width 4.
jumptable ipred_dc_128_tbl
        .word 640b - ipred_dc_128_tbl
        .word 320b - ipred_dc_128_tbl
        .word 160b - ipred_dc_128_tbl
        .word 80b  - ipred_dc_128_tbl
        .word 40b  - ipred_dc_128_tbl
endjumptable
119
// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Vertical prediction: copy the row of top-edge pixels to every output row.
function ipred_v_16bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_v_tbl
        sub             w3,  w3,  #25            // clz(width) - 25: width 64..4 -> table index 0..4
        ldrsw           x3,  [x5, w3, uxtw #2]
        add             x2,  x2,  #2             // skip the top-left pixel (2 bytes per pixel)
        add             x5,  x5,  x3
        add             x6,  x0,  x1             // x6 = second output row pointer
        lsl             x1,  x1,  #1             // x0/x6 each advance two rows per store
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]           // the 4 top-edge pixels
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]           // the 8 top-edge pixels
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]     // the 16 top-edge pixels
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]  // the 32 top-edge pixels
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64  // 64 top-edge pixels in v0-v7
        sub             x1,  x1,  #64            // compensate for the post-incremented 64-byte store per row
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
64:     // each row is two 64-byte stores
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt            64b
        ret
endfunc
196
// Relative jump table for ipred_v_16bpc_neon, indexed by
// clz(width) - 25: entry 0 is width 64, entry 4 is width 4.
jumptable ipred_v_tbl
        .word 640b - ipred_v_tbl
        .word 320b - ipred_v_tbl
        .word 160b - ipred_v_tbl
        .word 80b  - ipred_v_tbl
        .word 40b  - ipred_v_tbl
endjumptable
204
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Horizontal prediction: each output row is filled with that row's
// left-edge pixel. The left edge is read 4 pixels at a time with ld4r,
// which broadcasts 4 consecutive pixels into v0..v3; since left-edge
// pixels sit below the topleft pointer in memory, v3 is the topmost of
// the 4 rows and x7 = -8 walks the pointer down the edge.
function ipred_h_16bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_h_tbl
        sub             w3,  w3,  #25            // clz(width) - 25: width 64..4 -> table index 0..4
        ldrsw           x3,  [x5, w3, uxtw #2]
        sub             x2,  x2,  #8             // x2 = &topleft[-4]: 4 nearest left-edge pixels
        add             x5,  x5,  x3
        mov             x7,  #-8                 // step 4 pixels (8 bytes) down the edge per iteration
        add             x6,  x0,  x1             // x6 = second output row pointer
        lsl             x1,  x1,  #1             // x0/x6 each advance two rows per store
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // v3..v0 = next 4 rows' left pixels
        st1             {v3.4h},  [x0], x1
        st1             {v2.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:     // width == 16: str fills bytes 16..31, st1 fills 0..15 and advances
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:     // width == 32: str/stp fill bytes 16..63, st1 fills 0..15 and advances
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:     // width == 64: str/stp fill bytes 16..127, st1 fills 0..15 and advances
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        stp             q3,  q3,  [x0, #64]
        stp             q2,  q2,  [x6, #64]
        stp             q3,  q3,  [x0, #96]
        stp             q2,  q2,  [x6, #96]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        stp             q1,  q1,  [x0, #64]
        stp             q0,  q0,  [x6, #64]
        stp             q1,  q1,  [x0, #96]
        stp             q0,  q0,  [x6, #96]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            64b
        ret
endfunc
304
// Relative jump table for ipred_h_16bpc_neon, indexed by
// clz(width) - 25: entry 0 is width 64, entry 4 is width 4.
jumptable ipred_h_tbl
        .word 640b - ipred_h_tbl
        .word 320b - ipred_h_tbl
        .word 160b - ipred_h_tbl
        .word 80b  - ipred_h_tbl
        .word 40b  - ipred_h_tbl
endjumptable
312
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
// DC prediction from the top edge only: fills the block with the
// rounded average of the `width` top-edge pixels.
function ipred_dc_top_16bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_dc_top_tbl
        sub             w3,  w3,  #25            // clz(width) - 25: width 64..4 -> table index 0..4
        ldrsw           x3,  [x5, w3, uxtw #2]
        add             x2,  x2,  #2             // skip the top-left pixel (2 bytes per pixel)
        add             x5,  x5,  x3
        add             x6,  x0,  x1             // x6 = second output row pointer
        lsl             x1,  x1,  #1             // x0/x6 each advance two rows per store
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h           // sum of 4 top pixels (fits in 16 bits)
        urshr           v0.4h,   v0.4h,   #2     // rounded average: (sum + 2) >> 2
        dup             v0.4h,   v0.h[0]
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h           // sum of 8 top pixels (fits in 16 bits)
        urshr           v0.4h,   v0.4h,   #3     // rounded average: (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h  // pairwise-reduce 16 pixels to 8 partial sums
        addv            h0,      v0.8h           // sum of 16 top pixels (fits in 16 bits)
        urshr           v2.4h,   v0.4h,   #4     // rounded average: (sum + 8) >> 4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h  // reduce 32 pixels to 8 partial sums
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h           // widen to 32 bits: sum can exceed 16 bits here
        rshrn           v4.4h,   v0.4s,   #5     // rounded average: (sum + 16) >> 5
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h  // reduce 64 pixels to 8 partial sums
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h           // widen to 32 bits for the final sum
        rshrn           v4.4h,   v0.4s,   #6     // rounded average: (sum + 32) >> 6
        sub             x1,  x1,  #64            // compensate for the post-incremented 64-byte store per row
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
64:     // each row is two 64-byte stores
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret
endfunc
422
// Relative jump table for ipred_dc_top_16bpc_neon, indexed by
// clz(width) - 25: entry 0 is width 64, entry 4 is width 4.
jumptable ipred_dc_top_tbl
        .word 640b - ipred_dc_top_tbl
        .word 320b - ipred_dc_top_tbl
        .word 160b - ipred_dc_top_tbl
        .word 80b  - ipred_dc_top_tbl
        .word 40b  - ipred_dc_top_tbl
endjumptable
430
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
// DC prediction from the left edge only. Uses a double dispatch:
// x5 jumps to the hN routine that averages `height` left-edge pixels,
// which then jumps via x3 to the wN routine that broadcasts the value
// across a width-wide block.
function ipred_dc_left_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1    // x2 = &topleft[-height]: start of the left edge
        clz             w3,  w3
        clz             w7,  w4
        movrel          x5,  ipred_dc_left_tbl
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25            // height 64..4 -> table index 0..4 (hN entries)
        ldrsw           x3,  [x5, w3, uxtw #2]   // x3 = offset of the width (store) routine
        ldrsw           x7,  [x5, w7, uxtw #2]   // x7 = offset of the height (sum) routine
        add             x3,  x5,  x3
        add             x5,  x5,  x7
        add             x6,  x0,  x1             // x6 = second output row pointer
        lsl             x1,  x1,  #1             // x0/x6 each advance two rows per store
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h           // sum of 4 left pixels
        urshr           v0.4h,   v0.4h,   #2     // rounded average: (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        br              x3                       // continue at the width store routine
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
1:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h           // sum of 8 left pixels
        urshr           v0.4h,   v0.4h,   #3     // rounded average: (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
1:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h  // pairwise-reduce 16 pixels to 8 partial sums
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4     // rounded average: (sum + 8) >> 4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h  // reduce 32 pixels to 8 partial sums
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlp          v0.4s,   v0.8h           // widen to 32 bits: sum can exceed 16 bits
        addv            s0,      v0.4s
        rshrn           v4.4h,   v0.4s,   #5     // rounded average: (sum + 16) >> 5
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h  // reduce 64 pixels to 8 partial sums
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h           // widen to 32 bits for the final sum
        rshrn           v4.4h,   v0.4s,   #6     // rounded average: (sum + 32) >> 6
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        sub             x1,  x1,  #64            // compensate for the post-incremented 64-byte store per row
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret
endfunc
566
// Relative jump table for ipred_dc_left_16bpc_neon. Entries 0-4 are the
// height (sum) routines indexed by clz(height) - 25; entries 5-9 are the
// width (store) routines indexed by clz(width) - 20 (i.e. the same
// mapping offset by 5 words).
jumptable ipred_dc_left_tbl
        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
endjumptable
579
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
// Full DC prediction: dc = (sum(top) + sum(left) + ((w+h)>>1)) / (w+h).
// Like ipred_dc_left, it double-dispatches: x5 jumps to the hN routine
// that sums the left edge, which jumps via x3 to the wN routine that
// sums the top edge, finishes the division and stores the block.
// w+h is always 2^k, 3*2^k or 5*2^k; the power-of-two part is divided
// out with ushl by -ctz(w+h), and the residual /3 or /5 is done with a
// fixed-point multiply: x*0xAAAB >> 17 ~= x/3, x*0x6667 >> 17 ~= x/5.
function ipred_dc_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1    // x2 = &topleft[-height]: start of the left edge
        add             w7,  w3,  w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.4s, w7               // width + height
        movrel          x5,  ipred_dc_tbl
        rbit            w7,  w7                  // rbit(width + height)
        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25            // height 64..4 -> table index 0..4 (hN entries)
        clz             w7,  w7                  // ctz(width + height)
        ldrsw           x3,  [x5, w3, uxtw #2]   // x3 = offset of the width (top-sum/store) routine
        ldrsw           x6,  [x5, w6, uxtw #2]   // x6 = offset of the height (left-sum) routine
        neg             w7,  w7                  // -ctz(width + height)
        add             x3,  x5,  x3
        add             x5,  x5,  x6
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1, the rounding bias
        dup             v17.4s,  w7              // -ctz(width + height): ushl by this = right shift
        add             x6,  x0,  x1             // x6 = second output row pointer
        lsl             x1,  x1,  #1             // x0/x6 each advance two rows per store
        br              x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,      v0.4h           // s0 = sum of 4 left pixels (32-bit)
        add             x2,  x2,  #2             // skip the top-left pixel
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        uaddlv          s1,      v1.4h           // sum of 4 top pixels
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w+h): full division if w == h
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667             // ~1/5 in 0.17 fixed point (w+h = 20)
        mov             w17, #0xAAAB             // ~1/3 in 0.17 fixed point (w+h = 12)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.4h,   v0.h[0]
2:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,      v0.8h           // s0 = sum of 8 left pixels
        add             x2,  x2,  #2             // skip the top-left pixel
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        uaddlv          s1,      v1.8h           // sum of 8 top pixels
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w+h): full division if w == h
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667             // ~1/5: w+h = 40
        mov             w17, #0xAAAB             // ~1/3: w+h = 12 or 24
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
2:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2], #32
        addp            v0.8h,   v0.8h,   v1.8h  // pairwise-reduce 16 pixels to 8 partial sums
        add             x2,  x2,  #2             // skip the top-left pixel
        uaddlv          s0,      v0.8h           // s0 = sum of 16 left pixels
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        addp            v1.8h,   v1.8h,   v2.8h
        uaddlv          s1,      v1.8h           // sum of 16 top pixels
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(w+h): full division if w == h
        b.eq            1f
        // h = 4/8/32/64
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667             // ~1/5: h = 4 or 64
        mov             w17, #0xAAAB             // ~1/3: h = 8 or 32
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h  // reduce 32 pixels to 8 partial sums
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        add             x2,  x2,  #2             // skip the top-left pixel
        uaddlv          s0,      v0.8h           // s0 = sum of 32 left pixels
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        addp            v1.8h,   v1.8h,   v2.8h
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v1.8h,   v1.8h,   v3.8h
        uaddlv          s1,      v1.8h           // sum of 32 top pixels
        cmp             w4,  #32
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(w+h): full division if w == h
        b.eq            1f
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #0x6667             // ~1/5: w+h = 40
        mov             w17, #0xAAAB             // ~1/3: w+h = 48 or 96
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h  // reduce 64 pixels to 8 partial sums
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        add             x2,  x2,  #2             // skip the top-left pixel
        uaddlv          s0,      v0.8h           // s0 = sum of 64 left pixels
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        addp            v1.8h,   v1.8h,   v2.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v22.8h,  v22.8h,  v23.8h
        addp            v1.8h,   v1.8h,   v3.8h
        addp            v20.8h,  v20.8h,  v22.8h
        addp            v1.8h,   v1.8h,   v20.8h
        uaddlv          s1,      v1.8h           // sum of 64 top pixels
        cmp             w4,  #64
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s // >> ctz(w+h): full division if w == h
        b.eq            1f
        // h = 16/32
        cmp             w4,  #16
        mov             w16, #0x6667             // ~1/5: w+h = 80
        mov             w17, #0xAAAB             // ~1/3: w+h = 96
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        sub             x1,  x1,  #64            // compensate for the post-incremented 64-byte store per row
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:      // each row is two 64-byte stores
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret
endfunc
812
// Relative jump table for the DC prediction function: each entry is a
// 32-bit offset from the start of the table. The first five entries
// select the height-specific gathering code (h64..h4), the last five
// the width-specific output code (w64..w4), largest size first.
jumptable ipred_dc_tbl
        .word L(ipred_dc_h64) - ipred_dc_tbl
        .word L(ipred_dc_h32) - ipred_dc_tbl
        .word L(ipred_dc_h16) - ipred_dc_tbl
        .word L(ipred_dc_h8)  - ipred_dc_tbl
        .word L(ipred_dc_h4)  - ipred_dc_tbl
        .word L(ipred_dc_w64) - ipred_dc_tbl
        .word L(ipred_dc_w32) - ipred_dc_tbl
        .word L(ipred_dc_w16) - ipred_dc_tbl
        .word L(ipred_dc_w8)  - ipred_dc_tbl
        .word L(ipred_dc_w4)  - ipred_dc_tbl
endjumptable
825
826// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
827//                             const pixel *const topleft,
828//                             const int width, const int height, const int a,
829//                             const int max_width, const int max_height);
function ipred_paeth_16bpc_neon, export=1
        // Per pixel: base = left + top - topleft; pick whichever of
        // left/top/topleft is closest to base (computed below via the
        // three absolute differences ldiff/tdiff/tldiff).
        // x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
        clz             w9,  w3
        movrel          x5,  ipred_paeth_tbl
        sub             w9,  w9,  #25             // table index 0..4 for width 64..4
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.8h},  [x2]            // v4 = topleft, broadcast
        add             x8,  x2,  #2              // x8 = &topleft[1], the top row
        sub             x2,  x2,  #8              // x2 = &topleft[-4]; left pixels, read backwards
        add             x5,  x5,  x9
        mov             x7,  #-8                  // backwards step over the left pixels
        add             x6,  x0,  x1              // x6 = dst + stride (second row)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x5
40:     // width == 4; two output rows per 8h vector
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.2d},  [x8]            // v5 = top[0..3] in both halves
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // 4 left pixels, each broadcast
        zip1            v0.2d,   v0.2d,   v1.2d   // pack two rows' left value per register
        zip1            v2.2d,   v2.2d,   v3.2d
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v2.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v23.8h,  v4.8h,   v17.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v2.8h,   v17.8h
        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
        umin            v19.8h,  v21.8h,  v23.8h
        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v23.8h,  v21.8h
        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v19.8h,  v17.8h
        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b,  v16.16b
        // Halves are stored d[1] first, matching the backwards left reads.
        st1             {v21.d}[1], [x0], x1
        st1             {v21.d}[0], [x6], x1
        subs            w4,  w4,  #4
        st1             {v20.d}[1], [x0], x1
        st1             {v20.d}[0], [x6], x1
        b.gt            4b
        ret
80:
160:
320:
640:    // widths >= 8 share one path, stepping 8 pixels across 4 rows
        AARCH64_VALID_JUMP_TARGET
        ld1             {v5.8h},  [x8], #16       // top (8 pixels)
        mov             w9,  w3                   // save width for rewinding per 4 rows
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw #1     // x1 = 4*stride - 2*width
1:      // outer loop: next group of four rows; one left pixel per row
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
2:      // inner loop: 8 pixels across, for all four rows
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v1.8h
        add             v18.8h,  v6.8h,   v2.8h
        add             v19.8h,  v6.8h,   v3.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v5.8h,   v18.8h
        sabd            v23.8h,  v5.8h,   v19.8h
        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v25.8h,  v4.8h,   v17.8h
        sabd            v26.8h,  v4.8h,   v18.8h
        sabd            v27.8h,  v4.8h,   v19.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v1.8h,   v17.8h
        sabd            v18.8h,  v2.8h,   v18.8h
        sabd            v19.8h,  v3.8h,   v19.8h
        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
        umin            v29.8h,  v21.8h,  v25.8h
        umin            v30.8h,  v22.8h,  v26.8h
        umin            v31.8h,  v23.8h,  v27.8h
        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v25.8h,  v21.8h
        cmge            v22.8h,  v26.8h,  v22.8h
        cmge            v23.8h,  v27.8h,  v23.8h
        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v29.8h,  v17.8h
        cmge            v18.8h,  v30.8h,  v18.8h
        cmge            v19.8h,  v31.8h,  v19.8h
        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b,  v4.16b
        bsl             v21.16b, v5.16b,  v4.16b
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b,  v18.16b
        bit             v21.16b, v1.16b,  v17.16b
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v23.8h}, [x0], #16
        st1             {v22.8h}, [x6], #16
        subs            w3,  w3,  #8
        st1             {v21.8h}, [x5], #16
        st1             {v20.8h}, [x10], #16
        b.le            8f                        // end of the row
        ld1             {v5.8h},  [x8], #16
        b               2b
8:
        subs            w4,  w4,  #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer to row start
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        // Load the top row as early as possible
        ld1             {v5.8h},  [x8], #16
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore width
        b               1b
9:
        ret
endfunc
952
// Relative jump table for ipred_paeth_16bpc_neon, one entry per block
// width, ordered 64 down to 4 (indexed by clz(width) - 25 above).
jumptable ipred_paeth_tbl
        .word 640b - ipred_paeth_tbl
        .word 320b - ipred_paeth_tbl
        .word 160b - ipred_paeth_tbl
        .word 80b  - ipred_paeth_tbl
        .word 40b  - ipred_paeth_tbl
endjumptable
960
961// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
962//                              const pixel *const topleft,
963//                              const int width, const int height, const int a,
964//                              const int max_width, const int max_height);
function ipred_smooth_16bpc_neon, export=1
        // Two-directional smooth blend. As computed below, each pixel is
        //   ((bottom+right)*256 + (left-right)*w_hor[x]
        //                       + (top-bottom)*w_ver[y] + 256) >> 9
        // with w_hor/w_ver read from the sm_weights table.
        // x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw        // x11 = sm_weights + height (weights_ver)
        add             x10, x10, w3, uxtw        // x10 = sm_weights + width (weights_hor)
        clz             w9,  w3
        movrel          x5,  ipred_smooth_tbl
        sub             x12, x2,  w4, uxtw #1     // x12 = &topleft[-height]
        sub             w9,  w9,  #25             // table index 0..4 for width 64..4
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.8h},  [x12] // bottom
        add             x8,  x2,  #2              // x8 = &topleft[1], the top row
        add             x5,  x5,  x9
        add             x6,  x0,  x1              // x6 = dst + stride (second row)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x5
40:     // width == 4; two output rows per 8h vector
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 = &topleft[-4]; left, read backwards
        mov             x7,  #-8
        dup             v5.8h,   v6.h[3]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v0.8h,   v7.8h
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v18.4h
        smlal2          v23.4s,  v6.8h,   v18.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and scale back down
        rshrn           v21.4h,  v21.4s,  #9
        rshrn           v22.4h,  v22.4s,  #9
        rshrn           v23.4h,  v23.4s,  #9
        st1             {v20.4h}, [x0], x1
        st1             {v21.4h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.4h}, [x0], x1
        st1             {v23.4h}, [x6], x1
        b.gt            4b
        ret
80:     // width == 8; one output row per 8h vector
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 = &topleft[-4]; left, read backwards
        mov             x7,  #-8
        dup             v5.8h,   v6.h[7]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
        smlal           v22.4s,  v2.4h,   v7.4h
        smlal2          v23.4s,  v2.8h,   v7.8h
        smlal           v24.4s,  v1.4h,   v7.4h
        smlal2          v25.4s,  v1.8h,   v7.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v17.4h
        smlal2          v23.4s,  v6.8h,   v17.8h
        smlal           v24.4s,  v6.4h,   v18.4h
        smlal2          v25.4s,  v6.8h,   v18.8h
        smlal           v26.4s,  v6.4h,   v19.4h
        smlal2          v27.4s,  v6.8h,   v19.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and scale back down
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:    // widths >= 16 share one path, 16 pixels across two rows per step
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
        sub             x1,  x1,  w3, uxtw #1     // x1 = 2*stride - 2*width
        ld1r            {v5.8h}, [x12]            // right
        sub             x2,  x2,  #4              // x2 = &topleft[-2]; left, read backwards
        mov             x7,  #-4
        mov             w9,  w3                   // save width for rewinding per 2 rows
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right

1:      // outer loop: next two rows; one left pixel and one weight per row
        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:      // inner loop: 16 pixels across, for both rows
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v2.8h, v3.8h}, [x8], #32 // top
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v24.4s,  v0.4h,   v6.4h
        smlal2          v25.4s,  v0.8h,   v6.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v2.8h,   v16.8h
        smlal           v22.4s,  v3.4h,   v16.4h
        smlal2          v23.4s,  v3.8h,   v16.8h
        smlal           v24.4s,  v2.4h,   v17.4h
        smlal2          v25.4s,  v2.8h,   v17.8h
        smlal           v26.4s,  v3.4h,   v17.4h
        smlal2          v27.4s,  v3.8h,   v17.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and scale back down
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer
        sub             x10, x10, w9, uxtw        // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore width
        b               1b
9:
        ret
endfunc
1157
// Relative jump table for ipred_smooth_16bpc_neon, one entry per block
// width, ordered 64 down to 4 (indexed by clz(width) - 25 above).
jumptable ipred_smooth_tbl
        .word 640b - ipred_smooth_tbl
        .word 320b - ipred_smooth_tbl
        .word 160b - ipred_smooth_tbl
        .word 80b  - ipred_smooth_tbl
        .word 40b  - ipred_smooth_tbl
endjumptable
1165
1166// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1167//                                const pixel *const topleft,
1168//                                const int width, const int height, const int a,
1169//                                const int max_width, const int max_height);
function ipred_smooth_v_16bpc_neon, export=1
        // Vertical-only smooth blend:
        //   pred = bottom + (((top-bottom)*weights_ver[y] + 128) >> 8)
        // The multiply+round is done via sqrdmulh on weights_ver << 7.
        // x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw        // x7 = sm_weights + height (weights_ver)
        clz             w9,  w3
        movrel          x5,  ipred_smooth_v_tbl
        sub             x8,  x2,  w4, uxtw #1     // x8 = &topleft[-height]
        sub             w9,  w9,  #25             // table index 0..4 for width 64..4
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.8h},  [x8] // bottom
        add             x2,  x2,  #2              // x2 = &topleft[1], the top row
        add             x5,  x5,  x9
        add             x6,  x0,  x1              // x6 = dst + stride (second row)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x5
40:     // width == 4; two output rows per 8h vector
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v18.8h,  v18.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v18.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:     // width == 8; one output row per 8h vector
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v17.8h
        sqrdmulh        v22.8h,  v6.8h,   v18.8h
        sqrdmulh        v23.8h,  v6.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:    // widths >= 16 share one path, 16 pixels across four rows per step
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw #1     // x1 = 4*stride - 2*width
        mov             w9,  w3                   // save width for rewinding per 4 rows

1:      // outer loop: next four rows; one weight per row
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
2:      // inner loop: 16 pixels across, for all four rows
        ld1             {v2.8h, v3.8h}, [x2], #32 // top
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v16.8h
        sqrdmulh        v22.8h,  v2.8h,   v17.8h
        sqrdmulh        v23.8h,  v3.8h,   v17.8h
        sqrdmulh        v24.8h,  v2.8h,   v18.8h
        sqrdmulh        v25.8h,  v3.8h,   v18.8h
        sqrdmulh        v26.8h,  v2.8h,   v19.8h
        sqrdmulh        v27.8h,  v3.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v4.8h
        add             v26.8h,  v26.8h,  v4.8h
        add             v27.8h,  v27.8h,  v4.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        st1             {v24.8h, v25.8h}, [x5], #32
        st1             {v26.8h, v27.8h}, [x8], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9, uxtw #1     // rewind top pointer to row start
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9                   // restore width
        b               1b
9:
        ret
endfunc
1285
// Relative jump table for ipred_smooth_v_16bpc_neon, one entry per block
// width, ordered 64 down to 4 (indexed by clz(width) - 25 above).
jumptable ipred_smooth_v_tbl
        .word 640b - ipred_smooth_v_tbl
        .word 320b - ipred_smooth_v_tbl
        .word 160b - ipred_smooth_v_tbl
        .word 80b  - ipred_smooth_v_tbl
        .word 40b  - ipred_smooth_v_tbl
endjumptable
1293
1294// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1295//                                const pixel *const topleft,
1296//                                const int width, const int height, const int a,
1297//                                const int max_width, const int max_height);
function ipred_smooth_h_16bpc_neon, export=1
        // Horizontal-only smooth blend:
        //   pred = right + (((left-right)*weights_hor[x] + 128) >> 8)
        // The multiply+round is done via sqrdmulh on weights_hor << 7.
        // x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw        // x8 = sm_weights + width (weights_hor)
        clz             w9,  w3
        movrel          x5,  ipred_smooth_h_tbl
        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
        sub             w9,  w9,  #25             // table index 0..4 for width 64..4
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v5.8h},  [x12] // right
        add             x5,  x5,  x9
        add             x6,  x0,  x1              // x6 = dst + stride (second row)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x5
40:     // width == 4; two output rows per 8h vector
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 = &topleft[-4]; left, read backwards
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v1.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:     // width == 8; one output row per 8h vector
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 = &topleft[-4]; left, read backwards
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        sub             v3.8h,   v3.8h,   v5.8h   // left-right
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v0.8h,   v0.8h,   v5.8h
        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v1.8h,   v7.8h
        sqrdmulh        v23.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:    // widths >= 16 share one path, 16 pixels across four rows per step
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #8              // x2 = &topleft[-4]; left, read backwards
        mov             x7,  #-8
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw #1     // x1 = 4*stride - 2*width
        mov             w9,  w3                   // save width for rewinding per 4 rows

1:      // outer loop: next four rows; one left pixel per row
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
2:      // inner loop: 16 pixels across, for all four rows
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
        ushll2          v7.8h,   v7.16b,  #7
        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v2.8h,   v6.8h
        sqrdmulh        v23.8h,  v2.8h,   v7.8h
        sqrdmulh        v24.8h,  v1.8h,   v6.8h
        sqrdmulh        v25.8h,  v1.8h,   v7.8h
        sqrdmulh        v26.8h,  v0.8h,   v6.8h
        sqrdmulh        v27.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        add             v24.8h,  v24.8h,  v5.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v5.8h
        add             v27.8h,  v27.8h,  v5.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0],  #32
        st1             {v22.8h, v23.8h}, [x6],  #32
        st1             {v24.8h, v25.8h}, [x5],  #32
        st1             {v26.8h, v27.8h}, [x10], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9, uxtw        // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore width
        b               1b
9:
        ret
endfunc
1418
// Relative jump table for ipred_smooth_h_16bpc_neon, one entry per block
// width, ordered 64 down to 4 (indexed by clz(width) - 25 above).
jumptable ipred_smooth_h_tbl
        .word 640b - ipred_smooth_h_tbl
        .word 320b - ipred_smooth_h_tbl
        .word 160b - ipred_smooth_h_tbl
        .word 80b  - ipred_smooth_h_tbl
        .word 40b  - ipred_smooth_h_tbl
endjumptable
1426
// 48 zero bytes followed by 48 0xff bytes. Loading 32 bytes from
// padding_mask - 2*end (as done in ipred_z1_upsample_edge below) gives a
// per-lane mask that is zero for the first `end` 16-bit lanes and
// all-ones for the rest, selecting a padding value for lanes past the
// end of the valid edge pixels.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
1442
1443// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
1444//                                        const pixel *const in, const int end,
1445//                                        const int bitdepth_max);
function ipred_z1_upsample_edge_16bpc_neon, export=1
        // Upsample up to 16 edge pixels to 32: out[2i] = in[i+1] and
        // out[2i+1] = half-pel value filtered as
        //   (9*(in[i+1]+in[i+2]) - (in[i]+in[i+3]) + 8) >> 4,
        // clamped to [0, bitdepth_max]. Pixels past in[end] are replaced
        // with in[end] via the padding_mask table.
        dup             v30.8h,  w4               // bitdepth_max
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h},  [x2]     // in[]
        add             x5,  x2,  w3,  uxtw #1    // in[end]
        sub             x4,  x4,  w3,  uxtw #1    // padding_mask - 2*end

        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v3.8h, v4.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v3.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v4.16b

        // Shifted copies of the padded input: in[i+1], in[i+2], in[i+3]
        ext             v4.16b,  v0.16b,  v1.16b,  #2
        ext             v5.16b,  v1.16b,  v2.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #4
        ext             v7.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6

        add             v18.8h,  v4.8h,   v6.8h   // in[i+1] + in[i+2]
        add             v19.8h,  v5.8h,   v7.8h
        add             v20.8h,  v0.8h,   v16.8h  // in[i] + in[i+3]
        add             v21.8h,  v1.8h,   v17.8h
        umull           v22.4s,  v18.4h,  v31.4h  // 9*(in[i+1] + in[i+2])
        umull2          v23.4s,  v18.8h,  v31.8h
        umull           v24.4s,  v19.4h,  v31.4h
        umull2          v25.4s,  v19.8h,  v31.8h
        usubw           v22.4s,  v22.4s,  v20.4h  // - (in[i] + in[i+3])
        usubw2          v23.4s,  v23.4s,  v20.8h
        usubw           v24.4s,  v24.4s,  v21.4h
        usubw2          v25.4s,  v25.4s,  v21.8h

        // Round, shift and saturate to unsigned (clamps negatives to 0)
        sqrshrun        v16.4h,  v22.4s,  #4
        sqrshrun2       v16.8h,  v23.4s,  #4
        sqrshrun        v17.4h,  v24.4s,  #4
        sqrshrun2       v17.8h,  v25.4s,  #4

        smin            v16.8h,  v16.8h,  v30.8h  // clamp to bitdepth_max
        smin            v17.8h,  v17.8h,  v30.8h

        // Interleave the original pixels (from in[1]) with the
        // interpolated half-pel values
        zip1            v0.8h,   v4.8h,   v16.8h
        zip2            v1.8h,   v4.8h,   v16.8h
        zip1            v2.8h,   v5.8h,   v17.8h
        zip2            v3.8h,   v5.8h,   v17.8h

        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]

        ret
endfunc
1498
1499// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
1500//                                        const pixel *const in,
1501//                                        const int bitdepth_max);
function ipred_z2_upsample_edge_16bpc_neon, export=1
        dup             v30.8h,  w3               // bitdepth_max
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        // Even outputs copy in[i]; odd outputs are the interpolation
        //   (9*(in[i+0] + in[i+1]) - (in[i-1] + in[i+2]) + 8) >> 4,
        // clamped to [0, bitdepth_max], with in[-1] taken as in[0].
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h}, [x2]      // in[]
        add             x5,  x2,  w1,  uxtw #1    // in[sz]
        sub             x4,  x4,  w1,  uxtw #1    // padding_mask - 2*sz

        ld1r            {v3.8h},  [x2]            // in[0] for padding
        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v4.8h, v5.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v4.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v5.16b

        ext             v4.16b,  v3.16b,  v0.16b,  #14 // in[i-1]
        ext             v5.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
        ext             v6.16b,  v0.16b,  v1.16b,  #4  // in[i+2]

        add             v16.8h,  v0.8h,   v5.8h   // in[i+0] + in[i+1]
        add             v17.8h,  v4.8h,   v6.8h   // in[i-1] + in[i+2]
        umull           v18.4s,  v16.4h,  v31.4h  // 9*(in[i+0] + in[i+1])
        umull2          v19.4s,  v16.8h,  v31.8h
        usubw           v18.4s,  v18.4s,  v17.4h  // - (in[i-1] + in[i+2])
        usubw2          v19.4s,  v19.4s,  v17.8h

        sqrshrun        v16.4h,  v18.4s,  #4      // (... + 8) >> 4, clamped to >= 0
        sqrshrun2       v16.8h,  v19.4s,  #4

        add             x5,  x0,  #2*16           // &out[16]

        smin            v16.8h,  v16.8h,  v30.8h  // clamp to bitdepth_max

        zip1            v4.8h,   v0.8h,   v16.8h  // interleave in[] with the
        zip2            v5.8h,   v0.8h,   v16.8h  // filtered values

        // In case sz=8, output one single pixel in out[16].
        st1             {v2.h}[0], [x5]
        st1             {v4.8h, v5.8h}, [x0]

        ret
endfunc
1546
// Halves of the symmetric 3-tap edge smoothing kernels for strengths
// 1 and 2 (kernel[0] == kernel[2]): strength 1 = (4,8,4)/16,
// strength 2 = (5,6,5)/16, stored as .short for 16 bpc. The zero
// padding gives each row an 8-byte stride so ipred_z1_filter_edge can
// index the {outer, center} pair directly by strength.
const edge_filter
        .short 0, 4, 8, 0
        .short 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .byte 2, 4, 4, 0
endconst
1553
1554// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
1555//                                      const pixel *const in, const int end,
1556//                                      const int strength);
function ipred_z1_filter_edge_16bpc_neon, export=1
        // Smooth sz edge pixels: 3-tap kernel for strength 1-2 (from
        // edge_filter above), 5-tap (2,4,4,4,2)/16 for strength 3.
        // in[end] is the last valid input pixel; the input is padded by
        // replicating it as needed.
        cmp             w4, #3
        b.eq            L(fivetap)                // if (strength == 3) goto fivetap

        movrel          x5,  edge_filter, -6
        add             x5,  x5,  w4,  uxtw #3    // edge_filter + 2*((strength - 1)*4 + 1)

        ld1             {v31.s}[0], [x5]          // kernel[1-2]

        ld1             {v0.8h}, [x2], #16

        dup             v30.8h, v31.h[0]          // outer coefficient
        dup             v31.8h, v31.h[1]          // center coefficient
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        sub             w3,  w3,  #16
        st1             {v16.8h, v17.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #24
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h}, [x5] // padding_mask

        ld1r            {v2.8h}, [x6]
        bit             v0.16b,  v2.16b,  v3.16b  // Pad v0-v1
        bit             v1.16b,  v2.16b,  v4.16b

        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        st1             {v16.8h, v17.8h}, [x0], #32
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        // Each iteration stores 8 16-bit pixels (16 bytes), so step the
        // remaining-pixel count by 8, matching the fivetap path below.
        subs            w1,  w1,  #8
        st1             {v2.8h}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        sub             x2,  x2,  #2              // topleft -= 1 pixel
        movi            v29.8h, #2
        ld1             {v0.8h}, [x2], #16
        movi            v30.8h, #4
        movi            v31.8h, #4
        ins             v0.h[0], v0.h[1]
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f                        // if (end + 1 < 19)
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8
        ext             v19.16b, v1.16b,  v2.16b,  #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask, -2
        sub             w6,  w3,  #23
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask

        ld1r            {v28.8h}, [x6]
        bit             v0.16b,  v28.16b, v3.16b  // Pad v0-v2
        bit             v1.16b,  v28.16b, v4.16b
        bit             v2.16b,  v28.16b, v5.16b
4:
        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8
        ext             v19.16b, v1.16b,  v2.16b,  #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v28.16b
        mov             v2.16b,  v28.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #8
        st1             {v28.8h}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
1732
1733// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
1734//                                 const int n);
function ipred_pixel_set_16bpc_neon, export=1
        // Broadcast the pixel value into every lane, then stream it out
        // 8 pixels (16 bytes) at a time; the count is effectively rounded
        // up to a multiple of 8, and at least one store is always done.
        dup             v7.8h,   w1               // replicate px
2:
        st1             {v7.8h}, [x0], #16        // write 8 pixels
        subs            w2,  w2,  #8
        b.gt            2b
        ret
endfunc
1743
1744// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1745//                                const pixel *const top,
1746//                                const int width, const int height,
1747//                                const int dx, const int max_base_x);
function ipred_z1_fill1_16bpc_neon, export=1
        // Diagonal (z1) fill: each output row interpolates between
        // top[base+x] and top[base+x+1] with weight frac, where
        // base = xpos >> 6 and frac = xpos & 0x3e, stepping xpos by dx
        // per row. Rows whose base reaches max_base_x are filled with
        // the replicated top[max_base_x]. Two rows per loop iteration.
        clz             w9,  w3                   // select path by log2(width)
        movrel          x8,  ipred_z1_fill1_tbl
        sub             w9,  w9,  #25
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
        add             x8,  x8,  x9
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        br              x8
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        lsl             w8,  w8,  #1
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #2
        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:     // Remaining rows are all padding
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

80:     // w == 8
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h},  [x8]            // top[base]
        ld1             {v2.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        ldr             h1, [x8, #16]             // top[base+8]
        ldr             h3, [x10, #16]
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        ext             v1.16b,  v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v3.16b,  #2
        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
        umlal           v16.4s,  v1.4h,   v4.4h   // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v1.8h,   v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v3.4h,   v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v3.8h,   v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:     // Remaining rows are all padding
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret

160:    // w == 16, 32, 64: inner loop handles 16 pixels at a time
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                   // remember width

        add             x13, x0,  x1              // second output row
        lsl             x1,  x1,  #1              // advance two rows at a time
        sub             x1,  x1,  w3,  uxtw #1    // minus 2*width written per row
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v6.8h,   w9               // frac
        dup             v7.8h,   w11
        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // top[base]
        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v16.8h,  w9               // 64 - frac
        dup             v17.8h,  w11
        add             w7,  w7,  w5              // xpos += dx
2:
        ext             v18.16b, v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v19.16b, v1.16b,  v2.16b,  #2
        ext             v20.16b, v3.16b,  v4.16b,  #2
        ext             v21.16b, v4.16b,  v5.16b,  #2
        subs            w3,  w3,  #16
        umull           v22.4s,  v0.4h,   v16.4h  // top[base]*(64-frac)
        umlal           v22.4s,  v18.4h,  v6.4h   // + top[base+1]*frac
        umull2          v23.4s,  v0.8h,   v16.8h
        umlal2          v23.4s,  v18.8h,  v6.8h
        umull           v24.4s,  v1.4h,   v16.4h
        umlal           v24.4s,  v19.4h,  v6.4h
        umull2          v25.4s,  v1.8h,   v16.8h
        umlal2          v25.4s,  v19.8h,  v6.8h
        umull           v26.4s,  v3.4h,   v17.4h
        umlal           v26.4s,  v20.4h,  v7.4h
        umull2          v27.4s,  v3.8h,   v17.8h
        umlal2          v27.4s,  v20.8h,  v7.8h
        umull           v28.4s,  v4.4h,   v17.4h
        umlal           v28.4s,  v21.4h,  v7.4h
        umull2          v29.4s,  v4.8h,   v17.8h
        umlal2          v29.4s,  v21.8h,  v7.8h
        rshrn           v22.4h,  v22.4s,  #6
        rshrn2          v22.8h,  v23.4s,  #6
        rshrn           v23.4h,  v24.4s,  #6
        rshrn2          v23.8h,  v25.4s,  #6
        rshrn           v24.4h,  v26.4s,  #6
        rshrn2          v24.8h,  v27.4s,  #6
        rshrn           v25.4h,  v28.4s,  #6
        rshrn2          v25.8h,  v29.4s,  #6
        st1             {v22.8h, v23.8h}, [x0],  #32
        st1             {v24.8h, v25.8h}, [x13], #32
        b.le            3f
        mov             v0.16b,  v2.16b           // shift the read window
        ld1             {v1.8h, v2.8h}, [x8],  #32 // top[base]
        mov             v3.16b,  v5.16b
        ld1             {v4.8h, v5.8h}, [x10], #32
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12                  // restore width
        b               1b
9:
        ret

169:    // Fill the remaining rows with padding
        st1             {v31.8h}, [x0],  #16
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b
endfunc
1939
// Jump table for ipred_z1_fill1, indexed by clz(width) - 25, i.e. by
// log2(width) with the largest width first: entries are offsets of the
// 640/320/160/80/40 labels (widths 64, 32, 16, 8, 4) relative to the
// table base.
jumptable ipred_z1_fill1_tbl
        .word 640b - ipred_z1_fill1_tbl
        .word 320b - ipred_z1_fill1_tbl
        .word 160b - ipred_z1_fill1_tbl
        .word 80b  - ipred_z1_fill1_tbl
        .word 40b  - ipred_z1_fill1_tbl
endjumptable
1947
// Variant of ipred_z1_fill1 for a 2x upsampled edge (width 4 or 8 only):
// top[] holds interleaved sample pairs, so base indexes element 2*base
// (byte offset 4*base for 16 bpc pixels) and uzp1/uzp2 split the loaded
// pairs into top[base] (even lanes) and top[base+1] (odd lanes).
function ipred_z1_fill2_16bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw #1    // top[max_base_x] (16-bit pixels)
        ld1r            {v31.8h}, [x10]           // padding, replicated as u16
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        lsl             w8,  w8,  #2              // element 2*base -> byte 4*base
        lsl             w10, w10, #2
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8h,   v0.8h,   v0.8h   // top[base+1]
        uzp1            v0.8h,   v0.8h,   v0.8h   // top[base]
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:     // Remaining rows are all padding
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #2    // element 2*base -> byte 4*base
        add             x10, x2,  w10, uxtw #2
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h},  [x8]     // top[base]
        ld1             {v2.8h, v3.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // top[base+1]
        uzp1            v0.8h,   v0.8h,   v1.8h   // top[base]
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:     // Remaining rows are all padding
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret
endfunc
2044
2045// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
2046//                               const int n);
function ipred_reverse_16bpc_neon, export=1
        // Walk src backwards 8 pixels at a time, writing each chunk in
        // reversed order to the forward-advancing dst.
        mov             x8,  #16                  // dst advance per chunk
        sub             x1,  x1,  #16             // step back 8 pixels in src
        add             x3,  x0,  #8              // second half of each dst chunk
2:
        ld1             {v4.8h}, [x1]
        sub             x1,  x1,  #16
        rev64           v4.8h,  v4.8h             // reverse within each 64-bit half
        subs            w2,  w2,  #8
        st1             {v4.d}[1], [x0], x8       // high half first = fully reversed
        st1             {v4.d}[0], [x3], x8
        b.gt            2b
        ret
endfunc
2061
// Constant lane-index vector {0, 1, 2, 3, 4, 5, 6, 7}; multiplied by a
// step value to build per-lane positions (e.g. {0..7} * -dy below).
const increments
        .short          0,  1,  2,  3,  4,  5,  6,  7
endconst
2065
2066// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
2067//                                const pixel *const top,
2068//                                const pixel *const left,
2069//                                const int width, const int height,
2070//                                const int dx, const int dy);
2071function ipred_z2_fill1_16bpc_neon, export=1
2072        clz             w10, w4
2073        movrel          x9,  ipred_z2_fill1_tbl
2074        sub             w10, w10, #25
2075        ldrsw           x10, [x9, w10, uxtw #2]
2076        mov             w8,  #(1 << 6)            // xpos = 1 << 6
2077        add             x9,  x9,  x10
2078        sub             w8,  w8,  w6              // xpos -= dx
2079
2080        movrel          x11, increments
2081        ld1             {v31.8h},  [x11]          // increments
2082        neg             w7,  w7                   // -dy
2083
2084        br              x9
208540:
2086        AARCH64_VALID_JUMP_TARGET
2087
2088        dup             v30.4h,  w7               // -dy
2089        movi            v17.8b,  #1
2090
2091        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2092        movi            v25.8h,  #0x3e
2093        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2094
2095        // Worst case height for w=4 is 16, but we need at least h+1 elements
2096        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
2097
2098        movi            v26.8h,  #64
2099        movi            v19.16b, #4
2100
2101        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2102        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2103
2104        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2105
2106        movi            v23.4h,  #1, lsl #8
2107        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2108        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2109        movi            v17.8b,  #2
2110        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2111
2112        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2113        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2114
2115        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2116
2117        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2118
2119        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2120
2121        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2122
2123        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2124        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2125
2126        movi            v29.16b, #4
21274:
2128        asr             w9,  w8,  #6              // base_x
2129        dup             v16.4h,  w8               // xpos
2130        sub             w8,  w8,  w6              // xpos -= dx
2131        cmp             w9,  #-4                  // base_x <= -4
2132        asr             w11, w8,  #6              // base_x
2133        b.le            49f                       // switch to the left-only codepath
2134
2135        lsl             w9,  w9,  #1              // pixel index -> byte offset
2136        lsl             w11, w11, #1
2137
2138        dup             v17.4h,  w8               // xpos
2139
2140        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2141        ldr             q6,  [x2, w11, sxtw]
2142
2143        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2144
2145        // Cut corners here; only doing tbl over v0-v1 here; we only
2146        // seem to need the last pixel, from v2, after skipping to the
2147        // left-only codepath below.
2148        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2149
2150        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2151
2152        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
2153        ext             v7.16b,  v6.16b,  v6.16b,  #2
2154
2155        and             v16.16b, v16.16b, v25.16b // frac_x
2156
2157        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2158
2159        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
2160        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
2161
2162        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2163
2164        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2165
2166        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2167        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2168        umull2          v22.4s,  v18.8h,  v28.8h
2169        umlal2          v22.4s,  v19.8h,  v27.8h
2170
2171        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]*(64-frac_x)
2172        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2173        umull2          v24.4s,  v4.8h,   v17.8h
2174        umlal2          v24.4s,  v5.8h,   v16.8h
2175
2176        cmge            v20.8h,  v20.8h,  #0      // base_x >= 0
2177
2178        rshrn           v21.4h,  v21.4s,  #6
2179        rshrn2          v21.8h,  v22.4s,  #6
2180        rshrn           v22.4h,  v23.4s,  #6
2181        rshrn2          v22.8h,  v24.4s,  #6
2182
2183        bit             v21.16b, v22.16b, v20.16b // pick the top-based result where base_x >= 0
2184
2185        st1             {v21.d}[0], [x0], x1
2186        sub             w8,  w8,  w6              // xpos -= dx
2187        subs            w5,  w5,  #2
2188        st1             {v21.d}[1], [x0], x1
2189        b.le            9f
2190
2191        ext             v18.16b, v19.16b, v19.16b, #8 // left[base_y] for the next iteration
2192        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2193        b               4b
2194
219549:
        // Left-only codepath: base_x <= -4, so all remaining rows are
        // predicted from left[] alone.
2196        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2]
2197
2198        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2199
2200        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2201        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2202        umull2          v21.4s,  v18.8h,  v28.8h
2203        umlal2          v21.4s,  v19.8h,  v27.8h
2204
2205        rshrn           v20.4h,  v20.4s,  #6
2206        rshrn2          v20.8h,  v21.4s,  #6
2207
2208        st1             {v20.d}[0], [x0], x1
2209        subs            w5,  w5,  #2
2210        st1             {v20.d}[1], [x0], x1
2211        b.le            9f
2212
2213        ext             v18.16b, v19.16b, v19.16b, #8 // left[base_y] for the next iteration
2214        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2215        b               49b
2216
22179:
2218        ret
2219
222080:     // w == 8 codepath
2221        AARCH64_VALID_JUMP_TARGET
2222
        // Save the callee-saved low halves of v8-v15 (AAPCS64).
2223        stp             d8,  d9,  [sp, #-0x40]!
2224        stp             d10, d11, [sp, #0x10]
2225        stp             d12, d13, [sp, #0x20]
2226        stp             d14, d15, [sp, #0x30]
2227
2228        dup             v18.8h,  w7               // -dy
2229        add             x3,  x3,  #2              // Skip past left[0]
2230
2231        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2232        movi            v25.8h,  #0x3e            // mask for frac_y/frac_x
2233        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2234
2235        // Worst case height for w=8 is 32.
2236        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2237        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2238
2239        movi            v26.8h,  #64
2240        movi            v19.16b, #4
2241
2242        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2243        and             v27.16b, v16.16b, v25.16b // frac_y
2244
2245        movi            v23.8h,  #1, lsl #8       // +1 in the high byte of each halfword
2246        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2247        mov             v18.16b, v15.16b          // left[0]
2248        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2249        movi            v17.16b, #2
2250        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
2251
2252        // Cut corners here; for the first row we don't expect to need to
2253        // read outside of v0.
2254        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
2255
2256        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
2257        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
2258
2259        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
2260
2261        movi            v24.16b, #4
22628:      // Main loop: two rows per iteration, mixing top[] and left[] pixels
2263        asr             w9,  w8,  #6              // base_x
2264        dup             v16.8h,   w8              // xpos
2265        sub             w8,  w8,  w6              // xpos -= dx
2266        cmp             w9,  #-16                 // base_x <= -16
2267        asr             w11, w8,  #6              // base_x
2268        b.le            89f                       // switch to the left-only codepath
2269
2270        dup             v17.8h,   w8              // xpos
2271
2272        add             x9,  x2,  w9,  sxtw #1
2273        add             x11, x2,  w11, sxtw #1
2274
2275        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
2276        mov             v19.16b, v15.16b          // left[0]
2277        ld1             {v6.8h, v7.8h}, [x11]
2278
2279        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2280
2281        mov             v20.16b, v15.16b          // left[0]
2282
2283        sshr            v21.8h,  v16.8h,  #6      // first base_x
2284        sshr            v22.8h,  v17.8h,  #6
2285
2286        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2287
2288        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
2289        ext             v7.16b,  v6.16b,  v7.16b,  #2
2290
2291        and             v16.16b, v16.16b, v25.16b // frac_x
2292        and             v17.16b, v17.16b, v25.16b
2293
2294        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2295        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2296
2297        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
2298        sub             v9.8h,   v26.8h,  v17.8h
2299
2300        umull2          v11.4s,  v18.8h,  v28.8h
2301        umlal2          v11.4s,  v19.8h,  v27.8h
2302
2303        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2304        add             v22.8h,  v22.8h,  v31.8h
2305
2306        umull           v12.4s,  v19.4h,  v28.4h
2307        umlal           v12.4s,  v20.4h,  v27.4h
2308        umull2          v13.4s,  v19.8h,  v28.8h
2309        umlal2          v13.4s,  v20.8h,  v27.8h
2310
2311        rshrn           v10.4h,  v10.4s,  #6
2312        rshrn2          v10.8h,  v11.4s,  #6
2313        rshrn           v11.4h,  v12.4s,  #6
2314        rshrn2          v11.8h,  v13.4s,  #6
2315
2316        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2317        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2318        umull2          v13.4s,  v4.8h,   v8.8h
2319        umlal2          v13.4s,  v5.8h,   v16.8h
2320        umull           v14.4s,  v6.4h,   v9.4h
2321        umlal           v14.4s,  v7.4h,   v17.4h
2322        umull2          v18.4s,  v6.8h,   v9.8h
2323        umlal2          v18.4s,  v7.8h,   v17.8h
2324
2325        cmge            v21.8h,  v21.8h,  #0      // base_x >= 0
2326        cmge            v22.8h,  v22.8h,  #0
2327
2328        rshrn           v12.4h,  v12.4s,  #6
2329        rshrn2          v12.8h,  v13.4s,  #6
2330        rshrn           v13.4h,  v14.4s,  #6
2331        rshrn2          v13.8h,  v18.4s,  #6
2332
2333        bit             v10.16b, v12.16b, v21.16b // pick the top-based result where base_x >= 0
2334        bit             v11.16b, v13.16b, v22.16b
2335
2336        st1             {v10.8h}, [x0], x1
2337        subs            w5,  w5,  #2
2338        sub             w8,  w8,  w6              // xpos -= dx
2339        st1             {v11.8h}, [x0], x1
2340        b.le            9f
2341
2342        mov             v18.16b, v20.16b          // left[base_y] for the next iteration
2343        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2344        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2345        b               8b
2346
234789:     // Left-only codepath: base_x <= -16 for all remaining rows
2348        mov             v19.16b, v15.16b
2349        mov             v20.16b, v15.16b
2350        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2351        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2352
2353        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2354        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2355        umull2          v5.4s,   v18.8h,  v28.8h
2356        umlal2          v5.4s,   v19.8h,  v27.8h
2357        umull           v6.4s,   v19.4h,  v28.4h
2358        umlal           v6.4s,   v20.4h,  v27.4h
2359        umull2          v7.4s,   v19.8h,  v28.8h
2360        umlal2          v7.4s,   v20.8h,  v27.8h
2361
2362        rshrn           v4.4h,   v4.4s,   #6
2363        rshrn2          v4.8h,   v5.4s,   #6
2364        rshrn           v5.4h,   v6.4s,   #6
2365        rshrn2          v5.8h,   v7.4s,   #6
2366
2367        st1             {v4.8h}, [x0], x1
2368        subs            w5,  w5,  #2
2369        st1             {v5.8h}, [x0], x1
2370        b.le            9f
2371
2372        mov             v18.16b, v20.16b          // left[base_y] for the next iteration
2373        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2374        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2375        b               89b
2376
23779:      // restore d8-d15 and return
2378        ldp             d14, d15, [sp, #0x30]
2379        ldp             d12, d13, [sp, #0x20]
2380        ldp             d10, d11, [sp, #0x10]
2381        ldp             d8,  d9,  [sp], 0x40
2382        ret
2383
2384160:
2385320:
2386640:    // w == 16, 32 or 64 codepath
2387        AARCH64_VALID_JUMP_TARGET
2388
        // Save the callee-saved low halves of v8-v15 (AAPCS64).
2389        stp             d8,  d9,  [sp, #-0x40]!
2390        stp             d10, d11, [sp, #0x10]
2391        stp             d12, d13, [sp, #0x20]
2392        stp             d14, d15, [sp, #0x30]
2393
2394        dup             v25.8h,  w7               // -dy
2395        add             x3,  x3,  #2              // Skip past left[0]
2396
2397        add             x13, x0,  x1              // alternating row
2398        lsl             x1,  x1,  #1              // stride *= 2
2399        sub             x1,  x1,  w4,  uxtw #1    // stride -= width
2400
2401        movi            v11.8h,  #8
2402        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
2403        add             v26.8h,  v26.8h,  v25.8h  // -= dy
2404        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
2405
2406        // Worst case height is 64, but we can only fit 32 pixels into
2407        // v0-v3 usable within one tbx instruction. As long as base_y is
2408        // up to 32, we use tbx.
2409        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2410        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2411
2412        mov             w12, w4                   // orig w
2413        neg             w14, w4                   // -w
2414
24151:      // Outer loop: one pass per pair of rows
2416        mov             v23.16b, v26.16b          // reset ypos
2417
2418        asr             w9,  w8,  #6              // base_x
2419        dup             v16.8h,   w8              // xpos
2420        sub             w8,  w8,  w6              // xpos -= dx
2421        cmp             w9,  w14                  // base_x <= -2*w
2422        asr             w11, w8,  #6              // base_x
2423        b.le            169f                      // switch to the left-only codepath
2424
2425        dup             v17.8h,   w8              // xpos
2426        sub             w8,  w8,  w6              // xpos -= dx
2427
2428        add             x9,  x2,  w9,  sxtw #1
2429        add             x11, x2,  w11, sxtw #1
2430
2431        sshr            v21.8h,  v16.8h,  #6      // first base_x
2432        sshr            v22.8h,  v17.8h,  #6
2433
2434        ld1             {v4.8h}, [x9], #16        // top[base_x]
2435        ld1             {v6.8h}, [x11], #16
2436
2437        movi            v10.8h,  #0x3e            // frac mask
2438        movi            v11.8h,  #64
2439
2440        and             v16.16b, v16.16b, v10.16b // frac_x
2441        and             v17.16b, v17.16b, v10.16b
2442
2443        sub             v8.8h,   v11.8h,  v16.8h  // 64 - frac_x
2444        sub             v9.8h,   v11.8h,  v17.8h
2445
2446        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2447        add             v22.8h,  v22.8h,  v31.8h
2448
24492:      // Inner loop: 8 columns of the two rows per iteration
2450        smov            w10,     v22.h[0]
2451
2452        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2453        movi            v12.8h,  #64
2454        cmp             w10, #0                   // base_x (bottom left) >= 0
2455        smov            w10,     v29.b[0]         // base_y[0]
2456        movi            v10.8h,  #0x3e
2457
2458        b.ge            4f                        // whole chunk predicted from top[]
2459        and             v27.16b, v23.16b, v10.16b // frac_y
2460        cmp             w10,     #(32-3)
2461
2462        mov             v18.16b, v15.16b          // left[0]
2463        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2464        b.gt            22f
2465
246621:
2467        // base_y < 32, using tbx
2468        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2469        movi            v11.8h,  #1, lsl #8       // +1 in the high byte of each halfword
2470        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2471        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2472
2473        movi            v13.16b, #2
2474
2475        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2476
2477        add             v29.16b, v29.16b, v13.16b // base_y + 1 (*2)
2478        mov             v19.16b, v15.16b          // left[0]
2479
2480        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2481
2482        add             v29.16b, v29.16b, v13.16b // base_y + 2 (*2)
2483        mov             v20.16b, v15.16b          // left[0]
2484
2485        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2486
2487        b               23f
2488
248922:
2490        // base_y >= 32, using separate loads.
2491        smov            w15,     v29.b[1]
2492        smov            w16,     v29.b[2]
2493        add             x10, x3,  w10, sxtw #1
2494        smov            w17,     v29.b[3]
2495        add             x15, x3,  w15, sxtw #1
2496        ld3             {v18.h, v19.h, v20.h}[0], [x10]
2497        smov            w10,     v29.b[4]
2498        add             x16, x3,  w16, sxtw #1
2499        ld3             {v18.h, v19.h, v20.h}[1], [x15]
2500        smov            w15,     v29.b[5]
2501        add             x17, x3,  w17, sxtw #1
2502        ld3             {v18.h, v19.h, v20.h}[2], [x16]
2503        smov            w16,     v29.b[6]
2504        add             x10, x3,  w10, sxtw #1
2505        ld3             {v18.h, v19.h, v20.h}[3], [x17]
2506        smov            w17,     v29.b[7]
2507        add             x15, x3,  w15, sxtw #1
2508        add             x16, x3,  w16, sxtw #1
2509        ld3             {v18.h, v19.h, v20.h}[4], [x10]
2510        add             x17, x3,  w17, sxtw #1
2511        ld3             {v18.h, v19.h, v20.h}[5], [x15]
2512        ld3             {v18.h, v19.h, v20.h}[6], [x16]
2513        ld3             {v18.h, v19.h, v20.h}[7], [x17]
2514
251523:
2516
2517        ld1             {v5.8h}, [x9], #16        // top[base_x]
2518        ld1             {v7.8h}, [x11], #16
2519
2520        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2521
2522        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2523        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2524        umull2          v11.4s,  v18.8h,  v28.8h
2525        umlal2          v11.4s,  v19.8h,  v27.8h
2526        umull           v12.4s,  v19.4h,  v28.4h
2527        umlal           v12.4s,  v20.4h,  v27.4h
2528        umull2          v13.4s,  v19.8h,  v28.8h
2529        umlal2          v13.4s,  v20.8h,  v27.8h
2530
2531        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2532        ext             v19.16b, v6.16b,  v7.16b,  #2
2533
2534        rshrn           v10.4h,  v10.4s,  #6
2535        rshrn2          v10.8h,  v11.4s,  #6
2536        rshrn           v11.4h,  v12.4s,  #6
2537        rshrn2          v11.8h,  v13.4s,  #6
2538
2539        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2540        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2541        umull2          v13.4s,  v4.8h,   v8.8h
2542        umlal2          v13.4s,  v18.8h,  v16.8h
2543        umull           v14.4s,  v6.4h,   v9.4h
2544        umlal           v14.4s,  v19.4h,  v17.4h
2545        umull2          v20.4s,  v6.8h,   v9.8h
2546        umlal2          v20.4s,  v19.8h,  v17.8h
2547
2548        cmge            v18.8h,  v21.8h,  #0      // base_x >= 0
2549        cmge            v19.8h,  v22.8h,  #0
2550
2551        rshrn           v12.4h,  v12.4s,  #6
2552        rshrn2          v12.8h,  v13.4s,  #6
2553        rshrn           v13.4h,  v14.4s,  #6
2554        rshrn2          v13.8h,  v20.4s,  #6
2555
2556        bit             v10.16b, v12.16b, v18.16b // pick the top-based result where base_x >= 0
2557        bit             v11.16b, v13.16b, v19.16b
2558
2559        st1             {v10.8h}, [x0], #16
2560        subs            w4,  w4,  #8
2561        st1             {v11.8h}, [x13], #16
2562        b.le            3f
2563
2564        movi            v10.8h,  #8
2565        mov             v4.16b,  v5.16b
2566        mov             v6.16b,  v7.16b
2567        add             v21.8h,  v21.8h,  v10.8h  // base_x += 8
2568        add             v22.8h,  v22.8h,  v10.8h
2569        b               2b
2570
25713:      // Advance to the next pair of rows
2572        subs            w5,  w5,  #2
2573        b.le            9f
2574        movi            v10.8h, #128
2575        add             x0,  x0,  x1
2576        add             x13, x13, x1
2577        mov             w4,  w12                  // reset w
2578        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
2579        b               1b
2580
25814:      // The rest of the row only predicted from top[]
2582        ld1             {v5.8h}, [x9], #16        // top[base_x]
2583        ld1             {v7.8h}, [x11], #16
2584
2585        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2586        ext             v19.16b, v6.16b,  v7.16b,  #2
2587
2588        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2589        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2590        umull2          v13.4s,  v4.8h,   v8.8h
2591        umlal2          v13.4s,  v18.8h,  v16.8h
2592        umull           v14.4s,  v6.4h,   v9.4h
2593        umlal           v14.4s,  v19.4h,  v17.4h
2594        umull2          v20.4s,  v6.8h,   v9.8h
2595        umlal2          v20.4s,  v19.8h,  v17.8h
2596
2597        rshrn           v12.4h,  v12.4s,  #6
2598        rshrn2          v12.8h,  v13.4s,  #6
2599        rshrn           v13.4h,  v14.4s,  #6
2600        rshrn2          v13.8h,  v20.4s,  #6
2601
2602        st1             {v12.8h}, [x0], #16
2603        subs            w4,  w4,  #8
2604        st1             {v13.8h}, [x13], #16
2605        b.le            3b
2606
2607        mov             v4.16b,  v5.16b
2608        mov             v6.16b,  v7.16b
2609        b               4b
2610
2611169:    // The rest of the block only predicted from left[]
2612        add             x1,  x1,  w4,  uxtw #1    // restore stride
2613        mov             w12, w5                   // orig remaining h
26141:
2615        movi            v12.8h,  #64
2616        movi            v10.8h,  #0x3e            // frac mask
2617
2618        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2619        and             v27.16b, v23.16b, v10.16b // frac_y
2620
2621        smov            w10,     v29.b[0]         // base_y[0]
2622
2623        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2624        movi            v11.8h,  #1, lsl #8       // +1 in the high byte of each halfword
2625        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2626        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2627        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2628
2629        cmp             w10,     #(32-1)
2630
2631        mov             v18.16b, v15.16b          // left[0]
2632        movi            v21.16b, #2
2633
2634        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2635
2636        b.gt            31f
2637
2638        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2639        add             v29.16b, v29.16b, v21.16b // base_y + 1 (*2)
2640
26412:
2642        // base_y < 32, using tbx.
2643        smov            w10,     v29.b[0]         // base_y[0]
2644        mov             v19.16b, v15.16b          // left[0]
2645        cmp             w10,     #(64-4)
2646        b.gt            32f
2647        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2648        add             v29.16b, v29.16b, v21.16b // base_y + 2 (*2)
2649        mov             v20.16b, v15.16b          // left[0]
2650        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2651        add             v29.16b, v29.16b, v21.16b // next base_y
2652
2653        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2654        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2655        umull2          v11.4s,  v18.8h,  v28.8h
2656        umlal2          v11.4s,  v19.8h,  v27.8h
2657        umull           v12.4s,  v19.4h,  v28.4h
2658        umlal           v12.4s,  v20.4h,  v27.4h
2659        umull2          v13.4s,  v19.8h,  v28.8h
2660        umlal2          v13.4s,  v20.8h,  v27.8h
2661
2662        rshrn           v10.4h,  v10.4s,  #6
2663        rshrn2          v10.8h,  v11.4s,  #6
2664        rshrn           v11.4h,  v12.4s,  #6
2665        rshrn2          v11.8h,  v13.4s,  #6
2666
2667        st1             {v10.8h}, [x0], x1
2668        subs            w5,  w5,  #2
2669        st1             {v11.8h}, [x13], x1
2670        b.le            4f
2671        mov             v18.16b, v20.16b          // left[base_y] for the next iteration
2672        b               2b
2673
267431:     // base_y >= 32, using separate loads, loading v18 if we had to bail
2675        // in the prologue.
        // The even bytes of v29 already hold 2*base_y, so the offsets are
        // used as byte offsets without a further shift.
2676        smov            w10,     v29.b[0]
2677        smov            w15,     v29.b[2]
2678        movi            v21.16b, #2
2679        smov            w16,     v29.b[4]
2680        add             x10, x3,  w10, sxtw
2681        smov            w17,     v29.b[6]
2682        add             x15, x3,  w15, sxtw
2683        ld1             {v18.h}[0], [x10]
2684        smov            w10,     v29.b[8]
2685        add             x16, x3,  w16, sxtw
2686        ld1             {v18.h}[1], [x15]
2687        smov            w15,     v29.b[10]
2688        add             x17, x3,  w17, sxtw
2689        ld1             {v18.h}[2], [x16]
2690        smov            w16,     v29.b[12]
2691        add             x10, x3,  w10, sxtw
2692        ld1             {v18.h}[3], [x17]
2693        smov            w17,     v29.b[14]
2694        add             x15, x3,  w15, sxtw
2695        add             x16, x3,  w16, sxtw
2696        ld1             {v18.h}[4], [x10]
2697        add             x17, x3,  w17, sxtw
2698        ld1             {v18.h}[5], [x15]
2699        add             v29.16b, v29.16b, v21.16b // next base_y
2700        ld1             {v18.h}[6], [x16]
2701        ld1             {v18.h}[7], [x17]
2702
270332:     // base_y >= 32, using separate loads.
2704        cmp             w5,  #4
2705        b.lt            34f
270633:     // h >= 4, preserving v18 from the previous round, loading v19-v22.
2707        smov            w10,     v29.b[0]
2708        subs            w5,  w5,  #4
2709        smov            w15,     v29.b[2]
2710        movi            v10.16b, #8
2711        smov            w16,     v29.b[4]
2712        add             x10, x3,  w10, sxtw
2713        smov            w17,     v29.b[6]
2714        add             x15, x3,  w15, sxtw
2715        ld4             {v19.h, v20.h, v21.h, v22.h}[0], [x10]
2716        smov            w10,     v29.b[8]
2717        add             x16, x3,  w16, sxtw
2718        ld4             {v19.h, v20.h, v21.h, v22.h}[1], [x15]
2719        smov            w15,     v29.b[10]
2720        add             x17, x3,  w17, sxtw
2721        ld4             {v19.h, v20.h, v21.h, v22.h}[2], [x16]
2722        smov            w16,     v29.b[12]
2723        add             x10, x3,  w10, sxtw
2724        ld4             {v19.h, v20.h, v21.h, v22.h}[3], [x17]
2725        smov            w17,     v29.b[14]
2726        add             x15, x3,  w15, sxtw
2727        add             x16, x3,  w16, sxtw
2728        ld4             {v19.h, v20.h, v21.h, v22.h}[4], [x10]
2729        add             x17, x3,  w17, sxtw
2730        ld4             {v19.h, v20.h, v21.h, v22.h}[5], [x15]
2731        ld4             {v19.h, v20.h, v21.h, v22.h}[6], [x16]
2732        add             v29.16b, v29.16b, v10.16b // next base_y
2733        ld4             {v19.h, v20.h, v21.h, v22.h}[7], [x17]
2734
2735        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2736        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2737        umull2          v11.4s,  v18.8h,  v28.8h
2738        umlal2          v11.4s,  v19.8h,  v27.8h
2739        umull           v12.4s,  v19.4h,  v28.4h
2740        umlal           v12.4s,  v20.4h,  v27.4h
2741        umull2          v13.4s,  v19.8h,  v28.8h
2742        umlal2          v13.4s,  v20.8h,  v27.8h
2743
2744        rshrn           v10.4h,  v10.4s,  #6
2745        rshrn2          v10.8h,  v11.4s,  #6
2746        rshrn           v11.4h,  v12.4s,  #6
2747        rshrn2          v11.8h,  v13.4s,  #6
2748
2749        umull           v12.4s,  v20.4h,  v28.4h  // left[base_y+2]*(64-frac_y)
2750        umlal           v12.4s,  v21.4h,  v27.4h  // + left[base_y+3]*frac_y
2751        umull2          v13.4s,  v20.8h,  v28.8h
2752        umlal2          v13.4s,  v21.8h,  v27.8h
2753        umull           v14.4s,  v21.4h,  v28.4h
2754        umlal           v14.4s,  v22.4h,  v27.4h
2755        umull2          v18.4s,  v21.8h,  v28.8h
2756        umlal2          v18.4s,  v22.8h,  v27.8h
2757
2758        rshrn           v12.4h,  v12.4s,  #6
2759        rshrn2          v12.8h,  v13.4s,  #6
2760        rshrn           v13.4h,  v14.4s,  #6
2761        rshrn2          v13.8h,  v18.4s,  #6
2762
2763        st1             {v10.8h}, [x0],  x1
2764        cmp             w5,  #2
2765        st1             {v11.8h}, [x13], x1
2766        st1             {v12.8h}, [x0],  x1
2767        st1             {v13.8h}, [x13], x1
2768        b.lt            4f
2769        mov             v18.16b, v22.16b          // left[base_y] for the next iteration
2770        b.gt            33b
2771
277234:     // h == 2, preserving v18 from the previous round, loading v19-v20.
2773        smov            w10,     v29.b[0]
2774        smov            w15,     v29.b[2]
2775        movi            v21.16b, #4
2776        smov            w16,     v29.b[4]
2777        add             x10, x3,  w10, sxtw
2778        smov            w17,     v29.b[6]
2779        add             x15, x3,  w15, sxtw
2780        ld2             {v19.h, v20.h}[0], [x10]
2781        smov            w10,     v29.b[8]
2782        add             x16, x3,  w16, sxtw
2783        ld2             {v19.h, v20.h}[1], [x15]
2784        smov            w15,     v29.b[10]
2785        add             x17, x3,  w17, sxtw
2786        ld2             {v19.h, v20.h}[2], [x16]
2787        smov            w16,     v29.b[12]
2788        add             x10, x3,  w10, sxtw
2789        ld2             {v19.h, v20.h}[3], [x17]
2790        smov            w17,     v29.b[14]
2791        add             x15, x3,  w15, sxtw
2792        add             x16, x3,  w16, sxtw
2793        ld2             {v19.h, v20.h}[4], [x10]
2794        add             x17, x3,  w17, sxtw
2795        ld2             {v19.h, v20.h}[5], [x15]
2796        ld2             {v19.h, v20.h}[6], [x16]
2797        add             v29.16b, v29.16b, v21.16b // next base_y
2798        ld2             {v19.h, v20.h}[7], [x17]
2799
2800        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2801        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2802        umull2          v11.4s,  v18.8h,  v28.8h
2803        umlal2          v11.4s,  v19.8h,  v27.8h
2804        umull           v12.4s,  v19.4h,  v28.4h
2805        umlal           v12.4s,  v20.4h,  v27.4h
2806        umull2          v13.4s,  v19.8h,  v28.8h
2807        umlal2          v13.4s,  v20.8h,  v27.8h
2808
2809        rshrn           v10.4h,  v10.4s,  #6
2810        rshrn2          v10.8h,  v11.4s,  #6
2811        rshrn           v11.4h,  v12.4s,  #6
2812        rshrn2          v11.8h,  v13.4s,  #6
2813
2814        st1             {v10.8h}, [x0], x1
2815        st1             {v11.8h}, [x13], x1
2816        // The h==2 case only happens once at the end, if at all.
2817
28184:      // Advance to the next 8-column strip (16 bytes), or finish
2819        subs            w4,  w4,  #8
2820        b.le            9f
2821
2822        lsr             x1,  x1,  #1
2823        msub            x0,  x1,  x12, x0         // ptr -= h * stride
2824        msub            x13, x1,  x12, x13
2825        lsl             x1,  x1,  #1
2826        add             x0,  x0,  #16
2827        add             x13, x13, #16
2828        mov             w5,  w12                  // reset h
2829        b               1b
2830
28319:      // restore d8-d15 and return
2832        ldp             d14, d15, [sp, #0x30]
2833        ldp             d12, d13, [sp, #0x20]
2834        ldp             d10, d11, [sp, #0x10]
2835        ldp             d8,  d9,  [sp], 0x40
2836        ret
2837endfunc
2838
2839jumptable ipred_z2_fill1_tbl
        // Entries are offsets relative to the table base; widest entry first.
2840        .word 640b - ipred_z2_fill1_tbl
2841        .word 320b - ipred_z2_fill1_tbl
2842        .word 160b - ipred_z2_fill1_tbl
2843        .word 80b  - ipred_z2_fill1_tbl
2844        .word 40b  - ipred_z2_fill1_tbl
2845endjumptable
2846
// Z2 prediction with upsampled top edge (fill2).
// Register usage as seen below: x0=dst, x1=stride, x2=top, x3=left,
// w4=width, w5=height, w6=dx, w7=dy.
2847function ipred_z2_fill2_16bpc_neon, export=1
2848        cmp             w4,  #8
2849        mov             w8,  #(2 << 6)            // xpos = 2 << 6
2850        sub             w8,  w8,  w6              // xpos -= dx
2851
2852        movrel          x11, increments
2853        ld1             {v31.8h},  [x11]          // increments
2854        neg             w7,  w7                   // -dy
2855        b.eq            80f
2856
285740:     // w == 4 codepath
2858        dup             v30.4h,  w7               // -dy
2859        movi            v17.8b,  #1
2860
2861        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2862        movi            v25.8h,  #0x3e            // mask for frac_y/frac_x
2863        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2864
2865        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2866        // from left.
2867        ld1             {v0.8h, v1.8h}, [x3]      // left[]
2868
2869        movi            v26.8h,  #64
2870        movi            v19.16b, #4
2871
2872        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2873        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2874
2875        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2876
2877        movi            v23.4h,  #1, lsl #8       // +1 in the high byte of each halfword
2878        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2879        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2880        movi            v17.8b,  #2
2881        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2882
2883        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2884        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2885
2886        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2887
2888        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2889
2890        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2891
2892        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2893
2894        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2895        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2896
2897        movi            v29.16b, #4
2898        add             v31.8h,  v31.8h,  v31.8h  // {0,2,4,6,0,2,4,6}
28994:      // Main loop: two rows per iteration
2900        asr             w9,  w8,  #6              // base_x
2901        dup             v16.4h,  w8               // xpos
2902        sub             w8,  w8,  w6              // xpos -= dx
2903        cmp             w9,  #-8                  // base_x <= -8
2904        asr             w11, w8,  #6              // base_x
2905        b.le            49f
2906
2907        lsl             w9,  w9,  #1              // pixel index -> byte offset
2908        lsl             w11, w11, #1
2909
2910        dup             v17.4h,  w8               // xpos
2911
2912        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2913        ldr             q6,  [x2, w11, sxtw]
2914
2915        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2916
2917        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2918
2919        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2920
2921        uzp2            v5.8h,   v4.8h,   v6.8h   // top[base_x+1]
2922        uzp1            v4.8h,   v4.8h,   v6.8h   // top[base_x]
2923
2924        and             v16.16b, v16.16b, v25.16b // frac_x
2925
2926        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2927
2928        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2929
2930        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2931
2932        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2933        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2934        umull2          v22.4s,  v18.8h,  v28.8h
2935        umlal2          v22.4s,  v19.8h,  v27.8h
2936
2937        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]*(64-frac_x)
2938        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2939        umull2          v24.4s,  v4.8h,   v17.8h
2940        umlal2          v24.4s,  v5.8h,   v16.8h
2941
2942        cmge            v20.8h,  v20.8h,  #0      // base_x >= 0
2943
2944        rshrn           v21.4h,  v21.4s,  #6
2945        rshrn2          v21.8h,  v22.4s,  #6
2946        rshrn           v22.4h,  v23.4s,  #6
2947        rshrn2          v22.8h,  v24.4s,  #6
2948
2949        bit             v21.16b, v22.16b, v20.16b // pick the top-based result where base_x >= 0
2950
2951        st1             {v21.d}[0], [x0], x1
2951        st1             {v21.d}[0], [x0], x1
2952        sub             w8,  w8,  w6              // xpos -= dx
2953        subs            w5,  w5,  #2
2954        st1             {v21.d}[1], [x0], x1
2955        b.le            9f
2956
2957        ext             v18.16b, v19.16b, v19.16b, #8
2958        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2959        b               4b
2960
296149:
2962        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2963
2964        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2965
2966        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2967        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2968        umull2          v21.4s,  v18.8h,  v28.8h
2969        umlal2          v21.4s,  v19.8h,  v27.8h
2970
2971        rshrn           v20.4h,  v20.4s,  #6
2972        rshrn2          v20.8h,  v21.4s,  #6
2973
2974        st1             {v20.d}[0], [x0], x1
2975        subs            w5,  w5,  #2
2976        st1             {v20.d}[1], [x0], x1
2977        b.le            9f
2978
2979        ext             v18.16b, v19.16b, v19.16b, #8
2980        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2981        b               49b
2982
29839:
2984        ret
2985
298680:
2987        stp             d8,  d9,  [sp, #-0x40]!
2988        stp             d10, d11, [sp, #0x10]
2989        stp             d12, d13, [sp, #0x20]
2990        stp             d14, d15, [sp, #0x30]
2991
2992        dup             v18.8h,  w7               // -dy
2993        movi            v17.8b,  #1
2994
2995        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2996        movi            v25.8h,  #0x3e
2997        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2998
2999        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
3000        // from left.
3001        ld1             {v0.8h, v1.8h}, [x3]      // left[]
3002
3003        movi            v26.8h,  #64
3004        movi            v19.16b, #4
3005
3006        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
3007        and             v27.16b, v16.16b, v25.16b // frac_y
3008
3009        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
3010
3011        movi            v23.8h,  #1, lsl #8
3012        shl             v29.8b,  v29.8b,  #1      // 2*base_y
3013        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
3014        movi            v17.16b, #2
3015        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
3016
3017        // Cut corners here; for the first row we don't expect to need to
3018        // read outside of v0.
3019        tbl             v18.16b, {v0.16b}, v29.16b // left[base_y]
3020
3021        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
3022        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
3023
3024        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
3025
3026        movi            v24.16b, #4
3027        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
30288:
3029        asr             w9,  w8,  #6              // base_x
3030        dup             v16.8h,   w8              // xpos
3031        sub             w8,  w8,  w6              // xpos -= dx
3032        cmp             w9,  #-16                 // base_x <= -16
3033        asr             w11, w8,  #6              // base_x
3034        b.le            89f
3035
3036        dup             v17.8h,   w8              // xpos
3037
3038        add             x9,  x2,  w9,  sxtw #1
3039        add             x11, x2,  w11, sxtw #1
3040
3041        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
3042        ld1             {v6.8h, v7.8h}, [x11]
3043
3044        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3045
3046        sshr            v21.8h,  v16.8h,  #6      // first base_x
3047        sshr            v22.8h,  v17.8h,  #6
3048
3049        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3050
3051        uzp2            v2.8h,   v4.8h,   v5.8h   // top[base_x+1]
3052        uzp1            v4.8h,   v4.8h,   v5.8h   // top[base_x]
3053        uzp2            v3.8h,   v6.8h,   v7.8h
3054        uzp1            v6.8h,   v6.8h,   v7.8h
3055        mov             v5.16b,  v2.16b
3056        mov             v7.16b,  v3.16b
3057
3058        and             v16.16b, v16.16b, v25.16b // frac_x
3059        and             v17.16b, v17.16b, v25.16b
3060
3061        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3062        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3063
3064        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
3065        sub             v9.8h,   v26.8h,  v17.8h
3066
3067        umull2          v11.4s,  v18.8h,  v28.8h
3068        umlal2          v11.4s,  v19.8h,  v27.8h
3069
3070        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
3071        add             v22.8h,  v22.8h,  v31.8h
3072
3073        umull           v12.4s,  v19.4h,  v28.4h
3074        umlal           v12.4s,  v20.4h,  v27.4h
3075        umull2          v13.4s,  v19.8h,  v28.8h
3076        umlal2          v13.4s,  v20.8h,  v27.8h
3077
3078        rshrn           v10.4h,  v10.4s,  #6
3079        rshrn2          v10.8h,  v11.4s,  #6
3080        rshrn           v11.4h,  v12.4s,  #6
3081        rshrn2          v11.8h,  v13.4s,  #6
3082
3083        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
3084        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
3085        umull2          v13.4s,  v4.8h,   v8.8h
3086        umlal2          v13.4s,  v5.8h,   v16.8h
3087        umull           v14.4s,  v6.4h,   v9.4h
3088        umlal           v14.4s,  v7.4h,   v17.4h
3089        umull2          v18.4s,  v6.8h,   v9.8h
3090        umlal2          v18.4s,  v7.8h,   v17.8h
3091
3092        cmge            v21.8h,  v21.8h,  #0
3093        cmge            v22.8h,  v22.8h,  #0
3094
3095        rshrn           v12.4h,  v12.4s,  #6
3096        rshrn2          v12.8h,  v13.4s,  #6
3097        rshrn           v13.4h,  v14.4s,  #6
3098        rshrn2          v13.8h,  v18.4s,  #6
3099
3100        bit             v10.16b, v12.16b, v21.16b
3101        bit             v11.16b, v13.16b, v22.16b
3102
3103        st1             {v10.8h}, [x0], x1
3104        subs            w5,  w5,  #2
3105        sub             w8,  w8,  w6              // xpos -= dx
3106        st1             {v11.8h}, [x0], x1
3107        b.le            9f
3108
3109        mov             v18.16b, v20.16b
3110        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3111        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3112        b               8b
3113
311489:
3115        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3116        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3117
3118        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3119        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3120        umull2          v5.4s,   v18.8h,  v28.8h
3121        umlal2          v5.4s,   v19.8h,  v27.8h
3122        umull           v6.4s,   v19.4h,  v28.4h
3123        umlal           v6.4s,   v20.4h,  v27.4h
3124        umull2          v7.4s,   v19.8h,  v28.8h
3125        umlal2          v7.4s,   v20.8h,  v27.8h
3126
3127        rshrn           v4.4h,   v4.4s,   #6
3128        rshrn2          v4.8h,   v5.4s,   #6
3129        rshrn           v5.4h,   v6.4s,   #6
3130        rshrn2          v5.8h,   v7.4s,   #6
3131
3132        st1             {v4.8h}, [x0], x1
3133        subs            w5,  w5,  #2
3134        st1             {v5.8h}, [x0], x1
3135        b.le            9f
3136
3137        mov             v18.16b, v20.16b
3138        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3139        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3140        b               89b
3141
31429:
3143        ldp             d14, d15, [sp, #0x30]
3144        ldp             d12, d13, [sp, #0x20]
3145        ldp             d10, d11, [sp, #0x10]
3146        ldp             d8,  d9,  [sp], 0x40
3147        ret
3148endfunc
3149
// void ipred_z2_fill3_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const pixel *const left,
//                                const int width, const int height,
//                                const int dx, const int dy);
// Z2 variant for upsample_left: left[] is sampled with a step of 2
// (base_y advances by 2 per output row), blended with top[] where the
// per-pixel base_x is still >= 0.
function ipred_z2_fill3_16bpc_neon, export=1
        cmp             w4,  #8                   // dispatch on width (4 or 8)
        mov             w8,  #(1 << 6)            // xpos = 1 << 6
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments {0,1,2,3,4,5,6,7}
        neg             w7,  w7                   // -dy
        b.eq            80f

40:     // w == 4
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.8h,  #0x3e            // frac mask (0x3e = 62)
        add             v30.4h,  v16.4h,  v30.4h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #2

        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v30.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        movi            v23.4h,  #1, lsl #8       // per halfword: bytes {0, 1}
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        movi            v19.16b, #4
        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
        movi            v17.8b,  #2
        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ... (tbl byte indices)

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}

        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3 (*2)

        trn1            v29.2d,  v29.2d,  v28.2d  // base_y + 0, base_y + 2
        trn1            v30.2d,  v30.2d,  v24.2d  // base_y + 1, base_y + 3

        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y

        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y

        movi            v24.16b, #8               // byte-index increment for 2 rows
4:
        asr             w9,  w8,  #6              // base_x
        dup             v16.4h,  w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-4                  // base_x <= -4
        asr             w11, w8,  #6              // base_x
        b.le            49f                       // whole rows come from left[]

        lsl             w9,  w9,  #1
        lsl             w11, w11, #1

        dup             v17.4h,  w8               // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
        ldr             q6,  [x2, w11, sxtw]

        trn1            v16.2d,  v16.2d,  v17.2d  // xpos

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row

        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v6.16b,  #2

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]

        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x

        add             v20.8h,  v20.8h,  v31.8h  // actual base_x

        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v22.4s,  v18.8h,  v28.8h
        umlal2          v22.4s,  v19.8h,  v27.8h

        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]*(64-frac_x)
        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v24.4s,  v4.8h,   v17.8h
        umlal2          v24.4s,  v5.8h,   v16.8h

        cmge            v20.8h,  v20.8h,  #0      // base_x >= 0: take top, else left

        rshrn           v21.4h,  v21.4s,  #6
        rshrn2          v21.8h,  v22.4s,  #6
        rshrn           v22.4h,  v23.4s,  #6
        rshrn2          v22.8h,  v24.4s,  #6

        movi            v24.16b, #8               // reload increment (v24 was used as accumulator)

        bit             v21.16b, v22.16b, v20.16b

        st1             {v21.d}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               4b

49:     // base_x <= -4: both rows are pure left[] interpolation
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v21.4s,  v18.8h,  v28.8h
        umlal2          v21.4s,  v19.8h,  v27.8h

        rshrn           v20.4h,  v20.4s,  #6
        rshrn2          v20.8h,  v21.4s,  #6

        st1             {v20.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               49b

9:
        ret

80:     // w == 8
        stp             d8,  d9,  [sp, #-0x40]!   // save callee-saved SIMD regs (v8-v15)
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        dup             v18.8h,  w7               // -dy
        movi            v17.16b, #2

        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.8h,  #0x3e
        add             v16.8h,  v16.8h,  v18.8h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #4

        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
        and             v27.16b, v16.16b, v25.16b // frac_y

        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 2

        movi            v23.8h,  #1, lsl #8
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        mov             v18.16b, v0.16b           // left[0] (was v15: stale callee-saved data)
        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...

        add             v30.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y

        movi            v24.16b, #4               // byte-index increment per row pair
8:
        asr             w9,  w8,  #6              // base_x
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-16                 // base_x <= -16
        asr             w11, w8,  #6              // base_x
        b.le            89f

        dup             v17.8h,   w8              // xpos

        add             x9,  x2,  w9,  sxtw #1
        add             x11, x2,  w11, sxtw #1

        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
        ld1             {v6.8h, v7.8h}, [x11]

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b

        sshr            v22.8h,  v16.8h,  #6      // first base_x
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        sshr            v23.8h,  v17.8h,  #6
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v7.16b,  #2

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y

        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
        sub             v9.8h,   v26.8h,  v17.8h

        umull2          v11.4s,  v18.8h,  v28.8h
        umlal2          v11.4s,  v19.8h,  v27.8h

        add             v22.8h,  v22.8h,  v31.8h  // actual base_x
        add             v23.8h,  v23.8h,  v31.8h

        umull           v12.4s,  v20.4h,  v28.4h
        umlal           v12.4s,  v21.4h,  v27.4h
        umull2          v13.4s,  v20.8h,  v28.8h
        umlal2          v13.4s,  v21.8h,  v27.8h

        rshrn           v10.4h,  v10.4s,  #6
        rshrn2          v10.8h,  v11.4s,  #6
        rshrn           v11.4h,  v12.4s,  #6
        rshrn2          v11.8h,  v13.4s,  #6

        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v13.4s,  v4.8h,   v8.8h
        umlal2          v13.4s,  v5.8h,   v16.8h
        umull           v14.4s,  v6.4h,   v9.4h
        umlal           v14.4s,  v7.4h,   v17.4h
        umull2          v18.4s,  v6.8h,   v9.8h
        umlal2          v18.4s,  v7.8h,   v17.8h

        cmge            v22.8h,  v22.8h,  #0      // base_x >= 0: take top, else left
        cmge            v23.8h,  v23.8h,  #0

        rshrn           v12.4h,  v12.4s,  #6
        rshrn2          v12.8h,  v13.4s,  #6
        rshrn           v13.4h,  v14.4s,  #6
        rshrn2          v13.8h,  v18.4s,  #6

        bit             v10.16b, v12.16b, v22.16b
        bit             v11.16b, v13.16b, v23.16b

        st1             {v10.8h}, [x0], x1
        subs            w5,  w5,  #2
        sub             w8,  w8,  w6              // xpos -= dx
        st1             {v11.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               8b

89:     // base_x <= -16: both rows are pure left[] interpolation
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v5.4s,   v18.8h,  v28.8h
        umlal2          v5.4s,   v19.8h,  v27.8h
        umull           v6.4s,   v20.4h,  v28.4h
        umlal           v6.4s,   v21.4h,  v27.4h
        umull2          v7.4s,   v20.8h,  v28.8h
        umlal2          v7.4s,   v21.8h,  v27.8h

        rshrn           v4.4h,   v4.4s,   #6
        rshrn2          v4.8h,   v5.4s,   #6
        rshrn           v5.4h,   v6.4s,   #6
        rshrn2          v5.8h,   v7.4s,   #6

        st1             {v4.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               89b

9:
        ldp             d14, d15, [sp, #0x30]     // restore callee-saved SIMD regs
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc
3449
3450// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3451//                                const pixel *const left,
3452//                                const int width, const int height,
3453//                                const int dy, const int max_base_y);
function ipred_z3_fill1_16bpc_neon, export=1
        clz             w9,  w4                   // dispatch on height
        movrel          x8,  ipred_z3_fill1_tbl
        sub             w9,  w9,  #25             // table index: 0 (h=64) .. 4 (h=4)
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
        add             x8,  x8,  x9
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // ypos = dy
        mov             w15, #64
        add             x13, x0,  x1              // dst pointer for odd rows
        lsl             x1,  x1,  #1              // 2*stride: x0/x13 each step 2 rows
        br              x8

40:     // h == 4: two columns of 4 pixels per iteration
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1              // scale to byte offsets
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // left[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #2 // left[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #2
        sub             v6.4h,   v1.4h,   v0.4h   // left[base+1]-left[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // left[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (left[base+1]-left[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        subs            w3,  w3,  #2              // width -= 2 (two columns done)
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[2], [x0]
        st1             {v18.s}[3], [x13]
        b.le            9f
        sub             x0,  x0,  x1              // ptr -= 2*stride (back to rows 0/1)
        sub             x13, x13, x1
        add             x0,  x0,  #4              // advance two pixels to the right
        add             x13, x13, #4
        b               4b
9:
        ret

80:     // h == 8: two columns of 8 pixels per iteration
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1    // &left[base]
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h},  [x8]            // left[base]
        ld1             {v2.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        ldr             h1, [x8, #16]             // left[base+8]
        ldr             h3, [x10, #16]
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        ext             v1.16b,  v0.16b,  v1.16b,  #2 // left[base+1]
        ext             v3.16b,  v2.16b,  v3.16b,  #2
        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
        umlal           v16.4s,  v1.4h,   v4.4h   // + left[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v1.8h,   v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v3.4h,   v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v3.8h,   v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        subs            w3,  w3,  #2              // width -= 2 (two columns done)
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        zip2            v19.8h,  v16.8h,  v17.8h
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        b.le            9f
        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4              // advance two pixels to the right
        add             x13, x13, #4
        b               8b
9:
        ret

160:    // h == 16, 32 or 64: two columns, 16 rows per inner iteration
320:
640:
        AARCH64_VALID_JUMP_TARGET
        mov             w12, w4                   // remember the full height
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1    // &left[base]
        add             x10, x2,  w10, uxtw #1
        dup             v6.8h,   w9               // frac
        dup             v7.8h,   w11
        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // left[base]
        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v16.8h,  w9               // 64 - frac
        dup             v17.8h,  w11
        add             w7,  w7,  w5              // ypos += dy
2:
        ext             v18.16b, v0.16b,  v1.16b,  #2 // left[base+1]
        ext             v19.16b, v1.16b,  v2.16b,  #2
        ext             v20.16b, v3.16b,  v4.16b,  #2
        ext             v21.16b, v4.16b,  v5.16b,  #2
        subs            w4,  w4,  #16             // height -= 16 (rows this pass)
        umull           v22.4s,  v0.4h,   v16.4h  // left[base]*(64-frac)
        umlal           v22.4s,  v18.4h,  v6.4h   // + left[base+1]*frac
        umull2          v23.4s,  v0.8h,   v16.8h
        umlal2          v23.4s,  v18.8h,  v6.8h
        umull           v24.4s,  v1.4h,   v16.4h
        umlal           v24.4s,  v19.4h,  v6.4h
        umull2          v25.4s,  v1.8h,   v16.8h
        umlal2          v25.4s,  v19.8h,  v6.8h
        umull           v26.4s,  v3.4h,   v17.4h
        umlal           v26.4s,  v20.4h,  v7.4h
        umull2          v27.4s,  v3.8h,   v17.8h
        umlal2          v27.4s,  v20.8h,  v7.8h
        umull           v28.4s,  v4.4h,   v17.4h
        umlal           v28.4s,  v21.4h,  v7.4h
        umull2          v29.4s,  v4.8h,   v17.8h
        umlal2          v29.4s,  v21.8h,  v7.8h
        rshrn           v22.4h,  v22.4s,  #6
        rshrn2          v22.8h,  v23.4s,  #6
        rshrn           v23.4h,  v24.4s,  #6
        rshrn2          v23.8h,  v25.4s,  #6
        rshrn           v24.4h,  v26.4s,  #6
        rshrn2          v24.8h,  v27.4s,  #6
        rshrn           v25.4h,  v28.4s,  #6
        rshrn2          v25.8h,  v29.4s,  #6
        zip1            v18.8h,  v22.8h,  v24.8h  // interleave the two columns
        zip2            v19.8h,  v22.8h,  v24.8h
        zip1            v20.8h,  v23.8h,  v25.8h
        zip2            v21.8h,  v23.8h,  v25.8h
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        st1             {v20.s}[0], [x0],  x1
        st1             {v20.s}[1], [x13], x1
        st1             {v20.s}[2], [x0],  x1
        st1             {v20.s}[3], [x13], x1
        st1             {v21.s}[0], [x0],  x1
        st1             {v21.s}[1], [x13], x1
        st1             {v21.s}[2], [x0],  x1
        st1             {v21.s}[3], [x13], x1
        b.le            3f
        mov             v0.16b,  v2.16b           // shift the sliding window of left[]
        ld1             {v1.8h, v2.8h}, [x8],  #32      // left[base]
        mov             v3.16b,  v5.16b
        ld1             {v4.8h, v5.8h}, [x10], #32
        b               2b

3:
        subs            w3,  w3,  #2              // width -= 2 (two columns done)
        b.le            9f
        lsr             x1,  x1,  #1              // back to the plain stride
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1              // restore 2*stride
        add             x0,  x0,  #4              // advance two pixels to the right
        add             x13, x13, #4
        mov             w4,  w12                  // reset the remaining height
        b               1b
9:
        ret
endfunc
3662
// Offsets (relative to the table itself, resolved with ldrsw+add+br) to the
// per-width loops of ipred_z3_fill1 above, ordered widest first so the
// entry can be selected from clz(width) — presumably clz(w)-26 as in the
// filter tables; the indexing code is above the visible region, so confirm.
jumptable ipred_z3_fill1_tbl
        .word 640b - ipred_z3_fill1_tbl   // width 64
        .word 320b - ipred_z3_fill1_tbl   // width 32
        .word 160b - ipred_z3_fill1_tbl   // width 16
        .word 80b  - ipred_z3_fill1_tbl   // width 8
        .word 40b  - ipred_z3_fill1_tbl   // width 4
endjumptable
3670
// Fill the remaining w3 x w4 rectangle with the constant padding pixel,
// for the z3 case where base >= max_base_y for all remaining columns.
// In: x0  = dst (even rows), x13 = dst + stride (odd rows),
//     x1  = 2*stride, w3 = remaining width (pixels), w4 = height (rows),
//     v31 = padding value broadcast by the caller (see ipred_z3_fill2).
// Uses w12 to keep the constant height and x8/x9 for the dispatch.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #8
        movrel          x8,  ipred_z3_fill_padding_tbl
        b.gt            ipred_z3_fill_padding_wide // widths > 8 use row-wise fill
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3
        sub             w9,  w9,  #25             // index: 64->0 ... 2->5
        ldrsw           x9,  [x8, w9, uxtw #2]    // load relative offset
        add             x9,  x8,  x9
        br              x9

20:     // 2 pixels per row
        AARCH64_VALID_JUMP_TARGET
2:
        st1             {v31.s}[0], [x0],  x1     // 4 rows per iteration
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2              // 2 pixel columns done
        lsr             x1,  x1,  #1              // single stride for the msub
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1              // back to 2*stride
        add             x0,  x0,  #4              // advance 2 pixels
        add             x13, x13, #4
        mov             w4,  w12                  // restore height counter
        b               1b

40:     // 4 pixels per row
        AARCH64_VALID_JUMP_TARGET
4:
        st1             {v31.4h}, [x0],  x1       // 4 rows per iteration
        subs            w4,  w4,  #4
        st1             {v31.4h}, [x13], x1
        st1             {v31.4h}, [x0],  x1
        st1             {v31.4h}, [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4              // 4 pixel columns done
        lsr             x1,  x1,  #1              // single stride for the msub
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1              // back to 2*stride
        add             x0,  x0,  #8              // advance 4 pixels
        add             x13, x13, #8
        mov             w4,  w12                  // restore height counter
        b               1b

80:     // 8 pixels per row (widths 8..64 land here; >8 was diverted above)
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
8:
        st1             {v31.8h}, [x0],  x1       // 4 rows per iteration
        subs            w4,  w4,  #4
        st1             {v31.8h}, [x13], x1
        st1             {v31.8h}, [x0],  x1
        st1             {v31.8h}, [x13], x1
        b.gt            8b
        subs            w3,  w3,  #8              // 8 pixel columns done
        lsr             x1,  x1,  #1              // single stride for the msub
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1              // back to 2*stride
        add             x0,  x0,  #16             // advance 8 pixels
        add             x13, x13, #16
        mov             w4,  w12                  // restore height counter
        b               1b

9:
        ret
endfunc
3754
// Offsets (relative to the table itself) to the per-width fill loops of
// ipred_z3_fill_padding_neon above, indexed by clz(width)-25 (64 -> 0,
// 32 -> 1, ..., 2 -> 5).
jumptable ipred_z3_fill_padding_tbl
        .word 640b - ipred_z3_fill_padding_tbl    // width 64
        .word 320b - ipred_z3_fill_padding_tbl    // width 32
        .word 160b - ipred_z3_fill_padding_tbl    // width 16
        .word 80b  - ipred_z3_fill_padding_tbl    // width 8
        .word 40b  - ipred_z3_fill_padding_tbl    // width 4
        .word 20b  - ipred_z3_fill_padding_tbl    // width 2
endjumptable
3763
// Row-wise variant of the padding fill for widths > 8: writes each full
// line with (possibly overlapping) 8-pixel stores instead of column blocks.
// In: x0 = dst, x1 = 2*stride (halved below), w3 = width, w4 = height,
//     v31 = padding value. Clobbers w5; does not preserve x1 (tail helper,
//     it returns directly to the original caller).
function ipred_z3_fill_padding_wide
        // Fill a WxH rectangle with padding, with W > 8.
        lsr             x1,  x1,  #1              // back to single stride
        mov             w12, w3                   // save full width
        sub             x1,  x1,  w3,  uxtw #1    // stride minus width in bytes
1:
        ands            w5,  w3,  #7              // w5 = width % 8
        b.eq            2f
        // If the width isn't aligned to 8, first do one 8 pixel write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.8h}, [x0]            // overlaps the next store; same value
        add             x0,  x0,  w5,  uxtw #1
2:
        // Fill the rest of the line with aligned 8 pixel writes.
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1              // advance to the next line
        b.le            9f
        mov             w3,  w12                  // restore width for the next line
        b               1b
9:
        ret
endfunc
3790
// Z3 fill for the upsampled-edge case: the left edge buffer holds
// interleaved even/odd samples, so loaded pairs are deinterleaved with
// uzp1/uzp2 into left[base]/left[base+1]. Only heights 4 and 8 occur here.
// In: x0 = dst, x1 = stride, x2 = left edge, w3 = width, w4 = height,
//     w5 = dy, w6 = max_base_y. Output is written in 2-pixel columns.
function ipred_z3_fill2_16bpc_neon, export=1
        cmp             w4,  #8
        // Fix: index and replicate 16-bit pixels (was uxtw / v31.16b,
        // i.e. 8bpc byte addressing/replication, giving a wrong padding value).
        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // ypos = dy (first step)
        mov             w15, #64
        add             x13, x0,  x1              // odd-row dst pointer
        lsl             x1,  x1,  #1              // 2*stride
        b.eq            8f

4:      // h == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac (even, one frac bit fewer)
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1              // pixel index -> byte offset
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // left[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8h,   v0.8h,   v0.8h   // left[base+1]
        uzp1            v0.8h,   v0.8h,   v0.8h   // left[base]
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // left[base+1]-left[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // left[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (left[base+1]-left[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        subs            w3,  w3,  #2
        zip1            v18.8h,  v16.8h,  v17.8h  // transpose into two 2-pixel columns
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[2], [x0]
        st1             {v18.s}[3], [x13]
        b.le            9f
        sub             x0,  x0,  x1              // ptr -= 2*stride (back to top row)
        sub             x13, x13, x1
        add             x0,  x0,  #4              // advance 2 pixel columns
        add             x13, x13, #4
        b               4b
9:
        ret

8:      // h == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h},  [x8]     // left[base]
        ld1             {v2.8h, v3.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // left[base+1]
        uzp1            v0.8h,   v0.8h,   v1.8h   // left[base]
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + left[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        subs            w3,  w3,  #2
        zip1            v18.8h,  v16.8h,  v17.8h  // transpose into two 2-pixel columns
        zip2            v19.8h,  v16.8h,  v17.8h
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        b.le            9f
        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4              // advance 2 pixel columns
        add             x13, x13, #4
        b               8b
9:
        ret
endfunc
3898
3899
3900// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3901//                              const pixel *const topleft,
3902//                              const int width, const int height, const int filt_idx,
3903//                              const int max_width, const int max_height,
3904//                              const int bitdepth_max);
// Recursive filter intra prediction (FILTER_PRED), instantiated per
// bitdepth. Each 4x2 output block is a 7-tap weighted sum of its topleft,
// 4 top and 2 left neighbours; outputs feed back as neighbours of the
// next block, which forces the serial block order seen below.
// In: x0 = dst, x1 = stride, x2 = topleft edge, w3 = width, w4 = height,
//     w5 = filt_idx, w8 = bitdepth_max.
// v16-v22 = the 7 filter tap vectors, widened to s16.
// 10 bpc uses 16-bit mul/mla with explicit clamping (results fit in s16);
// 12 bpc needs 32-bit smull/smlal with sqrshrun narrowing.
.macro filter_fn bpc
function ipred_filter_\bpc\()bpc_neon
        and             w5,  w5,  #511            // filt_idx &= 511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6              // 64 bytes per filter set
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        movrel          x5,  ipred_filter\bpc\()_tbl
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26             // jumptable index from width
        ldrsw           x9,  [x5, w9, uxtw #2]
        sxtl            v16.8h,  v16.8b           // widen s8 taps to s16
        sxtl            v17.8h,  v17.8b
        add             x5,  x5,  x9
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1              // x6 = second-row dst
        lsl             x1,  x1,  #1              // two rows per iteration
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        dup             v31.8h,  w8               // upper clamp = bitdepth_max
.if \bpc == 10
        movi            v30.8h,  #0               // lower clamp (srshr can go negative)
.endif
        br              x5
40:     // w == 4: one 4x2 block per iteration; its outputs become the next top
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #2]             // top (0-3)
        sub             x2,  x2,  #4
        mov             x7,  #-4                  // walk down the left edge
4:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.4h,   v2.4s,   #4      // round, shift, clamp low at 0
        sqrshrun2       v2.8h,   v3.4s,   #4
.endif
        smin            v2.8h,   v2.8h,   v31.8h
        subs            w4,  w4,  #2
        st1             {v2.d}[0], [x0], x1
        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
        st1             {v2.d}[1], [x6], x1
        b.gt            4b
        ret
80:     // w == 8: two dependent 4x2 blocks per iteration
        AARCH64_VALID_JUMP_TARGET
        ldur            q0,  [x2, #2]             // top (0-7)
        sub             x2,  x2,  #4
        mov             x7,  #-4                  // walk down the left edge
8:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        srshr           v2.8h,   v2.8h,   #4      // left block first: its outputs
        smax            v2.8h,   v2.8h,   v30.8h  // are the right block's left edge
        smin            v2.8h,   v2.8h,   v31.8h
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.4h,   v2.4s,   #4      // left block first: its outputs
        sqrshrun2       v2.8h,   v3.4s,   #4      // are the right block's left edge
        smin            v2.8h,   v2.8h,   v31.8h
        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.4h,   v4.4s,   #4
        sqrshrun2       v3.8h,   v5.4s,   #4
.endif
        smin            v3.8h,   v3.8h,   v31.8h
        subs            w4,  w4,  #2
        st2             {v2.d, v3.d}[0], [x0], x1 // interleave left/right halves
        zip2            v0.2d,   v2.2d,   v3.2d   // row 1 becomes the next top
        st2             {v2.d, v3.d}[1], [x6], x1
        b.gt            8b
        ret
160:    // w == 16/32: four dependent 4x2 blocks per 16 output pixels
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #2
        sub             x2,  x2,  #4
        mov             x7,  #-4                  // walk down the left edge
        sub             x1,  x1,  w3, uxtw #1     // stride minus row width in bytes
        mov             w9,  w3                   // save width

1:
        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
2:
        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        srshr           v3.8h,   v3.8h,   #4      // finish block 0; feeds block 1
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v3.8h,   v3.8h,   v31.8h
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        srshr           v4.8h,   v4.8h,   #4      // finish block 1; feeds block 2
        smax            v4.8h,   v4.8h,   v30.8h
        smin            v4.8h,   v4.8h,   v31.8h
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        srshr           v5.8h,   v5.8h,   #4      // finish block 2; feeds block 3
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v5.8h,   v5.8h,   v31.8h
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        srshr           v6.8h,   v6.8h,   #4
        smax            v6.8h,   v6.8h,   v30.8h
.else
        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.4h,   v3.4s,   #4      // finish block 0; feeds block 1
        sqrshrun2       v3.8h,   v4.4s,   #4
        smin            v3.8h,   v3.8h,   v31.8h
        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.4h,   v5.4s,   #4      // finish block 1; feeds block 2
        sqrshrun2       v4.8h,   v6.4s,   #4
        smin            v4.8h,   v4.8h,   v31.8h
        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.4h,   v24.4s,  #4      // finish block 2; feeds block 3
        sqrshrun2       v5.8h,   v25.4s,  #4
        smin            v5.8h,   v5.8h,   v31.8h
        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.4h,   v26.4s,  #4
        sqrshrun2       v6.8h,   v27.4s,  #4
.endif
        smin            v6.8h,   v6.8h,   v31.8h

        ins             v0.h[2], v2.h[7]          // topleft for next 16 cols = top[15]
        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins             v0.h[0], v6.h[7]          // left[1] = row 1's last output
        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins             v0.h[1], v6.h[3]          // left[0] = row 0's last output
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw #1     // next top = start of second out row
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore width
        b               1b
9:
        ret
endfunc

// Offsets (relative to the table itself) to the per-width entry points
// above, indexed by clz(width)-26 (32 -> 0, ..., 4 -> 3).
jumptable ipred_filter\bpc\()_tbl
        .word 320b - ipred_filter\bpc\()_tbl
        .word 160b - ipred_filter\bpc\()_tbl
        .word 80b  - ipred_filter\bpc\()_tbl
        .word 40b  - ipred_filter\bpc\()_tbl
endjumptable
.endm
4201
// Instantiate the filter intra function for both 10 and 12 bpc; the
// exported 16bpc entry point dispatches between them at runtime.
filter_fn 10
filter_fn 12
4204
// Runtime bitdepth dispatcher for filter intra prediction.
// bitdepth_max is the 9th argument, passed on the stack: 0x3ff means
// 10 bpc content, anything larger takes the 12 bpc path.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]                 // bitdepth_max
        cmp             w8,  #0x3ff
        b.gt            ipred_filter_12bpc_neon   // > 10-bit range
        b               ipred_filter_10bpc_neon
endfunc
4211
4212// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4213//                          const pixel *const pal, const uint8_t *idx,
4214//                          const int w, const int h);
function pal_pred_16bpc_neon, export=1
        // Palette prediction for 16bpc: expand packed 4-bit palette indices
        // (two per byte at x3) into 16-bit pixels by table lookup in the
        // 8-entry palette at x2, writing a w4 x w5 block to x0 (stride x1).
        // Since palette entries are u16 but tbl indexes bytes, each index i
        // is turned into the byte-offset pair (2*i, 2*i+1) before the tbl.
        ld1             {v30.8h}, [x2]          // v30 = palette (8 x u16)
        clz             w9,  w4                 // classify width (4..64) by leading zeros
        movrel          x6,  pal_pred_tbl
        sub             w9,  w9,  #25           // jump-table index from clz(width)
        movi            v29.16b, #7             // mask for the 3-bit low-nibble index
        ldrsw           x9,  [x6, w9, uxtw #2]
        movi            v31.8h,  #1, lsl #8     // 0x0100 per u16 lane: +1 on each odd byte
        add             x6,  x6,  x9
        br              x6
40:     // width 4: two rows per q-register, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1            // x2 = second output row
        lsl             x1,  x1,  #1            // advance both pointers by 2 rows
4:
        ld1             {v1.8b}, [x3], #8       // 8 bytes = 16 indices = 4 rows of 4
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4    // odd-position indices (high nibbles)
        and             v2.8b,   v1.8b,   v29.8b // even-position indices (low nibbles)
        zip1            v1.16b,  v2.16b,  v3.16b // re-interleave into pixel order
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        add             v1.16b,  v1.16b,  v1.16b
        zip1            v0.16b,  v1.16b,  v1.16b
        zip2            v1.16b,  v1.16b,  v1.16b
        add             v0.8h,   v0.8h,   v31.8h // per-u16 pair becomes (2*i, 2*i+1)
        add             v1.8h,   v1.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b // byte-wise lookup of u16 entries
        st1             {v0.d}[0], [x0], x1
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.d}[1], [x2], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x2], x1
        b.gt            4b
        ret
80:     // width 8: one row per q-register, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
8:
        ld1             {v2.16b}, [x3], #16     // 32 indices = 4 rows of 8
        subs            w5,  w5,  #4
        ushr            v4.16b,  v2.16b,  #4    // high nibbles
        and             v3.16b,  v2.16b,  v29.16b // low nibbles
        zip1            v2.16b,  v3.16b,  v4.16b
        zip2            v3.16b,  v3.16b,  v4.16b
        add             v2.16b,  v2.16b,  v2.16b // index -> 2*index
        add             v3.16b,  v3.16b,  v3.16b
        zip1            v0.16b,  v2.16b,  v2.16b // duplicate each byte offset
        zip2            v1.16b,  v2.16b,  v2.16b
        zip1            v2.16b,  v3.16b,  v3.16b
        zip2            v3.16b,  v3.16b,  v3.16b
        add             v0.8h,   v0.8h,   v31.8h // form (2*i, 2*i+1) byte pairs
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.8h}, [x0], x1
        tbl             v2.16b, {v30.16b}, v2.16b
        st1             {v1.8h}, [x2], x1
        tbl             v3.16b, {v30.16b}, v3.16b
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x2], x1
        b.gt            8b
        ret
160:    // width 16: two q-registers per row, four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
16:
        ld1             {v4.16b, v5.16b}, [x3], #32 // 64 indices = 4 rows of 16
        subs            w5,  w5,  #4
        ushr            v7.16b,  v4.16b,  #4
        and             v6.16b,  v4.16b,  v29.16b
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b // index -> 2*index
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h // (2*i, 2*i+1) byte pairs
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b // lookups interleaved with the
        add             v5.8h,   v5.8h,   v31.8h  // remaining adds/stores for scheduling
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        st1             {v2.8h, v3.8h}, [x2], x1
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h}, [x0], x1
        st1             {v6.8h, v7.8h}, [x2], x1
        b.gt            16b
        ret
320:    // width 32: four q-registers per row, two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
32:
        ld1             {v4.16b, v5.16b}, [x3], #32 // 64 indices = 2 rows of 32
        subs            w5,  w5,  #2
        ushr            v7.16b,  v4.16b,  #4
        and             v6.16b,  v4.16b,  v29.16b
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b // index -> 2*index
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h // (2*i, 2*i+1) byte pairs
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            32b
        ret
640:    // width 64: one row per iteration, split into two 64-byte halves
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  #64           // x2 = right half of the row
64:
        ld1             {v4.16b, v5.16b}, [x3], #32 // 64 indices = 1 row of 64
        subs            w5,  w5,  #1
        ushr            v7.16b,  v4.16b,  #4
        and             v6.16b,  v4.16b,  v29.16b
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b // index -> 2*index
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h // (2*i, 2*i+1) byte pairs
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            64b
        ret
endfunc
4423
// Jump table for pal_pred_16bpc_neon, indexed by clz(width)-25,
// i.e. entries for width 64, 32, 16, 8, 4 in that order.
jumptable pal_pred_tbl
        .word 640b - pal_pred_tbl
        .word 320b - pal_pred_tbl
        .word 160b - pal_pred_tbl
        .word 80b  - pal_pred_tbl
        .word 40b  - pal_pred_tbl
endjumptable
4431
4432// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4433//                               const pixel *const topleft,
4434//                               const int width, const int height,
4435//                               const int16_t *ac, const int alpha,
4436//                               const int bitdepth_max);
function ipred_cfl_128_16bpc_neon, export=1
        // CfL prediction with a fixed DC of (bitdepth_max + 1) / 2.
        // Also provides the shared L(ipred_cfl_splat_w*) tails, which apply
        // dst = clip(dc + ((ac * alpha + sign-rounding) >> 6), 0, bitdepth_max)
        // and are branched to from the other ipred_cfl_* variants with
        // v0 = dc (splatted), v1 = alpha, v30 = 0, v31 = bitdepth_max.
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3       // classify width by leading zeros
        movrel          x7,  ipred_cfl_128_tbl
        sub             w9,  w9,  #26
        ldrsw           x9,  [x7, w9, uxtw #2]
        urshr           v0.8h,   v31.8h,  #1     // dc = (bitdepth_max + 1) >> 1
        dup             v1.8h,   w6   // alpha
        add             x7,  x7,  x9
        add             x6,  x0,  x1  // x6 = second output row
        lsl             x1,  x1,  #1  // advance by two rows at a time
        movi            v30.8h,  #0   // lower clip bound
        br              x7
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v4.8h, v5.8h}, [x5], #32 // 16 ac coeffs = 4 rows of 4
        subs            w4,  w4,  #4
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clip to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.d}[0],  [x0], x1
        st1             {v2.d}[1],  [x6], x1
        st1             {v3.d}[0],  [x0], x1
        st1             {v3.d}[1],  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v4.8h, v5.8h}, [x5], #32 // 16 ac coeffs = 2 rows of 8
        subs            w4,  w4,  #2
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clip to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h},  [x0], x1
        st1             {v3.8h},  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w16):
        // Handles w == 16/32/64: the inner loop walks 16 pixels of two rows
        // at a time across the row, the outer loop steps down two rows.
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x5,  w3, uxtw #1    // x7 = ac pointer for the second row
        sub             x1,  x1,  w3, uxtw #1    // stride minus the bytes stored per row
        mov             w9,  w3                  // remember width for reloading w3
1:
        ld1             {v2.8h, v3.8h}, [x5], #32 // first-row ac
        ld1             {v4.8h, v5.8h}, [x7], #32 // second-row ac
        subs            w3,  w3,  #16
        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
        smull2          v17.4s,  v2.8h,   v1.8h
        smull           v18.4s,  v3.4h,   v1.4h
        smull2          v19.4s,  v3.8h,   v1.8h
        smull           v2.4s,   v4.4h,   v1.4h
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v20.4s,  v16.4s,  #0     // sign
        cmlt            v21.4s,  v17.4s,  #0
        cmlt            v22.4s,  v18.4s,  #0
        cmlt            v23.4s,  v19.4s,  #0
        cmlt            v24.4s,  v2.4s,   #0
        cmlt            v25.4s,  v3.4s,   #0
        cmlt            v26.4s,  v4.4s,   #0
        cmlt            v27.4s,  v5.4s,   #0
        add             v16.4s,  v16.4s,  v20.4s // diff + sign
        add             v17.4s,  v17.4s,  v21.4s
        add             v18.4s,  v18.4s,  v22.4s
        add             v19.4s,  v19.4s,  v23.4s
        add             v2.4s,   v2.4s,   v24.4s
        add             v3.4s,   v3.4s,   v25.4s
        add             v4.4s,   v4.4s,   v26.4s
        add             v5.4s,   v5.4s,   v27.4s
        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        rshrn           v6.4h,   v2.4s,   #6
        rshrn2          v6.8h,   v3.4s,   #6
        rshrn           v7.4h,   v4.4s,   #6
        rshrn2          v7.8h,   v5.4s,   #6
        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
        add             v3.8h,   v17.8h,  v0.8h
        add             v4.8h,   v6.8h,   v0.8h
        add             v5.8h,   v7.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clip to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smax            v4.8h,   v4.8h,   v30.8h
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v2.8h, v3.8h},  [x0], #32
        st1             {v4.8h, v5.8h},  [x6], #32
        b.gt            1b
        subs            w4,  w4,  #2             // two rows done
        add             x5,  x5,  w9, uxtw #1    // skip the row already consumed via x7/x5
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                  // reset remaining width
        b.gt            1b
        ret
endfunc
4579
// Jump table indexed by clz(width)-26: width 32 and 64 share the w16 loop.
// The ipred_cfl_splat_tbl alias is used by ipred_cfl_left_16bpc_neon.
jumptable ipred_cfl_128_tbl
ipred_cfl_splat_tbl:
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl
endjumptable
4587
4588// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4589//                               const pixel *const topleft,
4590//                               const int width, const int height,
4591//                               const int16_t *ac, const int alpha,
4592//                               const int bitdepth_max);
function ipred_cfl_top_16bpc_neon, export=1
        // CfL prediction where dc is the rounded average of the `width`
        // top neighbour pixels only; widths are powers of two, so the
        // average is a plain rounding shift. Falls through into the shared
        // L(ipred_cfl_splat_w*) loops with v0 = dc.
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3       // classify width by leading zeros
        movrel          x7,  ipred_cfl_top_tbl
        sub             w9,  w9,  #26
        ldrsw           x9,  [x7, w9, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #2  // skip topleft, point at the top row
        add             x7,  x7,  x9
        add             x6,  x0,  x1  // x6 = second output row
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0   // lower clip bound
        br              x7
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h           // sum of 4 top pixels
        urshr           v0.4h,   v0.4h,   #2     // dc = (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h           // sum of 8 top pixels
        urshr           v0.4h,   v0.4h,   #3     // dc = (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h  // pairwise-reduce 16 pixels
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4     // dc = (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h  // pairwise-reduce 32 pixels
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h           // widen to 32 bit: sum may exceed 16 bit
        rshrn           v0.4h,   v0.4s,   #5     // dc = (sum + 16) >> 5
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
endfunc
4639
// Jump table for ipred_cfl_top_16bpc_neon, indexed by clz(width)-26,
// i.e. entries for width 32, 16, 8, 4 in that order.
jumptable ipred_cfl_top_tbl
        .word 32b - ipred_cfl_top_tbl
        .word 16b - ipred_cfl_top_tbl
        .word 8b  - ipred_cfl_top_tbl
        .word 4b  - ipred_cfl_top_tbl
endjumptable
4646
4647// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4648//                                const pixel *const topleft,
4649//                                const int width, const int height,
4650//                                const int16_t *ac, const int alpha,
4651//                                const int bitdepth_max);
function ipred_cfl_left_16bpc_neon, export=1
        // CfL prediction where dc is the rounded average of the `height`
        // left neighbour pixels only. Uses two jump tables: x7 picks the
        // height-specific averaging code below, which then jumps via x9
        // (from ipred_cfl_splat_tbl) to the width-specific splat loop.
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1    // point at the first left pixel
        clz             w9,  w3       // classify width
        clz             w8,  w4       // classify height
        movrel          x10, ipred_cfl_splat_tbl
        movrel          x7,  ipred_cfl_left_tbl
        sub             w9,  w9,  #26
        sub             w8,  w8,  #26
        ldrsw           x9,  [x10, w9, uxtw #2]
        ldrsw           x8,  [x7,  w8, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x9,  x10, x9  // x9 = splat loop for this width
        add             x7,  x7,  x8  // x7 = averaging code for this height
        add             x6,  x0,  x1  // x6 = second output row
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0   // lower clip bound
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h           // sum of 4 left pixels
        urshr           v0.4h,   v0.4h,   #2     // dc = (sum + 2) >> 2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h           // sum of 8 left pixels
        urshr           v0.4h,   v0.4h,   #3     // dc = (sum + 4) >> 3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h  // pairwise-reduce 16 pixels
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4     // dc = (sum + 8) >> 4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h  // pairwise-reduce 32 pixels
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h           // widen: sum may exceed 16 bit
        rshrn           v0.4h,   v0.4s,   #5     // dc = (sum + 16) >> 5
        dup             v0.8h,   v0.h[0]
        br              x9
endfunc
4707
// Jump table for ipred_cfl_left_16bpc_neon, indexed by clz(height)-26,
// i.e. entries for height 32, 16, 8, 4 in that order.
jumptable ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
endjumptable
4714
4715// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4716//                           const pixel *const topleft,
4717//                           const int width, const int height,
4718//                           const int16_t *ac, const int alpha,
4719//                           const int bitdepth_max);
function ipred_cfl_16bpc_neon, export=1
        // Full CfL prediction: dc is the rounded average of width top plus
        // height left neighbours, i.e. (sum + (w+h)/2) / (w+h). Division by
        // w+h is split into a right shift by ctz(w+h) plus, when w != h, a
        // fixed-point multiply for the leftover odd factor:
        //   0xAAAB ~= 2^17/3 and 0x6667 ~= 2^17/5, each followed by >> 17.
        // x9 = width-specific L(ipred_cfl_w*) code, x7 = height-specific
        // L(ipred_cfl_h*) code; the h* part sums the left pixels, then
        // jumps via x9 to the w* part which adds the top pixels.
        dup             v31.8h,  w7              // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1    // point at the first left pixel
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.4s, w8               // width + height
        movrel          x7,  ipred_cfl_tbl
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrsw           x9,  [x7, w9, uxtw #2]
        ldrsw           x6,  [x7, w6, uxtw #2]
        neg             w8,  w8                  // -ctz(width + height)
        add             x9,  x7,  x9
        add             x7,  x7,  x6
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1, rounding bias
        dup             v17.4s,  w8              // -ctz(width + height), for ushl
        add             x6,  x0,  x1             // x6 = second output row
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0              // lower clip bound
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,      v0.4h           // s0 = sum of 4 left pixels
        add             x2,  x2,  #2             // skip topleft, point at top row
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        uaddlv          s2,      v2.4h           // sum of 4 top pixels
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v2.2s  // total sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667             // ~2^17/5 (w+h = 20)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 12)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s // divide by the odd factor
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]         // splat dc
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,      v0.8h           // s0 = sum of 8 left pixels
        add             x2,  x2,  #2             // skip topleft
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        uaddlv          s2,      v2.8h           // sum of 8 top pixels
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667             // ~2^17/5 (w+h = 40)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 12 or 24)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2], #32
        addp            v0.8h,   v2.8h,   v3.8h  // pairwise-reduce 16 left pixels
        add             x2,  x2,  #2             // skip topleft
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        addp            v2.8h,   v2.8h,   v3.8h  // pairwise-reduce 16 top pixels
        uaddlv          s2,      v2.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 4/8/32
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667             // ~2^17/5 (h = 4: w+h = 20)
        mov             w17, #0xAAAB             // ~2^17/3 (h = 8/32: w+h = 24/48)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
        addp            v2.8h,   v2.8h,   v3.8h  // pairwise-reduce 32 left pixels
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        add             x2,  x2,  #2             // skip topleft
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        add             v0.4s,   v0.4s,   v16.4s // left sum + rounding bias
        addp            v2.8h,   v2.8h,   v3.8h  // pairwise-reduce 32 top pixels
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v2.8h,   v2.8h,   v4.8h
        cmp             w4,  #32
        uaddlv          s2,      v2.8h
        add             v0.2s,   v0.2s,   v2.2s
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #8
        mov             w16, #0x6667             // ~2^17/5 (w+h = 40)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 48)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
endfunc
4861
// Jump table for ipred_cfl_16bpc_neon: first four entries (height part,
// indexed by clz(h)-26) for h = 32, 16, 8, 4; last four entries (width
// part, indexed by clz(w)-22, i.e. offset by 4) for w = 32, 16, 8, 4.
jumptable ipred_cfl_tbl
        .word L(ipred_cfl_h32) - ipred_cfl_tbl
        .word L(ipred_cfl_h16) - ipred_cfl_tbl
        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
        .word L(ipred_cfl_w32) - ipred_cfl_tbl
        .word L(ipred_cfl_w16) - ipred_cfl_tbl
        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
endjumptable
4872
4873// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
4874//                            const ptrdiff_t stride, const int w_pad,
4875//                            const int h_pad, const int cw, const int ch);
4876function ipred_cfl_ac_420_16bpc_neon, export=1
4877        clz             w8,  w5
4878        lsl             w4,  w4,  #2
4879        movrel          x7,  ipred_cfl_ac_420_tbl
4880        sub             w8,  w8,  #27
4881        ldrsw           x8,  [x7, w8, uxtw #2]
4882        movi            v24.4s,  #0
4883        movi            v25.4s,  #0
4884        movi            v26.4s,  #0
4885        movi            v27.4s,  #0
4886        add             x7,  x7,  x8
4887        sub             w8,  w6,  w4         // height - h_pad
4888        rbit            w9,  w5              // rbit(width)
4889        rbit            w10, w6              // rbit(height)
4890        clz             w9,  w9              // ctz(width)
4891        clz             w10, w10             // ctz(height)
4892        add             w9,  w9,  w10        // log2sz
4893        add             x10, x1,  x2
4894        dup             v31.4s,  w9
4895        lsl             x2,  x2,  #1
4896        neg             v31.4s,  v31.4s      // -log2sz
4897        br              x7
4898
4899L(ipred_cfl_ac_420_w4):
4900        AARCH64_VALID_JUMP_TARGET
49011:      // Copy and subsample input
4902        ld1             {v0.8h}, [x1],  x2
4903        ld1             {v1.8h}, [x10], x2
4904        ld1             {v2.8h}, [x1],  x2
4905        ld1             {v3.8h}, [x10], x2
4906        addp            v0.8h,   v0.8h,   v2.8h
4907        addp            v1.8h,   v1.8h,   v3.8h
4908        add             v0.8h,   v0.8h,   v1.8h
4909        shl             v0.8h,   v0.8h,   #1
4910        subs            w8,  w8,  #2
4911        st1             {v0.8h}, [x0], #16
4912        uaddw           v24.4s,  v24.4s,  v0.4h
4913        uaddw2          v25.4s,  v25.4s,  v0.8h
4914        b.gt            1b
4915        trn2            v1.2d,   v0.2d,   v0.2d
4916        trn2            v0.2d,   v0.2d,   v0.2d
4917L(ipred_cfl_ac_420_w4_hpad):
4918        cbz             w4,  3f
49192:      // Vertical padding (h_pad > 0)
4920        subs            w4,  w4,  #4
4921        st1             {v0.8h, v1.8h}, [x0], #32
4922        uaddw           v24.4s,  v24.4s,  v0.4h
4923        uaddw2          v25.4s,  v25.4s,  v0.8h
4924        uaddw           v26.4s,  v26.4s,  v1.4h
4925        uaddw2          v27.4s,  v27.4s,  v1.8h
4926        b.gt            2b
49273:
4928L(ipred_cfl_ac_420_w4_calc_subtract_dc):
4929        // Aggregate the sums
4930        add             v24.4s,  v24.4s,  v25.4s
4931        add             v26.4s,  v26.4s,  v27.4s
4932        add             v0.4s,   v24.4s,  v26.4s
4933        addv            s0,  v0.4s                // sum
4934        sub             x0,  x0,  w6, uxtw #3
4935        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
4936        dup             v4.8h,   v4.h[0]
49376:      // Subtract dc from ac
4938        ld1             {v0.8h, v1.8h}, [x0]
4939        subs            w6,  w6,  #4
4940        sub             v0.8h,   v0.8h,   v4.8h
4941        sub             v1.8h,   v1.8h,   v4.8h
4942        st1             {v0.8h, v1.8h}, [x0], #32
4943        b.gt            6b
4944        ret
4945
4946L(ipred_cfl_ac_420_w8):
4947        AARCH64_VALID_JUMP_TARGET
4948        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
49491:      // Copy and subsample input, without padding
4950        ld1             {v0.8h, v1.8h}, [x1],  x2
4951        ld1             {v2.8h, v3.8h}, [x10], x2
4952        ld1             {v4.8h, v5.8h}, [x1],  x2
4953        addp            v0.8h,   v0.8h,   v1.8h
4954        ld1             {v6.8h, v7.8h}, [x10], x2
4955        addp            v2.8h,   v2.8h,   v3.8h
4956        addp            v4.8h,   v4.8h,   v5.8h
4957        addp            v6.8h,   v6.8h,   v7.8h
4958        add             v0.8h,   v0.8h,   v2.8h
4959        add             v4.8h,   v4.8h,   v6.8h
4960        shl             v0.8h,   v0.8h,   #1
4961        shl             v1.8h,   v4.8h,   #1
4962        subs            w8,  w8,  #2
4963        st1             {v0.8h, v1.8h}, [x0], #32
4964        uaddw           v24.4s,  v24.4s,  v0.4h
4965        uaddw2          v25.4s,  v25.4s,  v0.8h
4966        uaddw           v26.4s,  v26.4s,  v1.4h
4967        uaddw2          v27.4s,  v27.4s,  v1.8h
4968        b.gt            1b
4969        mov             v0.16b,  v1.16b
4970        b               L(ipred_cfl_ac_420_w8_hpad)
4971
4972L(ipred_cfl_ac_420_w8_wpad):
49731:      // Copy and subsample input, padding 4
4974        ld1             {v0.8h}, [x1],  x2
4975        ld1             {v1.8h}, [x10], x2
4976        ld1             {v2.8h}, [x1],  x2
4977        ld1             {v3.8h}, [x10], x2
4978        addp            v0.8h,   v0.8h,   v2.8h
4979        addp            v1.8h,   v1.8h,   v3.8h
4980        add             v0.8h,   v0.8h,   v1.8h
4981        shl             v0.8h,   v0.8h,   #1
4982        dup             v1.4h,   v0.h[3]
4983        dup             v3.4h,   v0.h[7]
4984        trn2            v2.2d,   v0.2d,   v0.2d
4985        subs            w8,  w8,  #2
4986        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
4987        uaddw           v24.4s,  v24.4s,  v0.4h
4988        uaddw           v25.4s,  v25.4s,  v1.4h
4989        uaddw           v26.4s,  v26.4s,  v2.4h
4990        uaddw           v27.4s,  v27.4s,  v3.4h
4991        b.gt            1b
4992        trn1            v0.2d,   v2.2d,   v3.2d
4993        trn1            v1.2d,   v2.2d,   v3.2d
4994
4995L(ipred_cfl_ac_420_w8_hpad):
4996        cbz             w4,  3f
49972:      // Vertical padding (h_pad > 0)
4998        subs            w4,  w4,  #4
4999        st1             {v0.8h, v1.8h}, [x0], #32
5000        uaddw           v24.4s,  v24.4s,  v0.4h
5001        uaddw2          v25.4s,  v25.4s,  v0.8h
5002        uaddw           v26.4s,  v26.4s,  v1.4h
5003        uaddw2          v27.4s,  v27.4s,  v1.8h
5004        st1             {v0.8h, v1.8h}, [x0], #32
5005        uaddw           v24.4s,  v24.4s,  v0.4h
5006        uaddw2          v25.4s,  v25.4s,  v0.8h
5007        uaddw           v26.4s,  v26.4s,  v1.4h
5008        uaddw2          v27.4s,  v27.4s,  v1.8h
5009        b.gt            2b
50103:
5011
5012        // Double the height and reuse the w4 summing/subtracting
5013        lsl             w6,  w6,  #1
5014        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5015
5016L(ipred_cfl_ac_420_w16):
5017        AARCH64_VALID_JUMP_TARGET
5018        movrel          x7,  ipred_cfl_ac_420_w16_tbl
5019        ldrsw           x3,  [x7, w3, uxtw #2]
5020        add             x7,  x7,  x3
5021        br              x7
5022
5023L(ipred_cfl_ac_420_w16_wpad0):
5024        AARCH64_VALID_JUMP_TARGET
50251:      // Copy and subsample input, without padding
5026        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
5027        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
5028        addp            v0.8h,   v0.8h,   v1.8h
5029        addp            v2.8h,   v2.8h,   v3.8h
5030        addp            v4.8h,   v4.8h,   v5.8h
5031        addp            v6.8h,   v6.8h,   v7.8h
5032        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
5033        add             v0.8h,   v0.8h,   v4.8h
5034        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
5035        add             v2.8h,   v2.8h,   v6.8h
5036        addp            v16.8h,  v16.8h,  v17.8h
5037        addp            v18.8h,  v18.8h,  v19.8h
5038        addp            v20.8h,  v20.8h,  v21.8h
5039        addp            v22.8h,  v22.8h,  v23.8h
5040        add             v16.8h,  v16.8h,  v20.8h
5041        add             v18.8h,  v18.8h,  v22.8h
5042        shl             v0.8h,   v0.8h,   #1
5043        shl             v1.8h,   v2.8h,   #1
5044        shl             v2.8h,   v16.8h,  #1
5045        shl             v3.8h,   v18.8h,  #1
5046        subs            w8,  w8,  #2
5047        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5048        uaddw           v24.4s,  v24.4s,  v0.4h
5049        uaddw2          v25.4s,  v25.4s,  v0.8h
5050        uaddw           v26.4s,  v26.4s,  v1.4h
5051        uaddw2          v27.4s,  v27.4s,  v1.8h
5052        uaddw           v24.4s,  v24.4s,  v2.4h
5053        uaddw2          v25.4s,  v25.4s,  v2.8h
5054        uaddw           v26.4s,  v26.4s,  v3.4h
5055        uaddw2          v27.4s,  v27.4s,  v3.8h
5056        b.gt            1b
5057        mov             v0.16b,  v2.16b
5058        mov             v1.16b,  v3.16b
5059        b               L(ipred_cfl_ac_420_w16_hpad)
5060
5061L(ipred_cfl_ac_420_w16_wpad1):
5062        AARCH64_VALID_JUMP_TARGET
50631:      // Copy and subsample input, padding 4
5064        ldr             q2,  [x1,  #32]
5065        ld1             {v0.8h, v1.8h}, [x1],  x2
5066        ldr             q5,  [x10, #32]
5067        ld1             {v3.8h, v4.8h}, [x10], x2
5068        addp            v2.8h,   v2.8h,   v2.8h
5069        addp            v0.8h,   v0.8h,   v1.8h
5070        addp            v5.8h,   v5.8h,   v5.8h
5071        addp            v3.8h,   v3.8h,   v4.8h
5072        ldr             q18, [x1,  #32]
5073        add             v2.4h,   v2.4h,   v5.4h
5074        ld1             {v16.8h, v17.8h}, [x1],  x2
5075        add             v0.8h,   v0.8h,   v3.8h
5076        ldr             q21, [x10, #32]
5077        ld1             {v19.8h, v20.8h}, [x10], x2
5078        addp            v18.8h,  v18.8h,  v18.8h
5079        addp            v16.8h,  v16.8h,  v17.8h
5080        addp            v21.8h,  v21.8h,  v21.8h
5081        addp            v19.8h,  v19.8h,  v20.8h
5082        add             v18.4h,  v18.4h,  v21.4h
5083        add             v16.8h,  v16.8h,  v19.8h
5084        shl             v1.4h,   v2.4h,   #1
5085        shl             v0.8h,   v0.8h,   #1
5086        shl             v3.4h,   v18.4h,  #1
5087        shl             v2.8h,   v16.8h,  #1
5088        dup             v4.4h,   v1.h[3]
5089        dup             v5.4h,   v3.h[3]
5090        trn1            v1.2d,   v1.2d,   v4.2d
5091        trn1            v3.2d,   v3.2d,   v5.2d
5092        subs            w8,  w8,  #2
5093        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5094        uaddw           v24.4s,  v24.4s,  v0.4h
5095        uaddw2          v25.4s,  v25.4s,  v0.8h
5096        uaddw           v26.4s,  v26.4s,  v1.4h
5097        uaddw2          v27.4s,  v27.4s,  v1.8h
5098        uaddw           v24.4s,  v24.4s,  v2.4h
5099        uaddw2          v25.4s,  v25.4s,  v2.8h
5100        uaddw           v26.4s,  v26.4s,  v3.4h
5101        uaddw2          v27.4s,  v27.4s,  v3.8h
5102        b.gt            1b
5103        mov             v0.16b,  v2.16b
5104        mov             v1.16b,  v3.16b
5105        b               L(ipred_cfl_ac_420_w16_hpad)
5106
5107L(ipred_cfl_ac_420_w16_wpad2):
5108        AARCH64_VALID_JUMP_TARGET
51091:      // Copy and subsample input, padding 8
5110        ld1             {v0.8h, v1.8h}, [x1],  x2
5111        ld1             {v2.8h, v3.8h}, [x10], x2
5112        ld1             {v4.8h, v5.8h}, [x1],  x2
5113        addp            v0.8h,   v0.8h,   v1.8h
5114        ld1             {v6.8h, v7.8h}, [x10], x2
5115        addp            v2.8h,   v2.8h,   v3.8h
5116        addp            v4.8h,   v4.8h,   v5.8h
5117        addp            v6.8h,   v6.8h,   v7.8h
5118        add             v0.8h,   v0.8h,   v2.8h
5119        add             v4.8h,   v4.8h,   v6.8h
5120        shl             v0.8h,   v0.8h,   #1
5121        shl             v2.8h,   v4.8h,   #1
5122        dup             v1.8h,   v0.h[7]
5123        dup             v3.8h,   v2.h[7]
5124        subs            w8,  w8,  #2
5125        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5126        uaddw           v24.4s,  v24.4s,  v0.4h
5127        uaddw2          v25.4s,  v25.4s,  v0.8h
5128        uaddw           v26.4s,  v26.4s,  v1.4h
5129        uaddw2          v27.4s,  v27.4s,  v1.8h
5130        uaddw           v24.4s,  v24.4s,  v2.4h
5131        uaddw2          v25.4s,  v25.4s,  v2.8h
5132        uaddw           v26.4s,  v26.4s,  v3.4h
5133        uaddw2          v27.4s,  v27.4s,  v3.8h
5134        b.gt            1b
5135        mov             v0.16b,  v2.16b
5136        mov             v1.16b,  v3.16b
5137        b               L(ipred_cfl_ac_420_w16_hpad)
5138
5139L(ipred_cfl_ac_420_w16_wpad3):
5140        AARCH64_VALID_JUMP_TARGET
51411:      // Copy and subsample input, padding 12
5142        ld1             {v0.8h}, [x1],  x2
5143        ld1             {v2.8h}, [x10], x2
5144        ld1             {v4.8h}, [x1],  x2
5145        ld1             {v6.8h}, [x10], x2
5146        addp            v0.8h,   v0.8h,   v4.8h
5147        addp            v2.8h,   v2.8h,   v6.8h
5148        add             v0.8h,   v0.8h,   v2.8h
5149        shl             v0.8h,   v0.8h,   #1
5150        dup             v1.8h,   v0.h[3]
5151        dup             v3.8h,   v0.h[7]
5152        trn2            v2.2d,   v0.2d,   v3.2d
5153        trn1            v0.2d,   v0.2d,   v1.2d
5154        subs            w8,  w8,  #2
5155        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5156        uaddw           v24.4s,  v24.4s,  v0.4h
5157        uaddw2          v25.4s,  v25.4s,  v0.8h
5158        uaddw           v26.4s,  v26.4s,  v1.4h
5159        uaddw2          v27.4s,  v27.4s,  v1.8h
5160        uaddw           v24.4s,  v24.4s,  v2.4h
5161        uaddw2          v25.4s,  v25.4s,  v2.8h
5162        uaddw           v26.4s,  v26.4s,  v3.4h
5163        uaddw2          v27.4s,  v27.4s,  v3.8h
5164        b.gt            1b
5165        mov             v0.16b,  v2.16b
5166        mov             v1.16b,  v3.16b
5167
5168L(ipred_cfl_ac_420_w16_hpad):
5169        cbz             w4,  3f
51702:      // Vertical padding (h_pad > 0)
5171        subs            w4,  w4,  #4
5172        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5173        uaddw           v24.4s,  v24.4s,  v0.4h
5174        uaddw2          v25.4s,  v25.4s,  v0.8h
5175        uaddw           v26.4s,  v26.4s,  v1.4h
5176        uaddw2          v27.4s,  v27.4s,  v1.8h
5177        uaddw           v24.4s,  v24.4s,  v2.4h
5178        uaddw2          v25.4s,  v25.4s,  v2.8h
5179        uaddw           v26.4s,  v26.4s,  v3.4h
5180        uaddw2          v27.4s,  v27.4s,  v3.8h
5181        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5182        uaddw           v24.4s,  v24.4s,  v0.4h
5183        uaddw2          v25.4s,  v25.4s,  v0.8h
5184        uaddw           v26.4s,  v26.4s,  v1.4h
5185        uaddw2          v27.4s,  v27.4s,  v1.8h
5186        uaddw           v24.4s,  v24.4s,  v2.4h
5187        uaddw2          v25.4s,  v25.4s,  v2.8h
5188        uaddw           v26.4s,  v26.4s,  v3.4h
5189        uaddw2          v27.4s,  v27.4s,  v3.8h
5190        b.gt            2b
51913:
5192
5193        // Quadruple the height and reuse the w4 summing/subtracting
5194        lsl             w6,  w6,  #2
5195        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5196endfunc
5197
// Width dispatch table for ipred_cfl_ac_420_16bpc_neon.
// Indexed by clz(width) - 27, so entries are ordered largest width first
// (index 0 = w16, 1 = w8, 2 = w4). Each entry is a signed 32-bit offset
// relative to the table base, added to the table address before `br`.
jumptable ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
endjumptable
5203
// w_pad dispatch table for the width-16 case of ipred_cfl_ac_420.
// Indexed directly by w_pad (0-3); wpadN pads the rightmost 4*N of the
// 16 output columns by replicating the last valid column.
jumptable ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
endjumptable
5210
// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
// Build the CfL AC buffer for 4:2:2 chroma subsampling: each output sample
// is the sum of a horizontal pair of luma pixels, shifted left by 2 (so all
// chroma layouts end up with the same fixed-point scale), stored as int16.
// The rounded block average (DC) is then subtracted in-place by the shared
// 420 tail code that the per-width loops branch into.
// In: x0 = ac, x1 = ypx, x2 = stride (bytes), w3 = w_pad,
//     w4 = h_pad (units of 4 rows on entry), w5 = cw (width), w6 = ch (height)
function ipred_cfl_ac_422_16bpc_neon, export=1
        clz             w8,  w5              // clz(width): 29/28/27 for 4/8/16
        lsl             w4,  w4,  #2         // h_pad *= 4: rows of bottom padding
        movrel          x7,  ipred_cfl_ac_422_tbl
        sub             w8,  w8,  #27        // table index: 2/1/0 for width 4/8/16
        ldrsw           x8,  [x7, w8, uxtw #2]
        movi            v24.4s,  #0          // v24-v27 accumulate 32-bit sums of
        movi            v25.4s,  #0          // every stored sample; used for the
        movi            v26.4s,  #0          // final DC computation
        movi            v27.4s,  #0
        add             x7,  x7,  x8
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // x10 = pointer to odd input rows
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // both pointers step 2 rows per load
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input (4 rows per iteration)
        ld1             {v0.8h}, [x1],  x2   // row 0: 8 luma pixels
        ld1             {v1.8h}, [x10], x2   // row 1
        ld1             {v2.8h}, [x1],  x2   // row 2
        ld1             {v3.8h}, [x10], x2   // row 3
        addp            v0.8h,   v0.8h,   v1.8h  // horiz pair sums, rows 0|1
        addp            v2.8h,   v2.8h,   v3.8h  // rows 2|3
        shl             v0.8h,   v0.8h,   #2     // scale pair sum by 4
        shl             v1.8h,   v2.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        b.gt            1b
        trn2            v0.2d,   v1.2d,   v1.2d  // replicate the last output row
        trn2            v1.2d,   v1.2d,   v1.2d  // into both halves of v0 and v1
        b               L(ipred_cfl_ac_420_w4_hpad)  // shared vertical pad + DC

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding (4 rows per iteration)
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        ld1             {v4.8h, v5.8h}, [x1],  x2
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v6.8h, v7.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #2     // one 8h register per output row
        shl             v1.8h,   v2.8h,   #2
        shl             v2.8h,   v4.8h,   #2
        shl             v3.8h,   v6.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b      // v3 = last output row; replicate
        mov             v1.16b,  v3.16b      // it for the shared vertical padding
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4 (4 rows per iteration)
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v0.h[3]     // right-pad each row with its
        dup             v5.8h,   v0.h[7]     // last valid sample
        dup             v6.4h,   v2.h[3]
        dup             v7.8h,   v2.h[7]
        trn2            v1.2d,   v0.2d,   v5.2d  // rebuild full 8-wide rows:
        trn1            v0.2d,   v0.2d,   v4.2d  // {4 real, 4 pad} per row
        trn2            v3.2d,   v2.2d,   v7.2d
        trn1            v2.2d,   v2.2d,   v6.2d
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b      // last output row, for vertical pad
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        movrel          x7,  ipred_cfl_ac_422_w16_tbl  // second dispatch on w_pad
        ldrsw           x3,  [x7, w3, uxtw #2]
        add             x7,  x7,  x3
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding (2 rows per iteration)
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #2     // v0,v1 = row 0; v2,v3 = row 1
        shl             v1.8h,   v2.8h,   #2
        shl             v2.8h,   v4.8h,   #2
        shl             v3.8h,   v6.8h,   #2
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b      // v2,v3 = last output row; keep it
        mov             v1.16b,  v3.16b      // for the shared vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4 (2 rows per iteration)
        ldr             q2,  [x1,  #32]          // pixels 16-23 of even row
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ldr             q6,  [x10, #32]          // pixels 16-23 of odd row
        ld1             {v4.8h, v5.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v2.8h
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v6.8h,   v6.8h,   v6.8h
        addp            v4.8h,   v4.8h,   v5.8h
        shl             v1.4h,   v2.4h,   #2
        shl             v0.8h,   v0.8h,   #2
        shl             v3.4h,   v6.4h,   #2
        shl             v2.8h,   v4.8h,   #2
        dup             v4.4h,   v1.h[3]         // pad columns 12-15 with the
        dup             v5.4h,   v3.h[3]         // last valid sample
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8 (2 rows per iteration)
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v1.8h,   v0.h[7]         // columns 8-15 = last sample
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12 (2 rows per iteration)
        ld1             {v0.8h}, [x1],  x2
        ld1             {v2.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v0.8h  // 4 pair sums, duplicated in
        addp            v2.8h,   v2.8h,   v2.8h  // both halves
        shl             v0.4h,   v0.4h,   #2
        shl             v2.4h,   v2.4h,   #2
        dup             v1.8h,   v0.h[3]         // columns 4-15 = last sample
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d  // {4 real, 4 pad} per row
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)
endfunc
5446
// Width dispatch table for ipred_cfl_ac_422_16bpc_neon.
// Indexed by clz(width) - 27 (index 0 = w16, 1 = w8, 2 = w4); each entry
// is a signed 32-bit offset relative to the table base.
jumptable ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w8)  - ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w4)  - ipred_cfl_ac_422_tbl
endjumptable
5452
// w_pad dispatch table for the width-16 case of ipred_cfl_ac_422.
// Indexed directly by w_pad (0-3); wpadN pads the rightmost 4*N of the
// 16 output columns by replicating the last valid column.
jumptable ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
endjumptable
5459
// void ipred_cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
5463function ipred_cfl_ac_444_16bpc_neon, export=1
5464        clz             w8,  w5
5465        lsl             w4,  w4,  #2
5466        movrel          x7,  ipred_cfl_ac_444_tbl
5467        sub             w8,  w8,  #26
5468        ldrsw           x8,  [x7, w8, uxtw #2]
5469        movi            v24.4s,  #0
5470        movi            v25.4s,  #0
5471        movi            v26.4s,  #0
5472        movi            v27.4s,  #0
5473        add             x7,  x7,  x8
5474        sub             w8,  w6,  w4         // height - h_pad
5475        rbit            w9,  w5              // rbit(width)
5476        rbit            w10, w6              // rbit(height)
5477        clz             w9,  w9              // ctz(width)
5478        clz             w10, w10             // ctz(height)
5479        add             w9,  w9,  w10        // log2sz
5480        add             x10, x1,  x2
5481        dup             v31.4s,  w9
5482        lsl             x2,  x2,  #1
5483        neg             v31.4s,  v31.4s      // -log2sz
5484        br              x7
5485
5486L(ipred_cfl_ac_444_w4):
5487        AARCH64_VALID_JUMP_TARGET
54881:      // Copy and expand input
5489        ld1             {v0.4h},   [x1],  x2
5490        ld1             {v0.d}[1], [x10], x2
5491        ld1             {v1.4h},   [x1],  x2
5492        ld1             {v1.d}[1], [x10], x2
5493        shl             v0.8h,   v0.8h,   #3
5494        shl             v1.8h,   v1.8h,   #3
5495        subs            w8,  w8,  #4
5496        st1             {v0.8h, v1.8h}, [x0], #32
5497        uaddw           v24.4s,  v24.4s,  v0.4h
5498        uaddw2          v25.4s,  v25.4s,  v0.8h
5499        uaddw           v26.4s,  v26.4s,  v1.4h
5500        uaddw2          v27.4s,  v27.4s,  v1.8h
5501        b.gt            1b
5502        trn2            v0.2d,   v1.2d,   v1.2d
5503        trn2            v1.2d,   v1.2d,   v1.2d
5504        b               L(ipred_cfl_ac_420_w4_hpad)
5505
5506L(ipred_cfl_ac_444_w8):
5507        AARCH64_VALID_JUMP_TARGET
55081:      // Copy and expand input
5509        ld1             {v0.8h}, [x1],  x2
5510        ld1             {v1.8h}, [x10], x2
5511        ld1             {v2.8h}, [x1],  x2
5512        shl             v0.8h,   v0.8h,   #3
5513        ld1             {v3.8h}, [x10], x2
5514        shl             v1.8h,   v1.8h,   #3
5515        shl             v2.8h,   v2.8h,   #3
5516        shl             v3.8h,   v3.8h,   #3
5517        subs            w8,  w8,  #4
5518        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5519        uaddw           v24.4s,  v24.4s,  v0.4h
5520        uaddw2          v25.4s,  v25.4s,  v0.8h
5521        uaddw           v26.4s,  v26.4s,  v1.4h
5522        uaddw2          v27.4s,  v27.4s,  v1.8h
5523        uaddw           v24.4s,  v24.4s,  v2.4h
5524        uaddw2          v25.4s,  v25.4s,  v2.8h
5525        uaddw           v26.4s,  v26.4s,  v3.4h
5526        uaddw2          v27.4s,  v27.4s,  v3.8h
5527        b.gt            1b
5528        mov             v0.16b,  v3.16b
5529        mov             v1.16b,  v3.16b
5530        b               L(ipred_cfl_ac_420_w8_hpad)
5531
5532L(ipred_cfl_ac_444_w16):
5533        AARCH64_VALID_JUMP_TARGET
5534        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
55351:      // Copy and expand input, without padding
5536        ld1             {v0.8h, v1.8h}, [x1],  x2
5537        ld1             {v2.8h, v3.8h}, [x10], x2
5538        shl             v0.8h,   v0.8h,   #3
5539        shl             v1.8h,   v1.8h,   #3
5540        shl             v2.8h,   v2.8h,   #3
5541        shl             v3.8h,   v3.8h,   #3
5542        subs            w8,  w8,  #2
5543        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5544        uaddw           v24.4s,  v24.4s,  v0.4h
5545        uaddw2          v25.4s,  v25.4s,  v0.8h
5546        uaddw           v26.4s,  v26.4s,  v1.4h
5547        uaddw2          v27.4s,  v27.4s,  v1.8h
5548        uaddw           v24.4s,  v24.4s,  v2.4h
5549        uaddw2          v25.4s,  v25.4s,  v2.8h
5550        uaddw           v26.4s,  v26.4s,  v3.4h
5551        uaddw2          v27.4s,  v27.4s,  v3.8h
5552        b.gt            1b
5553        mov             v0.16b,  v2.16b
5554        mov             v1.16b,  v3.16b
5555        b               L(ipred_cfl_ac_420_w16_hpad)
5556
5557L(ipred_cfl_ac_444_w16_wpad):
55581:      // Copy and expand input, padding 8
5559        ld1             {v0.8h}, [x1],  x2
5560        ld1             {v2.8h}, [x10], x2
5561        shl             v0.8h,   v0.8h,   #3
5562        shl             v2.8h,   v2.8h,   #3
5563        dup             v1.8h,   v0.h[7]
5564        dup             v3.8h,   v2.h[7]
5565        subs            w8,  w8,  #2
5566        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5567        uaddw           v24.4s,  v24.4s,  v0.4h
5568        uaddw2          v25.4s,  v25.4s,  v0.8h
5569        uaddw           v26.4s,  v26.4s,  v1.4h
5570        uaddw2          v27.4s,  v27.4s,  v1.8h
5571        uaddw           v24.4s,  v24.4s,  v2.4h
5572        uaddw2          v25.4s,  v25.4s,  v2.8h
5573        uaddw           v26.4s,  v26.4s,  v3.4h
5574        uaddw2          v27.4s,  v27.4s,  v3.8h
5575        b.gt            1b
5576        mov             v0.16b,  v2.16b
5577        mov             v1.16b,  v3.16b
5578        b               L(ipred_cfl_ac_420_w16_hpad)
5579
5580L(ipred_cfl_ac_444_w32):
5581        AARCH64_VALID_JUMP_TARGET
5582        movrel          x7,  ipred_cfl_ac_444_w32_tbl
5583        lsr             w3,  w3,  #1
5584        ldrsw           x3,  [x7, w3, uxtw #2]
5585        lsr             x2,  x2,  #1 // Restore the stride to one line increments
5586        add             x7,  x7,  x3
5587        br              x7
5588
5589L(ipred_cfl_ac_444_w32_wpad0):
5590        AARCH64_VALID_JUMP_TARGET
55911:      // Copy and expand input, without padding
5592        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
5593        shl             v0.8h,   v0.8h,   #3
5594        shl             v1.8h,   v1.8h,   #3
5595        shl             v2.8h,   v2.8h,   #3
5596        shl             v3.8h,   v3.8h,   #3
5597        subs            w8,  w8,  #1
5598        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5599        uaddw           v24.4s,  v24.4s,  v0.4h
5600        uaddw2          v25.4s,  v25.4s,  v0.8h
5601        uaddw           v26.4s,  v26.4s,  v1.4h
5602        uaddw2          v27.4s,  v27.4s,  v1.8h
5603        uaddw           v24.4s,  v24.4s,  v2.4h
5604        uaddw2          v25.4s,  v25.4s,  v2.8h
5605        uaddw           v26.4s,  v26.4s,  v3.4h
5606        uaddw2          v27.4s,  v27.4s,  v3.8h
5607        b.gt            1b
5608        b               L(ipred_cfl_ac_444_w32_hpad)
5609
5610L(ipred_cfl_ac_444_w32_wpad2):
5611        AARCH64_VALID_JUMP_TARGET
56121:      // Copy and expand input, padding 8
        // Load 24 valid source pixels per row (x1 = src, x2 = stride).
5613        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
        // Scale by 8 (<< 3); shift v2 first so its last lane is ready for
        // the padding dup below.
5614        shl             v2.8h,   v2.8h,   #3
5615        shl             v0.8h,   v0.8h,   #3
5616        shl             v1.8h,   v1.8h,   #3
        // Pad the last 8 output pixels by replicating the last valid
        // (already scaled) pixel.
5617        dup             v3.8h,   v2.h[7]
5618        subs            w8,  w8,  #1 // w8 = remaining unpadded input rows
5619        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // x0 = ac output buffer
        // Widen and accumulate (padding included) into the running sums.
5620        uaddw           v24.4s,  v24.4s,  v0.4h
5621        uaddw2          v25.4s,  v25.4s,  v0.8h
5622        uaddw           v26.4s,  v26.4s,  v1.4h
5623        uaddw2          v27.4s,  v27.4s,  v1.8h
5624        uaddw           v24.4s,  v24.4s,  v2.4h
5625        uaddw2          v25.4s,  v25.4s,  v2.8h
5626        uaddw           v26.4s,  v26.4s,  v3.4h
5627        uaddw2          v27.4s,  v27.4s,  v3.8h
5628        b.gt            1b
5629        b               L(ipred_cfl_ac_444_w32_hpad)
5630
5631L(ipred_cfl_ac_444_w32_wpad4):
5632        AARCH64_VALID_JUMP_TARGET
56331:      // Copy and expand input, padding 16
        // Load 16 valid source pixels per row (x1 = src, x2 = stride).
5634        ld1             {v0.8h, v1.8h}, [x1],  x2
        // Scale by 8 (<< 3); shift v1 first so its last lane is ready for
        // the padding dups below.
5635        shl             v1.8h,   v1.8h,   #3
5636        shl             v0.8h,   v0.8h,   #3
        // Pad the last 16 output pixels by replicating the last valid
        // (already scaled) pixel.
5637        dup             v2.8h,   v1.h[7]
5638        dup             v3.8h,   v1.h[7]
5639        subs            w8,  w8,  #1 // w8 = remaining unpadded input rows
5640        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // x0 = ac output buffer
        // Widen and accumulate (padding included) into the running sums.
5641        uaddw           v24.4s,  v24.4s,  v0.4h
5642        uaddw2          v25.4s,  v25.4s,  v0.8h
5643        uaddw           v26.4s,  v26.4s,  v1.4h
5644        uaddw2          v27.4s,  v27.4s,  v1.8h
5645        uaddw           v24.4s,  v24.4s,  v2.4h
5646        uaddw2          v25.4s,  v25.4s,  v2.8h
5647        uaddw           v26.4s,  v26.4s,  v3.4h
5648        uaddw2          v27.4s,  v27.4s,  v3.8h
5649        b.gt            1b
5650        b               L(ipred_cfl_ac_444_w32_hpad)
5651
5652L(ipred_cfl_ac_444_w32_wpad6):
5653        AARCH64_VALID_JUMP_TARGET
56541:      // Copy and expand input, padding 24
        // Load 8 valid source pixels per row (x1 = src, x2 = stride).
5655        ld1             {v0.8h}, [x1],  x2
        // Scale by 8 (<< 3), then pad the remaining 24 output pixels by
        // replicating the last valid (already scaled) pixel.
5656        shl             v0.8h,   v0.8h,   #3
5657        dup             v1.8h,   v0.h[7]
5658        dup             v2.8h,   v0.h[7]
5659        dup             v3.8h,   v0.h[7]
5660        subs            w8,  w8,  #1 // w8 = remaining unpadded input rows
5661        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // x0 = ac output buffer
        // Widen and accumulate (padding included) into the running sums.
5662        uaddw           v24.4s,  v24.4s,  v0.4h
5663        uaddw2          v25.4s,  v25.4s,  v0.8h
5664        uaddw           v26.4s,  v26.4s,  v1.4h
5665        uaddw2          v27.4s,  v27.4s,  v1.8h
5666        uaddw           v24.4s,  v24.4s,  v2.4h
5667        uaddw2          v25.4s,  v25.4s,  v2.8h
5668        uaddw           v26.4s,  v26.4s,  v3.4h
5669        uaddw2          v27.4s,  v27.4s,  v3.8h
5670        b.gt            1b
        // Fall through into the vertical padding code below.
5671
5672L(ipred_cfl_ac_444_w32_hpad):
5673        cbz             w4,  3f // w4 = rows of vertical padding; skip if none
56742:      // Vertical padding (h_pad > 0)
        // v0-v3 still hold the last written (scaled, width-padded) row;
        // store it twice per iteration (w4 counts down in steps of 2) while
        // continuing to accumulate it into the running sums.
5675        subs            w4,  w4,  #2
5676        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5677        uaddw           v24.4s,  v24.4s,  v0.4h
5678        uaddw2          v25.4s,  v25.4s,  v0.8h
5679        uaddw           v26.4s,  v26.4s,  v1.4h
5680        uaddw2          v27.4s,  v27.4s,  v1.8h
5681        uaddw           v24.4s,  v24.4s,  v2.4h
5682        uaddw2          v25.4s,  v25.4s,  v2.8h
5683        uaddw           v26.4s,  v26.4s,  v3.4h
5684        uaddw2          v27.4s,  v27.4s,  v3.8h
5685        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5686        uaddw           v24.4s,  v24.4s,  v0.4h
5687        uaddw2          v25.4s,  v25.4s,  v0.8h
5688        uaddw           v26.4s,  v26.4s,  v1.4h
5689        uaddw2          v27.4s,  v27.4s,  v1.8h
5690        uaddw           v24.4s,  v24.4s,  v2.4h
5691        uaddw2          v25.4s,  v25.4s,  v2.8h
5692        uaddw           v26.4s,  v26.4s,  v3.4h
5693        uaddw2          v27.4s,  v27.4s,  v3.8h
5694        b.gt            2b
56953:
5696
5697        // Multiply the height (w6) by eight and reuse the shared w4
        // dc calc/subtract code from the 420 variant.
5698        lsl             w6,  w6,  #3
5699        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5700endfunc
5701
        // Per-width dispatch table for the 4:4:4 cfl_ac function; entries
        // are relative offsets (label - table base), ordered from the widest
        // case (w32) down to w4 — presumably indexed by a value derived from
        // log2(width) in the dispatch code; confirm against the function entry.
5702jumptable ipred_cfl_ac_444_tbl
5703        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
5704        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
5705        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
5706        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
5707endjumptable
5708
        // Horizontal-padding dispatch table for the width==32 case; entries
        // are relative offsets (label - table base) selecting how many of the
        // 32 output pixels come from real input: 32 (wpad0), 24 (wpad2),
        // 16 (wpad4) or 8 (wpad6).
5709jumptable ipred_cfl_ac_444_w32_tbl
5710        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
5711        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
5712        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
5713        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
5714endjumptable
5715