// xref: /aosp_15_r20/external/libdav1d/src/arm/64/mc.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
31
// Average 16 pixels from two 16-bit intermediate predictions:
//   \dst = sat_u8((tmp1 + tmp2 + rounding) >> 5)
// In:  x2 = tmp1 pointer, x3 = tmp2 pointer (both advanced by 32 bytes).
// Out: \dst = 16 packed u8 pixels. Clobbers \t0-\t3.
.macro avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32   // 16 tmp1 coefficients
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32   // 16 tmp2 coefficients
        add             \t0\().8h,   \t0\().8h,   \t2\().8h
        add             \t1\().8h,   \t1\().8h,   \t3\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #5         // rounding narrow + unsigned saturate
        sqrshrun2       \dst\().16b, \t1\().8h,   #5
.endm
40
// Weighted average of 16 pixels from two 16-bit intermediate predictions.
// In:  x2 = tmp1, x3 = tmp2 (both advanced by 32 bytes);
//      v30.8h = -weight << 11, prepared by bidir_fn (Q15 factor for sqdmulh).
// The difference (tmp2 - tmp1) is scaled by the weight via a saturating
// doubling high multiply, then added back onto tmp2 before narrowing.
// Out: \dst = 16 packed u8 pixels. Clobbers \t0-\t3.
.macro w_avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h  // tmp2 - tmp1
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h     // (2*diff*v30) >> 16
        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4         // rounding narrow + unsigned saturate
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
53
// Per-pixel masked blend of 16 pixels from two 16-bit predictions.
// In:  x2 = tmp1, x3 = tmp2 (advanced by 32), x6 = 8-bit mask (advanced by 16);
//      v31.16b preloaded with #-2 (256-2) by bidir_fn.
// The mask byte m is turned into a negated Q15 weight (-m << 9, spread over
// v28/v29) that scales (tmp2 - tmp1), which is then added back onto tmp2.
// Out: \dst = 16 packed u8 pixels. Clobbers \t0-\t3, v28-v30.
.macro mask dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6],  16                 // 16 mask bytes
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        mul             v30.16b, v30.16b, v31.16b            // m * -2
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        shll            v28.8h, v30.8b,  #8                  // widen to -(m << 9)
        shll2           v29.8h, v30.16b, #8
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h  // tmp2 - tmp1
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h     // apply per-pixel weight
        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4         // rounding narrow + unsigned saturate
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
70
// Emits \type\()_8bpc_neon for \type in {avg, w_avg, mask}:
//   void fn(pixel *dst (x0), ptrdiff_t stride (x1),
//           const int16_t *tmp1 (x2), const int16_t *tmp2 (x3),
//           int w (w4), int h (w5),
//           int weight (w6, w_avg only) / const uint8_t *mask (x6, mask only))
// Dispatches on width through a per-type jump table; each case loops over
// rows, producing 16 combined pixels per invocation of the \type macro.
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        clz             w4,  w4
.ifc \type, w_avg
        dup             v30.8h, w6
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11     // v30 = -weight << 11 (factor for sqdmulh)
.endif
.ifc \type, mask
        movi            v31.16b, #256-2         // -2 per byte, consumed by the mask macro
.endif
        movrel          x7,  \type\()_tbl
        sub             w4,  w4,  #24           // clz(w)-24: w=128 -> 0 ... w=4 -> 5
        ldrsw           x4,  [x7, x4, lsl #2]
        \type           v4,  v0,  v1,  v2,  v3  // first 16 output pixels
        add             x7,  x7,  x4
        br              x7
40:     // width == 4: v4 holds 4 rows of 4 pixels
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1            // x0/x7 = even/odd row pointers
        lsl             x1,  x1,  #1
4:
        cmp             w5,  #4
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        b.eq            0f
        \type           v5,  v0,  v1,  v2,  v3
        cmp             w5,  #8
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        b.eq            0f
        \type           v4,  v0,  v1,  v2,  v3
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        ret
80:     // width == 8: v4 holds 2 rows of 8 pixels
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.8b},    [x0], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.8b},    [x0], x1
        subs            w5,  w5,  #4            // 4 rows per iteration
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               8b
160:    // width == 16: one full register per row
        AARCH64_VALID_JUMP_TARGET
16:
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.16b}, [x0], x1
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        st1             {v6.16b}, [x0], x1
        subs            w5,  w5,  #4            // 4 rows per iteration
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               16b
320:    // width == 32: two registers per row, two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
32:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               32b
640:    // width == 64: four registers per row, two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
64:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               64b
1280:   // width == 128: one row per iteration, written as two 64-byte halves
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  #64
128:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               128b
0:
        ret
endfunc

// Width-dispatch table; entries are offsets relative to the table start,
// ordered from widest (128) to narrowest (4) to match the clz-based index.
jumptable \type\()_tbl
        .word 1280b - \type\()_tbl
        .word 640b  - \type\()_tbl
        .word 320b  - \type\()_tbl
        .word 160b  - \type\()_tbl
        .word 80b   - \type\()_tbl
        .word 40b   - \type\()_tbl
endjumptable
.endm
207
// Instantiate the three bidirectional compound prediction functions.
bidir_fn avg
bidir_fn w_avg
bidir_fn mask
211
212
// Emits w_mask_\type\()_8bpc_neon for \type in {444, 422, 420}:
//   void fn(pixel *dst (x0), ptrdiff_t stride (x1),
//           const int16_t *tmp1 (x2), const int16_t *tmp2 (x3),
//           int w (w4), int h (w5), uint8_t *mask (x6), int sign (w7))
// Blends tmp1/tmp2 with a per-pixel weight derived from |tmp1 - tmp2|
// (constant 6903 is the bias used by the reference w_mask in mc.c —
// the uqsub/ushr sequence yields a 0..64 weight), writes the blended
// pixels to dst, and stores the weight mask to x6 subsampled according
// to \type: full-res (444), horizontally halved (422), or both (420).
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
        clz             w8,  w4
        movrel          x9,  w_mask_\type\()_tbl
        sub             w8,  w8,  #24           // clz(w)-24: width -> table index
        ldrsw           x8,  [x9,  x8,  lsl #2]
        add             x9,  x9,  x8
        mov             w10, #6903              // mask derivation bias
        dup             v0.8h,   w10
.if \type == 444
        movi            v1.16b,  #64            // mask output is 64 - m
.elseif \type == 422
        dup             v2.8b,   w7             // sign adjustment for averaged mask
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b
.elseif \type == 420
        dup             v2.8h,   w7
        movi            v3.8h,   #1, lsl #8     // 256 - sign, for 4-sample sums
        sub             v3.8h,   v3.8h,   v2.8h
.endif
        add             x12,  x0,  x1           // x0/x12 = even/odd row pointers
        lsl             x1,   x1,  #1
        br              x9
40:     // width == 4
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sub             v16.8h,  v6.8h,   v4.8h         // tmp2 - tmp1
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h         // |tmp1 - tmp2|
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h        // saturating 6903 - abs
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8            // weight m in 0..64
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9            // m << 9, Q15 factor
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h        // apply weight to diff
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h         // + tmp1
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4            // narrow to u8 pixels
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        uzp1            v18.16b,  v18.16b, v19.16b      // Same as xtn, xtn2
        sub             v18.16b,  v1.16b,  v18.16b      // 64 - m
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,   v18.8h,  v19.8h       // horizontal pair sums
        xtn             v18.8b,   v18.8h
        uhsub           v18.8b,   v3.8b,   v18.8b       // (bias - sum) / 2
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        trn1            v24.2d,   v18.2d,  v19.2d       // interleave rows for 2x2 sums
        trn2            v25.2d,   v18.2d,  v19.2d
        add             v24.8h,   v24.8h,  v25.8h       // vertical sums
        addp            v18.8h,   v24.8h,  v24.8h       // horizontal sums
        sub             v18.4h,   v3.4h,   v18.4h
        rshrn           v18.8b,   v18.8h,  #2           // /4 with rounding
        str             s18,         [x6],  #4
.endif
        st1             {v22.s}[0],  [x0],  x1
        st1             {v22.s}[1],  [x12], x1
        st1             {v23.s}[0],  [x0],  x1
        st1             {v23.s}[1],  [x12], x1
        b.gt            4b
        ret
80:     // width == 8 (two rows at once); same weight pipeline as above
        AARCH64_VALID_JUMP_TARGET
8:
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        subs            w5,  w5,  #2
        sub             v16.8h,  v6.8h,   v4.8h
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        uzp1            v18.16b, v18.16b, v19.16b       // Same as xtn, xtn2
        sub             v18.16b, v1.16b,  v18.16b
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,  v18.8h,  v19.8h
        xtn             v18.8b,  v18.8h
        uhsub           v18.8b,  v3.8b,   v18.8b
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        add             v18.8h,  v18.8h,  v19.8h        // rows are register-aligned here
        addp            v18.8h,  v18.8h,  v18.8h
        sub             v18.4h,  v3.4h,   v18.4h
        rshrn           v18.8b,  v18.8h,  #2
        str             s18,       [x6],  #4
.endif
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x12], x1
        b.gt            8b
        ret
1280:   // widths 16..128 share one two-row, 16-pixels-per-step loop
640:
320:
160:
        AARCH64_VALID_JUMP_TARGET
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw     // stride minus the bytes written per row
.if \type == 444
        add             x10, x6,  w4,  uxtw     // x10 = mask pointer for the second row
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1  // x7/x9 = tmp1/tmp2 second-row pointers
        add             x7,  x2,  w4,  uxtw #1
161:    // per-row-pair loop
        mov             w8,  w4                 // w8 = columns remaining in this row pair
16:
        ld1             {v4.8h,   v5.8h},   [x2],  #32  // row 0: tmp1
        ld1             {v6.8h,   v7.8h},   [x3],  #32  // row 0: tmp2
        ld1             {v16.8h,  v17.8h},  [x7],  #32  // row 1: tmp1
        ld1             {v18.8h,  v19.8h},  [x9],  #32  // row 1: tmp2
        subs            w8,  w8,  #16
        sub             v6.8h,   v6.8h,   v4.8h         // tmp2 - tmp1
        sub             v7.8h,   v7.8h,   v5.8h
        sub             v18.8h,  v18.8h,  v16.8h
        sub             v19.8h,  v19.8h,  v17.8h
        abs             v20.8h,  v6.8h                  // |tmp1 - tmp2|
        abs             v21.8h,  v7.8h
        abs             v22.8h,  v18.8h
        abs             v23.8h,  v19.8h
        uqsub           v20.8h,  v0.8h,   v20.8h        // 6903 - abs, saturating
        uqsub           v21.8h,  v0.8h,   v21.8h
        uqsub           v22.8h,  v0.8h,   v22.8h
        uqsub           v23.8h,  v0.8h,   v23.8h
        ushr            v20.8h,  v20.8h,  #8            // weight m in 0..64
        ushr            v21.8h,  v21.8h,  #8
        ushr            v22.8h,  v22.8h,  #8
        ushr            v23.8h,  v23.8h,  #8
        shl             v24.8h,  v20.8h,  #9            // m << 9, Q15 factor
        shl             v25.8h,  v21.8h,  #9
        shl             v26.8h,  v22.8h,  #9
        shl             v27.8h,  v23.8h,  #9
        sqdmulh         v24.8h,  v24.8h,  v6.8h
        sqdmulh         v25.8h,  v25.8h,  v7.8h
        sqdmulh         v26.8h,  v26.8h,  v18.8h
        sqdmulh         v27.8h,  v27.8h,  v19.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v16.8h
        add             v27.8h,  v27.8h,  v17.8h
        sqrshrun        v24.8b,  v24.8h,  #4
        sqrshrun        v25.8b,  v25.8h,  #4
        sqrshrun        v26.8b,  v26.8h,  #4
        sqrshrun        v27.8b,  v27.8h,  #4
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b       // Same as xtn, xtn2
        uzp1            v21.16b, v22.16b, v23.16b       // Ditto
        sub             v20.16b, v1.16b,  v20.16b
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h        // horizontal pair sums per row
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b},  [x6],  #8
        st1             {v21.8b},  [x10], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v22.8h        // vertical sums across the row pair
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h        // then horizontal sums
        sub             v20.8h,  v3.8h,   v20.8h
        rshrn           v20.8b,  v20.8h,  #2
        st1             {v20.8b},  [x6],  #8
.endif
        st1             {v24.8b,  v25.8b},  [x0],  #16
        st1             {v26.8b,  v27.8b},  [x12], #16
        b.gt            16b
        subs            w5,  w5,  #2
        add             x2,  x2,  w4,  uxtw #1  // skip the row each pointer's partner consumed
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
endfunc

// Width-dispatch table, widest first to match the clz-based index.
jumptable w_mask_\type\()_tbl
        .word 1280b - w_mask_\type\()_tbl
        .word 640b  - w_mask_\type\()_tbl
        .word 320b  - w_mask_\type\()_tbl
        .word 160b  - w_mask_\type\()_tbl
        .word 80b   - w_mask_\type\()_tbl
        .word 40b   - w_mask_\type\()_tbl
endjumptable
.endm
432
// Instantiate w_mask for each chroma subsampling layout.
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
436
437
// void blend_8bpc_neon(pixel *dst (x0), ptrdiff_t stride (x1),
//                      const pixel *tmp (x2), int w (w3), int h (w4),
//                      const uint8_t *mask (x5))
// Per-pixel masked blend of tmp into dst:
//   dst = (tmp*m + dst*(64 - m) + 32) >> 6
// Dispatches on width (4..32) via blend_tbl; two rows per iteration.
function blend_8bpc_neon, export=1
        movrel          x6,  blend_tbl
        clz             w3,  w3
        sub             w3,  w3,  #26           // clz(w)-26: w=32 -> 0 ... w=4 -> 3
        ldrsw           x3,  [x6,  x3,  lsl #2]
        add             x6,  x6,  x3
        movi            v4.16b,  #64            // for the complementary weight 64 - m
        add             x8,  x0,  x1            // x0/x8 = even/odd row pointers
        lsl             x1,  x1,  #1
        br              x6
40:     // width == 4
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v2.8b},  [x5],  #8     // mask for two rows
        ldr             d1,       [x2],  #8     // tmp for two rows
        ldr             s0,       [x0]          // dst row 0
        subs            w4,  w4,  #2
        ld1             {v0.s}[1],   [x8]       // dst row 1
        sub             v3.8b,   v4.8b,   v2.8b // 64 - m
        umull           v5.8h,   v1.8b,   v2.8b // tmp * m
        umlal           v5.8h,   v0.8b,   v3.8b // + dst * (64 - m)
        rshrn           v6.8b,   v5.8h,   #6    // >> 6 with rounding
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
80:     // width == 8
        AARCH64_VALID_JUMP_TARGET
8:
        ld1             {v2.16b},  [x5],  #16
        ld1             {v1.16b},  [x2],  #16
        ldr             d0,        [x0]
        ld1             {v0.d}[1], [x8]
        sub             v3.16b,  v4.16b,  v2.16b
        subs            w4,  w4,  #2
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        umull2          v6.8h,   v1.16b,  v2.16b
        umlal2          v6.8h,   v0.16b,  v3.16b
        rshrn           v7.8b,   v5.8h,   #6
        rshrn           v16.8b,  v6.8h,   #6
        st1             {v7.8b},   [x0],  x1
        st1             {v16.8b},  [x8],  x1
        b.gt            8b
        ret
160:    // width == 16
        AARCH64_VALID_JUMP_TARGET
16:
        ld1             {v1.16b,  v2.16b},  [x5],  #32
        ld1             {v5.16b,  v6.16b},  [x2],  #32
        ld1             {v0.16b},  [x0]
        subs            w4,  w4,  #2
        sub             v7.16b,  v4.16b,  v1.16b
        sub             v20.16b, v4.16b,  v2.16b
        ld1             {v3.16b},  [x8]
        umull           v16.8h,  v5.8b,   v1.8b
        umlal           v16.8h,  v0.8b,   v7.8b
        umull2          v17.8h,  v5.16b,  v1.16b
        umlal2          v17.8h,  v0.16b,  v7.16b
        umull           v21.8h,  v6.8b,   v2.8b
        umlal           v21.8h,  v3.8b,   v20.8b
        umull2          v22.8h,  v6.16b,  v2.16b
        umlal2          v22.8h,  v3.16b,  v20.16b
        rshrn           v18.8b,  v16.8h,  #6
        rshrn2          v18.16b, v17.8h,  #6
        rshrn           v19.8b,  v21.8h,  #6
        rshrn2          v19.16b, v22.8h,  #6
        st1             {v18.16b}, [x0],  x1
        st1             {v19.16b}, [x8],  x1
        b.gt            16b
        ret
320:    // width == 32
        AARCH64_VALID_JUMP_TARGET
32:
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
        ld1             {v20.16b, v21.16b}, [x0]
        subs            w4,  w4,  #2
        ld1             {v22.16b, v23.16b}, [x8]
        sub             v5.16b,  v4.16b,  v0.16b
        sub             v6.16b,  v4.16b,  v1.16b
        sub             v30.16b, v4.16b,  v2.16b
        sub             v31.16b, v4.16b,  v3.16b
        umull           v24.8h,  v16.8b,  v0.8b
        umlal           v24.8h,  v20.8b,  v5.8b
        umull2          v26.8h,  v16.16b, v0.16b
        umlal2          v26.8h,  v20.16b, v5.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v21.8b,  v6.8b
        umull2          v7.8h,   v17.16b, v1.16b
        umlal2          v7.8h,   v21.16b, v6.16b
        umull           v27.8h,  v18.8b,  v2.8b
        umlal           v27.8h,  v22.8b,  v30.8b
        umull2          v1.8h,   v18.16b, v2.16b
        umlal2          v1.8h,   v22.16b, v30.16b
        umull           v29.8h,  v19.8b,  v3.8b
        umlal           v29.8h,  v23.8b,  v31.8b
        umull2          v21.8h,  v19.16b, v3.16b
        umlal2          v21.8h,  v23.16b, v31.16b
        rshrn           v24.8b,  v24.8h,  #6
        rshrn2          v24.16b, v26.8h,  #6
        rshrn           v25.8b,  v28.8h,  #6
        rshrn2          v25.16b, v7.8h,   #6
        rshrn           v27.8b,  v27.8h,  #6
        rshrn2          v27.16b, v1.8h,   #6
        rshrn           v28.8b,  v29.8h,  #6
        rshrn2          v28.16b, v21.8h,  #6
        st1             {v24.16b, v25.16b}, [x0],  x1
        st1             {v27.16b, v28.16b}, [x8],  x1
        b.gt            32b
        ret
endfunc
550
// Width-dispatch table for blend_8bpc_neon, widest (32) first.
jumptable blend_tbl
        .word 320b - blend_tbl
        .word 160b - blend_tbl
        .word 80b  - blend_tbl
        .word 40b  - blend_tbl
endjumptable
557
// void blend_h_8bpc_neon(pixel *dst (x0), ptrdiff_t stride (x1),
//                        const pixel *tmp (x2), int w (w3), int h (w4))
// Horizontal-edge OBMC blend: one mask value per row, taken from
// X(obmc_masks) indexed by height. Only the top 3/4 of the rows are
// blended (w4 = h - h/4); dst = (tmp*m + dst*(64 - m) + 32) >> 6.
function blend_h_8bpc_neon, export=1
        movrel          x6,  blend_h_tbl
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw     // mask row for this height
        sub             w4,  w4,  w4,  lsr #2   // blend h - h/4 rows
        clz             w7,  w3
        movi            v4.16b,  #64
        add             x8,  x0,  x1            // x0/x8 = even/odd row pointers
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24           // clz(w)-24: w=128 -> 0 ... w=2 -> 6
        ldrsw           x7,  [x6,  x7,  lsl #2]
        add             x6,  x6,  x7
        br              x6
20:     // width == 2
        AARCH64_VALID_JUMP_TARGET
2:
        ldr             h0,  [x5],  #2          // two per-row mask bytes
        ldr             s1,  [x2],  #4          // tmp for two rows
        subs            w4,  w4,  #2
        ldr             h2,  [x0]
        zip1            v0.8b,   v0.8b,   v0.8b // replicate mask across the row
        sub             v3.8b,   v4.8b,   v0.8b // 64 - m
        ld1             {v2.h}[1],   [x8]
        umull           v5.8h,   v1.8b,   v0.8b // tmp * m
        umlal           v5.8h,   v2.8b,   v3.8b // + dst * (64 - m)
        rshrn           v5.8b,   v5.8h,   #6
        st1             {v5.h}[0],   [x0],  x1
        st1             {v5.h}[1],   [x8],  x1
        b.gt            2b
        ret
40:     // width == 4
        AARCH64_VALID_JUMP_TARGET
4:
        ld2r            {v0.8b,   v1.8b},   [x5],  #2   // broadcast the two row masks
        ld1             {v2.8b},   [x2],  #8
        subs            w4,  w4,  #2
        ext             v0.8b,   v0.8b,   v1.8b,   #4   // row0 mask | row1 mask
        ldr             s3,          [x0]
        sub             v5.8b,   v4.8b,   v0.8b
        ld1             {v3.s}[1],   [x8]
        umull           v6.8h,   v2.8b,   v0.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        rshrn           v6.8b,   v6.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
80:     // width == 8
        AARCH64_VALID_JUMP_TARGET
8:
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b},  [x2],  #16
        ldr             d3,        [x0]
        ext             v0.16b,  v0.16b,  v1.16b,  #8   // row0 mask | row1 mask
        sub             v5.16b,  v4.16b,  v0.16b
        ld1             {v3.d}[1], [x8]
        subs            w4,  w4,  #2
        umull           v6.8h,   v0.8b,   v2.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        umull2          v7.8h,   v0.16b,  v2.16b
        umlal2          v7.8h,   v3.16b,  v5.16b
        rshrn           v16.8b,  v6.8h,   #6
        rshrn           v17.8b,  v7.8h,   #6
        st1             {v16.8b},  [x0],  x1
        st1             {v17.8b},  [x8],  x1
        b.gt            8b
        ret
160:    // width == 16
        AARCH64_VALID_JUMP_TARGET
16:
        ld2r            {v0.16b,  v1.16b},  [x5],  #2   // v0/v1 = masks for the two rows
        ld1             {v2.16b,  v3.16b},  [x2],  #32
        ld1             {v5.16b},  [x0]
        sub             v7.16b,  v4.16b,  v0.16b
        sub             v16.16b, v4.16b,  v1.16b
        ld1             {v6.16b},  [x8]
        subs            w4,  w4,  #2
        umull           v17.8h,  v0.8b,   v2.8b
        umlal           v17.8h,  v5.8b,   v7.8b
        umull2          v18.8h,  v0.16b,  v2.16b
        umlal2          v18.8h,  v5.16b,  v7.16b
        umull           v19.8h,  v1.8b,   v3.8b
        umlal           v19.8h,  v6.8b,   v16.8b
        umull2          v20.8h,  v1.16b,  v3.16b
        umlal2          v20.8h,  v6.16b,  v16.16b
        rshrn           v21.8b,  v17.8h,  #6
        rshrn2          v21.16b, v18.8h,  #6
        rshrn           v22.8b,  v19.8h,  #6
        rshrn2          v22.16b, v20.8h,  #6
        st1             {v21.16b}, [x0],  x1
        st1             {v22.16b}, [x8],  x1
        b.gt            16b
        ret
1280:   // widths 32..128 share one inner loop over 32-pixel chunks
640:
320:
        AARCH64_VALID_JUMP_TARGET
        sub             x1,  x1,  w3,  uxtw     // stride minus bytes written per row
        add             x7,  x2,  w3,  uxtw     // x7 = tmp pointer for the second row
321:    // per-row-pair loop
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        mov             w6,  w3                 // w6 = columns remaining
        sub             v20.16b, v4.16b,  v0.16b
        sub             v21.16b, v4.16b,  v1.16b
32:
        ld1             {v16.16b, v17.16b}, [x2],  #32
        ld1             {v2.16b,  v3.16b},  [x0]
        subs            w6,  w6,  #32
        umull           v23.8h,  v0.8b,   v16.8b
        umlal           v23.8h,  v2.8b,   v20.8b
        ld1             {v18.16b, v19.16b}, [x7],  #32
        umull2          v27.8h,  v0.16b,  v16.16b
        umlal2          v27.8h,  v2.16b,  v20.16b
        ld1             {v6.16b,  v7.16b},  [x8]
        umull           v24.8h,  v0.8b,   v17.8b
        umlal           v24.8h,  v3.8b,   v20.8b
        umull2          v28.8h,  v0.16b,  v17.16b
        umlal2          v28.8h,  v3.16b,  v20.16b
        umull           v25.8h,  v1.8b,   v18.8b
        umlal           v25.8h,  v6.8b,   v21.8b
        umull2          v5.8h,   v1.16b,  v18.16b
        umlal2          v5.8h,   v6.16b,  v21.16b
        rshrn           v29.8b,  v23.8h,  #6
        rshrn2          v29.16b, v27.8h,  #6
        umull           v26.8h,  v1.8b,   v19.8b
        umlal           v26.8h,  v7.8b,   v21.8b
        umull2          v31.8h,  v1.16b,  v19.16b
        umlal2          v31.8h,  v7.16b,  v21.16b
        rshrn           v30.8b,  v24.8h,  #6
        rshrn2          v30.16b, v28.8h,  #6
        rshrn           v23.8b,  v25.8h,  #6
        rshrn2          v23.16b, v5.8h,   #6
        rshrn           v24.8b,  v26.8h,  #6
        st1             {v29.16b, v30.16b}, [x0],  #32
        rshrn2          v24.16b, v31.8h,  #6
        st1             {v23.16b, v24.16b}, [x8],  #32
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3,  uxtw     // skip the row the partner pointer consumed
        add             x7,  x7,  w3,  uxtw
        b.gt            321b
        ret
endfunc
703
// Width-dispatch table for blend_h_8bpc_neon, widest (128) first.
jumptable blend_h_tbl
        .word 1280b - blend_h_tbl
        .word 640b  - blend_h_tbl
        .word 320b  - blend_h_tbl
        .word 160b  - blend_h_tbl
        .word 80b   - blend_h_tbl
        .word 40b   - blend_h_tbl
        .word 20b   - blend_h_tbl
endjumptable
713
714function blend_v_8bpc_neon, export=1
715        movrel          x6,  blend_v_tbl
716        movrel          x5,  X(obmc_masks)
717        add             x5,  x5,  w3,  uxtw
718        clz             w3,  w3
719        movi            v4.16b,  #64
720        add             x8,  x0,  x1
721        lsl             x1,  x1,  #1
722        sub             w3,  w3,  #26
723        ldrsw           x3,  [x6,  x3,  lsl #2]
724        add             x6,  x6,  x3
725        br              x6
72620:
727        AARCH64_VALID_JUMP_TARGET
728        ld1r            {v0.8b},   [x5]
729        sub             v1.8b,   v4.8b,   v0.8b
7302:
731        ldr             h2,          [x2],  #2
732        ldr             b3,          [x0]
733        subs            w4,  w4,  #2
734        ld1             {v2.b}[1],   [x2]
735        ld1             {v3.b}[1],   [x8]
736        umull           v5.8h,   v2.8b,   v0.8b
737        umlal           v5.8h,   v3.8b,   v1.8b
738        rshrn           v5.8b,   v5.8h,   #6
739        add             x2,  x2,  #2
740        st1             {v5.b}[0],   [x0],  x1
741        st1             {v5.b}[1],   [x8],  x1
742        b.gt            2b
743        ret
74440:
745        AARCH64_VALID_JUMP_TARGET
746        ld1r            {v0.2s},   [x5]
747        sub             x1,  x1,  #2
748        sub             v1.8b,   v4.8b,   v0.8b
7494:
750        ld1             {v2.8b},   [x2],  #8
751        ldr             s3,          [x0]
752        ld1             {v3.s}[1],   [x8]
753        subs            w4,  w4,  #2
754        umull           v5.8h,   v2.8b,   v0.8b
755        umlal           v5.8h,   v3.8b,   v1.8b
756        rshrn           v5.8b,   v5.8h,   #6
757        str             h5,          [x0],  #2
758        st1             {v5.h}[2],   [x8],  #2
759        st1             {v5.b}[2],   [x0],  x1
760        st1             {v5.b}[6],   [x8],  x1
761        b.gt            4b
762        ret
76380:
764        AARCH64_VALID_JUMP_TARGET
765        ld1r            {v0.2d},   [x5]
766        sub             x1,  x1,  #4
767        sub             v1.16b,  v4.16b,  v0.16b
768        zip2            v16.2d,  v1.2d,   v1.2d
7698:
770        ld1             {v2.16b},  [x2],  #16
771        ldr             d3,          [x0]
772        ldr             d4,          [x8]
773        subs            w4,  w4,  #2
774        umull           v5.8h,  v0.8b,  v2.8b
775        umlal           v5.8h,  v3.8b,  v1.8b
776        umull2          v6.8h,  v0.16b, v2.16b
777        umlal           v6.8h,  v4.8b,  v16.8b
778        rshrn           v7.8b,  v5.8h,  #6
779        rshrn           v17.8b, v6.8h,  #6
780        str             s7,          [x0],  #4
781        str             s17,         [x8],  #4
782        st1             {v7.h}[2],   [x0],  x1
783        st1             {v17.h}[2],  [x8],  x1
784        b.gt            8b
785        ret
786160:
787        AARCH64_VALID_JUMP_TARGET
788        ld1             {v0.16b},  [x5]
789        sub             x1,  x1,  #8
790        sub             v2.16b,  v4.16b,  v0.16b
79116:
792        ld1             {v5.16b,  v6.16b},  [x2],  #32
793        ld1             {v7.16b},  [x0]
794        subs            w4,  w4,  #2
795        ld1             {v16.16b}, [x8]
796        umull           v17.8h,  v5.8b,   v0.8b
797        umlal           v17.8h,  v7.8b,   v2.8b
798        umull2          v18.8h,  v5.16b,  v0.16b
799        umlal2          v18.8h,  v7.16b,  v2.16b
800        umull           v20.8h,  v6.8b,   v0.8b
801        umlal           v20.8h,  v16.8b,  v2.8b
802        umull2          v21.8h,  v6.16b,  v0.16b
803        umlal2          v21.8h,  v16.16b, v2.16b
804        rshrn           v19.8b,  v17.8h,  #6
805        rshrn2          v19.16b, v18.8h,  #6
806        rshrn           v22.8b,  v20.8h,  #6
807        rshrn2          v22.16b, v21.8h,  #6
808        st1             {v19.8b},  [x0],  #8
809        st1             {v22.8b},  [x8],  #8
810        st1             {v19.s}[2],  [x0],  x1
811        st1             {v22.s}[2],  [x8],  x1
812        b.gt            16b
813        ret
814320:
815        AARCH64_VALID_JUMP_TARGET
816        ld1             {v0.16b,  v1.16b},  [x5]
817        sub             x1,  x1,  #16
818        sub             v2.16b,  v4.16b,  v0.16b
819        sub             v3.8b,   v4.8b,   v1.8b
82032:
821        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
822        ld1             {v5.16b,  v6.16b},  [x0]
823        subs            w4,  w4,  #2
824        ld1             {v20.16b, v21.16b}, [x8]
825        umull           v22.8h,  v16.8b,  v0.8b
826        umlal           v22.8h,  v5.8b,   v2.8b
827        umull2          v23.8h,  v16.16b, v0.16b
828        umlal2          v23.8h,  v5.16b,  v2.16b
829        umull           v28.8h,  v17.8b,  v1.8b
830        umlal           v28.8h,  v6.8b,   v3.8b
831        umull           v30.8h,  v18.8b,  v0.8b
832        umlal           v30.8h,  v20.8b,  v2.8b
833        umull2          v31.8h,  v18.16b, v0.16b
834        umlal2          v31.8h,  v20.16b, v2.16b
835        umull           v25.8h,  v19.8b,  v1.8b
836        umlal           v25.8h,  v21.8b,  v3.8b
837        rshrn           v24.8b,  v22.8h,  #6
838        rshrn2          v24.16b, v23.8h,  #6
839        rshrn           v28.8b,  v28.8h,  #6
840        rshrn           v30.8b,  v30.8h,  #6
841        rshrn2          v30.16b, v31.8h,  #6
842        rshrn           v27.8b,  v25.8h,  #6
843        st1             {v24.16b}, [x0],  #16
844        st1             {v30.16b}, [x8],  #16
845        st1             {v28.8b},  [x0],  x1
846        st1             {v27.8b},  [x8],  x1
847        b.gt            32b
848        ret
849endfunc
850
851jumptable blend_v_tbl
        // Position-independent jump table: 32-bit offsets from the table
        // base back to the per-width handlers above (32/16/8/4/2 columns).
        // Indexed by a value derived from clz(w); see the ldrsw/add/br
        // dispatch sequence in the function above.
852        .word 320b - blend_v_tbl
853        .word 160b - blend_v_tbl
854        .word 80b  - blend_v_tbl
855        .word 40b  - blend_v_tbl
856        .word 20b  - blend_v_tbl
857endjumptable
858
859
860// This has got the same signature as the put_8tap functions,
861// and assumes that x8 is set to (clz(w)-24).
862function put_neon, export=1
        // Plain pixel copy ("put" with no subpel filtering), 8bpc.
        // Per the comment above: same signature as the put_8tap functions,
        // with x8 = clz(w) - 24 selecting the jump-table entry.
        // Register use visible below: x0 = dst, x1 = dst stride,
        // x2 = src, x3 = src stride, w5 = row counter (h).
863        movrel          x9,  put_tbl
864        ldrsw           x8,  [x9, x8, lsl #2]   // load relative offset for this width
865        add             x9,  x9,  x8
866        br              x9                      // dispatch to per-width copy loop
867
86820:     // w == 2: copy two rows of 2 bytes per iteration
869        AARCH64_VALID_JUMP_TARGET
8702:
871        ldrh            w9, [x2]
872        ldrh            w10, [x2, x3]
873        add             x2, x2, x3, lsl #1      // src += 2*src_stride
874        subs            w5, w5, #2              // h -= 2
875        strh            w9, [x0]
876        strh            w10, [x0, x1]
877        add             x0, x0, x1, lsl #1      // dst += 2*dst_stride
878        b.gt            2b
879        ret
88040:     // w == 4: two rows of 4 bytes per iteration
881        AARCH64_VALID_JUMP_TARGET
8824:
883        ldr             w9, [x2]
884        ldr             w10, [x2, x3]
885        add             x2, x2, x3, lsl #1
886        subs            w5, w5, #2
887        str             w9, [x0]
888        str             w10, [x0, x1]
889        add             x0, x0, x1, lsl #1
890        b.gt            4b
891        ret
89280:     // w == 8: two rows of 8 bytes per iteration
893        AARCH64_VALID_JUMP_TARGET
8948:
895        ldr             x9, [x2]
896        ldr             x10, [x2, x3]
897        add             x2, x2, x3, lsl #1
898        subs            w5, w5, #2
899        str             x9, [x0]
900        str             x10, [x0, x1]
901        add             x0, x0, x1, lsl #1
902        b.gt            8b
903        ret
904160:    // w == 16: two rows of 16 bytes (one q register each)
905        AARCH64_VALID_JUMP_TARGET
90616:
907        ldr             q0, [x2]
908        ldr             q1, [x2, x3]
909        add             x2, x2, x3, lsl #1
910        subs            w5, w5, #2
911        str             q0, [x0]
912        str             q1, [x0, x1]
913        add             x0, x0, x1, lsl #1
914        b.gt            16b
915        ret
916320:    // w == 32: 32 bytes per row via ldp/stp, two rows per iteration
917        AARCH64_VALID_JUMP_TARGET
91832:
919        ldp             q0, q1, [x2]
920        add             x2, x2, x3
921        stp             q0, q1, [x0]
922        add             x0, x0, x1
923        ldp             q2, q3, [x2]
924        add             x2, x2, x3
925        stp             q2, q3, [x0]
926        subs            w5, w5, #2
927        add             x0, x0, x1
928        b.gt            32b
929        ret
930640:    // w == 64: 64 bytes per row, one row per iteration
931        AARCH64_VALID_JUMP_TARGET
93264:
933        ldp             q0, q1, [x2]
934        stp             q0, q1, [x0]
935        ldp             q2, q3, [x2, #32]
936        add             x2, x2, x3
937        stp             q2, q3, [x0, #32]
938        subs            w5, w5, #1
939        add             x0, x0, x1
940        b.gt            64b
941        ret
9421280:   // w == 128: 128 bytes per row, one row per iteration
943        AARCH64_VALID_JUMP_TARGET
944128:
945        ldp             q0, q1, [x2]
946        stp             q0, q1, [x0]
947        ldp             q2, q3, [x2, #32]
948        stp             q2, q3, [x0, #32]
949        ldp             q4, q5, [x2, #64]
950        stp             q4, q5, [x0, #64]
951        ldp             q6, q7, [x2, #96]
952        add             x2, x2, x3
953        stp             q6, q7, [x0, #96]
954        subs            w5, w5, #1
955        add             x0, x0, x1
956        b.gt            128b
957        ret
958endfunc
959
960jumptable put_tbl
        // Position-independent jump table for put_neon: 32-bit offsets
        // from the table base, indexed by clz(w)-24 (widest width first).
961        .word 1280b - put_tbl
962        .word 640b  - put_tbl
963        .word 320b  - put_tbl
964        .word 160b  - put_tbl
965        .word 80b   - put_tbl
966        .word 40b   - put_tbl
967        .word 20b   - put_tbl
968endjumptable
969
970
971// This has got the same signature as the prep_8tap functions,
972// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
973function prep_neon, export=1
        // "prep" path, 8bpc: convert pixels to intermediate 16-bit values
        // (pixel << 4) and store them contiguously to the tmp buffer.
        // Per the comment above: same signature as the prep_8tap functions,
        // x8 = clz(w) - 24 and x7 = w*2.  Register use visible below:
        // x0 = tmp dst, x1 = src, x2 = src stride, w4 = row counter (h).
        // v24 is splatted with 16 so that "umull vN, src, v24" computes
        // src*16, i.e. the same scaling as "ushll vN, src, #4" — the two
        // forms are alternated to spread work across pipelines.
974        movrel          x9,  prep_tbl
975        ldrsw           x8,  [x9, x8, lsl #2]
976        movi            v24.16b, #16            // multiplier used by the umull variants
977        add             x9,  x9,  x8
978        br              x9                      // dispatch to per-width loop
979
98040:     // w == 4: four rows per iteration, packed two rows per q register
981        AARCH64_VALID_JUMP_TARGET
9824:
983        ldr             s0, [x1]
984        ldr             s2, [x1, x2]
985        add             x1, x1, x2, lsl #1
986        ldr             s1, [x1]
987        ldr             s3, [x1, x2]
988        add             x1, x1, x2, lsl #1
989        mov             v0.s[1], v2.s[0]        // v0 = rows 0+1, v1 = rows 2+3
990        mov             v1.s[1], v3.s[0]
991        ushll           v0.8h, v0.8b, #4        // widen and scale by 16
992        ushll           v1.8h, v1.8b, #4
993        subs            w4, w4, #4
994        stp             q0, q1, [x0], #32
995        b.gt            4b
996        ret
99780:     // w == 8: four rows per iteration
998        AARCH64_VALID_JUMP_TARGET
9998:
1000        ldr             d0, [x1]
1001        ldr             d1, [x1, x2]
1002        add             x1, x1, x2, lsl #1
1003        ldr             d2, [x1]
1004        ldr             d3, [x1, x2]
1005        add             x1, x1, x2, lsl #1
1006        ushll           v0.8h, v0.8b, #4
1007        ushll           v1.8h, v1.8b, #4
1008        umull           v2.8h, v2.8b, v24.8b    // *16, same scaling as ushll #4
1009        umull           v3.8h, v3.8b, v24.8b
1010        subs            w4, w4, #4
1011        stp             q0, q1, [x0]
1012        stp             q2, q3, [x0, #32]
1013        add             x0, x0, #64
1014        b.gt            8b
1015        ret
1016160:    // w == 16: four rows per iteration, 32 output bytes per row
1017        AARCH64_VALID_JUMP_TARGET
101816:
1019        ldr             q1, [x1]
1020        ldr             q3, [x1, x2]
1021        add             x1, x1, x2, lsl #1
1022        ushll           v0.8h, v1.8b, #4
1023        ushll2          v1.8h, v1.16b, #4
1024        ldr             q5, [x1]
1025        ldr             q7, [x1, x2]
1026        add             x1, x1, x2, lsl #1
1027        umull           v2.8h, v3.8b, v24.8b
1028        umull2          v3.8h, v3.16b, v24.16b
1029        ushll           v4.8h, v5.8b, #4
1030        ushll2          v5.8h, v5.16b, #4
1031        umull           v6.8h, v7.8b, v24.8b
1032        umull2          v7.8h, v7.16b, v24.16b
1033        subs            w4, w4, #4
1034        stp             q0, q1, [x0]
1035        stp             q2, q3, [x0, #32]
1036        stp             q4, q5, [x0, #64]
1037        stp             q6, q7, [x0, #96]
1038        add             x0, x0, #128
1039        b.gt            16b
1040        ret
1041320:    // w == 32: two rows per iteration
1042        AARCH64_VALID_JUMP_TARGET
104332:
1044        ldp             q4, q5, [x1]
1045        add             x1, x1, x2
1046        ldp             q6, q7, [x1]
1047        add             x1, x1, x2
        // Note: v4-v7 hold source bytes and are overwritten with widened
        // results below, after their low/high halves have been consumed.
1048        ushll           v0.8h, v4.8b, #4
1049        ushll2          v1.8h, v4.16b, #4
1050        umull           v2.8h, v5.8b, v24.8b
1051        umull2          v3.8h, v5.16b, v24.16b
1052        ushll           v4.8h, v6.8b, #4
1053        ushll2          v5.8h, v6.16b, #4
1054        umull           v6.8h, v7.8b, v24.8b
1055        umull2          v7.8h, v7.16b, v24.16b
1056        subs            w4, w4, #2
1057        stp             q0, q1, [x0]
1058        stp             q2, q3, [x0, #32]
1059        stp             q4, q5, [x0, #64]
1060        stp             q6, q7, [x0, #96]
1061        add             x0, x0, #128
1062        b.gt            32b
1063        ret
1064640:    // w == 64: one row per iteration
1065        AARCH64_VALID_JUMP_TARGET
106664:
1067        ldp             q4, q5, [x1]
1068        ldp             q6, q7, [x1, #32]
1069        add             x1, x1, x2
1070        ushll           v0.8h, v4.8b, #4
1071        ushll2          v1.8h, v4.16b, #4
1072        umull           v2.8h, v5.8b, v24.8b
1073        umull2          v3.8h, v5.16b, v24.16b
1074        ushll           v4.8h, v6.8b, #4
1075        ushll2          v5.8h, v6.16b, #4
1076        umull           v6.8h, v7.8b, v24.8b
1077        umull2          v7.8h, v7.16b, v24.16b
1078        subs            w4, w4, #1
1079        stp             q0, q1, [x0]
1080        stp             q2, q3, [x0, #32]
1081        stp             q4, q5, [x0, #64]
1082        stp             q6, q7, [x0, #96]
1083        add             x0, x0, #128
1084        b.gt            64b
1085        ret
10861280:   // w == 128: one row per iteration, processed in two 64-px halves
1087        AARCH64_VALID_JUMP_TARGET
1088128:
1089        ldp             q28, q29, [x1]
1090        ldp             q30, q31, [x1, #32]
1091        ushll           v16.8h, v28.8b, #4
1092        ushll2          v17.8h, v28.16b, #4
1093        umull           v18.8h, v29.8b, v24.8b
1094        umull2          v19.8h, v29.16b, v24.16b
1095        ushll           v20.8h, v30.8b, #4
1096        ushll2          v21.8h, v30.16b, #4
1097        umull           v22.8h, v31.8b, v24.8b
1098        umull2          v23.8h, v31.16b, v24.16b
1099        ldp             q28, q29, [x1, #64]     // load second half while first stores
1100        ldp             q30, q31, [x1, #96]
1101        add             x1, x1, x2
1102        stp             q16, q17, [x0]
1103        stp             q18, q19, [x0, #32]
1104        stp             q20, q21, [x0, #64]
1105        stp             q22, q23, [x0, #96]
1106        ushll           v16.8h, v28.8b, #4
1107        ushll2          v17.8h, v28.16b, #4
1108        umull           v18.8h, v29.8b, v24.8b
1109        umull2          v19.8h, v29.16b, v24.16b
1110        ushll           v20.8h, v30.8b, #4
1111        ushll2          v21.8h, v30.16b, #4
1112        umull           v22.8h, v31.8b, v24.8b
1113        umull2          v23.8h, v31.16b, v24.16b
1114        subs            w4, w4, #1
1115        stp             q16, q17, [x0, #128]
1116        stp             q18, q19, [x0, #160]
1117        stp             q20, q21, [x0, #192]
1118        stp             q22, q23, [x0, #224]
1119        add             x0, x0, #256
1120        b.gt            128b
1121        ret
1122endfunc
1123
1124jumptable prep_tbl
        // Position-independent jump table for prep_neon: 32-bit offsets
        // from the table base, indexed by clz(w)-24.  No w==2 entry:
        // prep is not used for 2-pixel-wide blocks.
1125        .word 1280b - prep_tbl
1126        .word 640b  - prep_tbl
1127        .word 320b  - prep_tbl
1128        .word 160b  - prep_tbl
1129        .word 80b   - prep_tbl
1130        .word 40b   - prep_tbl
1131endjumptable
1132
1133
// Load helpers for the filter loops below.  All of them alternate reads
// between two source pointers \s0/\s1 (two interleaved rows), post-
// incrementing each by \strd.  Trailing destination arguments may be
// left blank; the .ifnb guards skip the corresponding loads.
1134.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        // Load one element (lane [0]) of width \wd into each destination.
1135        ld1             {\d0\wd}[0], [\s0], \strd
1136        ld1             {\d1\wd}[0], [\s1], \strd
1137.ifnb \d2
1138        ld1             {\d2\wd}[0], [\s0], \strd
1139        ld1             {\d3\wd}[0], [\s1], \strd
1140.endif
1141.ifnb \d4
1142        ld1             {\d4\wd}[0], [\s0], \strd
1143.endif
1144.ifnb \d5
1145        ld1             {\d5\wd}[0], [\s1], \strd
1146.endif
1147.ifnb \d6
1148        ld1             {\d6\wd}[0], [\s0], \strd
1149.endif
1150.endm
1151.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        // Load a whole register of arrangement \wd into each destination.
1152        ld1             {\d0\wd}, [\s0], \strd
1153        ld1             {\d1\wd}, [\s1], \strd
1154.ifnb \d2
1155        ld1             {\d2\wd}, [\s0], \strd
1156        ld1             {\d3\wd}, [\s1], \strd
1157.endif
1158.ifnb \d4
1159        ld1             {\d4\wd}, [\s0], \strd
1160.endif
1161.ifnb \d5
1162        ld1             {\d5\wd}, [\s1], \strd
1163.endif
1164.ifnb \d6
1165        ld1             {\d6\wd}, [\s0], \strd
1166.endif
1167.endm
// Width-specific shorthands for the two generic loaders above.
1168.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1169        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1170.endm
1171.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1172        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1173.endm
1174.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1175        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1176.endm
1177.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1178        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1179.endm
1180.macro interleave_1 wd, r0, r1, r2, r3, r4
        // Shift registers by one row: after this, each \rN holds the
        // low elements of rows N and N+1 (trn1 takes the even-indexed
        // elements of both operands), as needed by the vertical filters.
1181        trn1            \r0\wd, \r0\wd, \r1\wd
1182        trn1            \r1\wd, \r1\wd, \r2\wd
1183.ifnb \r3
1184        trn1            \r2\wd, \r2\wd, \r3\wd
1185        trn1            \r3\wd, \r3\wd, \r4\wd
1186.endif
1187.endm
1188.macro interleave_1_h r0, r1, r2, r3, r4
1189        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
1190.endm
1191.macro interleave_1_s r0, r1, r2, r3, r4
1192        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1193.endm
1194.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        // Same idea with a stride of two rows: \rN gets rows N and N+2.
1195        trn1            \r0\wd,  \r0\wd, \r2\wd
1196        trn1            \r1\wd,  \r1\wd, \r3\wd
1197        trn1            \r2\wd,  \r2\wd, \r4\wd
1198        trn1            \r3\wd,  \r3\wd, \r5\wd
1199.endm
1200.macro interleave_2_s r0, r1, r2, r3, r4, r5
1201        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
1202.endm
1203.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        // Widen the low 8 bytes of 2-7 registers in place, u8 -> u16.
        // Trailing arguments may be blank; .ifnb skips them.
1204        uxtl            \r0\().8h, \r0\().8b
1205        uxtl            \r1\().8h, \r1\().8b
1206.ifnb \r2
1207        uxtl            \r2\().8h, \r2\().8b
1208        uxtl            \r3\().8h, \r3\().8b
1209.endif
1210.ifnb \r4
1211        uxtl            \r4\().8h, \r4\().8b
1212.endif
1213.ifnb \r5
1214        uxtl            \r5\().8h, \r5\().8b
1215.endif
1216.ifnb \r6
1217        uxtl            \r6\().8h, \r6\().8b
1218.endif
1219.endm
1220.macro mul_mla_4tap d, s0, s1, s2, s3, wd
        // 4-tap FIR: \d = sum(\sN * coefficient), with the 16-bit filter
        // coefficients held in lanes v0.h[0..3].
1221        mul             \d\wd,  \s0\wd,  v0.h[0]
1222        mla             \d\wd,  \s1\wd,  v0.h[1]
1223        mla             \d\wd,  \s2\wd,  v0.h[2]
1224        mla             \d\wd,  \s3\wd,  v0.h[3]
1225.endm
1226// Interleaving the mul/mla chains actually hurts performance
1227// significantly on Cortex A53, thus keeping mul/mla tightly
1228// chained like this.
// 6-tap variants: only coefficients v0.h[1..6] are applied, so \s0 (and
// \s7) are accepted for signature compatibility with the 8-tap macros
// but left unused.
1229.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
        // One 4-element output.
1230        mul             \d0\().4h, \s1\().4h, v0.h[1]
1231        mla             \d0\().4h, \s2\().4h, v0.h[2]
1232        mla             \d0\().4h, \s3\().4h, v0.h[3]
1233        mla             \d0\().4h, \s4\().4h, v0.h[4]
1234        mla             \d0\().4h, \s5\().4h, v0.h[5]
1235        mla             \d0\().4h, \s6\().4h, v0.h[6]
1236.endm
1237.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
        // One 8-element output.
1238        mul             \d0\().8h, \s1\().8h, v0.h[1]
1239        mla             \d0\().8h, \s2\().8h, v0.h[2]
1240        mla             \d0\().8h, \s3\().8h, v0.h[3]
1241        mla             \d0\().8h, \s4\().8h, v0.h[4]
1242        mla             \d0\().8h, \s5\().8h, v0.h[5]
1243        mla             \d0\().8h, \s6\().8h, v0.h[6]
1244.endm
1245.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        // Two outputs whose source windows are offset by one register.
1246        mul             \d0\().8h, \s1\().8h, v0.h[1]
1247        mla             \d0\().8h, \s2\().8h, v0.h[2]
1248        mla             \d0\().8h, \s3\().8h, v0.h[3]
1249        mla             \d0\().8h, \s4\().8h, v0.h[4]
1250        mla             \d0\().8h, \s5\().8h, v0.h[5]
1251        mla             \d0\().8h, \s6\().8h, v0.h[6]
1252        mul             \d1\().8h, \s2\().8h, v0.h[1]
1253        mla             \d1\().8h, \s3\().8h, v0.h[2]
1254        mla             \d1\().8h, \s4\().8h, v0.h[3]
1255        mla             \d1\().8h, \s5\().8h, v0.h[4]
1256        mla             \d1\().8h, \s6\().8h, v0.h[5]
1257        mla             \d1\().8h, \s7\().8h, v0.h[6]
1258.endm
1259.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        // Two outputs whose source windows are offset by two registers.
1260        mul             \d0\().8h, \s1\().8h, v0.h[1]
1261        mla             \d0\().8h, \s2\().8h, v0.h[2]
1262        mla             \d0\().8h, \s3\().8h, v0.h[3]
1263        mla             \d0\().8h, \s4\().8h, v0.h[4]
1264        mla             \d0\().8h, \s5\().8h, v0.h[5]
1265        mla             \d0\().8h, \s6\().8h, v0.h[6]
1266        mul             \d1\().8h, \s3\().8h, v0.h[1]
1267        mla             \d1\().8h, \s4\().8h, v0.h[2]
1268        mla             \d1\().8h, \s5\().8h, v0.h[3]
1269        mla             \d1\().8h, \s6\().8h, v0.h[4]
1270        mla             \d1\().8h, \s7\().8h, v0.h[5]
1271        mla             \d1\().8h, \s8\().8h, v0.h[6]
1272.endm
// 8-tap variants of the FIR macros: all eight coefficients v0.h[0..7]
// are applied.  Same suffix convention as the 6-tap family: _0 = one
// output, _1/_2 = two outputs with source windows offset by 1/2 regs.
1273.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
        // One 4-element output.
1274        mul             \d0\().4h, \s0\().4h, v0.h[0]
1275        mla             \d0\().4h, \s1\().4h, v0.h[1]
1276        mla             \d0\().4h, \s2\().4h, v0.h[2]
1277        mla             \d0\().4h, \s3\().4h, v0.h[3]
1278        mla             \d0\().4h, \s4\().4h, v0.h[4]
1279        mla             \d0\().4h, \s5\().4h, v0.h[5]
1280        mla             \d0\().4h, \s6\().4h, v0.h[6]
1281        mla             \d0\().4h, \s7\().4h, v0.h[7]
1282.endm
1283.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
        // One 8-element output.
1284        mul             \d0\().8h, \s0\().8h, v0.h[0]
1285        mla             \d0\().8h, \s1\().8h, v0.h[1]
1286        mla             \d0\().8h, \s2\().8h, v0.h[2]
1287        mla             \d0\().8h, \s3\().8h, v0.h[3]
1288        mla             \d0\().8h, \s4\().8h, v0.h[4]
1289        mla             \d0\().8h, \s5\().8h, v0.h[5]
1290        mla             \d0\().8h, \s6\().8h, v0.h[6]
1291        mla             \d0\().8h, \s7\().8h, v0.h[7]
1292.endm
1293.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        // Two outputs, source windows offset by one register.
1294        mul             \d0\().8h, \s0\().8h, v0.h[0]
1295        mla             \d0\().8h, \s1\().8h, v0.h[1]
1296        mla             \d0\().8h, \s2\().8h, v0.h[2]
1297        mla             \d0\().8h, \s3\().8h, v0.h[3]
1298        mla             \d0\().8h, \s4\().8h, v0.h[4]
1299        mla             \d0\().8h, \s5\().8h, v0.h[5]
1300        mla             \d0\().8h, \s6\().8h, v0.h[6]
1301        mla             \d0\().8h, \s7\().8h, v0.h[7]
1302        mul             \d1\().8h, \s1\().8h, v0.h[0]
1303        mla             \d1\().8h, \s2\().8h, v0.h[1]
1304        mla             \d1\().8h, \s3\().8h, v0.h[2]
1305        mla             \d1\().8h, \s4\().8h, v0.h[3]
1306        mla             \d1\().8h, \s5\().8h, v0.h[4]
1307        mla             \d1\().8h, \s6\().8h, v0.h[5]
1308        mla             \d1\().8h, \s7\().8h, v0.h[6]
1309        mla             \d1\().8h, \s8\().8h, v0.h[7]
1310.endm
1311.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        // Two outputs, source windows offset by two registers.
1312        mul             \d0\().8h, \s0\().8h, v0.h[0]
1313        mla             \d0\().8h, \s1\().8h, v0.h[1]
1314        mla             \d0\().8h, \s2\().8h, v0.h[2]
1315        mla             \d0\().8h, \s3\().8h, v0.h[3]
1316        mla             \d0\().8h, \s4\().8h, v0.h[4]
1317        mla             \d0\().8h, \s5\().8h, v0.h[5]
1318        mla             \d0\().8h, \s6\().8h, v0.h[6]
1319        mla             \d0\().8h, \s7\().8h, v0.h[7]
1320        mul             \d1\().8h, \s2\().8h, v0.h[0]
1321        mla             \d1\().8h, \s3\().8h, v0.h[1]
1322        mla             \d1\().8h, \s4\().8h, v0.h[2]
1323        mla             \d1\().8h, \s5\().8h, v0.h[3]
1324        mla             \d1\().8h, \s6\().8h, v0.h[4]
1325        mla             \d1\().8h, \s7\().8h, v0.h[5]
1326        mla             \d1\().8h, \s8\().8h, v0.h[6]
1327        mla             \d1\().8h, \s9\().8h, v0.h[7]
1328.endm
1329.macro sqrshrun_b shift, r0, r1, r2, r3
        // Saturating rounding narrow (u16 -> u8) with right shift \shift,
        // applied in place to 1-4 registers; used on the "put" output path.
1330        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
1331.ifnb \r1
1332        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
1333.endif
1334.ifnb \r2
1335        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
1336        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
1337.endif
1338.endm
1339.macro srshr_h shift, r0, r1, r2, r3
        // Rounding right shift, kept at 16 bits, applied in place to 1-4
        // registers; used on the "prep" (intermediate) output path.
1340        srshr           \r0\().8h, \r0\().8h,  #\shift
1341.ifnb \r1
1342        srshr           \r1\().8h, \r1\().8h,  #\shift
1343.endif
1344.ifnb \r2
1345        srshr           \r2\().8h, \r2\().8h,  #\shift
1346        srshr           \r3\().8h, \r3\().8h,  #\shift
1347.endif
1348.endm
// Store helpers: x0 and x8 are the two interleaved destination row
// pointers; both are post-incremented by \strd.
1349.macro st_h strd, reg, lanes
        // Store 16-bit lanes, alternating rows between x0 and x8.
1350        st1             {\reg\().h}[0], [x0], \strd
1351        st1             {\reg\().h}[1], [x8], \strd
1352.if \lanes > 2
1353        st1             {\reg\().h}[2], [x0], \strd
1354        st1             {\reg\().h}[3], [x8], \strd
1355.endif
1356.endm
1357.macro st_s strd, r0, r1
        // Store 32-bit lanes (one 4-px row per lane) to x0/x8.
1358        st1             {\r0\().s}[0], [x0], \strd
1359        st1             {\r0\().s}[1], [x8], \strd
1360.ifnb \r1
1361        st1             {\r1\().s}[0], [x0], \strd
1362        st1             {\r1\().s}[1], [x8], \strd
1363.endif
1364.endm
1365.macro st_d strd, r0, r1
        // Store 64-bit halves: low half to x0, high half (lane [1]) to x8.
1366        st1             {\r0\().8b},   [x0], \strd
1367        st1             {\r0\().d}[1], [x8], \strd
1368.ifnb \r1
1369        st1             {\r1\().8b},   [x0], \strd
1370        st1             {\r1\().d}[1], [x8], \strd
1371.endif
1372.endm
1373.macro shift_store_4 type, strd, r0, r1
        // Finish and store 4-px-wide results: for "put", saturate/narrow
        // by 6 to 8-bit pixels; for "prep", rounding-shift by 2 and keep
        // the 16-bit intermediates.
1374.ifc \type, put
1375        sqrshrun_b      6,     \r0, \r1
1376        st_s            \strd, \r0, \r1
1377.else
1378        srshr_h         2,     \r0, \r1
1379        st_d            \strd, \r0, \r1
1380.endif
1381.endm
1382.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        // Store 2, 4 or 8 whole registers of arrangement \wd, alternating
        // rows between the x0 and x8 destination pointers.
1383        st1             {\r0\wd}, [x0], \strd
1384        st1             {\r1\wd}, [x8], \strd
1385.ifnb \r2
1386        st1             {\r2\wd}, [x0], \strd
1387        st1             {\r3\wd}, [x8], \strd
1388.endif
1389.ifnb \r4
1390        st1             {\r4\wd}, [x0], \strd
1391        st1             {\r5\wd}, [x8], \strd
1392        st1             {\r6\wd}, [x0], \strd
1393        st1             {\r7\wd}, [x8], \strd
1394.endif
1395.endm
1396.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
1397        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1398.endm
1399.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
1400        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1401.endm
1402.macro shift_store_8 type, strd, r0, r1, r2, r3
        // Finish and store 8-px-wide results: narrow to pixels for "put"
        // (shift 6), or rounding-shift by 2 and store 16-bit for "prep".
1403.ifc \type, put
1404        sqrshrun_b      6,     \r0, \r1, \r2, \r3
1405        st_8b           \strd, \r0, \r1, \r2, \r3
1406.else
1407        srshr_h         2,     \r0, \r1, \r2, \r3
1408        st_16b          \strd, \r0, \r1, \r2, \r3
1409.endif
1410.endm
1411.macro shift_store_16 type, strd, r0, r1, r2, r3
        // Finish and store 16-px-wide results; the "put" path pairs the
        // two 8h halves into one 16b register before storing.
1412.ifc \type, put
1413        sqrshrun        \r0\().8b,  \r0\().8h, #6
1414        sqrshrun2       \r0\().16b, \r1\().8h, #6
1415        sqrshrun        \r2\().8b,  \r2\().8h, #6
1416        sqrshrun2       \r2\().16b, \r3\().8h, #6
1417        st_16b          \strd, \r0, \r2
1418.else
1419        srshr_h         2,     \r0, \r1, \r2, \r3
1420        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
1421        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
1422.endif
1423.endm
1424
1425.macro make_8tap_fn op, type, type_h, type_v, taps
// Emit a thin exported entry point (e.g. put_8tap_regular_8bpc_neon)
// that loads the packed horizontal/vertical filter-type codes into
// x8/x9 and tail-calls the shared \op implementation.
1426function \op\()_8tap_\type\()_8bpc_neon, export=1
1427        mov             x8,  \type_h
1428        mov             x9,  \type_v
1429        b               \op\()_\taps\()_neon
1430endfunc
1431.endm
1432
1433// No spaces in these expressions, due to gas-preprocessor.
// Packed filter-type codes consumed via x8/x9 in the 8tap entry points.
// NOTE(review): the consumer extracts a 7-bit field at bit 7 (ubfx #7,#7)
// and masks the low 7 bits (and #0x7f), matching this (a*15<<7)|b*15
// layout — confirm the a/b filter-index semantics against
// mc_subpel_filters in the C code.
1434#define REGULAR ((0*15<<7)|3*15)
1435#define SMOOTH  ((1*15<<7)|4*15)
1436#define SHARP   ((2*15<<7)|3*15)
1437
1438.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
1439function \type\()_\taps\()_neon
1440        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1441        mul             \mx,  \mx, w10
1442        mul             \my,  \my, w10
1443        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
1444        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
1445.ifc \type, prep
1446        uxtw            \d_strd, \w
1447        lsl             \d_strd, \d_strd, #1
1448.endif
1449
1450        clz             w8,  \w
1451        tst             \mx, #(0x7f << 14)
1452        sub             w8,  w8,  #24
1453        movrel          x10, X(mc_subpel_filters), -8
1454        b.ne            L(\type\()_\taps\()_h)
1455        tst             \my, #(0x7f << 14)
1456        b.ne            L(\type\()_\taps\()_v)
1457        b               \type\()_neon
1458
1459L(\type\()_\taps\()_h):
1460        cmp             \w,  #4
1461        ubfx            w9,  \mx, #7, #7
1462        and             \mx, \mx, #0x7f
1463        b.le            4f
1464        mov             \mx,  w9
14654:
1466        tst             \my,  #(0x7f << 14)
1467        add             \xmx, x10, \mx, uxtw #3
1468        b.ne            L(\type\()_\taps\()_hv)
1469
1470        movrel          x9,  \type\()_\taps\()_h_tbl
1471        ldrsw           x8,  [x9, x8, lsl #2]
1472        add             x9,  x9,  x8
1473        br              x9
1474
147520:     // 2xN h
1476        AARCH64_VALID_JUMP_TARGET
1477.ifc \type, put
1478        ldur            s0,  [\xmx, #2]
1479        sub             \src,  \src,  #1
1480        add             \ds2,  \dst,  \d_strd
1481        add             \sr2,  \src,  \s_strd
1482        lsl             \d_strd,  \d_strd,  #1
1483        lsl             \s_strd,  \s_strd,  #1
1484        sxtl            v0.8h,  v0.8b
14852:
1486        ld1             {v4.8b},  [\src], \s_strd
1487        ld1             {v6.8b},  [\sr2], \s_strd
1488        uxtl            v4.8h,  v4.8b
1489        uxtl            v6.8h,  v6.8b
1490        ext             v5.16b, v4.16b, v4.16b, #2
1491        ext             v7.16b, v6.16b, v6.16b, #2
1492        subs            \h,  \h,  #2
1493        trn1            v3.2s,  v4.2s,  v6.2s
1494        trn2            v6.2s,  v4.2s,  v6.2s
1495        trn1            v4.2s,  v5.2s,  v7.2s
1496        trn2            v7.2s,  v5.2s,  v7.2s
1497        mul             v3.4h,  v3.4h,  v0.h[0]
1498        mla             v3.4h,  v4.4h,  v0.h[1]
1499        mla             v3.4h,  v6.4h,  v0.h[2]
1500        mla             v3.4h,  v7.4h,  v0.h[3]
1501        srshr           v3.4h,  v3.4h,  #2
1502        sqrshrun        v3.8b,  v3.8h,  #4
1503        st1             {v3.h}[0], [\dst], \d_strd
1504        st1             {v3.h}[1], [\ds2], \d_strd
1505        b.gt            2b
1506        ret
1507.endif
1508
150940:     // 4xN h
1510        AARCH64_VALID_JUMP_TARGET
1511        ldur            s0,  [\xmx, #2]
1512        sub             \src,  \src,  #1
1513        add             \ds2,  \dst,  \d_strd
1514        add             \sr2,  \src,  \s_strd
1515        lsl             \d_strd,  \d_strd,  #1
1516        lsl             \s_strd,  \s_strd,  #1
1517        sxtl            v0.8h,  v0.8b
15184:
1519        ld1             {v16.8b}, [\src], \s_strd
1520        ld1             {v20.8b}, [\sr2], \s_strd
1521        uxtl            v16.8h,  v16.8b
1522        uxtl            v20.8h,  v20.8b
1523        ext             v17.16b, v16.16b, v16.16b, #2
1524        ext             v18.16b, v16.16b, v16.16b, #4
1525        ext             v19.16b, v16.16b, v16.16b, #6
1526        ext             v21.16b, v20.16b, v20.16b, #2
1527        ext             v22.16b, v20.16b, v20.16b, #4
1528        ext             v23.16b, v20.16b, v20.16b, #6
1529        subs            \h,  \h,  #2
1530        mul             v16.4h,  v16.4h,  v0.h[0]
1531        mla             v16.4h,  v17.4h,  v0.h[1]
1532        mla             v16.4h,  v18.4h,  v0.h[2]
1533        mla             v16.4h,  v19.4h,  v0.h[3]
1534        mul             v20.4h,  v20.4h,  v0.h[0]
1535        mla             v20.4h,  v21.4h,  v0.h[1]
1536        mla             v20.4h,  v22.4h,  v0.h[2]
1537        mla             v20.4h,  v23.4h,  v0.h[3]
1538        srshr           v16.4h,  v16.4h,  #2
1539        srshr           v20.4h,  v20.4h,  #2
1540.ifc \type, put
1541        sqrshrun        v16.8b,  v16.8h,  #4
1542        sqrshrun        v20.8b,  v20.8h,  #4
1543        str             s16,  [\dst]
1544        str             s20,  [\ds2]
1545        add             \dst, \dst, \d_strd
1546        add             \ds2, \ds2, \d_strd
1547.else
1548        st1             {v16.4h}, [\dst], \d_strd
1549        st1             {v20.4h}, [\ds2], \d_strd
1550.endif
1551        b.gt            4b
1552        ret
1553
155480:     // 8xN h
1555        AARCH64_VALID_JUMP_TARGET
1556        ld1             {v0.8b}, [\xmx]
1557.ifc \taps, 6tap
1558        sub             \src,  \src,  #2
1559.else
1560        sub             \src,  \src,  #3
1561.endif
1562        add             \ds2,  \dst,  \d_strd
1563        add             \sr2,  \src,  \s_strd
1564        lsl             \d_strd,  \d_strd,  #1
1565        lsl             \s_strd,  \s_strd,  #1
1566        sxtl            v0.8h, v0.8b
15678:
1568        ld1             {v16.8b, v17.8b},  [\src], \s_strd
1569        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
1570        uxtl            v16.8h,  v16.8b
1571        uxtl            v17.8h,  v17.8b
1572        uxtl            v20.8h,  v20.8b
1573        uxtl            v21.8h,  v21.8b
1574
1575.ifc \taps, 6tap
1576        mul             v18.8h,  v16.8h,  v0.h[1]
1577        mul             v22.8h,  v20.8h,  v0.h[1]
1578    .irpc i, 23456
1579        ext             v19.16b, v16.16b, v17.16b, #(2*\i-2)
1580        ext             v23.16b, v20.16b, v21.16b, #(2*\i-2)
1581        mla             v18.8h,  v19.8h,  v0.h[\i]
1582        mla             v22.8h,  v23.8h,  v0.h[\i]
1583    .endr
1584.else   // 8tap
1585        mul             v18.8h,  v16.8h,  v0.h[0]
1586        mul             v22.8h,  v20.8h,  v0.h[0]
1587    .irpc i, 1234567
1588        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
1589        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
1590        mla             v18.8h,  v19.8h,  v0.h[\i]
1591        mla             v22.8h,  v23.8h,  v0.h[\i]
1592    .endr
1593.endif
1594        subs            \h,  \h,  #2
1595        srshr           v18.8h,  v18.8h, #2
1596        srshr           v22.8h,  v22.8h, #2
1597.ifc \type, put
1598        sqrshrun        v18.8b,  v18.8h, #4
1599        sqrshrun        v22.8b,  v22.8h, #4
1600        st1             {v18.8b}, [\dst], \d_strd
1601        st1             {v22.8b}, [\ds2], \d_strd
1602.else
1603        st1             {v18.8h}, [\dst], \d_strd
1604        st1             {v22.8h}, [\ds2], \d_strd
1605.endif
1606        b.gt            8b
1607        ret
1608160:
1609320:
1610640:
16111280:   // 16xN, 32xN, ... h
1612        AARCH64_VALID_JUMP_TARGET
1613        ld1             {v0.8b}, [\xmx]
1614.ifc \taps, 6tap
1615        sub             \src,  \src,  #2
1616.else
1617        sub             \src,  \src,  #3
1618.endif
1619        add             \ds2,  \dst,  \d_strd
1620        add             \sr2,  \src,  \s_strd
1621        lsl             \s_strd,  \s_strd,  #1
1622        sxtl            v0.8h, v0.8b
1623
1624        sub             \s_strd,  \s_strd,  \w, uxtw
1625        sub             \s_strd,  \s_strd,  #8
1626.ifc \type, put
1627        lsl             \d_strd,  \d_strd,  #1
1628        sub             \d_strd,  \d_strd,  \w, uxtw
1629.endif
1630161:
1631        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
1632        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
1633        mov             \mx, \w
1634        uxtl            v16.8h,  v16.8b
1635        uxtl            v17.8h,  v17.8b
1636        uxtl            v18.8h,  v18.8b
1637        uxtl            v20.8h,  v20.8b
1638        uxtl            v21.8h,  v21.8b
1639        uxtl            v22.8h,  v22.8b
1640
164116:
1642.ifc \taps, 6tap
1643        mul             v24.8h,  v16.8h,  v0.h[1]
1644        mul             v25.8h,  v17.8h,  v0.h[1]
1645        mul             v26.8h,  v20.8h,  v0.h[1]
1646        mul             v27.8h,  v21.8h,  v0.h[1]
1647    .irpc i, 23456
1648        ext             v28.16b, v16.16b, v17.16b, #(2*\i-2)
1649        ext             v29.16b, v17.16b, v18.16b, #(2*\i-2)
1650        ext             v30.16b, v20.16b, v21.16b, #(2*\i-2)
1651        ext             v31.16b, v21.16b, v22.16b, #(2*\i-2)
1652        mla             v24.8h,  v28.8h,  v0.h[\i]
1653        mla             v25.8h,  v29.8h,  v0.h[\i]
1654        mla             v26.8h,  v30.8h,  v0.h[\i]
1655        mla             v27.8h,  v31.8h,  v0.h[\i]
1656    .endr
1657.else   // 8tap
1658        mul             v24.8h,  v16.8h,  v0.h[0]
1659        mul             v25.8h,  v17.8h,  v0.h[0]
1660        mul             v26.8h,  v20.8h,  v0.h[0]
1661        mul             v27.8h,  v21.8h,  v0.h[0]
1662    .irpc i, 1234567
1663        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
1664        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
1665        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
1666        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
1667        mla             v24.8h,  v28.8h,  v0.h[\i]
1668        mla             v25.8h,  v29.8h,  v0.h[\i]
1669        mla             v26.8h,  v30.8h,  v0.h[\i]
1670        mla             v27.8h,  v31.8h,  v0.h[\i]
1671    .endr
1672.endif
1673        srshr           v24.8h,  v24.8h, #2
1674        srshr           v25.8h,  v25.8h, #2
1675        srshr           v26.8h,  v26.8h, #2
1676        srshr           v27.8h,  v27.8h, #2
1677        subs            \mx, \mx, #16
1678.ifc \type, put
1679        sqrshrun        v24.8b,  v24.8h, #4
1680        sqrshrun2       v24.16b, v25.8h, #4
1681        sqrshrun        v26.8b,  v26.8h, #4
1682        sqrshrun2       v26.16b, v27.8h, #4
1683        st1             {v24.16b}, [\dst], #16
1684        st1             {v26.16b}, [\ds2], #16
1685.else
1686        st1             {v24.8h, v25.8h}, [\dst], #32
1687        st1             {v26.8h, v27.8h}, [\ds2], #32
1688.endif
1689        b.le            9f
1690
1691        mov             v16.16b, v18.16b
1692        mov             v20.16b, v22.16b
1693        ld1             {v17.8b, v18.8b}, [\src], #16
1694        ld1             {v21.8b, v22.8b}, [\sr2], #16
1695        uxtl            v17.8h,  v17.8b
1696        uxtl            v18.8h,  v18.8b
1697        uxtl            v21.8h,  v21.8b
1698        uxtl            v22.8h,  v22.8b
1699        b               16b
1700
17019:
1702        add             \dst,  \dst,  \d_strd
1703        add             \ds2,  \ds2,  \d_strd
1704        add             \src,  \src,  \s_strd
1705        add             \sr2,  \sr2,  \s_strd
1706
1707        subs            \h,  \h,  #2
1708        b.gt            161b
1709        ret
1710endfunc
1711
jumptable \type\()_\taps\()_h_tbl
        // Width-class dispatch table for the horizontal filter path:
        // signed 32-bit offsets relative to the table base (loaded with
        // ldrsw and added back to the table address before `br`).
        // Entries are ordered from the widest case (1280) down to the
        // narrowest (2); the index in x8 is computed by the caller —
        // presumably a log2-of-width class, TODO confirm against the
        // function prologue outside this view.
        .word 1280b - \type\()_\taps\()_h_tbl
        .word 640b  - \type\()_\taps\()_h_tbl
        .word 320b  - \type\()_\taps\()_h_tbl
        .word 160b  - \type\()_\taps\()_h_tbl
        .word 80b   - \type\()_\taps\()_h_tbl
        .word 40b   - \type\()_\taps\()_h_tbl
        .word 20b   - \type\()_\taps\()_h_tbl
endjumptable
1721
function L(\type\()_\taps\()_v)
        // Vertical subpel filter (expanded per macro for put/prep and
        // 6tap/8tap).  \my packs two 7-bit filter indices: the low 7 bits
        // are used when h <= 4, bits [13:7] otherwise.  The selected index
        // (scaled by 8, one filter = 8 coefficient bytes) is added to x10
        // (filter table base — set up by the caller, outside this view) to
        // form \xmy, then control dispatches on the width class through
        // \type\()_\taps\()_v_tbl.
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w9
4:
        add             \xmy, x10, \my, uxtw #3

        movrel          x9,  \type\()_\taps\()_v_tbl
        ldrsw           x8,  [x9, x8, lsl #2]   // signed offset, indexed by width class in x8
        add             x9,  x9,  x8
        br              x9

20:     // 2xN v
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        // 2-wide columns only exist for put; prep never takes this path.
        b.gt            28f

        // Short columns (h <= 4): only the 4 middle taps are loaded
        // (ldur at offset #2 into the 8-byte filter entry).
        cmp             \h,  #2
        ldur            s0,  [\xmy, #2]
        sub             \src,  \src,  \s_strd   // back up one row of filter history
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd   // second row pointer; strides doubled below
        lsl             \s_strd,  \s_strd,  #1  // process two rows per iteration
        lsl             \d_strd,  \d_strd,  #1
        sxtl            v0.8h, v0.8b            // widen coefficients to 16 bit

        // 2x2 v
        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_h  v1, v2, v3, v4, v5
        b.gt            24f
        uxtl_b          v1, v2, v3, v4
        mul_mla_4tap    v6, v1, v2, v3, v4, .4h
        sqrshrun_b      6,  v6
        st_h            \d_strd, v6, 2
        ret

24:     // 2x4 v
        load_h          \sr2, \src, \s_strd, v6, v7
        interleave_1_h  v5, v6, v7
        interleave_2_s  v1, v2, v3, v4, v5, v6
        uxtl_b          v1, v2, v3, v4
        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
        sqrshrun_b      6,  v6
        st_h            \d_strd, v6, 4
        ret

28:     // 2x6, 2x8, 2x12, 2x16 v
        // Tall 2-wide columns: full filter (all taps) and a 4-rows-per-
        // iteration software pipeline with a 2-row tail (26:).
        ld1             {v0.8b}, [\xmy]
        sub             \sr2,  \src,  \s_strd, lsl #1
        add             \ds2,  \dst,  \d_strd
        sub             \src,  \sr2,  \s_strd   // back up 3 rows of filter history
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b

        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
        interleave_1_h  v1,  v2,  v3,  v4,  v5
        interleave_1_h  v5,  v6,  v7
        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
        uxtl_b          v1,  v2,  v3,  v4
216:
        subs            \h,  \h,  #4
        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
        interleave_1_h  v7,  v16, v17, v18, v19
        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
        uxtl_b          v5,  v6,  v7,  v16
        mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
        sqrshrun_b      6,   v30
        st_h            \d_strd, v30, 4
        b.le            0f
        cmp             \h,  #2
        // Slide the row window down by four rows for the next iteration.
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        mov             v3.16b,  v7.16b
        mov             v4.16b,  v16.16b
        mov             v5.16b,  v17.16b
        mov             v6.16b,  v18.16b
        mov             v7.16b,  v19.16b
        b.eq            26f
        b               216b
26:     // 2-row tail of the 2xN loop
        load_h          \sr2, \src, \s_strd, v16, v17
        interleave_1_h  v7,  v16, v17
        uxtl_b          v5,  v6,  v7,  v16
        mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
        sqrshrun_b      6,   v30
        st_h            \d_strd, v30, 2
0:
        ret
.endif

40:
        AARCH64_VALID_JUMP_TARGET
        b.gt            480f

        // 4x2, 4x4 v — short columns use the 4 middle taps only.
        cmp             \h,  #2
        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_s  v1, v2, v3, v4, v5
        uxtl_b          v1, v2, v3, v4
        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
        shift_store_4   \type, \d_strd, v6
        b.le            0f
        load_s          \sr2, \src, \s_strd, v6, v7
        interleave_1_s  v5, v6, v7
        uxtl_b          v5, v6
        mul_mla_4tap    v7, v3, v4, v5, v6, .8h
        shift_store_4   \type, \d_strd, v7
0:
        ret

480:    // 4x6, 4x8, 4x12, 4x16 v
        // Full filter; pipelined loop producing 4+2+2+4 rows per pass
        // with early exits between the stages.
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd     // back up 3 rows of filter history
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        interleave_1_s  v16, v17, v18
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v16, v17
        uxtl_b          v18, v19, v20, v21

48:
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
        interleave_1_s  v22, v23, v24, v25, v26
        uxtl_b          v22, v23, v24, v25
        mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
        shift_store_4   \type, \d_strd, v1, v2
        b.le            0f
        load_s          \sr2,  \src, \s_strd, v27, v16
        subs            \h,  \h,  #2
        interleave_1_s  v26, v27, v16
        uxtl_b          v26, v27
        mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
        shift_store_4   \type, \d_strd, v1
        b.le            0f
        load_s          \sr2,  \src, \s_strd, v17, v18
        subs            \h,  \h,  #2
        interleave_1_s  v16, v17, v18
        uxtl_b          v16, v17
        mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
        shift_store_4   \type, \d_strd, v2
        b.le            0f
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v18, v19, v20, v21
        mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
        shift_store_4   \type, \d_strd, v1, v2
        b.gt            48b
0:
        ret

80:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f

        // 8x2, 8x4 v — short columns, 4 middle taps only.
        cmp             \h,  #2
        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        uxtl_b          v1, v2, v3, v4, v5
        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
        mul_mla_4tap    v7, v2, v3, v4, v5, .8h
        shift_store_8   \type, \d_strd, v6, v7
        b.le            0f
        load_8b         \sr2, \src, \s_strd, v6, v7
        uxtl_b          v6, v7
        mul_mla_4tap    v1, v3, v4, v5, v6, .8h
        mul_mla_4tap    v2, v4, v5, v6, v7, .8h
        shift_store_8   \type, \d_strd, v1, v2
0:
        ret

880:    // 8x6, 8x8, 8x16, 8x32 v
1680:   // 16x8, 16x16, ...
320:    // 32x8, 32x16, ...
640:
1280:
        // Shared path for all tall blocks of width >= 8: processes the
        // block in 8-pixel-wide column strips (outer loop at 168:),
        // each strip filtered 2 or 4 rows at a time (inner loop at 88:).
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [\xmy]
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1     // total: back up 3 rows
        sxtl            v0.8h, v0.8b
        mov             \my,  \h                        // save h for the next column strip
168:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        uxtl_b          v16, v17, v18, v19, v20, v21, v22

88:
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v23, v24
        uxtl_b          v23, v24
        mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v25, v26
        uxtl_b          v25, v26
        mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v27, v16
        uxtl_b          v27, v16
        mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v17, v18
        uxtl_b          v17, v18
        mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #4
        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
        uxtl_b          v19, v20, v21, v22
        mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
        mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.gt            88b
9:
        // Column strip done: advance to the next 8-pixel-wide strip.
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd, \s_strd, #1            // undo the x2 stride doubling
        asr             \d_strd, \d_strd, #1
        // Rewind the pointers by h rows (\my holds the saved h; \xmy is
        // presumably its 64-bit alias — TODO confirm the register map in
        // the enclosing macro), plus 8 more src rows of filter history.
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h,  \my
        add             \src, \src, #8
.ifc \type, put
        add             \dst, \dst, #8                  // put: 8-bit output pixels
.else
        add             \dst, \dst, #16                 // prep: 16-bit intermediates
.endif
        b               168b
0:
        ret

160:
        AARCH64_VALID_JUMP_TARGET
        b.gt            1680b

        // 16x2, 16x4 v — short columns, 4 middle taps; low and high
        // 8-pixel halves are widened and filtered separately.
        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        cmp             \h,  #2
        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        uxtl            v16.8h, v1.8b           // low halves of rows 0..4
        uxtl            v17.8h, v2.8b
        uxtl            v18.8h, v3.8b
        uxtl            v19.8h, v4.8b
        uxtl            v20.8h, v5.8b
        uxtl2           v23.8h, v1.16b          // high halves of rows 0..4
        uxtl2           v24.8h, v2.16b
        uxtl2           v25.8h, v3.16b
        uxtl2           v26.8h, v4.16b
        uxtl2           v27.8h, v5.16b
        mul_mla_4tap    v1,  v16, v17, v18, v19, .8h
        mul_mla_4tap    v16, v17, v18, v19, v20, .8h
        mul_mla_4tap    v2,  v23, v24, v25, v26, .8h
        mul_mla_4tap    v17, v24, v25, v26, v27, .8h
        shift_store_16  \type, \d_strd, v1, v2, v16, v17
        b.le            0f
        load_16b        \sr2, \src, \s_strd, v6,  v7
        uxtl            v21.8h, v6.8b
        uxtl            v22.8h, v7.8b
        uxtl2           v28.8h, v6.16b
        uxtl2           v29.8h, v7.16b
        mul_mla_4tap    v1,  v18, v19, v20, v21, .8h
        mul_mla_4tap    v3,  v19, v20, v21, v22, .8h
        mul_mla_4tap    v2,  v25, v26, v27, v28, .8h
        mul_mla_4tap    v4,  v26, v27, v28, v29, .8h
        shift_store_16  \type, \d_strd, v1, v2, v3, v4
0:
        ret
endfunc
2033
jumptable \type\()_\taps\()_v_tbl
        // Width-class dispatch table for the vertical filter path:
        // signed 32-bit offsets relative to the table base (loaded with
        // ldrsw and added back to the table address before `br`).
        // Entries are ordered from the widest case (1280) down to the
        // narrowest (2); the index in x8 is computed by the caller —
        // presumably a log2-of-width class, TODO confirm against the
        // function prologue outside this view.
        .word 1280b - \type\()_\taps\()_v_tbl
        .word 640b  - \type\()_\taps\()_v_tbl
        .word 320b  - \type\()_\taps\()_v_tbl
        .word 160b  - \type\()_\taps\()_v_tbl
        .word 80b   - \type\()_\taps\()_v_tbl
        .word 40b   - \type\()_\taps\()_v_tbl
        .word 20b   - \type\()_\taps\()_v_tbl
endjumptable
2043
2044function L(\type\()_\taps\()_hv)
2045        cmp             \h,  #4
2046        ubfx            w9,  \my, #7, #7
2047        and             \my, \my, #0x7f
2048        b.le            4f
2049        mov             \my,  w9
20504:
2051        add             \xmy,  x10, \my, uxtw #3
2052
2053        movrel          x9,  \type\()_\taps\()_hv_tbl
2054        ldrsw           x8,  [x9, x8, lsl #2]
2055        add             x9,  x9,  x8
2056        br              x9
2057
205820:
2059        AARCH64_VALID_JUMP_TARGET
2060.ifc \type, put
2061        ldur            s0,  [\xmx, #2]
2062        b.gt            280f
2063        ldur            s1,  [\xmy, #2]
2064
2065        // 2x2, 2x4 hv
2066        sub             \sr2, \src, #1
2067        sub             \src, \sr2, \s_strd
2068        add             \ds2, \dst, \d_strd
2069        lsl             \s_strd, \s_strd, #1
2070        lsl             \d_strd, \d_strd, #1
2071        sxtl            v0.8h,  v0.8b
2072        sxtl            v1.8h,  v1.8b
2073        mov             x15, x30
2074
2075        ld1             {v28.8b}, [\src], \s_strd
2076        uxtl            v28.8h,  v28.8b
2077        ext             v29.16b, v28.16b, v28.16b, #2
2078        mul             v28.4h,  v28.4h,  v0.4h
2079        mul             v29.4h,  v29.4h,  v0.4h
2080        addp            v28.4h,  v28.4h,  v29.4h
2081        addp            v16.4h,  v28.4h,  v28.4h
2082        srshr           v16.4h,  v16.4h,  #2
2083        bl              L(\type\()_\taps\()_filter_2)
2084
2085        trn1            v16.2s, v16.2s, v28.2s
2086        mov             v17.8b, v28.8b
2087
20882:
2089        bl              L(\type\()_\taps\()_filter_2)
2090
2091        ext             v18.8b, v17.8b, v28.8b, #4
2092        smull           v2.4s,  v16.4h, v1.h[0]
2093        smlal           v2.4s,  v17.4h, v1.h[1]
2094        smlal           v2.4s,  v18.4h, v1.h[2]
2095        smlal           v2.4s,  v28.4h, v1.h[3]
2096
2097        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2098        sqxtun          v2.8b,  v2.8h
2099        subs            \h,  \h,  #2
2100        st1             {v2.h}[0], [\dst], \d_strd
2101        st1             {v2.h}[1], [\ds2], \d_strd
2102        b.le            0f
2103        mov             v16.8b, v18.8b
2104        mov             v17.8b, v28.8b
2105        b               2b
2106
2107280:    // 2x8, 2x16, 2x32 hv
2108        ld1             {v1.8b},  [\xmy]
2109        sub             \src, \src, #1
2110        sub             \sr2, \src, \s_strd, lsl #1
2111        sub             \src, \sr2, \s_strd
2112        add             \ds2, \dst, \d_strd
2113        lsl             \s_strd, \s_strd, #1
2114        lsl             \d_strd, \d_strd, #1
2115        sxtl            v0.8h,  v0.8b
2116        sxtl            v1.8h,  v1.8b
2117        mov             x15, x30
2118
2119        ld1             {v28.8b}, [\src], \s_strd
2120        uxtl            v28.8h,  v28.8b
2121        ext             v29.16b, v28.16b, v28.16b, #2
2122        mul             v28.4h,  v28.4h,  v0.4h
2123        mul             v29.4h,  v29.4h,  v0.4h
2124        addp            v28.4h,  v28.4h,  v29.4h
2125        addp            v16.4h,  v28.4h,  v28.4h
2126        srshr           v16.4h,  v16.4h,  #2
2127
2128        bl              L(\type\()_\taps\()_filter_2)
2129        trn1            v16.2s, v16.2s, v28.2s
2130        mov             v17.8b, v28.8b
2131        bl              L(\type\()_\taps\()_filter_2)
2132        ext             v18.8b, v17.8b, v28.8b, #4
2133        mov             v19.8b, v28.8b
2134        bl              L(\type\()_\taps\()_filter_2)
2135        ext             v20.8b, v19.8b, v28.8b, #4
2136        mov             v21.8b, v28.8b
2137
213828:
2139        bl              L(\type\()_\taps\()_filter_2)
2140        ext             v22.8b, v21.8b, v28.8b, #4
2141.ifc \taps, 6tap
2142        smull           v2.4s,  v17.4h, v1.h[1]
2143        smlal           v2.4s,  v18.4h, v1.h[2]
2144        smlal           v2.4s,  v19.4h, v1.h[3]
2145        smlal           v2.4s,  v20.4h, v1.h[4]
2146        smlal           v2.4s,  v21.4h, v1.h[5]
2147        smlal           v2.4s,  v22.4h, v1.h[6]
2148.else   // 8tap
2149        smull           v2.4s,  v16.4h, v1.h[0]
2150        smlal           v2.4s,  v17.4h, v1.h[1]
2151        smlal           v2.4s,  v18.4h, v1.h[2]
2152        smlal           v2.4s,  v19.4h, v1.h[3]
2153        smlal           v2.4s,  v20.4h, v1.h[4]
2154        smlal           v2.4s,  v21.4h, v1.h[5]
2155        smlal           v2.4s,  v22.4h, v1.h[6]
2156        smlal           v2.4s,  v28.4h, v1.h[7]
2157.endif
2158
2159        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2160        sqxtun          v2.8b,  v2.8h
2161        subs            \h,  \h,  #2
2162        st1             {v2.h}[0], [\dst], \d_strd
2163        st1             {v2.h}[1], [\ds2], \d_strd
2164        b.le            0f
2165        mov             v16.8b, v18.8b
2166        mov             v17.8b, v19.8b
2167        mov             v18.8b, v20.8b
2168        mov             v19.8b, v21.8b
2169        mov             v20.8b, v22.8b
2170        mov             v21.8b, v28.8b
2171        b               28b
2172
21730:
2174        ret             x15
2175
2176L(\type\()_\taps\()_filter_2):
2177        ld1             {v28.8b},  [\sr2], \s_strd
2178        ld1             {v30.8b},  [\src], \s_strd
2179        uxtl            v28.8h,  v28.8b
2180        uxtl            v30.8h,  v30.8b
2181        ext             v29.16b, v28.16b, v28.16b, #2
2182        ext             v31.16b, v30.16b, v30.16b, #2
2183        trn1            v27.2s,  v28.2s,  v30.2s
2184        trn2            v30.2s,  v28.2s,  v30.2s
2185        trn1            v28.2s,  v29.2s,  v31.2s
2186        trn2            v31.2s,  v29.2s,  v31.2s
2187        mul             v27.4h,  v27.4h,  v0.h[0]
2188        mla             v27.4h,  v28.4h,  v0.h[1]
2189        mla             v27.4h,  v30.4h,  v0.h[2]
2190        mla             v27.4h,  v31.4h,  v0.h[3]
2191        srshr           v28.4h,  v27.4h,  #2
2192        ret
2193.endif
2194
219540:
2196        AARCH64_VALID_JUMP_TARGET
2197        ldur            s0,  [\xmx, #2]
2198        b.gt            480f
2199        ldur            s1,  [\xmy, #2]
2200        sub             \sr2, \src, #1
2201        sub             \src, \sr2, \s_strd
2202        add             \ds2, \dst, \d_strd
2203        lsl             \s_strd, \s_strd, #1
2204        lsl             \d_strd, \d_strd, #1
2205        sxtl            v0.8h,  v0.8b
2206        sxtl            v1.8h,  v1.8b
2207        mov             x15, x30
2208
2209        // 4x2, 4x4 hv
2210        ld1             {v26.8b}, [\src], \s_strd
2211        uxtl            v26.8h,  v26.8b
2212        ext             v28.16b, v26.16b, v26.16b, #2
2213        ext             v29.16b, v26.16b, v26.16b, #4
2214        ext             v30.16b, v26.16b, v26.16b, #6
2215        mul             v31.4h,  v26.4h,  v0.h[0]
2216        mla             v31.4h,  v28.4h,  v0.h[1]
2217        mla             v31.4h,  v29.4h,  v0.h[2]
2218        mla             v31.4h,  v30.4h,  v0.h[3]
2219        srshr           v16.4h,  v31.4h,  #2
2220
2221        bl              L(\type\()_\taps\()_filter_4)
2222        mov             v17.8b, v28.8b
2223        mov             v18.8b, v29.8b
2224
22254:
2226        bl              L(\type\()_\taps\()_filter_4)
2227        // Interleaving the mul/mla chains actually hurts performance
2228        // significantly on Cortex A53, thus keeping mul/mla tightly
2229        // chained like this.
2230        smull           v2.4s,  v16.4h, v1.h[0]
2231        smlal           v2.4s,  v17.4h, v1.h[1]
2232        smlal           v2.4s,  v18.4h, v1.h[2]
2233        smlal           v2.4s,  v28.4h, v1.h[3]
2234        smull           v3.4s,  v17.4h, v1.h[0]
2235        smlal           v3.4s,  v18.4h, v1.h[1]
2236        smlal           v3.4s,  v28.4h, v1.h[2]
2237        smlal           v3.4s,  v29.4h, v1.h[3]
2238        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2239        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
2240        subs            \h,  \h,  #2
2241.ifc \type, put
2242        sqxtun          v2.8b,  v2.8h
2243        sqxtun          v3.8b,  v3.8h
2244        str             s2,  [\dst]
2245        str             s3,  [\ds2]
2246        add             \dst, \dst, \d_strd
2247        add             \ds2, \ds2, \d_strd
2248.else
2249        st1             {v2.4h}, [\dst], \d_strd
2250        st1             {v3.4h}, [\ds2], \d_strd
2251.endif
2252        b.le            0f
2253        mov             v16.8b,  v18.8b
2254        mov             v17.8b,  v28.8b
2255        mov             v18.8b,  v29.8b
2256        b               4b
2257
2258480:    // 4x8, 4x16, 4x32 hv
2259        ld1             {v1.8b},  [\xmy]
2260        sub             \src, \src, #1
2261.ifc \taps, 6tap
2262        sub             \sr2, \src, \s_strd
2263        sub             \src, \src, \s_strd, lsl #1
2264.else
2265        sub             \sr2, \src, \s_strd, lsl #1
2266        sub             \src, \sr2, \s_strd
2267.endif
2268        add             \ds2, \dst, \d_strd
2269        lsl             \s_strd, \s_strd, #1
2270        lsl             \d_strd, \d_strd, #1
2271        sxtl            v0.8h,  v0.8b
2272        sxtl            v1.8h,  v1.8b
2273        mov             x15, x30
2274
2275        ld1             {v26.8b}, [\src], \s_strd
2276        uxtl            v26.8h,  v26.8b
2277        ext             v28.16b, v26.16b, v26.16b, #2
2278        ext             v29.16b, v26.16b, v26.16b, #4
2279        ext             v30.16b, v26.16b, v26.16b, #6
2280        mul             v31.4h,  v26.4h,  v0.h[0]
2281        mla             v31.4h,  v28.4h,  v0.h[1]
2282        mla             v31.4h,  v29.4h,  v0.h[2]
2283        mla             v31.4h,  v30.4h,  v0.h[3]
2284.ifc \taps, 6tap
2285        srshr           v18.4h,  v31.4h,  #2
2286.else
2287        srshr           v16.4h,  v31.4h,  #2
2288
2289        bl              L(\type\()_\taps\()_filter_4)
2290        mov             v17.8b, v28.8b
2291        mov             v18.8b, v29.8b
2292.endif
2293        bl              L(\type\()_\taps\()_filter_4)
2294        mov             v19.8b, v28.8b
2295        mov             v20.8b, v29.8b
2296        bl              L(\type\()_\taps\()_filter_4)
2297        mov             v21.8b, v28.8b
2298        mov             v22.8b, v29.8b
2299
230048:
2301        bl              L(\type\()_\taps\()_filter_4)
2302.ifc \taps, 6tap
2303        smull           v2.4s,  v18.4h, v1.h[1]
2304        smlal           v2.4s,  v19.4h, v1.h[2]
2305        smlal           v2.4s,  v20.4h, v1.h[3]
2306        smlal           v2.4s,  v21.4h, v1.h[4]
2307        smlal           v2.4s,  v22.4h, v1.h[5]
2308        smlal           v2.4s,  v28.4h, v1.h[6]
2309        smull           v3.4s,  v19.4h, v1.h[1]
2310        smlal           v3.4s,  v20.4h, v1.h[2]
2311        smlal           v3.4s,  v21.4h, v1.h[3]
2312        smlal           v3.4s,  v22.4h, v1.h[4]
2313        smlal           v3.4s,  v28.4h, v1.h[5]
2314        smlal           v3.4s,  v29.4h, v1.h[6]
2315.else   // 8tap
2316        smull           v2.4s,  v16.4h, v1.h[0]
2317        smlal           v2.4s,  v17.4h, v1.h[1]
2318        smlal           v2.4s,  v18.4h, v1.h[2]
2319        smlal           v2.4s,  v19.4h, v1.h[3]
2320        smlal           v2.4s,  v20.4h, v1.h[4]
2321        smlal           v2.4s,  v21.4h, v1.h[5]
2322        smlal           v2.4s,  v22.4h, v1.h[6]
2323        smlal           v2.4s,  v28.4h, v1.h[7]
2324        smull           v3.4s,  v17.4h, v1.h[0]
2325        smlal           v3.4s,  v18.4h, v1.h[1]
2326        smlal           v3.4s,  v19.4h, v1.h[2]
2327        smlal           v3.4s,  v20.4h, v1.h[3]
2328        smlal           v3.4s,  v21.4h, v1.h[4]
2329        smlal           v3.4s,  v22.4h, v1.h[5]
2330        smlal           v3.4s,  v28.4h, v1.h[6]
2331        smlal           v3.4s,  v29.4h, v1.h[7]
2332.endif
2333        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2334        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
2335        subs            \h,  \h,  #2
2336.ifc \type, put
2337        sqxtun          v2.8b,  v2.8h
2338        sqxtun          v3.8b,  v3.8h
2339        str             s2,  [\dst]
2340        str             s3,  [\ds2]
2341        add             \dst, \dst, \d_strd
2342        add             \ds2, \ds2, \d_strd
2343.else
2344        st1             {v2.4h}, [\dst], \d_strd
2345        st1             {v3.4h}, [\ds2], \d_strd
2346.endif
2347        b.le            0f
2348.ifc \taps, 8tap
2349        mov             v16.8b,  v18.8b
2350        mov             v17.8b,  v19.8b
2351.endif
2352        mov             v18.8b,  v20.8b
2353        mov             v19.8b,  v21.8b
2354        mov             v20.8b,  v22.8b
2355        mov             v21.8b,  v28.8b
2356        mov             v22.8b,  v29.8b
2357        b               48b
23580:
2359        ret             x15
2360
// Helper: horizontal filter for two 4-wide rows.
// Reads one row via \sr2 and one via \src (both advanced by \s_strd),
// widens the pixels to 16 bit and applies the four coefficients in
// v0.h[0-3], using ext to form the 1/2/3-pixel-shifted inputs.
// Intermediate rounding is srshr #2.  Results: v28.4h (row 0), v29.4h
// (row 1).  Clobbers v26-v31.
2361L(\type\()_\taps\()_filter_4):
2362        ld1             {v26.8b}, [\sr2], \s_strd
2363        ld1             {v27.8b}, [\src], \s_strd
2364        uxtl            v26.8h,  v26.8b          // widen row 0 to u16
2365        uxtl            v27.8h,  v27.8b          // widen row 1 to u16
2366        ext             v28.16b, v26.16b, v26.16b, #2  // row 0 >> 1 px
2367        ext             v29.16b, v26.16b, v26.16b, #4  // row 0 >> 2 px
2368        ext             v30.16b, v26.16b, v26.16b, #6  // row 0 >> 3 px
2369        mul             v31.4h,  v26.4h,  v0.h[0]
2370        mla             v31.4h,  v28.4h,  v0.h[1]
2371        mla             v31.4h,  v29.4h,  v0.h[2]
2372        mla             v31.4h,  v30.4h,  v0.h[3]
2373        ext             v28.16b, v27.16b, v27.16b, #2  // same shifts, row 1
2374        ext             v29.16b, v27.16b, v27.16b, #4
2375        ext             v30.16b, v27.16b, v27.16b, #6
2376        mul             v27.4h,  v27.4h,  v0.h[0]
2377        mla             v27.4h,  v28.4h,  v0.h[1]
2378        mla             v27.4h,  v29.4h,  v0.h[2]
2379        mla             v27.4h,  v30.4h,  v0.h[3]
2380        srshr           v28.4h,  v31.4h,  #2     // intermediate rounding
2381        srshr           v29.4h,  v27.4h,  #2
2382        ret
2383
// hv entry for 8/16/32-wide blocks; taller cases branch ahead to 880f.
// Loads the full horizontal filter from \xmx into v0 and four vertical
// coefficients from \xmy+2 into s1 (v1.h[0-3] are used below, i.e. a
// 4-tap vertical pass).  x15 saves the return address across the bl
// calls to the filter_8 helpers; \my keeps the original \h so the
// width loop at 9: can restore it per 8-column strip.
238480:
2385160:
2386320:
2387        AARCH64_VALID_JUMP_TARGET
2388        b.gt            880f
2389        ld1             {v0.8b},  [\xmx]
2390        ldur            s1,  [\xmy, #2]          // 4 vertical taps (offset 2)
2391.ifc \taps, 6tap
2392        sub             \src,  \src,  #2
2393.else
2394        sub             \src,  \src,  #3
2395.endif
2396        sub             \src,  \src,  \s_strd    // back up 1 row for the v-filter
2397        sxtl            v0.8h,  v0.8b
2398        sxtl            v1.8h,  v1.8b
2399        mov             x15, x30                 // save lr across bl below
2400        mov             \my,  \h
2401
2402164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2403        add             \ds2,  \dst,  \d_strd
2404        add             \sr2,  \src,  \s_strd
2405        lsl             \d_strd, \d_strd, #1
2406        lsl             \s_strd, \s_strd, #1
2407
        // Prime the vertical pipeline: v16-v18 hold three filtered rows.
2408        bl              L(\type\()_\taps\()_filter_8_first)
2409        bl              L(\type\()_\taps\()_filter_8)
2410        mov             v17.16b, v24.16b
2411        mov             v18.16b, v25.16b
2412
24138:
        // Two output rows per iteration; filter_8 supplies the next two
        // horizontally filtered rows in v24/v25.
2414        smull           v2.4s,  v16.4h, v1.h[0]
2415        smull2          v3.4s,  v16.8h, v1.h[0]
2416        bl              L(\type\()_\taps\()_filter_8)
2417        smull           v4.4s,  v17.4h, v1.h[0]
2418        smull2          v5.4s,  v17.8h, v1.h[0]
2419        smlal           v2.4s,  v17.4h, v1.h[1]
2420        smlal2          v3.4s,  v17.8h, v1.h[1]
2421        smlal           v4.4s,  v18.4h, v1.h[1]
2422        smlal2          v5.4s,  v18.8h, v1.h[1]
2423        smlal           v2.4s,  v18.4h, v1.h[2]
2424        smlal2          v3.4s,  v18.8h, v1.h[2]
2425        smlal           v4.4s,  v24.4h, v1.h[2]
2426        smlal2          v5.4s,  v24.8h, v1.h[2]
2427        smlal           v2.4s,  v24.4h, v1.h[3]
2428        smlal2          v3.4s,  v24.8h, v1.h[3]
2429        smlal           v4.4s,  v25.4h, v1.h[3]
2430        smlal2          v5.4s,  v25.8h, v1.h[3]
2431        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2432        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
2433        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
2434        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
2435        subs            \h,  \h,  #2
2436.ifc \type, put
2437        sqxtun          v2.8b,  v2.8h            // narrow to pixels for put
2438        sqxtun          v4.8b,  v4.8h
2439        st1             {v2.8b}, [\dst], \d_strd
2440        st1             {v4.8b}, [\ds2], \d_strd
2441.else
2442        st1             {v2.8h}, [\dst], \d_strd // prep stores 16-bit
2443        st1             {v4.8h}, [\ds2], \d_strd
2444.endif
2445        b.le            9f
2446        mov             v16.16b, v18.16b         // slide the row window down
2447        mov             v17.16b, v24.16b
2448        mov             v18.16b, v25.16b
2449        b               8b
24509:
        // Advance to the next 8-column strip, undoing the stride doubling
        // and rewinding src/dst by \my rows (\xmy holds that row count here).
2451        subs            \w,  \w,  #8
2452        b.le            0f
2453        asr             \s_strd,  \s_strd,  #1
2454        asr             \d_strd,  \d_strd,  #1
2455        msub            \src,  \s_strd,  \xmy,  \src
2456        msub            \dst,  \d_strd,  \xmy,  \dst
2457        sub             \src,  \src,  \s_strd,  lsl #2
2458        mov             \h,  \my
2459        add             \src,  \src,  #8
2460.ifc \type, put
2461        add             \dst,  \dst,  #8
2462.else
2463        add             \dst,  \dst,  #16        // prep output is 2 bytes/px
2464.endif
2465        b               164b
2466
// hv path for 8+-wide, 8+-tall blocks: full vertical filter loaded from
// \xmy into v1 (6tap uses v1.h[1-6], 8tap uses v1.h[0-7]).  The row
// window v16-v22 holds the horizontally filtered rows; filter_8 refills
// v24/v25 with two new rows per call.  x15 saves lr across the bl calls.
2467880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2468640:
24691280:
2470        AARCH64_VALID_JUMP_TARGET
2471        ld1             {v0.8b},  [\xmx]         // horizontal taps
2472        ld1             {v1.8b},  [\xmy]         // vertical taps
2473.ifc \taps, 6tap
2474        sub             \src,  \src,  #2
2475.else
2476        sub             \src,  \src,  #3
2477        sub             \src,  \src,  \s_strd    // 8tap needs one more row above
2478.endif
2479        sub             \src,  \src,  \s_strd, lsl #1
2480        sxtl            v0.8h,  v0.8b
2481        sxtl            v1.8h,  v1.8b
2482        mov             x15, x30                 // save lr across bl below
2483        mov             \my,  \h
2484
2485168:
2486        add             \ds2,  \dst,  \d_strd
2487        add             \sr2,  \src,  \s_strd
2488        lsl             \d_strd, \d_strd, #1
2489        lsl             \s_strd, \s_strd, #1
2490
        // Prime the vertical pipeline (6tap needs two fewer context rows,
        // so v18 is just duplicated from the first filtered row).
2491        bl              L(\type\()_\taps\()_filter_8_first)
2492.ifc \taps, 6tap
2493        mov             v18.16b, v16.16b
2494.else
2495        bl              L(\type\()_\taps\()_filter_8)
2496        mov             v17.16b, v24.16b
2497        mov             v18.16b, v25.16b
2498.endif
2499        bl              L(\type\()_\taps\()_filter_8)
2500        mov             v19.16b, v24.16b
2501        mov             v20.16b, v25.16b
2502        bl              L(\type\()_\taps\()_filter_8)
2503        mov             v21.16b, v24.16b
2504        mov             v22.16b, v25.16b
2505
250688:
        // Vertical MAC over the row window; two output rows (v2/v3 and
        // v4/v5 as 32-bit accumulators) per iteration.
2507.ifc \taps, 6tap
2508        smull           v2.4s,  v18.4h, v1.h[1]
2509        smull2          v3.4s,  v18.8h, v1.h[1]
2510        bl              L(\type\()_\taps\()_filter_8)
2511        smull           v4.4s,  v19.4h, v1.h[1]
2512        smull2          v5.4s,  v19.8h, v1.h[1]
2513        smlal           v2.4s,  v19.4h, v1.h[2]
2514        smlal2          v3.4s,  v19.8h, v1.h[2]
2515        smlal           v4.4s,  v20.4h, v1.h[2]
2516        smlal2          v5.4s,  v20.8h, v1.h[2]
2517        smlal           v2.4s,  v20.4h, v1.h[3]
2518        smlal2          v3.4s,  v20.8h, v1.h[3]
2519        smlal           v4.4s,  v21.4h, v1.h[3]
2520        smlal2          v5.4s,  v21.8h, v1.h[3]
2521        smlal           v2.4s,  v21.4h, v1.h[4]
2522        smlal2          v3.4s,  v21.8h, v1.h[4]
2523        smlal           v4.4s,  v22.4h, v1.h[4]
2524        smlal2          v5.4s,  v22.8h, v1.h[4]
2525        smlal           v2.4s,  v22.4h, v1.h[5]
2526        smlal2          v3.4s,  v22.8h, v1.h[5]
2527        smlal           v4.4s,  v24.4h, v1.h[5]
2528        smlal2          v5.4s,  v24.8h, v1.h[5]
2529        smlal           v2.4s,  v24.4h, v1.h[6]
2530        smlal2          v3.4s,  v24.8h, v1.h[6]
2531        smlal           v4.4s,  v25.4h, v1.h[6]
2532        smlal2          v5.4s,  v25.8h, v1.h[6]
2533.else   // 8tap
2534        smull           v2.4s,  v16.4h, v1.h[0]
2535        smull2          v3.4s,  v16.8h, v1.h[0]
2536        bl              L(\type\()_\taps\()_filter_8)
2537        smull           v4.4s,  v17.4h, v1.h[0]
2538        smull2          v5.4s,  v17.8h, v1.h[0]
2539        smlal           v2.4s,  v17.4h, v1.h[1]
2540        smlal2          v3.4s,  v17.8h, v1.h[1]
2541        smlal           v4.4s,  v18.4h, v1.h[1]
2542        smlal2          v5.4s,  v18.8h, v1.h[1]
2543        smlal           v2.4s,  v18.4h, v1.h[2]
2544        smlal2          v3.4s,  v18.8h, v1.h[2]
2545        smlal           v4.4s,  v19.4h, v1.h[2]
2546        smlal2          v5.4s,  v19.8h, v1.h[2]
2547        smlal           v2.4s,  v19.4h, v1.h[3]
2548        smlal2          v3.4s,  v19.8h, v1.h[3]
2549        smlal           v4.4s,  v20.4h, v1.h[3]
2550        smlal2          v5.4s,  v20.8h, v1.h[3]
2551        smlal           v2.4s,  v20.4h, v1.h[4]
2552        smlal2          v3.4s,  v20.8h, v1.h[4]
2553        smlal           v4.4s,  v21.4h, v1.h[4]
2554        smlal2          v5.4s,  v21.8h, v1.h[4]
2555        smlal           v2.4s,  v21.4h, v1.h[5]
2556        smlal2          v3.4s,  v21.8h, v1.h[5]
2557        smlal           v4.4s,  v22.4h, v1.h[5]
2558        smlal2          v5.4s,  v22.8h, v1.h[5]
2559        smlal           v2.4s,  v22.4h, v1.h[6]
2560        smlal2          v3.4s,  v22.8h, v1.h[6]
2561        smlal           v4.4s,  v24.4h, v1.h[6]
2562        smlal2          v5.4s,  v24.8h, v1.h[6]
2563        smlal           v2.4s,  v24.4h, v1.h[7]
2564        smlal2          v3.4s,  v24.8h, v1.h[7]
2565        smlal           v4.4s,  v25.4h, v1.h[7]
2566        smlal2          v5.4s,  v25.8h, v1.h[7]
2567.endif
2568        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
2569        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
2570        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
2571        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
2572        subs            \h,  \h,  #2
2573.ifc \type, put
2574        sqxtun          v2.8b,  v2.8h            // narrow to pixels for put
2575        sqxtun          v4.8b,  v4.8h
2576        st1             {v2.8b}, [\dst], \d_strd
2577        st1             {v4.8b}, [\ds2], \d_strd
2578.else
2579        st1             {v2.8h}, [\dst], \d_strd // prep stores 16-bit
2580        st1             {v4.8h}, [\ds2], \d_strd
2581.endif
2582        b.le            9f
        // Slide the row window down by two rows.
2583.ifc \taps, 8tap
2584        mov             v16.16b, v18.16b
2585        mov             v17.16b, v19.16b
2586.endif
2587        mov             v18.16b, v20.16b
2588        mov             v19.16b, v21.16b
2589        mov             v20.16b, v22.16b
2590        mov             v21.16b, v24.16b
2591        mov             v22.16b, v25.16b
2592        b               88b
25939:
        // Next 8-column strip: halve the doubled strides, rewind by
        // \my rows (row count kept in \xmy here) plus the filter context.
2594        subs            \w,  \w,  #8
2595        b.le            0f
2596        asr             \s_strd,  \s_strd,  #1
2597        asr             \d_strd,  \d_strd,  #1
2598        msub            \src,  \s_strd,  \xmy,  \src
2599        msub            \dst,  \d_strd,  \xmy,  \dst
2600        sub             \src,  \src,  \s_strd,  lsl #3
2601        mov             \h,  \my
2602        add             \src,  \src,  #8
2603.ifc \type, put
2604        add             \dst,  \dst,  #8
2605.else
2606        add             \dst,  \dst,  #16        // prep output is 2 bytes/px
2607.endif
2608.ifc \taps, 6tap
2609        add             \src,  \src,  \s_strd,  lsl #1  // 6tap primed 2 fewer rows
2610.endif
2611        b               168b
26120:
2613        ret             x15
2614
// Helper: horizontal filter for the first 8-wide row (single row, from
// \src only).  Widens 16 input pixels (v28/v29) to u16, multiplies each
// 1-pixel-shifted window (via ext, 2 bytes per pixel after widening) by
// the matching coefficient in v0, then rounds with srshr #2.
// Result: v16.8h.  Clobbers v24-v29.
2615L(\type\()_\taps\()_filter_8_first):
2616        ld1             {v28.8b, v29.8b},  [\src], \s_strd
2617        uxtl            v28.8h,  v28.8b
2618        uxtl            v29.8h,  v29.8b
2619.ifc \taps, 6tap
        // 6tap uses taps v0.h[1..6] (the outer two taps are zero-weight).
2620        mul             v16.8h,  v28.8h,  v0.h[1]
2621        ext             v25.16b, v28.16b, v29.16b, #(2*1)
2622        ext             v26.16b, v28.16b, v29.16b, #(2*2)
2623        ext             v27.16b, v28.16b, v29.16b, #(2*3)
2624        mla             v16.8h,  v25.8h,  v0.h[2]
2625        mla             v16.8h,  v26.8h,  v0.h[3]
2626        mla             v16.8h,  v27.8h,  v0.h[4]
2627        ext             v24.16b, v28.16b, v29.16b, #(2*4)
2628        ext             v25.16b, v28.16b, v29.16b, #(2*5)
2629        mla             v16.8h,  v24.8h,  v0.h[5]
2630        mla             v16.8h,  v25.8h,  v0.h[6]
2631.else   // 8tap
2632        mul             v16.8h,  v28.8h,  v0.h[0]
2633        ext             v24.16b, v28.16b, v29.16b, #(2*1)
2634        ext             v25.16b, v28.16b, v29.16b, #(2*2)
2635        ext             v26.16b, v28.16b, v29.16b, #(2*3)
2636        ext             v27.16b, v28.16b, v29.16b, #(2*4)
2637        mla             v16.8h,  v24.8h,  v0.h[1]
2638        mla             v16.8h,  v25.8h,  v0.h[2]
2639        mla             v16.8h,  v26.8h,  v0.h[3]
2640        mla             v16.8h,  v27.8h,  v0.h[4]
2641        ext             v24.16b, v28.16b, v29.16b, #(2*5)
2642        ext             v25.16b, v28.16b, v29.16b, #(2*6)
2643        ext             v26.16b, v28.16b, v29.16b, #(2*7)
2644        mla             v16.8h,  v24.8h,  v0.h[5]
2645        mla             v16.8h,  v25.8h,  v0.h[6]
2646        mla             v16.8h,  v26.8h,  v0.h[7]
2647.endif
2648        srshr           v16.8h,  v16.8h,  #2     // intermediate rounding
2649        ret
2650
// Helper: horizontal filter for two 8-wide rows (one via \sr2, one via
// \src, both advanced by \s_strd).  Same scheme as filter_8_first but
// produces two rows; the .irpc loop emits one ext+mla pair per tap.
// Results: v24.8h (row 0), v25.8h (row 1).  Clobbers v26-v31.
2651L(\type\()_\taps\()_filter_8):
2652        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
2653        ld1             {v30.8b, v31.8b},  [\src], \s_strd
2654        uxtl            v28.8h,  v28.8b
2655        uxtl            v29.8h,  v29.8b
2656        uxtl            v30.8h,  v30.8b
2657        uxtl            v31.8h,  v31.8b
2658.ifc \taps, 6tap
        // 6tap: taps v0.h[1..6]; pixel offsets 0..5 relative to v28/v30.
2659        mul             v24.8h,  v28.8h,  v0.h[1]
2660        mul             v25.8h,  v30.8h,  v0.h[1]
2661    .irpc i, 23456
2662        ext             v26.16b, v28.16b, v29.16b, #(2*\i-2)
2663        ext             v27.16b, v30.16b, v31.16b, #(2*\i-2)
2664        mla             v24.8h,  v26.8h,  v0.h[\i]
2665        mla             v25.8h,  v27.8h,  v0.h[\i]
2666    .endr
2667.else   // 8tap
2668        mul             v24.8h,  v28.8h,  v0.h[0]
2669        mul             v25.8h,  v30.8h,  v0.h[0]
2670    .irpc i, 1234567
2671        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
2672        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
2673        mla             v24.8h,  v26.8h,  v0.h[\i]
2674        mla             v25.8h,  v27.8h,  v0.h[\i]
2675    .endr
2676.endif
2677        srshr           v24.8h,  v24.8h, #2      // intermediate rounding
2678        srshr           v25.8h,  v25.8h, #2
2679        ret
2680endfunc
2681
// Jump table for the hv path, indexed by log2 block width (descending:
// 128..2); entries are offsets relative to the table base.
2682jumptable \type\()_\taps\()_hv_tbl
2683        .word 1280b - \type\()_\taps\()_hv_tbl
2684        .word 640b  - \type\()_\taps\()_hv_tbl
2685        .word 320b  - \type\()_\taps\()_hv_tbl
2686        .word 160b  - \type\()_\taps\()_hv_tbl
2687        .word 80b   - \type\()_\taps\()_hv_tbl
2688        .word 40b   - \type\()_\taps\()_hv_tbl
2689        .word 20b   - \type\()_\taps\()_hv_tbl
2690endjumptable
2691.endm
2692
2693
2694.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
// Bilinear MC entry point.  Weight setup: v0 = 16-mx, v1 = mx
// (horizontal pair), v2 = 16-my, v3 = my (vertical pair); each pair of
// weights sums to 16, so a 1D pass normalizes with a rounding shift by 4.
// w8 = clz(w)-24 indexes the per-width jump tables.  Dispatches to the
// h / v / hv variants, or to plain copy (\type\()_neon) when mx=my=0.
2695function \type\()_bilin_8bpc_neon, export=1
2696        dup             v1.16b, \mx
2697        dup             v3.16b, \my
2698        mov             w9,  #16
2699        sub             w8, w9, \mx              // 16-mx
2700        sub             w9, w9, \my              // 16-my
2701        dup             v0.16b, w8
2702        dup             v2.16b, w9
2703.ifc \type, prep
2704        uxtw            \d_strd, \w              // prep: packed output, stride = 2*w
2705        lsl             \d_strd, \d_strd, #1
2706.endif
2707
2708        clz             w8,  \w
2709        sub             w8,  w8,  #24            // jump-table index from log2(w)
2710        cbnz            \mx, L(\type\()_bilin_h)
2711        cbnz            \my, L(\type\()_bilin_v)
2712        b               \type\()_neon            // no filtering: plain copy
2713
2714L(\type\()_bilin_h):
2715        cbnz            \my, L(\type\()_bilin_hv)
2716
2717        movrel          x9,  \type\()_bilin_h_tbl
2718        ldrsw           x8,  [x9, x8, lsl #2]
2719        add             x9,  x9,  x8
2720        br              x9
2721
// 2xN horizontal bilin (put only; prep never has 2-wide blocks).
// Two rows per iteration, packed into one vector via trn1; the blend is
// pixel*v0 + neighbor*v1 with a rounding shift by 4 (weights sum to 16).
272220:     // 2xN h
2723        AARCH64_VALID_JUMP_TARGET
2724.ifc \type, put
2725        add             \ds2,  \dst,  \d_strd
2726        add             \sr2,  \src,  \s_strd
2727        lsl             \d_strd,  \d_strd,  #1
2728        lsl             \s_strd,  \s_strd,  #1
27292:
2730        ld1r            {v4.4s},  [\src], \s_strd
2731        ld1r            {v6.4s},  [\sr2], \s_strd
2732        ext             v5.8b,  v4.8b,  v4.8b, #1      // right neighbors
2733        ext             v7.8b,  v6.8b,  v6.8b, #1
2734        trn1            v4.4h,  v4.4h,  v6.4h          // interleave both rows
2735        trn1            v5.4h,  v5.4h,  v7.4h
2736        subs            \h,  \h,  #2
2737        umull           v4.8h,  v4.8b,  v0.8b
2738        umlal           v4.8h,  v5.8b,  v1.8b
2739        uqrshrn         v4.8b,  v4.8h,  #4             // /16 with rounding
2740        st1             {v4.h}[0], [\dst], \d_strd
2741        st1             {v4.h}[1], [\ds2], \d_strd
2742        b.gt            2b
2743        ret
2744.endif
2745
// 4xN horizontal bilin, two rows per iteration.  put narrows back to
// pixels (uqrshrn #4); prep stores the unshifted 16-bit intermediates.
274640:     // 4xN h
2747        AARCH64_VALID_JUMP_TARGET
2748        add             \ds2,  \dst,  \d_strd
2749        add             \sr2,  \src,  \s_strd
2750        lsl             \d_strd,  \d_strd,  #1
2751        lsl             \s_strd,  \s_strd,  #1
27524:
2753        ld1             {v4.8b}, [\src], \s_strd
2754        ld1             {v6.8b}, [\sr2], \s_strd
2755        ext             v5.8b,  v4.8b,  v4.8b, #1      // right neighbors
2756        ext             v7.8b,  v6.8b,  v6.8b, #1
2757        trn1            v4.2s,  v4.2s,  v6.2s          // pack both rows
2758        trn1            v5.2s,  v5.2s,  v7.2s
2759        subs            \h,  \h,  #2
2760        umull           v4.8h,  v4.8b,  v0.8b
2761        umlal           v4.8h,  v5.8b,  v1.8b
2762.ifc \type, put
2763        uqrshrn         v4.8b,  v4.8h,  #4             // /16 with rounding
2764        st1             {v4.s}[0], [\dst], \d_strd
2765        st1             {v4.s}[1], [\ds2], \d_strd
2766.else
2767        st1             {v4.8b},   [\dst], \d_strd
2768        st1             {v4.d}[1], [\ds2], \d_strd
2769.endif
2770        b.gt            4b
2771        ret
2772
// 8xN horizontal bilin, two rows per iteration, one vector per row.
277380:     // 8xN h
2774        AARCH64_VALID_JUMP_TARGET
2775        add             \ds2,  \dst,  \d_strd
2776        add             \sr2,  \src,  \s_strd
2777        lsl             \d_strd,  \d_strd,  #1
2778        lsl             \s_strd,  \s_strd,  #1
27798:
2780        ld1             {v4.16b}, [\src], \s_strd
2781        ld1             {v6.16b}, [\sr2], \s_strd
2782        ext             v5.16b, v4.16b, v4.16b, #1     // right neighbors
2783        ext             v7.16b, v6.16b, v6.16b, #1
2784        subs            \h,  \h,  #2
2785        umull           v4.8h,  v4.8b,  v0.8b
2786        umull           v6.8h,  v6.8b,  v0.8b
2787        umlal           v4.8h,  v5.8b,  v1.8b
2788        umlal           v6.8h,  v7.8b,  v1.8b
2789.ifc \type, put
2790        uqrshrn         v4.8b,  v4.8h,  #4             // /16 with rounding
2791        uqrshrn         v6.8b,  v6.8h,  #4
2792        st1             {v4.8b}, [\dst], \d_strd
2793        st1             {v6.8b}, [\ds2], \d_strd
2794.else
2795        st1             {v4.8h}, [\dst], \d_strd       // prep: 16-bit output
2796        st1             {v6.8h}, [\ds2], \d_strd
2797.endif
2798        b.gt            8b
2799        ret
// 16+-wide horizontal bilin.  Processes two rows, 16 pixels per inner
// iteration, streaming 16 new bytes per step and using ext #8/#9 to
// pair each pixel with its right neighbor across the 8-byte prefix kept
// in v16/v20.  The strides are pre-adjusted to become end-of-row
// correction terms applied at 9:.
2800160:
2801320:
2802640:
28031280:   // 16xN, 32xN, ... h
2804        AARCH64_VALID_JUMP_TARGET
2805        add             \ds2,  \dst,  \d_strd
2806        add             \sr2,  \src,  \s_strd
2807        lsl             \s_strd,  \s_strd,  #1
2808
2809        sub             \s_strd,  \s_strd,  \w, uxtw   // stride minus bytes consumed
2810        sub             \s_strd,  \s_strd,  #8         // (w + 8-byte preload)
2811.ifc \type, put
2812        lsl             \d_strd,  \d_strd,  #1
2813        sub             \d_strd,  \d_strd,  \w, uxtw   // dst written via post-inc #16
2814.endif
2815161:
        // Preload 8 bytes into the upper half; the inner loop shifts
        // them together with the next 16 via ext.
2816        ld1             {v16.d}[1],  [\src], #8
2817        ld1             {v20.d}[1],  [\sr2], #8
2818        mov             \mx, \w                        // \mx = column countdown
2819
282016:
2821        ld1             {v18.16b},  [\src], #16
2822        ld1             {v22.16b},  [\sr2], #16
2823        ext             v17.16b, v16.16b, v18.16b, #8  // current 16 pixels
2824        ext             v19.16b, v16.16b, v18.16b, #9  // their right neighbors
2825        ext             v21.16b, v20.16b, v22.16b, #8
2826        ext             v23.16b, v20.16b, v22.16b, #9
2827        umull           v16.8h,  v17.8b,  v0.8b
2828        umull2          v17.8h,  v17.16b, v0.16b
2829        umull           v20.8h,  v21.8b,  v0.8b
2830        umull2          v21.8h,  v21.16b, v0.16b
2831        umlal           v16.8h,  v19.8b,  v1.8b
2832        umlal2          v17.8h,  v19.16b, v1.16b
2833        umlal           v20.8h,  v23.8b,  v1.8b
2834        umlal2          v21.8h,  v23.16b, v1.16b
2835        subs            \mx, \mx, #16
2836.ifc \type, put
2837        uqrshrn         v16.8b,  v16.8h, #4            // /16 with rounding
2838        uqrshrn2        v16.16b, v17.8h, #4
2839        uqrshrn         v20.8b,  v20.8h, #4
2840        uqrshrn2        v20.16b, v21.8h, #4
2841        st1             {v16.16b}, [\dst], #16
2842        st1             {v20.16b}, [\ds2], #16
2843.else
2844        st1             {v16.8h, v17.8h}, [\dst], #32  // prep: 16-bit output
2845        st1             {v20.8h, v21.8h}, [\ds2], #32
2846.endif
2847        b.le            9f
2848
2849        mov             v16.16b, v18.16b               // carry pixels to next step
2850        mov             v20.16b, v22.16b
2851        b               16b
2852
28539:
        // End of a row pair: the adjusted strides step to the next pair.
2854        add             \dst,  \dst,  \d_strd
2855        add             \ds2,  \ds2,  \d_strd
2856        add             \src,  \src,  \s_strd
2857        add             \sr2,  \sr2,  \s_strd
2858
2859        subs            \h,  \h,  #2
2860        b.gt            161b
2861        ret
2862endfunc
2863
// Jump table for the horizontal bilin path, indexed by log2 block width
// (descending: 128..2); entries are offsets relative to the table base.
2864jumptable \type\()_bilin_h_tbl
2865        .word 1280b - \type\()_bilin_h_tbl
2866        .word 640b  - \type\()_bilin_h_tbl
2867        .word 320b  - \type\()_bilin_h_tbl
2868        .word 160b  - \type\()_bilin_h_tbl
2869        .word 80b   - \type\()_bilin_h_tbl
2870        .word 40b   - \type\()_bilin_h_tbl
2871        .word 20b   - \type\()_bilin_h_tbl
2872endjumptable
2873
2874
// Vertical bilin: dispatch by width, then the 2xN case (put only).
// Each output row blends two input rows: row_n*v2 + row_n+1*v3,
// normalized with a rounding shift by 4 (weights sum to 16).
2875function L(\type\()_bilin_v)
2876        cmp             \h,  #4                  // flags used at the 2xN target
2877        movrel          x9,  \type\()_bilin_v_tbl
2878        ldrsw           x8,  [x9, x8, lsl #2]
2879        add             x9,  x9,  x8
2880        br              x9
2881
288220:     // 2xN v
2883        AARCH64_VALID_JUMP_TARGET
2884.ifc \type, put
2885        cmp             \h,  #2
2886        add             \ds2,  \dst,  \d_strd
2887        add             \sr2,  \src,  \s_strd
2888        lsl             \s_strd,  \s_strd,  #1
2889        lsl             \d_strd,  \d_strd,  #1
2890
2891        // 2x2 v
2892        ld1r            {v16.8h}, [\src], \s_strd
2893        b.gt            24f                      // h > 2: take the 4-row loop
289422:
2895        ld1r            {v17.8h}, [\sr2], \s_strd
2896        ld1r            {v18.8h}, [\src], \s_strd
2897        trn1            v16.4h, v16.4h, v17.4h   // pair rows 0/1 and 1/2
2898        trn1            v17.4h, v17.4h, v18.4h
2899        umull           v4.8h,  v16.8b,  v2.8b
2900        umlal           v4.8h,  v17.8b,  v3.8b
2901        uqrshrn         v4.8b,  v4.8h,  #4       // /16 with rounding
2902        str             h4,        [\dst]
2903        st1             {v4.h}[1], [\ds2]
2904        ret
290524:     // 2x4, 2x6, 2x8, ... v
2906        ld1r            {v17.8h}, [\sr2], \s_strd
2907        ld1r            {v18.8h}, [\src], \s_strd
2908        ld1r            {v19.8h}, [\sr2], \s_strd
2909        ld1r            {v20.8h}, [\src], \s_strd
2910        sub             \h,  \h,  #4
        // Pack 4 row pairs into v16/v17 so one umull+umlal covers 4 outputs.
2911        trn1            v16.4h, v16.4h, v17.4h
2912        trn1            v17.4h, v17.4h, v18.4h
2913        trn1            v18.4h, v18.4h, v19.4h
2914        trn1            v19.4h, v19.4h, v20.4h
2915        trn1            v16.2s, v16.2s, v18.2s
2916        trn1            v17.2s, v17.2s, v19.2s
2917        umull           v4.8h,  v16.8b,  v2.8b
2918        umlal           v4.8h,  v17.8b,  v3.8b
2919        cmp             \h,  #2
2920        uqrshrn         v4.8b,  v4.8h,  #4       // /16 with rounding
2921        st1             {v4.h}[0], [\dst], \d_strd
2922        st1             {v4.h}[1], [\ds2], \d_strd
2923        st1             {v4.h}[2], [\dst], \d_strd
2924        st1             {v4.h}[3], [\ds2], \d_strd
2925        b.lt            0f                       // h < 2 left: done
2926        mov             v16.8b, v20.8b           // carry last row forward
2927        b.eq            22b                      // exactly 2 left
2928        b               24b
29290:
2930        ret
2931.endif
2932
// 4xN vertical bilin, two output rows per iteration; the last loaded
// row (v18) is carried into the next iteration.
293340:     // 4xN v
2934        AARCH64_VALID_JUMP_TARGET
2935        add             \ds2,  \dst,  \d_strd
2936        add             \sr2,  \src,  \s_strd
2937        lsl             \s_strd,  \s_strd,  #1
2938        lsl             \d_strd,  \d_strd,  #1
2939        ld1r            {v16.4s}, [\src], \s_strd
29404:
2941        ld1r            {v17.4s}, [\sr2], \s_strd
2942        ld1r            {v18.4s}, [\src], \s_strd
2943        trn1            v16.2s, v16.2s, v17.2s   // pair rows n/n+1, n+1/n+2
2944        trn1            v17.2s, v17.2s, v18.2s
2945        umull           v4.8h,  v16.8b,  v2.8b
2946        umlal           v4.8h,  v17.8b,  v3.8b
2947        subs            \h,  \h,  #2
2948.ifc \type, put
2949        uqrshrn         v4.8b,  v4.8h,  #4       // /16 with rounding
2950        st1             {v4.s}[0], [\dst], \d_strd
2951        st1             {v4.s}[1], [\ds2], \d_strd
2952.else
2953        st1             {v4.8b},   [\dst], \d_strd
2954        st1             {v4.d}[1], [\ds2], \d_strd
2955.endif
2956        b.le            0f
2957        mov             v16.8b, v18.8b           // carry last row forward
2958        b               4b
29590:
2960        ret
2961
// 8xN vertical bilin, two output rows per iteration.
296280:     // 8xN v
2963        AARCH64_VALID_JUMP_TARGET
2964        add             \ds2,  \dst,  \d_strd
2965        add             \sr2,  \src,  \s_strd
2966        lsl             \s_strd,  \s_strd,  #1
2967        lsl             \d_strd,  \d_strd,  #1
2968        ld1             {v16.8b}, [\src], \s_strd
29698:
2970        ld1             {v17.8b}, [\sr2], \s_strd
2971        ld1             {v18.8b}, [\src], \s_strd
2972        umull           v4.8h,  v16.8b,  v2.8b   // row n   * (16-my)
2973        umull           v5.8h,  v17.8b,  v2.8b
2974        umlal           v4.8h,  v17.8b,  v3.8b   // + row n+1 * my
2975        umlal           v5.8h,  v18.8b,  v3.8b
2976        subs            \h,  \h,  #2
2977.ifc \type, put
2978        uqrshrn         v4.8b,  v4.8h,  #4       // /16 with rounding
2979        uqrshrn         v5.8b,  v5.8h,  #4
2980        st1             {v4.8b}, [\dst], \d_strd
2981        st1             {v5.8b}, [\ds2], \d_strd
2982.else
2983        st1             {v4.8h}, [\dst], \d_strd // prep: 16-bit output
2984        st1             {v5.8h}, [\ds2], \d_strd
2985.endif
2986        b.le            0f
2987        mov             v16.8b, v18.8b           // carry last row forward
2988        b               8b
29890:
2990        ret
2991
// 16+-wide vertical bilin: processes 16-column strips; after each strip
// the 9: epilogue rewinds src/dst by \my rows (row count in \xmy) and
// steps 16 columns right.
2992160:    // 16xN, 32xN, ...
2993320:
2994640:
29951280:
2996        AARCH64_VALID_JUMP_TARGET
2997        mov             \my,  \h                 // keep h for the strip loop
29981:
2999        add             \ds2, \dst, \d_strd
3000        add             \sr2, \src, \s_strd
3001        lsl             \s_strd, \s_strd, #1
3002        lsl             \d_strd, \d_strd, #1
3003
3004        ld1             {v16.16b}, [\src], \s_strd
30052:
3006        ld1             {v17.16b}, [\sr2], \s_strd
3007        ld1             {v18.16b}, [\src], \s_strd
3008        umull           v4.8h,  v16.8b,  v2.8b   // row n   * (16-my)
3009        umull2          v5.8h,  v16.16b, v2.16b
3010        umull           v6.8h,  v17.8b,  v2.8b
3011        umull2          v7.8h,  v17.16b, v2.16b
3012        umlal           v4.8h,  v17.8b,  v3.8b   // + row n+1 * my
3013        umlal2          v5.8h,  v17.16b, v3.16b
3014        umlal           v6.8h,  v18.8b,  v3.8b
3015        umlal2          v7.8h,  v18.16b, v3.16b
3016        subs            \h,  \h,  #2
3017.ifc \type, put
3018        uqrshrn         v4.8b,  v4.8h,  #4       // /16 with rounding
3019        uqrshrn2        v4.16b, v5.8h,  #4
3020        uqrshrn         v6.8b,  v6.8h,  #4
3021        uqrshrn2        v6.16b, v7.8h,  #4
3022        st1             {v4.16b}, [\dst], \d_strd
3023        st1             {v6.16b}, [\ds2], \d_strd
3024.else
3025        st1             {v4.8h, v5.8h}, [\dst], \d_strd  // prep: 16-bit output
3026        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
3027.endif
3028        b.le            9f
3029        mov             v16.16b, v18.16b         // carry last row forward
3030        b               2b
30319:
        // Next 16-column strip: halve the doubled strides and rewind.
3032        subs            \w,  \w,  #16
3033        b.le            0f
3034        asr             \s_strd, \s_strd, #1
3035        asr             \d_strd, \d_strd, #1
3036        msub            \src, \s_strd, \xmy, \src
3037        msub            \dst, \d_strd, \xmy, \dst
3038        sub             \src, \src, \s_strd, lsl #1
3039        mov             \h,  \my
3040        add             \src, \src, #16
3041.ifc \type, put
3042        add             \dst, \dst, #16
3043.else
3044        add             \dst, \dst, #32          // prep output is 2 bytes/px
3045.endif
3046        b               1b
30470:
3048        ret
3049endfunc
3050
// Jump table for the vertical bilin path, indexed by log2 block width
// (descending: 128..2); entries are offsets relative to the table base.
3051jumptable \type\()_bilin_v_tbl
3052        .word 1280b - \type\()_bilin_v_tbl
3053        .word 640b  - \type\()_bilin_v_tbl
3054        .word 320b  - \type\()_bilin_v_tbl
3055        .word 160b  - \type\()_bilin_v_tbl
3056        .word 80b   - \type\()_bilin_v_tbl
3057        .word 40b   - \type\()_bilin_v_tbl
3058        .word 20b   - \type\()_bilin_v_tbl
3059endjumptable
3060
// Horizontal+vertical bilin.  The vertical weights are widened to 16 bit
// (v2/v3) because the vertical pass multiplies 16-bit horizontal
// intermediates.  Then the 2xN case (put only): one horizontal result
// row is kept in v16 and blended with the next (v17); put narrows with
// uqrshrn #8 (two passes, each scaling by 16 => /256 total).
3061function L(\type\()_bilin_hv)
3062        uxtl            v2.8h, v2.8b
3063        uxtl            v3.8h, v3.8b
3064        movrel          x9,  \type\()_bilin_hv_tbl
3065        ldrsw           x8,  [x9, x8, lsl #2]
3066        add             x9,  x9,  x8
3067        br              x9
3068
306920:     // 2xN hv
3070        AARCH64_VALID_JUMP_TARGET
3071.ifc \type, put
3072        add             \sr2, \src, \s_strd
3073        add             \ds2, \dst, \d_strd
3074        lsl             \s_strd, \s_strd, #1
3075        lsl             \d_strd, \d_strd, #1
3076
        // Prime v16 with the horizontally filtered first row.
3077        ld1r            {v28.4s},  [\src], \s_strd
3078        ext             v29.8b, v28.8b, v28.8b, #1
3079        umull           v16.8h, v28.8b, v0.8b
3080        umlal           v16.8h, v29.8b, v1.8b
3081
30822:
3083        ld1r            {v28.4s},  [\sr2], \s_strd
3084        ld1r            {v30.4s},  [\src], \s_strd
3085        ext             v29.8b, v28.8b, v28.8b, #1     // right neighbors
3086        ext             v31.8b, v30.8b, v30.8b, #1
3087        trn1            v28.4h, v28.4h, v30.4h         // pack both rows
3088        trn1            v29.4h, v29.4h, v31.4h
3089        umull           v17.8h, v28.8b, v0.8b          // horizontal pass
3090        umlal           v17.8h, v29.8b, v1.8b
3091
3092        trn1            v16.2s, v16.2s, v17.2s         // rows n/n+1 side by side
3093
3094        mul             v4.4h,  v16.4h, v2.4h          // vertical pass
3095        mla             v4.4h,  v17.4h, v3.4h
3096        uqrshrn         v4.8b,  v4.8h,  #8             // /256 with rounding
3097        subs            \h,  \h,  #2
3098        st1             {v4.h}[0], [\dst], \d_strd
3099        st1             {v4.h}[1], [\ds2], \d_strd
3100        b.le            0f
3101        trn2            v16.2s, v17.2s, v17.2s         // carry last h-row forward
3102        b               2b
31030:
3104        ret
3105.endif
3106
// 4xN hv bilin: same two-pass scheme as 2xN, two output rows per
// iteration packed into one vector.  put narrows with uqrshrn #8
// (/256); prep keeps 16-bit intermediates, shifted down by 4.
310740:     // 4xN hv
3108        AARCH64_VALID_JUMP_TARGET
3109        add             \sr2, \src, \s_strd
3110        add             \ds2, \dst, \d_strd
3111        lsl             \s_strd, \s_strd, #1
3112        lsl             \d_strd, \d_strd, #1
3113
        // Prime v16 with the horizontally filtered first row.
3114        ld1             {v28.8b},  [\src], \s_strd
3115        ext             v29.8b, v28.8b, v28.8b, #1
3116        umull           v16.8h, v28.8b, v0.8b
3117        umlal           v16.8h, v29.8b, v1.8b
3118
31194:
3120        ld1             {v28.8b},  [\sr2], \s_strd
3121        ld1             {v30.8b},  [\src], \s_strd
3122        ext             v29.8b, v28.8b, v28.8b, #1     // right neighbors
3123        ext             v31.8b, v30.8b, v30.8b, #1
3124        trn1            v28.2s, v28.2s, v30.2s         // pack both rows
3125        trn1            v29.2s, v29.2s, v31.2s
3126        umull           v17.8h, v28.8b, v0.8b          // horizontal pass
3127        umlal           v17.8h, v29.8b, v1.8b
3128
3129        trn1            v16.2d, v16.2d, v17.2d         // rows n/n+1 side by side
3130
3131        mul             v4.8h,  v16.8h, v2.8h          // vertical pass
3132        mla             v4.8h,  v17.8h, v3.8h
3133        subs            \h,  \h,  #2
3134.ifc \type, put
3135        uqrshrn         v4.8b,  v4.8h,  #8             // /256 with rounding
3136        st1             {v4.s}[0], [\dst], \d_strd
3137        st1             {v4.s}[1], [\ds2], \d_strd
3138.else
3139        urshr           v4.8h,  v4.8h,  #4             // prep: keep 16-bit, /16
3140        st1             {v4.8b},   [\dst], \d_strd
3141        st1             {v4.d}[1], [\ds2], \d_strd
3142.endif
3143        b.le            0f
3144        trn2            v16.2d, v17.2d, v17.2d         // carry last h-row forward
3145        b               4b
31460:
3147        ret
3148
80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        // Wider blocks are processed in 8-pixel-wide vertical strips;
        // back up the row count so it can be restored per strip.
        mov             \my,  \h

1:
        // Per-strip setup: two row pointers one line apart, strides
        // doubled since each loop iteration emits two rows.
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime the vertical filter: horizontal bilin filter of the
        // first row (coefficients in v0/v1).
        ld1             {v28.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        // Horizontally filter two more rows into v17 and v18.
        ld1             {v28.16b},  [\sr2], \s_strd
        ld1             {v30.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        ext             v31.16b, v30.16b, v30.16b, #1
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b
        umull           v18.8h, v30.8b, v0.8b
        umlal           v18.8h, v31.8b, v1.8b

        // Vertical bilin filter (coefficients in v2/v3): two output
        // rows from three horizontally filtered rows.
        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        mul             v5.8h,  v17.8h, v2.8h
        mla             v5.8h,  v18.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        // put: round/narrow to 8-bit pixels.
        uqrshrn         v4.8b,  v4.8h,  #8
        uqrshrn         v5.8b,  v5.8h,  #8
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        // prep: keep 16-bit intermediates.
        urshr           v4.8h,  v4.8h,  #4
        urshr           v5.8h,  v5.8h,  #4
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        // Strip done; advance to the next 8 columns, if any remain.
        subs            \w,  \w,  #8
        b.le            0f
        // Undo the stride doubling, then rewind src/dst to the top of
        // the block: dst advanced h rows, src h+2 rows (priming load
        // plus the doubled-stride loop), hence the extra 2-row sub.
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #1
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        // prep output is 16-bit, so stepping 8 pixels = 16 bytes.
        add             \dst,  \dst,  #16
.endif
        b               1b
0:
        ret
3215endfunc
3216
// Width-dispatch table for the bilin hv paths: each entry is the
// offset of a width handler relative to the table base, ordered from
// the widest (1280) down to the narrowest (20 = 2xN) case.
jumptable \type\()_bilin_hv_tbl
        .word 1280b - \type\()_bilin_hv_tbl
        .word 640b  - \type\()_bilin_hv_tbl
        .word 320b  - \type\()_bilin_hv_tbl
        .word 160b  - \type\()_bilin_hv_tbl
        .word 80b   - \type\()_bilin_hv_tbl
        .word 40b   - \type\()_bilin_hv_tbl
        .word 20b   - \type\()_bilin_hv_tbl
endjumptable
3226.endm
3227
// Instantiate the put functions. Filter combinations containing a
// SHARP (true 8-tap) component use the 8tap core; the remaining
// regular/smooth combinations use the cheaper 6tap core. The register
// operands bind the argument/scratch registers used inside filter_fn;
// the numeric operand (10 vs 6 below) is a shift constant that differs
// between put and prep.
make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap

make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10

// Instantiate the prep functions: same pattern, but with the register
// assignment shifted to match the prep() argument order (no dst
// stride argument) and a different shift constant.
make_8tap_fn    prep, regular_sharp,  REGULAR, SHARP,   8tap
make_8tap_fn    prep, smooth_sharp,   SMOOTH,  SHARP,   8tap
make_8tap_fn    prep, sharp,          SHARP,   SHARP,   8tap
make_8tap_fn    prep, sharp_regular,  SHARP,   REGULAR, 8tap
make_8tap_fn    prep, sharp_smooth,   SHARP,   SMOOTH,  8tap
filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  8tap

make_8tap_fn    prep, regular,        REGULAR, REGULAR, 6tap
make_8tap_fn    prep, regular_smooth, REGULAR, SMOOTH,  6tap
make_8tap_fn    prep, smooth,         SMOOTH,  SMOOTH,  6tap
make_8tap_fn    prep, smooth_regular, SMOOTH,  REGULAR, 6tap
filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  6tap
filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
3255
3256
// Load one 8-byte filter row for the warp filter:
//   \dst = mc_warp_filter_table[\src >> 10], then \src += \inc.
// x11 must point at the (offset) filter table; clobbers w13. The
// position update is placed between the asr and the ldr to hide the
// load's address-generation dependency.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        add             \src, \src, \inc
        ldr             \dst, [x11, w13, sxtw #3]
.endm
3262
// Horizontal 8-tap filtering of one source row for warp_affine.
// In:   x2  = src pointer (advanced by x3, the src stride, on return)
//       w5  = horizontal filter position (advanced by w8 on return)
//       w7  = per-pixel filter position step
//       x11 = warp filter table pointer (set up by the caller)
//       v22 = 0x80 splat, used to bias pixels into signed range
// Out:  v0.8h = 8 horizontally filtered and summed results
// Clobbers: w12, w13, v1-v7, v16-v20.
function warp_filter_horz_neon
        add             w12, w5,  #512         // rounding offset for the >>10 in load_filter_row

        ld1             {v16.8b, v17.8b}, [x2], x3

        // Fetch the 8 per-output-pixel filter rows into d0-d7.
        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        load_filter_row d3, w12, w7
        load_filter_row d4, w12, w7
        load_filter_row d5, w12, w7
        load_filter_row d6, w12, w7
        // subtract by 128 to allow using smull
        eor             v16.8b,  v16.8b,  v22.8b
        eor             v17.8b,  v17.8b,  v22.8b
        load_filter_row d7, w12, w7

        // Multiply each filter row with its 8-pixel source window;
        // ext selects the shifted window for output pixels 1-7.
        ext             v18.8b,  v16.8b,  v17.8b,  #1
        ext             v19.8b,  v16.8b,  v17.8b,  #2
        smull           v0.8h,   v0.8b,   v16.8b
        smull           v1.8h,   v1.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #3
        ext             v20.8b,  v16.8b,  v17.8b,  #4
        smull           v2.8h,   v2.8b,   v19.8b
        smull           v3.8h,   v3.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #5
        ext             v19.8b,  v16.8b,  v17.8b,  #6
        smull           v4.8h,   v4.8b,   v20.8b
        smull           v5.8h,   v5.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #7
        smull           v6.8h,   v6.8b,   v19.8b
        smull           v7.8h,   v7.8b,   v18.8b

        // Pairwise-add reduction tree: collapse the eight 8-element
        // product vectors down to one sum per output pixel in v0.8h.
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h

        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h

        addp            v0.8h,   v0.8h,   v4.8h

        add             w5,  w5,  w8           // advance the filter position for the next row

        ret
endfunc
3310
// void dav1d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my)
// \t:     empty for the pixel-output variant; "t" for the int16_t
//         intermediate-output variant (warp_affine_8x8t).
// \shift: final narrowing shift applied to the 32-bit accumulators.
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        // Unpack the four packed int16 coefficients from abcd[]:
        // x7/x8 step the horizontal position per pixel/row, x9/x4 the
        // vertical position per pixel/row.
        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8                // 8 output rows
        // Step src back 3 rows and 3 columns for the 8-tap support.
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #3
        movrel          x11, X(mc_warp_filter), 64*8  // table base, biased by 64 rows of 8 bytes
        mov             x15, x30               // save return addr across the bl calls below
.ifnb \t
        lsl             x1,  x1,  #1           // int16_t output: convert stride to bytes
.endif

        movi            v22.8b,  #128          // sign-flip bias for smull (see horz filter)
.ifb \t
        movi            v23.8h,  #128          // output bias, pixel variant
.else
        movi            v23.8h,  #8, lsl #8    // output bias, int16_t variant (0x800)
.endif

        // Prime the vertical filter with 7 horizontally filtered rows
        // in v24-v30.
        bl              warp_filter_horz_neon
        srshr           v24.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v25.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v26.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v27.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v28.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v29.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v30.8h,  v0.8h,  #3

1:
        // Fetch the 8th row (v31), then the 8 vertical filter columns
        // for this output row (position w14 = my + rounding offset).
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        srshr           v31.8h,  v0.8h,  #3

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        // Shift the 7-row history window down (v24 <- v25 <- ... <-
        // v31), interleaved with narrowing/biasing the result.
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h,  v16.4s,  #\shift
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h,  v17.4s,  #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
        add             v16.8h,  v16.8h,  v23.8h
.ifb \t
        sqxtun          v16.8b,  v16.8h        // pixel variant: narrow to unsigned 8-bit
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
.ifnb \t
        st1             {v16.8h}, [x0], x1
.else
        st1             {v16.8b}, [x0], x1
.endif

        add             w6,  w6,  w4           // advance the vertical filter position per row
        b.gt            1b

        ret             x15
endfunc
.endm
3414
// Instantiate both warp variants: pixel output (shift 11) and
// int16_t intermediate output (shift 7).
warp  , 11
warp t, 7
3417
// void dav1d_emu_edge_8bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
// Builds a bw x bh block in dst by copying the overlapping part of
// ref (iw x ih, accessed at offset (x, y)) and replicating its edge
// pixels into the out-of-bounds regions.
// Args: x0=bw, x1=bh, x2=iw, x3=ih, x4=x, x5=y, x6=dst, x7=dst_stride;
// ref and ref_stride are passed on the stack.
function emu_edge_8bpc_neon, export=1
        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        // The "bic x, x, x, asr #63" idiom clamps negatives to zero.
        sub             x12, x3,  #1           // ih - 1
        cmp             x5,  x3
        sub             x13, x2,  #1           // iw - 1
        csel            x12, x12, x5,  ge      // min(y, ih - 1)
        cmp             x4,  x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4,  ge      // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
        add             x8,  x8,  x13          // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5,  x1           // y + bh
        neg             x5,  x5                // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1,  #1           // bh - 1
        cmp             x10, x1
        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
        cmp             x5,  x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4,  x0           // x + bw
        neg             x4,  x4                // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0,  #1           // bw - 1
        cmp             x11, x0
        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
        cmp             x4,  x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1,  x1,  x5           // bh - top_ext
        madd            x6,  x5,  x7,  x6
        sub             x2,  x0,  x4           // bw - left_ext
        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst

// Copy the center_h middle rows: optionally splat the leftmost ref
// pixel over left_ext columns, copy center_w pixels from ref, and
// optionally splat the rightmost pixel over right_ext columns. The
// 16/32-byte stores round the widths up; the destination block is
// presumably sized/padded to absorb the overshoot (NOTE(review):
// relies on the caller's buffer layout — confirm against callers).
.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.16b}, [x8]         // splat leftmost in-bounds pixel
        mov             x12, x6                // out = dst
        mov             x3,  x4
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif
        mov             x13, x8
        add             x12, x6,  x4           // out = dst + left_ext
        mov             x3,  x2
1:
        ld1             {v0.16b, v1.16b}, [x13], #32
        subs            x3,  x3,  #32
        st1             {v0.16b, v1.16b}, [x12], #32
        b.gt            1b
.if \need_right
        add             x3,  x8,  x2           // in + center_w
        sub             x3,  x3,  #1           // in + center_w - 1
        add             x12, x6,  x4           // dst + left_ext
        ld1r            {v0.16b}, [x3]         // splat rightmost in-bounds pixel
        add             x12, x12, x2           // out = dst + left_ext + center_w
        mov             x3,  x11
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif

        subs            x1,  x1,  #1           // center_h--
        add             x6,  x6,  x7
        add             x8,  x8,  x9
        b.gt            0b
.endm

        // Dispatch to the v_loop variant matching which horizontal
        // extensions are actually needed.
        cbz             x4,  2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1,   1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0,   1
        b               5f

3:
        // need_left + !need_right
        v_loop          1,   0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0,   0

5:

        cbz             x10, 3f
        // need_bottom: replicate the last written row downwards
        // bottom_ext times, 32 columns per outer iteration.
        sub             x8,  x6,  x7           // ref = dst - stride
        mov             x4,  x0
1:
        ld1             {v0.16b, v1.16b}, [x8], #32
        mov             x3,  x10
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
        subs            x4,  x4,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        cbz             x5,  3f
        // need_top: replicate the first written row (at x14) upwards
        // top_ext times, 32 columns per outer iteration.
        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.16b, v1.16b}, [x14], #32
        mov             x3,  x5
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
        subs            x0,  x0,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        ret
endfunc
3575