/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 8192

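// The bidir helpers below operate on prep()'d intermediates: pixels shifted
// left by intermediate_bits (4 for 10 bpc, 2 for 12 bpc) with PREP_BIAS
// subtracted. As a rough C model of what the avg macro computes (a hedged
// sketch, not the exact reference source; iclip_pixel() clamps to
// [0, bitdepth_max]):
//
//   int intermediate_bits = clz(bitdepth_max) - 18;
//   int sh  = intermediate_bits + 1;
//   int rnd = (1 << intermediate_bits) + PREP_BIAS * 2;
//   dst[x]  = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh);
//
// The NEON version folds the bias and the rounding term into one constant,
// v28 = -2*PREP_BIAS - (1 << intermediate_bits): sqadd saturates the sum,
// smax clamps it so the final result cannot go below 0, sqsub adds the
// bias+rounding (by subtracting the negative constant) with saturation at
// INT16_MAX, and the arithmetic shift of that saturated value doubles as
// the clip to bitdepth_max.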
.macro avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
.endm

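// Weighted average. A rough C model (hedged sketch; the weight is in
// [0, 16]):
//
//   int sh  = intermediate_bits + 4;
//   int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
//   dst[x]  = iclip_pixel((tmp1[x] * weight + tmp2[x] * (16 - weight) + rnd) >> sh);
//
// The macro instead evaluates the equivalent form
//   tmp2 + (((tmp2 - tmp1) * -weight) >> 4)
// with v27 preloaded to -weight, then applies the rounding shift, re-adds
// the prep bias and clamps to [0, bitdepth_max] (v30/v31).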
.macro w_avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        // This difference requires a 17-bit range, and all bits are
        // significant for the following multiplication.
        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
        mul             \d0\().4s,  \d0\().4s,  v27.4s
        mul             \t0\().4s,  \t0\().4s,  v27.4s
        mul             \d1\().4s,  \d1\().4s,  v27.4s
        mul             \t1\().4s,  \t1\().4s,  v27.4s
        sshr            \d0\().4s,  \d0\().4s,  #4
        sshr            \t0\().4s,  \t0\().4s,  #4
        sshr            \d1\().4s,  \d1\().4s,  #4
        sshr            \t1\().4s,  \t1\().4s,  #4
        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h // Same as xtn, xtn2
        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h // Ditto
        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
.endm

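// Per-pixel masked blend of the two intermediates. A rough C model (hedged
// sketch; m is a 6-bit weight in [0, 64] loaded from the mask buffer at x6):
//
//   int sh  = intermediate_bits + 6;
//   int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
//   dst[x]  = iclip_pixel((tmp1[x] * m + tmp2[x] * (64 - m) + rnd) >> sh);
//
// As in w_avg, this is computed as tmp2 + (((tmp2 - tmp1) * -m) >> 6),
// which is why the mask is negated and sign-extended up to 32 bits first.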
.macro mask d0, d1, t0, t1, t2, t3
        ld1             {v27.16b}, [x6],  16
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        neg             v27.16b, v27.16b
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        sxtl            v26.8h,  v27.8b
        sxtl2           v27.8h,  v27.16b
        sxtl            v24.4s,  v26.4h
        sxtl2           v25.4s,  v26.8h
        sxtl            v26.4s,  v27.4h
        sxtl2           v27.4s,  v27.8h
        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
        mul             \d0\().4s,  \d0\().4s,  v24.4s
        mul             \t0\().4s,  \t0\().4s,  v25.4s
        mul             \d1\().4s,  \d1\().4s,  v26.4s
        mul             \t1\().4s,  \t1\().4s,  v27.4s
        sshr            \d0\().4s,  \d0\().4s,  #6
        sshr            \t0\().4s,  \t0\().4s,  #6
        sshr            \d1\().4s,  \d1\().4s,  #6
        sshr            \t1\().4s,  \t1\().4s,  #6
        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h  // Same as xtn, xtn2
        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h  // Ditto
        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
.endm

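// Shared entry point for avg/w_avg/mask. Width dispatch uses a table of
// offsets indexed by clz(w) - 24: clz() of the 32-bit width gives 24 for
// w == 128 down to 29 for w == 4, so the jump table is laid out from the
// widest case to the narrowest. Each \type macro invocation produces two
// 8-pixel output vectors per call.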
.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz             w4,  w4
.ifnc \type, avg
        dup             v31.8h,  \bdmax // bitdepth_max
        movi            v30.8h,  #0
.endif
        clz             w7,  \bdmax
        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov             w9,  #1
        mov             w8,  #-2*PREP_BIAS
        lsl             w9,  w9,  w7    // 1 << intermediate_bits
        add             w7,  w7,  #1
        sub             w8,  w8,  w9    // -2*PREP_BIAS - (1 << intermediate_bits)
        neg             w7,  w7         // -(intermediate_bits+1)
        dup             v28.8h,   w8    // -2*PREP_BIAS - (1 << intermediate_bits)
        dup             v29.8h,   w7    // -(intermediate_bits+1)
.else
        mov             w8,  #PREP_BIAS
        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
        neg             w7,  w7         // -intermediate_bits
        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
        dup             v29.8h,  w7     // -intermediate_bits
.endif
.ifc \type, w_avg
        dup             v27.4s,  w6
        neg             v27.4s,  v27.4s
.endif
        movrel          x7,  \type\()_tbl
        sub             w4,  w4,  #24
        \type           v4,  v5,  v0,  v1,  v2,  v3
        ldrsw           x4,  [x7, x4, lsl #2]
        add             x7,  x7,  x4
        br              x7
40:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        subs            w5,  w5,  #4
        st1             {v4.8b},    [x0], x1
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.8b},    [x0], x1
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               4b
80:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.8h},  [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h},  [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               8b
160:
        AARCH64_VALID_JUMP_TARGET
16:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               16b
320:
        AARCH64_VALID_JUMP_TARGET
32:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               32b
640:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  #64
64:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               64b
1280:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  #64
        mov             x8,  #128
        sub             x1,  x1,  #128
128:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
        \type           v4,  v5,  v0,  v1,  v2,  v3
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               128b
0:
        ret
endfunc

jumptable \type\()_tbl
        .word 1280b - \type\()_tbl
        .word 640b  - \type\()_tbl
        .word 320b  - \type\()_tbl
        .word 160b  - \type\()_tbl
        .word 80b   - \type\()_tbl
        .word 40b   - \type\()_tbl
endjumptable
.endm

bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7


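// w_mask blends the two intermediates with a mask it derives itself from
// the sample difference, and also writes that mask out (at full, 422 or 420
// resolution). A rough C model of the per-pixel math (hedged sketch):
//
//   int sh  = intermediate_bits + 6;
//   int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
//   int m   = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
//   dst[x]  = iclip_pixel((tmp1[x] * m + tmp2[x] * (64 - m) + rnd) >> sh);
//
// mask_sh = bitdepth + intermediate_bits - 4 is 10 for both 10 and 12 bpc,
// and mask_rnd = 1 << (mask_sh - 5) = 32, which is why the constant
// 27615 = ((64 + 1 - 38) << mask_sh) - 1 - mask_rnd and the #10 shift can
// be hardcoded: 64 - m = (27615 - abs()) >> 10, saturated at 0 by uqsub.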
.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        ldr             w8,  [sp]
        clz             w9,  w4
        movrel          x10, w_mask_\type\()_tbl
        dup             v31.8h,  w8   // bitdepth_max
        sub             w9,  w9,  #24
        clz             w8,  w8       // clz(bitdepth_max)
        ldrsw           x9,  [x10,  x9,  lsl #2]
        add             x10, x10, x9
        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov             w9,  #PREP_BIAS*64
        neg             w8,  w8       // -sh
        mov             w11, #27615   // ((64 + 1 - 38) << mask_sh) - 1 - mask_rnd
        dup             v30.4s,  w9   // PREP_BIAS*64
        dup             v29.4s,  w8   // -sh
        dup             v0.8h,   w11
.if \type == 444
        movi            v1.16b,  #64
.elseif \type == 422
        dup             v2.8b,   w7
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b
.elseif \type == 420
        dup             v2.8h,   w7
        movi            v3.8h,   #1, lsl #8
        sub             v3.8h,   v3.8h,   v2.8h
.endif
        add             x12,  x0,  x1
        lsl             x1,   x1,  #1
        br              x10
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v7.8h
        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bits)
        ssubl2          v17.4s,  v6.8h,   v4.8h
        ssubl           v18.4s,  v7.4h,   v5.4h
        ssubl2          v19.4s,  v7.8h,   v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
        sshll           v6.4s,   v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        uxtl            v22.4s,  v20.4h
        uxtl2           v23.4s,  v20.8h
        uxtl            v24.4s,  v21.4h
        uxtl2           v25.4s,  v21.8h
        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s,   v17.4s,  v23.4s
        mla             v6.4s,   v18.4s,  v24.4s
        mla             v7.4s,   v19.4s,  v25.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v6.4s
        sqxtun2         v5.8h,   v7.4s
        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        sub             v20.16b, v1.16b,  v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b,  v20.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        trn1            v24.2d,  v20.2d,  v21.2d
        trn2            v25.2d,  v20.2d,  v21.2d
        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        str             s20,        [x6],  #4
.endif
        st1             {v4.8b},    [x0],  x1
        st1             {v4.d}[1],  [x12], x1
        st1             {v5.8b},    [x0],  x1
        st1             {v5.d}[1],  [x12], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
        subs            w5,  w5,  #2
        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v7.8h
        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bits)
        ssubl2          v17.4s,  v6.8h,   v4.8h
        ssubl           v18.4s,  v7.4h,   v5.4h
        ssubl2          v19.4s,  v7.8h,   v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
        sshll           v6.4s,   v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        uxtl            v22.4s,  v20.4h
        uxtl2           v23.4s,  v20.8h
        uxtl            v24.4s,  v21.4h
        uxtl2           v25.4s,  v21.8h
        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s,   v17.4s,  v23.4s
        mla             v6.4s,   v18.4s,  v24.4s
        mla             v7.4s,   v19.4s,  v25.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v6.4s
        sqxtun2         v5.8h,   v7.4s
        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        sub             v20.16b, v1.16b,  v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b,  v20.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        str             s20,     [x6],  #4
.endif
        st1             {v4.8h}, [x0],  x1
        st1             {v5.8h}, [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:
        AARCH64_VALID_JUMP_TARGET
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw #1
.if \type == 444
        add             x10, x6,  w4,  uxtw
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1
        add             x7,  x2,  w4,  uxtw #1
161:
        mov             w8,  w4
16:
        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
        ld1             {v6.8h,   v7.8h},  [x7], #32
        ld1             {v18.8h,  v19.8h}, [x9], #32
        subs            w8,  w8,  #16
        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v17.8h
        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bits)
        ssubl2          v23.4s,  v16.8h,  v4.8h
        ssubl           v24.4s,  v17.4h,  v5.4h
        ssubl2          v25.4s,  v17.8h,  v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
        sshll           v26.4s,  v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v26.4s,  v26.4s,  v30.4s
        add             v27.4s,  v27.4s,  v30.4s
        uxtl            v16.4s,  v20.4h
        uxtl2           v17.4s,  v20.8h
        uxtl            v28.4s,  v21.4h
        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
        uxtl2           v16.4s,  v21.8h
        mla             v5.4s,   v23.4s,  v17.4s
        mla             v26.4s,  v24.4s,  v28.4s
        mla             v27.4s,  v25.4s,  v16.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v26.4s,  v26.4s,  v29.4s
        srshl           v27.4s,  v27.4s,  v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v26.4s
        sqxtun2         v5.8h,   v27.4s

        // Start of other half
        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
        sabd            v23.8h,  v7.8h,   v19.8h

        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h

        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bits)
        ssubl2          v17.4s,  v18.8h,  v6.8h
        ssubl           v18.4s,  v19.4h,  v7.4h
        ssubl2          v19.4s,  v19.8h,  v7.8h
        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
        uqsub           v23.8h,  v0.8h,   v23.8h
        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
        sshll2          v25.4s,  v6.8h,   #6
        sshll           v26.4s,  v7.4h,   #6
        sshll2          v27.4s,  v7.8h,   #6
        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v23.8h,  v23.8h,  #10
        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
        add             v25.4s,  v25.4s,  v30.4s
        add             v26.4s,  v26.4s,  v30.4s
        add             v27.4s,  v27.4s,  v30.4s
        uxtl            v6.4s,   v22.4h
        uxtl2           v7.4s,   v22.8h
        uxtl            v28.4s,  v23.4h
        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
        uxtl2           v6.4s,   v23.8h
        mla             v25.4s,  v17.4s,  v7.4s
        mla             v26.4s,  v18.4s,  v28.4s
        mla             v27.4s,  v19.4s,  v6.4s
        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v25.4s,  v25.4s,  v29.4s
        srshl           v26.4s,  v26.4s,  v29.4s
        srshl           v27.4s,  v27.4s,  v29.4s
        sqxtun          v6.4h,   v24.4s           // iclip_pixel
        sqxtun2         v6.8h,   v25.4s
        sqxtun          v7.4h,   v26.4s
        sqxtun2         v7.8h,   v27.4s
        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
        umin            v7.8h,   v7.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        uzp1            v21.16b, v22.16b, v23.16b
        sub             v20.16b, v1.16b,  v20.16b // m
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b}, [x6],  #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v4.8h, v5.8h}, [x0],  #32
        st1             {v6.8h, v7.8h}, [x12], #32
        b.gt            16b
        subs            w5,  w5,  #2
        add             x2,  x2,  w4,  uxtw #1
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
endfunc

jumptable w_mask_\type\()_tbl
        .word 1280b - w_mask_\type\()_tbl
        .word 640b  - w_mask_\type\()_tbl
        .word 320b  - w_mask_\type\()_tbl
        .word 160b  - w_mask_\type\()_tbl
        .word 80b   - w_mask_\type\()_tbl
        .word 40b   - w_mask_\type\()_tbl
endjumptable
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


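// blend operates on pixels (not prep()'d intermediates). A rough C model
// (hedged sketch; m is the per-pixel mask in [0, 64]):
//
//   dst[x] = (dst[x] * (64 - m) + tmp[x] * m + 32) >> 6;
//
// The NEON version evaluates this as a + (((a - b) * -m + 32) >> 6) using
// sqrdmulh with -m << 9: sqrdmulh computes (2*x*y + (1 << 15)) >> 16, and
// with y = -m << 9 that is exactly (x*-m + 32) >> 6 for the value ranges
// involved here.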
function blend_16bpc_neon, export=1
        movrel          x6,  blend_tbl
        clz             w3,  w3
        sub             w3,  w3,  #26
        ldrsw           x3,  [x6,  x3,  lsl #2]
        add             x6,  x6,  x3
        add             x8,  x0,  x1
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
4:
        ld1             {v2.8b},   [x5], #8
        ld1             {v1.8h},   [x2], #16
        ldr             d0,        [x0]
        neg             v2.8b,   v2.8b            // -m
        subs            w4,  w4,  #2
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
8:
        ld1             {v4.16b},       [x5], #16
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v5.16b,  v4.16b           // -m
        ld1             {v0.8h},   [x0]
        ld1             {v1.8h},   [x8]
        sxtl            v4.8h,   v5.8b
        sxtl2           v5.8h,   v5.16b
        shl             v4.8h,   v4.8h,   #9      // -m << 9
        shl             v5.8h,   v5.8h,   #9
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        subs            w4,  w4,  #2
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v5.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
16:
        ld1             {v16.16b, v17.16b},           [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #2
        neg             v18.16b, v16.16b          // -m
        neg             v19.16b, v17.16b
        ld1             {v0.8h, v1.8h}, [x0]
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        ld1             {v2.8h, v3.8h}, [x8]
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        shl             v18.8h,  v18.8h,  #9
        shl             v19.8h,  v19.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v17.8h
        sqrdmulh        v6.8h,   v6.8h,   v18.8h
        sqrdmulh        v7.8h,   v7.8h,   v19.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ld1             {v16.16b, v17.16b},           [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #1
        neg             v18.16b, v16.16b          // -m
        neg             v19.16b, v17.16b
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        shl             v18.8h,  v18.8h,  #9
        shl             v19.8h,  v19.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v17.8h
        sqrdmulh        v6.8h,   v6.8h,   v18.8h
        sqrdmulh        v7.8h,   v7.8h,   v19.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            32b
        ret
endfunc

jumptable blend_tbl
        .word 320b - blend_tbl
        .word 160b - blend_tbl
        .word 80b  - blend_tbl
        .word 40b  - blend_tbl
endjumptable

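// blend_h blends a row-wise wedge: every row uses a single mask value from
// the obmc_masks table, indexed by block height, and only roughly the top
// three quarters of the rows are blended. The sub below computes
// h - (h >> 2) rows; the table's trailing entries are zero, so any extra
// row processed beyond the C reference's (h*3) >> 2 blends to a no-op.
// Per pixel this is the same blend (and the same sqrdmulh trick) as above.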
function blend_h_16bpc_neon, export=1
        movrel          x6,  blend_h_tbl
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw
        sub             w4,  w4,  w4,  lsr #2
        clz             w7,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24
        ldrsw           x7,  [x6,  x7,  lsl #2]
        add             x6,  x6,  x7
        br              x6
20:
        AARCH64_VALID_JUMP_TARGET
2:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.4h},        [x2], #8
        ext             v2.8b,   v2.8b,   v3.8b,   #6
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
        ldr             s0,        [x0]
        ld1             {v0.s}[1], [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
        sub             v1.4h,   v0.4h,   v1.4h   // a - b
        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
        add             v0.4h,   v0.4h,   v1.4h
        st1             {v0.s}[0], [x0], x1
        st1             {v0.s}[1], [x8], x1
        b.gt            2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.8h},        [x2], #16
        ext             v2.8b,   v2.8b,   v3.8b,   #4
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
        ldr             d0,          [x0]
        ld1             {v0.d}[1],   [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld2r            {v4.8b, v5.8b}, [x5], #2
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v4.8b,   v4.8b            // -m
        neg             v5.8b,   v5.8b
        ld1             {v0.8h}, [x0]
        subs            w4,  w4,  #2
        sxtl            v4.8h,   v4.8b
        sxtl            v5.8h,   v5.8b
        ld1             {v1.8h}, [x8]
        shl             v4.8h,   v4.8h,   #9      // -m << 9
        shl             v5.8h,   v5.8h,   #9
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v5.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ld2r            {v16.8b, v17.8b}, [x5], #2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        neg             v16.8b,  v16.8b           // -m
        neg             v17.8b,  v17.8b
        ld1             {v0.8h, v1.8h},  [x0]
        ld1             {v2.8h, v3.8h},  [x8]
        subs            w4,  w4,  #2
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v16.8h
        sqrdmulh        v6.8h,   v6.8h,   v17.8h
        sqrdmulh        v7.8h,   v7.8h,   v17.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
1280:
640:
320:
        AARCH64_VALID_JUMP_TARGET
        sub             x1,  x1,  w3,  uxtw #1
        add             x7,  x2,  w3,  uxtw #1
321:
        ld2r            {v24.8b, v25.8b}, [x5], #2
        mov             w6,  w3
        neg             v24.8b,  v24.8b           // -m
        neg             v25.8b,  v25.8b
        sxtl            v24.8h,  v24.8b
        sxtl            v25.8h,  v25.8b
        shl             v24.8h,  v24.8h,  #9      // -m << 9
        shl             v25.8h,  v25.8h,  #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
        subs            w6,  w6,  #32
        sub             v16.8h,  v0.8h,   v16.8h  // a - b
        sub             v17.8h,  v1.8h,   v17.8h
        sub             v18.8h,  v2.8h,   v18.8h
        sub             v19.8h,  v3.8h,   v19.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h,  v17.8h,  v24.8h
        sqrdmulh        v18.8h,  v18.8h,  v24.8h
        sqrdmulh        v19.8h,  v19.8h,  v24.8h
        sub             v20.8h,  v4.8h,   v20.8h  // a - b
        sub             v21.8h,  v5.8h,   v21.8h
        sub             v22.8h,  v6.8h,   v22.8h
        sub             v23.8h,  v7.8h,   v23.8h
        add             v0.8h,   v0.8h,   v16.8h
        add             v1.8h,   v1.8h,   v17.8h
        add             v2.8h,   v2.8h,   v18.8h
        add             v3.8h,   v3.8h,   v19.8h
        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v21.8h,  v21.8h,  v25.8h
        sqrdmulh        v22.8h,  v22.8h,  v25.8h
        sqrdmulh        v23.8h,  v23.8h,  v25.8h
        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
        add             v4.8h,   v4.8h,   v20.8h
        add             v5.8h,   v5.8h,   v21.8h
        add             v6.8h,   v6.8h,   v22.8h
        add             v7.8h,   v7.8h,   v23.8h
        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3,  uxtw #1
        add             x7,  x7,  w3,  uxtw #1
        b.gt            321b
        ret
endfunc

jumptable blend_h_tbl
        .word 1280b - blend_h_tbl
        .word 640b  - blend_h_tbl
        .word 320b  - blend_h_tbl
        .word 160b  - blend_h_tbl
        .word 80b   - blend_h_tbl
        .word 40b   - blend_h_tbl
        .word 20b   - blend_h_tbl
endjumptable

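// blend_v is the column-wise counterpart: each column uses one mask value
// from obmc_masks (indexed by block width), and only the left three
// quarters of the columns are blended, which is why the stores below write
// partial vectors. The per-pixel blend and the sqrdmulh trick are the same
// as in blend/blend_h.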
function blend_v_16bpc_neon, export=1
        movrel          x6,  blend_v_tbl
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w3,  uxtw
        clz             w3,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w3,  w3,  #26
        ldrsw           x3,  [x6,  x3,  lsl #2]
        add             x6,  x6,  x3
        br              x6
20:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.8b}, [x5]
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
2:
        ldr             s1,  [x2],  #4
        ldr             h0,  [x0]
        subs            w4,  w4,  #2
        ld1             {v1.h}[1], [x2]
        ld1             {v0.h}[1], [x8]
        add             x2,  x2,  #4
        sub             v1.4h,   v0.4h,   v1.4h   // a - b
        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
        add             v0.4h,   v0.4h,   v1.4h
        st1             {v0.h}[0], [x0],  x1
        st1             {v0.h}[1], [x8],  x1
        b.gt            2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.2s}, [x5]
        sub             x1,  x1,  #4
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
4:
        ld1             {v1.8h},   [x2], #16
        ldr             d0,        [x0]
        ld1             {v0.d}[1], [x8]
        subs            w4,  w4,  #2
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        str             s0,        [x0], #4
        st1             {v0.s}[2], [x8], #4
        st1             {v0.h}[2], [x0], x1
        st1             {v0.h}[6], [x8], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8b}, [x5]
        sub             x1,  x1,  #8
        neg             v4.8b,   v4.8b            // -m
        sxtl            v4.8h,   v4.8b
        shl             v4.8h,   v4.8h,   #9      // -m << 9
8:
        ld1             {v2.8h, v3.8h}, [x2], #32
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v4.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        str             d0,        [x0], #8
        str             d1,        [x8], #8
        st1             {v0.s}[2], [x0], x1
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b}, [x5]
        sub             x1,  x1,  #16
        neg             v17.16b, v16.16b          // -m
        sxtl            v16.8h,  v17.8b
        sxtl2           v17.8h,  v17.16b
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.4h,  v17.4h,  #9
16:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w4,  w4,  #2
        ld1             {v2.8h, v3.8h}, [x8]
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.4h,   v1.4h,   v5.4h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.4h,   v3.4h,   v7.4h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.4h,   v5.4h,   v17.4h
        sqrdmulh        v6.8h,   v6.8h,   v16.8h
        sqrdmulh        v7.4h,   v7.4h,   v17.4h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.4h,   v1.4h,   v5.4h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.4h,   v3.4h,   v7.4h
        st1             {v0.8h}, [x0], #16
        st1             {v2.8h}, [x8], #16
        st1             {v1.4h}, [x0], x1
        st1             {v3.4h}, [x8], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v24.16b, v25.16b},  [x5]
        neg             v26.16b, v24.16b          // -m
        neg             v27.8b,  v25.8b
        sxtl            v24.8h,  v26.8b
        sxtl2           v25.8h,  v26.16b
        sxtl            v26.8h,  v27.8b
        shl             v24.8h,  v24.8h,  #9      // -m << 9
        shl             v25.8h,  v25.8h,  #9
        shl             v26.8h,  v26.8h,  #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v16.8h,  v0.8h,   v16.8h  // a - b
        sub             v17.8h,  v1.8h,   v17.8h
        sub             v18.8h,  v2.8h,   v18.8h
        sub             v20.8h,  v4.8h,   v20.8h
        sub             v21.8h,  v5.8h,   v21.8h
        sub             v22.8h,  v6.8h,   v22.8h
        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h,  v17.8h,  v25.8h
        sqrdmulh        v18.8h,  v18.8h,  v26.8h
        sqrdmulh        v20.8h,  v20.8h,  v24.8h
        sqrdmulh        v21.8h,  v21.8h,  v25.8h
        sqrdmulh        v22.8h,  v22.8h,  v26.8h
        add             v0.8h,   v0.8h,   v16.8h
        add             v1.8h,   v1.8h,   v17.8h
        add             v2.8h,   v2.8h,   v18.8h
        add             v4.8h,   v4.8h,   v20.8h
        add             v5.8h,   v5.8h,   v21.8h
        add             v6.8h,   v6.8h,   v22.8h
        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt            32b
        ret
endfunc

jumptable blend_v_tbl
        .word 320b - blend_v_tbl
        .word 160b - blend_v_tbl
        .word 80b  - blend_v_tbl
        .word 40b  - blend_v_tbl
        .word 20b  - blend_v_tbl
endjumptable


// This has the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
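// put is a plain copy. As a rough C model (hedged sketch):
//
//   for (int y = 0; y < h; y++, dst += dst_stride, src += src_stride)
//       memcpy(dst, src, w * sizeof(pixel));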
function put_16bpc_neon, export=1
        movrel          x10, put_16bpc_tbl
        ldrsw           x9, [x10, x9, lsl #2]
        add             x10, x10, x9
        br              x10

20:
        AARCH64_VALID_JUMP_TARGET
2:
        ld1r            {v0.4s},   [x2], x3
        ld1r            {v1.4s},   [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v0.4h}, [x2], x3
        ld1             {v1.4h}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.4h}, [x0], x1
        st1             {v1.4h}, [x0], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
8:
        ld1             {v0.8h}, [x2], x3
        ld1             {v1.8h}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
1280:
        AARCH64_VALID_JUMP_TARGET
128:
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        ldp             q16, q17, [x2, #128]
        stp             q6,  q7,  [x0, #96]
        ldp             q18, q19, [x2, #160]
        stp             q16, q17, [x0, #128]
        ldp             q20, q21, [x2, #192]
        stp             q18, q19, [x0, #160]
        ldp             q22, q23, [x2, #224]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret
endfunc

jumptable put_16bpc_tbl
        .word 1280b - put_16bpc_tbl
        .word 640b  - put_16bpc_tbl
        .word 320b  - put_16bpc_tbl
        .word 160b  - put_16bpc_tbl
        .word 80b   - put_16bpc_tbl
        .word 40b   - put_16bpc_tbl
        .word 20b   - put_16bpc_tbl
endjumptable


// This has the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
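// prep converts pixels to the intermediate format consumed by the bidir
// helpers above. A rough C model (hedged sketch):
//
//   tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;
//
// v31 holds intermediate_bits for the sshl, and v30 holds PREP_BIAS
// (materialized as #32, lsl #8 == 8192).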
1149function prep_16bpc_neon
1150        movrel          x10, prep_16bpc_tbl
1151        ldrsw           x9, [x10, x9, lsl #2]
1152        dup             v31.8h,  w7   // intermediate_bits
1153        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
1154        add             x10, x10, x9
1155        br              x10
1156
115740:
1158        AARCH64_VALID_JUMP_TARGET
1159        add             x9,  x1,  x2
1160        lsl             x2,  x2,  #1
11614:
1162        ld1             {v0.8b},   [x1], x2
1163        ld1             {v0.d}[1], [x9], x2
1164        subs            w4,  w4,  #2
1165        sshl            v0.8h,   v0.8h,   v31.8h
1166        sub             v0.8h,   v0.8h,   v30.8h
1167        st1             {v0.8h}, [x0], #16
1168        b.gt            4b
1169        ret
117080:
1171        AARCH64_VALID_JUMP_TARGET
1172        add             x9,  x1,  x2
1173        lsl             x2,  x2,  #1
11748:
1175        ld1             {v0.8h}, [x1], x2
1176        ld1             {v1.8h}, [x9], x2
1177        subs            w4,  w4,  #2
1178        sshl            v0.8h,   v0.8h,   v31.8h
1179        sshl            v1.8h,   v1.8h,   v31.8h
1180        sub             v0.8h,   v0.8h,   v30.8h
1181        sub             v1.8h,   v1.8h,   v30.8h
1182        st1             {v0.8h, v1.8h}, [x0], #32
1183        b.gt            8b
1184        ret
1185160:
1186        AARCH64_VALID_JUMP_TARGET
118716:
1188        ldp             q0,  q1,  [x1]
1189        add             x1,  x1,  x2
1190        sshl            v0.8h,   v0.8h,   v31.8h
1191        ldp             q2,  q3,  [x1]
1192        add             x1,  x1,  x2
1193        subs            w4,  w4,  #2
1194        sshl            v1.8h,   v1.8h,   v31.8h
1195        sshl            v2.8h,   v2.8h,   v31.8h
1196        sshl            v3.8h,   v3.8h,   v31.8h
1197        sub             v0.8h,   v0.8h,   v30.8h
1198        sub             v1.8h,   v1.8h,   v30.8h
1199        sub             v2.8h,   v2.8h,   v30.8h
1200        sub             v3.8h,   v3.8h,   v30.8h
1201        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1202        b.gt            16b
1203        ret
1204320:
1205        AARCH64_VALID_JUMP_TARGET
120632:
1207        ldp             q0,  q1,  [x1]
1208        sshl            v0.8h,   v0.8h,   v31.8h
1209        ldp             q2,  q3,  [x1, #32]
1210        add             x1,  x1,  x2
1211        sshl            v1.8h,   v1.8h,   v31.8h
1212        sshl            v2.8h,   v2.8h,   v31.8h
1213        sshl            v3.8h,   v3.8h,   v31.8h
1214        subs            w4,  w4,  #1
1215        sub             v0.8h,   v0.8h,   v30.8h
1216        sub             v1.8h,   v1.8h,   v30.8h
1217        sub             v2.8h,   v2.8h,   v30.8h
1218        sub             v3.8h,   v3.8h,   v30.8h
1219        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1220        b.gt            32b
1221        ret
1222640:
1223        AARCH64_VALID_JUMP_TARGET
122464:
1225        ldp             q0,  q1,  [x1]
1226        subs            w4,  w4,  #1
1227        sshl            v0.8h,   v0.8h,   v31.8h
1228        ldp             q2,  q3,  [x1, #32]
1229        sshl            v1.8h,   v1.8h,   v31.8h
1230        ldp             q4,  q5,  [x1, #64]
1231        sshl            v2.8h,   v2.8h,   v31.8h
1232        sshl            v3.8h,   v3.8h,   v31.8h
1233        ldp             q6,  q7,  [x1, #96]
1234        add             x1,  x1,  x2
1235        sshl            v4.8h,   v4.8h,   v31.8h
1236        sshl            v5.8h,   v5.8h,   v31.8h
1237        sshl            v6.8h,   v6.8h,   v31.8h
1238        sshl            v7.8h,   v7.8h,   v31.8h
1239        sub             v0.8h,   v0.8h,   v30.8h
1240        sub             v1.8h,   v1.8h,   v30.8h
1241        sub             v2.8h,   v2.8h,   v30.8h
1242        sub             v3.8h,   v3.8h,   v30.8h
1243        stp             q0,  q1,  [x0]
1244        sub             v4.8h,   v4.8h,   v30.8h
1245        sub             v5.8h,   v5.8h,   v30.8h
1246        stp             q2,  q3,  [x0, #32]
1247        sub             v6.8h,   v6.8h,   v30.8h
1248        sub             v7.8h,   v7.8h,   v30.8h
1249        stp             q4,  q5,  [x0, #64]
1250        stp             q6,  q7,  [x0, #96]
1251        add             x0,  x0,  x8
1252        b.gt            64b
1253        ret
12541280:
1255        AARCH64_VALID_JUMP_TARGET
1256128:
1257        ldp             q0,  q1,  [x1]
1258        subs            w4,  w4,  #1
1259        sshl            v0.8h,   v0.8h,   v31.8h
1260        ldp             q2,  q3,  [x1, #32]
1261        sshl            v1.8h,   v1.8h,   v31.8h
1262        ldp             q4,  q5,  [x1, #64]
1263        sshl            v2.8h,   v2.8h,   v31.8h
1264        sshl            v3.8h,   v3.8h,   v31.8h
1265        ldp             q6,  q7,  [x1, #96]
1266        sshl            v4.8h,   v4.8h,   v31.8h
1267        sshl            v5.8h,   v5.8h,   v31.8h
1268        ldp             q16, q17, [x1, #128]
1269        sshl            v6.8h,   v6.8h,   v31.8h
1270        sshl            v7.8h,   v7.8h,   v31.8h
1271        ldp             q18, q19, [x1, #160]
1272        sshl            v16.8h,  v16.8h,  v31.8h
1273        sshl            v17.8h,  v17.8h,  v31.8h
1274        ldp             q20, q21, [x1, #192]
1275        sshl            v18.8h,  v18.8h,  v31.8h
1276        sshl            v19.8h,  v19.8h,  v31.8h
1277        ldp             q22, q23, [x1, #224]
1278        add             x1,  x1,  x2
1279        sshl            v20.8h,  v20.8h,  v31.8h
1280        sshl            v21.8h,  v21.8h,  v31.8h
1281        sshl            v22.8h,  v22.8h,  v31.8h
1282        sshl            v23.8h,  v23.8h,  v31.8h
1283        sub             v0.8h,   v0.8h,   v30.8h
1284        sub             v1.8h,   v1.8h,   v30.8h
1285        sub             v2.8h,   v2.8h,   v30.8h
1286        sub             v3.8h,   v3.8h,   v30.8h
1287        stp             q0,  q1,  [x0]
1288        sub             v4.8h,   v4.8h,   v30.8h
1289        sub             v5.8h,   v5.8h,   v30.8h
1290        stp             q2,  q3,  [x0, #32]
1291        sub             v6.8h,   v6.8h,   v30.8h
1292        sub             v7.8h,   v7.8h,   v30.8h
1293        stp             q4,  q5,  [x0, #64]
1294        sub             v16.8h,  v16.8h,  v30.8h
1295        sub             v17.8h,  v17.8h,  v30.8h
1296        stp             q6,  q7,  [x0, #96]
1297        sub             v18.8h,  v18.8h,  v30.8h
1298        sub             v19.8h,  v19.8h,  v30.8h
1299        stp             q16, q17, [x0, #128]
1300        sub             v20.8h,  v20.8h,  v30.8h
1301        sub             v21.8h,  v21.8h,  v30.8h
1302        stp             q18, q19, [x0, #160]
1303        sub             v22.8h,  v22.8h,  v30.8h
1304        sub             v23.8h,  v23.8h,  v30.8h
1305        stp             q20, q21, [x0, #192]
1306        stp             q22, q23, [x0, #224]
1307        add             x0,  x0,  x8
1308        b.gt            128b
1309        ret
1310endfunc
1311
1312jumptable prep_16bpc_tbl
1313        .word 1280b - prep_16bpc_tbl
1314        .word 640b  - prep_16bpc_tbl
1315        .word 320b  - prep_16bpc_tbl
1316        .word 160b  - prep_16bpc_tbl
1317        .word 80b   - prep_16bpc_tbl
1318        .word 40b   - prep_16bpc_tbl
1319endjumptable
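// Jump table entries are 32 bit offsets relative to the table itself;
// the dispatch code loads an entry with ldrsw, adds it to the table
// address and branches to it. The numeric labels encode the width with
// a trailing 0 (e.g. 160: is the w == 16 case).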
1320
1321
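// Load/store helpers for the filter code below. The load_* macros fill
// up to seven registers, alternating between the two row pointers \s0
// and \s1 and advancing each by \strd; trailing destination arguments
// are optional and skipped via .ifnb.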
1322.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1323        ld1             {\d0\wd}[0], [\s0], \strd
1324        ld1             {\d1\wd}[0], [\s1], \strd
1325.ifnb \d2
1326        ld1             {\d2\wd}[0], [\s0], \strd
1327        ld1             {\d3\wd}[0], [\s1], \strd
1328.endif
1329.ifnb \d4
1330        ld1             {\d4\wd}[0], [\s0], \strd
1331.endif
1332.ifnb \d5
1333        ld1             {\d5\wd}[0], [\s1], \strd
1334.endif
1335.ifnb \d6
1336        ld1             {\d6\wd}[0], [\s0], \strd
1337.endif
1338.endm
1339.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1340        ld1             {\d0\wd}, [\s0], \strd
1341        ld1             {\d1\wd}, [\s1], \strd
1342.ifnb \d2
1343        ld1             {\d2\wd}, [\s0], \strd
1344        ld1             {\d3\wd}, [\s1], \strd
1345.endif
1346.ifnb \d4
1347        ld1             {\d4\wd}, [\s0], \strd
1348.endif
1349.ifnb \d5
1350        ld1             {\d5\wd}, [\s1], \strd
1351.endif
1352.ifnb \d6
1353        ld1             {\d6\wd}, [\s0], \strd
1354.endif
1355.endm
1356.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
1357        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
1358.ifnb \d2
1359        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
1360.endif
1361.ifnb \d4
1362        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
1363.endif
1364.endm
1365.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1366        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1367.endm
1368.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1369        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1370.endm
1371.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1372        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1373.endm
1374.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
1375        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
1376.endm
1377.macro interleave_1 wd, r0, r1, r2, r3, r4
1378        trn1            \r0\wd, \r0\wd, \r1\wd
1379        trn1            \r1\wd, \r1\wd, \r2\wd
1380.ifnb \r3
1381        trn1            \r2\wd, \r2\wd, \r3\wd
1382        trn1            \r3\wd, \r3\wd, \r4\wd
1383.endif
1384.endm
1385.macro interleave_1_s r0, r1, r2, r3, r4
1386        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1387.endm
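// umin_h clamps up to four registers against \c (bitdepth_max, for put);
// sub_h subtracts \c from up to four registers (PREP_BIAS, for prep).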
1388.macro umin_h c, wd, r0, r1, r2, r3
1389        umin            \r0\wd,  \r0\wd,  \c\wd
1390.ifnb \r1
1391        umin            \r1\wd,  \r1\wd,  \c\wd
1392.endif
1393.ifnb \r2
1394        umin            \r2\wd,  \r2\wd,  \c\wd
1395        umin            \r3\wd,  \r3\wd,  \c\wd
1396.endif
1397.endm
1398.macro sub_h c, wd, r0, r1, r2, r3
1399        sub             \r0\wd,  \r0\wd,  \c\wd
1400.ifnb \r1
1401        sub             \r1\wd,  \r1\wd,  \c\wd
1402.endif
1403.ifnb \r2
1404        sub             \r2\wd,  \r2\wd,  \c\wd
1405        sub             \r3\wd,  \r3\wd,  \c\wd
1406.endif
1407.endm
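// Vertical multiply-accumulate helpers: the 4tap variants use filter
// taps 0-3, 6tap uses taps 1-6 (the outermost taps are zero) and 8tap
// uses all eight. The 6tap macros take the same eight source arguments
// as 8tap and ignore \s0 and \s7, so callers can expand through \taps
// unchanged. The *2 variants process the high .8h halves.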
1408.macro smull_smlal_4tap d, s0, s1, s2, s3
1409        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1410        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1411        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1412        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1413.endm
1414.macro smull2_smlal2_4tap d, s0, s1, s2, s3
1415        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1416        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1417        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1418        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1419.endm
1420.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
1421        smull           \d\().4s,  \s1\().4h,  v0.h[1]
1422        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1423        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1424        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1425        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1426        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1427.endm
1428.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
1429        smull2          \d\().4s,  \s1\().8h,  v0.h[1]
1430        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1431        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1432        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1433        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1434        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1435.endm
1436.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
1437        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1438        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1439        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1440        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1441        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1442        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1443        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1444        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
1445.endm
1446.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
1447        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1448        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1449        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1450        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1451        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1452        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1453        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1454        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
1455.endm
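// Narrowing helpers: sqrshrun_h packs pairs of .4s accumulators into
// \r0 and \r2 with an unsigned saturating rounding shift (put), while
// srshl_s applies the (negated) shift amount in \shift and xtn_h then
// narrows the results via uzp1 (prep).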
1456.macro sqrshrun_h shift, r0, r1, r2, r3
1457        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
1458.ifnb \r1
1459        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
1460.endif
1461.ifnb \r2
1462        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
1463        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
1464.endif
1465.endm
1466.macro xtn_h r0, r1, r2, r3
1467        uzp1            \r0\().8h,  \r0\().8h,  \r1\().8h // Same as xtn, xtn2
1468.ifnb \r2
1469        uzp1            \r2\().8h,  \r2\().8h,  \r3\().8h // Ditto
1470.endif
1471.endm
1472.macro srshl_s shift, r0, r1, r2, r3
1473        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
1474        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
1475.ifnb \r2
1476        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
1477        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
1478.endif
1479.endm
1480.macro st_s strd, reg, lanes
1481        st1             {\reg\().s}[0], [x0], \strd
1482        st1             {\reg\().s}[1], [x9], \strd
1483.if \lanes > 2
1484        st1             {\reg\().s}[2], [x0], \strd
1485        st1             {\reg\().s}[3], [x9], \strd
1486.endif
1487.endm
1488.macro st_d strd, r0, r1
1489        st1             {\r0\().8b},   [x0], \strd
1490        st1             {\r0\().d}[1], [x9], \strd
1491.ifnb \r1
1492        st1             {\r1\().8b},   [x0], \strd
1493        st1             {\r1\().d}[1], [x9], \strd
1494.endif
1495.endm
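// Shift/clamp and store two or four 4-pixel rows: put rounds by 6 and
// clamps to bitdepth_max (v31); prep shifts by -(6-intermediate_bits)
// (v30) and subtracts PREP_BIAS (v29).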
1496.macro shift_store_4 type, strd, r0, r1, r2, r3
1497.ifc \type, put
1498        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1499        umin_h          v31, .8h, \r0, \r2
1500.else
1501        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1502        xtn_h           \r0, \r1, \r2, \r3
1503        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1504.endif
1505        st_d            \strd, \r0, \r2
1506.endm
1507.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
1508        st1             {\r0\wd}, [x0], \strd
1509        st1             {\r1\wd}, [x9], \strd
1510.ifnb \r2
1511        st1             {\r2\wd}, [x0], \strd
1512        st1             {\r3\wd}, [x9], \strd
1513.endif
1514.ifnb \r4
1515        st1             {\r4\wd}, [x0], \strd
1516        st1             {\r5\wd}, [x9], \strd
1517        st1             {\r6\wd}, [x0], \strd
1518        st1             {\r7\wd}, [x9], \strd
1519.endif
1520.endm
1521.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
1522        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1523.endm
1524.macro shift_store_8 type, strd, r0, r1, r2, r3
1525.ifc \type, put
1526        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1527        umin_h          v31, .8h, \r0, \r2
1528.else
1529        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1530        xtn_h           \r0, \r1, \r2, \r3
1531        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1532.endif
1533        st_8h           \strd, \r0, \r2
1534.endm
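// As shift_store_8, but packs the four accumulators into \r0/\r1 and
// stores them as a single 16-pixel row.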
1535.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
1536.ifc \type, put
1537        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1538        umin            \r0\().8h, \r0\().8h, v31.8h
1539        umin            \r1\().8h, \r2\().8h, v31.8h
1540.else
1541        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1542        xtn_h           \r0, \r1, \r2, \r3
1543        sub             \r0\().8h, \r0\().8h, v29.8h
1544        sub             \r1\().8h, \r2\().8h, v29.8h
1545.endif
1546        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
1547.endm
1548
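// Entry point generator: each exported put/prep function only records
// the horizontal/vertical filter types in w9/w10 and branches to the
// shared \op\()_\taps\()_neon code.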
1549.macro make_8tap_fn op, type, type_h, type_v, taps
1550function \op\()_8tap_\type\()_16bpc_neon, export=1
1551        mov             w9,  \type_h
1552        mov             w10, \type_v
1553        b               \op\()_\taps\()_neon
1554endfunc
1555.endm
1556
1557// No spaces in these expressions, due to gas-preprocessor.
1558#define REGULAR ((0*15<<7)|3*15)
1559#define SMOOTH  ((1*15<<7)|4*15)
1560#define SHARP   ((2*15<<7)|3*15)
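// Each define packs two 7-bit filter set indices: the high field is
// used for large block dimensions and the low field for small ones
// (w or h <= 4), where the 4-tap filter sets apply. Combined with the
// subpel position replicated by the 0x4081 multiply below, this forms
// the index into mc_subpel_filters for each pass.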
1561
1562.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
1563function \type\()_\taps\()_neon
1564.ifc \bdmax, w8
1565        ldr             w8,  [sp]
1566.endif
1567        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1568        mul             \mx,  \mx, w11
1569        mul             \my,  \my, w11
1570        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
1571        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
1572.ifc \type, prep
1573        uxtw            \d_strd, \w
1574        lsl             \d_strd, \d_strd, #1
1575.endif
1576
1577        dup             v31.8h,  \bdmax        // bitdepth_max
1578        clz             \bdmax,  \bdmax
1579        clz             w9,  \w
1580        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
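        // e.g. 10-bit: clz(1023) == 22 -> 4; 12-bit: clz(4095) == 20 -> 2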
1581        mov             w12, #6
1582        tst             \mx, #(0x7f << 14)
1583        sub             w9,  w9,  #24
1584        add             w13, w12, \bdmax       // 6 + intermediate_bits
1585        sub             w12, w12, \bdmax       // 6 - intermediate_bits
1586        movrel          x11, X(mc_subpel_filters), -8
1587        b.ne            L(\type\()_\taps\()_h)
1588        tst             \my, #(0x7f << 14)
1589        b.ne            L(\type\()_\taps\()_v)
1590        b               \type\()_16bpc_neon
1591
1592L(\type\()_\taps\()_h):
1593        cmp             \w,   #4
1594        ubfx            w10,  \mx, #7, #7
1595        and             \mx,  \mx, #0x7f
1596        b.le            4f
1597        mov             \mx,  w10
15984:
1599        tst             \my,  #(0x7f << 14)
1600        add             \xmx, x11, \mx, uxtw #3
1601        b.ne            L(\type\()_\taps\()_hv)
1602
1603        movrel          x10, \type\()_\taps\()_h_tbl
1604        ldrsw           x9,  [x10, x9, lsl #2]
1605.ifc \type, put
1606        mov             w12,  #34              // rounding for 10-bit
1607        mov             w13,  #40              // rounding for 12-bit
1608        cmp             \bdmax, #2             // intermediate_bits: 10-bit: 4, 12-bit: 2
1609        csel            w12,  w12,  w13,  ne   // 34 if intermediate_bits != 2 (10-bit), else 40 (12-bit)
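        // This folds the two-stage horizontal rounding,
        //   t   = (sum + (1 << (5 - intermediate_bits))) >> (6 - intermediate_bits)
        //   out = (t + (1 << (intermediate_bits - 1))) >> intermediate_bits
        // into a single bias added before one shift by 6:
        //   (1 << 5) + (1 << (5 - intermediate_bits)) = 34 (10-bit) or 40 (12-bit)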
1610.else
1611        neg             w12,  w12              // -(6 - intermediate_bits)
1612        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
1613.endif
1614        add             x10, x10, x9
1615        dup             v30.4s,  w12           // put: merged rounding bias, prep: -(6-intermediate_bits)
1616        br              x10
1617
161820:     // 2xN h
1619        AARCH64_VALID_JUMP_TARGET
1620.ifc \type, put
1621        ldur            s0,  [\xmx, #2]
1622        sub             \src,  \src,  #2
1623        add             \ds2,  \dst,  \d_strd
1624        add             \sr2,  \src,  \s_strd
1625        lsl             \d_strd,  \d_strd,  #1
1626        lsl             \s_strd,  \s_strd,  #1
1627        sxtl            v0.8h,   v0.8b
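        // Two rows per iteration: the trn1/trn2 pairs below gather the
        // matching 2-pixel groups of both rows into single registers, so
        // one 4-lane multiply-accumulate filters both rows at once.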
16282:
1629        ld1             {v4.8h},  [\src], \s_strd
1630        ld1             {v6.8h},  [\sr2], \s_strd
1631        mov             v2.16b,  v30.16b
1632        ext             v5.16b,  v4.16b,  v4.16b,  #2
1633        ext             v7.16b,  v6.16b,  v6.16b,  #2
1634        subs            \h,  \h,  #2
1635        trn1            v3.2s,   v4.2s,   v6.2s
1636        trn2            v6.2s,   v4.2s,   v6.2s
1637        trn1            v4.2s,   v5.2s,   v7.2s
1638        trn2            v7.2s,   v5.2s,   v7.2s
1639        smlal           v2.4s,   v3.4h,   v0.h[0]
1640        smlal           v2.4s,   v4.4h,   v0.h[1]
1641        smlal           v2.4s,   v6.4h,   v0.h[2]
1642        smlal           v2.4s,   v7.4h,   v0.h[3]
1643        sqshrun         v2.4h,   v2.4s,   #6
1644        umin            v2.4h,   v2.4h,   v31.4h
1645        st1             {v2.s}[0], [\dst], \d_strd
1646        st1             {v2.s}[1], [\ds2], \d_strd
1647        b.gt            2b
1648        ret
1649.endif
1650
165140:     // 4xN h
1652        AARCH64_VALID_JUMP_TARGET
1653        ldur            s0,  [\xmx, #2]
1654        sub             \src,  \src,  #2
1655        add             \ds2,  \dst,  \d_strd
1656        add             \sr2,  \src,  \s_strd
1657        lsl             \d_strd,  \d_strd,  #1
1658        lsl             \s_strd,  \s_strd,  #1
1659        sxtl            v0.8h,   v0.8b
16604:
1661        ld1             {v16.8h}, [\src], \s_strd
1662        ld1             {v20.8h}, [\sr2], \s_strd
1663.ifc \type, put
1664        mov             v2.16b,  v30.16b
1665        mov             v3.16b,  v30.16b
1666.endif
1667        ext             v17.16b, v16.16b, v16.16b, #2
1668        ext             v18.16b, v16.16b, v16.16b, #4
1669        ext             v19.16b, v16.16b, v16.16b, #6
1670        ext             v21.16b, v20.16b, v20.16b, #2
1671        ext             v22.16b, v20.16b, v20.16b, #4
1672        ext             v23.16b, v20.16b, v20.16b, #6
1673        subs            \h,  \h,  #2
1674.ifc \type, put
1675        smlal           v2.4s,   v16.4h,  v0.h[0]
1676.else
1677        smull           v2.4s,   v16.4h,  v0.h[0]
1678.endif
1679        smlal           v2.4s,   v17.4h,  v0.h[1]
1680        smlal           v2.4s,   v18.4h,  v0.h[2]
1681        smlal           v2.4s,   v19.4h,  v0.h[3]
1682.ifc \type, put
1683        smlal           v3.4s,   v20.4h,  v0.h[0]
1684.else
1685        smull           v3.4s,   v20.4h,  v0.h[0]
1686.endif
1687        smlal           v3.4s,   v21.4h,  v0.h[1]
1688        smlal           v3.4s,   v22.4h,  v0.h[2]
1689        smlal           v3.4s,   v23.4h,  v0.h[3]
1690.ifc \type, put
1691        sqshrun         v16.4h,  v2.4s,   #6
1692        sqshrun2        v16.8h,  v3.4s,   #6
1693        umin            v16.8h,  v16.8h,  v31.8h
1694.else
1695        srshl           v16.4s,  v2.4s,   v30.4s // -(6-intermediate_bits)
1696        srshl           v20.4s,  v3.4s,   v30.4s // -(6-intermediate_bits)
1697        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
1698        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
1699.endif
1700        st1             {v16.8b},   [\dst], \d_strd
1701        st1             {v16.d}[1], [\ds2], \d_strd
1702        b.gt            4b
1703        ret
1704
170580:
1706160:
1707320:
1708640:
17091280:   // 8xN, 16xN, 32xN, ... h
1710        AARCH64_VALID_JUMP_TARGET
1711        ld1             {v0.8b}, [\xmx]
1712.ifc \taps, 6tap
1713        sub             \src,  \src,  #4
1714.else
1715        sub             \src,  \src,  #6
1716.endif
1717        add             \ds2,  \dst,  \d_strd
1718        add             \sr2,  \src,  \s_strd
1719        lsl             \s_strd,  \s_strd,  #1
1720        sxtl            v0.8h,   v0.8b
1721
1722        sub             \s_strd,  \s_strd,  \w, uxtw #1
1723        sub             \s_strd,  \s_strd,  #16
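        // Each row pointer advances by 2*\w + 16 bytes per pass (a 32 byte
        // initial load plus 16 bytes per additional 8 pixels), which the
        // doubled stride compensates for.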
1724.ifc \type, put
1725        lsl             \d_strd,  \d_strd,  #1
1726        sub             \d_strd,  \d_strd,  \w, uxtw #1
1727.endif
172881:
1729        ld1             {v16.8h, v17.8h},  [\src], #32
1730        ld1             {v20.8h, v21.8h},  [\sr2], #32
1731        mov             \mx, \w
1732
17338:
1734.ifc \taps, 6tap
1735    .ifc \type, put
1736        mov             v18.16b, v30.16b
1737        mov             v19.16b, v30.16b
1738        smlal           v18.4s,  v16.4h,  v0.h[1]
1739        smlal2          v19.4s,  v16.8h,  v0.h[1]
1740        mov             v22.16b, v30.16b
1741        mov             v23.16b, v30.16b
1742        smlal           v22.4s,  v20.4h,  v0.h[1]
1743        smlal2          v23.4s,  v20.8h,  v0.h[1]
1744    .else
1745        smull           v18.4s,  v16.4h,  v0.h[1]
1746        smull2          v19.4s,  v16.8h,  v0.h[1]
1747        smull           v22.4s,  v20.4h,  v0.h[1]
1748        smull2          v23.4s,  v20.8h,  v0.h[1]
1749    .endif
1750    .irpc i, 23456
1751        ext             v24.16b, v16.16b, v17.16b, #(2*\i-2)
1752        ext             v25.16b, v20.16b, v21.16b, #(2*\i-2)
1753        smlal           v18.4s,  v24.4h,  v0.h[\i]
1754        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1755        smlal           v22.4s,  v25.4h,  v0.h[\i]
1756        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1757    .endr
1758.else   // 8tap
1759    .ifc \type, put
1760        mov             v18.16b, v30.16b
1761        mov             v19.16b, v30.16b
1762        smlal           v18.4s,  v16.4h,  v0.h[0]
1763        smlal2          v19.4s,  v16.8h,  v0.h[0]
1764        mov             v22.16b, v30.16b
1765        mov             v23.16b, v30.16b
1766        smlal           v22.4s,  v20.4h,  v0.h[0]
1767        smlal2          v23.4s,  v20.8h,  v0.h[0]
1768    .else
1769        smull           v18.4s,  v16.4h,  v0.h[0]
1770        smull2          v19.4s,  v16.8h,  v0.h[0]
1771        smull           v22.4s,  v20.4h,  v0.h[0]
1772        smull2          v23.4s,  v20.8h,  v0.h[0]
1773    .endif
1774    .irpc i, 1234567
1775        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
1776        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
1777        smlal           v18.4s,  v24.4h,  v0.h[\i]
1778        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1779        smlal           v22.4s,  v25.4h,  v0.h[\i]
1780        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1781    .endr
1782.endif
1783        subs            \mx, \mx, #8
1784.ifc \type, put
1785        sqshrun         v18.4h,  v18.4s,  #6
1786        sqshrun2        v18.8h,  v19.4s,  #6
1787        sqshrun         v22.4h,  v22.4s,  #6
1788        sqshrun2        v22.8h,  v23.4s,  #6
1789        umin            v18.8h,  v18.8h,  v31.8h
1790        umin            v22.8h,  v22.8h,  v31.8h
1791.else
1792        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
1793        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
1794        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
1795        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
1796        uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
1797        uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
1798        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
1799        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
1800.endif
1801        st1             {v18.8h}, [\dst], #16
1802        st1             {v22.8h}, [\ds2], #16
1803        b.le            9f
1804
1805        mov             v16.16b, v17.16b
1806        mov             v20.16b, v21.16b
1807        ld1             {v17.8h}, [\src], #16
1808        ld1             {v21.8h}, [\sr2], #16
1809        b               8b
1810
18119:
1812        add             \dst,  \dst,  \d_strd
1813        add             \ds2,  \ds2,  \d_strd
1814        add             \src,  \src,  \s_strd
1815        add             \sr2,  \sr2,  \s_strd
1816
1817        subs            \h,  \h,  #2
1818        b.gt            81b
1819        ret
1820endfunc
1821
1822jumptable \type\()_\taps\()_h_tbl
1823        .word 1280b - \type\()_\taps\()_h_tbl
1824        .word 640b  - \type\()_\taps\()_h_tbl
1825        .word 320b  - \type\()_\taps\()_h_tbl
1826        .word 160b  - \type\()_\taps\()_h_tbl
1827        .word 80b   - \type\()_\taps\()_h_tbl
1828        .word 40b   - \type\()_\taps\()_h_tbl
1829        .word 20b   - \type\()_\taps\()_h_tbl
1830endjumptable
1831
1832
1833function L(\type\()_\taps\()_v)
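        // h > 4 selects the full-length filter set from the high field of
        // \my; h <= 4 keeps the 4-tap set from the low field.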
1834        cmp             \h,  #4
1835        ubfx            w10, \my, #7, #7
1836        and             \my, \my, #0x7f
1837        b.le            4f
1838        mov             \my, w10
18394:
1840        add             \xmy, x11, \my, uxtw #3
1841
1842.ifc \type, prep
1843        dup             v30.4s,  w12           // 6 - intermediate_bits
1844        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
1845.endif
1846        movrel          x10, \type\()_\taps\()_v_tbl
1847        ldrsw           x9,  [x10, x9, lsl #2]
1848.ifc \type, prep
1849        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1850.endif
1851        add             x10, x10, x9
1852        br              x10
1853
185420:     // 2xN v
1855        AARCH64_VALID_JUMP_TARGET
1856.ifc \type, put
1857        b.gt            28f
1858
1859        cmp             \h,  #2
1860        ldur            s0,  [\xmy, #2]
1861        sub             \src,  \src,  \s_strd
1862        add             \ds2,  \dst,  \d_strd
1863        add             \sr2,  \src,  \s_strd
1864        lsl             \s_strd,  \s_strd,  #1
1865        lsl             \d_strd,  \d_strd,  #1
1866        sxtl            v0.8h,   v0.8b
1867
1868        // 2x2 v
1869        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1870        interleave_1_s  v1,  v2,  v3,  v4,  v5
1871        b.gt            24f
1872        smull_smlal_4tap v6, v1,  v2,  v3,  v4
1873        sqrshrun_h      6,   v6
1874        umin_h          v31, .8h, v6
1875        st_s            \d_strd, v6, 2
1876        ret
1877
187824:     // 2x4 v
1879        load_s          \sr2, \src, \s_strd, v6, v7
1880        interleave_1_s  v5,  v6,  v7
1881        smull_smlal_4tap v16, v1, v2, v3, v4
1882        smull_smlal_4tap v17, v3, v4, v5, v6
1883        sqrshrun_h      6,   v16, v17
1884        umin_h          v31, .8h, v16
1885        st_s            \d_strd, v16, 4
1886        ret
1887
188828:     // 2x6, 2x8, 2x12, 2x16 v
1889        ld1             {v0.8b}, [\xmy]
1890        sub             \sr2,  \src,  \s_strd, lsl #1
1891        add             \ds2,  \dst,  \d_strd
1892        sub             \src,  \sr2,  \s_strd
1893        lsl             \d_strd,  \d_strd,  #1
1894        lsl             \s_strd,  \s_strd,  #1
1895        sxtl            v0.8h,   v0.8b
1896
1897        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1898        interleave_1_s  v1,  v2,  v3,  v4,  v5
1899        interleave_1_s  v5,  v6,  v7
1900216:
1901        subs            \h,  \h,  #4
1902        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
1903        interleave_1_s  v7,  v16, v17, v18, v19
1904        smull_smlal_\taps v24, v1,  v2,  v3,  v4,  v5,  v6,  v7, v16
1905        smull_smlal_\taps v25, v3,  v4,  v5,  v6,  v7, v16, v17, v18
1906        sqrshrun_h      6,   v24, v25
1907        umin_h          v31, .8h, v24
1908        st_s            \d_strd, v24, 4
1909        b.le            0f
1910        cmp             \h,  #2
1911        mov             v1.16b,  v5.16b
1912        mov             v2.16b,  v6.16b
1913        mov             v3.16b,  v7.16b
1914        mov             v4.16b,  v16.16b
1915        mov             v5.16b,  v17.16b
1916        mov             v6.16b,  v18.16b
1917        mov             v7.16b,  v19.16b
1918        b.eq            26f
1919        b               216b
192026:
1921        load_s          \sr2, \src, \s_strd, v16, v17
1922        interleave_1_s  v7,  v16, v17
1923        smull_smlal_\taps v24, v1, v2,  v3,  v4,  v5,  v6,  v7, v16
1924        sqrshrun_h      6,   v24
1925        umin_h          v31, .4h, v24
1926        st_s            \d_strd, v24, 2
19270:
1928        ret
1929.endif
1930
193140:
1932        AARCH64_VALID_JUMP_TARGET
1933        b.gt            480f
1934
1935        // 4x2, 4x4 v
1936        cmp             \h,  #2
1937        ldur            s0,  [\xmy, #2]
1938        sub             \src, \src, \s_strd
1939        add             \ds2, \dst, \d_strd
1940        add             \sr2, \src, \s_strd
1941        lsl             \s_strd, \s_strd, #1
1942        lsl             \d_strd, \d_strd, #1
1943        sxtl            v0.8h,   v0.8b
1944
1945        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1946        smull_smlal_4tap v6,  v1,  v2,  v3,  v4
1947        smull_smlal_4tap v7,  v2,  v3,  v4,  v5
1948        shift_store_4   \type, \d_strd, v6, v7
1949        b.le            0f
1950        load_4h         \sr2, \src, \s_strd, v6, v7
1951        smull_smlal_4tap v1,  v3,  v4,  v5,  v6
1952        smull_smlal_4tap v2,  v4,  v5,  v6,  v7
1953        shift_store_4   \type, \d_strd, v1, v2
19540:
1955        ret
1956
1957480:    // 4x6, 4x8, 4x12, 4x16 v
1958        ld1             {v0.8b}, [\xmy]
1959        sub             \sr2, \src, \s_strd, lsl #1
1960        add             \ds2, \dst, \d_strd
1961        sub             \src, \sr2, \s_strd
1962        lsl             \s_strd, \s_strd, #1
1963        lsl             \d_strd, \d_strd, #1
1964        sxtl            v0.8h,   v0.8b
1965
1966        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1967
196848:
1969        subs            \h,  \h,  #4
1970        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
1971        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
1972        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
1973        smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
1974        smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
1975        shift_store_4   \type, \d_strd, v1, v2, v3, v4
1976        b.le            0f
1977        cmp             \h,  #2
1978        mov             v16.8b,  v20.8b
1979        mov             v17.8b,  v21.8b
1980        mov             v18.8b,  v22.8b
1981        mov             v19.8b,  v23.8b
1982        mov             v20.8b,  v24.8b
1983        mov             v21.8b,  v25.8b
1984        mov             v22.8b,  v26.8b
1985        b.eq            46f
1986        b               48b
198746:
1988        load_4h         \sr2, \src, \s_strd, v23, v24
1989        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
1990        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
1991        shift_store_4   \type, \d_strd, v1, v2
19920:
1993        ret
1994
199580:
1996        AARCH64_VALID_JUMP_TARGET
1997        b.gt            880f
1998
1999        // 8x2, 8x4 v
2000        cmp             \h,  #2
2001        ldur            s0,  [\xmy, #2]
2002        sub             \src, \src, \s_strd
2003        add             \ds2, \dst, \d_strd
2004        add             \sr2, \src, \s_strd
2005        lsl             \s_strd, \s_strd, #1
2006        lsl             \d_strd, \d_strd, #1
2007        sxtl            v0.8h,   v0.8b
2008
2009        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
2010        smull_smlal_4tap   v16, v1,  v2,  v3,  v4
2011        smull2_smlal2_4tap v17, v1,  v2,  v3,  v4
2012        smull_smlal_4tap   v18, v2,  v3,  v4,  v5
2013        smull2_smlal2_4tap v19, v2,  v3,  v4,  v5
2014        shift_store_8   \type, \d_strd, v16, v17, v18, v19
2015        b.le            0f
2016        load_8h         \sr2, \src, \s_strd, v6, v7
2017        smull_smlal_4tap   v16, v3,  v4,  v5,  v6
2018        smull2_smlal2_4tap v17, v3,  v4,  v5,  v6
2019        smull_smlal_4tap   v18, v4,  v5,  v6,  v7
2020        smull2_smlal2_4tap v19, v4,  v5,  v6,  v7
2021        shift_store_8   \type, \d_strd, v16, v17, v18, v19
20220:
2023        ret
2024
2025880:    // 8x6, 8x8, 8x16, 8x32 v
20261680:   // 16x8, 16x16, ...
2027320:    // 32x8, 32x16, ...
2028640:
20291280:
2030        AARCH64_VALID_JUMP_TARGET
2031        ld1             {v0.8b}, [\xmy]
2032        sub             \src, \src, \s_strd
2033        sub             \src, \src, \s_strd, lsl #1
2034        sxtl            v0.8h,   v0.8b
2035        mov             \my,  \h
2036168:
2037        add             \ds2, \dst, \d_strd
2038        add             \sr2, \src, \s_strd
2039        lsl             \s_strd, \s_strd, #1
2040        lsl             \d_strd, \d_strd, #1
2041
2042        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
2043
204488:
2045        subs            \h,  \h,  #2
2046        load_8h         \sr2, \src, \s_strd, v23, v24
2047        smull_smlal_\taps   v1, v16, v17, v18, v19, v20, v21, v22, v23
2048        smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
2049        smull_smlal_\taps   v3, v17, v18, v19, v20, v21, v22, v23, v24
2050        smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
2051        shift_store_8   \type, \d_strd, v1, v2, v3, v4
2052        b.le            9f
2053        subs            \h,  \h,  #2
2054        load_8h         \sr2, \src, \s_strd, v25, v26
2055        smull_smlal_\taps   v1, v18, v19, v20, v21, v22, v23, v24, v25
2056        smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
2057        smull_smlal_\taps   v3, v19, v20, v21, v22, v23, v24, v25, v26
2058        smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
2059        shift_store_8   \type, \d_strd, v1, v2, v3, v4
2060        b.le            9f
2061        mov             v16.16b, v20.16b
2062        mov             v17.16b, v21.16b
2063        mov             v18.16b, v22.16b
2064        mov             v19.16b, v23.16b
2065        mov             v20.16b, v24.16b
2066        mov             v21.16b, v25.16b
2067        mov             v22.16b, v26.16b
2068        b               88b
20699:
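        // Done with one 8-pixel column: rewind the pointers past the rows
        // consumed for this column (plus the vertical lookahead), then
        // step 16 bytes to the next column and restart.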
2070        subs            \w,  \w,  #8
2071        b.le            0f
2072        asr             \s_strd, \s_strd, #1
2073        asr             \d_strd, \d_strd, #1
2074        msub            \src, \s_strd, \xmy, \src
2075        msub            \dst, \d_strd, \xmy, \dst
2076        sub             \src, \src, \s_strd, lsl #3
2077        mov             \h,  \my
2078        add             \src, \src, #16
2079        add             \dst, \dst, #16
2080        b               168b
20810:
2082        ret
2083
2084160:
2085        AARCH64_VALID_JUMP_TARGET
2086        b.gt            1680b
2087
2088        // 16x2, 16x4 v
2089        ldur            s0,  [\xmy, #2]
2090        sub             \src, \src, \s_strd
2091        sxtl            v0.8h,   v0.8b
2092
2093        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
209416:
2095        load_16h        \src, \src, \s_strd, v22, v23
2096        subs            \h,  \h,  #1
2097        smull_smlal_4tap   v1, v16, v18, v20, v22
2098        smull2_smlal2_4tap v2, v16, v18, v20, v22
2099        smull_smlal_4tap   v3, v17, v19, v21, v23
2100        smull2_smlal2_4tap v4, v17, v19, v21, v23
2101        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
2102        b.le            0f
2103        mov             v16.16b, v18.16b
2104        mov             v17.16b, v19.16b
2105        mov             v18.16b, v20.16b
2106        mov             v19.16b, v21.16b
2107        mov             v20.16b, v22.16b
2108        mov             v21.16b, v23.16b
2109        b               16b
21100:
2111        ret
2112endfunc
2113
2114jumptable \type\()_\taps\()_v_tbl
2115        .word 1280b - \type\()_\taps\()_v_tbl
2116        .word 640b  - \type\()_\taps\()_v_tbl
2117        .word 320b  - \type\()_\taps\()_v_tbl
2118        .word 160b  - \type\()_\taps\()_v_tbl
2119        .word 80b   - \type\()_\taps\()_v_tbl
2120        .word 40b   - \type\()_\taps\()_v_tbl
2121        .word 20b   - \type\()_\taps\()_v_tbl
2122endjumptable
2123
2124function L(\type\()_\taps\()_hv)
2125        cmp             \h,  #4
2126        ubfx            w10, \my, #7, #7
2127        and             \my, \my, #0x7f
2128        b.le            4f
2129        mov             \my,  w10
21304:
2131        add             \xmy, x11, \my, uxtw #3
2132
2133        movrel          x10, \type\()_\taps\()_hv_tbl
2134        dup             v30.4s,  w12           // 6 - intermediate_bits
2135        ldrsw           x9,  [x10, x9, lsl #2]
2136        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
2137.ifc \type, put
2138        dup             v29.4s,  w13           // 6 + intermediate_bits
2139.else
2140        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2141.endif
2142        add             x10, x10, x9
2143.ifc \type, put
2144        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
2145.endif
2146        br              x10
2147
214820:
2149        AARCH64_VALID_JUMP_TARGET
2150.ifc \type, put
2151        ldur            s0,  [\xmx, #2]
2152        b.gt            280f
2153        ldur            s1,  [\xmy, #2]
2154
2155        // 2x2, 2x4 hv
2156        sub             \sr2, \src, #2
2157        sub             \src, \sr2, \s_strd
2158        add             \ds2, \dst, \d_strd
2159        lsl             \s_strd, \s_strd, #1
2160        lsl             \d_strd, \d_strd, #1
2161        sxtl            v0.8h,   v0.8b
2162        sxtl            v1.8h,   v1.8b
2163        mov             x15, x30
2164
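        // Filter the row above the first output row: its two pixels are
        // computed with one multiply and two pairwise adds instead of the
        // transpose scheme used by filter_2 in the loop below.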
2165        ld1             {v27.8h}, [\src], \s_strd
2166        ext             v28.16b, v27.16b, v27.16b, #2
2167        smull           v27.4s,  v27.4h,  v0.4h
2168        smull           v28.4s,  v28.4h,  v0.4h
2169        addp            v27.4s,  v27.4s,  v28.4s
2170        addp            v16.4s,  v27.4s,  v27.4s
2171        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2172        bl              L(\type\()_\taps\()_filter_2)
2173        // The intermediates from the horizontal pass fit in 16 bits without
2174        // any bias; we could just as well keep them as .4s, but narrowing
2175        // them to .4h gives a significant speedup on out-of-order cores
2176        // (at the cost of a smaller slowdown on in-order cores such as A53).
2177        xtn             v16.4h,  v16.4s
2178
2179        trn1            v16.2s,  v16.2s,  v24.2s
2180        mov             v17.8b,  v24.8b
2181
21822:
2183        bl              L(\type\()_\taps\()_filter_2)
2184
2185        ext             v18.8b,  v17.8b,  v24.8b,  #4
2186        smull           v2.4s,   v16.4h,  v1.h[0]
2187        smlal           v2.4s,   v17.4h,  v1.h[1]
2188        smlal           v2.4s,   v18.4h,  v1.h[2]
2189        smlal           v2.4s,   v24.4h,  v1.h[3]
2190
2191        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2192        sqxtun          v2.4h,   v2.4s
2193        umin            v2.4h,   v2.4h,   v31.4h
2194        subs            \h,  \h,  #2
2195        st1             {v2.s}[0], [\dst], \d_strd
2196        st1             {v2.s}[1], [\ds2], \d_strd
2197        b.le            0f
2198        mov             v16.8b,  v18.8b
2199        mov             v17.8b,  v24.8b
2200        b               2b
2201
2202280:    // 2x8, 2x16, 2x32 hv
2203        ld1             {v1.8b},  [\xmy]
2204        sub             \src, \src, #2
2205        sub             \sr2, \src, \s_strd, lsl #1
2206        sub             \src, \sr2, \s_strd
2207        add             \ds2, \dst, \d_strd
2208        lsl             \s_strd, \s_strd, #1
2209        lsl             \d_strd, \d_strd, #1
2210        sxtl            v0.8h,   v0.8b
2211        sxtl            v1.8h,   v1.8b
2212        mov             x15, x30
2213
2214        ld1             {v27.8h}, [\src], \s_strd
2215        ext             v28.16b, v27.16b, v27.16b, #2
2216        smull           v27.4s,  v27.4h,  v0.4h
2217        smull           v28.4s,  v28.4h,  v0.4h
2218        addp            v27.4s,  v27.4s,  v28.4s
2219        addp            v16.4s,  v27.4s,  v27.4s
2220        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2221        // The intermediates from the horizontal pass fit in 16 bits without
2222        // any bias; we could just as well keep them as .4s, but narrowing
2223        // them to .4h gives a significant speedup on out-of-order cores
2224        // (at the cost of a smaller slowdown on in-order cores such as A53).
2225
2226        bl              L(\type\()_\taps\()_filter_2)
2227        xtn             v16.4h,  v16.4s
2228        trn1            v16.2s,  v16.2s,  v24.2s
2229        mov             v17.8b,  v24.8b
2230        bl              L(\type\()_\taps\()_filter_2)
2231        ext             v18.8b,  v17.8b,  v24.8b,  #4
2232        mov             v19.8b,  v24.8b
2233        bl              L(\type\()_\taps\()_filter_2)
2234        ext             v20.8b,  v19.8b,  v24.8b,  #4
2235        mov             v21.8b,  v24.8b
2236
223728:
2238        bl              L(\type\()_\taps\()_filter_2)
2239        ext             v22.8b,  v21.8b,  v24.8b,  #4
2240.ifc \taps, 6tap
2241        smull           v3.4s,   v17.4h,  v1.h[1]
2242        smlal           v3.4s,   v18.4h,  v1.h[2]
2243        smlal           v3.4s,   v19.4h,  v1.h[3]
2244        smlal           v3.4s,   v20.4h,  v1.h[4]
2245        smlal           v3.4s,   v21.4h,  v1.h[5]
2246        smlal           v3.4s,   v22.4h,  v1.h[6]
2247.else   // 8tap
2248        smull           v3.4s,   v16.4h,  v1.h[0]
2249        smlal           v3.4s,   v17.4h,  v1.h[1]
2250        smlal           v3.4s,   v18.4h,  v1.h[2]
2251        smlal           v3.4s,   v19.4h,  v1.h[3]
2252        smlal           v3.4s,   v20.4h,  v1.h[4]
2253        smlal           v3.4s,   v21.4h,  v1.h[5]
2254        smlal           v3.4s,   v22.4h,  v1.h[6]
2255        smlal           v3.4s,   v24.4h,  v1.h[7]
2256.endif
2257
2258        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2259        sqxtun          v3.4h,   v3.4s
2260        umin            v3.4h,   v3.4h,   v31.4h
2261        subs            \h,  \h,  #2
2262        st1             {v3.s}[0], [\dst], \d_strd
2263        st1             {v3.s}[1], [\ds2], \d_strd
2264        b.le            0f
2265        mov             v16.8b,  v18.8b
2266        mov             v17.8b,  v19.8b
2267        mov             v18.8b,  v20.8b
2268        mov             v19.8b,  v21.8b
2269        mov             v20.8b,  v22.8b
2270        mov             v21.8b,  v24.8b
2271        b               28b
2272
22730:
2274        ret             x15
2275
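// Horizontal 4-tap filter for the next two rows, two pixels each; the
// four results are narrowed and returned in v24.4h.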
2276L(\type\()_\taps\()_filter_2):
2277        ld1             {v25.8h},  [\sr2], \s_strd
2278        ld1             {v27.8h},  [\src], \s_strd
2279        ext             v26.16b, v25.16b, v25.16b, #2
2280        ext             v28.16b, v27.16b, v27.16b, #2
2281        trn1            v24.2s,  v25.2s,  v27.2s
2282        trn2            v27.2s,  v25.2s,  v27.2s
2283        trn1            v25.2s,  v26.2s,  v28.2s
2284        trn2            v28.2s,  v26.2s,  v28.2s
2285        smull           v24.4s,  v24.4h,  v0.h[0]
2286        smlal           v24.4s,  v25.4h,  v0.h[1]
2287        smlal           v24.4s,  v27.4h,  v0.h[2]
2288        smlal           v24.4s,  v28.4h,  v0.h[3]
2289        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2290        xtn             v24.4h,  v24.4s
2291        ret
2292.endif
2293
229440:
2295        AARCH64_VALID_JUMP_TARGET
2296        ldur            s0,  [\xmx, #2]
2297        b.gt            480f
2298        ldur            s1,  [\xmy, #2]
2299        sub             \sr2, \src, #2
2300        sub             \src, \sr2, \s_strd
2301        add             \ds2, \dst, \d_strd
2302        lsl             \s_strd, \s_strd, #1
2303        lsl             \d_strd, \d_strd, #1
2304        sxtl            v0.8h,   v0.8b
2305        sxtl            v1.8h,   v1.8b
2306        mov             x15, x30
2307
2308        // 4x2, 4x4 hv
2309        ld1             {v25.8h}, [\src], \s_strd
2310        ext             v26.16b, v25.16b, v25.16b, #2
2311        ext             v27.16b, v25.16b, v25.16b, #4
2312        ext             v28.16b, v25.16b, v25.16b, #6
2313        smull           v25.4s,  v25.4h,  v0.h[0]
2314        smlal           v25.4s,  v26.4h,  v0.h[1]
2315        smlal           v25.4s,  v27.4h,  v0.h[2]
2316        smlal           v25.4s,  v28.4h,  v0.h[3]
2317        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2318        // The intermediates from the horizontal pass fit in 16 bits without
2319        // any bias; we could just as well keep them as .4s, but narrowing
2320        // them to .4h gives a significant speedup on out-of-order cores
2321        // (at the cost of a smaller slowdown on in-order cores such as A53).
2322        xtn             v16.4h,  v16.4s
2323
2324        bl              L(\type\()_\taps\()_filter_4)
2325        mov             v17.8b,  v24.8b
2326        mov             v18.8b,  v25.8b
2327
23284:
2329        bl              L(\type\()_\taps\()_filter_4)
2330        smull           v2.4s,   v16.4h,  v1.h[0]
2331        smlal           v2.4s,   v17.4h,  v1.h[1]
2332        smlal           v2.4s,   v18.4h,  v1.h[2]
2333        smlal           v2.4s,   v24.4h,  v1.h[3]
2334        smull           v3.4s,   v17.4h,  v1.h[0]
2335        smlal           v3.4s,   v18.4h,  v1.h[1]
2336        smlal           v3.4s,   v24.4h,  v1.h[2]
2337        smlal           v3.4s,   v25.4h,  v1.h[3]
2338.ifc \type, put
2339        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2340        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2341        sqxtun          v2.4h,   v2.4s
2342        sqxtun2         v2.8h,   v3.4s
2343        umin            v2.8h,   v2.8h,   v31.8h
2344.else
2345        rshrn           v2.4h,   v2.4s,   #6
2346        rshrn2          v2.8h,   v3.4s,   #6
2347        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2348.endif
2349        subs            \h,  \h,  #2
2350
2351        st1             {v2.8b},   [\dst], \d_strd
2352        st1             {v2.d}[1], [\ds2], \d_strd
2353        b.le            0f
2354        mov             v16.8b,  v18.8b
2355        mov             v17.8b,  v24.8b
2356        mov             v18.8b,  v25.8b
2357        b               4b
2358
2359480:    // 4x8, 4x16, 4x32 hv
2360        ld1             {v1.8b},  [\xmy]
2361        sub             \src, \src, #2
2362.ifc \taps, 6tap
2363        sub             \sr2, \src, \s_strd
2364        sub             \src, \src, \s_strd, lsl #1
2365.else
2366        sub             \sr2, \src, \s_strd, lsl #1
2367        sub             \src, \sr2, \s_strd
2368.endif
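        // (6-tap vertical filtering needs two rows above the first output
        // row, 8-tap needs three.)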
2369        add             \ds2, \dst, \d_strd
2370        lsl             \s_strd, \s_strd, #1
2371        lsl             \d_strd, \d_strd, #1
2372        sxtl            v0.8h,   v0.8b
2373        sxtl            v1.8h,   v1.8b
2374        mov             x15, x30
2375
2376        ld1             {v25.8h}, [\src], \s_strd
2377        ext             v26.16b, v25.16b, v25.16b, #2
2378        ext             v27.16b, v25.16b, v25.16b, #4
2379        ext             v28.16b, v25.16b, v25.16b, #6
2380        smull           v25.4s,  v25.4h,  v0.h[0]
2381        smlal           v25.4s,  v26.4h,  v0.h[1]
2382        smlal           v25.4s,  v27.4h,  v0.h[2]
2383        smlal           v25.4s,  v28.4h,  v0.h[3]
2384        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2385        // The intermediates from the horizontal pass fit in 16 bits without
2386        // any bias; we could just as well keep them as .4s, but narrowing
2387        // them to .4h gives a significant speedup on out-of-order cores
2388        // (at the cost of a smaller slowdown on in-order cores such as A53).
2389.ifc \taps, 6tap
2390        xtn             v18.4h,  v16.4s
2391.else
2392        xtn             v16.4h,  v16.4s
2393
2394        bl              L(\type\()_\taps\()_filter_4)
2395        mov             v17.8b,  v24.8b
2396        mov             v18.8b,  v25.8b
2397.endif
2398        bl              L(\type\()_\taps\()_filter_4)
2399        mov             v19.8b,  v24.8b
2400        mov             v20.8b,  v25.8b
2401        bl              L(\type\()_\taps\()_filter_4)
2402        mov             v21.8b,  v24.8b
2403        mov             v22.8b,  v25.8b
2404
240548:
2406        bl              L(\type\()_\taps\()_filter_4)
2407.ifc \taps, 6tap
2408        smull           v3.4s,   v18.4h,  v1.h[1]
2409        smlal           v3.4s,   v19.4h,  v1.h[2]
2410        smlal           v3.4s,   v20.4h,  v1.h[3]
2411        smlal           v3.4s,   v21.4h,  v1.h[4]
2412        smlal           v3.4s,   v22.4h,  v1.h[5]
2413        smlal           v3.4s,   v24.4h,  v1.h[6]
2414        smull           v4.4s,   v19.4h,  v1.h[1]
2415        smlal           v4.4s,   v20.4h,  v1.h[2]
2416        smlal           v4.4s,   v21.4h,  v1.h[3]
2417        smlal           v4.4s,   v22.4h,  v1.h[4]
2418        smlal           v4.4s,   v24.4h,  v1.h[5]
2419        smlal           v4.4s,   v25.4h,  v1.h[6]
2420.else   // 8tap
2421        smull           v3.4s,   v16.4h,  v1.h[0]
2422        smlal           v3.4s,   v17.4h,  v1.h[1]
2423        smlal           v3.4s,   v18.4h,  v1.h[2]
2424        smlal           v3.4s,   v19.4h,  v1.h[3]
2425        smlal           v3.4s,   v20.4h,  v1.h[4]
2426        smlal           v3.4s,   v21.4h,  v1.h[5]
2427        smlal           v3.4s,   v22.4h,  v1.h[6]
2428        smlal           v3.4s,   v24.4h,  v1.h[7]
2429        smull           v4.4s,   v17.4h,  v1.h[0]
2430        smlal           v4.4s,   v18.4h,  v1.h[1]
2431        smlal           v4.4s,   v19.4h,  v1.h[2]
2432        smlal           v4.4s,   v20.4h,  v1.h[3]
2433        smlal           v4.4s,   v21.4h,  v1.h[4]
2434        smlal           v4.4s,   v22.4h,  v1.h[5]
2435        smlal           v4.4s,   v24.4h,  v1.h[6]
2436        smlal           v4.4s,   v25.4h,  v1.h[7]
2437.endif
2438.ifc \type, put
2439        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2440        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2441        sqxtun          v3.4h,   v3.4s
2442        sqxtun2         v3.8h,   v4.4s
2443        umin            v3.8h,   v3.8h,   v31.8h
2444.else
2445        rshrn           v3.4h,   v3.4s,   #6
2446        rshrn2          v3.8h,   v4.4s,   #6
2447        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2448.endif
2449        subs            \h,  \h,  #2
2450        st1             {v3.8b},   [\dst], \d_strd
2451        st1             {v3.d}[1], [\ds2], \d_strd
2452        b.le            0f
2453.ifc \taps, 8tap
2454        mov             v16.8b,  v18.8b
2455        mov             v17.8b,  v19.8b
2456.endif
2457        mov             v18.8b,  v20.8b
2458        mov             v19.8b,  v21.8b
2459        mov             v20.8b,  v22.8b
2460        mov             v21.8b,  v24.8b
2461        mov             v22.8b,  v25.8b
2462        b               48b
24630:
2464        ret             x15
2465
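// Horizontal 4-tap filter for two rows of four pixels; the per-row
// results are returned narrowed in v24.4h and v25.4h.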
2466L(\type\()_\taps\()_filter_4):
2467        ld1             {v24.8h}, [\sr2], \s_strd
2468        ld1             {v25.8h}, [\src], \s_strd
2469        ext             v26.16b, v24.16b, v24.16b, #2
2470        ext             v27.16b, v24.16b, v24.16b, #4
2471        ext             v28.16b, v24.16b, v24.16b, #6
2472        smull           v24.4s,  v24.4h,  v0.h[0]
2473        smlal           v24.4s,  v26.4h,  v0.h[1]
2474        smlal           v24.4s,  v27.4h,  v0.h[2]
2475        smlal           v24.4s,  v28.4h,  v0.h[3]
2476        ext             v26.16b, v25.16b, v25.16b, #2
2477        ext             v27.16b, v25.16b, v25.16b, #4
2478        ext             v28.16b, v25.16b, v25.16b, #6
2479        smull           v25.4s,  v25.4h,  v0.h[0]
2480        smlal           v25.4s,  v26.4h,  v0.h[1]
2481        smlal           v25.4s,  v27.4h,  v0.h[2]
2482        smlal           v25.4s,  v28.4h,  v0.h[3]
2483        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2484        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2485        xtn             v24.4h,  v24.4s
2486        xtn             v25.4h,  v25.4s
2487        ret
2488
248980:
2490160:
2491320:
2492        AARCH64_VALID_JUMP_TARGET
2493        b.gt            880f
2494        ld1             {v0.8b},  [\xmx]
2495        ldur            s1,  [\xmy, #2]
2496.ifc \taps, 6tap
2497        sub             \src,  \src,  #4
2498.else
2499        sub             \src,  \src,  #6
2500.endif
2501        sub             \src,  \src,  \s_strd
2502        sxtl            v0.8h,   v0.8b
2503        sxtl            v1.8h,   v1.8b
2504        mov             x15, x30
2505        mov             \my, \h
2506
2507164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2508        add             \ds2,  \dst,  \d_strd
2509        add             \sr2,  \src,  \s_strd
2510        lsl             \d_strd, \d_strd, #1
2511        lsl             \s_strd, \s_strd, #1
2512
2513        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2514.ifc \taps, 6tap
2515        smull           v24.4s,  v27.4h,  v0.h[1]
2516        smull2          v25.4s,  v27.8h,  v0.h[1]
2517    .irpc i, 23456
2518        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
2519        smlal           v24.4s,  v26.4h,  v0.h[\i]
2520        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2521    .endr
2522.else
2523        smull           v24.4s,  v27.4h,  v0.h[0]
2524        smull2          v25.4s,  v27.8h,  v0.h[0]
2525    .irpc i, 1234567
2526        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2527        smlal           v24.4s,  v26.4h,  v0.h[\i]
2528        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2529    .endr
2530.endif
2531        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2532        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2533        // The intermediates from the horizontal pass fit in 16 bits without
2534        // any bias; we could just as well keep them as .4s, but narrowing
2535        // them to .4h gives a significant speedup on out-of-order cores
2536        // (at the cost of a smaller slowdown on in-order cores such as A53),
2537        // and conserves register space (no need to clobber v8-v15).
2538        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2539
2540        bl              L(\type\()_\taps\()_filter_8)
2541        mov             v17.16b, v23.16b
2542        mov             v18.16b, v24.16b
2543
25448:
2545        smull           v2.4s,   v16.4h,  v1.h[0]
2546        smull2          v3.4s,   v16.8h,  v1.h[0]
2547        bl              L(\type\()_\taps\()_filter_8)
2548        smull           v4.4s,   v17.4h,  v1.h[0]
2549        smull2          v5.4s,   v17.8h,  v1.h[0]
2550        smlal           v2.4s,   v17.4h,  v1.h[1]
2551        smlal2          v3.4s,   v17.8h,  v1.h[1]
2552        smlal           v4.4s,   v18.4h,  v1.h[1]
2553        smlal2          v5.4s,   v18.8h,  v1.h[1]
2554        smlal           v2.4s,   v18.4h,  v1.h[2]
2555        smlal2          v3.4s,   v18.8h,  v1.h[2]
2556        smlal           v4.4s,   v23.4h,  v1.h[2]
2557        smlal2          v5.4s,   v23.8h,  v1.h[2]
2558        smlal           v2.4s,   v23.4h,  v1.h[3]
2559        smlal2          v3.4s,   v23.8h,  v1.h[3]
2560        smlal           v4.4s,   v24.4h,  v1.h[3]
2561        smlal2          v5.4s,   v24.8h,  v1.h[3]
2562.ifc \type, put
2563        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2564        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2565        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2566        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2567        sqxtun          v2.4h,   v2.4s
2568        sqxtun2         v2.8h,   v3.4s
2569        sqxtun          v3.4h,   v4.4s
2570        sqxtun2         v3.8h,   v5.4s
2571        umin            v2.8h,   v2.8h,   v31.8h
2572        umin            v3.8h,   v3.8h,   v31.8h
2573.else
2574        rshrn           v2.4h,   v2.4s,   #6
2575        rshrn2          v2.8h,   v3.4s,   #6
2576        rshrn           v3.4h,   v4.4s,   #6
2577        rshrn2          v3.8h,   v5.4s,   #6
2578        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2579        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2580.endif
2581        subs            \h,  \h,  #2
2582        st1             {v2.8h}, [\dst], \d_strd
2583        st1             {v3.8h}, [\ds2], \d_strd
2584        b.le            9f
2585        mov             v16.16b, v18.16b
2586        mov             v17.16b, v23.16b
2587        mov             v18.16b, v24.16b
2588        b               8b
25899:
2590        subs            \w,  \w,  #8
2591        b.le            0f
2592        asr             \s_strd,  \s_strd,  #1
2593        asr             \d_strd,  \d_strd,  #1
2594        msub            \src,  \s_strd,  \xmy,  \src
2595        msub            \dst,  \d_strd,  \xmy,  \dst
2596        sub             \src,  \src,  \s_strd,  lsl #2
2597        mov             \h,  \my
2598        add             \src,  \src,  #16
2599        add             \dst,  \dst,  #16
2600        b               164b
2601
2602880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2603640:
26041280:
2605        AARCH64_VALID_JUMP_TARGET
2606        ld1             {v0.8b},  [\xmx]
2607        ld1             {v1.8b},  [\xmy]
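        // Point src at the first tap: the 6-tap variant uses taps 1..6 of
        // the 8-tap layout, so it starts one pixel (2 bytes) further right
        // and one row further down than the 8-tap variant.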
2608.ifc \taps, 6tap
2609        sub             \src,  \src,  #4
2610.else
2611        sub             \src,  \src,  #6
2612        sub             \src,  \src,  \s_strd
2613.endif
2614        sub             \src,  \src,  \s_strd, lsl #1
2615        sxtl            v0.8h,   v0.8b
2616        sxtl            v1.8h,   v1.8b
2617        mov             x15, x30
2618        mov             \my, \h
2619
2620168:
2621        add             \ds2,  \dst,  \d_strd
2622        add             \sr2,  \src,  \s_strd
2623        lsl             \d_strd, \d_strd, #1
2624        lsl             \s_strd, \s_strd, #1
2625
2626        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2627.ifc \taps, 6tap
2628        smull           v24.4s,  v27.4h,  v0.h[1]
2629        smull2          v25.4s,  v27.8h,  v0.h[1]
2630    .irpc i, 23456
2631        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
2632        smlal           v24.4s,  v26.4h,  v0.h[\i]
2633        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2634    .endr
2635.else   // 8tap
2636        smull           v24.4s,  v27.4h,  v0.h[0]
2637        smull2          v25.4s,  v27.8h,  v0.h[0]
2638    .irpc i, 1234567
2639        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2640        smlal           v24.4s,  v26.4h,  v0.h[\i]
2641        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2642    .endr
2643.endif
2644        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2645        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2646        // The intermediates from the horizontal pass fit in 16 bits without
2647        // any bias; we could just as well keep them as .4s, but narrowing
2648        // them to .4h gives a significant speedup on out-of-order cores
2649        // (at the cost of a smaller slowdown on in-order cores such as the A53),
2650        // and conserves register space (no need to clobber v8-v15).
2651.ifc \taps, 6tap
2652        uzp1            v18.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2653.else
2654        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2655
2656        bl              L(\type\()_\taps\()_filter_8)
2657        mov             v17.16b, v23.16b
2658        mov             v18.16b, v24.16b
2659.endif
2660        bl              L(\type\()_\taps\()_filter_8)
2661        mov             v19.16b, v23.16b
2662        mov             v20.16b, v24.16b
2663        bl              L(\type\()_\taps\()_filter_8)
2664        mov             v21.16b, v23.16b
2665        mov             v22.16b, v24.16b
2666
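        // Vertical main loop, two output rows per iteration: the 6-tap
        // variant applies taps v1.h[1..6] to rows v18-v24, the 8-tap
        // variant applies all eight taps v1.h[0..7] to rows v16-v24.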
266788:
2668.ifc \taps, 6tap
2669        smull           v2.4s,   v18.4h,  v1.h[1]
2670        smull2          v3.4s,   v18.8h,  v1.h[1]
2671        bl              L(\type\()_\taps\()_filter_8)
2672        smull           v4.4s,   v19.4h,  v1.h[1]
2673        smull2          v5.4s,   v19.8h,  v1.h[1]
2674        smlal           v2.4s,   v19.4h,  v1.h[2]
2675        smlal2          v3.4s,   v19.8h,  v1.h[2]
2676        smlal           v4.4s,   v20.4h,  v1.h[2]
2677        smlal2          v5.4s,   v20.8h,  v1.h[2]
2678        smlal           v2.4s,   v20.4h,  v1.h[3]
2679        smlal2          v3.4s,   v20.8h,  v1.h[3]
2680        smlal           v4.4s,   v21.4h,  v1.h[3]
2681        smlal2          v5.4s,   v21.8h,  v1.h[3]
2682        smlal           v2.4s,   v21.4h,  v1.h[4]
2683        smlal2          v3.4s,   v21.8h,  v1.h[4]
2684        smlal           v4.4s,   v22.4h,  v1.h[4]
2685        smlal2          v5.4s,   v22.8h,  v1.h[4]
2686        smlal           v2.4s,   v22.4h,  v1.h[5]
2687        smlal2          v3.4s,   v22.8h,  v1.h[5]
2688        smlal           v4.4s,   v23.4h,  v1.h[5]
2689        smlal2          v5.4s,   v23.8h,  v1.h[5]
2690        smlal           v2.4s,   v23.4h,  v1.h[6]
2691        smlal2          v3.4s,   v23.8h,  v1.h[6]
2692        smlal           v4.4s,   v24.4h,  v1.h[6]
2693        smlal2          v5.4s,   v24.8h,  v1.h[6]
2694.else   // 8tap
2695        smull           v2.4s,   v16.4h,  v1.h[0]
2696        smull2          v3.4s,   v16.8h,  v1.h[0]
2697        bl              L(\type\()_\taps\()_filter_8)
2698        smull           v4.4s,   v17.4h,  v1.h[0]
2699        smull2          v5.4s,   v17.8h,  v1.h[0]
2700        smlal           v2.4s,   v17.4h,  v1.h[1]
2701        smlal2          v3.4s,   v17.8h,  v1.h[1]
2702        smlal           v4.4s,   v18.4h,  v1.h[1]
2703        smlal2          v5.4s,   v18.8h,  v1.h[1]
2704        smlal           v2.4s,   v18.4h,  v1.h[2]
2705        smlal2          v3.4s,   v18.8h,  v1.h[2]
2706        smlal           v4.4s,   v19.4h,  v1.h[2]
2707        smlal2          v5.4s,   v19.8h,  v1.h[2]
2708        smlal           v2.4s,   v19.4h,  v1.h[3]
2709        smlal2          v3.4s,   v19.8h,  v1.h[3]
2710        smlal           v4.4s,   v20.4h,  v1.h[3]
2711        smlal2          v5.4s,   v20.8h,  v1.h[3]
2712        smlal           v2.4s,   v20.4h,  v1.h[4]
2713        smlal2          v3.4s,   v20.8h,  v1.h[4]
2714        smlal           v4.4s,   v21.4h,  v1.h[4]
2715        smlal2          v5.4s,   v21.8h,  v1.h[4]
2716        smlal           v2.4s,   v21.4h,  v1.h[5]
2717        smlal2          v3.4s,   v21.8h,  v1.h[5]
2718        smlal           v4.4s,   v22.4h,  v1.h[5]
2719        smlal2          v5.4s,   v22.8h,  v1.h[5]
2720        smlal           v2.4s,   v22.4h,  v1.h[6]
2721        smlal2          v3.4s,   v22.8h,  v1.h[6]
2722        smlal           v4.4s,   v23.4h,  v1.h[6]
2723        smlal2          v5.4s,   v23.8h,  v1.h[6]
2724        smlal           v2.4s,   v23.4h,  v1.h[7]
2725        smlal2          v3.4s,   v23.8h,  v1.h[7]
2726        smlal           v4.4s,   v24.4h,  v1.h[7]
2727        smlal2          v5.4s,   v24.8h,  v1.h[7]
2728.endif
2729.ifc \type, put
2730        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2731        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2732        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2733        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2734        sqxtun          v2.4h,   v2.4s
2735        sqxtun2         v2.8h,   v3.4s
2736        sqxtun          v3.4h,   v4.4s
2737        sqxtun2         v3.8h,   v5.4s
2738        umin            v2.8h,   v2.8h,   v31.8h
2739        umin            v3.8h,   v3.8h,   v31.8h
2740.else
2741        rshrn           v2.4h,   v2.4s,   #6
2742        rshrn2          v2.8h,   v3.4s,   #6
2743        rshrn           v3.4h,   v4.4s,   #6
2744        rshrn2          v3.8h,   v5.4s,   #6
2745        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2746        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2747.endif
2748        subs            \h,  \h,  #2
2749        st1             {v2.8h}, [\dst], \d_strd
2750        st1             {v3.8h}, [\ds2], \d_strd
2751        b.le            9f
2752.ifc \taps, 8tap
2753        mov             v16.16b, v18.16b
2754        mov             v17.16b, v19.16b
2755.endif
2756        mov             v18.16b, v20.16b
2757        mov             v19.16b, v21.16b
2758        mov             v20.16b, v22.16b
2759        mov             v21.16b, v23.16b
2760        mov             v22.16b, v24.16b
2761        b               88b
27629:
2763        subs            \w,  \w,  #8
2764        b.le            0f
2765        asr             \s_strd,  \s_strd,  #1
2766        asr             \d_strd,  \d_strd,  #1
2767        msub            \src,  \s_strd,  \xmy,  \src
2768        msub            \dst,  \d_strd,  \xmy,  \dst
2769        sub             \src,  \src,  \s_strd,  lsl #3
2770        mov             \h,  \my
2771        add             \src,  \src,  #16
2772        add             \dst,  \dst,  #16
2773.ifc \taps, 6tap
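        // The 6-tap variant keeps two fewer rows of vertical history, so
        // step back two rows less than the lsl #3 above rewound.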
2774        add             \src,  \src,  \s_strd,  lsl #1
2775.endif
2776        b               168b
27770:
2778        ret             x15
2779
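// Helper: load one row each from sr2 and src, apply the horizontal 6/8-tap
// filter to both, and return the two filtered rows, narrowed to 16 bits,
// in v23 and v24.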
2780L(\type\()_\taps\()_filter_8):
2781        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
2782        ld1             {v6.8h, v7.8h},  [\src], \s_strd
2783.ifc \taps, 6tap
2784        smull           v25.4s,  v4.4h,   v0.h[1]
2785        smull2          v26.4s,  v4.8h,   v0.h[1]
2786        smull           v27.4s,  v6.4h,   v0.h[1]
2787        smull2          v28.4s,  v6.8h,   v0.h[1]
2788.irpc i, 23456
2789        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i-2)
2790        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i-2)
2791        smlal           v25.4s,  v23.4h,  v0.h[\i]
2792        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2793        smlal           v27.4s,  v24.4h,  v0.h[\i]
2794        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2795.endr
2796.else   // 8tap
2797        smull           v25.4s,  v4.4h,   v0.h[0]
2798        smull2          v26.4s,  v4.8h,   v0.h[0]
2799        smull           v27.4s,  v6.4h,   v0.h[0]
2800        smull2          v28.4s,  v6.8h,   v0.h[0]
2801.irpc i, 1234567
2802        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
2803        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
2804        smlal           v25.4s,  v23.4h,  v0.h[\i]
2805        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2806        smlal           v27.4s,  v24.4h,  v0.h[\i]
2807        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2808.endr
2809.endif
2810        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2811        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
2812        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
2813        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
2814        uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
2815        uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
2816        ret
2817endfunc
2818
2819jumptable \type\()_\taps\()_hv_tbl
2820        .word 1280b - \type\()_\taps\()_hv_tbl
2821        .word 640b  - \type\()_\taps\()_hv_tbl
2822        .word 320b  - \type\()_\taps\()_hv_tbl
2823        .word 160b  - \type\()_\taps\()_hv_tbl
2824        .word 80b   - \type\()_\taps\()_hv_tbl
2825        .word 40b   - \type\()_\taps\()_hv_tbl
2826        .word 20b   - \type\()_\taps\()_hv_tbl
2827endjumptable
2828.endm
2829
2830
2831.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
2832function \type\()_bilin_16bpc_neon, export=1
2833.ifc \bdmax, w8
2834        ldr             w8,  [sp]
2835.endif
2836        dup             v1.8h,   \mx
2837        dup             v3.8h,   \my
2838        mov             w10, #16
2839        sub             w9,  w10, \mx
2840        sub             w10, w10, \my
2841        dup             v0.8h,   w9
2842        dup             v2.8h,   w10
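        // In C terms, the bilinear kernels below compute (a sketch):
        //   h pass: t[x] = (16 - mx)*src[x] + mx*src[x + 1]
        //   v pass: d[x] = (16 - my)*a[x]   + my*b[x]
        // with v0/v1 and v2/v3 holding the horizontal and vertical weight
        // pairs; only the rounding/normalization differs between put/prep.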
2843.ifc \type, prep
2844        uxtw            \d_strd, \w
2845        lsl             \d_strd, \d_strd, #1
2846.endif
2847
2848        clz             \bdmax,   \bdmax       // bitdepth_max
2849        clz             w9,  \w
2850        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
2851        mov             w11, #4
2852        sub             w9,  w9,  #24
2853        sub             w11, w11, \bdmax  // 4 - intermediate_bits
2854        add             w12, \bdmax, #4   // 4 + intermediate_bits
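        // E.g. 10 bpc: bitdepth_max = 1023, clz = 22, intermediate_bits = 4;
        //      12 bpc: bitdepth_max = 4095, clz = 20, intermediate_bits = 2.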
2855        cbnz            \mx, L(\type\()_bilin_h)
2856        cbnz            \my, L(\type\()_bilin_v)
2857        b               \type\()_16bpc_neon
2858
2859L(\type\()_bilin_h):
2860        cbnz            \my, L(\type\()_bilin_hv)
2861
2862        movrel          x10, \type\()_bilin_h_tbl
2863        dup             v31.8h,  w11      // 4 - intermediate_bits
2864        ldrsw           x9,  [x10, x9, lsl #2]
2865        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2866.ifc \type, put
2867        dup             v30.8h,  \bdmax   // intermediate_bits
2868.else
2869        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2870.endif
2871        add             x10, x10, x9
2872.ifc \type, put
2873        neg             v30.8h,  v30.8h   // -intermediate_bits
2874.endif
2875        br              x10
2876
287720:     // 2xN h
2878        AARCH64_VALID_JUMP_TARGET
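        // Only put needs the 2xN case; prep is never called with w == 2.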
2879.ifc \type, put
2880        add             \ds2,  \dst,  \d_strd
2881        add             \sr2,  \src,  \s_strd
2882        lsl             \d_strd,  \d_strd,  #1
2883        lsl             \s_strd,  \s_strd,  #1
28842:
2885        ld1             {v4.4h},  [\src], \s_strd
2886        ld1             {v6.4h},  [\sr2], \s_strd
2887        ext             v5.8b,   v4.8b,   v4.8b,   #2
2888        ext             v7.8b,   v6.8b,   v6.8b,   #2
2889        trn1            v4.2s,   v4.2s,   v6.2s
2890        trn1            v5.2s,   v5.2s,   v7.2s
2891        subs            \h,  \h,  #2
2892        mul             v4.4h,   v4.4h,   v0.4h
2893        mla             v4.4h,   v5.4h,   v1.4h
2894        urshl           v4.4h,   v4.4h,   v31.4h
2895        urshl           v4.4h,   v4.4h,   v30.4h
2896        st1             {v4.s}[0], [\dst], \d_strd
2897        st1             {v4.s}[1], [\ds2], \d_strd
2898        b.gt            2b
2899        ret
2900.endif
2901
290240:     // 4xN h
2903        AARCH64_VALID_JUMP_TARGET
2904        add             \ds2,  \dst,  \d_strd
2905        add             \sr2,  \src,  \s_strd
2906        lsl             \d_strd,  \d_strd,  #1
2907        lsl             \s_strd,  \s_strd,  #1
29084:
2909        ld1             {v4.8h}, [\src], \s_strd
2910        ld1             {v6.8h}, [\sr2], \s_strd
2911        ext             v5.16b,  v4.16b,  v4.16b,  #2
2912        ext             v7.16b,  v6.16b,  v6.16b,  #2
2913        trn1            v4.2d,   v4.2d,   v6.2d
2914        trn1            v5.2d,   v5.2d,   v7.2d
2915        subs            \h,  \h,  #2
2916        mul             v4.8h,   v4.8h,   v0.8h
2917        mla             v4.8h,   v5.8h,   v1.8h
2918        urshl           v4.8h,   v4.8h,   v31.8h
2919.ifc \type, put
2920        urshl           v4.8h,   v4.8h,   v30.8h
2921.else
2922        sub             v4.8h,   v4.8h,   v29.8h
2923.endif
2924        st1             {v4.8b},   [\dst], \d_strd
2925        st1             {v4.d}[1], [\ds2], \d_strd
2926        b.gt            4b
2927        ret
2928
292980:     // 8xN h
2930        AARCH64_VALID_JUMP_TARGET
2931        add             \ds2,  \dst,  \d_strd
2932        add             \sr2,  \src,  \s_strd
2933        lsl             \d_strd,  \d_strd,  #1
2934        lsl             \s_strd,  \s_strd,  #1
29358:
2936        ldr             h5,  [\src, #16]
2937        ldr             h7,  [\sr2, #16]
2938        ld1             {v4.8h}, [\src], \s_strd
2939        ld1             {v6.8h}, [\sr2], \s_strd
2940        ext             v5.16b,  v4.16b,  v5.16b,  #2
2941        ext             v7.16b,  v6.16b,  v7.16b,  #2
2942        subs            \h,  \h,  #2
2943        mul             v4.8h,   v4.8h,   v0.8h
2944        mla             v4.8h,   v5.8h,   v1.8h
2945        mul             v6.8h,   v6.8h,   v0.8h
2946        mla             v6.8h,   v7.8h,   v1.8h
2947        urshl           v4.8h,   v4.8h,   v31.8h
2948        urshl           v6.8h,   v6.8h,   v31.8h
2949.ifc \type, put
2950        urshl           v4.8h,   v4.8h,   v30.8h
2951        urshl           v6.8h,   v6.8h,   v30.8h
2952.else
2953        sub             v4.8h,   v4.8h,   v29.8h
2954        sub             v6.8h,   v6.8h,   v29.8h
2955.endif
2956        st1             {v4.8h}, [\dst], \d_strd
2957        st1             {v6.8h}, [\ds2], \d_strd
2958        b.gt            8b
2959        ret
2960160:
2961320:
2962640:
29631280:   // 16xN, 32xN, ... h
2964        AARCH64_VALID_JUMP_TARGET
2965        add             \ds2,  \dst,  \d_strd
2966        add             \sr2,  \src,  \s_strd
2967        lsl             \s_strd,  \s_strd,  #1
2968
2969        sub             \s_strd,  \s_strd,  \w, uxtw #1
2970        sub             \s_strd,  \s_strd,  #16
2971.ifc \type, put
2972        lsl             \d_strd,  \d_strd,  #1
2973        sub             \d_strd,  \d_strd,  \w, uxtw #1
2974.endif
2975161:
2976        ld1             {v16.8h},  [\src], #16
2977        ld1             {v21.8h},  [\sr2], #16
2978        mov             \mx, \w
2979
298016:
2981        ld1             {v17.8h, v18.8h},  [\src], #32
2982        ld1             {v22.8h, v23.8h},  [\sr2], #32
2983        ext             v19.16b, v16.16b, v17.16b, #2
2984        ext             v20.16b, v17.16b, v18.16b, #2
2985        ext             v24.16b, v21.16b, v22.16b, #2
2986        ext             v25.16b, v22.16b, v23.16b, #2
2987        mul             v16.8h,  v16.8h,  v0.8h
2988        mla             v16.8h,  v19.8h,  v1.8h
2989        mul             v17.8h,  v17.8h,  v0.8h
2990        mla             v17.8h,  v20.8h,  v1.8h
2991        mul             v21.8h,  v21.8h,  v0.8h
2992        mla             v21.8h,  v24.8h,  v1.8h
2993        mul             v22.8h,  v22.8h,  v0.8h
2994        mla             v22.8h,  v25.8h,  v1.8h
2995        urshl           v16.8h,  v16.8h,  v31.8h
2996        urshl           v17.8h,  v17.8h,  v31.8h
2997        urshl           v21.8h,  v21.8h,  v31.8h
2998        urshl           v22.8h,  v22.8h,  v31.8h
2999        subs            \mx, \mx, #16
3000.ifc \type, put
3001        urshl           v16.8h,  v16.8h,  v30.8h
3002        urshl           v17.8h,  v17.8h,  v30.8h
3003        urshl           v21.8h,  v21.8h,  v30.8h
3004        urshl           v22.8h,  v22.8h,  v30.8h
3005.else
3006        sub             v16.8h,  v16.8h,  v29.8h
3007        sub             v17.8h,  v17.8h,  v29.8h
3008        sub             v21.8h,  v21.8h,  v29.8h
3009        sub             v22.8h,  v22.8h,  v29.8h
3010.endif
3011        st1             {v16.8h, v17.8h}, [\dst], #32
3012        st1             {v21.8h, v22.8h}, [\ds2], #32
3013        b.le            9f
3014
3015        mov             v16.16b, v18.16b
3016        mov             v21.16b, v23.16b
3017        b               16b
3018
30199:
3020        add             \dst,  \dst,  \d_strd
3021        add             \ds2,  \ds2,  \d_strd
3022        add             \src,  \src,  \s_strd
3023        add             \sr2,  \sr2,  \s_strd
3024
3025        subs            \h,  \h,  #2
3026        b.gt            161b
3027        ret
3028endfunc
3029
3030jumptable \type\()_bilin_h_tbl
3031        .word 1280b - \type\()_bilin_h_tbl
3032        .word 640b  - \type\()_bilin_h_tbl
3033        .word 320b  - \type\()_bilin_h_tbl
3034        .word 160b  - \type\()_bilin_h_tbl
3035        .word 80b   - \type\()_bilin_h_tbl
3036        .word 40b   - \type\()_bilin_h_tbl
3037        .word 20b   - \type\()_bilin_h_tbl
3038endjumptable
3039
3040
3041function L(\type\()_bilin_v)
3042        cmp             \h,  #4
3043        movrel          x10, \type\()_bilin_v_tbl
3044.ifc \type, prep
3045        dup             v31.8h,  w11      // 4 - intermediate_bits
3046.endif
3047        ldrsw           x9,  [x10, x9, lsl #2]
3048.ifc \type, prep
3049        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
3050        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
3051.endif
3052        add             x10, x10, x9
3053        br              x10
3054
305520:     // 2xN v
3056        AARCH64_VALID_JUMP_TARGET
3057.ifc \type, put
3058        cmp             \h,  #2
3059        add             \ds2,  \dst,  \d_strd
3060        add             \sr2,  \src,  \s_strd
3061        lsl             \s_strd,  \s_strd,  #1
3062        lsl             \d_strd,  \d_strd,  #1
3063
3064        // 2x2 v
3065        ld1r            {v16.4s}, [\src], \s_strd
3066        b.gt            24f
306722:
3068        ld1r            {v17.4s}, [\sr2], \s_strd
3069        ld1r            {v18.4s}, [\src], \s_strd
3070        trn1            v16.2s,  v16.2s,  v17.2s
3071        trn1            v17.2s,  v17.2s,  v18.2s
3072        mul             v4.4h,   v16.4h,  v2.4h
3073        mla             v4.4h,   v17.4h,  v3.4h
3074        urshr           v4.8h,   v4.8h,   #4
3075        str             s4,        [\dst]
3076        st1             {v4.s}[1], [\ds2]
3077        ret
307824:     // 2x4, 2x6, 2x8, ... v
3079        ld1r            {v17.4s}, [\sr2], \s_strd
3080        ld1r            {v18.4s}, [\src], \s_strd
3081        ld1r            {v19.4s}, [\sr2], \s_strd
3082        ld1r            {v20.4s}, [\src], \s_strd
3083        sub             \h,  \h,  #4
3084        trn1            v16.2s,  v16.2s,  v17.2s
3085        trn1            v17.2s,  v17.2s,  v18.2s
3086        trn1            v18.2s,  v18.2s,  v19.2s
3087        trn1            v19.2s,  v19.2s,  v20.2s
3088        trn1            v16.2d,  v16.2d,  v18.2d
3089        trn1            v17.2d,  v17.2d,  v19.2d
3090        mul             v4.8h,   v16.8h,  v2.8h
3091        mla             v4.8h,   v17.8h,  v3.8h
3092        cmp             \h,  #2
3093        urshr           v4.8h,   v4.8h,   #4
3094        st1             {v4.s}[0], [\dst], \d_strd
3095        st1             {v4.s}[1], [\ds2], \d_strd
3096        st1             {v4.s}[2], [\dst], \d_strd
3097        st1             {v4.s}[3], [\ds2], \d_strd
3098        b.lt            0f
3099        mov             v16.8b,  v20.8b
3100        b.eq            22b
3101        b               24b
31020:
3103        ret
3104.endif
3105
310640:     // 4xN v
3107        AARCH64_VALID_JUMP_TARGET
3108        add             \ds2,  \dst,  \d_strd
3109        add             \sr2,  \src,  \s_strd
3110        lsl             \s_strd,  \s_strd,  #1
3111        lsl             \d_strd,  \d_strd,  #1
3112        ld1             {v16.4h}, [\src], \s_strd
31134:
3114        ld1             {v17.4h}, [\sr2], \s_strd
3115        ld1             {v18.4h}, [\src], \s_strd
3116        trn1            v16.2d,  v16.2d,  v17.2d
3117        trn1            v17.2d,  v17.2d,  v18.2d
3118        mul             v4.8h,   v16.8h,  v2.8h
3119        mla             v4.8h,   v17.8h,  v3.8h
3120        subs            \h,  \h,  #2
3121.ifc \type, put
3122        urshr           v4.8h,   v4.8h,   #4
3123.else
3124        urshl           v4.8h,   v4.8h,   v31.8h
3125        sub             v4.8h,   v4.8h,   v29.8h
3126.endif
3127        st1             {v4.8b},   [\dst], \d_strd
3128        st1             {v4.d}[1], [\ds2], \d_strd
3129        b.le            0f
3130        mov             v16.8b,  v18.8b
3131        b               4b
31320:
3133        ret
3134
313580:     // 8xN v
3136        AARCH64_VALID_JUMP_TARGET
3137        add             \ds2,  \dst,  \d_strd
3138        add             \sr2,  \src,  \s_strd
3139        lsl             \s_strd,  \s_strd,  #1
3140        lsl             \d_strd,  \d_strd,  #1
3141        ld1             {v16.8h}, [\src], \s_strd
31428:
3143        ld1             {v17.8h}, [\sr2], \s_strd
3144        ld1             {v18.8h}, [\src], \s_strd
3145        mul             v4.8h,   v16.8h,  v2.8h
3146        mla             v4.8h,   v17.8h,  v3.8h
3147        mul             v5.8h,   v17.8h,  v2.8h
3148        mla             v5.8h,   v18.8h,  v3.8h
3149        subs            \h,  \h,  #2
3150.ifc \type, put
3151        urshr           v4.8h,   v4.8h,   #4
3152        urshr           v5.8h,   v5.8h,   #4
3153.else
3154        urshl           v4.8h,   v4.8h,   v31.8h
3155        urshl           v5.8h,   v5.8h,   v31.8h
3156        sub             v4.8h,   v4.8h,   v29.8h
3157        sub             v5.8h,   v5.8h,   v29.8h
3158.endif
3159        st1             {v4.8h}, [\dst], \d_strd
3160        st1             {v5.8h}, [\ds2], \d_strd
3161        b.le            0f
3162        mov             v16.16b, v18.16b
3163        b               8b
31640:
3165        ret
3166
3167160:    // 16xN, 32xN, ...
3168320:
3169640:
31701280:
3171        AARCH64_VALID_JUMP_TARGET
3172        mov             \my, \h
31731:
3174        add             \ds2, \dst, \d_strd
3175        add             \sr2, \src, \s_strd
3176        lsl             \s_strd, \s_strd, #1
3177        lsl             \d_strd, \d_strd, #1
3178
3179        ld1             {v16.8h, v17.8h}, [\src], \s_strd
31802:
3181        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
3182        ld1             {v20.8h, v21.8h}, [\src], \s_strd
3183        mul             v4.8h,   v16.8h,  v2.8h
3184        mla             v4.8h,   v18.8h,  v3.8h
3185        mul             v5.8h,   v17.8h,  v2.8h
3186        mla             v5.8h,   v19.8h,  v3.8h
3187        mul             v6.8h,   v18.8h,  v2.8h
3188        mla             v6.8h,   v20.8h,  v3.8h
3189        mul             v7.8h,   v19.8h,  v2.8h
3190        mla             v7.8h,   v21.8h,  v3.8h
3191        subs            \h,  \h,  #2
3192.ifc \type, put
3193        urshr           v4.8h,   v4.8h,   #4
3194        urshr           v5.8h,   v5.8h,   #4
3195        urshr           v6.8h,   v6.8h,   #4
3196        urshr           v7.8h,   v7.8h,   #4
3197.else
3198        urshl           v4.8h,   v4.8h,   v31.8h
3199        urshl           v5.8h,   v5.8h,   v31.8h
3200        urshl           v6.8h,   v6.8h,   v31.8h
3201        urshl           v7.8h,   v7.8h,   v31.8h
3202        sub             v4.8h,   v4.8h,   v29.8h
3203        sub             v5.8h,   v5.8h,   v29.8h
3204        sub             v6.8h,   v6.8h,   v29.8h
3205        sub             v7.8h,   v7.8h,   v29.8h
3206.endif
3207        st1             {v4.8h, v5.8h}, [\dst], \d_strd
3208        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
3209        b.le            9f
3210        mov             v16.16b, v20.16b
3211        mov             v17.16b, v21.16b
3212        b               2b
32139:
3214        subs            \w,  \w,  #16
3215        b.le            0f
3216        asr             \s_strd, \s_strd, #1
3217        asr             \d_strd, \d_strd, #1
3218        msub            \src, \s_strd, \xmy, \src
3219        msub            \dst, \d_strd, \xmy, \dst
3220        sub             \src, \src, \s_strd, lsl #1
3221        mov             \h,  \my
3222        add             \src, \src, #32
3223        add             \dst, \dst, #32
3224        b               1b
32250:
3226        ret
3227endfunc
3228
3229jumptable \type\()_bilin_v_tbl
3230        .word 1280b - \type\()_bilin_v_tbl
3231        .word 640b  - \type\()_bilin_v_tbl
3232        .word 320b  - \type\()_bilin_v_tbl
3233        .word 160b  - \type\()_bilin_v_tbl
3234        .word 80b   - \type\()_bilin_v_tbl
3235        .word 40b   - \type\()_bilin_v_tbl
3236        .word 20b   - \type\()_bilin_v_tbl
3237endjumptable
3238
3239function L(\type\()_bilin_hv)
3240        movrel          x10, \type\()_bilin_hv_tbl
3241        dup             v31.8h,  w11      // 4 - intermediate_bits
3242        ldrsw           x9,  [x10, x9, lsl #2]
3243        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
3244.ifc \type, put
3245        dup             v30.4s,  w12      // 4 + intermediate_bits
3246.else
3247        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
3248.endif
3249        add             x10, x10, x9
3250.ifc \type, put
3251        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
3252.endif
3253        br              x10
3254
325520:     // 2xN hv
3256        AARCH64_VALID_JUMP_TARGET
3257.ifc \type, put
3258        add             \sr2, \src, \s_strd
3259        add             \ds2, \dst, \d_strd
3260        lsl             \s_strd, \s_strd, #1
3261        lsl             \d_strd, \d_strd, #1
3262
3263        ld1             {v20.4h},  [\src], \s_strd
3264        ext             v21.8b,  v20.8b,  v20.8b,  #2
3265        mul             v16.4h,  v20.4h,  v0.4h
3266        mla             v16.4h,  v21.4h,  v1.4h
3267        urshl           v16.4h,  v16.4h,  v31.4h
3268
32692:
3270        ld1             {v22.4h},  [\sr2], \s_strd
3271        ld1             {v24.4h},  [\src], \s_strd
3272        ext             v23.8b,  v22.8b,  v22.8b,  #2
3273        ext             v25.8b,  v24.8b,  v24.8b,  #2
3274        trn1            v22.2s,  v22.2s,  v24.2s
3275        trn1            v23.2s,  v23.2s,  v25.2s
3276        mul             v17.4h,  v22.4h,  v0.4h
3277        mla             v17.4h,  v23.4h,  v1.4h
3278        urshl           v17.4h,  v17.4h,  v31.4h
3279
3280        trn1            v16.2s,  v16.2s,  v17.2s
3281
3282        umull           v4.4s,   v16.4h,  v2.4h
3283        umlal           v4.4s,   v17.4h,  v3.4h
3284        urshl           v4.4s,   v4.4s,   v30.4s
3285        xtn             v4.4h,   v4.4s
3286        subs            \h,  \h,  #2
3287        st1             {v4.s}[0], [\dst], \d_strd
3288        st1             {v4.s}[1], [\ds2], \d_strd
3289        b.le            0f
3290        trn2            v16.2s,  v17.2s,  v17.2s
3291        b               2b
32920:
3293        ret
3294.endif
3295
329640:     // 4xN hv
3297        AARCH64_VALID_JUMP_TARGET
3298        add             \sr2, \src, \s_strd
3299        add             \ds2, \dst, \d_strd
3300        lsl             \s_strd, \s_strd, #1
3301        lsl             \d_strd, \d_strd, #1
3302
3303        ld1             {v20.8h},  [\src], \s_strd
3304        ext             v21.16b, v20.16b, v20.16b, #2
3305        mul             v16.4h,  v20.4h,  v0.4h
3306        mla             v16.4h,  v21.4h,  v1.4h
3307        urshl           v16.4h,  v16.4h,  v31.4h
3308
33094:
3310        ld1             {v22.8h},  [\sr2], \s_strd
3311        ld1             {v24.8h},  [\src], \s_strd
3312        ext             v23.16b, v22.16b, v22.16b, #2
3313        ext             v25.16b, v24.16b, v24.16b, #2
3314        trn1            v22.2d,  v22.2d,  v24.2d
3315        trn1            v23.2d,  v23.2d,  v25.2d
3316        mul             v17.8h,  v22.8h,  v0.8h
3317        mla             v17.8h,  v23.8h,  v1.8h
3318        urshl           v17.8h,  v17.8h,  v31.8h
3319
3320        trn1            v16.2d,  v16.2d,  v17.2d
3321
3322        umull           v4.4s,   v16.4h,  v2.4h
3323        umlal           v4.4s,   v17.4h,  v3.4h
3324        umull2          v5.4s,   v16.8h,  v2.8h
3325        umlal2          v5.4s,   v17.8h,  v3.8h
3326.ifc \type, put
3327        urshl           v4.4s,   v4.4s,   v30.4s
3328        urshl           v5.4s,   v5.4s,   v30.4s
3329        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
3330.else
3331        rshrn           v4.4h,   v4.4s,   #4
3332        rshrn2          v4.8h,   v5.4s,   #4
3333        sub             v4.8h,   v4.8h,   v29.8h
3334.endif
3335        subs            \h,  \h,  #2
3336        st1             {v4.8b},   [\dst], \d_strd
3337        st1             {v4.d}[1], [\ds2], \d_strd
3338        b.le            0f
3339        trn2            v16.2d,  v17.2d,  v17.2d
3340        b               4b
33410:
3342        ret
3343
334480:     // 8xN, 16xN, ... hv
3345160:
3346320:
3347640:
33481280:
3349        AARCH64_VALID_JUMP_TARGET
3350        mov             \my, \h
3351
33521:
3353        add             \sr2, \src, \s_strd
3354        add             \ds2, \dst, \d_strd
3355        lsl             \s_strd, \s_strd, #1
3356        lsl             \d_strd, \d_strd, #1
3357
3358        ldr             h21, [\src, #16]
3359        ld1             {v20.8h},  [\src], \s_strd
3360        ext             v21.16b, v20.16b, v21.16b, #2
3361        mul             v16.8h,  v20.8h,  v0.8h
3362        mla             v16.8h,  v21.8h,  v1.8h
3363        urshl           v16.8h,  v16.8h,  v31.8h
3364
33652:
3366        ldr             h23, [\sr2, #16]
3367        ld1             {v22.8h},  [\sr2], \s_strd
3368        ldr             h25, [\src, #16]
3369        ld1             {v24.8h},  [\src], \s_strd
3370        ext             v23.16b, v22.16b, v23.16b, #2
3371        ext             v25.16b, v24.16b, v25.16b, #2
3372        mul             v17.8h,  v22.8h,  v0.8h
3373        mla             v17.8h,  v23.8h,  v1.8h
3374        mul             v18.8h,  v24.8h,  v0.8h
3375        mla             v18.8h,  v25.8h,  v1.8h
3376        urshl           v17.8h,  v17.8h,  v31.8h
3377        urshl           v18.8h,  v18.8h,  v31.8h
3378
3379        umull           v4.4s,   v16.4h,  v2.4h
3380        umlal           v4.4s,   v17.4h,  v3.4h
3381        umull2          v5.4s,   v16.8h,  v2.8h
3382        umlal2          v5.4s,   v17.8h,  v3.8h
3383        umull           v6.4s,   v17.4h,  v2.4h
3384        umlal           v6.4s,   v18.4h,  v3.4h
3385        umull2          v7.4s,   v17.8h,  v2.8h
3386        umlal2          v7.4s,   v18.8h,  v3.8h
3387.ifc \type, put
3388        urshl           v4.4s,   v4.4s,   v30.4s
3389        urshl           v5.4s,   v5.4s,   v30.4s
3390        urshl           v6.4s,   v6.4s,   v30.4s
3391        urshl           v7.4s,   v7.4s,   v30.4s
3392        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
3393        uzp1            v5.8h,   v6.8h,   v7.8h  // Ditto
3394.else
3395        rshrn           v4.4h,   v4.4s,   #4
3396        rshrn2          v4.8h,   v5.4s,   #4
3397        rshrn           v5.4h,   v6.4s,   #4
3398        rshrn2          v5.8h,   v7.4s,   #4
3399        sub             v4.8h,   v4.8h,   v29.8h
3400        sub             v5.8h,   v5.8h,   v29.8h
3401.endif
3402        subs            \h,  \h,  #2
3403        st1             {v4.8h}, [\dst], \d_strd
3404        st1             {v5.8h}, [\ds2], \d_strd
3405        b.le            9f
3406        mov             v16.16b, v18.16b
3407        b               2b
34089:
3409        subs            \w,  \w,  #8
3410        b.le            0f
3411        asr             \s_strd,  \s_strd,  #1
3412        asr             \d_strd,  \d_strd,  #1
3413        msub            \src,  \s_strd,  \xmy,  \src
3414        msub            \dst,  \d_strd,  \xmy,  \dst
3415        sub             \src,  \src,  \s_strd,  lsl #1
3416        mov             \h,  \my
3417        add             \src,  \src,  #16
3418        add             \dst,  \dst,  #16
3419        b               1b
34200:
3421        ret
3422endfunc
3423
3424jumptable \type\()_bilin_hv_tbl
3425        .word 1280b - \type\()_bilin_hv_tbl
3426        .word 640b  - \type\()_bilin_hv_tbl
3427        .word 320b  - \type\()_bilin_hv_tbl
3428        .word 160b  - \type\()_bilin_hv_tbl
3429        .word 80b   - \type\()_bilin_hv_tbl
3430        .word 40b   - \type\()_bilin_hv_tbl
3431        .word 20b   - \type\()_bilin_hv_tbl
3432endjumptable
3433.endm
3434
3435make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
3436make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
3437make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
3438make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
3439make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
3440filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
3441
3442make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
3443make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
3444make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
3445make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
3446filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
3447filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
3448
3449make_8tap_fn    prep,  regular_sharp,  REGULAR, SHARP,   8tap
3450make_8tap_fn    prep,  smooth_sharp,   SMOOTH,  SHARP,   8tap
3451make_8tap_fn    prep,  sharp,          SHARP,   SHARP,   8tap
3452make_8tap_fn    prep,  sharp_regular,  SHARP,   REGULAR, 8tap
3453make_8tap_fn    prep,  sharp_smooth,   SHARP,   SMOOTH,  8tap
3454filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
3455
3456make_8tap_fn    prep,  regular,        REGULAR, REGULAR, 6tap
3457make_8tap_fn    prep,  regular_smooth, REGULAR, SMOOTH,  6tap
3458make_8tap_fn    prep,  smooth,         SMOOTH,  SMOOTH,  6tap
3459make_8tap_fn    prep,  smooth_regular, SMOOTH,  REGULAR, 6tap
3460filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
3461filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
3462
3463
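// Fetch the 8-tap warp filter selected by the top bits of the position
// accumulator \src (filter index = \src >> 10, 8 bytes per entry in the
// mc_warp_filter table at x11), then advance \src by \inc.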
3464.macro load_filter_row dst, src, inc
3465        asr             w13, \src, #10
3466        add             \src, \src, \inc
3467        ldr             \dst, [x11, w13, sxtw #3]
3468.endm
3469
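// Compute one row of eight horizontal 8-tap warp filter outputs. Each
// output pixel uses its own filter (the mx accumulator advances by
// abcd[0], in w7, per pixel); the eight dot products are reduced
// pairwise with addp and returned in v16/v17 (.4s), shifted right by
// 7 - intermediate_bits.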
3470function warp_filter_horz_neon
3471        add             w12, w5,  #512
3472
3473        ld1             {v16.8h, v17.8h}, [x2], x3
3474
3475        load_filter_row d0, w12, w7
3476        load_filter_row d1, w12, w7
3477        load_filter_row d2, w12, w7
3478        sxtl            v0.8h,   v0.8b
3479        load_filter_row d3, w12, w7
3480        sxtl            v1.8h,   v1.8b
3481        load_filter_row d4, w12, w7
3482        sxtl            v2.8h,   v2.8b
3483        load_filter_row d5, w12, w7
3484        sxtl            v3.8h,   v3.8b
3485        load_filter_row d6, w12, w7
3486        sxtl            v4.8h,   v4.8b
3487        load_filter_row d7, w12, w7
3488        sxtl            v5.8h,   v5.8b
3489        ext             v18.16b, v16.16b, v17.16b, #2*1
3490        smull           v8.4s,   v16.4h,  v0.4h
3491        smull2          v9.4s,   v16.8h,  v0.8h
3492        sxtl            v6.8h,   v6.8b
3493        ext             v19.16b, v16.16b, v17.16b, #2*2
3494        smull           v10.4s,  v18.4h,  v1.4h
3495        smull2          v11.4s,  v18.8h,  v1.8h
3496        sxtl            v7.8h,   v7.8b
3497        ext             v20.16b, v16.16b, v17.16b, #2*3
3498        smull           v0.4s,   v19.4h,  v2.4h
3499        smull2          v1.4s,   v19.8h,  v2.8h
3500        ext             v21.16b, v16.16b, v17.16b, #2*4
3501        addp            v8.4s,   v8.4s,   v9.4s
3502        smull           v2.4s,   v20.4h,  v3.4h
3503        smull2          v3.4s,   v20.8h,  v3.8h
3504        ext             v22.16b, v16.16b, v17.16b, #2*5
3505        addp            v9.4s,   v10.4s,  v11.4s
3506        smull           v10.4s,  v21.4h,  v4.4h
3507        smull2          v11.4s,  v21.8h,  v4.8h
3508        ext             v23.16b, v16.16b, v17.16b, #2*6
3509        addp            v0.4s,   v0.4s,   v1.4s
3510        smull           v18.4s,  v22.4h,  v5.4h
3511        smull2          v19.4s,  v22.8h,  v5.8h
3512        ext             v16.16b, v16.16b, v17.16b, #2*7
3513        addp            v1.4s,   v2.4s,   v3.4s
3514        addp            v2.4s,   v10.4s,  v11.4s
3515        smull           v20.4s,  v23.4h,  v6.4h
3516        smull2          v21.4s,  v23.8h,  v6.8h
3517        addp            v3.4s,   v18.4s,  v19.4s
3518        smull           v22.4s,  v16.4h,  v7.4h
3519        smull2          v23.4s,  v16.8h,  v7.8h
3520        addp            v4.4s,   v20.4s,  v21.4s
3521        addp            v5.4s,   v22.4s,  v23.4s
3522
3523        addp            v8.4s,   v8.4s,   v9.4s
3524        addp            v0.4s,   v0.4s,   v1.4s
3525        addp            v2.4s,   v2.4s,   v3.4s
3526        addp            v4.4s,   v4.4s,   v5.4s
3527
3528        addp            v16.4s,  v8.4s,   v0.4s
3529        addp            v17.4s,  v2.4s,   v4.4s
3530
3531        add             w5,  w5,  w8
3532
3533        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
3534        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
3535
3536        ret
3537endfunc
3538
3539// void dav1d_warp_affine_8x8_16bpc_neon(
3540//         pixel *dst, const ptrdiff_t dst_stride,
3541//         const pixel *src, const ptrdiff_t src_stride,
3542//         const int16_t *const abcd, int mx, int my,
3543//         const int bitdepth_max)
3544.macro warp t
3545function warp_affine_8x8\t\()_16bpc_neon, export=1
3546        stp             d8,  d9,  [sp, #-0x40]!
3547        stp             d10, d11, [sp, #0x10]
3548        stp             d12, d13, [sp, #0x20]
3549        stp             d14, d15, [sp, #0x30]
3550
3551.ifb \t
3552        dup             v15.8h,  w7        // bitdepth_max
3553.else
3554        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
3555.endif
3556        clz             w7,  w7
3557                                           // intermediate_bits = clz(bitdepth_max) - 18
3558.ifb \t
3559        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
3560.endif
3561        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
3562.ifb \t
3563        neg             w8,  w8            // -(7 + intermediate_bits)
3564.endif
3565        dup             v14.4s,  w7        // -(7 - intermediate_bits)
3566.ifb \t
3567        dup             v13.4s,  w8        // -(7 + intermediate_bits)
3568.endif
3569
3570        ldr             x4,  [x4]
3571        sbfx            x7,  x4, #0,  #16
3572        sbfx            x8,  x4, #16, #16
3573        sbfx            x9,  x4, #32, #16
3574        sbfx            x4,  x4, #48, #16
3575        mov             w10, #8
3576        sub             x2,  x2,  x3, lsl #1
3577        sub             x2,  x2,  x3
3578        sub             x2,  x2,  #6
3579        movrel          x11, X(mc_warp_filter), 64*8
3580        mov             x15, x30
3581.ifnb \t
3582        lsl             x1,  x1,  #1
3583.endif
3584
3585        bl              warp_filter_horz_neon
3586        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
3587        bl              warp_filter_horz_neon
3588        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
3589        bl              warp_filter_horz_neon
3590        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
3591        bl              warp_filter_horz_neon
3592        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
3593        bl              warp_filter_horz_neon
3594        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
3595        bl              warp_filter_horz_neon
3596        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
3597        bl              warp_filter_horz_neon
3598        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto
3599
36001:
3601        add             w14, w6,  #512
3602        bl              warp_filter_horz_neon
3603        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
3604
3605        load_filter_row d0, w14, w9
3606        load_filter_row d1, w14, w9
3607        load_filter_row d2, w14, w9
3608        load_filter_row d3, w14, w9
3609        load_filter_row d4, w14, w9
3610        load_filter_row d5, w14, w9
3611        load_filter_row d6, w14, w9
3612        load_filter_row d7, w14, w9
3613        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
3614
3615        // This ordering of smull/smlal/smull2/smlal2 is highly
3616        // beneficial for the Cortex-A53 here.
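        // (Presumably because the A53's multiply-accumulate pipeline can
        // forward the accumulator between back-to-back MAC instructions,
        // so keeping each accumulator's chain contiguous avoids stalls;
        // an explanatory note, not from the original source.)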
3617        smull           v16.4s,  v24.4h,  v0.4h
3618        smlal           v16.4s,  v25.4h,  v1.4h
3619        smlal           v16.4s,  v26.4h,  v2.4h
3620        smlal           v16.4s,  v27.4h,  v3.4h
3621        smlal           v16.4s,  v28.4h,  v4.4h
3622        smlal           v16.4s,  v29.4h,  v5.4h
3623        smlal           v16.4s,  v30.4h,  v6.4h
3624        smlal           v16.4s,  v31.4h,  v7.4h
3625        smull2          v17.4s,  v24.8h,  v0.8h
3626        smlal2          v17.4s,  v25.8h,  v1.8h
3627        smlal2          v17.4s,  v26.8h,  v2.8h
3628        smlal2          v17.4s,  v27.8h,  v3.8h
3629        smlal2          v17.4s,  v28.8h,  v4.8h
3630        smlal2          v17.4s,  v29.8h,  v5.8h
3631        smlal2          v17.4s,  v30.8h,  v6.8h
3632        smlal2          v17.4s,  v31.8h,  v7.8h
3633
3634        mov             v24.16b, v25.16b
3635        mov             v25.16b, v26.16b
3636.ifb \t
3637        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
3638        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
3639.else
3640        rshrn           v16.4h,  v16.4s,  #7
3641        rshrn2          v16.8h,  v17.4s,  #7
3642.endif
3643        mov             v26.16b, v27.16b
3644.ifb \t
3645        sqxtun          v16.4h,  v16.4s
3646        sqxtun2         v16.8h,  v17.4s
3647.else
3648        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
3649.endif
3650        mov             v27.16b, v28.16b
3651        mov             v28.16b, v29.16b
3652.ifb \t
3653        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
3654.endif
3655        mov             v29.16b, v30.16b
3656        mov             v30.16b, v31.16b
3657        subs            w10, w10, #1
3658        st1             {v16.8h}, [x0], x1
3659
3660        add             w6,  w6,  w4
3661        b.gt            1b
3662
3663        ldp             d14, d15, [sp, #0x30]
3664        ldp             d12, d13, [sp, #0x20]
3665        ldp             d10, d11, [sp, #0x10]
3666        ldp             d8,  d9,  [sp], 0x40
3667
3668        ret             x15
3669endfunc
3670.endm
3671
3672warp
3673warp t
3674
3675// void dav1d_emu_edge_16bpc_neon(
3676//         const intptr_t bw, const intptr_t bh,
3677//         const intptr_t iw, const intptr_t ih,
3678//         const intptr_t x, const intptr_t y,
3679//         pixel *dst, const ptrdiff_t dst_stride,
3680//         const pixel *ref, const ptrdiff_t ref_stride)
3681function emu_edge_16bpc_neon, export=1
3682        ldp             x8,  x9,  [sp]
3683
3684        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
3685        // ref += iclip(x, 0, iw - 1)
3686        sub             x12, x3,  #1           // ih - 1
3687        cmp             x5,  x3
3688        sub             x13, x2,  #1           // iw - 1
3689        csel            x12, x12, x5,  ge      // min(y, ih - 1)
3690        cmp             x4,  x2
3691        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
3692        csel            x13, x13, x4,  ge      // min(x, iw - 1)
3693        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
3694        madd            x8,  x12, x9,  x8      // ref += iclip(y, 0, ih - 1) * stride
3695        add             x8,  x8,  x13, lsl #1  // ref += iclip(x, 0, iw - 1) (2 bytes/px)
3696
3697        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
3698        // top_ext = iclip(-y, 0, bh - 1)
3699        add             x10, x5,  x1           // y + bh
3700        neg             x5,  x5                // -y
3701        sub             x10, x10, x3           // y + bh - ih
3702        sub             x12, x1,  #1           // bh - 1
3703        cmp             x10, x1
3704        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
3705        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
3706        cmp             x5,  x1
3707        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
3708        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
3709
3710        // right_ext = iclip(x + bw - iw, 0, bw - 1)
3711        // left_ext = iclip(-x, 0, bw - 1)
3712        add             x11, x4,  x0           // x + bw
3713        neg             x4,  x4                // -x
3714        sub             x11, x11, x2           // x + bw - iw
3715        sub             x13, x0,  #1           // bw - 1
3716        cmp             x11, x0
3717        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
3718        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
3719        cmp             x4,  x0
3720        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
3721        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
3722
3723        // center_h = bh - top_ext - bottom_ext
3724        // dst += top_ext * PXSTRIDE(dst_stride)
3725        // center_w = bw - left_ext - right_ext
3726        sub             x1,  x1,  x5           // bh - top_ext
3727        madd            x6,  x5,  x7,  x6
3728        sub             x2,  x0,  x4           // bw - left_ext
3729        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
3730        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
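        // The block is then produced in three steps: the center_h rows
        // (with left/right edge replication, via v_loop below), then
        // bottom_ext copies of the last written row, then top_ext copies
        // of the first written row.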
3731
3732        mov             x14, x6                // backup of dst
3733
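// Emit the center_h rows: optionally replicate the leftmost pixel across
// the left_ext columns, copy center_w pixels from ref, and optionally
// replicate the rightmost pixel across the right_ext columns.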
3734.macro v_loop need_left, need_right
37350:
3736.if \need_left
3737        ld1r            {v0.8h}, [x8]
3738        mov             x12, x6                // out = dst
3739        mov             x3,  x4
3740        mov             v1.16b,  v0.16b
37411:
3742        subs            x3,  x3,  #16
3743        st1             {v0.8h, v1.8h}, [x12], #32
3744        b.gt            1b
3745.endif
3746        mov             x13, x8
3747        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
3748        mov             x3,  x2
37491:
3750        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
3751        subs            x3,  x3,  #32
3752        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
3753        b.gt            1b
3754.if \need_right
3755        add             x3,  x8,  x2, lsl #1   // in + center_w
3756        sub             x3,  x3,  #2           // in + center_w - 1
3757        add             x12, x6,  x4, lsl #1   // dst + left_ext
3758        ld1r            {v0.8h}, [x3]
3759        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
3760        mov             x3,  x11
3761        mov             v1.16b,  v0.16b
37621:
3763        subs            x3,  x3,  #16
3764        st1             {v0.8h, v1.8h}, [x12], #32
3765        b.gt            1b
3766.endif
3767
3768        subs            x1,  x1,  #1           // center_h--
3769        add             x6,  x6,  x7
3770        add             x8,  x8,  x9
3771        b.gt            0b
3772.endm
3773
3774        cbz             x4,  2f
3775        // need_left
3776        cbz             x11, 3f
3777        // need_left + need_right
3778        v_loop          1,   1
3779        b               5f
3780
37812:
3782        // !need_left
3783        cbz             x11, 4f
3784        // !need_left + need_right
3785        v_loop          0,   1
3786        b               5f
3787
37883:
3789        // need_left + !need_right
3790        v_loop          1,   0
3791        b               5f
3792
37934:
3794        // !need_left + !need_right
3795        v_loop          0,   0
3796
37975:
3798
3799        cbz             x10, 3f
3800        // need_bottom
3801        sub             x8,  x6,  x7           // ref = dst - stride
3802        mov             x4,  x0
38031:
3804        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
3805        mov             x3,  x10
38062:
3807        subs            x3,  x3,  #1
3808        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3809        b.gt            2b
3810        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
3811        subs            x4,  x4,  #32          // bw -= 32
3812        add             x6,  x6,  #64          // dst += 32
3813        b.gt            1b
3814
38153:
3816        cbz             x5,  3f
3817        // need_top
3818        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
38191:
3820        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
3821        mov             x3,  x5
38222:
3823        subs            x3,  x3,  #1
3824        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3825        b.gt            2b
3826        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
3827        subs            x0,  x0,  #32          // bw -= 32
3828        add             x6,  x6,  #64          // dst += 32
3829        b.gt            1b
3830
38313:
3832        ret
3833endfunc
3834