/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Janne Grunau
 * Copyright © 2024, Martin Storsjo
 * Copyright © 2024, Arm Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"


#if HAVE_DOTPROD
ENABLE_DOTPROD

// No spaces in these expressions, due to gas-preprocessor. The filter type
// offsets are biased by -1 so that no extra negative offset is needed when
// computing the address into `mc_subpel_filters`.
#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
#define SHARP1          (((2*15-1)<<7)|(3*15-1))
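
// Roughly how these constants are consumed: on entry, `madd` multiplies the
// subpel position (1..14) by 0x4081 = (1<<14)|(1<<7)|1, replicating it into
// 7-bit fields, and adds one of the packed offsets above. Bits [0:6] then
// hold the row of the 4-tap filter variant used for small block sizes and
// bits [7:13] the 8-tap row; ubfx/and extract both fields and a csel on the
// block size picks the final row index into mc_subpel_filters.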

#define FUNC_ALIGN      2
#define JUMP_ALIGN      2
#define LOOP_ALIGN      2


const h_tbl_neon_dotprod, align=4
        // Shuffle indices to permute horizontal samples in preparation for
        // input to SDOT instructions. The 8-tap horizontal convolution uses
        // sample indices in the interval [-3, 4] relative to the current
        // sample position.
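        // Each SDOT lane forms a 4-way dot product, so an 8-tap output is
        // assembled from two SDOT accumulations, one per filter half. The
        // three rows below generate the 4-sample windows starting at
        // offsets 0..11, enough for 8 consecutive outputs (taps 0-3 use
        // windows 0..7, taps 4-7 use windows 4..11).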
        .byte  0,  1,  2,  3,   1,  2,  3,  4,   2,  3,  4,  5,   3,  4,  5,  6
        .byte  4,  5,  6,  7,   5,  6,  7,  8,   6,  7,  8,  9,   7,  8,  9, 10
        .byte  8,  9, 10, 11,   9, 10, 11, 12,  10, 11, 12, 13,  11, 12, 13, 14

        // Shuffle indices to permute horizontal samples in preparation for
        // input to USMMLA instructions.
#define OFFSET_USMMLA 48
        .byte  0,  1,  2,  3,   4,  5,  6,  7,   2,  3,  4,  5,   6,  7,  8,  9
        .byte  4,  5,  6,  7,   8,  9, 10, 11,   6,  7,  8,  9,  10, 11, 12, 13
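        // USMMLA multiplies a 2x8 unsigned sample matrix by the transpose
        // of a 2x8 signed coefficient matrix, yielding four dot products
        // per instruction. With the padded 6-tap filter stored twice in the
        // coefficient operand (second copy rotated by one byte), the rows
        // above gather the sample windows [0..7]/[2..9] and [4..11]/[6..13],
        // so two USMMLAs cover eight adjacent outputs.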

        // Lookup table used to help convert shifted 32-bit values to 8-bit.
#define OFFSET_CVT_32_8 80
        .byte  1,  2,  5,  6,   9, 10, 13, 14,  17, 18, 21, 22,  25, 26, 29, 30
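        // Selecting bytes 1-2 of every 32-bit lane amounts to a narrowing
        // right shift by 8 down to 16 bits; the consumers' final rounding
        // shift then only needs to cover the remaining bits.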
endconst

const v_tbl_neon_dotprod, align=4
        // Vertical convolutions also use SDOT instructions, where a
        // 128-bit register contains a transposed 4x4 matrix of values.
        // Subsequent iterations of the vertical convolution can reuse the
        // 3x4 sub-matrix from the previous loop iteration. These shuffle
        // indices shift and merge this 4x4 matrix with the values of a new
        // line.
        .byte  1,  2,  3, 16,   5,  6,  7, 20,   9, 10, 11, 24,  13, 14, 15, 28
        .byte  1,  2,  3, 16,   5,  6,  7, 17,   9, 10, 11, 18,  13, 14, 15, 19
        .byte  1,  2,  3, 20,   5,  6,  7, 21,   9, 10, 11, 22,  13, 14, 15, 23
        .byte  1,  2,  3, 24,   5,  6,  7, 25,   9, 10, 11, 26,  13, 14, 15, 27
        .byte  1,  2,  3, 28,   5,  6,  7, 29,   9, 10, 11, 30,  13, 14, 15, 31
endconst
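
// A sketch of the vertical scheme: lane n of a source register holds four
// vertically adjacent samples of one column, so one SDOT against four
// filter taps advances four columns at once. After each output row, the
// first shuffle row drops the oldest sample from every lane and the
// remaining rows splice in bytes of the newly loaded line (indices 16+
// select from the second TBL source register).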


.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
        mov             x9,  \type_h
        mov             x10, \type_v
    .if \jump
        b               \op\()_8tap_\isa
    .endif
endfunc
.endm

.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0

function \type\()_8tap_\isa, align=FUNC_ALIGN
        clz             w8, \w
        mov             w11,  #0x4081   // (1 << 14) | (1 << 7) | (1 << 0)
        sub             w8, w8, #24     // for jump tables
        movrel          x12, X(mc_subpel_filters)
        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
        cbnz            \my, L(\type\()_8tap_v_\isa)
.ifc \type, prep
        add             \wd_strd, \w, \w    // prep_neon needs w * 2 as stride
.endif
        b               X(\type\()_neon)

        .align JUMP_ALIGN
L(\type\()_8tap_v_\isa):
        madd            \my, \my, w11, w10
        movrel          x13, v_tbl_neon_dotprod
        sub             \src, \src, \s_strd
.ifc \isa, neon_dotprod
    .ifc \type, prep
        mov             w8, #0x2002         // FILTER_WEIGHT * 128 + rounding
        dup             v4.4s, w8
    .else
        movi            v4.4s, #32, lsl #8  // FILTER_WEIGHT * 128, bias for SDOT
    .endif
.endif
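        // Rounding note: the neon_i8mm paths start their accumulators at
        // zero and use rounding shifts (srshr/rshrn) later, while the
        // neon_dotprod paths bias the accumulators with FILTER_WEIGHT * 128
        // (compensating for the samples being shifted into signed range
        // below) plus, for prep, the rounding term, and can then use plain
        // sshr/shrn.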
        ubfx            w11, \my, #7, #7
        and             \my, \my, #0x7F
        ldp             q6, q28, [x13]
        cmp             \h, #4
        csel            \my, \my, w11, le
        sub             \src, \src, \s_strd, lsl #1     // src - s_strd * 3
        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
        ldr             q29, [x13, #32]
.ifc \isa, neon_dotprod
        movi            v5.16b, #128
.endif
        ldr             d7, [\xmy]
        cmp             \w, #8
        b.eq            80f
        b.lt            40f

        // .align JUMP_ALIGN    // fallthrough
160:    // V - 16xN+
        ldp             q30, q31, [x13, #48]
.ifc \type, prep
        add             \wd_strd, \w, \w
.endif
        .align LOOP_ALIGN
161:
        mov             \lsrc, \src
        mov             \ldst, \dst
        sub             w8, \h, #1

        ldr             q16, [\lsrc]
        ldr             q17, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1
        ldr             q18, [\lsrc]
        ldr             q19, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1

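        // Interleave eight rows so that every 32-bit lane ends up with four
        // vertically adjacent samples of one column, i.e. the transposed
        // 4x4 layout that the SDOT accumulation expects.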
        zip1            v0.16b, v16.16b, v17.16b
        zip2            v1.16b, v16.16b, v17.16b
        zip1            v2.16b, v18.16b, v19.16b
        zip2            v3.16b, v18.16b, v19.16b

        ldr             q20, [\lsrc]
        ldr             q21, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1
        ldr             q22, [\lsrc]
        ldr             q23, [\lsrc, \s_strd]
        add             \lsrc, \lsrc, \s_strd, lsl #1

        zip1            v18.16b, v20.16b, v21.16b
        zip2            v21.16b, v20.16b, v21.16b
        zip1            v24.16b, v22.16b, v23.16b
        zip2            v27.16b, v22.16b, v23.16b

        zip1            v16.8h, v0.8h, v2.8h
        zip2            v19.8h, v0.8h, v2.8h
        zip1            v22.8h, v1.8h, v3.8h
        zip2            v25.8h, v1.8h, v3.8h

        zip1            v17.8h, v18.8h, v24.8h
        zip2            v20.8h, v18.8h, v24.8h
        zip1            v23.8h, v21.8h, v27.8h
        zip2            v26.8h, v21.8h, v27.8h
.ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v5.16b
        sub             v19.16b, v19.16b, v5.16b
        sub             v22.16b, v22.16b, v5.16b
        sub             v25.16b, v25.16b, v5.16b

        sub             v17.16b, v17.16b, v5.16b
        sub             v20.16b, v20.16b, v5.16b
        sub             v23.16b, v23.16b, v5.16b
        sub             v26.16b, v26.16b, v5.16b
.endif
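        // Main 16xN vertical loop: the indexed dot products against
        // v7.4b[0]/v7.4b[1] apply taps 0-3 and 4-7, while the TBL pairs
        // shift every transposed 4x4 block up one row and merge in the
        // line just loaded.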
        .align LOOP_ALIGN
16:
.ifc \isa, neon_i8mm
        ld1             {v18.16b}, [\lsrc], \s_strd
        movi            v0.4s, #0
        movi            v1.4s, #0
        movi            v2.4s, #0
        movi            v3.4s, #0
        mov             v21.16b, v18.16b
        mov             v24.16b, v18.16b
        mov             v27.16b, v18.16b
.else   // neon_dotprod
        ld1             {v27.16b}, [\lsrc], \s_strd
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        sub             v18.16b, v27.16b, v5.16b
        sub             v21.16b, v27.16b, v5.16b
        sub             v24.16b, v27.16b, v5.16b
        sub             v27.16b, v27.16b, v5.16b
.endif
        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v3.4s, v25.16b, v7.4b[0]

        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b

        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v20.16b, v7.4b[1]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v26.16b, v7.4b[1]

        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b

        subs            w8, w8, #1
        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        st1             {v0.8h, v1.8h}, [\ldst], \d_strd
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun2       v0.16b, v2.8h, #6
        st1             {v0.16b}, [\ldst], \d_strd
.endif
        b.gt            16b

.ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
        movi            v2.4s, #0
        movi            v3.4s, #0
.else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
.endif
        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v3.4s, v25.16b, v7.4b[0]

        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v20.16b, v7.4b[1]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v26.16b, v7.4b[1]

        subs            \w, \w, #16
        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        stp             q0, q1, [\ldst]
        add             \dst, \dst, #32
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun2       v0.16b, v2.8h, #6
        str             q0, [\ldst]
        add             \dst, \dst, #16
.endif
        add             \src, \src, #16
        b.gt            161b
        ret

        .align JUMP_ALIGN
80:     // V - 8xN
        ldr             d16, [\src]
        ldr             d17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d18, [\src]
        ldr             d19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ldr             d20, [\src]
        ldr             d21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d22, [\src]
        ldr             d23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        subs            \h, \h, #2  // for prep: sub is enough

        zip1            v0.16b, v16.16b, v17.16b
        zip1            v2.16b, v18.16b, v19.16b
        zip1            v18.16b, v20.16b, v21.16b
        zip1            v24.16b, v22.16b, v23.16b

        zip1            v16.8h,  v0.8h,  v2.8h
        zip2            v19.8h,  v0.8h,  v2.8h
        zip1            v17.8h, v18.8h, v24.8h
        zip2            v20.8h, v18.8h, v24.8h
.ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v5.16b
        sub             v19.16b, v19.16b, v5.16b
        sub             v17.16b, v17.16b, v5.16b
        sub             v20.16b, v20.16b, v5.16b
.endif
.ifc \type, put
        b.eq            82f
.endif
        .align LOOP_ALIGN
8:
.ifc \isa, neon_i8mm
        ldr             d18, [\src]
        movi            v0.4s, #0
        movi            v1.4s, #0
        ldr             d24, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        movi            v2.4s, #0
        movi            v3.4s, #0
        mov             v21.8b, v18.8b
        mov             v27.8b, v24.8b
.else   // neon_dotprod
        ldr             d21, [\src]
        ldr             d27, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        sub             v18.16b, v21.16b, v5.16b
        sub             v21.16b, v21.16b, v5.16b
        sub             v24.16b, v27.16b, v5.16b
        sub             v27.16b, v27.16b, v5.16b
.endif
        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b

        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v25.16b, v7.4b[0]
        \dot            v3.4s, v26.16b, v7.4b[1]

        subs            \h, \h, #2
        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        stp             q0, q1, [\dst], #32
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun        v1.8b, v2.8h, #6
        str             d0, [\dst]
        str             d1, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            8b

.ifc \type, put
        .align JUMP_ALIGN
82:
.endif
.ifc \isa, neon_i8mm
        ldr             d18, [\src]
        movi            v0.4s, #0
        movi            v1.4s, #0
        movi            v2.4s, #0
        movi            v3.4s, #0
        mov             v21.8b, v18.8b
.else   // neon_dotprod
        ldr             d21, [\src]
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        sub             v18.16b, v21.16b, v5.16b
        sub             v21.16b, v21.16b, v5.16b
.endif
        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]
        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        \dot            v2.4s, v22.16b, v7.4b[0]
        \dot            v2.4s, v23.16b, v7.4b[1]
        \dot            v3.4s, v25.16b, v7.4b[0]
        \dot            v3.4s, v26.16b, v7.4b[1]

        uzp1            v0.8h, v0.8h, v1.8h
        uzp1            v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v0.8h, v0.8h, #2
        srshr           v1.8h, v2.8h, #2
    .else
        sshr            v0.8h, v0.8h, #2
        sshr            v1.8h, v2.8h, #2
    .endif
        stp             q0, q1, [\dst]
.else   // put
        sqrshrun        v0.8b, v0.8h, #6
        sqrshrun        v1.8b, v2.8h, #6
        str             d0, [\dst]
        str             d1, [\dst, \d_strd]
.endif
        ret

        .align JUMP_ALIGN
40:     // V - 4xN or 2xN (put only)
.ifc \type, put
        cmp             \w, #2
        b.eq            20f
.endif
        ldr             s16, [\src]
        ldr             s17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             s18, [\src]
        ldr             s19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ldr             s20, [\src]
        ldr             s21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             s22, [\src]
        ldr             s23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        subs            \h, \h, #2  // for prep: sub is enough

        zip1            v0.8b, v16.8b, v17.8b
        zip1            v2.8b, v18.8b, v19.8b
        zip1            v18.8b, v20.8b, v21.8b
        zip1            v24.8b, v22.8b, v23.8b

        zip1            v16.8h, v0.8h, v2.8h
        zip1            v17.8h, v18.8h, v24.8h
.ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v5.16b
        sub             v17.16b, v17.16b, v5.16b
.endif
.ifc \type, put
        b.eq            42f
.endif
        .align LOOP_ALIGN
4:
        ldr             s18, [\src]
        ldr             s21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
.else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.16b, v18.16b, v5.16b
        sub             v21.16b, v21.16b, v5.16b
.endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
        subs            \h, \h, #2
    .ifc \isa, neon_i8mm
        rshrn           v0.4h, v0.4s, #2
        rshrn2          v0.8h, v1.4s, #2
    .else
        shrn            v0.4h, v0.4s, #2
        shrn2           v0.8h, v1.4s, #2
    .endif
        str             q0, [\dst], #16
.else
        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6
        subs            \h, \h, #2
        fmov            x8, d0
        lsr             x9, x8, #32
        str             w8, [\dst]
        str             w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b

.ifc \type, put
        .align JUMP_ALIGN
42:
.endif
        ldr             s18, [\src]
.ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
.else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.16b, v18.16b, v5.16b
.endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
    .ifc \isa, neon_i8mm
        rshrn           v0.4h, v0.4s, #2
        rshrn2          v0.8h, v1.4s, #2
    .else
        shrn            v0.4h, v0.4s, #2
        shrn2           v0.8h, v1.4s, #2
    .endif
        str             q0, [\dst]
.else
        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6
        fmov            x8, d0
        lsr             x9, x8, #32
        str             w8, [\dst]
        str             w9, [\dst, \d_strd]
.endif
        ret

.ifc \type, put
        .align JUMP_ALIGN
20:     // V - 2xN
        ldr             h16, [\src]
        ldr             h17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             h18, [\src]
        ldr             h19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        ldr             h20, [\src]
        ldr             h21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             h22, [\src]
        ldr             h23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        subs            \h, \h, #2

        zip1            v0.8b, v16.8b, v17.8b
        zip1            v2.8b, v18.8b, v19.8b
        zip1            v18.8b, v20.8b, v21.8b
        zip1            v24.8b, v22.8b, v23.8b

        zip1            v16.4h, v0.4h, v2.4h
        zip1            v17.4h, v18.4h, v24.4h
    .ifc \isa, neon_dotprod
        sub             v16.8b, v16.8b, v5.8b
        sub             v17.8b, v17.8b, v5.8b
    .endif
        b.eq            22f

        .align LOOP_ALIGN
2:
        ldr             h18, [\src]
        ldr             h21, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
    .else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.8b, v18.8b, v5.8b
        sub             v21.8b, v21.8b, v5.8b
    .endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6

        subs            \h, \h, #2
        fmov            x8, d0
        lsr             x9, x8, #32
        strh            w8, [\dst]
        strh            w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
        b.gt            2b

        .align JUMP_ALIGN
22:
        ldr             h18, [\src]
    .ifc \isa, neon_i8mm
        movi            v0.4s, #0
        movi            v1.4s, #0
    .else   // neon_dotprod
        mov             v0.16b, v4.16b
        mov             v1.16b, v4.16b
        sub             v18.8b, v18.8b, v5.8b
    .endif
        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot            v0.4s, v16.16b, v7.4b[0]
        \dot            v0.4s, v17.16b, v7.4b[1]

        \dot            v1.4s, v19.16b, v7.4b[0]
        \dot            v1.4s, v20.16b, v7.4b[1]

        uzp1            v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #6

        fmov            x8, d0
        lsr             x9, x8, #32
        strh            w8, [\dst]
        strh            w9, [\dst, \d_strd]
        ret
.endif

        .align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
        madd            \mx, \mx, w11, w9
        madd            w14, \my, w11, w10      // for HV
.ifc \isa, neon_dotprod
        mov             w13, #0x2002            // FILTER_WEIGHT * 128 + rounding
        dup             v27.4s, w13             // put H overrides this
.endif
        movrel          x13, h_tbl_neon_dotprod
        sub             \src, \src, #3          // src - 3
        ldr             q28, [x13]              // for 4-tap & 8-tap H filters
        ubfx            w15, \mx, #7, #7
        and             \mx, \mx, #0x7F
        ubfx            w11, w14, #7, #7        // for HV
        and             w14, w14, #0x7F         // for HV
        cmp             \w, #4
        csel            \mx, \mx, w15, le
        add             \xmx, x12, \xmx, lsl #3 // subpel H filter address
.ifc \isa, neon_dotprod
        movi            v24.16b, #128
.endif
        cbz             \my, L(\type\()_8tap_h_\isa)

        // HV cases
        cmp             \h, #4
        csel            w14, w14, w11, le
        sub             \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 3
        add             \xmy, x12, x14, lsl #3      // subpel V filter address
        mov             x15, x30
        ldr             d7, [\xmy]
.ifc \type, put
        ldr             q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
.endif                                                 // of 32b values to 8b
        sxtl            v7.8h, v7.8b
        cmp             w10, #SHARP1
        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1

        // HV 8-tap cases
        sub             \src, \src, \s_strd         // src - s_strd * 3 - 3
        cmp             \w, #4
        b.eq            40f
.ifc \type, put
        b.lt            20f
.endif

        // .align JUMP_ALIGN    // fallthrough
80:     // HV8 - 8xN+
        ldp             q29, q30, [x13, #16]
        ldr             d26, [\xmx]
.ifc \type, prep
        add             \wd_strd, \w, \w
.endif
        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v21.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v22.8h, v22.8h, #2
.else
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v21.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v22.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:
        ldr             q23, [\lsrc]
        add             \lsrc, \lsrc, \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smull2          v1.4s, v16.8h, v7.h[0]
        mov             v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b
.else   // neon_dotprod
        sub             v23.16b, v23.16b, v24.16b
        mov             v5.16b, v27.16b
        mov             v6.16b, v27.16b
.endif
        smlal           v0.4s, v17.4h, v7.h[1]
        smlal2          v1.4s, v17.8h, v7.h[1]
.ifc \isa, neon_i8mm
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b
.else   // neon_dotprod
        mov             v17.16b, v18.16b
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b
        tbl             v4.16b, {v23.16b}, v30.16b
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal2          v1.4s, v18.8h, v7.h[2]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v19.4h, v7.h[3]
        smlal2          v1.4s, v19.8h, v7.h[3]
        mov             v19.16b, v20.16b

        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal2          v1.4s, v20.8h, v7.h[4]
        mov             v20.16b, v21.16b

        smlal           v0.4s, v21.4h, v7.h[5]
        smlal2          v1.4s, v21.8h, v7.h[5]
.ifc \type, prep
        uzp1            v23.8h, v5.8h, v6.8h
.endif
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]
        smlal2          v1.4s, v22.8h, v7.h[6]
.ifc \isa, neon_i8mm
        subs            w8, w8, #1
.endif
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v22.8h, v23.8h, #2
    .else
        sshr            v22.8h, v23.8h, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
.else   // put
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
        rshrn2          v22.8h, v6.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2
        shrn2           v22.8h, v6.4s, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun        v0.8b, v0.8h, #2
.endif
.ifc \isa, neon_dotprod
        subs            w8, w8, #1
.endif
.ifc \type, prep
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
.else
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15

        .align JUMP_ALIGN
40:     // HV8 - 4xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]
.ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
.else
        shrn            v22.4h, v5.4s, #2
.endif
        smlal           v0.4s, v22.4h, v7.h[7]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        str             d0, [\dst], #8
        subs            \h, \h, #1
.else
        subs            \h, \h, #1
        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV8 - 2xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
    .endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b
    .endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b

        smlal           v0.4s, v22.4h, v7.h[6]
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        subs            \h, \h, #1

        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2

        str             h0, [\dst]
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif

        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp             \w, #4
        b.eq            40f
.ifc \type, put
        b.lt            20f
.endif

        // .align JUMP_ALIGN    // fallthrough
80:     // HV6 - 8xN+
        ldr             d26, [\xmx]
.ifc \type, prep
        add             \wd_strd, \w, \w
.endif
.ifc \isa, neon_i8mm
        cmp             w9, #SHARP1
        b.eq            88f             // horizontal == SHARP1

        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]
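        // v26 now holds the padded 6-tap filter in its low half and a copy
        // rotated by one byte in its high half, forming the signed 2x8
        // matrix operand; each USMMLA below therefore produces four
        // adjacent horizontal outputs at once.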

        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v20.8h, v22.8h, #2

        .align LOOP_ALIGN
8:
        ld1             {v23.16b}, [\lsrc], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
        mov             v16.16b, v17.16b
        movi            v5.4s, #0
        movi            v6.4s, #0
        tbl             v2.16b, {v23.16b}, v29.16b
        tbl             v3.16b, {v23.16b}, v30.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b

        usmmla          v5.4s, v2.16b, v26.16b
        usmmla          v6.4s, v3.16b, v26.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b
        subs            w8, w8, #1

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        uzp1            v23.8h, v5.8h, v6.8h
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
        srshr           v20.8h, v23.8h, #2
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
    .ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
    .else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun        v0.8b, v0.8h, #2
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
    .endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
        ldp             q29, q30, [x13, #16]

        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
.else
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:
        ldr             q23, [\lsrc]
        add             \lsrc, \lsrc, \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
.ifc \isa, neon_dotprod
        sub             v23.16b, v23.16b, v24.16b
.endif
        mov             v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
.else
        mov             v5.16b, v27.16b
        mov             v6.16b, v27.16b
.endif
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b

        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b
        uzp1            v23.8h, v5.8h, v6.8h

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
.ifc \isa, neon_i8mm
        srshr           v20.8h, v23.8h, #2
.else
        sshr            v20.8h, v23.8h, #2
.endif
        subs            w8, w8, #1
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
.else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun        v0.8b, v0.8h, #2
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15

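        // Helper: apply the horizontal 8-tap filter to one source row,
        // leaving eight 16-bit intermediate values in v22; the callers
        // shift these right by 2.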
        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        ld1             {v4.16b}, [\lsrc], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #0
        movi            v23.4s, #0
.else   // neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        tbl             v3.16b, {v4.16b}, v29.16b
        tbl             v4.16b, {v4.16b}, v30.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]
        uzp1            v22.8h, v22.8h, v23.8h
        ret

.ifc \isa, neon_i8mm
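        // Helper (i8mm only): 6-tap horizontal filter via USMMLA for one
        // row; as above, results land in v22 as 16-bit values and the
        // callers apply the right shift by 2.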
        .align FUNC_ALIGN
L(\type\()_hv_filter6_neon_i8mm):
        ld1             {v4.16b}, [\lsrc], \s_strd
        movi            v22.4s, #0
        movi            v23.4s, #0
        tbl             v2.16b, {v4.16b}, v29.16b
        tbl             v3.16b, {v4.16b}, v30.16b
        usmmla          v22.4s, v2.16b, v26.16b
        usmmla          v23.4s, v3.16b, v26.16b
        uzp1            v22.8h, v22.8h, v23.8h
        ret
.endif

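        // Helper: 4-tap horizontal filter for one row of four pixels,
        // using the four taps the caller loaded into v26; results are
        // returned in v22 for the callers to narrow.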
        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        ld1             {v4.8b}, [\src], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #2
.else
        mov             v22.16b, v27.16b
        sub             v4.16b, v4.16b, v24.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        ret

        .align JUMP_ALIGN
40:     // HV6 - 4xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
.endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
.else
        shrn            v20.4h, v5.4s, #2
.endif
        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        str             d0, [\dst], #8
.else
        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV6 - 2xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
    .endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b
    .endif

        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
    .ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
    .else
        shrn            v20.4h, v5.4s, #2
    .endif

        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]

        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2

        str             h0, [\dst]
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif

        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
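        // w8 was set to clz(w) - 24 at entry, so each jump-table entry is
        // the relative offset of the handler for one block width.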
1368        movrel          x11, \type\()_8tap_h_\isa\()_tbl
1369        ldrsw           x8, [x11, x8, lsl #2]
1370.ifc \type, put
1371    .ifc \isa, neon_i8mm
1372        movi            v27.4s, #34     // special rounding
1373    .else
1374        mov             w10, #0x2022    // 64 * 128 + 34, bias and rounding for SDOT
1375        dup             v27.4s, w10
1376    .endif
1377.endif
1378        add             x11, x11, x8
1379        br              x11
1380
1381.ifc \type, put
1382        .align JUMP_ALIGN
138320:     // H - 2xN
1384        AARCH64_VALID_JUMP_TARGET
1385        add             \src, \src, #2
1386        ldur            s26, [\xmx, #2]
1387
1388        .align LOOP_ALIGN
13892:
1390        ldr             d0, [\src]
1391        ldr             d1, [\src, \s_strd]
1392        add             \src, \src, \s_strd, lsl #1
1393    .ifc \isa, neon_dotprod
1394        sub             v0.8b, v0.8b, v24.8b
1395        sub             v1.8b, v1.8b, v24.8b
1396    .endif
1397        mov             v4.16b, v27.16b
1398        mov             v5.16b, v27.16b
1399
1400        tbl             v2.16b, {v0.16b}, v28.16b
1401        tbl             v3.16b, {v1.16b}, v28.16b
1402
1403        \dot            v4.4s, v2.16b, v26.4b[0]
1404        \dot            v5.4s, v3.16b, v26.4b[0]
1405
1406        uzp1            v4.8h, v4.8h, v5.8h
1407        sqshrun         v4.8b, v4.8h, #6
1408
1409        subs            \h, \h, #2
1410        fmov            x8, d4
1411        lsr             x9, x8, #32
1412        strh            w8, [\dst]
1413        strh            w9, [\dst, \d_strd]
1414        add             \dst, \dst, \d_strd, lsl #1
1415        b.gt            2b
1416        ret
1417.endif
1418
1419        .align JUMP_ALIGN
142040:     // H - 4xN
1421        AARCH64_VALID_JUMP_TARGET
1422        add             \src, \src, #2
1423        ldur            s26, [\xmx, #2]
1424
1425        .align LOOP_ALIGN
14264:
1427        ldr             d0, [\src]
1428        ldr             d1, [\src, \s_strd]
1429        add             \src, \src, \s_strd, lsl #1
1430.ifc \type\()_\isa, prep_neon_i8mm
1431        movi            v4.4s, #0
1432        movi            v5.4s, #0
1433.else
1434    .ifc \isa, neon_dotprod
1435        sub             v0.8b, v0.8b, v24.8b
1436        sub             v1.8b, v1.8b, v24.8b
1437    .endif
1438        mov             v4.16b, v27.16b
1439        mov             v5.16b, v27.16b
1440.endif
1441        tbl             v2.16b, {v0.16b}, v28.16b
1442        tbl             v3.16b, {v1.16b}, v28.16b
1443
1444        \dot            v4.4s, v2.16b, v26.4b[0]
1445        \dot            v5.4s, v3.16b, v26.4b[0]
1446.ifc \type, prep
1447        subs            \h, \h, #2
1448    .ifc \isa, neon_i8mm
1449        uzp1            v4.8h, v4.8h, v5.8h
1450        srshr           v4.8h, v4.8h, #2
1451    .else
1452        shrn            v4.4h, v4.4s, #2
1453        shrn2           v4.8h, v5.4s, #2
1454    .endif
1455        str             q4, [\dst], #16
1456.else   // put
1457        uzp1            v4.8h, v4.8h, v5.8h
1458        sqshrun         v4.8b, v4.8h, #6
1459        subs            \h, \h, #2
1460        fmov            x8, d4
1461        lsr             x9, x8, #32
1462        str             w8, [\dst]
1463        str             w9, [\dst, \d_strd]
1464        add             \dst, \dst, \d_strd, lsl #1
1465.endif
1466        b.gt            4b
1467        ret
1468
1469        .align JUMP_ALIGN
147080:     // H - 8xN
1471        AARCH64_VALID_JUMP_TARGET
1472        ldr             d26, [\xmx]
1473.ifc \isa, neon_i8mm
1474        cmp             w9, #SHARP1
1475        b.eq            88f             // horizontal == SHARP1
1476
        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        .align LOOP_ALIGN
8:
        ldr             q0, [\src]
        ldr             q16, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
    .ifc \type, prep
        movi            v4.4s, #0
        movi            v5.4s, #0
        movi            v20.4s, #0
        movi            v21.4s, #0
    .else
        mov             v4.16b, v27.16b
        mov             v5.16b, v27.16b
        mov             v20.16b, v27.16b
        mov             v21.16b, v27.16b
    .endif
        tbl             v1.16b, {v0.16b}, v29.16b
        tbl             v2.16b, {v0.16b}, v30.16b
        tbl             v17.16b, {v16.16b}, v29.16b
        tbl             v18.16b, {v16.16b}, v30.16b

        usmmla          v4.4s, v1.16b, v26.16b
        usmmla          v5.4s, v2.16b, v26.16b
        usmmla          v20.4s, v17.16b, v26.16b
        usmmla          v21.4s, v18.16b, v26.16b

        uzp1            v4.8h, v4.8h, v5.8h
        uzp1            v20.8h, v20.8h, v21.8h
    .ifc \type, prep
        srshr           v4.8h, v4.8h, #2
        srshr           v20.8h, v20.8h, #2
        subs            \h, \h, #2
        stp             q4, q20, [\dst], #32
    .else   // put
        sqshrun         v4.8b, v4.8h, #6
        sqshrun         v20.8b, v20.8h, #6
        subs            \h, \h, #2
        str             d4, [\dst]
        str             d20, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
    .endif
        b.gt            8b
        ret

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
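        // Generic dot-product path: used by all filters on neon_dotprod and
        // only by SHARP (true 8-tap) on neon_i8mm. Each output accumulates
        // two 4-element dot products, one per filter half (v26.4b[0] and
        // v26.4b[1]).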
        ldp             q29, q30, [x13, #16]

        .align LOOP_ALIGN
8:
        ldr             q0, [\src]
        ldr             q16, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0
        movi            v5.4s, #0
        movi            v20.4s, #0
        movi            v21.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.16b, v0.16b, v24.16b
        sub             v16.16b, v16.16b, v24.16b
    .endif
        mov             v4.16b, v27.16b
        mov             v5.16b, v27.16b
        mov             v20.16b, v27.16b
        mov             v21.16b, v27.16b
.endif
        tbl             v1.16b, {v0.16b}, v28.16b
        tbl             v2.16b, {v0.16b}, v29.16b
        tbl             v3.16b, {v0.16b}, v30.16b
        tbl             v17.16b, {v16.16b}, v28.16b
        tbl             v18.16b, {v16.16b}, v29.16b
        tbl             v19.16b, {v16.16b}, v30.16b

        \dot            v4.4s, v1.16b, v26.4b[0]
        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v20.4s, v17.16b, v26.4b[0]
        \dot            v21.4s, v18.16b, v26.4b[0]
        \dot            v4.4s, v2.16b, v26.4b[1]
        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v20.4s, v18.16b, v26.4b[1]
        \dot            v21.4s, v19.16b, v26.4b[1]

        uzp1            v4.8h, v4.8h, v5.8h
        uzp1            v20.8h, v20.8h, v21.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v4.8h, v4.8h, #2
        srshr           v20.8h, v20.8h, #2
    .else
        sshr            v4.8h, v4.8h, #2
        sshr            v20.8h, v20.8h, #2
    .endif
        subs            \h, \h, #2
        stp             q4, q20, [\dst], #32
.else   // put
        sqshrun         v4.8b, v4.8h, #6
        sqshrun         v20.8b, v20.8h, #6
        subs            \h, \h, #2
        str             d4, [\dst]
        str             d20, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            8b
        ret

        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET
        ldr             d26, [\xmx]
.ifc \isa, neon_i8mm
        cmp             w9, #SHARP1
        b.eq            168f            // horizontal == SHARP1

        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        .align LOOP_ALIGN
16:
        ldr             q16, [\src]
        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
        add             \src, \src, \s_strd
    .ifc \type, prep
        movi            v6.4s, #0
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
    .else
        mov             v6.16b, v27.16b
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
    .endif
        tbl             v0.16b, {v16.16b}, v29.16b
        tbl             v1.16b, {v16.16b}, v30.16b
        tbl             v2.16b, {v17.16b}, v29.16b
        tbl             v3.16b, {v17.16b}, v30.16b

        usmmla          v6.4s, v0.16b, v26.16b
        usmmla          v7.4s, v1.16b, v26.16b
        usmmla          v22.4s, v2.16b, v26.16b
        usmmla          v23.4s, v3.16b, v26.16b

        uzp1            v6.8h, v6.8h, v7.8h
        uzp1            v22.8h, v22.8h, v23.8h
    .ifc \type, prep
        srshr           v6.8h, v6.8h, #2
        srshr           v22.8h, v22.8h, #2
        subs            \h, \h, #1
        stp             q6, q22, [\dst], #32
    .else   // put
        sqshrun         v6.8b, v6.8h, #6
        sqshrun2        v6.16b, v22.8h, #6
        subs            \h, \h, #1
        st1             {v6.16b}, [\dst], \d_strd
    .endif
        b.gt            16b
        ret

        .align JUMP_ALIGN
168:
.endif  // neon_i8mm
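        // Generic dot-product path, as in 88b above.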
        ldp             q29, q30, [x13, #16]

        .align LOOP_ALIGN
16:
        ldr             q16, [\src]
        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
        add             \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b
        sub             v17.16b, v17.16b, v24.16b
    .endif
        mov             v6.16b, v27.16b
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h
        uzp1            v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            \h, \h, #1
        stp             q6, q22, [\dst], #32
.else   // put
        sqshrun         v6.8b, v6.8h, #6
        sqshrun2        v6.16b, v22.8h, #6
        subs            \h, \h, #1
        st1             {v6.16b}, [\dst], \d_strd
.endif
        b.gt            16b
        ret

        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ldr             d26, [\xmx]
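        // The inner loop advances 16 pixels at a time, so turn the strides
        // into end-of-row increments and keep the per-row width counter in w8.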
.ifc \type, put
        sub             \d_strd, \d_strd, \w, uxtw
.endif
        sub             \s_strd, \s_strd, \w, uxtw
        mov             w8, \w

.ifc \isa, neon_i8mm
        cmp             w9, #SHARP1
        b.eq            328f            // horizontal == SHARP1

        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        .align LOOP_ALIGN
32:
        ldr             q16, [\src]
        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
        add             \src, \src, #16
    .ifc \type, prep
        movi            v6.4s, #0
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
    .else
        mov             v6.16b, v27.16b
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
    .endif
        tbl             v0.16b, {v16.16b}, v29.16b
        tbl             v1.16b, {v16.16b}, v30.16b
        tbl             v2.16b, {v17.16b}, v29.16b
        tbl             v3.16b, {v17.16b}, v30.16b

        usmmla          v6.4s, v0.16b, v26.16b
        usmmla          v7.4s, v1.16b, v26.16b
        usmmla          v22.4s, v2.16b, v26.16b
        usmmla          v23.4s, v3.16b, v26.16b

        uzp1            v6.8h, v6.8h, v7.8h
        uzp1            v22.8h, v22.8h, v23.8h
    .ifc \type, prep
        srshr           v6.8h, v6.8h, #2
        srshr           v22.8h, v22.8h, #2
        subs            w8, w8, #16
        stp             q6, q22, [\dst], #32
    .else   // put
        sqshrun         v6.8b, v6.8h, #6
        sqshrun2        v6.16b, v22.8h, #6
        subs            w8, w8, #16
        str             q6, [\dst], #16
    .endif
        b.gt            32b

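        // End of row: step the pointers to the next row and reset the width
        // counter.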
        add             \src, \src, \s_strd
    .ifc \type, put
        add             \dst, \dst, \d_strd
    .endif
        mov             w8, \w
        subs            \h, \h, #1
        b.gt            32b
        ret

        .align JUMP_ALIGN
328:
.endif  // neon_i8mm
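        // Generic dot-product path, as in 88b above.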
        ldp             q29, q30, [x13, #16]

        .align LOOP_ALIGN
32:
        ldr             q16, [\src]
        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
        add             \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b
        sub             v17.16b, v17.16b, v24.16b
    .endif
        mov             v6.16b, v27.16b
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h
        uzp1            v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            w8, w8, #16
        stp             q6, q22, [\dst], #32
.else   // put
        sqshrun         v6.8b, v6.8h, #6
        sqshrun2        v6.16b, v22.8h, #6
        subs            w8, w8, #16
        str             q6, [\dst], #16
.endif
        b.gt            32b

        add             \src, \src, \s_strd
.ifc \type, put
        add             \dst, \dst, \d_strd
.endif
        mov             w8, \w
        subs            \h, \h, #1
        b.gt            32b
        ret
endfunc

jumptable \type\()_8tap_h_\isa\()_tbl
        .word 1280b - \type\()_8tap_h_\isa\()_tbl
        .word 640b  - \type\()_8tap_h_\isa\()_tbl
        .word 320b  - \type\()_8tap_h_\isa\()_tbl
        .word 160b  - \type\()_8tap_h_\isa\()_tbl
        .word 80b   - \type\()_8tap_h_\isa\()_tbl
        .word 40b   - \type\()_8tap_h_\isa\()_tbl
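        // prep is only used for compound prediction, and AV1 does not allow
        // compound below 8x8, so a width of 2 can only occur for put.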
.ifc \type, put
        .word 20b   - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
.endm

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn  put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1
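
// The neon_i8mm variants below are preferred at runtime on CPUs with I8MM;
// the neon_dotprod functions above serve as the fallback.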

#if HAVE_I8MM
ENABLE_I8MM

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn  put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

DISABLE_I8MM
#endif  // HAVE_I8MM

DISABLE_DOTPROD
#endif  // HAVE_DOTPROD