xref: /aosp_15_r20/external/libdav1d/src/arm/64/mc_dotprod.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, Janne Grunau
4*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, Martin Storsjo
5*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, Arm Limited
6*c0909341SAndroid Build Coastguard Worker * All rights reserved.
7*c0909341SAndroid Build Coastguard Worker *
8*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
9*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
10*c0909341SAndroid Build Coastguard Worker *
11*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
12*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
13*c0909341SAndroid Build Coastguard Worker *
14*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
15*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
16*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
17*c0909341SAndroid Build Coastguard Worker *
18*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28*c0909341SAndroid Build Coastguard Worker */
29*c0909341SAndroid Build Coastguard Worker
30*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
31*c0909341SAndroid Build Coastguard Worker#include "util.S"
32*c0909341SAndroid Build Coastguard Worker
33*c0909341SAndroid Build Coastguard Worker
34*c0909341SAndroid Build Coastguard Worker#if HAVE_DOTPROD
35*c0909341SAndroid Build Coastguard WorkerENABLE_DOTPROD
36*c0909341SAndroid Build Coastguard Worker
37*c0909341SAndroid Build Coastguard Worker// No spaces in these expressions, due to gas-preprocessor. Each index is biased
38*c0909341SAndroid Build Coastguard Worker// by -1, which saves a negative offset when addressing `mc_subpel_filters`.
39*c0909341SAndroid Build Coastguard Worker#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
40*c0909341SAndroid Build Coastguard Worker#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
41*c0909341SAndroid Build Coastguard Worker#define SHARP1          (((2*15-1)<<7)|(3*15-1))
42*c0909341SAndroid Build Coastguard Worker
43*c0909341SAndroid Build Coastguard Worker#define FUNC_ALIGN      2
44*c0909341SAndroid Build Coastguard Worker#define JUMP_ALIGN      2
45*c0909341SAndroid Build Coastguard Worker#define LOOP_ALIGN      2
46*c0909341SAndroid Build Coastguard Worker
47*c0909341SAndroid Build Coastguard Worker
48*c0909341SAndroid Build Coastguard Workerconst h_tbl_neon_dotprod, align=4
        // Byte-index rows for permuting horizontal samples, followed by one
        // narrowing row. Layout in bytes from the start of this symbol:
        //    0..47  three 16-byte rows of SDOT shuffle indices
        //   48..79  two 16-byte rows of USMMLA shuffle indices (OFFSET_USMMLA)
        //   80..95  one 16-byte row for 32-bit to 8-bit conversion
        //           (OFFSET_CVT_32_8)
        // Each OFFSET_* macro must stay equal to the number of bytes emitted
        // ahead of the rows it names, so update them together with the data.
        // NOTE(review): the code that loads these rows lies outside the part
        // of the file reviewed here, so the usage notes below are derived
        // from the index patterns only.
        //
49*c0909341SAndroid Build Coastguard Worker        // Shuffle indices to permute horizontal samples in preparation for
50*c0909341SAndroid Build Coastguard Worker        // input to SDOT instructions. The 8-tap horizontal convolution uses
51*c0909341SAndroid Build Coastguard Worker        // sample indices in the interval of [-3, 4] relative to the current
52*c0909341SAndroid Build Coastguard Worker        // sample position.
        // Every 4-byte group is a window of four consecutive source bytes and
        // each following group starts one byte later, so the three rows cover
        // window starts 0 to 11.
53*c0909341SAndroid Build Coastguard Worker        .byte  0,  1,  2,  3,   1,  2,  3,  4,   2,  3,  4,  5,   3,  4,  5,  6
54*c0909341SAndroid Build Coastguard Worker        .byte  4,  5,  6,  7,   5,  6,  7,  8,   6,  7,  8,  9,   7,  8,  9, 10
55*c0909341SAndroid Build Coastguard Worker        .byte  8,  9, 10, 11,   9, 10, 11, 12,  10, 11, 12, 13,  11, 12, 13, 14
56*c0909341SAndroid Build Coastguard Worker
57*c0909341SAndroid Build Coastguard Worker        // Shuffle indices to permute horizontal samples in preparation for
58*c0909341SAndroid Build Coastguard Worker        // input to USMMLA instructions.
        // Every 8-byte group is a window of eight consecutive source bytes and
        // each following group starts two bytes later (window starts 0, 2, 4
        // and 6 across the two rows).
59*c0909341SAndroid Build Coastguard Worker#define OFFSET_USMMLA 48
60*c0909341SAndroid Build Coastguard Worker        .byte  0,  1,  2,  3,   4,  5,  6,  7,   2,  3,  4,  5,   6,  7,  8,  9
61*c0909341SAndroid Build Coastguard Worker        .byte  4,  5,  6,  7,   8,  9, 10, 11,   6,  7,  8,  9,  10, 11, 12, 13
62*c0909341SAndroid Build Coastguard Worker
63*c0909341SAndroid Build Coastguard Worker        // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
        // The indices are 4k+1 and 4k+2 for k = 0..7, i.e. bytes 1 and 2 (bits
        // 8 to 23) of each 32-bit element of a 32-byte, two-register table.
64*c0909341SAndroid Build Coastguard Worker#define OFFSET_CVT_32_8 80
65*c0909341SAndroid Build Coastguard Worker        .byte  1,  2,  5,  6,   9, 10, 13, 14,  17, 18, 21, 22,  25, 26, 29, 30
66*c0909341SAndroid Build Coastguard Workerendconst
67*c0909341SAndroid Build Coastguard Worker
68*c0909341SAndroid Build Coastguard Workerconst v_tbl_neon_dotprod, align=4
69*c0909341SAndroid Build Coastguard Worker        // Vertical convolutions are also using SDOT instructions, where a
70*c0909341SAndroid Build Coastguard Worker        // 128-bit register contains a transposed 4x4 matrix of values.
71*c0909341SAndroid Build Coastguard Worker        // Subsequent iterations of the vertical convolution can reuse the
72*c0909341SAndroid Build Coastguard Worker        // 3x4 sub-matrix from the previous loop iteration. These shuffle
73*c0909341SAndroid Build Coastguard Worker        // indices shift and merge this 4x4 matrix with the values of a new
74*c0909341SAndroid Build Coastguard Worker        // line.
        //
        // Five 16-byte rows, used as index vectors for two-register TBL
        // lookups: indices 0..15 select from the first table register and
        // 16..31 from the second. In every row each 4-byte output group keeps
        // bytes 1..3 of the matching group of the first register (dropping the
        // oldest value) and appends one byte taken from the second register.
        //
        // The vertical path loads the rows at fixed byte offsets 0, 16, 32,
        // 48 and 64 from this symbol, so their order and size must not change.
        //
        // Row 0: the appended byte is byte 0 of the matching 4-byte group of
        //        the second register (16, 20, 24, 28), which merges two
        //        registers that are both already in the transposed layout.
75*c0909341SAndroid Build Coastguard Worker        .byte  1,  2,  3, 16,   5,  6,  7, 20,   9, 10, 11, 24,  13, 14, 15, 28
        // Rows 1..4: the appended bytes are four consecutive bytes of the
        //        second register, which holds a newly loaded line: bytes
        //        0..3, 4..7, 8..11 and 12..15 of that line respectively.
76*c0909341SAndroid Build Coastguard Worker        .byte  1,  2,  3, 16,   5,  6,  7, 17,   9, 10, 11, 18,  13, 14, 15, 19
77*c0909341SAndroid Build Coastguard Worker        .byte  1,  2,  3, 20,   5,  6,  7, 21,   9, 10, 11, 22,  13, 14, 15, 23
78*c0909341SAndroid Build Coastguard Worker        .byte  1,  2,  3, 24,   5,  6,  7, 25,   9, 10, 11, 26,  13, 14, 15, 27
79*c0909341SAndroid Build Coastguard Worker        .byte  1,  2,  3, 28,   5,  6,  7, 29,   9, 10, 11, 30,  13, 14, 15, 31
80*c0909341SAndroid Build Coastguard Workerendconst
81*c0909341SAndroid Build Coastguard Worker
82*c0909341SAndroid Build Coastguard Worker
// make_8tap_fn: emit one exported entry point for a single combination of
// horizontal and vertical 8-tap filter types.
//
// Arguments:
//   op      name prefix of the generated symbol and of the shared
//           implementation that is branched to
//   type    filter-combination name embedded in the generated symbol
//   type_h  constant moved into x9 (one of REGULAR1, SMOOTH1, SHARP1 at the
//           call sites in this file)
//   type_v  constant moved into x10 (same set of values)
//   isa     instruction-set suffix of the generated symbol and branch target
//   jump    1 (default): finish with a branch to the shared <op>_8tap_<isa>
//           function. 0: emit no branch, so execution continues into whatever
//           is assembled directly after this macro invocation.
//
// The generated symbol is <op>_8tap_<type>_8bpc_<isa>. Only x9 and x10 are
// written here, all other registers reach the shared function untouched.
// NOTE(review): the jump=0 form relies on `endfunc` (defined outside this
// file) emitting no instructions between this stub and the next function.
83*c0909341SAndroid Build Coastguard Worker.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
84*c0909341SAndroid Build Coastguard Workerfunction \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
85*c0909341SAndroid Build Coastguard Worker        mov             x9,  \type_h
86*c0909341SAndroid Build Coastguard Worker        mov             x10, \type_v
87*c0909341SAndroid Build Coastguard Worker    .if \jump
88*c0909341SAndroid Build Coastguard Worker        b               \op\()_8tap_\isa
89*c0909341SAndroid Build Coastguard Worker    .endif
90*c0909341SAndroid Build Coastguard Workerendfunc
91*c0909341SAndroid Build Coastguard Worker.endm
92*c0909341SAndroid Build Coastguard Worker
93*c0909341SAndroid Build Coastguard Worker.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
94*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
95*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
96*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
97*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
98*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
99*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
100*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
101*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
102*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0
103*c0909341SAndroid Build Coastguard Worker
104*c0909341SAndroid Build Coastguard Workerfunction \type\()_8tap_\isa, align=FUNC_ALIGN
105*c0909341SAndroid Build Coastguard Worker        clz             w8, \w
106*c0909341SAndroid Build Coastguard Worker        mov             w11,  #0x4081   // (1 << 14) | (1 << 7) | (1 << 0)
107*c0909341SAndroid Build Coastguard Worker        sub             w8, w8, #24     // for jump tables
108*c0909341SAndroid Build Coastguard Worker        movrel          x12, X(mc_subpel_filters)
109*c0909341SAndroid Build Coastguard Worker        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
110*c0909341SAndroid Build Coastguard Worker        cbnz            \my, L(\type\()_8tap_v_\isa)
111*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
112*c0909341SAndroid Build Coastguard Worker        add             \wd_strd, \w, \w    // prep_neon needs w * 2 as stride
113*c0909341SAndroid Build Coastguard Worker.endif
114*c0909341SAndroid Build Coastguard Worker        b               X(\type\()_neon)
115*c0909341SAndroid Build Coastguard Worker
116*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
117*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_v_\isa):
118*c0909341SAndroid Build Coastguard Worker        madd            \my, \my, w11, w10
119*c0909341SAndroid Build Coastguard Worker        movrel          x13, v_tbl_neon_dotprod
120*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd
121*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
122*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
123*c0909341SAndroid Build Coastguard Worker        mov             w8, #0x2002         // FILTER_WEIGHT * 128 + rounding
124*c0909341SAndroid Build Coastguard Worker        dup             v4.4s, w8
125*c0909341SAndroid Build Coastguard Worker    .else
126*c0909341SAndroid Build Coastguard Worker        movi            v4.4s, #32, lsl #8  // FILTER_WEIGHT * 128, bias for SDOT
127*c0909341SAndroid Build Coastguard Worker    .endif
128*c0909341SAndroid Build Coastguard Worker.endif
129*c0909341SAndroid Build Coastguard Worker        ubfx            w11, \my, #7, #7
130*c0909341SAndroid Build Coastguard Worker        and             \my, \my, #0x7F
131*c0909341SAndroid Build Coastguard Worker        ldp             q6, q28, [x13]
132*c0909341SAndroid Build Coastguard Worker        cmp             \h, #4
133*c0909341SAndroid Build Coastguard Worker        csel            \my, \my, w11, le
134*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1     // src - s_strd * 3
135*c0909341SAndroid Build Coastguard Worker        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
136*c0909341SAndroid Build Coastguard Worker        ldr             q29, [x13, #32]
137*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
138*c0909341SAndroid Build Coastguard Worker        movi            v5.16b, #128
139*c0909341SAndroid Build Coastguard Worker.endif
140*c0909341SAndroid Build Coastguard Worker        ldr             d7, [\xmy]
141*c0909341SAndroid Build Coastguard Worker        cmp             \w, #8
142*c0909341SAndroid Build Coastguard Worker        b.eq            80f
143*c0909341SAndroid Build Coastguard Worker        b.lt            40f
144*c0909341SAndroid Build Coastguard Worker
145*c0909341SAndroid Build Coastguard Worker        // .align JUMP_ALIGN    // fallthrough
146*c0909341SAndroid Build Coastguard Worker160:    // V - 16xN+
147*c0909341SAndroid Build Coastguard Worker        ldp             q30, q31, [x13, #48]
148*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
149*c0909341SAndroid Build Coastguard Worker        add             \wd_strd, \w, \w
150*c0909341SAndroid Build Coastguard Worker.endif
151*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
152*c0909341SAndroid Build Coastguard Worker161:
153*c0909341SAndroid Build Coastguard Worker        mov             \lsrc, \src
154*c0909341SAndroid Build Coastguard Worker        mov             \ldst, \dst
155*c0909341SAndroid Build Coastguard Worker        sub             w8, \h, #1
156*c0909341SAndroid Build Coastguard Worker
157*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\lsrc]
158*c0909341SAndroid Build Coastguard Worker        ldr             q17, [\lsrc, \s_strd]
159*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd, lsl #1
160*c0909341SAndroid Build Coastguard Worker        ldr             q18, [\lsrc]
161*c0909341SAndroid Build Coastguard Worker        ldr             q19, [\lsrc, \s_strd]
162*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd, lsl #1
163*c0909341SAndroid Build Coastguard Worker
164*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b, v16.16b, v17.16b
165*c0909341SAndroid Build Coastguard Worker        zip2            v1.16b, v16.16b, v17.16b
166*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b, v18.16b, v19.16b
167*c0909341SAndroid Build Coastguard Worker        zip2            v3.16b, v18.16b, v19.16b
168*c0909341SAndroid Build Coastguard Worker
169*c0909341SAndroid Build Coastguard Worker        ldr             q20, [\lsrc]
170*c0909341SAndroid Build Coastguard Worker        ldr             q21, [\lsrc, \s_strd]
171*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd, lsl #1
172*c0909341SAndroid Build Coastguard Worker        ldr             q22, [\lsrc]
173*c0909341SAndroid Build Coastguard Worker        ldr             q23, [\lsrc, \s_strd]
174*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd, lsl #1
175*c0909341SAndroid Build Coastguard Worker
176*c0909341SAndroid Build Coastguard Worker        zip1            v18.16b, v20.16b, v21.16b
177*c0909341SAndroid Build Coastguard Worker        zip2            v21.16b, v20.16b, v21.16b
178*c0909341SAndroid Build Coastguard Worker        zip1            v24.16b, v22.16b, v23.16b
179*c0909341SAndroid Build Coastguard Worker        zip2            v27.16b, v22.16b, v23.16b
180*c0909341SAndroid Build Coastguard Worker
181*c0909341SAndroid Build Coastguard Worker        zip1            v16.8h, v0.8h, v2.8h
182*c0909341SAndroid Build Coastguard Worker        zip2            v19.8h, v0.8h, v2.8h
183*c0909341SAndroid Build Coastguard Worker        zip1            v22.8h, v1.8h, v3.8h
184*c0909341SAndroid Build Coastguard Worker        zip2            v25.8h, v1.8h, v3.8h
185*c0909341SAndroid Build Coastguard Worker
186*c0909341SAndroid Build Coastguard Worker        zip1            v17.8h, v18.8h, v24.8h
187*c0909341SAndroid Build Coastguard Worker        zip2            v20.8h, v18.8h, v24.8h
188*c0909341SAndroid Build Coastguard Worker        zip1            v23.8h, v21.8h, v27.8h
189*c0909341SAndroid Build Coastguard Worker        zip2            v26.8h, v21.8h, v27.8h
190*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
191*c0909341SAndroid Build Coastguard Worker        sub             v16.16b, v16.16b, v5.16b
192*c0909341SAndroid Build Coastguard Worker        sub             v19.16b, v19.16b, v5.16b
193*c0909341SAndroid Build Coastguard Worker        sub             v22.16b, v22.16b, v5.16b
194*c0909341SAndroid Build Coastguard Worker        sub             v25.16b, v25.16b, v5.16b
195*c0909341SAndroid Build Coastguard Worker
196*c0909341SAndroid Build Coastguard Worker        sub             v17.16b, v17.16b, v5.16b
197*c0909341SAndroid Build Coastguard Worker        sub             v20.16b, v20.16b, v5.16b
198*c0909341SAndroid Build Coastguard Worker        sub             v23.16b, v23.16b, v5.16b
199*c0909341SAndroid Build Coastguard Worker        sub             v26.16b, v26.16b, v5.16b
200*c0909341SAndroid Build Coastguard Worker.endif
201*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
202*c0909341SAndroid Build Coastguard Worker16:
203*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
204*c0909341SAndroid Build Coastguard Worker        ld1             {v18.16b}, [\lsrc], \s_strd
205*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0
206*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
207*c0909341SAndroid Build Coastguard Worker        movi            v2.4s, #0
208*c0909341SAndroid Build Coastguard Worker        movi            v3.4s, #0
209*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v18.16b
210*c0909341SAndroid Build Coastguard Worker        mov             v24.16b, v18.16b
211*c0909341SAndroid Build Coastguard Worker        mov             v27.16b, v18.16b
212*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
213*c0909341SAndroid Build Coastguard Worker        ld1             {v27.16b}, [\lsrc], \s_strd
214*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b
215*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b
216*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v4.16b
217*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v4.16b
218*c0909341SAndroid Build Coastguard Worker        sub             v18.16b, v27.16b, v5.16b
219*c0909341SAndroid Build Coastguard Worker        sub             v21.16b, v27.16b, v5.16b
220*c0909341SAndroid Build Coastguard Worker        sub             v24.16b, v27.16b, v5.16b
221*c0909341SAndroid Build Coastguard Worker        sub             v27.16b, v27.16b, v5.16b
222*c0909341SAndroid Build Coastguard Worker.endif
223*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]
224*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]
225*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v22.16b, v7.4b[0]
226*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v25.16b, v7.4b[0]
227*c0909341SAndroid Build Coastguard Worker
228*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
229*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
230*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
231*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b
232*c0909341SAndroid Build Coastguard Worker
233*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]
234*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
235*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v23.16b, v7.4b[1]
236*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v26.16b, v7.4b[1]
237*c0909341SAndroid Build Coastguard Worker
238*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
239*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
240*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
241*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b
242*c0909341SAndroid Build Coastguard Worker
243*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #1
244*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
245*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8h, v2.8h, v3.8h
246*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
247*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
248*c0909341SAndroid Build Coastguard Worker        srshr           v0.8h, v0.8h, #2
249*c0909341SAndroid Build Coastguard Worker        srshr           v1.8h, v2.8h, #2
250*c0909341SAndroid Build Coastguard Worker    .else
251*c0909341SAndroid Build Coastguard Worker        sshr            v0.8h, v0.8h, #2
252*c0909341SAndroid Build Coastguard Worker        sshr            v1.8h, v2.8h, #2
253*c0909341SAndroid Build Coastguard Worker    .endif
254*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [\ldst], \d_strd
255*c0909341SAndroid Build Coastguard Worker.else   // put
256*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6
257*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v0.16b, v2.8h, #6
258*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [\ldst], \d_strd
259*c0909341SAndroid Build Coastguard Worker.endif
260*c0909341SAndroid Build Coastguard Worker        b.gt            16b
261*c0909341SAndroid Build Coastguard Worker
262*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
263*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0
264*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
265*c0909341SAndroid Build Coastguard Worker        movi            v2.4s, #0
266*c0909341SAndroid Build Coastguard Worker        movi            v3.4s, #0
267*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
268*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b
269*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b
270*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v4.16b
271*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v4.16b
272*c0909341SAndroid Build Coastguard Worker.endif
273*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]
274*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]
275*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v22.16b, v7.4b[0]
276*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v25.16b, v7.4b[0]
277*c0909341SAndroid Build Coastguard Worker
278*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]
279*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
280*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v23.16b, v7.4b[1]
281*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v26.16b, v7.4b[1]
282*c0909341SAndroid Build Coastguard Worker
283*c0909341SAndroid Build Coastguard Worker        subs            \w, \w, #16
284*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
285*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8h, v2.8h, v3.8h
286*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
287*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
288*c0909341SAndroid Build Coastguard Worker        srshr           v0.8h, v0.8h, #2
289*c0909341SAndroid Build Coastguard Worker        srshr           v1.8h, v2.8h, #2
290*c0909341SAndroid Build Coastguard Worker    .else
291*c0909341SAndroid Build Coastguard Worker        sshr            v0.8h, v0.8h, #2
292*c0909341SAndroid Build Coastguard Worker        sshr            v1.8h, v2.8h, #2
293*c0909341SAndroid Build Coastguard Worker    .endif
294*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [\ldst]
295*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #32
296*c0909341SAndroid Build Coastguard Worker.else   // put
297*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6
298*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v0.16b, v2.8h, #6
299*c0909341SAndroid Build Coastguard Worker        str             q0, [\ldst]
300*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
301*c0909341SAndroid Build Coastguard Worker.endif
302*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #16
303*c0909341SAndroid Build Coastguard Worker        b.gt            161b
304*c0909341SAndroid Build Coastguard Worker        ret
305*c0909341SAndroid Build Coastguard Worker
306*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
307*c0909341SAndroid Build Coastguard Worker80:     // V - 8xN
308*c0909341SAndroid Build Coastguard Worker        ldr             d16, [\src]
309*c0909341SAndroid Build Coastguard Worker        ldr             d17, [\src, \s_strd]
310*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
311*c0909341SAndroid Build Coastguard Worker        ldr             d18, [\src]
312*c0909341SAndroid Build Coastguard Worker        ldr             d19, [\src, \s_strd]
313*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
314*c0909341SAndroid Build Coastguard Worker
315*c0909341SAndroid Build Coastguard Worker        ldr             d20, [\src]
316*c0909341SAndroid Build Coastguard Worker        ldr             d21, [\src, \s_strd]
317*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
318*c0909341SAndroid Build Coastguard Worker        ldr             d22, [\src]
319*c0909341SAndroid Build Coastguard Worker        ldr             d23, [\src, \s_strd]
320*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
321*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2  // for prep: sub is enough
322*c0909341SAndroid Build Coastguard Worker
323*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b, v16.16b, v17.16b
324*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b, v18.16b, v19.16b
325*c0909341SAndroid Build Coastguard Worker        zip1            v18.16b, v20.16b, v21.16b
326*c0909341SAndroid Build Coastguard Worker        zip1            v24.16b, v22.16b, v23.16b
327*c0909341SAndroid Build Coastguard Worker
328*c0909341SAndroid Build Coastguard Worker        zip1            v16.8h,  v0.8h,  v2.8h
329*c0909341SAndroid Build Coastguard Worker        zip2            v19.8h,  v0.8h,  v2.8h
330*c0909341SAndroid Build Coastguard Worker        zip1            v17.8h, v18.8h, v24.8h
331*c0909341SAndroid Build Coastguard Worker        zip2            v20.8h, v18.8h, v24.8h
332*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
333*c0909341SAndroid Build Coastguard Worker        sub             v16.16b, v16.16b, v5.16b
334*c0909341SAndroid Build Coastguard Worker        sub             v19.16b, v19.16b, v5.16b
335*c0909341SAndroid Build Coastguard Worker        sub             v17.16b, v17.16b, v5.16b
336*c0909341SAndroid Build Coastguard Worker        sub             v20.16b, v20.16b, v5.16b
337*c0909341SAndroid Build Coastguard Worker.endif
338*c0909341SAndroid Build Coastguard Worker.ifc \type, put
339*c0909341SAndroid Build Coastguard Worker        b.eq            82f
340*c0909341SAndroid Build Coastguard Worker.endif
341*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
342*c0909341SAndroid Build Coastguard Worker8:
343*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
344*c0909341SAndroid Build Coastguard Worker        ldr             d18, [\src]
345*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0
346*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
347*c0909341SAndroid Build Coastguard Worker        ldr             d24, [\src, \s_strd]
348*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
349*c0909341SAndroid Build Coastguard Worker        movi            v2.4s, #0
350*c0909341SAndroid Build Coastguard Worker        movi            v3.4s, #0
351*c0909341SAndroid Build Coastguard Worker        mov             v21.8b, v18.8b
352*c0909341SAndroid Build Coastguard Worker        mov             v27.8b, v24.8b
353*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
354*c0909341SAndroid Build Coastguard Worker        ldr             d21, [\src]
355*c0909341SAndroid Build Coastguard Worker        ldr             d27, [\src, \s_strd]
356*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
357*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b
358*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b
359*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v4.16b
360*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v4.16b
361*c0909341SAndroid Build Coastguard Worker        sub             v18.16b, v21.16b, v5.16b
362*c0909341SAndroid Build Coastguard Worker        sub             v21.16b, v21.16b, v5.16b
363*c0909341SAndroid Build Coastguard Worker        sub             v24.16b, v27.16b, v5.16b
364*c0909341SAndroid Build Coastguard Worker        sub             v27.16b, v27.16b, v5.16b
365*c0909341SAndroid Build Coastguard Worker.endif
366*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
367*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
368*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
369*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
370*c0909341SAndroid Build Coastguard Worker
371*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]
372*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]
373*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]
374*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
375*c0909341SAndroid Build Coastguard Worker
376*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
377*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
378*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
379*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b
380*c0909341SAndroid Build Coastguard Worker
381*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v22.16b, v7.4b[0]
382*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v23.16b, v7.4b[1]
383*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v25.16b, v7.4b[0]
384*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v26.16b, v7.4b[1]
385*c0909341SAndroid Build Coastguard Worker
386*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
387*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
388*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8h, v2.8h, v3.8h
389*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
390*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
391*c0909341SAndroid Build Coastguard Worker        srshr           v0.8h, v0.8h, #2
392*c0909341SAndroid Build Coastguard Worker        srshr           v1.8h, v2.8h, #2
393*c0909341SAndroid Build Coastguard Worker    .else
394*c0909341SAndroid Build Coastguard Worker        sshr            v0.8h, v0.8h, #2
395*c0909341SAndroid Build Coastguard Worker        sshr            v1.8h, v2.8h, #2
396*c0909341SAndroid Build Coastguard Worker    .endif
397*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [\dst], #32
398*c0909341SAndroid Build Coastguard Worker.else   // put
399*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6
400*c0909341SAndroid Build Coastguard Worker        sqrshrun        v1.8b, v2.8h, #6
401*c0909341SAndroid Build Coastguard Worker        str             d0, [\dst]
402*c0909341SAndroid Build Coastguard Worker        str             d1, [\dst, \d_strd]
403*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
404*c0909341SAndroid Build Coastguard Worker.endif
405*c0909341SAndroid Build Coastguard Worker        b.gt            8b
406*c0909341SAndroid Build Coastguard Worker
407*c0909341SAndroid Build Coastguard Worker.ifc \type, put
408*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
409*c0909341SAndroid Build Coastguard Worker82:     // put only: tail of the 8-wide loop above; the branch that targets 82 is outside this view
410*c0909341SAndroid Build Coastguard Worker.endif  // prep defines no label here and simply falls through from the loop (b.gt 8b not taken)
411*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
412*c0909341SAndroid Build Coastguard Worker        ldr             d18, [\src]             // tail loads a single 8-byte source row
413*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0               // i8mm: the four accumulators start at zero
414*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
415*c0909341SAndroid Build Coastguard Worker        movi            v2.4s, #0
416*c0909341SAndroid Build Coastguard Worker        movi            v3.4s, #0
417*c0909341SAndroid Build Coastguard Worker        mov             v21.8b, v18.8b          // same row duplicated for the v20/v21 table pair
418*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
419*c0909341SAndroid Build Coastguard Worker        ldr             d21, [\src]             // tail loads a single 8-byte source row
420*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b          // accumulators start from v4 (set outside this view;
421*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b          // presumably a bias/rounding constant - TODO confirm)
422*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v4.16b
423*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v4.16b
424*c0909341SAndroid Build Coastguard Worker        sub             v18.16b, v21.16b, v5.16b // bytewise subtract of v5 (set outside this view;
425*c0909341SAndroid Build Coastguard Worker        sub             v21.16b, v21.16b, v5.16b // presumably 128 to map u8 into s8 range - TODO confirm)
426*c0909341SAndroid Build Coastguard Worker.endif
427*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b     // byte shuffles building the inputs of the
428*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b     // second output row; index vectors v6/v28/v29
429*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b    // are loaded outside this view (presumably they
430*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b    // shift the row window by one - TODO confirm)
431*c0909341SAndroid Build Coastguard Worker
432*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]        // first output row: each 32-bit lane accumulates
433*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]        // 4 + 4 byte products against v7 lanes 0 and 1
434*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]
435*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
436*c0909341SAndroid Build Coastguard Worker
437*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v22.16b, v7.4b[0]        // second output row, from the shuffled inputs
438*c0909341SAndroid Build Coastguard Worker        \dot            v2.4s, v23.16b, v7.4b[1]
439*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v25.16b, v7.4b[0]
440*c0909341SAndroid Build Coastguard Worker        \dot            v3.4s, v26.16b, v7.4b[1]
441*c0909341SAndroid Build Coastguard Worker
442*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h     // keep the low 16 bits of every 32-bit sum: 8 values per row
443*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8h, v2.8h, v3.8h
444*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
445*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
446*c0909341SAndroid Build Coastguard Worker        srshr           v0.8h, v0.8h, #2        // rounding shift right by 2
447*c0909341SAndroid Build Coastguard Worker        srshr           v1.8h, v2.8h, #2
448*c0909341SAndroid Build Coastguard Worker    .else   // neon_dotprod
449*c0909341SAndroid Build Coastguard Worker        sshr            v0.8h, v0.8h, #2        // plain shift (presumably rounding is already part of v4 - TODO confirm)
450*c0909341SAndroid Build Coastguard Worker        sshr            v1.8h, v2.8h, #2
451*c0909341SAndroid Build Coastguard Worker    .endif
452*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [\dst]          // two rows of 8 x 16-bit; no post-increment, this is the last store
453*c0909341SAndroid Build Coastguard Worker.else   // put
454*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6        // rounding shift right by 6, saturating narrow to unsigned 8-bit
455*c0909341SAndroid Build Coastguard Worker        sqrshrun        v1.8b, v2.8h, #6
456*c0909341SAndroid Build Coastguard Worker        str             d0, [\dst]              // first of the two final rows
457*c0909341SAndroid Build Coastguard Worker        str             d1, [\dst, \d_strd]     // second row, one destination stride further
458*c0909341SAndroid Build Coastguard Worker.endif
459*c0909341SAndroid Build Coastguard Worker        ret
460*c0909341SAndroid Build Coastguard Worker
461*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
462*c0909341SAndroid Build Coastguard Worker40:     // V - 4xN or 2xN (put only)
463*c0909341SAndroid Build Coastguard Worker.ifc \type, put
464*c0909341SAndroid Build Coastguard Worker        cmp             \w, #2
465*c0909341SAndroid Build Coastguard Worker        b.eq            20f                     // w == 2 is handled by the 2xN code at label 20
466*c0909341SAndroid Build Coastguard Worker.endif
467*c0909341SAndroid Build Coastguard Worker        ldr             s16, [\src]             // preload 8 source rows of 4 bytes each into s16..s23,
468*c0909341SAndroid Build Coastguard Worker        ldr             s17, [\src, \s_strd]    // stepping src by two strides after every pair
469*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
470*c0909341SAndroid Build Coastguard Worker        ldr             s18, [\src]
471*c0909341SAndroid Build Coastguard Worker        ldr             s19, [\src, \s_strd]
472*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
473*c0909341SAndroid Build Coastguard Worker
474*c0909341SAndroid Build Coastguard Worker        ldr             s20, [\src]
475*c0909341SAndroid Build Coastguard Worker        ldr             s21, [\src, \s_strd]
476*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
477*c0909341SAndroid Build Coastguard Worker        ldr             s22, [\src]
478*c0909341SAndroid Build Coastguard Worker        ldr             s23, [\src, \s_strd]
479*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
480*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2  // the tail at 42 emits the last two rows; for prep: sub is enough (flags are only read by put's b.eq)
481*c0909341SAndroid Build Coastguard Worker
482*c0909341SAndroid Build Coastguard Worker        zip1            v0.8b, v16.8b, v17.8b   // interleave the bytes of each pair of rows ...
483*c0909341SAndroid Build Coastguard Worker        zip1            v2.8b, v18.8b, v19.8b
484*c0909341SAndroid Build Coastguard Worker        zip1            v18.8b, v20.8b, v21.8b
485*c0909341SAndroid Build Coastguard Worker        zip1            v24.8b, v22.8b, v23.8b
486*c0909341SAndroid Build Coastguard Worker
487*c0909341SAndroid Build Coastguard Worker        zip1            v16.8h, v0.8h, v2.8h    // ... then the 16-bit pairs: every 32-bit lane of v16 holds
488*c0909341SAndroid Build Coastguard Worker        zip1            v17.8h, v18.8h, v24.8h  // one column of rows 0-3, every lane of v17 one column of rows 4-7
489*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
490*c0909341SAndroid Build Coastguard Worker        sub             v16.16b, v16.16b, v5.16b        // bytewise subtract of v5 (set outside this view;
491*c0909341SAndroid Build Coastguard Worker        sub             v17.16b, v17.16b, v5.16b        // presumably 128 to map u8 into s8 range - TODO confirm)
492*c0909341SAndroid Build Coastguard Worker.endif
493*c0909341SAndroid Build Coastguard Worker.ifc \type, put
494*c0909341SAndroid Build Coastguard Worker        b.eq            42f                     // h was exactly 2: skip the loop, go straight to the tail
495*c0909341SAndroid Build Coastguard Worker.endif
496*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
497*c0909341SAndroid Build Coastguard Worker4:      // main loop: two new source rows in, two output rows out per iteration
498*c0909341SAndroid Build Coastguard Worker        ldr             s18, [\src]             // next source row
499*c0909341SAndroid Build Coastguard Worker        ldr             s21, [\src, \s_strd]    // and the one after it
500*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
501*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
502*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0               // i8mm: accumulators start at zero
503*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
504*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
505*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b          // accumulators start from v4 (set outside this view;
506*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b          // presumably a bias/rounding constant - TODO confirm)
507*c0909341SAndroid Build Coastguard Worker        sub             v18.16b, v18.16b, v5.16b        // same v5 adjustment as for the preloaded rows
508*c0909341SAndroid Build Coastguard Worker        sub             v21.16b, v21.16b, v5.16b
509*c0909341SAndroid Build Coastguard Worker.endif
510*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b     // inputs of the second output row; v6/v28 index
511*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b    // tables are loaded outside this view (presumably a one-row shift - TODO confirm)
512*c0909341SAndroid Build Coastguard Worker
513*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]        // first output row: 4 + 4 byte products per 32-bit lane
514*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]
515*c0909341SAndroid Build Coastguard Worker
516*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b     // same shuffle applied once more: v16/v17 become the
517*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b    // window for the next iteration (or for the tail)
518*c0909341SAndroid Build Coastguard Worker
519*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]        // second output row
520*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
521*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
522*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
523*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
524*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #2        // narrow 32 -> 16 bit with a rounding shift by 2
525*c0909341SAndroid Build Coastguard Worker        rshrn2          v0.8h, v1.4s, #2
526*c0909341SAndroid Build Coastguard Worker    .else   // neon_dotprod
527*c0909341SAndroid Build Coastguard Worker        shrn            v0.4h, v0.4s, #2        // narrow with a plain shift (presumably rounding is already part of v4 - TODO confirm)
528*c0909341SAndroid Build Coastguard Worker        shrn2           v0.8h, v1.4s, #2
529*c0909341SAndroid Build Coastguard Worker    .endif
530*c0909341SAndroid Build Coastguard Worker        str             q0, [\dst], #16         // two rows of 4 x 16-bit, stored back to back
531*c0909341SAndroid Build Coastguard Worker.else   // put
532*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h     // keep the low 16 bits of every 32-bit sum
533*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6        // rounding shift right by 6, saturating narrow to unsigned 8-bit
534*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
535*c0909341SAndroid Build Coastguard Worker        fmov            x8, d0                  // x8 = both rows: bytes 0-3 first row, bytes 4-7 second row
536*c0909341SAndroid Build Coastguard Worker        lsr             x9, x8, #32             // x9 = second row in its low 32 bits
537*c0909341SAndroid Build Coastguard Worker        str             w8, [\dst]
538*c0909341SAndroid Build Coastguard Worker        str             w9, [\dst, \d_strd]
539*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
540*c0909341SAndroid Build Coastguard Worker.endif
541*c0909341SAndroid Build Coastguard Worker        b.gt            4b                      // flags from the subs above; mov/str/add do not touch them
542*c0909341SAndroid Build Coastguard Worker
543*c0909341SAndroid Build Coastguard Worker.ifc \type, put
544*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
545*c0909341SAndroid Build Coastguard Worker42:     // put only label: tail, entered by fallthrough or by the b.eq 42f above
546*c0909341SAndroid Build Coastguard Worker.endif
547*c0909341SAndroid Build Coastguard Worker        ldr             s18, [\src]             // tail loads only one more source row
548*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
549*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0
550*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
551*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
552*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b
553*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b
554*c0909341SAndroid Build Coastguard Worker        sub             v18.16b, v18.16b, v5.16b
555*c0909341SAndroid Build Coastguard Worker.endif
556*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b     // inputs of the last output row
557*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
558*c0909341SAndroid Build Coastguard Worker
559*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]        // second-to-last output row
560*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]
561*c0909341SAndroid Build Coastguard Worker
562*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]        // last output row
563*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
564*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
565*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
566*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #2
567*c0909341SAndroid Build Coastguard Worker        rshrn2          v0.8h, v1.4s, #2
568*c0909341SAndroid Build Coastguard Worker    .else   // neon_dotprod
569*c0909341SAndroid Build Coastguard Worker        shrn            v0.4h, v0.4s, #2
570*c0909341SAndroid Build Coastguard Worker        shrn2           v0.8h, v1.4s, #2
571*c0909341SAndroid Build Coastguard Worker    .endif
572*c0909341SAndroid Build Coastguard Worker        str             q0, [\dst]              // final store, no post-increment
573*c0909341SAndroid Build Coastguard Worker.else   // put
574*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
575*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6
576*c0909341SAndroid Build Coastguard Worker        fmov            x8, d0                  // bytes 0-3: first row, bytes 4-7: second row
577*c0909341SAndroid Build Coastguard Worker        lsr             x9, x8, #32
578*c0909341SAndroid Build Coastguard Worker        str             w8, [\dst]
579*c0909341SAndroid Build Coastguard Worker        str             w9, [\dst, \d_strd]
580*c0909341SAndroid Build Coastguard Worker.endif
581*c0909341SAndroid Build Coastguard Worker        ret
582*c0909341SAndroid Build Coastguard Worker
583*c0909341SAndroid Build Coastguard Worker.ifc \type, put
584*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
585*c0909341SAndroid Build Coastguard Worker20:     // V - 2xN
586*c0909341SAndroid Build Coastguard Worker        ldr             h16, [\src]             // preload 8 source rows of 2 bytes each into h16..h23,
587*c0909341SAndroid Build Coastguard Worker        ldr             h17, [\src, \s_strd]    // stepping src by two strides after every pair
588*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
589*c0909341SAndroid Build Coastguard Worker        ldr             h18, [\src]
590*c0909341SAndroid Build Coastguard Worker        ldr             h19, [\src, \s_strd]
591*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
592*c0909341SAndroid Build Coastguard Worker
593*c0909341SAndroid Build Coastguard Worker        ldr             h20, [\src]
594*c0909341SAndroid Build Coastguard Worker        ldr             h21, [\src, \s_strd]
595*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
596*c0909341SAndroid Build Coastguard Worker        ldr             h22, [\src]
597*c0909341SAndroid Build Coastguard Worker        ldr             h23, [\src, \s_strd]
598*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
599*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2              // the tail at 22 emits the last two rows; flags feed the b.eq 22f below
600*c0909341SAndroid Build Coastguard Worker
601*c0909341SAndroid Build Coastguard Worker        zip1            v0.8b, v16.8b, v17.8b   // interleave the bytes of each pair of rows ...
602*c0909341SAndroid Build Coastguard Worker        zip1            v2.8b, v18.8b, v19.8b
603*c0909341SAndroid Build Coastguard Worker        zip1            v18.8b, v20.8b, v21.8b
604*c0909341SAndroid Build Coastguard Worker        zip1            v24.8b, v22.8b, v23.8b
605*c0909341SAndroid Build Coastguard Worker
606*c0909341SAndroid Build Coastguard Worker        zip1            v16.4h, v0.4h, v2.4h    // ... then the 16-bit pairs: the two low 32-bit lanes hold the two
607*c0909341SAndroid Build Coastguard Worker        zip1            v17.4h, v18.4h, v24.4h  // columns (rows 0-3 in v16, rows 4-7 in v17); 64-bit write clears the upper half
608*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
609*c0909341SAndroid Build Coastguard Worker        sub             v16.8b, v16.8b, v5.8b   // bytewise subtract of v5 (set outside this view;
610*c0909341SAndroid Build Coastguard Worker        sub             v17.8b, v17.8b, v5.8b   // presumably 128 to map u8 into s8 range - TODO confirm)
611*c0909341SAndroid Build Coastguard Worker    .endif
612*c0909341SAndroid Build Coastguard Worker        b.eq            22f                     // h was exactly 2: skip the loop, go straight to the tail
613*c0909341SAndroid Build Coastguard Worker
614*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
615*c0909341SAndroid Build Coastguard Worker2:      // main loop: two new source rows in, two output rows out per iteration
616*c0909341SAndroid Build Coastguard Worker        ldr             h18, [\src]             // next source row
617*c0909341SAndroid Build Coastguard Worker        ldr             h21, [\src, \s_strd]    // and the one after it
618*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
619*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
620*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0               // i8mm: accumulators start at zero
621*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
622*c0909341SAndroid Build Coastguard Worker    .else   // neon_dotprod
623*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b          // accumulators start from v4 (set outside this view;
624*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b          // presumably a bias/rounding constant - TODO confirm)
625*c0909341SAndroid Build Coastguard Worker        sub             v18.8b, v18.8b, v5.8b   // same v5 adjustment as for the preloaded rows
626*c0909341SAndroid Build Coastguard Worker        sub             v21.8b, v21.8b, v5.8b
627*c0909341SAndroid Build Coastguard Worker    .endif
628*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b     // inputs of the second output row; v6/v28 index
629*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b    // tables are loaded outside this view (presumably a one-row shift - TODO confirm)
630*c0909341SAndroid Build Coastguard Worker
631*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]        // first output row: 4 + 4 byte products per 32-bit lane
632*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]
633*c0909341SAndroid Build Coastguard Worker
634*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b     // same shuffle applied once more: v16/v17 become the
635*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b    // window for the next iteration (or for the tail)
636*c0909341SAndroid Build Coastguard Worker
637*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]        // second output row
638*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
639*c0909341SAndroid Build Coastguard Worker
640*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h     // keep the low 16 bits of every 32-bit sum
641*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6        // rounding shift right by 6, saturating narrow to unsigned 8-bit
642*c0909341SAndroid Build Coastguard Worker
643*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
644*c0909341SAndroid Build Coastguard Worker        fmov            x8, d0                  // bytes 0-1: first row, bytes 4-5: second row
645*c0909341SAndroid Build Coastguard Worker        lsr             x9, x8, #32             // bring the second row down to bits 0-15
646*c0909341SAndroid Build Coastguard Worker        strh            w8, [\dst]              // only the two valid pixels of each row are stored
647*c0909341SAndroid Build Coastguard Worker        strh            w9, [\dst, \d_strd]
648*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
649*c0909341SAndroid Build Coastguard Worker        b.gt            2b                      // flags from the subs above; fmov/lsr/strh/add do not touch them
650*c0909341SAndroid Build Coastguard Worker
651*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
652*c0909341SAndroid Build Coastguard Worker22:     // tail: last two output rows, entered by fallthrough or by the b.eq 22f above
653*c0909341SAndroid Build Coastguard Worker        ldr             h18, [\src]             // tail loads only one more source row
654*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
655*c0909341SAndroid Build Coastguard Worker        movi            v0.4s, #0
656*c0909341SAndroid Build Coastguard Worker        movi            v1.4s, #0
657*c0909341SAndroid Build Coastguard Worker    .else   // neon_dotprod
658*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v4.16b
659*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v4.16b
660*c0909341SAndroid Build Coastguard Worker        sub             v18.8b, v18.8b, v5.8b
661*c0909341SAndroid Build Coastguard Worker    .endif
662*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b     // inputs of the last output row
663*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
664*c0909341SAndroid Build Coastguard Worker
665*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v16.16b, v7.4b[0]        // second-to-last output row
666*c0909341SAndroid Build Coastguard Worker        \dot            v0.4s, v17.16b, v7.4b[1]
667*c0909341SAndroid Build Coastguard Worker
668*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v19.16b, v7.4b[0]        // last output row
669*c0909341SAndroid Build Coastguard Worker        \dot            v1.4s, v20.16b, v7.4b[1]
670*c0909341SAndroid Build Coastguard Worker
671*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
672*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #6
673*c0909341SAndroid Build Coastguard Worker
674*c0909341SAndroid Build Coastguard Worker        fmov            x8, d0                  // bytes 0-1: first row, bytes 4-5: second row
675*c0909341SAndroid Build Coastguard Worker        lsr             x9, x8, #32
676*c0909341SAndroid Build Coastguard Worker        strh            w8, [\dst]
677*c0909341SAndroid Build Coastguard Worker        strh            w9, [\dst, \d_strd]
678*c0909341SAndroid Build Coastguard Worker        ret
679*c0909341SAndroid Build Coastguard Worker.endif
680*c0909341SAndroid Build Coastguard Worker
681*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
682*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_h_hv_\isa):     // common setup for the H-only and HV paths
683*c0909341SAndroid Build Coastguard Worker        madd            \mx, \mx, w11, w9       // mx = mx * w11 + w9 (w9/w10/w11 are set up outside this view)
684*c0909341SAndroid Build Coastguard Worker        madd            w14, \my, w11, w10      // w14 = my * w11 + w10, for HV
685*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
686*c0909341SAndroid Build Coastguard Worker        mov             w13, #0x2002            // FILTER_WEIGHT * 128 + rounding
687*c0909341SAndroid Build Coastguard Worker        dup             v27.4s, w13             // put H overrides this
688*c0909341SAndroid Build Coastguard Worker.endif
689*c0909341SAndroid Build Coastguard Worker        movrel          x13, h_tbl_neon_dotprod // x13 = base of the shuffle-index tables
690*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, #3          // src - 3
691*c0909341SAndroid Build Coastguard Worker        ldr             q28, [x13]              // for 4-tap & 8-tap H filters
692*c0909341SAndroid Build Coastguard Worker        ubfx            w15, \mx, #7, #7        // w15 = bits 7..13 of mx
693*c0909341SAndroid Build Coastguard Worker        and             \mx, \mx, #0x7F         // mx  = bits 0..6 of mx
694*c0909341SAndroid Build Coastguard Worker        ubfx            w11, w14, #7, #7        // for HV
695*c0909341SAndroid Build Coastguard Worker        and             w14, w14, #0x7F         // for HV
696*c0909341SAndroid Build Coastguard Worker        cmp             \w, #4
697*c0909341SAndroid Build Coastguard Worker        csel            \mx, \mx, w15, le       // w <= 4 keeps the low field, wider blocks use the upper field
698*c0909341SAndroid Build Coastguard Worker        add             \xmx, x12, \xmx, lsl #3 // subpel H filter address
699*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
700*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #128           // subtracted from the source bytes in the HV loop below
701*c0909341SAndroid Build Coastguard Worker.endif
702*c0909341SAndroid Build Coastguard Worker        cbz             \my, L(\type\()_8tap_h_\isa)    // my == 0: horizontal-only path
703*c0909341SAndroid Build Coastguard Worker
704*c0909341SAndroid Build Coastguard Worker        // HV cases
705*c0909341SAndroid Build Coastguard Worker        cmp             \h, #4
706*c0909341SAndroid Build Coastguard Worker        csel            w14, w14, w11, le       // h <= 4 keeps the low field, taller blocks use the upper field
707*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 3
708*c0909341SAndroid Build Coastguard Worker        add             \xmy, x12, x14, lsl #3      // subpel V filter address
709*c0909341SAndroid Build Coastguard Worker        mov             x15, x30                // keep the return address: the bl calls in the HV code overwrite x30
710*c0909341SAndroid Build Coastguard Worker        ldr             d7, [\xmy]              // 8 bytes of vertical filter taps
711*c0909341SAndroid Build Coastguard Worker.ifc \type, put
712*c0909341SAndroid Build Coastguard Worker        ldr             q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
713*c0909341SAndroid Build Coastguard Worker.endif                                                 // of 32b values to 8b
714*c0909341SAndroid Build Coastguard Worker        sxtl            v7.8h, v7.8b            // widen the taps to 16-bit for the smull/smlal by-element multiplies
715*c0909341SAndroid Build Coastguard Worker        cmp             w10, #SHARP1
716*c0909341SAndroid Build Coastguard Worker        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1
717*c0909341SAndroid Build Coastguard Worker
718*c0909341SAndroid Build Coastguard Worker        // HV 8-tap cases
719*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd         // src - s_strd * 3 - 3
720*c0909341SAndroid Build Coastguard Worker        cmp             \w, #4
721*c0909341SAndroid Build Coastguard Worker        b.eq            40f                     // forward reference: the next 40 label after this point, not the V one above
722*c0909341SAndroid Build Coastguard Worker.ifc \type, put
723*c0909341SAndroid Build Coastguard Worker        b.lt            20f                     // w < 4 (put only); likewise a forward reference
724*c0909341SAndroid Build Coastguard Worker.endif
725*c0909341SAndroid Build Coastguard Worker
726*c0909341SAndroid Build Coastguard Worker        // .align JUMP_ALIGN    // fallthrough
727*c0909341SAndroid Build Coastguard Worker80:     // HV8 - 8xN+
728*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #16]
729*c0909341SAndroid Build Coastguard Worker        ldr             d26, [\xmx]
730*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
731*c0909341SAndroid Build Coastguard Worker        add             \wd_strd, \w, \w
732*c0909341SAndroid Build Coastguard Worker.endif
733*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
734*c0909341SAndroid Build Coastguard Worker81:
735*c0909341SAndroid Build Coastguard Worker        mov             \lsrc, \src
736*c0909341SAndroid Build Coastguard Worker        mov             \ldst, \dst
737*c0909341SAndroid Build Coastguard Worker        mov             w8, \h
738*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
739*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
740*c0909341SAndroid Build Coastguard Worker        srshr           v16.8h, v22.8h, #2
741*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
742*c0909341SAndroid Build Coastguard Worker        srshr           v17.8h, v22.8h, #2
743*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
744*c0909341SAndroid Build Coastguard Worker        srshr           v18.8h, v22.8h, #2
745*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
746*c0909341SAndroid Build Coastguard Worker        srshr           v19.8h, v22.8h, #2
747*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
748*c0909341SAndroid Build Coastguard Worker        srshr           v20.8h, v22.8h, #2
749*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
750*c0909341SAndroid Build Coastguard Worker        srshr           v21.8h, v22.8h, #2
751*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
752*c0909341SAndroid Build Coastguard Worker        srshr           v22.8h, v22.8h, #2
753*c0909341SAndroid Build Coastguard Worker.else
754*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
755*c0909341SAndroid Build Coastguard Worker        sshr            v16.8h, v22.8h, #2
756*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
757*c0909341SAndroid Build Coastguard Worker        sshr            v17.8h, v22.8h, #2
758*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
759*c0909341SAndroid Build Coastguard Worker        sshr            v18.8h, v22.8h, #2
760*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
761*c0909341SAndroid Build Coastguard Worker        sshr            v19.8h, v22.8h, #2
762*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
763*c0909341SAndroid Build Coastguard Worker        sshr            v20.8h, v22.8h, #2
764*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
765*c0909341SAndroid Build Coastguard Worker        sshr            v21.8h, v22.8h, #2
766*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
767*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h, v22.8h, #2
768*c0909341SAndroid Build Coastguard Worker.endif
769*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
770*c0909341SAndroid Build Coastguard Worker8:
771*c0909341SAndroid Build Coastguard Worker        ldr             q23, [\lsrc]
772*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd
773*c0909341SAndroid Build Coastguard Worker
774*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v16.4h, v7.h[0]
775*c0909341SAndroid Build Coastguard Worker        smull2          v1.4s, v16.8h, v7.h[0]
776*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
777*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
778*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
779*c0909341SAndroid Build Coastguard Worker        movi            v6.4s, #0
780*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v23.16b}, v28.16b
781*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v23.16b}, v29.16b
782*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
783*c0909341SAndroid Build Coastguard Worker        sub             v23.16b, v23.16b, v24.16b
784*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
785*c0909341SAndroid Build Coastguard Worker        mov             v6.16b, v27.16b
786*c0909341SAndroid Build Coastguard Worker.endif
787*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v17.4h, v7.h[1]
788*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v17.8h, v7.h[1]
789*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
790*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v23.16b}, v30.16b
791*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
792*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
793*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
794*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v23.16b}, v28.16b
795*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v23.16b}, v29.16b
796*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v23.16b}, v30.16b
797*c0909341SAndroid Build Coastguard Worker.endif
798*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[2]
799*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v18.8h, v7.h[2]
800*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
801*c0909341SAndroid Build Coastguard Worker
802*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v2.16b, v26.4b[0]
803*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v3.16b, v26.4b[0]
804*c0909341SAndroid Build Coastguard Worker
805*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[3]
806*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v19.8h, v7.h[3]
807*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
808*c0909341SAndroid Build Coastguard Worker
809*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v3.16b, v26.4b[1]
810*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v4.16b, v26.4b[1]
811*c0909341SAndroid Build Coastguard Worker
812*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[4]
813*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v20.8h, v7.h[4]
814*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v21.16b
815*c0909341SAndroid Build Coastguard Worker
816*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v21.4h, v7.h[5]
817*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v21.8h, v7.h[5]
818*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
819*c0909341SAndroid Build Coastguard Worker        uzp1            v23.8h, v5.8h, v6.8h
820*c0909341SAndroid Build Coastguard Worker.endif
821*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v22.16b
822*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v22.4h, v7.h[6]
823*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v22.8h, v7.h[6]
824*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
825*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #1
826*c0909341SAndroid Build Coastguard Worker.endif
827*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
828*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
829*c0909341SAndroid Build Coastguard Worker        srshr           v22.8h, v23.8h, #2
830*c0909341SAndroid Build Coastguard Worker    .else
831*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h, v23.8h, #2
832*c0909341SAndroid Build Coastguard Worker    .endif
833*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v22.4h, v7.h[7]
834*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v22.8h, v7.h[7]
835*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #6
836*c0909341SAndroid Build Coastguard Worker        rshrn2          v0.8h, v1.4s, #6
837*c0909341SAndroid Build Coastguard Worker.else   // put
838*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
839*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h, v5.4s, #2
840*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h, v6.4s, #2
841*c0909341SAndroid Build Coastguard Worker    .else
842*c0909341SAndroid Build Coastguard Worker        shrn            v22.4h, v5.4s, #2
843*c0909341SAndroid Build Coastguard Worker        shrn2           v22.8h, v6.4s, #2
844*c0909341SAndroid Build Coastguard Worker    .endif
845*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v22.4h, v7.h[7]
846*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v22.8h, v7.h[7]
847*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
848*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #2
849*c0909341SAndroid Build Coastguard Worker.endif
850*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
851*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #1
852*c0909341SAndroid Build Coastguard Worker.endif
853*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
854*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [\ldst], \d_strd
855*c0909341SAndroid Build Coastguard Worker        b.gt            8b
856*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
857*c0909341SAndroid Build Coastguard Worker.else
858*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b}, [\ldst], \d_strd
859*c0909341SAndroid Build Coastguard Worker        b.gt            8b
860*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #8
861*c0909341SAndroid Build Coastguard Worker.endif
862*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #8
863*c0909341SAndroid Build Coastguard Worker        subs            \w, \w, #8
864*c0909341SAndroid Build Coastguard Worker        b.gt            81b
865*c0909341SAndroid Build Coastguard Worker        ret             x15
866*c0909341SAndroid Build Coastguard Worker
867*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
868*c0909341SAndroid Build Coastguard Worker40:     // HV8 - 4xN
        // 4-pixel-wide path of the HV filter with 8 vertical taps (shared by
        // the put and prep expansions of the macro).
        // Seven horizontally filtered rows are kept in v16-v22 as a sliding
        // window; every loop iteration filters one more source row and
        // combines the eight rows with the coefficients in v7.h[0..7].
        // v7, v24, v25, v27 and v28 are read but not written here, so they
        // must be set up before this label (outside this view).
        // Returns to the address held in x15.

        // Load 4 filter bytes from \xmx + 2 and step \src forward by 2 bytes.
        // NOTE(review): presumably selects the middle four horizontal taps and
        // re-centres the source pointer to match - confirm against the caller.
869*c0909341SAndroid Build Coastguard Worker        ldur            s26, [\xmx, #2]
870*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #2
871*c0909341SAndroid Build Coastguard Worker
        // Prime the window: each helper call (defined outside this view)
        // leaves its 32-bit sums in v22.4s, which are narrowed by >> 2 into
        // the next row register.
872*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
873*c0909341SAndroid Build Coastguard Worker        shrn            v16.4h, v22.4s, #2
874*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
875*c0909341SAndroid Build Coastguard Worker        shrn            v17.4h, v22.4s, #2
876*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
877*c0909341SAndroid Build Coastguard Worker        shrn            v18.4h, v22.4s, #2
878*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
879*c0909341SAndroid Build Coastguard Worker        shrn            v19.4h, v22.4s, #2
880*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
881*c0909341SAndroid Build Coastguard Worker        shrn            v20.4h, v22.4s, #2
882*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
883*c0909341SAndroid Build Coastguard Worker        shrn            v21.4h, v22.4s, #2
884*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
885*c0909341SAndroid Build Coastguard Worker        shrn            v22.4h, v22.4s, #2
886*c0909341SAndroid Build Coastguard Worker
887*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
888*c0909341SAndroid Build Coastguard Worker4:
        // One output row per iteration. The vertical multiply-accumulates
        // are interleaved with the horizontal filtering of the new row and
        // with the register moves that slide the window down by one row.
        // Read 8 source bytes and advance \src by one row.
889*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [\src], \s_strd
890*c0909341SAndroid Build Coastguard Worker
891*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v16.4h, v7.h[0]
892*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v17.4h, v7.h[1]
893*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
894*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
895*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
        // dotprod only: subtract v24 from every input byte.
        // NOTE(review): presumably re-biases unsigned pixels for a signed dot
        // product, with v27 carrying the matching correction - confirm.
896*c0909341SAndroid Build Coastguard Worker        sub             v4.16b, v4.16b, v24.16b
897*c0909341SAndroid Build Coastguard Worker.endif
898*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[2]
899*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[3]
        // Permute the source bytes with the index vector in v28 so that each
        // 32-bit lane holds the four bytes consumed by one dot product.
900*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v4.16b}, v28.16b
        // Horizontal accumulator start value: zero for i8mm, v27 otherwise.
901*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
902*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
903*c0909341SAndroid Build Coastguard Worker.else
904*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
905*c0909341SAndroid Build Coastguard Worker.endif
906*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
907*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
908*c0909341SAndroid Build Coastguard Worker
909*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[4]
910*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v21.4h, v7.h[5]
911*c0909341SAndroid Build Coastguard Worker
        // Horizontal 4-tap dot product of the new row; \dot is the
        // dot-product mnemonic passed in as a macro argument.
912*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v2.16b, v26.4b[0]
913*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v21.16b
914*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v22.16b
915*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v22.4h, v7.h[6]
        // Narrow the new row by >> 2 into v22 (the old v22 was consumed by
        // the smlal just above): rounding narrow for i8mm, plain narrow for
        // dotprod.
916*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
917*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h, v5.4s, #2
918*c0909341SAndroid Build Coastguard Worker.else
919*c0909341SAndroid Build Coastguard Worker        shrn            v22.4h, v5.4s, #2
920*c0909341SAndroid Build Coastguard Worker.endif
921*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v22.4h, v7.h[7]
922*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
        // prep: round-narrow by 6 to four 16-bit values and store them
        // contiguously (\dst advances by 8 bytes).
923*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #6
924*c0909341SAndroid Build Coastguard Worker        str             d0, [\dst], #8
925*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
926*c0909341SAndroid Build Coastguard Worker.else
        // put: the flags set by subs survive the SIMD/store/add instructions
        // below and are consumed by the b.gt at the end of the loop.
927*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
        // Gather bytes of the 32-bit sums using the indices in v25, then
        // saturating round-narrow by 2 to unsigned bytes; 4 bytes are stored
        // and \dst advances by \d_strd.
        // NOTE(review): v25 presumably selects the upper halfword of each
        // sum - confirm where v25 is loaded.
928*c0909341SAndroid Build Coastguard Worker        tbl             v0.8b, {v0.16b}, v25.8b
929*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #2
930*c0909341SAndroid Build Coastguard Worker        str             s0, [\dst]
931*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd
932*c0909341SAndroid Build Coastguard Worker.endif
        // Loop while rows remain (\h > 0), then return through x15.
933*c0909341SAndroid Build Coastguard Worker        b.gt            4b
934*c0909341SAndroid Build Coastguard Worker        ret             x15
935*c0909341SAndroid Build Coastguard Worker
936*c0909341SAndroid Build Coastguard Worker.ifc \type, put
        // 2-pixel-wide path of the HV filter with 8 vertical taps. Only
        // assembled for the put expansion of the macro.
        // Same structure as the 4xN path: v16-v22 hold a seven-row sliding
        // window, v7.h[0..7] are the vertical coefficients, and the only
        // difference in the output stage is that 2 bytes are stored per row.
        // v7, v24, v25, v27 and v28 are set up outside this view.
        // Returns to the address held in x15.
937*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
938*c0909341SAndroid Build Coastguard Worker20:     // HV8 - 2xN
        // Load 4 filter bytes from \xmx + 2 and step \src forward by 2 bytes.
        // NOTE(review): presumably the middle four horizontal taps - confirm.
939*c0909341SAndroid Build Coastguard Worker        ldur            s26, [\xmx, #2]
940*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #2
941*c0909341SAndroid Build Coastguard Worker
        // Prime the window with seven rows; the helper (defined outside this
        // view) leaves 32-bit sums in v22.4s, narrowed here by >> 2.
942*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
943*c0909341SAndroid Build Coastguard Worker        shrn            v16.4h, v22.4s, #2
944*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
945*c0909341SAndroid Build Coastguard Worker        shrn            v17.4h, v22.4s, #2
946*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
947*c0909341SAndroid Build Coastguard Worker        shrn            v18.4h, v22.4s, #2
948*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
949*c0909341SAndroid Build Coastguard Worker        shrn            v19.4h, v22.4s, #2
950*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
951*c0909341SAndroid Build Coastguard Worker        shrn            v20.4h, v22.4s, #2
952*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
953*c0909341SAndroid Build Coastguard Worker        shrn            v21.4h, v22.4s, #2
954*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
955*c0909341SAndroid Build Coastguard Worker        shrn            v22.4h, v22.4s, #2
956*c0909341SAndroid Build Coastguard Worker
957*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
958*c0909341SAndroid Build Coastguard Worker2:
        // One output row per iteration: read 8 source bytes, advance \src by
        // one row, and interleave the vertical accumulation with the
        // horizontal filtering of the new row and the window shift.
959*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [\src], \s_strd
960*c0909341SAndroid Build Coastguard Worker
961*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v16.4h, v7.h[0]
962*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v17.4h, v7.h[1]
963*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
964*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
965*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
        // dotprod only: subtract v24 from every input byte.
        // NOTE(review): presumably a signed-range bias compensated through
        // v27 - confirm.
966*c0909341SAndroid Build Coastguard Worker        sub             v4.16b, v4.16b, v24.16b
967*c0909341SAndroid Build Coastguard Worker    .endif
968*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[2]
969*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[3]
        // Arrange the source bytes for the dot product using the indices in
        // v28.
970*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v4.16b}, v28.16b
        // Horizontal accumulator start value: zero for i8mm, v27 otherwise.
971*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
972*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
973*c0909341SAndroid Build Coastguard Worker    .else
974*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
975*c0909341SAndroid Build Coastguard Worker    .endif
976*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
977*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
978*c0909341SAndroid Build Coastguard Worker
979*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[4]
980*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v21.4h, v7.h[5]
981*c0909341SAndroid Build Coastguard Worker
        // Horizontal 4-tap dot product of the new row (\dot is a macro
        // argument naming the dot-product instruction).
982*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v2.16b, v26.4b[0]
983*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v21.16b
984*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v22.16b
985*c0909341SAndroid Build Coastguard Worker
986*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v22.4h, v7.h[6]
        // Narrow the new row by >> 2 into v22 (rounding for i8mm only); it is
        // then used as the eighth row of the vertical filter.
987*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
988*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h, v5.4s, #2
989*c0909341SAndroid Build Coastguard Worker    .else
990*c0909341SAndroid Build Coastguard Worker        shrn            v22.4h, v5.4s, #2
991*c0909341SAndroid Build Coastguard Worker    .endif
992*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v22.4h, v7.h[7]
        // Flags from subs are consumed by the b.gt below; none of the
        // instructions in between modify them.
993*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
994*c0909341SAndroid Build Coastguard Worker
        // Gather bytes of the sums using the indices in v25, then saturating
        // round-narrow by 2 to unsigned bytes.
995*c0909341SAndroid Build Coastguard Worker        tbl             v0.8b, {v0.16b}, v25.8b
996*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #2
997*c0909341SAndroid Build Coastguard Worker
        // Store 2 output bytes and advance \dst by one destination row.
998*c0909341SAndroid Build Coastguard Worker        str             h0, [\dst]
999*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd
1000*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1001*c0909341SAndroid Build Coastguard Worker        ret             x15
1002*c0909341SAndroid Build Coastguard Worker.endif
1003*c0909341SAndroid Build Coastguard Worker
1004*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1005*c0909341SAndroid Build Coastguard WorkerL(\type\()_6tap_hv_\isa):
        // Entry point of the HV filter with 6 vertical taps. Dispatches on
        // the block width \w:
        //   \w == 4          -> 40f
        //   \w <  4 (put)    -> 20f
        //   otherwise        -> falls through to the 8xN+ code below
        // The 40f/20f targets lie beyond the part of the file shown here.
        // The 8xN+ code processes the block in 8-pixel-wide columns and
        // returns to the address held in x15. v7, v24, v25, v27 and v28 are
        // expected to be set up by code outside this view.
1006*c0909341SAndroid Build Coastguard Worker        cmp             \w, #4
1007*c0909341SAndroid Build Coastguard Worker        b.eq            40f
1008*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1009*c0909341SAndroid Build Coastguard Worker        b.lt            20f
1010*c0909341SAndroid Build Coastguard Worker.endif
1011*c0909341SAndroid Build Coastguard Worker
1012*c0909341SAndroid Build Coastguard Worker        // .align JUMP_ALIGN    // fallthrough
1013*c0909341SAndroid Build Coastguard Worker80:     // HV6 - 8xN+
        // Load the 8 horizontal filter bytes.
1014*c0909341SAndroid Build Coastguard Worker        ldr             d26, [\xmx]
1015*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
        // prep: destination row pitch in bytes = 2 * \w (16-bit outputs).
1016*c0909341SAndroid Build Coastguard Worker        add             \wd_strd, \w, \w
1017*c0909341SAndroid Build Coastguard Worker.endif
1018*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
        // i8mm only: when w9 equals SHARP1 take the generic code at 88:,
        // which filters horizontally with all 8 taps; otherwise use the
        // USMMLA-based code that follows.
1019*c0909341SAndroid Build Coastguard Worker        cmp             w9, #SHARP1
1020*c0909341SAndroid Build Coastguard Worker        b.eq            88f             // horizontal == SHARP1
1021*c0909341SAndroid Build Coastguard Worker
        // Load the two TBL index vectors for the USMMLA layout from the
        // table at x13, then build the filter matrix: the low half of v26
        // keeps the 8 filter bytes, the high half receives the same bytes
        // rotated by one position (byte 7 wraps to byte 0).
1022*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
1023*c0909341SAndroid Build Coastguard Worker        ext             v0.8b, v26.8b, v26.8b, #7
1024*c0909341SAndroid Build Coastguard Worker        ins             v26.d[1], v0.d[0]
1025*c0909341SAndroid Build Coastguard Worker
1026*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1027*c0909341SAndroid Build Coastguard Worker81:
        // Outer loop: one 8-pixel-wide column per iteration. Work on copies
        // of the row pointers and of the height so that \src, \dst and \h
        // stay valid for the next column.
1028*c0909341SAndroid Build Coastguard Worker        mov             \lsrc, \src
1029*c0909341SAndroid Build Coastguard Worker        mov             \ldst, \dst
1030*c0909341SAndroid Build Coastguard Worker        mov             w8, \h
1031*c0909341SAndroid Build Coastguard Worker
        // Prime the five-row window v16-v20; each helper call returns eight
        // 16-bit sums in v22, scaled here by a rounding >> 2.
1032*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter6_neon_i8mm)
1033*c0909341SAndroid Build Coastguard Worker        srshr           v16.8h, v22.8h, #2
1034*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter6_neon_i8mm)
1035*c0909341SAndroid Build Coastguard Worker        srshr           v17.8h, v22.8h, #2
1036*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter6_neon_i8mm)
1037*c0909341SAndroid Build Coastguard Worker        srshr           v18.8h, v22.8h, #2
1038*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter6_neon_i8mm)
1039*c0909341SAndroid Build Coastguard Worker        srshr           v19.8h, v22.8h, #2
1040*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter6_neon_i8mm)
1041*c0909341SAndroid Build Coastguard Worker        srshr           v20.8h, v22.8h, #2
1042*c0909341SAndroid Build Coastguard Worker
1043*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1044*c0909341SAndroid Build Coastguard Worker8:
        // Inner loop: one output row of 8 pixels per iteration. Read 16
        // source bytes and advance \lsrc by one row.
1045*c0909341SAndroid Build Coastguard Worker        ld1             {v23.16b}, [\lsrc], \s_strd
1046*c0909341SAndroid Build Coastguard Worker
        // Vertical 6-tap filter: rows v16-v20 plus the new row are weighted
        // with v7.h[1..6] (v7.h[0] and v7.h[7] are not used on this path).
        // The accumulation is interleaved with the horizontal filtering of
        // the new row and with the window shift.
1047*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v16.4h, v7.h[1]
1048*c0909341SAndroid Build Coastguard Worker        smull2          v1.4s, v16.8h, v7.h[1]
1049*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
1050*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
1051*c0909341SAndroid Build Coastguard Worker        movi            v6.4s, #0
1052*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v23.16b}, v29.16b
1053*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v23.16b}, v30.16b
1054*c0909341SAndroid Build Coastguard Worker
1055*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v17.4h, v7.h[2]
1056*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v17.8h, v7.h[2]
1057*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
1058*c0909341SAndroid Build Coastguard Worker
        // Each USMMLA multiplies two 8-byte pixel windows (unsigned) by the
        // two filter rows in v26 (signed), producing four 32-bit sums.
1059*c0909341SAndroid Build Coastguard Worker        usmmla          v5.4s, v2.16b, v26.16b
1060*c0909341SAndroid Build Coastguard Worker        usmmla          v6.4s, v3.16b, v26.16b
1061*c0909341SAndroid Build Coastguard Worker
1062*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[3]
1063*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v18.8h, v7.h[3]
1064*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
        // Row counter; the flags are consumed by the b.gt after the store
        // (no instruction in between modifies them).
1065*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #1
1066*c0909341SAndroid Build Coastguard Worker
1067*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[4]
1068*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v19.8h, v7.h[4]
        // Keep the low 16 bits of each 32-bit horizontal sum.
1069*c0909341SAndroid Build Coastguard Worker        uzp1            v23.8h, v5.8h, v6.8h
1070*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
1071*c0909341SAndroid Build Coastguard Worker
1072*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[5]
1073*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v20.8h, v7.h[5]
        // The new row replaces v20 and supplies the last vertical tap.
1074*c0909341SAndroid Build Coastguard Worker        srshr           v20.8h, v23.8h, #2
1075*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[6]
1076*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v20.8h, v7.h[6]
1077*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
        // prep: round-narrow by 6 to eight 16-bit values, store 16 bytes;
        // once the column is finished move \dst right by 16 bytes.
1078*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #6
1079*c0909341SAndroid Build Coastguard Worker        rshrn2          v0.8h, v1.4s, #6
1080*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [\ldst], \d_strd
1081*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1082*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
1083*c0909341SAndroid Build Coastguard Worker    .else
        // put: gather bytes of the sums via the indices in v25, saturating
        // round-narrow by 2 to unsigned bytes, store 8 bytes; once the
        // column is finished move \dst right by 8 bytes.
1084*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
1085*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #2
1086*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b}, [\ldst], \d_strd
1087*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1088*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #8
1089*c0909341SAndroid Build Coastguard Worker    .endif
        // Next column: 8 source pixels to the right, 8 fewer columns left.
1090*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #8
1091*c0909341SAndroid Build Coastguard Worker        subs            \w, \w, #8
1092*c0909341SAndroid Build Coastguard Worker        b.gt            81b
1093*c0909341SAndroid Build Coastguard Worker        ret             x15
1094*c0909341SAndroid Build Coastguard Worker
1095*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1096*c0909341SAndroid Build Coastguard Worker88:
1097*c0909341SAndroid Build Coastguard Worker.endif  // neon_i8mm
        // Generic 8xN+ code: 8-tap horizontal filter (two dot products per
        // accumulator) combined with the 6-tap vertical filter. Used by the
        // dotprod expansion always, and by the i8mm expansion when reached
        // through 88: above.
        // Load the second and third TBL index vectors; v28 is not loaded
        // here and must already hold the first one.
1098*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #16]
1099*c0909341SAndroid Build Coastguard Worker
1100*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1101*c0909341SAndroid Build Coastguard Worker81:
        // Outer loop: one 8-pixel-wide column per iteration (numeric local
        // label, so the 81b below binds to this definition).
1102*c0909341SAndroid Build Coastguard Worker        mov             \lsrc, \src
1103*c0909341SAndroid Build Coastguard Worker        mov             \ldst, \dst
1104*c0909341SAndroid Build Coastguard Worker        mov             w8, \h
        // Prime the five-row window v16-v20. The helper returns eight 16-bit
        // sums in v22; they are scaled by >> 2 with rounding for i8mm and
        // without rounding for dotprod.
        // NOTE(review): the dotprod rounding term is presumably folded into
        // the v27 accumulator bias - confirm where v27 is initialised.
1105*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1106*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1107*c0909341SAndroid Build Coastguard Worker        srshr           v16.8h, v22.8h, #2
1108*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1109*c0909341SAndroid Build Coastguard Worker        srshr           v17.8h, v22.8h, #2
1110*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1111*c0909341SAndroid Build Coastguard Worker        srshr           v18.8h, v22.8h, #2
1112*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1113*c0909341SAndroid Build Coastguard Worker        srshr           v19.8h, v22.8h, #2
1114*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1115*c0909341SAndroid Build Coastguard Worker        srshr           v20.8h, v22.8h, #2
1116*c0909341SAndroid Build Coastguard Worker.else
1117*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1118*c0909341SAndroid Build Coastguard Worker        sshr            v16.8h, v22.8h, #2
1119*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1120*c0909341SAndroid Build Coastguard Worker        sshr            v17.8h, v22.8h, #2
1121*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1122*c0909341SAndroid Build Coastguard Worker        sshr            v18.8h, v22.8h, #2
1123*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1124*c0909341SAndroid Build Coastguard Worker        sshr            v19.8h, v22.8h, #2
1125*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
1126*c0909341SAndroid Build Coastguard Worker        sshr            v20.8h, v22.8h, #2
1127*c0909341SAndroid Build Coastguard Worker.endif
1128*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1129*c0909341SAndroid Build Coastguard Worker8:
        // Inner loop: one output row of 8 pixels per iteration. Read 16
        // source bytes, then advance \lsrc by one row.
1130*c0909341SAndroid Build Coastguard Worker        ldr             q23, [\lsrc]
1131*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd
1132*c0909341SAndroid Build Coastguard Worker
        // Vertical 6-tap accumulation with v7.h[1..6], interleaved with the
        // horizontal filtering of the new row and the window shift.
1133*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v16.4h, v7.h[1]
1134*c0909341SAndroid Build Coastguard Worker        smull2          v1.4s, v16.8h, v7.h[1]
1135*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
        // dotprod only: subtract v24 from every input byte before the dot
        // products.
1136*c0909341SAndroid Build Coastguard Worker        sub             v23.16b, v23.16b, v24.16b
1137*c0909341SAndroid Build Coastguard Worker.endif
1138*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
        // Horizontal accumulator start value: zero for i8mm, v27 otherwise.
1139*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1140*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
1141*c0909341SAndroid Build Coastguard Worker        movi            v6.4s, #0
1142*c0909341SAndroid Build Coastguard Worker.else
1143*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
1144*c0909341SAndroid Build Coastguard Worker        mov             v6.16b, v27.16b
1145*c0909341SAndroid Build Coastguard Worker.endif
        // Three byte permutations of the row (indices in v28/v29/v30) feed
        // the four dot products below.
1146*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v23.16b}, v28.16b
1147*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v23.16b}, v29.16b
1148*c0909341SAndroid Build Coastguard Worker
1149*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v17.4h, v7.h[2]
1150*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v17.8h, v7.h[2]
1151*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v23.16b}, v30.16b
1152*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
1153*c0909341SAndroid Build Coastguard Worker
        // First four filter bytes (v26.4b[0]) ...
1154*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v2.16b, v26.4b[0]
1155*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v3.16b, v26.4b[0]
1156*c0909341SAndroid Build Coastguard Worker
1157*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[3]
1158*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v18.8h, v7.h[3]
1159*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
1160*c0909341SAndroid Build Coastguard Worker
        // ... and the last four filter bytes (v26.4b[1]).
1161*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v3.16b, v26.4b[1]
1162*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v4.16b, v26.4b[1]
1163*c0909341SAndroid Build Coastguard Worker
1164*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[4]
1165*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v19.8h, v7.h[4]
1166*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
        // Keep the low 16 bits of each 32-bit horizontal sum.
1167*c0909341SAndroid Build Coastguard Worker        uzp1            v23.8h, v5.8h, v6.8h
1168*c0909341SAndroid Build Coastguard Worker
1169*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[5]
1170*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v20.8h, v7.h[5]
        // The new row replaces v20 (>> 2, rounding for i8mm only) and
        // supplies the last vertical tap.
1171*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1172*c0909341SAndroid Build Coastguard Worker        srshr           v20.8h, v23.8h, #2
1173*c0909341SAndroid Build Coastguard Worker.else
1174*c0909341SAndroid Build Coastguard Worker        sshr            v20.8h, v23.8h, #2
1175*c0909341SAndroid Build Coastguard Worker.endif
        // Row counter; the flags are consumed by the b.gt after the store.
1176*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #1
1177*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[6]
1178*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v20.8h, v7.h[6]
1179*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
        // prep: round-narrow by 6 to eight 16-bit values, store 16 bytes;
        // once the column is finished move \dst right by 16 bytes.
1180*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #6
1181*c0909341SAndroid Build Coastguard Worker        rshrn2          v0.8h, v1.4s, #6
1182*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [\ldst], \d_strd
1183*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1184*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
1185*c0909341SAndroid Build Coastguard Worker.else
        // put: gather bytes of the sums via the indices in v25, saturating
        // round-narrow by 2 to unsigned bytes, store 8 bytes; once the
        // column is finished move \dst right by 8 bytes.
1186*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
1187*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #2
1188*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b}, [\ldst], \d_strd
1189*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1190*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #8
1191*c0909341SAndroid Build Coastguard Worker.endif
        // Next column: 8 source pixels to the right, 8 fewer columns left.
1192*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #8
1193*c0909341SAndroid Build Coastguard Worker        subs            \w, \w, #8
1194*c0909341SAndroid Build Coastguard Worker        b.gt            81b
1195*c0909341SAndroid Build Coastguard Worker        ret             x15
1196*c0909341SAndroid Build Coastguard Worker
1197*c0909341SAndroid Build Coastguard Worker        .align FUNC_ALIGN
1198*c0909341SAndroid Build Coastguard WorkerL(\type\()_hv_filter8_\isa):
        // Helper: horizontal 8-tap filter of one source row, 8 outputs.
        // In:    \lsrc   = address of the row (16 bytes are read)
        //        \s_strd = source stride, added to \lsrc by the load
        //        v26     = filter bytes (lanes 4b[0] and 4b[1] are used)
        //        v28-v30 = TBL index vectors
        //        v24/v27 = byte offset / accumulator start value (dotprod only)
        // Out:   v22.8h  = low 16 bits of the eight 32-bit sums (unscaled;
        //                  the callers apply the >> 2)
        // Clobb: v2, v3, v4, v23; \lsrc is advanced by one row
        // Called with bl and returns through the link register.
1199*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [\lsrc], \s_strd
1200*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
        // i8mm: accumulators start at zero.
1201*c0909341SAndroid Build Coastguard Worker        movi            v22.4s, #0
1202*c0909341SAndroid Build Coastguard Worker        movi            v23.4s, #0
1203*c0909341SAndroid Build Coastguard Worker.else   // neon_dotprod
        // dotprod: subtract v24 from the input bytes and start the
        // accumulators from v27.
        // NOTE(review): presumably a signed-range bias plus its correction
        // term - confirm where v24/v27 are initialised.
1204*c0909341SAndroid Build Coastguard Worker        sub             v4.16b, v4.16b, v24.16b
1205*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v27.16b
1206*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v27.16b
1207*c0909341SAndroid Build Coastguard Worker.endif
        // Three byte permutations of the row; the last one overwrites v4,
        // which is no longer needed as the raw input.
1208*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v4.16b}, v28.16b
1209*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v4.16b}, v29.16b
1210*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v4.16b}, v30.16b
        // v22 accumulates the first four outputs, v23 the last four; each
        // takes one dot product with filter bytes 0-3 and one with bytes 4-7.
1211*c0909341SAndroid Build Coastguard Worker        \dot            v22.4s, v2.16b, v26.4b[0]
1212*c0909341SAndroid Build Coastguard Worker        \dot            v23.4s, v3.16b, v26.4b[0]
1213*c0909341SAndroid Build Coastguard Worker        \dot            v22.4s, v3.16b, v26.4b[1]
1214*c0909341SAndroid Build Coastguard Worker        \dot            v23.4s, v4.16b, v26.4b[1]
        // Pack the low halfword of every 32-bit sum into one vector.
1215*c0909341SAndroid Build Coastguard Worker        uzp1            v22.8h, v22.8h, v23.8h
1216*c0909341SAndroid Build Coastguard Worker        ret
1217*c0909341SAndroid Build Coastguard Worker
// Horizontal 6-tap filter helper for the HV paths, USMMLA variant. It is only
// assembled when isa is neon_i8mm; reached with bl and returns through x30.
// Loads one 16-byte source row from [lsrc] (post-incremented by s_strd) and
// leaves eight 16-bit filter sums in v22.8h. No shift is applied here.
// Reads:    v26.16b (coefficient matrix for USMMLA), v29/v30 (TBL indices).
// Clobbers: v2, v3, v4, v22, v23.
// NOTE(review): v26, v29 and v30 are prepared outside this chunk; the H 8xN
// path in this file builds v26 as the 8 taps followed by the same taps
// rotated by one byte, presumably the layout expected here too - confirm.
1218*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1219*c0909341SAndroid Build Coastguard Worker        .align FUNC_ALIGN
1220*c0909341SAndroid Build Coastguard WorkerL(\type\()_hv_filter6_neon_i8mm):
1221*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [\lsrc], \s_strd
// USMMLA accumulates, so both accumulators start from zero.
1222*c0909341SAndroid Build Coastguard Worker        movi            v22.4s, #0
1223*c0909341SAndroid Build Coastguard Worker        movi            v23.4s, #0
// Two permuted copies of the row feed the two matrix multiplies.
1224*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v4.16b}, v29.16b
1225*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v4.16b}, v30.16b
// Each USMMLA treats the first source as two rows of 8 unsigned bytes and
// the second source as two rows of 8 signed bytes, giving four 32-bit sums.
1226*c0909341SAndroid Build Coastguard Worker        usmmla          v22.4s, v2.16b, v26.16b
1227*c0909341SAndroid Build Coastguard Worker        usmmla          v23.4s, v3.16b, v26.16b
// Keep the low 16 bits of every 32-bit sum: lanes 0-3 from v22, 4-7 from v23.
1228*c0909341SAndroid Build Coastguard Worker        uzp1            v22.8h, v22.8h, v23.8h
1229*c0909341SAndroid Build Coastguard Worker        ret
1230*c0909341SAndroid Build Coastguard Worker.endif
1231*c0909341SAndroid Build Coastguard Worker
// Horizontal 4-tap filter helper for the narrow HV paths; reached with bl and
// returns through x30.
// Loads 8 bytes of one source row from [src] (post-incremented by s_strd;
// note this helper advances src, not lsrc) and leaves four 32-bit sums in
// v22.4s. The callers in this file narrow the result with "shrn #2".
// Reads:    v26 (4 filter taps in byte group 0), v28 (TBL indices); the
//           neon_dotprod build also reads v24 and v27.
// Clobbers: v2, v4, v22.
1232*c0909341SAndroid Build Coastguard Worker        .align FUNC_ALIGN
1233*c0909341SAndroid Build Coastguard WorkerL(\type\()_hv_filter4_\isa):
1234*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [\src], \s_strd
// neon_i8mm: start every lane at 2, the rounding term for the shift by 2
// that the callers apply without rounding.
// neon_dotprod: start from v27 and subtract v24 from the pixels; v27
// presumably carries the bias correction plus rounding - confirm.
1235*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1236*c0909341SAndroid Build Coastguard Worker        movi            v22.4s, #2
1237*c0909341SAndroid Build Coastguard Worker.else
1238*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v27.16b
1239*c0909341SAndroid Build Coastguard Worker        sub             v4.16b, v4.16b, v24.16b
1240*c0909341SAndroid Build Coastguard Worker.endif
// Permute the row with the v28 indices, then take one 4-byte dot product
// per 32-bit lane against the taps in v26.
1241*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v4.16b}, v28.16b
1242*c0909341SAndroid Build Coastguard Worker        \dot            v22.4s, v2.16b, v26.4b[0]
1243*c0909341SAndroid Build Coastguard Worker        ret
1244*c0909341SAndroid Build Coastguard Worker
// HV6 4xN: horizontal 4-tap pass followed by a vertical 6-tap pass on a
// column four pixels wide, producing one output row per loop iteration.
// v16-v20 form a sliding window of horizontally filtered rows (four 16-bit
// values each); the sixth row is filtered inline inside the loop.
// Vertical taps are read from v7.h[1] .. v7.h[6].
// Leaves with "ret x15": the bl calls below overwrite x30, so the return
// address is presumably parked in x15 by code outside this chunk - confirm.
1245*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1246*c0909341SAndroid Build Coastguard Worker40:     // HV6 - 4xN
// Load 4 filter bytes from offset 2 of the entry at xmx and start reading
// the rows 2 bytes further in.
1247*c0909341SAndroid Build Coastguard Worker        ldur            s26, [\xmx, #2]
1248*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #2
1249*c0909341SAndroid Build Coastguard Worker
// Prime the window: five rows are filtered by the helper (which advances
// src by s_strd each time) and narrowed to 16 bits with a shift by 2.
1250*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1251*c0909341SAndroid Build Coastguard Worker        shrn            v16.4h, v22.4s, #2
1252*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1253*c0909341SAndroid Build Coastguard Worker        shrn            v17.4h, v22.4s, #2
1254*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1255*c0909341SAndroid Build Coastguard Worker        shrn            v18.4h, v22.4s, #2
1256*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1257*c0909341SAndroid Build Coastguard Worker        shrn            v19.4h, v22.4s, #2
1258*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1259*c0909341SAndroid Build Coastguard Worker        shrn            v20.4h, v22.4s, #2
1260*c0909341SAndroid Build Coastguard Worker
1261*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1262*c0909341SAndroid Build Coastguard Worker4:
// Fetch the next source row; its horizontal filtering is interleaved with
// the vertical multiply-accumulates below.
1263*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [\src], \s_strd
1264*c0909341SAndroid Build Coastguard Worker
// Vertical taps 1 and 2. Each window register is consumed by its multiply
// before the mov that slides the window overwrites it.
1265*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v16.4h, v7.h[1]
1266*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v17.4h, v7.h[2]
1267*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod
1268*c0909341SAndroid Build Coastguard Worker        sub             v4.16b, v4.16b, v24.16b
1269*c0909341SAndroid Build Coastguard Worker.endif
1270*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
1271*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
1272*c0909341SAndroid Build Coastguard Worker
// Vertical taps 3 and 4, then start the horizontal filter of the new row.
1273*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[3]
1274*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[4]
1275*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v4.16b}, v28.16b
1276*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1277*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
1278*c0909341SAndroid Build Coastguard Worker.else
1279*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
1280*c0909341SAndroid Build Coastguard Worker.endif
1281*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
1282*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
1283*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v2.16b, v26.4b[0]
1284*c0909341SAndroid Build Coastguard Worker
// Tap 5 uses the old v20, which is then replaced by the freshly filtered
// row. neon_i8mm rounds here (zero accumulator + rshrn), matching the
// helper that preloads 2 and uses shrn; neon_dotprod starts from v27 and
// uses a plain shrn.
1285*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[5]
1286*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1287*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h, v5.4s, #2
1288*c0909341SAndroid Build Coastguard Worker.else
1289*c0909341SAndroid Build Coastguard Worker        shrn            v20.4h, v5.4s, #2
1290*c0909341SAndroid Build Coastguard Worker.endif
// The flags set by subs are consumed by b.gt at the end of the loop; none
// of the instructions in between write the condition flags.
1291*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1292*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[6]
// prep: rounding shift by 6, store four 16-bit values, dst advances 8 bytes.
// put:  permute the sum bytes through the v25 indices (set up outside this
//       chunk), saturating rounding shift by 2 down to unsigned bytes,
//       store 4 pixels and advance dst by d_strd.
1293*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1294*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #6
1295*c0909341SAndroid Build Coastguard Worker        str             d0, [\dst], #8
1296*c0909341SAndroid Build Coastguard Worker.else
1297*c0909341SAndroid Build Coastguard Worker        tbl             v0.8b, {v0.16b}, v25.8b
1298*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #2
1299*c0909341SAndroid Build Coastguard Worker        str             s0, [\dst]
1300*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd
1301*c0909341SAndroid Build Coastguard Worker.endif
1302*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1303*c0909341SAndroid Build Coastguard Worker        ret             x15
1304*c0909341SAndroid Build Coastguard Worker
// HV6 2xN (assembled for put only): horizontal 4-tap pass followed by a
// vertical 6-tap pass, storing two pixels per output row.
// The structure mirrors the HV6 4xN path: v16-v20 hold a sliding window of
// horizontally filtered rows and vertical taps come from v7.h[1] .. v7.h[6].
// Four sums are still computed per row; only the first two pixels are stored.
// Leaves with "ret x15": the bl calls below overwrite x30, so the return
// address is presumably parked in x15 by code outside this chunk - confirm.
1305*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1306*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1307*c0909341SAndroid Build Coastguard Worker20:     // HV6 - 2xN
// Load 4 filter bytes from offset 2 of the entry at xmx and start reading
// the rows 2 bytes further in.
1308*c0909341SAndroid Build Coastguard Worker        ldur            s26, [\xmx, #2]
1309*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #2
1310*c0909341SAndroid Build Coastguard Worker
// Prime the window with five horizontally filtered rows, each narrowed to
// 16 bits with a shift by 2.
1311*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1312*c0909341SAndroid Build Coastguard Worker        shrn            v16.4h, v22.4s, #2
1313*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1314*c0909341SAndroid Build Coastguard Worker        shrn            v17.4h, v22.4s, #2
1315*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1316*c0909341SAndroid Build Coastguard Worker        shrn            v18.4h, v22.4s, #2
1317*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1318*c0909341SAndroid Build Coastguard Worker        shrn            v19.4h, v22.4s, #2
1319*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1320*c0909341SAndroid Build Coastguard Worker        shrn            v20.4h, v22.4s, #2
1321*c0909341SAndroid Build Coastguard Worker
1322*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1323*c0909341SAndroid Build Coastguard Worker2:
// Fetch the next source row; its horizontal filtering is interleaved with
// the vertical multiply-accumulates below.
1324*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [\src], \s_strd
1325*c0909341SAndroid Build Coastguard Worker
// Vertical taps 1 and 2. Each window register is read by its multiply
// before the mov that slides the window overwrites it.
1326*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v16.4h, v7.h[1]
1327*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v17.4h, v7.h[2]
1328*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
1329*c0909341SAndroid Build Coastguard Worker        sub             v4.16b, v4.16b, v24.16b
1330*c0909341SAndroid Build Coastguard Worker    .endif
1331*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
1332*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
1333*c0909341SAndroid Build Coastguard Worker
// Vertical taps 3 and 4, then start the horizontal filter of the new row.
1334*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[3]
1335*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[4]
1336*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v4.16b}, v28.16b
1337*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
1338*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
1339*c0909341SAndroid Build Coastguard Worker    .else
1340*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
1341*c0909341SAndroid Build Coastguard Worker    .endif
1342*c0909341SAndroid Build Coastguard Worker
1343*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
1344*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
1345*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v2.16b, v26.4b[0]
1346*c0909341SAndroid Build Coastguard Worker
// Tap 5 uses the old v20, which is then replaced by the freshly filtered
// row (rounding narrow for neon_i8mm, plain narrow on top of v27 otherwise).
1347*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[5]
1348*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
1349*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h, v5.4s, #2
1350*c0909341SAndroid Build Coastguard Worker    .else
1351*c0909341SAndroid Build Coastguard Worker        shrn            v20.4h, v5.4s, #2
1352*c0909341SAndroid Build Coastguard Worker    .endif
1353*c0909341SAndroid Build Coastguard Worker
// The flags set by subs are consumed by b.gt below; none of the
// instructions in between write the condition flags.
1354*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1355*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v20.4h, v7.h[6]
1356*c0909341SAndroid Build Coastguard Worker
// Permute the sum bytes through the v25 indices (set up outside this chunk),
// then saturating rounding shift by 2 down to unsigned bytes.
1357*c0909341SAndroid Build Coastguard Worker        tbl             v0.8b, {v0.16b}, v25.8b
1358*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.8b, v0.8h, #2
1359*c0909341SAndroid Build Coastguard Worker
// Store two pixels (one halfword) and step to the next destination row.
1360*c0909341SAndroid Build Coastguard Worker        str             h0, [\dst]
1361*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd
1362*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1363*c0909341SAndroid Build Coastguard Worker        ret             x15
1364*c0909341SAndroid Build Coastguard Worker.endif
1365*c0909341SAndroid Build Coastguard Worker
// Entry of the horizontal-only path: dispatches through a table of 32-bit
// offsets to one of the width-specific loops that follow.
// In:  x8 = table index (presumably derived from the block width by code
//      outside this chunk - confirm).
// Out: x8 and x11 are overwritten; for put, v27 is set to the constant the
//      put loops use as the initial accumulator before their shift by 6.
1366*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1367*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_h_\isa):
// x11 = address of the table symbol; x8 = sign-extended 32-bit entry at
// index x8.
1368*c0909341SAndroid Build Coastguard Worker        movrel          x11, \type\()_8tap_h_\isa\()_tbl
1369*c0909341SAndroid Build Coastguard Worker        ldrsw           x8, [x11, x8, lsl #2]
// The v27 setup sits between the table load and the branch, which gives the
// load time to complete before its result is needed.
1370*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1371*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
1372*c0909341SAndroid Build Coastguard Worker        movi            v27.4s, #34     // special rounding
1373*c0909341SAndroid Build Coastguard Worker    .else
1374*c0909341SAndroid Build Coastguard Worker        mov             w10, #0x2022    // 64 * 128 + 34, bias and rounding for SDOT
1375*c0909341SAndroid Build Coastguard Worker        dup             v27.4s, w10
1376*c0909341SAndroid Build Coastguard Worker    .endif
1377*c0909341SAndroid Build Coastguard Worker.endif
// Table entries are offsets relative to the table base: add and jump.
1378*c0909341SAndroid Build Coastguard Worker        add             x11, x11, x8
1379*c0909341SAndroid Build Coastguard Worker        br              x11
1380*c0909341SAndroid Build Coastguard Worker
// H 2xN (assembled for put only): horizontal 4-tap filter, two rows per
// iteration, two pixels stored per row.
// Reached through the br in the dispatcher above, hence the jump-target
// marker (AARCH64_VALID_JUMP_TARGET is a macro defined outside this chunk,
// presumably a BTI landing pad - confirm).
// Uses v27 as the initial accumulator, v28 as TBL indices and, for
// neon_dotprod, v24 as the value subtracted from the pixels.
1381*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1382*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1383*c0909341SAndroid Build Coastguard Worker20:     // H - 2xN
1384*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
// Start reading 2 bytes further into the row and load 4 filter bytes from
// offset 2 of the entry at xmx.
1385*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #2
1386*c0909341SAndroid Build Coastguard Worker        ldur            s26, [\xmx, #2]
1387*c0909341SAndroid Build Coastguard Worker
1388*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1389*c0909341SAndroid Build Coastguard Worker2:
// Load 8 bytes from each of two consecutive rows and advance src two rows.
1390*c0909341SAndroid Build Coastguard Worker        ldr             d0, [\src]
1391*c0909341SAndroid Build Coastguard Worker        ldr             d1, [\src, \s_strd]
1392*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
1393*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
1394*c0909341SAndroid Build Coastguard Worker        sub             v0.8b, v0.8b, v24.8b
1395*c0909341SAndroid Build Coastguard Worker        sub             v1.8b, v1.8b, v24.8b
1396*c0909341SAndroid Build Coastguard Worker    .endif
// Accumulators start from v27 (set by the dispatcher for put).
1397*c0909341SAndroid Build Coastguard Worker        mov             v4.16b, v27.16b
1398*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
1399*c0909341SAndroid Build Coastguard Worker
1400*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v0.16b}, v28.16b
1401*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v1.16b}, v28.16b
1402*c0909341SAndroid Build Coastguard Worker
// One 4-byte dot product per 32-bit lane: v4 = row 0 sums, v5 = row 1 sums.
1403*c0909341SAndroid Build Coastguard Worker        \dot            v4.4s, v2.16b, v26.4b[0]
1404*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v3.16b, v26.4b[0]
1405*c0909341SAndroid Build Coastguard Worker
// Pack the low 16 bits of each sum (row 0 in lanes 0-3, row 1 in lanes 4-7),
// then saturating shift by 6 down to unsigned bytes.
1406*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h, v4.8h, v5.8h
1407*c0909341SAndroid Build Coastguard Worker        sqshrun         v4.8b, v4.8h, #6
1408*c0909341SAndroid Build Coastguard Worker
// The flags from subs are consumed by b.gt below. d4 now holds row 0 in
// bits 0-31 and row 1 in bits 32-63; only the first two pixels of each row
// are written.
1409*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1410*c0909341SAndroid Build Coastguard Worker        fmov            x8, d4
1411*c0909341SAndroid Build Coastguard Worker        lsr             x9, x8, #32
1412*c0909341SAndroid Build Coastguard Worker        strh            w8, [\dst]
1413*c0909341SAndroid Build Coastguard Worker        strh            w9, [\dst, \d_strd]
1414*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
1415*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1416*c0909341SAndroid Build Coastguard Worker        ret
1417*c0909341SAndroid Build Coastguard Worker.endif
1418*c0909341SAndroid Build Coastguard Worker
// H 4xN: horizontal 4-tap filter, two rows of four pixels per iteration.
// Reached through the br in the dispatcher (AARCH64_VALID_JUMP_TARGET is a
// macro defined outside this chunk, presumably a BTI landing pad - confirm).
// prep writes 16-bit intermediates contiguously (16 bytes per iteration);
// put writes 4 bytes per row using the d_strd stride.
1419*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1420*c0909341SAndroid Build Coastguard Worker40:     // H - 4xN
1421*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
// Start reading 2 bytes further into the row and load 4 filter bytes from
// offset 2 of the entry at xmx.
1422*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #2
1423*c0909341SAndroid Build Coastguard Worker        ldur            s26, [\xmx, #2]
1424*c0909341SAndroid Build Coastguard Worker
1425*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1426*c0909341SAndroid Build Coastguard Worker4:
// Load 8 bytes from each of two consecutive rows and advance src two rows.
1427*c0909341SAndroid Build Coastguard Worker        ldr             d0, [\src]
1428*c0909341SAndroid Build Coastguard Worker        ldr             d1, [\src, \s_strd]
1429*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
// Accumulator setup:
//   prep + neon_i8mm : zero (rounding is done later by srshr).
//   everything else  : start from v27; neon_dotprod additionally subtracts
//                      v24 from the pixels first.
1430*c0909341SAndroid Build Coastguard Worker.ifc \type\()_\isa, prep_neon_i8mm
1431*c0909341SAndroid Build Coastguard Worker        movi            v4.4s, #0
1432*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
1433*c0909341SAndroid Build Coastguard Worker.else
1434*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
1435*c0909341SAndroid Build Coastguard Worker        sub             v0.8b, v0.8b, v24.8b
1436*c0909341SAndroid Build Coastguard Worker        sub             v1.8b, v1.8b, v24.8b
1437*c0909341SAndroid Build Coastguard Worker    .endif
1438*c0909341SAndroid Build Coastguard Worker        mov             v4.16b, v27.16b
1439*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
1440*c0909341SAndroid Build Coastguard Worker.endif
1441*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v0.16b}, v28.16b
1442*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v1.16b}, v28.16b
1443*c0909341SAndroid Build Coastguard Worker
// One 4-byte dot product per 32-bit lane: v4 = row 0 sums, v5 = row 1 sums.
1444*c0909341SAndroid Build Coastguard Worker        \dot            v4.4s, v2.16b, v26.4b[0]
1445*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v3.16b, v26.4b[0]
// prep: narrow both rows into v4.8h with a shift by 2 (rounding srshr for
// neon_i8mm; plain shrn for neon_dotprod, where any rounding would have to
// come from v27 - confirm) and store 16 bytes, post-incrementing dst.
1446*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1447*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1448*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
1449*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h, v4.8h, v5.8h
1450*c0909341SAndroid Build Coastguard Worker        srshr           v4.8h, v4.8h, #2
1451*c0909341SAndroid Build Coastguard Worker    .else
1452*c0909341SAndroid Build Coastguard Worker        shrn            v4.4h, v4.4s, #2
1453*c0909341SAndroid Build Coastguard Worker        shrn2           v4.8h, v5.4s, #2
1454*c0909341SAndroid Build Coastguard Worker    .endif
1455*c0909341SAndroid Build Coastguard Worker        str             q4, [\dst], #16
1456*c0909341SAndroid Build Coastguard Worker.else   // put
// put: pack the low 16 bits of each sum, saturating shift by 6 down to
// unsigned bytes, then store 4 pixels of row 0 (low word of d4) and 4 pixels
// of row 1 (high word of d4).
1457*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h, v4.8h, v5.8h
1458*c0909341SAndroid Build Coastguard Worker        sqshrun         v4.8b, v4.8h, #6
1459*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1460*c0909341SAndroid Build Coastguard Worker        fmov            x8, d4
1461*c0909341SAndroid Build Coastguard Worker        lsr             x9, x8, #32
1462*c0909341SAndroid Build Coastguard Worker        str             w8, [\dst]
1463*c0909341SAndroid Build Coastguard Worker        str             w9, [\dst, \d_strd]
1464*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
1465*c0909341SAndroid Build Coastguard Worker.endif
// The flags come from the subs above; the instructions after it in either
// branch do not write the condition flags.
1466*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1467*c0909341SAndroid Build Coastguard Worker        ret
1468*c0909341SAndroid Build Coastguard Worker
// H 8xN: horizontal filter, two rows of eight pixels per iteration.
// Reached through the br in the dispatcher (AARCH64_VALID_JUMP_TARGET is a
// macro defined outside this chunk, presumably a BTI landing pad - confirm).
// Two implementations share the entry:
//   - neon_i8mm with a filter other than SHARP1: USMMLA loop (first "8:").
//   - neon_i8mm with SHARP1, and every neon_dotprod build: 8-tap dot-product
//     loop (second "8:", entered through label 88 or by fall-through).
// Both loops use the numeric label 8; each "8b" refers to the nearest
// preceding definition, so the loops do not interfere.
// prep stores 32 bytes of 16-bit intermediates contiguously per iteration;
// put stores 8 bytes per row using the d_strd stride.
1469*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1470*c0909341SAndroid Build Coastguard Worker80:     // H - 8xN
1471*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
// Load all 8 filter bytes of the entry at xmx.
1472*c0909341SAndroid Build Coastguard Worker        ldr             d26, [\xmx]
1473*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1474*c0909341SAndroid Build Coastguard Worker        cmp             w9, #SHARP1
1475*c0909341SAndroid Build Coastguard Worker        b.eq            88f             // horizontal == SHARP1
1476*c0909341SAndroid Build Coastguard Worker
// USMMLA setup: load the TBL index pair from x13 + OFFSET_USMMLA (x13 is a
// table base set up outside this chunk) and build the coefficient matrix:
// low half of v26 = taps 0..7, high half = tap 7 followed by taps 0..6.
// NOTE(review): this is only a pure one-sample shift if taps 0 and 7 are
// zero, which is presumably why SHARP1 is excluded - confirm.
1477*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
1478*c0909341SAndroid Build Coastguard Worker        ext             v0.8b, v26.8b, v26.8b, #7
1479*c0909341SAndroid Build Coastguard Worker        ins             v26.d[1], v0.d[0]
1480*c0909341SAndroid Build Coastguard Worker
1481*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1482*c0909341SAndroid Build Coastguard Worker8:
// Load 16 bytes from each of two consecutive rows and advance src two rows.
1483*c0909341SAndroid Build Coastguard Worker        ldr             q0, [\src]
1484*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\src, \s_strd]
1485*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
// prep accumulates from zero (rounded later by srshr); put starts from v27.
1486*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
1487*c0909341SAndroid Build Coastguard Worker        movi            v4.4s, #0
1488*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
1489*c0909341SAndroid Build Coastguard Worker        movi            v20.4s, #0
1490*c0909341SAndroid Build Coastguard Worker        movi            v21.4s, #0
1491*c0909341SAndroid Build Coastguard Worker    .else
1492*c0909341SAndroid Build Coastguard Worker        mov             v4.16b, v27.16b
1493*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
1494*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v27.16b
1495*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v27.16b
1496*c0909341SAndroid Build Coastguard Worker    .endif
// Two permuted copies per row: v1/v2 for row 0, v17/v18 for row 1.
1497*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v0.16b}, v29.16b
1498*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v0.16b}, v30.16b
1499*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v16.16b}, v29.16b
1500*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v16.16b}, v30.16b
1501*c0909341SAndroid Build Coastguard Worker
// Each USMMLA yields four 32-bit sums (unsigned pixels times signed taps).
1502*c0909341SAndroid Build Coastguard Worker        usmmla          v4.4s, v1.16b, v26.16b
1503*c0909341SAndroid Build Coastguard Worker        usmmla          v5.4s, v2.16b, v26.16b
1504*c0909341SAndroid Build Coastguard Worker        usmmla          v20.4s, v17.16b, v26.16b
1505*c0909341SAndroid Build Coastguard Worker        usmmla          v21.4s, v18.16b, v26.16b
1506*c0909341SAndroid Build Coastguard Worker
// Keep the low 16 bits of each sum: v4 = row 0, v20 = row 1.
1507*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h, v4.8h, v5.8h
1508*c0909341SAndroid Build Coastguard Worker        uzp1            v20.8h, v20.8h, v21.8h
// prep: rounding shift by 2 and store both rows back to back.
// put:  saturating shift by 6 down to unsigned bytes, one 8-byte store per
//       row.
1509*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
1510*c0909341SAndroid Build Coastguard Worker        srshr           v4.8h, v4.8h, #2
1511*c0909341SAndroid Build Coastguard Worker        srshr           v20.8h, v20.8h, #2
1512*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1513*c0909341SAndroid Build Coastguard Worker        stp             q4, q20, [\dst], #32
1514*c0909341SAndroid Build Coastguard Worker    .else   // put
1515*c0909341SAndroid Build Coastguard Worker        sqshrun         v4.8b, v4.8h, #6
1516*c0909341SAndroid Build Coastguard Worker        sqshrun         v20.8b, v20.8h, #6
1517*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1518*c0909341SAndroid Build Coastguard Worker        str             d4, [\dst]
1519*c0909341SAndroid Build Coastguard Worker        str             d20, [\dst, \d_strd]
1520*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
1521*c0909341SAndroid Build Coastguard Worker    .endif
1522*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1523*c0909341SAndroid Build Coastguard Worker        ret
1524*c0909341SAndroid Build Coastguard Worker
1525*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1526*c0909341SAndroid Build Coastguard Worker88:
1527*c0909341SAndroid Build Coastguard Worker.endif  // neon_i8mm
// Full 8-tap dot-product path. Load the TBL index pair for v29/v30 from
// x13 + 16; v28 is expected to be valid already (set up outside this chunk).
1528*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #16]
1529*c0909341SAndroid Build Coastguard Worker
1530*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1531*c0909341SAndroid Build Coastguard Worker8:
// Load 16 bytes from each of two consecutive rows and advance src two rows.
1532*c0909341SAndroid Build Coastguard Worker        ldr             q0, [\src]
1533*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\src, \s_strd]
1534*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
// Accumulator setup:
//   prep + neon_i8mm : zero (rounding is done later by srshr).
//   everything else  : start from v27; neon_dotprod additionally subtracts
//                      v24 from the pixels first.
1535*c0909341SAndroid Build Coastguard Worker.ifc \type\()_\isa, prep_neon_i8mm
1536*c0909341SAndroid Build Coastguard Worker        movi            v4.4s, #0
1537*c0909341SAndroid Build Coastguard Worker        movi            v5.4s, #0
1538*c0909341SAndroid Build Coastguard Worker        movi            v20.4s, #0
1539*c0909341SAndroid Build Coastguard Worker        movi            v21.4s, #0
1540*c0909341SAndroid Build Coastguard Worker.else
1541*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
1542*c0909341SAndroid Build Coastguard Worker        sub             v0.16b, v0.16b, v24.16b
1543*c0909341SAndroid Build Coastguard Worker        sub             v16.16b, v16.16b, v24.16b
1544*c0909341SAndroid Build Coastguard Worker    .endif
1545*c0909341SAndroid Build Coastguard Worker        mov             v4.16b, v27.16b
1546*c0909341SAndroid Build Coastguard Worker        mov             v5.16b, v27.16b
1547*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v27.16b
1548*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v27.16b
1549*c0909341SAndroid Build Coastguard Worker.endif
// Three permuted copies per row: v1-v3 for row 0, v17-v19 for row 1.
1550*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v0.16b}, v28.16b
1551*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v0.16b}, v29.16b
1552*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v0.16b}, v30.16b
1553*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v16.16b}, v28.16b
1554*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v16.16b}, v29.16b
1555*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v16.16b}, v30.16b
1556*c0909341SAndroid Build Coastguard Worker
// Each accumulator takes two 4-byte dot products: first against tap group 0
// of v26, then against tap group 1 using the next permuted copy. The four
// independent accumulators are interleaved so the dependent second step of
// each one is separated from its first.
1557*c0909341SAndroid Build Coastguard Worker        \dot            v4.4s, v1.16b, v26.4b[0]
1558*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v2.16b, v26.4b[0]
1559*c0909341SAndroid Build Coastguard Worker        \dot            v20.4s, v17.16b, v26.4b[0]
1560*c0909341SAndroid Build Coastguard Worker        \dot            v21.4s, v18.16b, v26.4b[0]
1561*c0909341SAndroid Build Coastguard Worker        \dot            v4.4s, v2.16b, v26.4b[1]
1562*c0909341SAndroid Build Coastguard Worker        \dot            v5.4s, v3.16b, v26.4b[1]
1563*c0909341SAndroid Build Coastguard Worker        \dot            v20.4s, v18.16b, v26.4b[1]
1564*c0909341SAndroid Build Coastguard Worker        \dot            v21.4s, v19.16b, v26.4b[1]
1565*c0909341SAndroid Build Coastguard Worker
// Keep the low 16 bits of each sum: v4 = row 0, v20 = row 1.
1566*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h, v4.8h, v5.8h
1567*c0909341SAndroid Build Coastguard Worker        uzp1            v20.8h, v20.8h, v21.8h
// prep: shift by 2 (rounding srshr for neon_i8mm; plain sshr for
// neon_dotprod, where any rounding would have to come from v27 - confirm)
// and store both rows back to back.
// put:  saturating shift by 6 down to unsigned bytes, one 8-byte store per
//       row.
1568*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1569*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
1570*c0909341SAndroid Build Coastguard Worker        srshr           v4.8h, v4.8h, #2
1571*c0909341SAndroid Build Coastguard Worker        srshr           v20.8h, v20.8h, #2
1572*c0909341SAndroid Build Coastguard Worker    .else
1573*c0909341SAndroid Build Coastguard Worker        sshr            v4.8h, v4.8h, #2
1574*c0909341SAndroid Build Coastguard Worker        sshr            v20.8h, v20.8h, #2
1575*c0909341SAndroid Build Coastguard Worker    .endif
1576*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1577*c0909341SAndroid Build Coastguard Worker        stp             q4, q20, [\dst], #32
1578*c0909341SAndroid Build Coastguard Worker.else   // put
1579*c0909341SAndroid Build Coastguard Worker        sqshrun         v4.8b, v4.8h, #6
1580*c0909341SAndroid Build Coastguard Worker        sqshrun         v20.8b, v20.8h, #6
1581*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1582*c0909341SAndroid Build Coastguard Worker        str             d4, [\dst]
1583*c0909341SAndroid Build Coastguard Worker        str             d20, [\dst, \d_strd]
1584*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
1585*c0909341SAndroid Build Coastguard Worker.endif
1586*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1587*c0909341SAndroid Build Coastguard Worker        ret
1588*c0909341SAndroid Build Coastguard Worker
1589*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1590*c0909341SAndroid Build Coastguard Worker160:    // H - 16xN
1591*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1592*c0909341SAndroid Build Coastguard Worker        ldr             d26, [\xmx]
1593*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1594*c0909341SAndroid Build Coastguard Worker        cmp             w9, #SHARP1
1595*c0909341SAndroid Build Coastguard Worker        b.eq            168f            // horizontal == SHARP1
1596*c0909341SAndroid Build Coastguard Worker
1597*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
1598*c0909341SAndroid Build Coastguard Worker        ext             v0.8b, v26.8b, v26.8b, #7
1599*c0909341SAndroid Build Coastguard Worker        ins             v26.d[1], v0.d[0]
1600*c0909341SAndroid Build Coastguard Worker
1601*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1602*c0909341SAndroid Build Coastguard Worker16:
1603*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\src]
1604*c0909341SAndroid Build Coastguard Worker        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
1605*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1606*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
1607*c0909341SAndroid Build Coastguard Worker        movi            v6.4s, #0
1608*c0909341SAndroid Build Coastguard Worker        movi            v7.4s, #0
1609*c0909341SAndroid Build Coastguard Worker        movi            v22.4s, #0
1610*c0909341SAndroid Build Coastguard Worker        movi            v23.4s, #0
1611*c0909341SAndroid Build Coastguard Worker    .else
1612*c0909341SAndroid Build Coastguard Worker        mov             v6.16b, v27.16b
1613*c0909341SAndroid Build Coastguard Worker        mov             v7.16b, v27.16b
1614*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v27.16b
1615*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v27.16b
1616*c0909341SAndroid Build Coastguard Worker    .endif
1617*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v16.16b}, v29.16b
1618*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v16.16b}, v30.16b
1619*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v17.16b}, v29.16b
1620*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v17.16b}, v30.16b
1621*c0909341SAndroid Build Coastguard Worker
1622*c0909341SAndroid Build Coastguard Worker        usmmla          v6.4s, v0.16b, v26.16b
1623*c0909341SAndroid Build Coastguard Worker        usmmla          v7.4s, v1.16b, v26.16b
1624*c0909341SAndroid Build Coastguard Worker        usmmla          v22.4s, v2.16b, v26.16b
1625*c0909341SAndroid Build Coastguard Worker        usmmla          v23.4s, v3.16b, v26.16b
1626*c0909341SAndroid Build Coastguard Worker
1627*c0909341SAndroid Build Coastguard Worker        uzp1            v6.8h, v6.8h, v7.8h
1628*c0909341SAndroid Build Coastguard Worker        uzp1            v22.8h, v22.8h, v23.8h
1629*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
1630*c0909341SAndroid Build Coastguard Worker        srshr           v6.8h, v6.8h, #2
1631*c0909341SAndroid Build Coastguard Worker        srshr           v22.8h, v22.8h, #2
1632*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1633*c0909341SAndroid Build Coastguard Worker        stp             q6, q22, [\dst], #32
1634*c0909341SAndroid Build Coastguard Worker    .else   // put
1635*c0909341SAndroid Build Coastguard Worker        sqshrun         v6.8b, v6.8h, #6
1636*c0909341SAndroid Build Coastguard Worker        sqshrun2        v6.16b, v22.8h, #6
1637*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1638*c0909341SAndroid Build Coastguard Worker        st1             {v6.16b}, [\dst], \d_strd
1639*c0909341SAndroid Build Coastguard Worker    .endif
1640*c0909341SAndroid Build Coastguard Worker        b.gt            16b
1641*c0909341SAndroid Build Coastguard Worker        ret
1642*c0909341SAndroid Build Coastguard Worker
1643*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1644*c0909341SAndroid Build Coastguard Worker168:
1645*c0909341SAndroid Build Coastguard Worker.endif  // neon_i8mm
1646*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #16]
1647*c0909341SAndroid Build Coastguard Worker
1648*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1649*c0909341SAndroid Build Coastguard Worker16:
1650*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\src]
1651*c0909341SAndroid Build Coastguard Worker        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
1652*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1653*c0909341SAndroid Build Coastguard Worker.ifc \type\()_\isa, prep_neon_i8mm
1654*c0909341SAndroid Build Coastguard Worker        movi            v6.4s, #0
1655*c0909341SAndroid Build Coastguard Worker        movi            v7.4s, #0
1656*c0909341SAndroid Build Coastguard Worker        movi            v22.4s, #0
1657*c0909341SAndroid Build Coastguard Worker        movi            v23.4s, #0
1658*c0909341SAndroid Build Coastguard Worker.else
1659*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
1660*c0909341SAndroid Build Coastguard Worker        sub             v16.16b, v16.16b, v24.16b
1661*c0909341SAndroid Build Coastguard Worker        sub             v17.16b, v17.16b, v24.16b
1662*c0909341SAndroid Build Coastguard Worker    .endif
1663*c0909341SAndroid Build Coastguard Worker        mov             v6.16b, v27.16b
1664*c0909341SAndroid Build Coastguard Worker        mov             v7.16b, v27.16b
1665*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v27.16b
1666*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v27.16b
1667*c0909341SAndroid Build Coastguard Worker.endif
1668*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v16.16b}, v28.16b
1669*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v16.16b}, v29.16b
1670*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v16.16b}, v30.16b
1671*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v17.16b}, v28.16b
1672*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v17.16b}, v29.16b
1673*c0909341SAndroid Build Coastguard Worker
1674*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v0.16b, v26.4b[0]
1675*c0909341SAndroid Build Coastguard Worker        \dot            v7.4s, v1.16b, v26.4b[0]
1676*c0909341SAndroid Build Coastguard Worker        \dot            v22.4s, v2.16b, v26.4b[0]
1677*c0909341SAndroid Build Coastguard Worker        \dot            v23.4s, v3.16b, v26.4b[0]
1678*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v1.16b, v26.4b[1]
1679*c0909341SAndroid Build Coastguard Worker        \dot            v7.4s, v2.16b, v26.4b[1]
1680*c0909341SAndroid Build Coastguard Worker        \dot            v22.4s, v3.16b, v26.4b[1]
1681*c0909341SAndroid Build Coastguard Worker        \dot            v23.4s, v4.16b, v26.4b[1]
1682*c0909341SAndroid Build Coastguard Worker
1683*c0909341SAndroid Build Coastguard Worker        uzp1            v6.8h, v6.8h, v7.8h
1684*c0909341SAndroid Build Coastguard Worker        uzp1            v22.8h, v22.8h, v23.8h
1685*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1686*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
1687*c0909341SAndroid Build Coastguard Worker        srshr           v6.8h, v6.8h, #2
1688*c0909341SAndroid Build Coastguard Worker        srshr           v22.8h, v22.8h, #2
1689*c0909341SAndroid Build Coastguard Worker    .else
1690*c0909341SAndroid Build Coastguard Worker        sshr            v6.8h, v6.8h, #2
1691*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h, v22.8h, #2
1692*c0909341SAndroid Build Coastguard Worker    .endif
1693*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1694*c0909341SAndroid Build Coastguard Worker        stp             q6, q22, [\dst], #32
1695*c0909341SAndroid Build Coastguard Worker.else   // put
1696*c0909341SAndroid Build Coastguard Worker        sqshrun         v6.8b, v6.8h, #6
1697*c0909341SAndroid Build Coastguard Worker        sqshrun2        v6.16b, v22.8h, #6
1698*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1699*c0909341SAndroid Build Coastguard Worker        st1             {v6.16b}, [\dst], \d_strd
1700*c0909341SAndroid Build Coastguard Worker.endif
1701*c0909341SAndroid Build Coastguard Worker        b.gt            16b
1702*c0909341SAndroid Build Coastguard Worker        ret
1703*c0909341SAndroid Build Coastguard Worker
1704*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1705*c0909341SAndroid Build Coastguard Worker320:    // H - 32xN+
1706*c0909341SAndroid Build Coastguard Worker640:
1707*c0909341SAndroid Build Coastguard Worker1280:
1708*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1709*c0909341SAndroid Build Coastguard Worker        ldr             d26, [\xmx]
1710*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1711*c0909341SAndroid Build Coastguard Worker        sub             \d_strd, \d_strd, \w, uxtw
1712*c0909341SAndroid Build Coastguard Worker.endif
1713*c0909341SAndroid Build Coastguard Worker        sub             \s_strd, \s_strd, \w, uxtw
1714*c0909341SAndroid Build Coastguard Worker        mov             w8, \w
1715*c0909341SAndroid Build Coastguard Worker
1716*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm
1717*c0909341SAndroid Build Coastguard Worker        cmp             w9, #SHARP1
1718*c0909341SAndroid Build Coastguard Worker        b.eq            328f            // horizontal == SHARP1
1719*c0909341SAndroid Build Coastguard Worker
1720*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
1721*c0909341SAndroid Build Coastguard Worker        ext             v0.8b, v26.8b, v26.8b, #7
1722*c0909341SAndroid Build Coastguard Worker        ins             v26.d[1], v0.d[0]
1723*c0909341SAndroid Build Coastguard Worker
1724*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1725*c0909341SAndroid Build Coastguard Worker32:
1726*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\src]
1727*c0909341SAndroid Build Coastguard Worker        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
1728*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #16
1729*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
1730*c0909341SAndroid Build Coastguard Worker        movi            v6.4s, #0
1731*c0909341SAndroid Build Coastguard Worker        movi            v7.4s, #0
1732*c0909341SAndroid Build Coastguard Worker        movi            v22.4s, #0
1733*c0909341SAndroid Build Coastguard Worker        movi            v23.4s, #0
1734*c0909341SAndroid Build Coastguard Worker    .else
1735*c0909341SAndroid Build Coastguard Worker        mov             v6.16b, v27.16b
1736*c0909341SAndroid Build Coastguard Worker        mov             v7.16b, v27.16b
1737*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v27.16b
1738*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v27.16b
1739*c0909341SAndroid Build Coastguard Worker    .endif
1740*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v16.16b}, v29.16b
1741*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v16.16b}, v30.16b
1742*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v17.16b}, v29.16b
1743*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v17.16b}, v30.16b
1744*c0909341SAndroid Build Coastguard Worker
1745*c0909341SAndroid Build Coastguard Worker        usmmla          v6.4s, v0.16b, v26.16b
1746*c0909341SAndroid Build Coastguard Worker        usmmla          v7.4s, v1.16b, v26.16b
1747*c0909341SAndroid Build Coastguard Worker        usmmla          v22.4s, v2.16b, v26.16b
1748*c0909341SAndroid Build Coastguard Worker        usmmla          v23.4s, v3.16b, v26.16b
1749*c0909341SAndroid Build Coastguard Worker
1750*c0909341SAndroid Build Coastguard Worker        uzp1            v6.8h, v6.8h, v7.8h
1751*c0909341SAndroid Build Coastguard Worker        uzp1            v22.8h, v22.8h, v23.8h
1752*c0909341SAndroid Build Coastguard Worker    .ifc \type, prep
1753*c0909341SAndroid Build Coastguard Worker        srshr           v6.8h, v6.8h, #2
1754*c0909341SAndroid Build Coastguard Worker        srshr           v22.8h, v22.8h, #2
1755*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #16
1756*c0909341SAndroid Build Coastguard Worker        stp             q6, q22, [\dst], #32
1757*c0909341SAndroid Build Coastguard Worker    .else   // put
1758*c0909341SAndroid Build Coastguard Worker        sqshrun         v6.8b, v6.8h, #6
1759*c0909341SAndroid Build Coastguard Worker        sqshrun2        v6.16b, v22.8h, #6
1760*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #16
1761*c0909341SAndroid Build Coastguard Worker        str             q6, [\dst], #16
1762*c0909341SAndroid Build Coastguard Worker    .endif
1763*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1764*c0909341SAndroid Build Coastguard Worker
1765*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1766*c0909341SAndroid Build Coastguard Worker    .ifc \type, put
1767*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd
1768*c0909341SAndroid Build Coastguard Worker    .endif
1769*c0909341SAndroid Build Coastguard Worker        mov             w8, \w
1770*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1771*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1772*c0909341SAndroid Build Coastguard Worker        ret
1773*c0909341SAndroid Build Coastguard Worker
1774*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1775*c0909341SAndroid Build Coastguard Worker328:
1776*c0909341SAndroid Build Coastguard Worker.endif  // neon_i8mm
1777*c0909341SAndroid Build Coastguard Worker        ldp             q29, q30, [x13, #16]
1778*c0909341SAndroid Build Coastguard Worker
1779*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1780*c0909341SAndroid Build Coastguard Worker32:
1781*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\src]
1782*c0909341SAndroid Build Coastguard Worker        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
1783*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #16
1784*c0909341SAndroid Build Coastguard Worker.ifc \type\()_\isa, prep_neon_i8mm
1785*c0909341SAndroid Build Coastguard Worker        movi            v6.4s, #0
1786*c0909341SAndroid Build Coastguard Worker        movi            v7.4s, #0
1787*c0909341SAndroid Build Coastguard Worker        movi            v22.4s, #0
1788*c0909341SAndroid Build Coastguard Worker        movi            v23.4s, #0
1789*c0909341SAndroid Build Coastguard Worker.else
1790*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_dotprod
1791*c0909341SAndroid Build Coastguard Worker        sub             v16.16b, v16.16b, v24.16b
1792*c0909341SAndroid Build Coastguard Worker        sub             v17.16b, v17.16b, v24.16b
1793*c0909341SAndroid Build Coastguard Worker    .endif
1794*c0909341SAndroid Build Coastguard Worker        mov             v6.16b, v27.16b
1795*c0909341SAndroid Build Coastguard Worker        mov             v7.16b, v27.16b
1796*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v27.16b
1797*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v27.16b
1798*c0909341SAndroid Build Coastguard Worker.endif
1799*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v16.16b}, v28.16b
1800*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v16.16b}, v29.16b
1801*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v16.16b}, v30.16b
1802*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v17.16b}, v28.16b
1803*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v17.16b}, v29.16b
1804*c0909341SAndroid Build Coastguard Worker
1805*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v0.16b, v26.4b[0]
1806*c0909341SAndroid Build Coastguard Worker        \dot            v7.4s, v1.16b, v26.4b[0]
1807*c0909341SAndroid Build Coastguard Worker        \dot            v22.4s, v2.16b, v26.4b[0]
1808*c0909341SAndroid Build Coastguard Worker        \dot            v23.4s, v3.16b, v26.4b[0]
1809*c0909341SAndroid Build Coastguard Worker        \dot            v6.4s, v1.16b, v26.4b[1]
1810*c0909341SAndroid Build Coastguard Worker        \dot            v7.4s, v2.16b, v26.4b[1]
1811*c0909341SAndroid Build Coastguard Worker        \dot            v22.4s, v3.16b, v26.4b[1]
1812*c0909341SAndroid Build Coastguard Worker        \dot            v23.4s, v4.16b, v26.4b[1]
1813*c0909341SAndroid Build Coastguard Worker
1814*c0909341SAndroid Build Coastguard Worker        uzp1            v6.8h, v6.8h, v7.8h
1815*c0909341SAndroid Build Coastguard Worker        uzp1            v22.8h, v22.8h, v23.8h
1816*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1817*c0909341SAndroid Build Coastguard Worker    .ifc \isa, neon_i8mm
1818*c0909341SAndroid Build Coastguard Worker        srshr           v6.8h, v6.8h, #2
1819*c0909341SAndroid Build Coastguard Worker        srshr           v22.8h, v22.8h, #2
1820*c0909341SAndroid Build Coastguard Worker    .else
1821*c0909341SAndroid Build Coastguard Worker        sshr            v6.8h, v6.8h, #2
1822*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h, v22.8h, #2
1823*c0909341SAndroid Build Coastguard Worker    .endif
1824*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #16
1825*c0909341SAndroid Build Coastguard Worker        stp             q6, q22, [\dst], #32
1826*c0909341SAndroid Build Coastguard Worker.else   // put
1827*c0909341SAndroid Build Coastguard Worker        sqshrun         v6.8b, v6.8h, #6
1828*c0909341SAndroid Build Coastguard Worker        sqshrun2        v6.16b, v22.8h, #6
1829*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #16
1830*c0909341SAndroid Build Coastguard Worker        str             q6, [\dst], #16
1831*c0909341SAndroid Build Coastguard Worker.endif
1832*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1833*c0909341SAndroid Build Coastguard Worker
1834*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1835*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1836*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd
1837*c0909341SAndroid Build Coastguard Worker.endif
1838*c0909341SAndroid Build Coastguard Worker        mov             w8, \w
1839*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1840*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1841*c0909341SAndroid Build Coastguard Worker        ret
1842*c0909341SAndroid Build Coastguard Workerendfunc
1843*c0909341SAndroid Build Coastguard Worker
// Jump table for the _8tap_h_ code of this macro instantiation; the table
// name is built from the type and isa macro arguments.
// Each .word stores the distance from the table label to one of the numeric
// entry labels; the b suffix makes every label reference a backward one.
// NOTE(review): the code that indexes this table is outside this view;
// presumably the entries are ordered from the widest block down to the
// narrowest - confirm against the dispatch sequence before reordering.
1844*c0909341SAndroid Build Coastguard Workerjumptable \type\()_8tap_h_\isa\()_tbl
1845*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_8tap_h_\isa\()_tbl  // 1280, 640 and 320 are stacked on one entry (H - 32xN+)
1846*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_8tap_h_\isa\()_tbl
1847*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_8tap_h_\isa\()_tbl
1848*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_8tap_h_\isa\()_tbl
1849*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_8tap_h_\isa\()_tbl
1850*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_8tap_h_\isa\()_tbl
1851*c0909341SAndroid Build Coastguard Worker.ifc \type, put  // the entry below is assembled only when type is put
1852*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_8tap_h_\isa\()_tbl
1853*c0909341SAndroid Build Coastguard Worker.endif
1854*c0909341SAndroid Build Coastguard Workerendjumptable
1855*c0909341SAndroid Build Coastguard Worker.endm
1856*c0909341SAndroid Build Coastguard Worker
// Instantiations of filter_8tap_fn.
// The first three arguments pick the variant:
//   type - prep or put, tested with .ifc in the macro body and used in the
//          generated symbol names;
//   dot  - the mnemonic substituted where the body issues the dot-product
//          instruction (sdot or usdot);
//   isa  - neon_dotprod or neon_i8mm; the body assembles its usmmla paths
//          only when isa is neon_i8mm.
// The remaining arguments bind the named macro parameters to registers, as
// listed in the comment above each invocation. In every binding xmx and ldst
// are the 64-bit view of the mx register, xmy and lsrc are the 64-bit view of
// the my register, and wd_strd is the 32-bit view of d_strd.

// prep, dot-product variant:
1857*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6),
1858*c0909341SAndroid Build Coastguard Worker// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
1859*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7
1860*c0909341SAndroid Build Coastguard Worker
// put, dot-product variant:
1861*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7),
1862*c0909341SAndroid Build Coastguard Worker// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
1863*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn  put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1
1864*c0909341SAndroid Build Coastguard Worker
// The i8mm instantiations are built only when HAVE_I8MM is non-zero.
// NOTE(review): ENABLE_I8MM and DISABLE_I8MM are defined outside this view;
// presumably they switch the assembler i8mm extension on and off around the
// two instantiations below - confirm against their definitions.
1865*c0909341SAndroid Build Coastguard Worker#if HAVE_I8MM
1866*c0909341SAndroid Build Coastguard WorkerENABLE_I8MM
1867*c0909341SAndroid Build Coastguard Worker
// prep, i8mm variant (same register binding as the dot-product prep above):
1868*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6),
1869*c0909341SAndroid Build Coastguard Worker// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
1870*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7
1871*c0909341SAndroid Build Coastguard Worker
// put, i8mm variant (same register binding as the dot-product put above):
1872*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7),
1873*c0909341SAndroid Build Coastguard Worker// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
1874*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn  put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1
1875*c0909341SAndroid Build Coastguard Worker
1876*c0909341SAndroid Build Coastguard WorkerDISABLE_I8MM
1877*c0909341SAndroid Build Coastguard Worker#endif  // HAVE_I8MM
1878*c0909341SAndroid Build Coastguard Worker
// Closes the HAVE_DOTPROD region; its opening #if is earlier in the file and
// not visible in this view.
1879*c0909341SAndroid Build Coastguard WorkerDISABLE_DOTPROD
1880*c0909341SAndroid Build Coastguard Worker#endif  // HAVE_DOTPROD
1881