xref: /aosp_15_r20/external/libdav1d/src/arm/64/mc16_sve.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, Arm Limited
3*c0909341SAndroid Build Coastguard Worker * All rights reserved.
4*c0909341SAndroid Build Coastguard Worker *
5*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker *
8*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker *
11*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker *
15*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker */
26*c0909341SAndroid Build Coastguard Worker
27*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
28*c0909341SAndroid Build Coastguard Worker#include "util.S"
29*c0909341SAndroid Build Coastguard Worker
// Bias operands written in the `imm8, lsl #8` immediate form, as consumed by
// the `sub z0.h, z0.h, #PREP_BIAS` instructions in the prep paths of this file.
// 224 << 8 = 0xE000, which is -8192 when interpreted as a signed 16-bit value.
30*c0909341SAndroid Build Coastguard Worker#define PREP_BIAS 32, lsl #8        // 8192
31*c0909341SAndroid Build Coastguard Worker#define PREP_BIAS_NEG 224, lsl #8   // -8192
32*c0909341SAndroid Build Coastguard Worker
33*c0909341SAndroid Build Coastguard Worker#if HAVE_SVE2
34*c0909341SAndroid Build Coastguard WorkerENABLE_SVE
35*c0909341SAndroid Build Coastguard WorkerENABLE_SVE2
36*c0909341SAndroid Build Coastguard Worker
37*c0909341SAndroid Build Coastguard Worker// No spaces in these expressions, due to gas-preprocessor. Each index is offset
38*c0909341SAndroid Build Coastguard Worker// by -1 to avoid a negative offset when taking the address of `mc_subpel_filters`.
// Each constant packs two row offsets into one value: bits [13:7] hold the
// first and bits [6:0] the second. The vertical path in this file adds the
// subpel position to both fields at once (`madd` with 0x4081), then separates
// them with `ubfx ..., #7, #7` and `and ..., #0x7F`; the low field is selected
// when h <= 4 and the high field otherwise. The chosen value, scaled by 8, is
// added to the address of `mc_subpel_filters`.
// NOTE(review): 15 is presumably the number of rows per filter set - confirm
// against the definition of `mc_subpel_filters`.
39*c0909341SAndroid Build Coastguard Worker#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
40*c0909341SAndroid Build Coastguard Worker#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
41*c0909341SAndroid Build Coastguard Worker#define SHARP1          (((2*15-1)<<7)|(3*15-1))
42*c0909341SAndroid Build Coastguard Worker
// Alignment arguments used throughout this file: FUNC_ALIGN is passed as
// `align=` to the `function` macro, while JUMP_ALIGN and LOOP_ALIGN are given
// to `.align` ahead of branch targets and loop heads respectively.
// NOTE(review): presumably a power-of-two exponent (2 -> 4 bytes) - confirm
// against the `function` macro in src/arm/asm.S and the assembler in use.
43*c0909341SAndroid Build Coastguard Worker#define FUNC_ALIGN      2
44*c0909341SAndroid Build Coastguard Worker#define JUMP_ALIGN      2
45*c0909341SAndroid Build Coastguard Worker#define LOOP_ALIGN      2
46*c0909341SAndroid Build Coastguard Worker
47*c0909341SAndroid Build Coastguard Worker
48*c0909341SAndroid Build Coastguard Worker// Shuffle indices to permute horizontal samples in preparation for input to
49*c0909341SAndroid Build Coastguard Worker// 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample
50*c0909341SAndroid Build Coastguard Worker// indices in the interval of [-3, 4] relative to the current sample position.
// Layout: four 8-byte index groups. Each group names the bytes of four
// consecutive 16-bit elements, and every successive group starts one element
// (2 bytes) later: elements 0-3, 1-4, 2-5 and 3-6.
51*c0909341SAndroid Build Coastguard Workerconst h_tbl_sve, align=4
52*c0909341SAndroid Build Coastguard Worker        .byte  0,  1,  2,  3,  4,  5,  6,  7,   2,  3,  4,  5,  6,  7,  8,  9
53*c0909341SAndroid Build Coastguard Worker        .byte  4,  5,  6,  7,  8,  9, 10, 11,   6,  7,  8,  9, 10, 11, 12, 13
54*c0909341SAndroid Build Coastguard Workerendconst
55*c0909341SAndroid Build Coastguard Worker
56*c0909341SAndroid Build Coastguard Worker// Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit
57*c0909341SAndroid Build Coastguard Worker// registers contain a transposed 4x4 matrix of values. Subsequent iterations
58*c0909341SAndroid Build Coastguard Worker// of the vertical convolution can reuse the 3x4 sub-matrix from the previous
59*c0909341SAndroid Build Coastguard Worker// loop iteration. These shuffle indices shift and merge this 4x4 matrix with
60*c0909341SAndroid Build Coastguard Worker// the values of a new line.
// Layout: five 16-byte index vectors for a two-register TBL, where indices
// 0-15 select bytes of the first table register and 16-31 bytes of the second.
// Every row drops the first 16-bit element of each 8-byte half (bytes 2-7 and
// 10-15 are kept) and appends one element taken from the second register:
//   row 0:    bytes 16-17 and 24-25, i.e. the first element of each half
//   rows 1-4: elements 0/1, 2/3, 4/5 and 6/7 of the second register
// The vertical path in this file loads row 0 with `ldr q6, [x13]` and rows 1-4
// with `ldp q28, q29, [x13, #16]` and `ldp q30, q31, [x13, #48]`.
61*c0909341SAndroid Build Coastguard Workerconst v_tbl_sve, align=4
62*c0909341SAndroid Build Coastguard Worker        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 24, 25
63*c0909341SAndroid Build Coastguard Worker        .byte  2,  3,  4,  5,  6,  7, 16, 17,  10, 11, 12, 13, 14, 15, 18, 19
64*c0909341SAndroid Build Coastguard Worker        .byte  2,  3,  4,  5,  6,  7, 20, 21,  10, 11, 12, 13, 14, 15, 22, 23
65*c0909341SAndroid Build Coastguard Worker        .byte  2,  3,  4,  5,  6,  7, 24, 25,  10, 11, 12, 13, 14, 15, 26, 27
66*c0909341SAndroid Build Coastguard Worker        .byte  2,  3,  4,  5,  6,  7, 28, 29,  10, 11, 12, 13, 14, 15, 30, 31
67*c0909341SAndroid Build Coastguard Workerendconst
68*c0909341SAndroid Build Coastguard Worker
69*c0909341SAndroid Build Coastguard Worker
// Emits the exported entry point `<op>_8tap_<type>_16bpc_<isa>`, which records
// the requested filter pair in registers and hands over to the shared body
// `<op>_8tap_<isa>`.
//   op      - name prefix shared by the entry point and the body it reaches
//   type    - filter-combination name embedded in the exported symbol
//   type_h  - value moved into x9
//   type_v  - value moved into x10
//   isa     - ISA suffix of both symbol names
//   jump    - 1 (default): end with a branch to the shared body.
//             0: emit no branch, so the function has no terminating
//             instruction and execution continues into whatever is assembled
//             next. In this file the `regular` variant is instantiated this
//             way immediately ahead of the shared body.
70*c0909341SAndroid Build Coastguard Worker.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
71*c0909341SAndroid Build Coastguard Workerfunction \op\()_8tap_\type\()_16bpc_\isa, export=1, align=FUNC_ALIGN
72*c0909341SAndroid Build Coastguard Worker        mov             x9,  \type_h
73*c0909341SAndroid Build Coastguard Worker        mov             x10, \type_v
74*c0909341SAndroid Build Coastguard Worker    .if \jump
75*c0909341SAndroid Build Coastguard Worker        b               \op\()_8tap_\isa
76*c0909341SAndroid Build Coastguard Worker    .endif
77*c0909341SAndroid Build Coastguard Workerendfunc
78*c0909341SAndroid Build Coastguard Worker.endm
79*c0909341SAndroid Build Coastguard Worker
80*c0909341SAndroid Build Coastguard Worker.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd
81*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
82*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
83*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
84*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
85*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
86*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
87*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
88*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
89*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0
90*c0909341SAndroid Build Coastguard Worker
91*c0909341SAndroid Build Coastguard Workerfunction \type\()_8tap_\isa, align=FUNC_ALIGN
92*c0909341SAndroid Build Coastguard Worker        clz             w8, \w
93*c0909341SAndroid Build Coastguard Worker        mov             w11, #0x4081                    // (1<<14) | (1<<7) | 1
94*c0909341SAndroid Build Coastguard Worker        ptrue           p0.b, vl16
95*c0909341SAndroid Build Coastguard Worker        sub             w8, w8, #24                     // for jump tables
96*c0909341SAndroid Build Coastguard Worker        movrel          x12, X(mc_subpel_filters)
97*c0909341SAndroid Build Coastguard Worker        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
98*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
99*c0909341SAndroid Build Coastguard Worker        cbz             \my, prep_sve
100*c0909341SAndroid Build Coastguard Worker.else   // put
101*c0909341SAndroid Build Coastguard Worker        cbnz            \my, L(\type\()_8tap_v_\isa)
102*c0909341SAndroid Build Coastguard Worker        mov             w9, w8
103*c0909341SAndroid Build Coastguard Worker        b               X(put_16bpc_neon)
104*c0909341SAndroid Build Coastguard Worker
105*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
106*c0909341SAndroid Build Coastguard Worker.endif
107*c0909341SAndroid Build Coastguard Worker
108*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_v_\isa):
109*c0909341SAndroid Build Coastguard Worker        madd            \my, \my, w11, w10
110*c0909341SAndroid Build Coastguard Worker        movrel          x13, v_tbl_sve
111*c0909341SAndroid Build Coastguard Worker.ifc \bdmax, w8                                         // put case, but skip
112*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.8h}, [sp]                   // loading into w8
113*c0909341SAndroid Build Coastguard Worker.endif
114*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd             // src - s_strd
115*c0909341SAndroid Build Coastguard Worker        ubfx            w11, \my, #7, #7
116*c0909341SAndroid Build Coastguard Worker        and             \my, \my, #0x7F
117*c0909341SAndroid Build Coastguard Worker        ldr             q6, [x13]
118*c0909341SAndroid Build Coastguard Worker        cmp             \h, #4
119*c0909341SAndroid Build Coastguard Worker        csel            \my, \my, w11, le
120*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd
121*c0909341SAndroid Build Coastguard Worker        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
122*c0909341SAndroid Build Coastguard Worker        ldp             q28, q29, [x13, #16]
123*c0909341SAndroid Build Coastguard Worker        ld1sb           {z7.h}, p0/z, [\xmy]
124*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
125*c0909341SAndroid Build Coastguard Worker        clz             \bdmax, \bdmax
126*c0909341SAndroid Build Coastguard Worker        sub             \bdmax, \bdmax, #24
127*c0909341SAndroid Build Coastguard Worker        dup             v5.4s, \bdmax
128*c0909341SAndroid Build Coastguard Worker.endif
129*c0909341SAndroid Build Coastguard Worker        cmp             \w, #8
130*c0909341SAndroid Build Coastguard Worker        b.lt            40f
131*c0909341SAndroid Build Coastguard Worker
132*c0909341SAndroid Build Coastguard Worker        // .align JUMP_ALIGN   // fallthrough
133*c0909341SAndroid Build Coastguard Worker80:     // V - 8xN+
134*c0909341SAndroid Build Coastguard Worker        ldp             q30, q31, [x13, #48]
135*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
136*c0909341SAndroid Build Coastguard Worker        add             \wd_strd, \w, \w                // d_strd = 2 * w
137*c0909341SAndroid Build Coastguard Worker.endif
138*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
139*c0909341SAndroid Build Coastguard Worker81:
140*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \src, \s_strd, lsl #1
141*c0909341SAndroid Build Coastguard Worker
142*c0909341SAndroid Build Coastguard Worker        ldr             q16, [\src]
143*c0909341SAndroid Build Coastguard Worker        ldr             q17, [\src, \s_strd]
144*c0909341SAndroid Build Coastguard Worker        ldr             q18, [\lsrc]
145*c0909341SAndroid Build Coastguard Worker        ldr             q19, [\lsrc, \s_strd]
146*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd, lsl #1
147*c0909341SAndroid Build Coastguard Worker        mov             \ldst, \dst
148*c0909341SAndroid Build Coastguard Worker
149*c0909341SAndroid Build Coastguard Worker        ldr             q20, [\lsrc]
150*c0909341SAndroid Build Coastguard Worker        ldr             q21, [\lsrc, \s_strd]
151*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd, lsl #1
152*c0909341SAndroid Build Coastguard Worker        ldr             q22, [\lsrc]
153*c0909341SAndroid Build Coastguard Worker        ldr             q23, [\lsrc, \s_strd]
154*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd, lsl #1
155*c0909341SAndroid Build Coastguard Worker        sub             w8, \h, #1
156*c0909341SAndroid Build Coastguard Worker
157*c0909341SAndroid Build Coastguard Worker        zip1            v0.8h, v16.8h, v17.8h
158*c0909341SAndroid Build Coastguard Worker        zip2            v1.8h, v16.8h, v17.8h
159*c0909341SAndroid Build Coastguard Worker        zip1            v2.8h, v18.8h, v19.8h
160*c0909341SAndroid Build Coastguard Worker        zip2            v3.8h, v18.8h, v19.8h
161*c0909341SAndroid Build Coastguard Worker
162*c0909341SAndroid Build Coastguard Worker        zip1            v18.8h, v20.8h, v21.8h
163*c0909341SAndroid Build Coastguard Worker        zip2            v21.8h, v20.8h, v21.8h
164*c0909341SAndroid Build Coastguard Worker        zip1            v24.8h, v22.8h, v23.8h
165*c0909341SAndroid Build Coastguard Worker        zip2            v27.8h, v22.8h, v23.8h
166*c0909341SAndroid Build Coastguard Worker
167*c0909341SAndroid Build Coastguard Worker        zip1            v16.4s, v0.4s, v2.4s
168*c0909341SAndroid Build Coastguard Worker        zip2            v19.4s, v0.4s, v2.4s
169*c0909341SAndroid Build Coastguard Worker        zip1            v22.4s, v1.4s, v3.4s
170*c0909341SAndroid Build Coastguard Worker        zip2            v25.4s, v1.4s, v3.4s
171*c0909341SAndroid Build Coastguard Worker
172*c0909341SAndroid Build Coastguard Worker        zip1            v17.4s, v18.4s, v24.4s
173*c0909341SAndroid Build Coastguard Worker        zip2            v20.4s, v18.4s, v24.4s
174*c0909341SAndroid Build Coastguard Worker        zip1            v23.4s, v21.4s, v27.4s
175*c0909341SAndroid Build Coastguard Worker        zip2            v26.4s, v21.4s, v27.4s
176*c0909341SAndroid Build Coastguard Worker
177*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
178*c0909341SAndroid Build Coastguard Worker8:
179*c0909341SAndroid Build Coastguard Worker        ld1             {v18.16b}, [\lsrc], \s_strd
180*c0909341SAndroid Build Coastguard Worker
181*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
182*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
183*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
184*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
185*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v18.16b
186*c0909341SAndroid Build Coastguard Worker        mov             v24.16b, v18.16b
187*c0909341SAndroid Build Coastguard Worker        mov             v27.16b, v18.16b
188*c0909341SAndroid Build Coastguard Worker
189*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z7.h[0]
190*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
191*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z19.h, z7.h[0]
192*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
193*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z22.h, z7.h[0]
194*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
195*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #1
196*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z25.h, z7.h[0]
197*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b
198*c0909341SAndroid Build Coastguard Worker
199*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z17.h, z7.h[1]
200*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
201*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z20.h, z7.h[1]
202*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
203*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z23.h, z7.h[1]
204*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
205*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z26.h, z7.h[1]
206*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b
207*c0909341SAndroid Build Coastguard Worker
208*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s
209*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v2.4s, v3.4s
210*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
211*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
212*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
213*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
214*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
215*c0909341SAndroid Build Coastguard Worker.else   // put
216*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.4h, v0.4s, #6
217*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v0.8h, v1.4s, #6
218*c0909341SAndroid Build Coastguard Worker        umin            v0.8h, v0.8h, v5.8h
219*c0909341SAndroid Build Coastguard Worker.endif
220*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [\ldst], \d_strd
221*c0909341SAndroid Build Coastguard Worker        b.gt            8b
222*c0909341SAndroid Build Coastguard Worker
223*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
224*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
225*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
226*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
227*c0909341SAndroid Build Coastguard Worker
228*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z7.h[0]
229*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z19.h, z7.h[0]
230*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z22.h, z7.h[0]
231*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z25.h, z7.h[0]
232*c0909341SAndroid Build Coastguard Worker
233*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z17.h, z7.h[1]
234*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z20.h, z7.h[1]
235*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z23.h, z7.h[1]
236*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z26.h, z7.h[1]
237*c0909341SAndroid Build Coastguard Worker        subs            \w, \w, #8
238*c0909341SAndroid Build Coastguard Worker
239*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s
240*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v2.4s, v3.4s
241*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
242*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
243*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
244*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
245*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
246*c0909341SAndroid Build Coastguard Worker.else   // put
247*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.4h, v0.4s, #6
248*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v0.8h, v1.4s, #6
249*c0909341SAndroid Build Coastguard Worker        umin            v0.8h, v0.8h, v5.8h
250*c0909341SAndroid Build Coastguard Worker.endif
251*c0909341SAndroid Build Coastguard Worker        str             q0, [\ldst]
252*c0909341SAndroid Build Coastguard Worker
253*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
254*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #16
255*c0909341SAndroid Build Coastguard Worker        b.gt            81b
256*c0909341SAndroid Build Coastguard Worker        ret
257*c0909341SAndroid Build Coastguard Worker
258*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
259*c0909341SAndroid Build Coastguard Worker40:     // V - 4xN, put only: 2xN
260*c0909341SAndroid Build Coastguard Worker.ifc \type, put
261*c0909341SAndroid Build Coastguard Worker        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
262*c0909341SAndroid Build Coastguard Worker        whilelt         p1.h, wzr, \w               // masking for writes
263*c0909341SAndroid Build Coastguard Worker.endif
264*c0909341SAndroid Build Coastguard Worker        cmp             \h, #4
265*c0909341SAndroid Build Coastguard Worker        b.le            44f
266*c0909341SAndroid Build Coastguard Worker
267*c0909341SAndroid Build Coastguard Worker        ldr             d16, [\src]
268*c0909341SAndroid Build Coastguard Worker        ldr             d17, [\src, \s_strd]
269*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
270*c0909341SAndroid Build Coastguard Worker        ldr             d18, [\src]
271*c0909341SAndroid Build Coastguard Worker        ldr             d19, [\src, \s_strd]
272*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
273*c0909341SAndroid Build Coastguard Worker
274*c0909341SAndroid Build Coastguard Worker        ldr             d20, [\src]
275*c0909341SAndroid Build Coastguard Worker        ldr             d21, [\src, \s_strd]
276*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
277*c0909341SAndroid Build Coastguard Worker        ldr             d22, [\src]
278*c0909341SAndroid Build Coastguard Worker        ldr             d23, [\src, \s_strd]
279*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
280*c0909341SAndroid Build Coastguard Worker        sub             \h, \h, #2
281*c0909341SAndroid Build Coastguard Worker
282*c0909341SAndroid Build Coastguard Worker        zip1            v0.8h, v16.8h, v17.8h
283*c0909341SAndroid Build Coastguard Worker        zip1            v2.8h, v18.8h, v19.8h
284*c0909341SAndroid Build Coastguard Worker        zip1            v18.8h, v20.8h, v21.8h
285*c0909341SAndroid Build Coastguard Worker        zip1            v24.8h, v22.8h, v23.8h
286*c0909341SAndroid Build Coastguard Worker
287*c0909341SAndroid Build Coastguard Worker        zip1            v16.4s, v0.4s, v2.4s
288*c0909341SAndroid Build Coastguard Worker        zip2            v19.4s, v0.4s, v2.4s
289*c0909341SAndroid Build Coastguard Worker        zip1            v17.4s, v18.4s, v24.4s
290*c0909341SAndroid Build Coastguard Worker        zip2            v20.4s, v18.4s, v24.4s
291*c0909341SAndroid Build Coastguard Worker
292*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
293*c0909341SAndroid Build Coastguard Worker4:
294*c0909341SAndroid Build Coastguard Worker        ldr             d18, [\src]
295*c0909341SAndroid Build Coastguard Worker        ldr             d24, [\src, \s_strd]
296*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
297*c0909341SAndroid Build Coastguard Worker
298*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
299*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
300*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
301*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
302*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v18.16b
303*c0909341SAndroid Build Coastguard Worker        mov             v27.16b, v24.16b
304*c0909341SAndroid Build Coastguard Worker
305*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z7.h[0]
306*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
307*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z19.h, z7.h[0]
308*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
309*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z17.h, z7.h[1]
310*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
311*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z20.h, z7.h[1]
312*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
313*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
314*c0909341SAndroid Build Coastguard Worker
315*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z22.h, z7.h[0]
316*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
317*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z25.h, z7.h[0]
318*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
319*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z23.h, z7.h[1]
320*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
321*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z26.h, z7.h[1]
322*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b
323*c0909341SAndroid Build Coastguard Worker
324*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s
325*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v2.4s, v3.4s
326*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
327*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
328*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
329*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
330*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
331*c0909341SAndroid Build Coastguard Worker        str             q0, [\dst], #16
332*c0909341SAndroid Build Coastguard Worker.else   // put
333*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.4h, v0.4s, #6
334*c0909341SAndroid Build Coastguard Worker        sqrshrun        v1.4h, v1.4s, #6
335*c0909341SAndroid Build Coastguard Worker        umin            v0.4h, v0.4h, v5.4h
336*c0909341SAndroid Build Coastguard Worker        umin            v1.4h, v1.4h, v5.4h
337*c0909341SAndroid Build Coastguard Worker        st1h            {z0.h}, p1, [\dst]
338*c0909341SAndroid Build Coastguard Worker        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
339*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #2
340*c0909341SAndroid Build Coastguard Worker.endif
341*c0909341SAndroid Build Coastguard Worker        b.gt            4b
342*c0909341SAndroid Build Coastguard Worker
343*c0909341SAndroid Build Coastguard Worker        ldr             d18, [\src]
344*c0909341SAndroid Build Coastguard Worker
345*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
346*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
347*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
348*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
349*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v18.16b
350*c0909341SAndroid Build Coastguard Worker
351*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z7.h[0]
352*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
353*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z19.h, z7.h[0]
354*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
355*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z17.h, z7.h[1]
356*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
357*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z20.h, z7.h[1]
358*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
359*c0909341SAndroid Build Coastguard Worker
360*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z22.h, z7.h[0]
361*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z25.h, z7.h[0]
362*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z23.h, z7.h[1]
363*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z26.h, z7.h[1]
364*c0909341SAndroid Build Coastguard Worker
365*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s
366*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v2.4s, v3.4s
367*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
368*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
369*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
370*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
371*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
372*c0909341SAndroid Build Coastguard Worker        str             q0, [\dst]
373*c0909341SAndroid Build Coastguard Worker.else   // put
374*c0909341SAndroid Build Coastguard Worker        sqrshrun        v0.4h, v0.4s, #6
375*c0909341SAndroid Build Coastguard Worker        sqrshrun        v1.4h, v1.4s, #6
376*c0909341SAndroid Build Coastguard Worker        umin            v0.4h, v0.4h, v5.4h
377*c0909341SAndroid Build Coastguard Worker        umin            v1.4h, v1.4h, v5.4h
378*c0909341SAndroid Build Coastguard Worker        st1h            {z0.h}, p1, [\dst]
379*c0909341SAndroid Build Coastguard Worker        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
380*c0909341SAndroid Build Coastguard Worker.endif
381*c0909341SAndroid Build Coastguard Worker        ret
382*c0909341SAndroid Build Coastguard Worker
// ---------------------------------------------------------------------------
// 44: vertical-only 4-tap filter for blocks up to 4 pixels wide and 4 rows
// high (prep: 4x4; put: 4x4, 4x2, 2x4, 2x2).
//
// Four source rows (4 x 16-bit pixels each) are loaded and transposed so that
// every 64-bit lane holds one pixel column.  An indexed SDOT then multiplies
// the four rows of a column with the four taps selected by z7.h[0] and sums
// them into a 64-bit lane.  Each stage emits two output rows; `42:` is the
// last stage and, for put, also the entry point when h == 2.
//
// Set up outside this excerpt (not visible here, TODO confirm against the
// V prologue): z7 = vertical taps, v28/v29 = TBL index vectors,
// v5 = shift counts (prep) / pixel clamp (put), p1 = store predicate (put).
// ---------------------------------------------------------------------------
        .align JUMP_ALIGN
44:     // V - 4x4, put only: 4x2, 2x4, 2x2
        add             \src, \src, \s_strd, lsl #1     // src - s_strd
        subs            \h, \h, #2                      // flags consumed by `b.eq 42f` below

        // Rows 0..3, 8 bytes (4 pixels) each.
        ldr             d16, [\src]
        ldr             d17, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
        ldr             d18, [\src]
        ldr             d19, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        // Rotate by 2 halfwords so that z7.h[0] selects taps 2..5.
        ext             v7.16b, v7.16b, v7.16b, #4      // [\xmy + 2 * 2]

        // 4x4 transpose: v16 = columns 0 and 1, v19 = columns 2 and 3,
        // each column occupying one 64-bit lane (rows 0..3).
        zip1            v0.8h, v16.8h, v17.8h
        zip1            v2.8h, v18.8h, v19.8h
        zip1            v16.4s, v0.4s, v2.4s
        zip2            v19.4s, v0.4s, v2.4s

.ifc \type, put
        b.eq            42f                             // h == 2: only the final stage is needed
.endif
        // Rows 4 and 5.
        ldr             d17, [\src]
        ldr             d23, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        // The two-register TBL lists below need consecutive registers,
        // hence the copies of the new rows next to v19 and v25.
        mov             v20.16b, v17.16b
        mov             v26.16b, v23.16b

        // z0/z1: output row 0, z2/z3: output row 1.  The TBLs presumably
        // slide every column window down by one row, pulling the new pixel
        // from the second list register (index vectors are defined elsewhere).
        sdot            z0.d, z16.h, z7.h[0]
        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
        sdot            z1.d, z19.h, z7.h[0]
        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b
        sdot            z2.d, z22.h, z7.h[0]
        tbl             v16.16b, {v22.16b, v23.16b}, v28.16b    // window for the next stage
        sdot            z3.d, z25.h, z7.h[0]
        tbl             v19.16b, {v25.16b, v26.16b}, v29.16b    // window for the next stage

        // Keep the low 32 bits of every 64-bit sum: v0 = row 0, v1 = row 1.
        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s             // rounding shift by the counts in v5
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h             // narrow both rows into one q register
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst], #16                 // two rows of 4 x 16 bit, advance dst
.else   // put
        sqrshrun        v0.4h, v0.4s, #6                // round, shift, saturate to unsigned 16 bit
        sqrshrun        v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h             // clamp against v5
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]              // only lanes enabled in p1 are written
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]     // d_strd is used as a halfword index here
        add             \dst, \dst, \d_strd, lsl #2     // advance two rows (halfword index * 4 bytes)
.endif

.ifc \type, put
        .align JUMP_ALIGN
42:
.endif
        // Final stage: one more source row yields the last two output rows.
        ldr             d17, [\src]

        movi            v0.2d, #0
        movi            v1.2d, #0
        movi            v2.2d, #0
        movi            v3.2d, #0
        mov             v20.16b, v17.16b                // consecutive-register copy for {v19, v20}

        sdot            z0.d, z16.h, z7.h[0]
        tbl             v22.16b, {v16.16b, v17.16b}, v28.16b
        sdot            z1.d, z19.h, z7.h[0]
        tbl             v25.16b, {v19.16b, v20.16b}, v29.16b

        sdot            z2.d, z22.h, z7.h[0]
        sdot            z3.d, z25.h, z7.h[0]

        uzp1            v0.4s, v0.4s, v1.4s
        uzp1            v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl           v0.4s, v0.4s, v5.4s
        srshl           v1.4s, v1.4s, v5.4s
        uzp1            v0.8h, v0.8h, v1.8h
        sub             z0.h, z0.h, #PREP_BIAS
        str             q0, [\dst]
.else   // put
        sqrshrun        v0.4h, v0.4s, #6
        sqrshrun        v1.4h, v1.4s, #6
        umin            v0.4h, v0.4h, v5.4h
        umin            v1.4h, v1.4h, v5.4h
        st1h            {z0.h}, p1, [\dst]
        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
.endif
        ret
480*c0909341SAndroid Build Coastguard Worker
// ---------------------------------------------------------------------------
// L(\type\()_8tap_h_hv_\isa): common entry for the H-only and the HV filters.
//
// The prologue selects the horizontal filter, loads the TBL index vectors
// q30/q31 from h_tbl_sve and, if \my is zero, continues in the H-only code.
// Otherwise it also selects and loads the vertical filter, derives the
// bit-depth dependent shift vectors, saves the return address in x15 (the
// row helpers are called with `bl`, which overwrites x30) and dispatches:
//   vertical type != SHARP1 -> L(\type\()_6tap_hv_\isa)
//   w <= 4                  -> 40f
//   h <= 4                  -> 84f (outside this excerpt)
//   otherwise               -> 8-tap HV over 8-pixel wide column strips
//
// Register state after the HV prologue:
//   z4  = horizontal taps, sign-extended to 16 bit
//   z7  = vertical taps, sign-extended to 16 bit
//   v5  = clz(bdmax) - 24   (srshl count applied to the horizontal pass)
//   v6  = 12 - clz(bdmax)   (put only: srshl count applied to the final sum)
//   v29 = bdmax             (put only: clamp)
//   x15 = return address
// w9, w10, w11, x12 and p0 are inputs prepared before this label (not
// visible in this excerpt).
// ---------------------------------------------------------------------------
        .align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
        madd            \mx, \mx, w11, w9           // mx = mx * w11 + w9
        movrel          x13, h_tbl_sve
        sub             \src, \src, #6              // src - 3 * 2
        ubfx            w9, \mx, #7, #7             // bits [13:7] of the product
        and             \mx, \mx, #0x7F             // bits [6:0] of the product
        cmp             \w, #4
        csel            \mx, \mx, w9, le            // w <= 4 takes the low field, else the high one
        ldp             q30, q31, [x13]             // TBL index vectors for the horizontal pass
        add             \xmx, x12, \xmx, lsl #3     // subpel H filter address
        cbz             \my, L(\type\()_8tap_h_\isa)    // no vertical filtering requested

        // HV cases
        madd            w14, \my, w11, w10          // same packing scheme as for mx above
.ifc \bdmax, w8
        ldr             \bdmax, [sp]                // in this configuration bdmax is read from the stack
.endif
        ubfx            w11, w14, #7, #7
        and             w14, w14, #0x7F
        ld1sb           {z4.h}, p0/z, [\xmx]        // H taps: signed bytes widened to halfwords
        cmp             \h, #4
        csel            w14, w14, w11, le           // h <= 4 takes the low field, else the high one
.ifc \type, put
        dup             v29.8h, \bdmax              // clamp for the final put result
.endif
        clz             \bdmax, \bdmax
        add             \xmy, x12, x14, lsl #3      // subpel V filter address
        ld1sb           {z7.h}, p0/z, [\xmy]        // V taps: signed bytes widened to halfwords
.ifc \type, put
        mov             w9, #12
        sub             w9, w9, \bdmax              // 12 - clz(bdmax)
        dup             v6.4s, w9
.endif
        sub             \bdmax, \bdmax, #24         // clz(bdmax) - 24
        mov             x15, x30                    // keep the return address across `bl`
        sub             \src, \src, \s_strd         // src - s_strd - 3 * 2
        dup             v5.4s, \bdmax
        cmp             w10, SHARP1
        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1

        // HV 8-tap cases
        cmp             \w, #4
        b.le            40f

        // .align JUMP_ALIGN    // fallthrough
80:     // HV8 - 8xN+
.ifc \type, prep
        add             \wd_strd, \w, \w                // d_strd = 2 * w
.endif
        cmp             \h, #4
        b.le            84f
        sub             \src, \src, \s_strd, lsl #1     // src - 3 * s_strd - 3 * 2

        // Outer loop: one 8-pixel wide column strip per iteration.
        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h                          // row counter for this strip

        // Prime the vertical window with seven horizontally filtered rows.
        // Each helper call leaves its result in v23/v24 (32-bit lanes), which
        // is narrowed to one row of eight halfwords.  The helper is defined
        // elsewhere; it presumably advances \lsrc by one row per call.
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v16.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v17.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v18.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v19.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v20.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v21.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v22.8h, v23.8h, v24.8h

        // Inner loop: one output row per iteration.  The vertical sum over
        // the buffered rows v16..v22 (taps v7.h[0..6]) is interleaved with the
        // horizontal filtering of the next source row.  That new row becomes
        // v22 and supplies the last tap (v7.h[7]).  The window is shifted by
        // one row (v16 <- v17, ..., v21 <- v22) as the old rows are consumed.
        .align LOOP_ALIGN
8:
        ldp             q24, q28, [\lsrc]               // source bytes 0..15 and 16..31
        smull           v0.4s, v16.4h, v7.h[0]
        smull2          v1.4s, v16.8h, v7.h[0]
        mov             v16.16b, v17.16b

        movi            v2.2d, #0
        movi            v3.2d, #0
        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b

        ldur            q26, [\lsrc, #8]                // source bytes 8..23
        smlal           v0.4s, v17.4h, v7.h[1]
        smlal2          v1.4s, v17.8h, v7.h[1]
        mov             v17.16b, v18.16b
        add             \lsrc, \lsrc, \s_strd

        // Horizontal pass: z4.h[0] and z4.h[1] each select a group of four taps.
        // z2/z3 accumulate the first half of the row, z23/z24 the second half.
        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]
        movi            v23.2d, #0
        movi            v24.2d, #0
        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal2          v1.4s, v18.8h, v7.h[2]
        mov             v18.16b, v19.16b

        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]
        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        smlal           v0.4s, v19.4h, v7.h[3]
        smlal2          v1.4s, v19.8h, v7.h[3]
        mov             v19.16b, v20.16b

        subs            w8, w8, #1                      // flags consumed by `b.gt 8b` below
        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal2          v1.4s, v20.8h, v7.h[4]
        mov             v20.16b, v21.16b

        // Keep the low 32 bits of each 64-bit horizontal sum.
        uzp1            v3.4s, v2.4s, v3.4s
        uzp1            v24.4s, v23.4s, v24.4s
        smlal           v0.4s, v21.4h, v7.h[5]
        smlal2          v1.4s, v21.8h, v7.h[5]
        mov             v21.16b, v22.16b

        srshl           v23.4s, v3.4s, v5.4s            // a negative count is a rounding right shift
        srshl           v24.4s, v24.4s, v5.4s
        smlal           v0.4s, v22.4h, v7.h[6]          // still the previous newest row
        smlal2          v1.4s, v22.8h, v7.h[6]

        uzp1            v22.8h, v23.8h, v24.8h          // freshly filtered row enters the window
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]

.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        srshl           v0.4s, v0.4s, v6.4s
        srshl           v1.4s, v1.4s, v6.4s
        sqxtun          v0.4h, v0.4s                    // saturate to unsigned 16 bit
        sqxtun2         v0.8h, v1.4s
        umin            v0.8h, v0.8h, v29.8h            // clamp to bdmax
.endif
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b

        // Next strip: 8 pixels = 16 bytes to the right.
        subs            \w, \w, #8
        add             \src, \src, #16
        add             \dst, \dst, #16
        b.gt            81b
        ret             x15
636*c0909341SAndroid Build Coastguard Worker
// ---------------------------------------------------------------------------
// 40: HV filter with 8 vertical taps for narrow blocks (prep: 4xN; put: 4xN
// and 2xN).  Reached from the HV prologue when w <= 4; blocks with h <= 4
// continue at 44f, which lies outside this excerpt.
//
// The horizontal pass uses only four taps: v4 is rotated by two halfwords so
// that z4.h[0] selects taps 2..5, and \src is moved two pixels to the right
// to match.  Seven horizontally filtered rows prime v16..v22; the loop then
// emits one row per iteration.
//
// Uses v5, v6, v7, v29, v30, v31 and x15 as prepared by the HV prologue.
// ---------------------------------------------------------------------------
        .align JUMP_ALIGN
40:     // HV8 - 4xN, put only: 2xN
.ifc \type, put
        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
        whilelt         p1.h, wzr, \w               // masking for writes
.endif
        ext             v4.16b, v4.16b, v4.16b, #4  // H taps rotated by 2: z4.h[0] = taps 2..5
        add             \src, \src, #4              // skip the two pixels of the dropped taps

        cmp             \h, #4
        b.le            44f

        sub             \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 1 * 2
        // Prime the vertical window.  Each helper call returns one filtered
        // row in v0.4s, narrowed to four halfwords.  The helper is defined
        // elsewhere; it presumably advances \src by one row per call.
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v16.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v17.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v18.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v19.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v20.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v21.4h, v0.4s
        bl              L(\type\()_hv_filter4_\isa)
        xtn             v22.4h, v0.4s

        // One output row per iteration: the vertical sum is accumulated in
        // v24 while the next source row is filtered horizontally into v0.
        .align LOOP_ALIGN
4:
        ld1             {v3.16b}, [\src], \s_strd       // next source row, advance src

        smull           v24.4s, v16.4h, v7.h[0]
        smlal           v24.4s, v17.4h, v7.h[1]
        tbl             v2.16b, {v3.16b}, v30.16b
        tbl             v3.16b, {v3.16b}, v31.16b
        movi            v0.2d, #0
        movi            v1.2d, #0
        mov             v16.16b, v17.16b                // shift the row window by one
        mov             v17.16b, v18.16b

        smlal           v24.4s, v18.4h, v7.h[2]
        smlal           v24.4s, v19.4h, v7.h[3]
        sdot            z0.d, z2.h, z4.h[0]             // 4-tap horizontal filter
        sdot            z1.d, z3.h, z4.h[0]
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        uzp1            v0.4s, v0.4s, v1.4s             // low 32 bits of each 64-bit sum

        smlal           v24.4s, v20.4h, v7.h[4]
        smlal           v24.4s, v21.4h, v7.h[5]
        srshl           v0.4s, v0.4s, v5.4s             // a negative count is a rounding right shift
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b

        subs            \h, \h, #1                      // flags consumed by `b.gt 4b` below
        smlal           v24.4s, v22.4h, v7.h[6]         // still the previous newest row
        xtn             v22.4h, v0.4s                   // freshly filtered row enters the window
        smlal           v24.4s, v22.4h, v7.h[7]

.ifc \type, prep
        rshrn           v0.4h, v24.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
        str             d0, [\dst], #8                  // 4 x 16 bit, advance dst
.else   // put
        srshl           v0.4s, v24.4s, v6.4s
        sqxtun          v0.4h, v0.4s                    // saturate to unsigned 16 bit
        umin            v0.4h, v0.4h, v29.4h            // clamp to bdmax
        st1h            {z0.h}, p1, [\dst]              // only the first w lanes are written
        add             \dst, \dst, \d_strd, lsl #1     // d_strd was halved above; back to bytes
.endif
        b.gt            4b
        ret             x15
710*c0909341SAndroid Build Coastguard Worker
// ---------------------------------------------------------------------------
// L(\type\()_6tap_hv_\isa): HV filter with a 6-tap vertical pass, entered
// from the HV prologue when the vertical filter type is not SHARP1.
//
// Dispatch: w <= 4 -> 46f and h <= 4 -> 84f (both outside this excerpt);
// otherwise the 8xN+ code below processes 8-pixel wide column strips.
// Only v7.h[1..6] are applied vertically; taps 0 and 7 are skipped
// (presumably zero for these filter types - TODO confirm against the
// filter table).
//
// Uses v4, v5, v6, v7, v29, v30, v31 and x15 as prepared by the HV prologue.
// ---------------------------------------------------------------------------
        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp             \w, #4
        b.le            46f

        // .align JUMP_ALIGN    // fallthrough
80:     // HV6 - 8xN+
.ifc \type, prep
        add             \wd_strd, \w, \w        // d_strd = 2 * w
.endif
        cmp             \h, #4
        b.le            84f
        sub             \src, \src, \s_strd     // src - 2 * s_strd - 3 * 2

        // Outer loop: one 8-pixel wide column strip per iteration.
        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h                  // row counter for this strip

        // Prime the vertical window with five horizontally filtered rows.
        // Each helper call leaves its result in v23/v24 (32-bit lanes), which
        // is narrowed to one row of eight halfwords.  The helper is defined
        // elsewhere; it presumably advances \lsrc by one row per call.
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v16.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v17.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v18.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v19.8h, v23.8h, v24.8h
        bl              L(\type\()_hv_filter8_\isa)
        uzp1            v20.8h, v23.8h, v24.8h

        // Inner loop: one output row per iteration.  The vertical sum over
        // the buffered rows v16..v20 (taps v7.h[1..5]) is interleaved with the
        // horizontal filtering of the next source row.  That new row becomes
        // v20 and supplies the last tap (v7.h[6]).  The window is shifted by
        // one row (v16 <- v17, ..., v19 <- v20) as the old rows are consumed.
        .align LOOP_ALIGN
8:
        ldp             q24, q28, [\lsrc]       // source bytes 0..15 and 16..31

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
        mov             v16.16b, v17.16b

        tbl             v23.16b, {v24.16b}, v30.16b
        tbl             v24.16b, {v24.16b}, v31.16b
        movi            v2.2d, #0
        movi            v3.2d, #0

        ldur            q26, [\lsrc, #8]        // source bytes 8..23
        add             \lsrc, \lsrc, \s_strd

        // Horizontal pass: z4.h[0] and z4.h[1] each select a group of four taps.
        // z2/z3 accumulate the first half of the row, z23/z24 the second half.
        sdot            z2.d, z23.h, z4.h[0]
        sdot            z3.d, z24.h, z4.h[0]
        tbl             v25.16b, {v26.16b}, v30.16b
        tbl             v26.16b, {v26.16b}, v31.16b
        movi            v23.2d, #0
        movi            v24.2d, #0

        sdot            z23.d, z25.h, z4.h[0]
        sdot            z24.d, z26.h, z4.h[0]
        tbl             v27.16b, {v28.16b}, v30.16b
        tbl             v28.16b, {v28.16b}, v31.16b
        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b

        sdot            z2.d, z25.h, z4.h[1]
        sdot            z3.d, z26.h, z4.h[1]
        sdot            z23.d, z27.h, z4.h[1]
        sdot            z24.d, z28.h, z4.h[1]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        // Keep the low 32 bits of each 64-bit horizontal sum.
        uzp1            v3.4s, v2.4s, v3.4s
        uzp1            v24.4s, v23.4s, v24.4s
        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b

        srshl           v23.4s, v3.4s, v5.4s    // a negative count is a rounding right shift
        srshl           v24.4s, v24.4s, v5.4s
        smlal           v0.4s, v20.4h, v7.h[5]  // still the previous newest row
        smlal2          v1.4s, v20.8h, v7.h[5]

        subs            w8, w8, #1              // flags consumed by `b.gt 8b` below
        uzp1            v20.8h, v23.8h, v24.8h  // freshly filtered row enters the window
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]

.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        sub             z0.h, z0.h, #PREP_BIAS
.else   // put
        srshl           v0.4s, v0.4s, v6.4s
        srshl           v1.4s, v1.4s, v6.4s
        sqxtun          v0.4h, v0.4s            // saturate to unsigned 16 bit
        sqxtun2         v0.8h, v1.4s
        umin            v0.8h, v0.8h, v29.8h    // clamp to bdmax
.endif
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b

        // Next strip: 8 pixels = 16 bytes to the right.
        add             \dst, \dst, #16
        subs            \w, \w, #8
        add             \src, \src, #16
        b.gt            81b
        ret             x15
817*c0909341SAndroid Build Coastguard Worker
818*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
819*c0909341SAndroid Build Coastguard Worker84:     // HV4 - 8x4, 8x2
        // Row filter followed by a 4-tap column filter, processed in strips
        // of 8 output pixels. Every strip restarts from \src/\dst with its
        // own row counter (w8 = \h). The three most recent row-filtered rows
        // live in v17/v18/v19 and are combined with v7.h[2..5].
        // v4, v5, v6, v7, v29, v30 and v31 are set up outside this block.
        // Returns through x15, not x30 (x30 is overwritten by the `bl` below).
820*c0909341SAndroid Build Coastguard Worker        mov             \lsrc, \src
821*c0909341SAndroid Build Coastguard Worker        mov             \ldst, \dst
822*c0909341SAndroid Build Coastguard Worker        mov             w8, \h
823*c0909341SAndroid Build Coastguard Worker
        // Prime the window: each helper call leaves one row as 2 x 4 32-bit
        // values in v23/v24; uzp1 keeps the low halfword of every lane.
824*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
825*c0909341SAndroid Build Coastguard Worker        uzp1            v17.8h, v23.8h, v24.8h
826*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
827*c0909341SAndroid Build Coastguard Worker        uzp1            v18.8h, v23.8h, v24.8h
828*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter8_\isa)
829*c0909341SAndroid Build Coastguard Worker        uzp1            v19.8h, v23.8h, v24.8h
830*c0909341SAndroid Build Coastguard Worker
831*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
832*c0909341SAndroid Build Coastguard Worker81:
        // One output row per iteration. The row filter for the new input row
        // is interleaved with the column filter over the rows already held.
833*c0909341SAndroid Build Coastguard Worker        ldp             q24, q28, [\lsrc]               // bytes 0..15 and 16..31 of the row
834*c0909341SAndroid Build Coastguard Worker        ldur            q26, [\lsrc, #8]                // bytes 8..23 (overlapping window)
835*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd
836*c0909341SAndroid Build Coastguard Worker
        // tbl rearranges each 16-byte window with the index vectors v30/v31.
        // z2/z3 accumulate the first four outputs, z23/z24 the next four:
        // window N against z4.h[0] plus window N+8 bytes against z4.h[1].
837*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v24.16b}, v30.16b
838*c0909341SAndroid Build Coastguard Worker        tbl             v24.16b, {v24.16b}, v31.16b
839*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
840*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
841*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z23.h, z4.h[0]
842*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z24.h, z4.h[0]
843*c0909341SAndroid Build Coastguard Worker
844*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v26.16b}, v30.16b
845*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v26.16b}, v31.16b
846*c0909341SAndroid Build Coastguard Worker        movi            v23.2d, #0                      // v23/v24 are reused as accumulators from here
847*c0909341SAndroid Build Coastguard Worker        movi            v24.2d, #0
848*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z25.h, z4.h[0]
849*c0909341SAndroid Build Coastguard Worker        sdot            z24.d, z26.h, z4.h[0]
850*c0909341SAndroid Build Coastguard Worker
851*c0909341SAndroid Build Coastguard Worker        tbl             v27.16b, {v28.16b}, v30.16b
852*c0909341SAndroid Build Coastguard Worker        tbl             v28.16b, {v28.16b}, v31.16b
853*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z25.h, z4.h[1]
854*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z26.h, z4.h[1]
855*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z27.h, z4.h[1]
856*c0909341SAndroid Build Coastguard Worker        sdot            z24.d, z28.h, z4.h[1]
857*c0909341SAndroid Build Coastguard Worker
        // Column filter: v0 (low 4 pixels) / v1 (high 4 pixels) start from the
        // oldest row; the row window slides down by one as each row is consumed.
858*c0909341SAndroid Build Coastguard Worker        smull           v0.4s, v17.4h, v7.h[2]
859*c0909341SAndroid Build Coastguard Worker        smull2          v1.4s, v17.8h, v7.h[2]
860*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
861*c0909341SAndroid Build Coastguard Worker
862*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #1                      // flags are consumed by `b.gt 81b` below
863*c0909341SAndroid Build Coastguard Worker        uzp1            v3.4s, v2.4s, v3.4s             // low 32 bits of each 64-bit accumulator
864*c0909341SAndroid Build Coastguard Worker        uzp1            v24.4s, v23.4s, v24.4s
865*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v18.4h, v7.h[3]
866*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v18.8h, v7.h[3]
867*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
868*c0909341SAndroid Build Coastguard Worker
869*c0909341SAndroid Build Coastguard Worker        srshl           v23.4s, v3.4s, v5.4s            // rounding shift by the per-lane amounts in v5
870*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s, v24.4s, v5.4s
871*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[4]          // still the previous row in v19
872*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v19.8h, v7.h[4]
873*c0909341SAndroid Build Coastguard Worker
874*c0909341SAndroid Build Coastguard Worker        uzp1            v19.8h, v23.8h, v24.8h          // v19 = newly filtered row, 8 x 16-bit
875*c0909341SAndroid Build Coastguard Worker        smlal           v0.4s, v19.4h, v7.h[5]
876*c0909341SAndroid Build Coastguard Worker        smlal2          v1.4s, v19.8h, v7.h[5]
877*c0909341SAndroid Build Coastguard Worker
        // prep: rounding narrow by 6 bits, then subtract PREP_BIAS.
        // put:  rounding shift by v6, saturate to unsigned 16-bit, clamp to v29.
878*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
879*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v0.4s, #6
880*c0909341SAndroid Build Coastguard Worker        rshrn2          v0.8h, v1.4s, #6
881*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
882*c0909341SAndroid Build Coastguard Worker.else   // put
883*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v6.4s
884*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v6.4s
885*c0909341SAndroid Build Coastguard Worker        sqxtun          v0.4h, v0.4s
886*c0909341SAndroid Build Coastguard Worker        sqxtun2         v0.8h, v1.4s
887*c0909341SAndroid Build Coastguard Worker        umin            v0.8h, v0.8h, v29.8h
888*c0909341SAndroid Build Coastguard Worker.endif
889*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [\ldst], \d_strd       // store 8 halfwords, step to the next output row
890*c0909341SAndroid Build Coastguard Worker        b.gt            81b
891*c0909341SAndroid Build Coastguard Worker
        // Next strip: 8 pixels of 16 bits each = 16 bytes to the right.
892*c0909341SAndroid Build Coastguard Worker        subs            \w, \w, #8
893*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
894*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #16
895*c0909341SAndroid Build Coastguard Worker        b.gt            84b
896*c0909341SAndroid Build Coastguard Worker        ret             x15
897*c0909341SAndroid Build Coastguard Worker
898*c0909341SAndroid Build Coastguard Worker        .align FUNC_ALIGN
899*c0909341SAndroid Build Coastguard WorkerL(\type\()_hv_filter8_\isa):
        // Local helper (called with `bl`, returns through x30): row-filters
        // one 8-pixel row.
        // In:     \lsrc = row to read (32 bytes are loaded), \s_strd = stride,
        //         z4 = coefficients, v5 = shift amounts, v30/v31 = tbl indices.
        // Out:    v23.4s / v24.4s = the eight results after the v5 shift;
        //         \lsrc advanced by \s_strd.
        // Writes: v2, v3, v23-v28.
900*c0909341SAndroid Build Coastguard Worker        ldp             q24, q28, [\lsrc]               // bytes 0..15 and 16..31 of the row
901*c0909341SAndroid Build Coastguard Worker        ldur            q26, [\lsrc, #8]                // bytes 8..23 (overlapping window)
902*c0909341SAndroid Build Coastguard Worker        add             \lsrc, \lsrc, \s_strd
903*c0909341SAndroid Build Coastguard Worker
        // z2/z3: window at +0 against z4.h[0], later window at +8 against z4.h[1].
904*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v24.16b}, v30.16b
905*c0909341SAndroid Build Coastguard Worker        tbl             v24.16b, {v24.16b}, v31.16b
906*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
907*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
908*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z23.h, z4.h[0]
909*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z24.h, z4.h[0]
910*c0909341SAndroid Build Coastguard Worker
        // z23/z24: window at +8 against z4.h[0], later window at +16 against z4.h[1].
911*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v26.16b}, v30.16b
912*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v26.16b}, v31.16b
913*c0909341SAndroid Build Coastguard Worker        movi            v23.2d, #0                      // shuffled inputs were consumed; reuse as accumulators
914*c0909341SAndroid Build Coastguard Worker        movi            v24.2d, #0
915*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z25.h, z4.h[0]
916*c0909341SAndroid Build Coastguard Worker        sdot            z24.d, z26.h, z4.h[0]
917*c0909341SAndroid Build Coastguard Worker
918*c0909341SAndroid Build Coastguard Worker        tbl             v27.16b, {v28.16b}, v30.16b
919*c0909341SAndroid Build Coastguard Worker        tbl             v28.16b, {v28.16b}, v31.16b
920*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z25.h, z4.h[1]
921*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z26.h, z4.h[1]
922*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z27.h, z4.h[1]
923*c0909341SAndroid Build Coastguard Worker        sdot            z24.d, z28.h, z4.h[1]
924*c0909341SAndroid Build Coastguard Worker
925*c0909341SAndroid Build Coastguard Worker        uzp1            v3.4s, v2.4s, v3.4s             // low 32 bits of each 64-bit accumulator
926*c0909341SAndroid Build Coastguard Worker        uzp1            v24.4s, v23.4s, v24.4s
927*c0909341SAndroid Build Coastguard Worker        srshl           v23.4s, v3.4s, v5.4s            // rounding shift by the per-lane amounts in v5
928*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s, v24.4s, v5.4s
929*c0909341SAndroid Build Coastguard Worker        ret
930*c0909341SAndroid Build Coastguard Worker
931*c0909341SAndroid Build Coastguard Worker        .align FUNC_ALIGN
932*c0909341SAndroid Build Coastguard WorkerL(\type\()_hv_filter4_\isa):
        // Local helper (called with `bl`, returns through x30): row-filters
        // one 4-pixel row using only the coefficient group z4.h[0].
        // In:     \src = row to read (16 bytes), \s_strd = stride, z4 = coefficients,
        //         v5 = shift amounts, v30/v31 = tbl indices.
        // Out:    v0.4s = four results after the v5 shift; \src advanced by \s_strd.
        // Writes: v0-v3.
933*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [\src], \s_strd       // load 16 bytes, post-increment to the next row
934*c0909341SAndroid Build Coastguard Worker
935*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v3.16b}, v30.16b
936*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v3.16b}, v31.16b
937*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
938*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
939*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z2.h, z4.h[0]
940*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z3.h, z4.h[0]
941*c0909341SAndroid Build Coastguard Worker
942*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s             // low 32 bits of each 64-bit accumulator
943*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s             // rounding shift by the per-lane amounts in v5
944*c0909341SAndroid Build Coastguard Worker        ret
945*c0909341SAndroid Build Coastguard Worker
946*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
947*c0909341SAndroid Build Coastguard Worker46:     // H4V6 - 4xN, put only: 2xN
        // 4-tap row filter followed by a 6-tap column filter (v7.h[1..6]).
        // Rows with \h <= 4 are handed to the 4-tap column variant at 44f,
        // which reuses the setup done here (p1, halved \d_strd, rotated v4,
        // adjusted \src). Returns through x15.
948*c0909341SAndroid Build Coastguard Worker.ifc \type, put
949*c0909341SAndroid Build Coastguard Worker        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
950*c0909341SAndroid Build Coastguard Worker        whilelt         p1.h, wzr, \w               // masking for writes
951*c0909341SAndroid Build Coastguard Worker.endif
        // The ext rotates v4 by two halfwords, so z4.h[0] selects what were
        // lanes 2..5. NOTE(review): v4 feeds the row sdot (the column filter
        // uses v7), so the original `\xmy` below presumably means `\xmx` - confirm.
952*c0909341SAndroid Build Coastguard Worker        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmy + 2 * 2]
953*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #4
954*c0909341SAndroid Build Coastguard Worker
955*c0909341SAndroid Build Coastguard Worker        cmp             \h, #4
956*c0909341SAndroid Build Coastguard Worker        b.le            44f
957*c0909341SAndroid Build Coastguard Worker
958*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd         // src - 2 * s_strd - 3 * 2
        // Prime the column window with five row-filtered rows, truncated to
        // 16 bits: v16 (oldest) .. v20 (newest).
959*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
960*c0909341SAndroid Build Coastguard Worker        xtn             v16.4h, v0.4s
961*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
962*c0909341SAndroid Build Coastguard Worker        xtn             v17.4h, v0.4s
963*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
964*c0909341SAndroid Build Coastguard Worker        xtn             v18.4h, v0.4s
965*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
966*c0909341SAndroid Build Coastguard Worker        xtn             v19.4h, v0.4s
967*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
968*c0909341SAndroid Build Coastguard Worker        xtn             v20.4h, v0.4s
969*c0909341SAndroid Build Coastguard Worker
970*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
971*c0909341SAndroid Build Coastguard Worker4:
        // One output row per iteration: the row filter of the next input row
        // (same steps as the hv_filter4 helper, inlined) is interleaved with
        // the column accumulation in v24 and the sliding of v16..v20.
972*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [\src], \s_strd
973*c0909341SAndroid Build Coastguard Worker        smull           v24.4s, v16.4h, v7.h[1]
974*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v17.4h, v7.h[2]
975*c0909341SAndroid Build Coastguard Worker
976*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v3.16b}, v30.16b
977*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v3.16b}, v31.16b
978*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
979*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
980*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z2.h, z4.h[0]
981*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z3.h, z4.h[0]
982*c0909341SAndroid Build Coastguard Worker
983*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
984*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
985*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v18.4h, v7.h[3]
986*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v19.4h, v7.h[4]
987*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s         // low 32 bits of each 64-bit accumulator
988*c0909341SAndroid Build Coastguard Worker
989*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
990*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v20.16b
991*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1                  // flags are consumed by `b.gt 4b` below
992*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
993*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v20.4h, v7.h[5]     // v20 is still the previous newest row
994*c0909341SAndroid Build Coastguard Worker        xtn             v20.4h, v0.4s               // v20 = row filtered in this iteration
995*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v20.4h, v7.h[6]
996*c0909341SAndroid Build Coastguard Worker
        // prep: rounding narrow by 6 bits, subtract PREP_BIAS, store 4 halfwords
        //       contiguously.
        // put:  rounding shift by v6, saturate to unsigned 16-bit, clamp to v29,
        //       store the lanes enabled in p1; \d_strd was halved above, hence
        //       `lsl #1` to step one row in bytes.
997*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
998*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v24.4s, #6
999*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
1000*c0909341SAndroid Build Coastguard Worker        str             d0, [\dst], #8
1001*c0909341SAndroid Build Coastguard Worker.else   // put
1002*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v24.4s, v6.4s
1003*c0909341SAndroid Build Coastguard Worker        sqxtun          v0.4h, v0.4s
1004*c0909341SAndroid Build Coastguard Worker        umin            v0.4h, v0.4h, v29.4h
1005*c0909341SAndroid Build Coastguard Worker        st1h            {z0.h}, p1, [\dst]
1006*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
1007*c0909341SAndroid Build Coastguard Worker.endif
1008*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1009*c0909341SAndroid Build Coastguard Worker        ret             x15
1010*c0909341SAndroid Build Coastguard Worker
1011*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1012*c0909341SAndroid Build Coastguard Worker44:     // H4V4 - 4x4, put only: 4x2, 2x4, 2x2
        // 4-tap row filter followed by a 4-tap column filter (v7.h[2..5]).
        // The put path below uses p1 and a halved \d_strd, and the row filter
        // uses the rotated v4; none of these are set in this block. The entry
        // visible in this part of the file is the `b.le 44f` from the H4V6
        // path, which prepares them. Returns through x15.
        // Prime the column window with three row-filtered rows: v17 (oldest) .. v19.
1013*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1014*c0909341SAndroid Build Coastguard Worker        xtn             v17.4h, v0.4s
1015*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1016*c0909341SAndroid Build Coastguard Worker        xtn             v18.4h, v0.4s
1017*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_hv_filter4_\isa)
1018*c0909341SAndroid Build Coastguard Worker        xtn             v19.4h, v0.4s
1019*c0909341SAndroid Build Coastguard Worker
1020*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1021*c0909341SAndroid Build Coastguard Worker4:
        // One output row per iteration: row-filter the next input row (same
        // steps as the hv_filter4 helper, inlined) while accumulating the
        // column filter in v24 and sliding v17..v19.
1022*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [\src], \s_strd
1023*c0909341SAndroid Build Coastguard Worker        smull           v24.4s, v17.4h, v7.h[2]
1024*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v18.4h, v7.h[3]
1025*c0909341SAndroid Build Coastguard Worker
1026*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v3.16b}, v30.16b
1027*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v3.16b}, v31.16b
1028*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
1029*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
1030*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z2.h, z4.h[0]
1031*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z3.h, z4.h[0]
1032*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s         // low 32 bits of each 64-bit accumulator
1033*c0909341SAndroid Build Coastguard Worker
1034*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v18.16b
1035*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v19.16b
1036*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1                  // flags are consumed by `b.gt 4b` below
1037*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
1038*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v19.4h, v7.h[4]     // v19 is still the previous newest row
1039*c0909341SAndroid Build Coastguard Worker        xtn             v19.4h, v0.4s               // v19 = row filtered in this iteration
1040*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s, v19.4h, v7.h[5]
1041*c0909341SAndroid Build Coastguard Worker
        // prep: rounding narrow by 6 bits, subtract PREP_BIAS, store 4 halfwords.
        // put:  rounding shift by v6, saturate to unsigned 16-bit, clamp to v29,
        //       predicated store, then step one row (\d_strd is in halfwords).
1042*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1043*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h, v24.4s, #6
1044*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
1045*c0909341SAndroid Build Coastguard Worker        str             d0, [\dst], #8
1046*c0909341SAndroid Build Coastguard Worker.else   // put
1047*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v24.4s, v6.4s
1048*c0909341SAndroid Build Coastguard Worker        sqxtun          v0.4h, v0.4s
1049*c0909341SAndroid Build Coastguard Worker        umin            v0.4h, v0.4h, v29.4h
1050*c0909341SAndroid Build Coastguard Worker        st1h            {z0.h}, p1, [\dst]
1051*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #1
1052*c0909341SAndroid Build Coastguard Worker.endif
1053*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1054*c0909341SAndroid Build Coastguard Worker        ret             x15
1055*c0909341SAndroid Build Coastguard Worker
1056*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1057*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_h_\isa):
        // Entry of the row-only (H) filter: sets up the shift/rounding/clamp
        // constants and the coefficients, then jumps to a width-specific
        // label through the offset table \type\()_8tap_h_\isa\()_tbl.
        // In:  x8 = table index, \xmx = address of the byte coefficients,
        //      p0 = load predicate (set up outside this block).
        // Out: z4 = coefficients sign-extended to 16 bits;
        //      prep: v5.4s = clz(\bdmax) - 24, used as the srshl amount;
        //      put:  v5.8h = \bdmax (clamp), v6.2d = rounding bias 34 or 40.
1058*c0909341SAndroid Build Coastguard Worker        movrel          x11, \type\()_8tap_h_\isa\()_tbl
1059*c0909341SAndroid Build Coastguard Worker        ldrsw           x12, [x11, x8, lsl #2]      // signed 32-bit table entry, added to the table base below
        // When \bdmax is w8 it is read from the stack here, after x8 has been
        // used as the table index above.
1060*c0909341SAndroid Build Coastguard Worker.ifc \bdmax, w8
1061*c0909341SAndroid Build Coastguard Worker        ldr             \bdmax, [sp]
1062*c0909341SAndroid Build Coastguard Worker.endif
1063*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1064*c0909341SAndroid Build Coastguard Worker        clz             \bdmax, \bdmax
1065*c0909341SAndroid Build Coastguard Worker        sub             \bdmax, \bdmax, #24         // -2 for 0x3FF, -4 for 0xFFF: srshl then shifts right
1066*c0909341SAndroid Build Coastguard Worker        dup             v5.4s, \bdmax
1067*c0909341SAndroid Build Coastguard Worker.else   // put
1068*c0909341SAndroid Build Coastguard Worker        mov             w9, #34             // rounding for 10-bit case
1069*c0909341SAndroid Build Coastguard Worker        mov             w10, #40            // rounding for 12-bit case
1070*c0909341SAndroid Build Coastguard Worker        cmp             \bdmax, #0xFFF
1071*c0909341SAndroid Build Coastguard Worker        csel            w9, w9, w10, ne     // select rounding based on \bdmax
1072*c0909341SAndroid Build Coastguard Worker        dup             v5.8h, \bdmax       // clamp value for umin in the put paths
1073*c0909341SAndroid Build Coastguard Worker        dup             v6.2d, x9           // bias copied into the 64-bit sdot accumulators
1074*c0909341SAndroid Build Coastguard Worker.endif
1075*c0909341SAndroid Build Coastguard Worker        add             x11, x11, x12
1076*c0909341SAndroid Build Coastguard Worker        ld1sb           {z4.h}, p0/z, [\xmx]        // bytes sign-extended to halfwords
1077*c0909341SAndroid Build Coastguard Worker        br              x11
1078*c0909341SAndroid Build Coastguard Worker
1079*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1080*c0909341SAndroid Build Coastguard Worker20:     // H - 4xN, put only: 2xN
1081*c0909341SAndroid Build Coastguard Worker40:
        // Row-only filter with 4 taps, two rows per iteration. Reached by the
        // `br x11` of the H dispatcher; returns through x30 with a plain `ret`.
        // Uses z4, v5 and (put only) v6 as prepared by the dispatcher, and
        // v30/v31 as tbl index vectors set up outside this block.
1082*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1083*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #4              // src - 1 * 2
1084*c0909341SAndroid Build Coastguard Worker        ext             v4.16b, v4.16b, v4.16b, #4  // [\xmx + 2 * 2]: z4.h[0] now selects taps 2..5
1085*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1086*c0909341SAndroid Build Coastguard Worker        lsr             \d_strd, \d_strd, #1        // hword index for `st1h`
1087*c0909341SAndroid Build Coastguard Worker        whilelt         p1.h, wzr, \w               // masking for writes
1088*c0909341SAndroid Build Coastguard Worker.endif
1089*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1090*c0909341SAndroid Build Coastguard Worker4:
1091*c0909341SAndroid Build Coastguard Worker        ldr             q17, [\src]                 // row 0
1092*c0909341SAndroid Build Coastguard Worker        ldr             q19, [\src, \s_strd]        // row 1
1093*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd, lsl #1
1094*c0909341SAndroid Build Coastguard Worker
        // Accumulators start at zero for prep and at the rounding bias (v6)
        // for put, where the final shift is the non-rounding sqshrun.
1095*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1096*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
1097*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
1098*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
1099*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
1100*c0909341SAndroid Build Coastguard Worker.else
1101*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v6.16b
1102*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v6.16b
1103*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v6.16b
1104*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v6.16b
1105*c0909341SAndroid Build Coastguard Worker.endif
1106*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v17.16b}, v30.16b
1107*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v17.16b}, v31.16b
1108*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z4.h[0]
1109*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z17.h, z4.h[0]
1110*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2                  // flags are consumed by `b.gt 4b` below
1111*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v19.16b}, v30.16b
1112*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v19.16b}, v31.16b
1113*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z18.h, z4.h[0]
1114*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z19.h, z4.h[0]
1115*c0909341SAndroid Build Coastguard Worker
1116*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v1.4s         // row 0: low 32 bits of each accumulator
1117*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v2.4s, v3.4s         // row 1
        // prep: rounding shift by v5, pack both rows into 8 halfwords, subtract
        //       PREP_BIAS, store 16 contiguous bytes.
        // put:  shift right by 6 with unsigned saturation, clamp to v5 (\bdmax),
        //       store the lanes enabled in p1 to two rows; \d_strd is in
        //       halfwords, so `lsl #2` advances \dst by two rows in bytes.
1118*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1119*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
1120*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
1121*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v1.8h
1122*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
1123*c0909341SAndroid Build Coastguard Worker        str             q0, [\dst], #16
1124*c0909341SAndroid Build Coastguard Worker.else   // put
1125*c0909341SAndroid Build Coastguard Worker        sqshrun         v0.4h, v0.4s, #6
1126*c0909341SAndroid Build Coastguard Worker        sqshrun         v1.4h, v1.4s, #6
1127*c0909341SAndroid Build Coastguard Worker        umin            v0.4h, v0.4h, v5.4h
1128*c0909341SAndroid Build Coastguard Worker        umin            v1.4h, v1.4h, v5.4h
1129*c0909341SAndroid Build Coastguard Worker        st1h            {z0.h}, p1, [\dst]
1130*c0909341SAndroid Build Coastguard Worker        st1h            {z1.h}, p1, [\dst, \d_strd, lsl #1]
1131*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd, lsl #2
1132*c0909341SAndroid Build Coastguard Worker.endif
1133*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1134*c0909341SAndroid Build Coastguard Worker        ret
1135*c0909341SAndroid Build Coastguard Worker
1136*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1137*c0909341SAndroid Build Coastguard Worker80:     // H - 8xN
1138*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1139*c0909341SAndroid Build Coastguard Worker
1140*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1141*c0909341SAndroid Build Coastguard Worker8:
1142*c0909341SAndroid Build Coastguard Worker        ldp             q17, q21, [\src]
1143*c0909341SAndroid Build Coastguard Worker        ldur            q19, [\src, #8]
1144*c0909341SAndroid Build Coastguard Worker
1145*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1146*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
1147*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
1148*c0909341SAndroid Build Coastguard Worker.else
1149*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v6.16b
1150*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v6.16b
1151*c0909341SAndroid Build Coastguard Worker.endif
1152*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v17.16b}, v30.16b
1153*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v17.16b}, v31.16b
1154*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1155*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z4.h[0]
1156*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z17.h, z4.h[0]
1157*c0909341SAndroid Build Coastguard Worker
1158*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v19.16b}, v30.16b
1159*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v19.16b}, v31.16b
1160*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1161*c0909341SAndroid Build Coastguard Worker        movi            v16.2d, #0
1162*c0909341SAndroid Build Coastguard Worker        movi            v17.2d, #0
1163*c0909341SAndroid Build Coastguard Worker.else
1164*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v6.16b
1165*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v6.16b
1166*c0909341SAndroid Build Coastguard Worker.endif
1167*c0909341SAndroid Build Coastguard Worker        ldp             q23, q27, [\src]
1168*c0909341SAndroid Build Coastguard Worker        ldur            q25, [\src, #8]
1169*c0909341SAndroid Build Coastguard Worker
1170*c0909341SAndroid Build Coastguard Worker        sdot            z16.d, z18.h, z4.h[0]
1171*c0909341SAndroid Build Coastguard Worker        sdot            z17.d, z19.h, z4.h[0]
1172*c0909341SAndroid Build Coastguard Worker
1173*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v23.16b}, v30.16b
1174*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v23.16b}, v31.16b
1175*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1176*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
1177*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
1178*c0909341SAndroid Build Coastguard Worker.else
1179*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v6.16b
1180*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v6.16b
1181*c0909341SAndroid Build Coastguard Worker.endif
1182*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1183*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z22.h, z4.h[0]
1184*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z23.h, z4.h[0]
1185*c0909341SAndroid Build Coastguard Worker
1186*c0909341SAndroid Build Coastguard Worker        tbl             v24.16b, {v25.16b}, v30.16b
1187*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v25.16b}, v31.16b
1188*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1189*c0909341SAndroid Build Coastguard Worker        movi            v22.2d, #0
1190*c0909341SAndroid Build Coastguard Worker        movi            v23.2d, #0
1191*c0909341SAndroid Build Coastguard Worker.else
1192*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v6.16b
1193*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v6.16b
1194*c0909341SAndroid Build Coastguard Worker.endif
1195*c0909341SAndroid Build Coastguard Worker        sdot            z22.d, z24.h, z4.h[0]
1196*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z25.h, z4.h[0]
1197*c0909341SAndroid Build Coastguard Worker
1198*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v21.16b}, v30.16b
1199*c0909341SAndroid Build Coastguard Worker        tbl             v21.16b, {v21.16b}, v31.16b
1200*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z18.h, z4.h[1]
1201*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z19.h, z4.h[1]
1202*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v27.16b}, v30.16b
1203*c0909341SAndroid Build Coastguard Worker        tbl             v27.16b, {v27.16b}, v31.16b
1204*c0909341SAndroid Build Coastguard Worker        sdot            z16.d, z20.h, z4.h[1]
1205*c0909341SAndroid Build Coastguard Worker        sdot            z17.d, z21.h, z4.h[1]
1206*c0909341SAndroid Build Coastguard Worker
1207*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z24.h, z4.h[1]
1208*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z25.h, z4.h[1]
1209*c0909341SAndroid Build Coastguard Worker
1210*c0909341SAndroid Build Coastguard Worker        sdot            z22.d, z26.h, z4.h[1]
1211*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z27.h, z4.h[1]
1212*c0909341SAndroid Build Coastguard Worker
1213*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #2
1214*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v2.4s
1215*c0909341SAndroid Build Coastguard Worker        uzp1            v2.4s, v16.4s, v17.4s
1216*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v1.4s, v3.4s
1217*c0909341SAndroid Build Coastguard Worker        uzp1            v3.4s, v22.4s, v23.4s
1218*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1219*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
1220*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s, v2.4s, v5.4s
1221*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
1222*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s, v3.4s, v5.4s
1223*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v2.8h
1224*c0909341SAndroid Build Coastguard Worker        uzp1            v1.8h, v1.8h, v3.8h
1225*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
1226*c0909341SAndroid Build Coastguard Worker        sub             z1.h, z1.h, #PREP_BIAS
1227*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [\dst], #32
1228*c0909341SAndroid Build Coastguard Worker.else   // put
1229*c0909341SAndroid Build Coastguard Worker        sqshrun         v0.4h, v0.4s, #6
1230*c0909341SAndroid Build Coastguard Worker        sqshrun2        v0.8h, v2.4s, #6
1231*c0909341SAndroid Build Coastguard Worker        sqshrun         v1.4h, v1.4s, #6
1232*c0909341SAndroid Build Coastguard Worker        sqshrun2        v1.8h, v3.4s, #6
1233*c0909341SAndroid Build Coastguard Worker        umin            v0.8h, v0.8h, v5.8h
1234*c0909341SAndroid Build Coastguard Worker        umin            v1.8h, v1.8h, v5.8h
1235*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [\dst], \d_strd
1236*c0909341SAndroid Build Coastguard Worker        st1             {v1.16b}, [\dst], \d_strd
1237*c0909341SAndroid Build Coastguard Worker.endif
1238*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1239*c0909341SAndroid Build Coastguard Worker        ret
1240*c0909341SAndroid Build Coastguard Worker
1241*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1242*c0909341SAndroid Build Coastguard Worker160:    // H - 16xN
1243*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1244*c0909341SAndroid Build Coastguard Worker
1245*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1246*c0909341SAndroid Build Coastguard Worker16:
1247*c0909341SAndroid Build Coastguard Worker        ldp             q17, q21, [\src]
1248*c0909341SAndroid Build Coastguard Worker        ldur            q19, [\src, #8]
1249*c0909341SAndroid Build Coastguard Worker
1250*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1251*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
1252*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
1253*c0909341SAndroid Build Coastguard Worker.else
1254*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v6.16b
1255*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v6.16b
1256*c0909341SAndroid Build Coastguard Worker.endif
1257*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v17.16b}, v30.16b
1258*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v17.16b}, v31.16b
1259*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z4.h[0]
1260*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z17.h, z4.h[0]
1261*c0909341SAndroid Build Coastguard Worker
1262*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v19.16b}, v30.16b
1263*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v19.16b}, v31.16b
1264*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1265*c0909341SAndroid Build Coastguard Worker        movi            v16.2d, #0
1266*c0909341SAndroid Build Coastguard Worker        movi            v17.2d, #0
1267*c0909341SAndroid Build Coastguard Worker.else
1268*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v6.16b
1269*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v6.16b
1270*c0909341SAndroid Build Coastguard Worker.endif
1271*c0909341SAndroid Build Coastguard Worker        ldur            q25, [\src, #24]
1272*c0909341SAndroid Build Coastguard Worker        ldr             q27, [\src, #32]
1273*c0909341SAndroid Build Coastguard Worker
1274*c0909341SAndroid Build Coastguard Worker        sdot            z16.d, z18.h, z4.h[0]
1275*c0909341SAndroid Build Coastguard Worker        sdot            z17.d, z19.h, z4.h[0]
1276*c0909341SAndroid Build Coastguard Worker
1277*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v21.16b}, v30.16b
1278*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v21.16b}, v31.16b
1279*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1280*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
1281*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
1282*c0909341SAndroid Build Coastguard Worker.else
1283*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v6.16b
1284*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v6.16b
1285*c0909341SAndroid Build Coastguard Worker.endif
1286*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1287*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z22.h, z4.h[0]
1288*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z23.h, z4.h[0]
1289*c0909341SAndroid Build Coastguard Worker
1290*c0909341SAndroid Build Coastguard Worker        tbl             v24.16b, {v25.16b}, v30.16b
1291*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v25.16b}, v31.16b
1292*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1293*c0909341SAndroid Build Coastguard Worker        movi            v22.2d, #0
1294*c0909341SAndroid Build Coastguard Worker        movi            v23.2d, #0
1295*c0909341SAndroid Build Coastguard Worker.else
1296*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v6.16b
1297*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v6.16b
1298*c0909341SAndroid Build Coastguard Worker.endif
1299*c0909341SAndroid Build Coastguard Worker        sdot            z22.d, z24.h, z4.h[0]
1300*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z25.h, z4.h[0]
1301*c0909341SAndroid Build Coastguard Worker
1302*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v21.16b}, v30.16b
1303*c0909341SAndroid Build Coastguard Worker        tbl             v21.16b, {v21.16b}, v31.16b
1304*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z18.h, z4.h[1]
1305*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z19.h, z4.h[1]
1306*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v27.16b}, v30.16b
1307*c0909341SAndroid Build Coastguard Worker        tbl             v27.16b, {v27.16b}, v31.16b
1308*c0909341SAndroid Build Coastguard Worker        sdot            z16.d, z20.h, z4.h[1]
1309*c0909341SAndroid Build Coastguard Worker        sdot            z17.d, z21.h, z4.h[1]
1310*c0909341SAndroid Build Coastguard Worker
1311*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z24.h, z4.h[1]
1312*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z25.h, z4.h[1]
1313*c0909341SAndroid Build Coastguard Worker
1314*c0909341SAndroid Build Coastguard Worker        sdot            z22.d, z26.h, z4.h[1]
1315*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z27.h, z4.h[1]
1316*c0909341SAndroid Build Coastguard Worker
1317*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1318*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v2.4s
1319*c0909341SAndroid Build Coastguard Worker        uzp1            v2.4s, v16.4s, v17.4s
1320*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v1.4s, v3.4s
1321*c0909341SAndroid Build Coastguard Worker        uzp1            v3.4s, v22.4s, v23.4s
1322*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1323*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
1324*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s, v2.4s, v5.4s
1325*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
1326*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s, v3.4s, v5.4s
1327*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v2.8h
1328*c0909341SAndroid Build Coastguard Worker        uzp1            v1.8h, v1.8h, v3.8h
1329*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
1330*c0909341SAndroid Build Coastguard Worker        sub             z1.h, z1.h, #PREP_BIAS
1331*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [\dst], #32
1332*c0909341SAndroid Build Coastguard Worker.else   // put
1333*c0909341SAndroid Build Coastguard Worker        sqshrun         v0.4h, v0.4s, #6
1334*c0909341SAndroid Build Coastguard Worker        sqshrun2        v0.8h, v2.4s, #6
1335*c0909341SAndroid Build Coastguard Worker        sqshrun         v1.4h, v1.4s, #6
1336*c0909341SAndroid Build Coastguard Worker        sqshrun2        v1.8h, v3.4s, #6
1337*c0909341SAndroid Build Coastguard Worker        umin            v0.8h, v0.8h, v5.8h
1338*c0909341SAndroid Build Coastguard Worker        umin            v1.8h, v1.8h, v5.8h
1339*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [\dst], \d_strd
1340*c0909341SAndroid Build Coastguard Worker.endif
1341*c0909341SAndroid Build Coastguard Worker        b.gt            16b
1342*c0909341SAndroid Build Coastguard Worker        ret
1343*c0909341SAndroid Build Coastguard Worker
1344*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1345*c0909341SAndroid Build Coastguard Worker320:    // H - 32xN+
1346*c0909341SAndroid Build Coastguard Worker640:
1347*c0909341SAndroid Build Coastguard Worker1280:
1348*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1349*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1350*c0909341SAndroid Build Coastguard Worker        sub             \d_strd, \d_strd, \w, uxtw #1
1351*c0909341SAndroid Build Coastguard Worker.endif
1352*c0909341SAndroid Build Coastguard Worker        sub             \s_strd, \s_strd, \w, uxtw #1
1353*c0909341SAndroid Build Coastguard Worker        mov             w8, \w
1354*c0909341SAndroid Build Coastguard Worker
1355*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1356*c0909341SAndroid Build Coastguard Worker32:
1357*c0909341SAndroid Build Coastguard Worker        ldp             q17, q21, [\src]
1358*c0909341SAndroid Build Coastguard Worker        ldur            q19, [\src, #8]
1359*c0909341SAndroid Build Coastguard Worker
1360*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1361*c0909341SAndroid Build Coastguard Worker        movi            v0.2d, #0
1362*c0909341SAndroid Build Coastguard Worker        movi            v2.2d, #0
1363*c0909341SAndroid Build Coastguard Worker.else
1364*c0909341SAndroid Build Coastguard Worker        mov             v0.16b, v6.16b
1365*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v6.16b
1366*c0909341SAndroid Build Coastguard Worker.endif
1367*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v17.16b}, v30.16b
1368*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v17.16b}, v31.16b
1369*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z16.h, z4.h[0]
1370*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z17.h, z4.h[0]
1371*c0909341SAndroid Build Coastguard Worker
1372*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v19.16b}, v30.16b
1373*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v19.16b}, v31.16b
1374*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1375*c0909341SAndroid Build Coastguard Worker        movi            v16.2d, #0
1376*c0909341SAndroid Build Coastguard Worker        movi            v17.2d, #0
1377*c0909341SAndroid Build Coastguard Worker.else
1378*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v6.16b
1379*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v6.16b
1380*c0909341SAndroid Build Coastguard Worker.endif
1381*c0909341SAndroid Build Coastguard Worker        ldur            q25, [\src, #24]
1382*c0909341SAndroid Build Coastguard Worker
1383*c0909341SAndroid Build Coastguard Worker        sdot            z16.d, z18.h, z4.h[0]
1384*c0909341SAndroid Build Coastguard Worker        sdot            z17.d, z19.h, z4.h[0]
1385*c0909341SAndroid Build Coastguard Worker
1386*c0909341SAndroid Build Coastguard Worker        ldr             q27, [\src, #32]!
1387*c0909341SAndroid Build Coastguard Worker
1388*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v21.16b}, v30.16b
1389*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v21.16b}, v31.16b
1390*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1391*c0909341SAndroid Build Coastguard Worker        movi            v1.2d, #0
1392*c0909341SAndroid Build Coastguard Worker        movi            v3.2d, #0
1393*c0909341SAndroid Build Coastguard Worker.else
1394*c0909341SAndroid Build Coastguard Worker        mov             v1.16b, v6.16b
1395*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v6.16b
1396*c0909341SAndroid Build Coastguard Worker.endif
1397*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z22.h, z4.h[0]
1398*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z23.h, z4.h[0]
1399*c0909341SAndroid Build Coastguard Worker
1400*c0909341SAndroid Build Coastguard Worker        tbl             v24.16b, {v25.16b}, v30.16b
1401*c0909341SAndroid Build Coastguard Worker        tbl             v25.16b, {v25.16b}, v31.16b
1402*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1403*c0909341SAndroid Build Coastguard Worker        movi            v22.2d, #0
1404*c0909341SAndroid Build Coastguard Worker        movi            v23.2d, #0
1405*c0909341SAndroid Build Coastguard Worker.else
1406*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v6.16b
1407*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v6.16b
1408*c0909341SAndroid Build Coastguard Worker.endif
1409*c0909341SAndroid Build Coastguard Worker        sdot            z22.d, z24.h, z4.h[0]
1410*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z25.h, z4.h[0]
1411*c0909341SAndroid Build Coastguard Worker
1412*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v21.16b}, v30.16b
1413*c0909341SAndroid Build Coastguard Worker        tbl             v21.16b, {v21.16b}, v31.16b
1414*c0909341SAndroid Build Coastguard Worker        sdot            z0.d, z18.h, z4.h[1]
1415*c0909341SAndroid Build Coastguard Worker        sdot            z2.d, z19.h, z4.h[1]
1416*c0909341SAndroid Build Coastguard Worker        tbl             v26.16b, {v27.16b}, v30.16b
1417*c0909341SAndroid Build Coastguard Worker        tbl             v27.16b, {v27.16b}, v31.16b
1418*c0909341SAndroid Build Coastguard Worker        sdot            z16.d, z20.h, z4.h[1]
1419*c0909341SAndroid Build Coastguard Worker        sdot            z17.d, z21.h, z4.h[1]
1420*c0909341SAndroid Build Coastguard Worker
1421*c0909341SAndroid Build Coastguard Worker        sdot            z1.d, z24.h, z4.h[1]
1422*c0909341SAndroid Build Coastguard Worker        sdot            z3.d, z25.h, z4.h[1]
1423*c0909341SAndroid Build Coastguard Worker
1424*c0909341SAndroid Build Coastguard Worker        sdot            z22.d, z26.h, z4.h[1]
1425*c0909341SAndroid Build Coastguard Worker        sdot            z23.d, z27.h, z4.h[1]
1426*c0909341SAndroid Build Coastguard Worker
1427*c0909341SAndroid Build Coastguard Worker        subs            w8, w8, #16
1428*c0909341SAndroid Build Coastguard Worker        uzp1            v0.4s, v0.4s, v2.4s
1429*c0909341SAndroid Build Coastguard Worker        uzp1            v2.4s, v16.4s, v17.4s
1430*c0909341SAndroid Build Coastguard Worker        uzp1            v1.4s, v1.4s, v3.4s
1431*c0909341SAndroid Build Coastguard Worker        uzp1            v3.4s, v22.4s, v23.4s
1432*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1433*c0909341SAndroid Build Coastguard Worker        srshl           v0.4s, v0.4s, v5.4s
1434*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s, v2.4s, v5.4s
1435*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s, v1.4s, v5.4s
1436*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s, v3.4s, v5.4s
1437*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h, v0.8h, v2.8h
1438*c0909341SAndroid Build Coastguard Worker        uzp1            v1.8h, v1.8h, v3.8h
1439*c0909341SAndroid Build Coastguard Worker        sub             z0.h, z0.h, #PREP_BIAS
1440*c0909341SAndroid Build Coastguard Worker        sub             z1.h, z1.h, #PREP_BIAS
1441*c0909341SAndroid Build Coastguard Worker.else   // put
1442*c0909341SAndroid Build Coastguard Worker        sqshrun         v0.4h, v0.4s, #6
1443*c0909341SAndroid Build Coastguard Worker        sqshrun2        v0.8h, v2.4s, #6
1444*c0909341SAndroid Build Coastguard Worker        sqshrun         v1.4h, v1.4s, #6
1445*c0909341SAndroid Build Coastguard Worker        sqshrun2        v1.8h, v3.4s, #6
1446*c0909341SAndroid Build Coastguard Worker        umin            v0.8h, v0.8h, v5.8h
1447*c0909341SAndroid Build Coastguard Worker        umin            v1.8h, v1.8h, v5.8h
1448*c0909341SAndroid Build Coastguard Worker.endif
1449*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [\dst], #32
1450*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1451*c0909341SAndroid Build Coastguard Worker
1452*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \s_strd
1453*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1454*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, \d_strd
1455*c0909341SAndroid Build Coastguard Worker.endif
1456*c0909341SAndroid Build Coastguard Worker        subs            \h, \h, #1
1457*c0909341SAndroid Build Coastguard Worker        mov             w8, \w
1458*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1459*c0909341SAndroid Build Coastguard Worker        ret
1460*c0909341SAndroid Build Coastguard Workerendfunc
1461*c0909341SAndroid Build Coastguard Worker
// Jump table for the horizontal filter function defined just above in this
// macro. Each entry is a 32-bit offset from the start of the table to one
// numbered entry label, listed from the 1280 label down to the 20 label.
// 320, 640 and 1280 label the same address (the shared 32xN+ path), so the
// first three entries hold equal values.
// NOTE(review): the code that indexes this table is outside this excerpt;
// presumably it is used the same way as prep_tbl - confirm.
1462*c0909341SAndroid Build Coastguard Workerjumptable \type\()_8tap_h_\isa\()_tbl
1463*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_8tap_h_\isa\()_tbl
1464*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_8tap_h_\isa\()_tbl
1465*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_8tap_h_\isa\()_tbl
1466*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_8tap_h_\isa\()_tbl
1467*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_8tap_h_\isa\()_tbl
1468*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_8tap_h_\isa\()_tbl
1469*c0909341SAndroid Build Coastguard Worker.ifc \type, put                 // the 20 entry is emitted only for the put expansion
1470*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_8tap_h_\isa\()_tbl
1471*c0909341SAndroid Build Coastguard Worker.endif
1472*c0909341SAndroid Build Coastguard Workerendjumptable
1473*c0909341SAndroid Build Coastguard Worker.endm
1474*c0909341SAndroid Build Coastguard Worker
1475*c0909341SAndroid Build Coastguard Worker
// prep_sve: unfiltered path. No filter taps are applied here; every 16-bit
// element loaded from the source is rescaled as
//     out = in * (19 - bdmax / 256) + PREP_BIAS_NEG
// with one predicated SVE MAD per vector, and the result is stored to the
// destination. The entry code picks a width-specific loop through prep_tbl.
//
// Register usage as read from the code below:
//   x0  destination pointer; rows are written back to back (no stride)
//   x1  source pointer
//   x2  source stride in bytes (x1 advances by x2 once per row)
//   w4  row counter; each loop repeats while it stays above zero
//   w7  bdmax (see the comment on the sub below)
//   x8  on entry: index into prep_tbl, scaled by 4 for the 32-bit entries
//   p0  governing predicate of the MADs; it is not set up in this function,
//       presumably all-true and provided by the code that branches here
//       - TODO confirm
// Clobbers: x6, x8, x9, v0-v7, v16-v23, v29, v30 and the condition flags.
// NOTE(review): the 4xN, 8xN and 16xN loops consume two rows per iteration,
// so they assume an even row count - confirm against the callers.
1476*c0909341SAndroid Build Coastguard Workerfunction prep_sve
1477*c0909341SAndroid Build Coastguard Worker        movrel          x9, prep_tbl                // x9 = address of prep_tbl
1478*c0909341SAndroid Build Coastguard Worker        mov             w6, #19
1479*c0909341SAndroid Build Coastguard Worker        ldrsw           x8, [x9, x8, lsl #2]        // x8 = signed 32-bit offset held in table entry x8
1480*c0909341SAndroid Build Coastguard Worker        sub             w6, w6, w7, lsr #8          // 19 - bdmax / 256
1481*c0909341SAndroid Build Coastguard Worker        add             x9, x9, x8                  // x9 = table base + offset = selected entry point
1482*c0909341SAndroid Build Coastguard Worker        movi            v30.8h, #PREP_BIAS_NEG      // additive term in every halfword lane
1483*c0909341SAndroid Build Coastguard Worker        dup             v29.8h, w6                  // 10b: 1 << 4, 12b: 1 << 2
1484*c0909341SAndroid Build Coastguard Worker        br              x9                          // enter the width-specific loop
1485*c0909341SAndroid Build Coastguard Worker
1486*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1487*c0909341SAndroid Build Coastguard Worker40:     // prep - 4xN
1488*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1489*c0909341SAndroid Build Coastguard Worker
1490*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1491*c0909341SAndroid Build Coastguard Worker4:
1492*c0909341SAndroid Build Coastguard Worker        ldr             d0, [x1]                    // row 0: 8 bytes = 4 halfwords
1493*c0909341SAndroid Build Coastguard Worker        ldr             d1, [x1, x2]                // row 1
1494*c0909341SAndroid Build Coastguard Worker        add             x1, x1, x2, lsl #1          // source += 2 rows
1495*c0909341SAndroid Build Coastguard Worker        subs            w4, w4, #2                  // two rows per iteration
1496*c0909341SAndroid Build Coastguard Worker        mad             z0.h, p0/m, z29.h, z30.h    // z0 = z0 * z29 + z30 per active halfword lane
1497*c0909341SAndroid Build Coastguard Worker        mad             z1.h, p0/m, z29.h, z30.h
1498*c0909341SAndroid Build Coastguard Worker        stp             d0, d1, [x0], #16           // only the low 64 bits of each result are stored
1499*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1500*c0909341SAndroid Build Coastguard Worker        ret
1501*c0909341SAndroid Build Coastguard Worker
1502*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1503*c0909341SAndroid Build Coastguard Worker80:     // prep - 8xN
1504*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1505*c0909341SAndroid Build Coastguard Worker
1506*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1507*c0909341SAndroid Build Coastguard Worker8:
1508*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1], x2           // row 0, then source += stride
1509*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x1], x2           // row 1, then source += stride
1510*c0909341SAndroid Build Coastguard Worker        subs            w4, w4, #2                  // two rows per iteration
1511*c0909341SAndroid Build Coastguard Worker        mad             z0.h, p0/m, z29.h, z30.h
1512*c0909341SAndroid Build Coastguard Worker        mad             z1.h, p0/m, z29.h, z30.h
1513*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [x0], #32           // 2 rows x 16 bytes
1514*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1515*c0909341SAndroid Build Coastguard Worker        ret
1516*c0909341SAndroid Build Coastguard Worker
1517*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1518*c0909341SAndroid Build Coastguard Worker160:    // prep - 16xN
1519*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1520*c0909341SAndroid Build Coastguard Worker
1521*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1522*c0909341SAndroid Build Coastguard Worker16:
1523*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1], x2    // row 0 (32 bytes), then source += stride
1524*c0909341SAndroid Build Coastguard Worker        mad             z0.h, p0/m, z29.h, z30.h
1525*c0909341SAndroid Build Coastguard Worker        mad             z1.h, p0/m, z29.h, z30.h
1526*c0909341SAndroid Build Coastguard Worker        subs            w4, w4, #2                  // two rows per iteration
1527*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x1], x2    // row 1 (32 bytes), then source += stride
1528*c0909341SAndroid Build Coastguard Worker        mad             z2.h, p0/m, z29.h, z30.h
1529*c0909341SAndroid Build Coastguard Worker        mad             z3.h, p0/m, z29.h, z30.h
1530*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [x0]
1531*c0909341SAndroid Build Coastguard Worker        stp             q2, q3, [x0, #32]
1532*c0909341SAndroid Build Coastguard Worker        add             x0, x0, #64                 // destination += 2 rows x 32 bytes
1533*c0909341SAndroid Build Coastguard Worker        b.gt            16b
1534*c0909341SAndroid Build Coastguard Worker        ret
1535*c0909341SAndroid Build Coastguard Worker
1536*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1537*c0909341SAndroid Build Coastguard Worker320:    // prep - 32xN
1538*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1539*c0909341SAndroid Build Coastguard Worker
1540*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1541*c0909341SAndroid Build Coastguard Worker32:
1542*c0909341SAndroid Build Coastguard Worker        ldp             q0, q1, [x1]                // one 64-byte row per iteration
1543*c0909341SAndroid Build Coastguard Worker        mad             z0.h, p0/m, z29.h, z30.h
1544*c0909341SAndroid Build Coastguard Worker        mad             z1.h, p0/m, z29.h, z30.h
1545*c0909341SAndroid Build Coastguard Worker        ldp             q2, q3, [x1, #32]
1546*c0909341SAndroid Build Coastguard Worker        subs            w4, w4, #1
1547*c0909341SAndroid Build Coastguard Worker        mad             z2.h, p0/m, z29.h, z30.h
1548*c0909341SAndroid Build Coastguard Worker        mad             z3.h, p0/m, z29.h, z30.h
1549*c0909341SAndroid Build Coastguard Worker        add             x1, x1, x2                  // source += stride
1550*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [x0]
1551*c0909341SAndroid Build Coastguard Worker        stp             q2, q3, [x0, #32]
1552*c0909341SAndroid Build Coastguard Worker        add             x0, x0, #64                 // destination += one 64-byte row
1553*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1554*c0909341SAndroid Build Coastguard Worker        ret
1555*c0909341SAndroid Build Coastguard Worker
1556*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1557*c0909341SAndroid Build Coastguard Worker640:    // prep - 64xN
1558*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1559*c0909341SAndroid Build Coastguard Worker
1560*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1561*c0909341SAndroid Build Coastguard Worker64:
1562*c0909341SAndroid Build Coastguard Worker        ldp             q0, q1, [x1]                // one 128-byte row per iteration
1563*c0909341SAndroid Build Coastguard Worker        mad             z0.h, p0/m, z29.h, z30.h
1564*c0909341SAndroid Build Coastguard Worker        mad             z1.h, p0/m, z29.h, z30.h
1565*c0909341SAndroid Build Coastguard Worker        ldp             q2, q3, [x1, #32]
1566*c0909341SAndroid Build Coastguard Worker        mad             z2.h, p0/m, z29.h, z30.h
1567*c0909341SAndroid Build Coastguard Worker        mad             z3.h, p0/m, z29.h, z30.h
1568*c0909341SAndroid Build Coastguard Worker        ldp             q4, q5, [x1, #64]
1569*c0909341SAndroid Build Coastguard Worker        mad             z4.h, p0/m, z29.h, z30.h
1570*c0909341SAndroid Build Coastguard Worker        mad             z5.h, p0/m, z29.h, z30.h
1571*c0909341SAndroid Build Coastguard Worker        ldp             q6, q7, [x1, #96]
1572*c0909341SAndroid Build Coastguard Worker        add             x1, x1, x2                  // source += stride (all loads of this row are issued)
1573*c0909341SAndroid Build Coastguard Worker        subs            w4, w4, #1
1574*c0909341SAndroid Build Coastguard Worker        mad             z6.h, p0/m, z29.h, z30.h
1575*c0909341SAndroid Build Coastguard Worker        mad             z7.h, p0/m, z29.h, z30.h
1576*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [x0]
1577*c0909341SAndroid Build Coastguard Worker        stp             q2, q3, [x0, #32]
1578*c0909341SAndroid Build Coastguard Worker        stp             q4, q5, [x0, #64]
1579*c0909341SAndroid Build Coastguard Worker        stp             q6, q7, [x0, #96]
1580*c0909341SAndroid Build Coastguard Worker        add             x0, x0, #128                // destination += one 128-byte row
1581*c0909341SAndroid Build Coastguard Worker        b.gt            64b
1582*c0909341SAndroid Build Coastguard Worker        ret
1583*c0909341SAndroid Build Coastguard Worker
1584*c0909341SAndroid Build Coastguard Worker        .align JUMP_ALIGN
1585*c0909341SAndroid Build Coastguard Worker1280:   // prep - 128xN
1586*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1587*c0909341SAndroid Build Coastguard Worker
1588*c0909341SAndroid Build Coastguard Worker        .align LOOP_ALIGN
1589*c0909341SAndroid Build Coastguard Worker128:
1590*c0909341SAndroid Build Coastguard Worker        ldp             q0, q1, [x1]                // one 256-byte row per iteration
1591*c0909341SAndroid Build Coastguard Worker        mad             z0.h, p0/m, z29.h, z30.h
1592*c0909341SAndroid Build Coastguard Worker        mad             z1.h, p0/m, z29.h, z30.h
1593*c0909341SAndroid Build Coastguard Worker        ldp             q2, q3, [x1, #32]
1594*c0909341SAndroid Build Coastguard Worker        mad             z2.h, p0/m, z29.h, z30.h
1595*c0909341SAndroid Build Coastguard Worker        mad             z3.h, p0/m, z29.h, z30.h
1596*c0909341SAndroid Build Coastguard Worker        ldp             q4, q5, [x1, #64]
1597*c0909341SAndroid Build Coastguard Worker        mad             z4.h, p0/m, z29.h, z30.h
1598*c0909341SAndroid Build Coastguard Worker        mad             z5.h, p0/m, z29.h, z30.h
1599*c0909341SAndroid Build Coastguard Worker        ldp             q6, q7, [x1, #96]
1600*c0909341SAndroid Build Coastguard Worker        mad             z6.h, p0/m, z29.h, z30.h
1601*c0909341SAndroid Build Coastguard Worker        mad             z7.h, p0/m, z29.h, z30.h
1602*c0909341SAndroid Build Coastguard Worker        ldp             q16, q17, [x1, #128]        // continues in v16-v23; v8-v15 are not touched
1603*c0909341SAndroid Build Coastguard Worker        mad             z16.h, p0/m, z29.h, z30.h
1604*c0909341SAndroid Build Coastguard Worker        mad             z17.h, p0/m, z29.h, z30.h
1605*c0909341SAndroid Build Coastguard Worker        ldp             q18, q19, [x1, #160]
1606*c0909341SAndroid Build Coastguard Worker        mad             z18.h, p0/m, z29.h, z30.h
1607*c0909341SAndroid Build Coastguard Worker        mad             z19.h, p0/m, z29.h, z30.h
1608*c0909341SAndroid Build Coastguard Worker        ldp             q20, q21, [x1, #192]
1609*c0909341SAndroid Build Coastguard Worker        mad             z20.h, p0/m, z29.h, z30.h
1610*c0909341SAndroid Build Coastguard Worker        mad             z21.h, p0/m, z29.h, z30.h
1611*c0909341SAndroid Build Coastguard Worker        ldp             q22, q23, [x1, #224]
1612*c0909341SAndroid Build Coastguard Worker        add             x1, x1, x2                  // source += stride (all loads of this row are issued)
1613*c0909341SAndroid Build Coastguard Worker        mad             z22.h, p0/m, z29.h, z30.h
1614*c0909341SAndroid Build Coastguard Worker        mad             z23.h, p0/m, z29.h, z30.h
1615*c0909341SAndroid Build Coastguard Worker        subs            w4, w4, #1
1616*c0909341SAndroid Build Coastguard Worker        stp             q0, q1, [x0]
1617*c0909341SAndroid Build Coastguard Worker        stp             q2, q3, [x0, #32]
1618*c0909341SAndroid Build Coastguard Worker        stp             q4, q5, [x0, #64]
1619*c0909341SAndroid Build Coastguard Worker        stp             q6, q7, [x0, #96]
1620*c0909341SAndroid Build Coastguard Worker        stp             q16, q17, [x0, #128]
1621*c0909341SAndroid Build Coastguard Worker        stp             q18, q19, [x0, #160]
1622*c0909341SAndroid Build Coastguard Worker        stp             q20, q21, [x0, #192]
1623*c0909341SAndroid Build Coastguard Worker        stp             q22, q23, [x0, #224]
1624*c0909341SAndroid Build Coastguard Worker        add             x0, x0, #256                // destination += one 256-byte row
1625*c0909341SAndroid Build Coastguard Worker        b.gt            128b
1626*c0909341SAndroid Build Coastguard Worker        ret
1627*c0909341SAndroid Build Coastguard Workerendfunc
1628*c0909341SAndroid Build Coastguard Worker
// Jump table for the prep code above: each entry is a 32-bit value holding
// the offset from prep_tbl to a numeric local label. The `b` suffix makes
// every reference resolve to the nearest preceding definition of that label.
// Entries are listed from the largest label number (1280) to the smallest (40).
// NOTE(review): presumably one entry per block width, 128 down to 4, selected
// by dispatch code outside this view -- confirm index order against the caller.
1629*c0909341SAndroid Build Coastguard Workerjumptable prep_tbl
1630*c0909341SAndroid Build Coastguard Worker        .word 1280b - prep_tbl
1631*c0909341SAndroid Build Coastguard Worker        .word 640b  - prep_tbl
1632*c0909341SAndroid Build Coastguard Worker        .word 320b  - prep_tbl
1633*c0909341SAndroid Build Coastguard Worker        .word 160b  - prep_tbl
1634*c0909341SAndroid Build Coastguard Worker        .word 80b   - prep_tbl
1635*c0909341SAndroid Build Coastguard Worker        .word 40b   - prep_tbl
1636*c0909341SAndroid Build Coastguard Workerendjumptable
1637*c0909341SAndroid Build Coastguard Worker
1638*c0909341SAndroid Build Coastguard Worker
// Invoke the filter_8tap_fn macro (defined outside this view) for the `prep`
// variant with the `sve2` tag. The two comment lines below label each register
// operand in the order it is passed to the macro.
// NOTE(review): x5/x6 are each passed twice (xmx/ldst and xmy/lsrc) --
// presumably deliberate register reuse; confirm in the macro body.
// NOTE(review): d_strd is x9, which is not an AAPCS64 argument register, so it
// is presumably set up inside the macro rather than by the caller -- confirm.
1639*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x9), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6), bdmax(w7)
1640*c0909341SAndroid Build Coastguard Worker// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w9), ws_strd(w2)
1641*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn prep, sve2, x0, x9, x1, x2, w3, w4, w5, w6, w7, x5, x6, x5, x6, w9, w2
1642*c0909341SAndroid Build Coastguard Worker
// Invoke the filter_8tap_fn macro (defined outside this view) for the `put`
// variant with the `sve2` tag. The two comment lines below label each register
// operand in the order it is passed to the macro.
// NOTE(review): x6/x7 are each passed twice (xmx/ldst and xmy/lsrc) --
// presumably deliberate register reuse; confirm in the macro body.
// NOTE(review): bdmax is w8, which is not an AAPCS64 argument register, so it
// is presumably loaded inside the macro (e.g. from the stack) -- confirm.
1643*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7), bdmax(w8)
1644*c0909341SAndroid Build Coastguard Worker// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3)
1645*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn  put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3
1646*c0909341SAndroid Build Coastguard Worker
// End of the SVE2 code in this file.
// NOTE(review): DISABLE_SVE2 / DISABLE_SVE are macros defined outside this
// view; presumably they switch the assembler's SVE2 and SVE extensions back
// off, undoing a matching enable earlier in the file -- confirm.
1647*c0909341SAndroid Build Coastguard WorkerDISABLE_SVE2
1648*c0909341SAndroid Build Coastguard WorkerDISABLE_SVE
1649*c0909341SAndroid Build Coastguard Worker#endif  // HAVE_SVE2
1650