1*c0909341SAndroid Build Coastguard Worker/* 2*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, VideoLAN and dav1d authors 3*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, Janne Grunau 4*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, Martin Storsjo 5*c0909341SAndroid Build Coastguard Worker * Copyright © 2024, Arm Limited 6*c0909341SAndroid Build Coastguard Worker * All rights reserved. 7*c0909341SAndroid Build Coastguard Worker * 8*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without 9*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met: 10*c0909341SAndroid Build Coastguard Worker * 11*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this 12*c0909341SAndroid Build Coastguard Worker * list of conditions and the following disclaimer. 13*c0909341SAndroid Build Coastguard Worker * 14*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice, 15*c0909341SAndroid Build Coastguard Worker * this list of conditions and the following disclaimer in the documentation 16*c0909341SAndroid Build Coastguard Worker * and/or other materials provided with the distribution. 17*c0909341SAndroid Build Coastguard Worker * 18*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28*c0909341SAndroid Build Coastguard Worker */ 29*c0909341SAndroid Build Coastguard Worker 30*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S" 31*c0909341SAndroid Build Coastguard Worker#include "util.S" 32*c0909341SAndroid Build Coastguard Worker 33*c0909341SAndroid Build Coastguard Worker 34*c0909341SAndroid Build Coastguard Worker#if HAVE_DOTPROD 35*c0909341SAndroid Build Coastguard WorkerENABLE_DOTPROD 36*c0909341SAndroid Build Coastguard Worker 37*c0909341SAndroid Build Coastguard Worker// No spaces in these expressions, due to gas-preprocessor. It is translated by 38*c0909341SAndroid Build Coastguard Worker// -1 to save the negative offset at getting the address of `mc_subpel_filters`. 
// Packed filter-type selectors. Each constant holds two 7-bit fields:
//   bits [13:7] = (0|1|2)*15-1   and   bits [6:0] = (3|4)*15-1.
// The vertical path of filter_8tap_fn extracts both fields (ubfx / and) and
// keeps the low field when h <= 4, the high field otherwise; the selected
// value, after the fractional position has been added in, is scaled by 8 and
// added to the address of `mc_subpel_filters`.
// NOTE(review): presumably 15 filter rows per type with rows 3 and 4 being the
// short-block variants - confirm against the `mc_subpel_filters` definition.
#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
#define SHARP1          (((2*15-1)<<7)|(3*15-1))

// Alignment values used for `function ..., align=`, jump targets and loop
// heads (`.align JUMP_ALIGN` / `.align LOOP_ALIGN`) throughout this file.
#define FUNC_ALIGN      2
#define JUMP_ALIGN      2
#define LOOP_ALIGN      2


const h_tbl_neon_dotprod, align=4
        // Shuffle indices to permute horizontal samples in preparation for
        // input to SDOT instructions. The 8-tap horizontal convolution uses
        // sample indices in the interval of [-3, 4] relative to the current
        // sample position.
        // Byte offset 0: three 16-byte rows.
        .byte  0,  1,  2,  3,   1,  2,  3,  4,   2,  3,  4,  5,   3,  4,  5,  6
        .byte  4,  5,  6,  7,   5,  6,  7,  8,   6,  7,  8,  9,   7,  8,  9, 10
        .byte  8,  9, 10, 11,   9, 10, 11, 12,  10, 11, 12, 13,  11, 12, 13, 14

        // Shuffle indices to permute horizontal samples in preparation for
        // input to USMMLA instructions.
        // Byte offset 48 (= 3 rows * 16 bytes): two 16-byte rows.
#define OFFSET_USMMLA 48
        .byte  0,  1,  2,  3,   4,  5,  6,  7,   2,  3,  4,  5,   6,  7,  8,  9
        .byte  4,  5,  6,  7,   8,  9, 10, 11,   6,  7,  8,  9,  10, 11, 12, 13

        // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
        // Byte offset 80 (= 48 + 2 rows * 16 bytes): one 16-byte row.
#define OFFSET_CVT_32_8 80
        .byte  1,  2,  5,  6,   9, 10, 13, 14,  17, 18, 21, 22,  25, 26, 29, 30
endconst

const v_tbl_neon_dotprod, align=4
        // Vertical convolutions are also using SDOT instructions, where a
        // 128-bit register contains a transposed 4x4 matrix of values.
        // Subsequent iterations of the vertical convolution can reuse the
        // 3x4 sub-matrix from the previous loop iteration. These shuffle
        // indices shift and merge this 4x4 matrix with the values of a new
        // line.
        // Indices 0-15 address the first TBL source register, 16-31 the
        // second one. Five 16-byte rows at byte offsets 0, 16, 32, 48 and 64.
        .byte  1,  2,  3, 16,   5,  6,  7, 20,   9, 10, 11, 24,  13, 14, 15, 28
        .byte  1,  2,  3, 16,   5,  6,  7, 17,   9, 10, 11, 18,  13, 14, 15, 19
        .byte  1,  2,  3, 20,   5,  6,  7, 21,   9, 10, 11, 22,  13, 14, 15, 23
        .byte  1,  2,  3, 24,   5,  6,  7, 25,   9, 10, 11, 26,  13, 14, 15, 27
        .byte  1,  2,  3, 28,   5,  6,  7, 29,   9, 10, 11, 30,  13, 14, 15, 31
endconst


// make_8tap_fn: emit one exported entry point named
// <op>_8tap_<type>_8bpc_<isa>.
//   op      - operation prefix of the generated symbol and of the branch target
//   type    - filter-combination name used in the symbol (e.g. sharp_smooth)
//   type_h  - packed selector loaded into x9
//   type_v  - packed selector loaded into x10
//   isa     - ISA suffix of the generated symbol and of the branch target
//   jump    - when non-zero (default) the stub branches to the shared
//             <op>_8tap_<isa> body; when zero no branch is emitted, so the
//             stub has to be placed directly in front of that body and falls
//             through into it.
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
        mov             x9,  \type_h
        mov             x10, \type_v
    .if \jump
        b               \op\()_8tap_\isa
    .endif
endfunc
.endm

.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
Workermake_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa 100*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa 101*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa 102*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0 103*c0909341SAndroid Build Coastguard Worker 104*c0909341SAndroid Build Coastguard Workerfunction \type\()_8tap_\isa, align=FUNC_ALIGN 105*c0909341SAndroid Build Coastguard Worker clz w8, \w 106*c0909341SAndroid Build Coastguard Worker mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) 107*c0909341SAndroid Build Coastguard Worker sub w8, w8, #24 // for jump tables 108*c0909341SAndroid Build Coastguard Worker movrel x12, X(mc_subpel_filters) 109*c0909341SAndroid Build Coastguard Worker cbnz \mx, L(\type\()_8tap_h_hv_\isa) 110*c0909341SAndroid Build Coastguard Worker cbnz \my, L(\type\()_8tap_v_\isa) 111*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 112*c0909341SAndroid Build Coastguard Worker add \wd_strd, \w, \w // prep_neon needs w * 2 as stride 113*c0909341SAndroid Build Coastguard Worker.endif 114*c0909341SAndroid Build Coastguard Worker b X(\type\()_neon) 115*c0909341SAndroid Build Coastguard Worker 116*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 117*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_v_\isa): 118*c0909341SAndroid Build Coastguard Worker madd \my, \my, w11, w10 119*c0909341SAndroid Build Coastguard Worker movrel x13, v_tbl_neon_dotprod 120*c0909341SAndroid Build Coastguard Worker sub \src, \src, \s_strd 121*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 122*c0909341SAndroid Build Coastguard Worker .ifc \type, prep 123*c0909341SAndroid Build Coastguard Worker mov w8, #0x2002 // FILTER_WEIGHT * 128 + rounding 124*c0909341SAndroid Build Coastguard Worker dup v4.4s, w8 125*c0909341SAndroid Build Coastguard Worker .else 
126*c0909341SAndroid Build Coastguard Worker movi v4.4s, #32, lsl #8 // FILTER_WEIGHT * 128, bias for SDOT 127*c0909341SAndroid Build Coastguard Worker .endif 128*c0909341SAndroid Build Coastguard Worker.endif 129*c0909341SAndroid Build Coastguard Worker ubfx w11, \my, #7, #7 130*c0909341SAndroid Build Coastguard Worker and \my, \my, #0x7F 131*c0909341SAndroid Build Coastguard Worker ldp q6, q28, [x13] 132*c0909341SAndroid Build Coastguard Worker cmp \h, #4 133*c0909341SAndroid Build Coastguard Worker csel \my, \my, w11, le 134*c0909341SAndroid Build Coastguard Worker sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 135*c0909341SAndroid Build Coastguard Worker add \xmy, x12, \xmy, lsl #3 // subpel V filter address 136*c0909341SAndroid Build Coastguard Worker ldr q29, [x13, #32] 137*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 138*c0909341SAndroid Build Coastguard Worker movi v5.16b, #128 139*c0909341SAndroid Build Coastguard Worker.endif 140*c0909341SAndroid Build Coastguard Worker ldr d7, [\xmy] 141*c0909341SAndroid Build Coastguard Worker cmp \w, #8 142*c0909341SAndroid Build Coastguard Worker b.eq 80f 143*c0909341SAndroid Build Coastguard Worker b.lt 40f 144*c0909341SAndroid Build Coastguard Worker 145*c0909341SAndroid Build Coastguard Worker // .align JUMP_ALIGN // fallthrough 146*c0909341SAndroid Build Coastguard Worker160: // V - 16xN+ 147*c0909341SAndroid Build Coastguard Worker ldp q30, q31, [x13, #48] 148*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 149*c0909341SAndroid Build Coastguard Worker add \wd_strd, \w, \w 150*c0909341SAndroid Build Coastguard Worker.endif 151*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 152*c0909341SAndroid Build Coastguard Worker161: 153*c0909341SAndroid Build Coastguard Worker mov \lsrc, \src 154*c0909341SAndroid Build Coastguard Worker mov \ldst, \dst 155*c0909341SAndroid Build Coastguard Worker sub w8, \h, #1 156*c0909341SAndroid Build Coastguard Worker 157*c0909341SAndroid Build 
Coastguard Worker ldr q16, [\lsrc] 158*c0909341SAndroid Build Coastguard Worker ldr q17, [\lsrc, \s_strd] 159*c0909341SAndroid Build Coastguard Worker add \lsrc, \lsrc, \s_strd, lsl #1 160*c0909341SAndroid Build Coastguard Worker ldr q18, [\lsrc] 161*c0909341SAndroid Build Coastguard Worker ldr q19, [\lsrc, \s_strd] 162*c0909341SAndroid Build Coastguard Worker add \lsrc, \lsrc, \s_strd, lsl #1 163*c0909341SAndroid Build Coastguard Worker 164*c0909341SAndroid Build Coastguard Worker zip1 v0.16b, v16.16b, v17.16b 165*c0909341SAndroid Build Coastguard Worker zip2 v1.16b, v16.16b, v17.16b 166*c0909341SAndroid Build Coastguard Worker zip1 v2.16b, v18.16b, v19.16b 167*c0909341SAndroid Build Coastguard Worker zip2 v3.16b, v18.16b, v19.16b 168*c0909341SAndroid Build Coastguard Worker 169*c0909341SAndroid Build Coastguard Worker ldr q20, [\lsrc] 170*c0909341SAndroid Build Coastguard Worker ldr q21, [\lsrc, \s_strd] 171*c0909341SAndroid Build Coastguard Worker add \lsrc, \lsrc, \s_strd, lsl #1 172*c0909341SAndroid Build Coastguard Worker ldr q22, [\lsrc] 173*c0909341SAndroid Build Coastguard Worker ldr q23, [\lsrc, \s_strd] 174*c0909341SAndroid Build Coastguard Worker add \lsrc, \lsrc, \s_strd, lsl #1 175*c0909341SAndroid Build Coastguard Worker 176*c0909341SAndroid Build Coastguard Worker zip1 v18.16b, v20.16b, v21.16b 177*c0909341SAndroid Build Coastguard Worker zip2 v21.16b, v20.16b, v21.16b 178*c0909341SAndroid Build Coastguard Worker zip1 v24.16b, v22.16b, v23.16b 179*c0909341SAndroid Build Coastguard Worker zip2 v27.16b, v22.16b, v23.16b 180*c0909341SAndroid Build Coastguard Worker 181*c0909341SAndroid Build Coastguard Worker zip1 v16.8h, v0.8h, v2.8h 182*c0909341SAndroid Build Coastguard Worker zip2 v19.8h, v0.8h, v2.8h 183*c0909341SAndroid Build Coastguard Worker zip1 v22.8h, v1.8h, v3.8h 184*c0909341SAndroid Build Coastguard Worker zip2 v25.8h, v1.8h, v3.8h 185*c0909341SAndroid Build Coastguard Worker 186*c0909341SAndroid Build Coastguard Worker zip1 v17.8h, v18.8h, 
v24.8h 187*c0909341SAndroid Build Coastguard Worker zip2 v20.8h, v18.8h, v24.8h 188*c0909341SAndroid Build Coastguard Worker zip1 v23.8h, v21.8h, v27.8h 189*c0909341SAndroid Build Coastguard Worker zip2 v26.8h, v21.8h, v27.8h 190*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 191*c0909341SAndroid Build Coastguard Worker sub v16.16b, v16.16b, v5.16b 192*c0909341SAndroid Build Coastguard Worker sub v19.16b, v19.16b, v5.16b 193*c0909341SAndroid Build Coastguard Worker sub v22.16b, v22.16b, v5.16b 194*c0909341SAndroid Build Coastguard Worker sub v25.16b, v25.16b, v5.16b 195*c0909341SAndroid Build Coastguard Worker 196*c0909341SAndroid Build Coastguard Worker sub v17.16b, v17.16b, v5.16b 197*c0909341SAndroid Build Coastguard Worker sub v20.16b, v20.16b, v5.16b 198*c0909341SAndroid Build Coastguard Worker sub v23.16b, v23.16b, v5.16b 199*c0909341SAndroid Build Coastguard Worker sub v26.16b, v26.16b, v5.16b 200*c0909341SAndroid Build Coastguard Worker.endif 201*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 202*c0909341SAndroid Build Coastguard Worker16: 203*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 204*c0909341SAndroid Build Coastguard Worker ld1 {v18.16b}, [\lsrc], \s_strd 205*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 206*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 207*c0909341SAndroid Build Coastguard Worker movi v2.4s, #0 208*c0909341SAndroid Build Coastguard Worker movi v3.4s, #0 209*c0909341SAndroid Build Coastguard Worker mov v21.16b, v18.16b 210*c0909341SAndroid Build Coastguard Worker mov v24.16b, v18.16b 211*c0909341SAndroid Build Coastguard Worker mov v27.16b, v18.16b 212*c0909341SAndroid Build Coastguard Worker.else // neon_dotprod 213*c0909341SAndroid Build Coastguard Worker ld1 {v27.16b}, [\lsrc], \s_strd 214*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 215*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 216*c0909341SAndroid Build Coastguard Worker mov 
v2.16b, v4.16b 217*c0909341SAndroid Build Coastguard Worker mov v3.16b, v4.16b 218*c0909341SAndroid Build Coastguard Worker sub v18.16b, v27.16b, v5.16b 219*c0909341SAndroid Build Coastguard Worker sub v21.16b, v27.16b, v5.16b 220*c0909341SAndroid Build Coastguard Worker sub v24.16b, v27.16b, v5.16b 221*c0909341SAndroid Build Coastguard Worker sub v27.16b, v27.16b, v5.16b 222*c0909341SAndroid Build Coastguard Worker.endif 223*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 224*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 225*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v22.16b, v7.4b[0] 226*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v25.16b, v7.4b[0] 227*c0909341SAndroid Build Coastguard Worker 228*c0909341SAndroid Build Coastguard Worker tbl v16.16b, {v16.16b, v17.16b}, v6.16b 229*c0909341SAndroid Build Coastguard Worker tbl v19.16b, {v19.16b, v20.16b}, v6.16b 230*c0909341SAndroid Build Coastguard Worker tbl v22.16b, {v22.16b, v23.16b}, v6.16b 231*c0909341SAndroid Build Coastguard Worker tbl v25.16b, {v25.16b, v26.16b}, v6.16b 232*c0909341SAndroid Build Coastguard Worker 233*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 234*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 235*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v23.16b, v7.4b[1] 236*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v26.16b, v7.4b[1] 237*c0909341SAndroid Build Coastguard Worker 238*c0909341SAndroid Build Coastguard Worker tbl v17.16b, {v17.16b, v18.16b}, v28.16b 239*c0909341SAndroid Build Coastguard Worker tbl v20.16b, {v20.16b, v21.16b}, v29.16b 240*c0909341SAndroid Build Coastguard Worker tbl v23.16b, {v23.16b, v24.16b}, v30.16b 241*c0909341SAndroid Build Coastguard Worker tbl v26.16b, {v26.16b, v27.16b}, v31.16b 242*c0909341SAndroid Build Coastguard Worker 243*c0909341SAndroid Build Coastguard Worker subs w8, w8, #1 244*c0909341SAndroid Build Coastguard Worker 
uzp1 v0.8h, v0.8h, v1.8h 245*c0909341SAndroid Build Coastguard Worker uzp1 v2.8h, v2.8h, v3.8h 246*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 247*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 248*c0909341SAndroid Build Coastguard Worker srshr v0.8h, v0.8h, #2 249*c0909341SAndroid Build Coastguard Worker srshr v1.8h, v2.8h, #2 250*c0909341SAndroid Build Coastguard Worker .else 251*c0909341SAndroid Build Coastguard Worker sshr v0.8h, v0.8h, #2 252*c0909341SAndroid Build Coastguard Worker sshr v1.8h, v2.8h, #2 253*c0909341SAndroid Build Coastguard Worker .endif 254*c0909341SAndroid Build Coastguard Worker st1 {v0.8h, v1.8h}, [\ldst], \d_strd 255*c0909341SAndroid Build Coastguard Worker.else // put 256*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 257*c0909341SAndroid Build Coastguard Worker sqrshrun2 v0.16b, v2.8h, #6 258*c0909341SAndroid Build Coastguard Worker st1 {v0.16b}, [\ldst], \d_strd 259*c0909341SAndroid Build Coastguard Worker.endif 260*c0909341SAndroid Build Coastguard Worker b.gt 16b 261*c0909341SAndroid Build Coastguard Worker 262*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 263*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 264*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 265*c0909341SAndroid Build Coastguard Worker movi v2.4s, #0 266*c0909341SAndroid Build Coastguard Worker movi v3.4s, #0 267*c0909341SAndroid Build Coastguard Worker.else // neon_dotprod 268*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 269*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 270*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 271*c0909341SAndroid Build Coastguard Worker mov v3.16b, v4.16b 272*c0909341SAndroid Build Coastguard Worker.endif 273*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 274*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 275*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v22.16b, 
v7.4b[0] 276*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v25.16b, v7.4b[0] 277*c0909341SAndroid Build Coastguard Worker 278*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 279*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 280*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v23.16b, v7.4b[1] 281*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v26.16b, v7.4b[1] 282*c0909341SAndroid Build Coastguard Worker 283*c0909341SAndroid Build Coastguard Worker subs \w, \w, #16 284*c0909341SAndroid Build Coastguard Worker uzp1 v0.8h, v0.8h, v1.8h 285*c0909341SAndroid Build Coastguard Worker uzp1 v2.8h, v2.8h, v3.8h 286*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 287*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 288*c0909341SAndroid Build Coastguard Worker srshr v0.8h, v0.8h, #2 289*c0909341SAndroid Build Coastguard Worker srshr v1.8h, v2.8h, #2 290*c0909341SAndroid Build Coastguard Worker .else 291*c0909341SAndroid Build Coastguard Worker sshr v0.8h, v0.8h, #2 292*c0909341SAndroid Build Coastguard Worker sshr v1.8h, v2.8h, #2 293*c0909341SAndroid Build Coastguard Worker .endif 294*c0909341SAndroid Build Coastguard Worker stp q0, q1, [\ldst] 295*c0909341SAndroid Build Coastguard Worker add \dst, \dst, #32 296*c0909341SAndroid Build Coastguard Worker.else // put 297*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 298*c0909341SAndroid Build Coastguard Worker sqrshrun2 v0.16b, v2.8h, #6 299*c0909341SAndroid Build Coastguard Worker str q0, [\ldst] 300*c0909341SAndroid Build Coastguard Worker add \dst, \dst, #16 301*c0909341SAndroid Build Coastguard Worker.endif 302*c0909341SAndroid Build Coastguard Worker add \src, \src, #16 303*c0909341SAndroid Build Coastguard Worker b.gt 161b 304*c0909341SAndroid Build Coastguard Worker ret 305*c0909341SAndroid Build Coastguard Worker 306*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 307*c0909341SAndroid Build Coastguard 
Worker80: // V - 8xN 308*c0909341SAndroid Build Coastguard Worker ldr d16, [\src] 309*c0909341SAndroid Build Coastguard Worker ldr d17, [\src, \s_strd] 310*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 311*c0909341SAndroid Build Coastguard Worker ldr d18, [\src] 312*c0909341SAndroid Build Coastguard Worker ldr d19, [\src, \s_strd] 313*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 314*c0909341SAndroid Build Coastguard Worker 315*c0909341SAndroid Build Coastguard Worker ldr d20, [\src] 316*c0909341SAndroid Build Coastguard Worker ldr d21, [\src, \s_strd] 317*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 318*c0909341SAndroid Build Coastguard Worker ldr d22, [\src] 319*c0909341SAndroid Build Coastguard Worker ldr d23, [\src, \s_strd] 320*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 321*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 // for prep: sub is enough 322*c0909341SAndroid Build Coastguard Worker 323*c0909341SAndroid Build Coastguard Worker zip1 v0.16b, v16.16b, v17.16b 324*c0909341SAndroid Build Coastguard Worker zip1 v2.16b, v18.16b, v19.16b 325*c0909341SAndroid Build Coastguard Worker zip1 v18.16b, v20.16b, v21.16b 326*c0909341SAndroid Build Coastguard Worker zip1 v24.16b, v22.16b, v23.16b 327*c0909341SAndroid Build Coastguard Worker 328*c0909341SAndroid Build Coastguard Worker zip1 v16.8h, v0.8h, v2.8h 329*c0909341SAndroid Build Coastguard Worker zip2 v19.8h, v0.8h, v2.8h 330*c0909341SAndroid Build Coastguard Worker zip1 v17.8h, v18.8h, v24.8h 331*c0909341SAndroid Build Coastguard Worker zip2 v20.8h, v18.8h, v24.8h 332*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 333*c0909341SAndroid Build Coastguard Worker sub v16.16b, v16.16b, v5.16b 334*c0909341SAndroid Build Coastguard Worker sub v19.16b, v19.16b, v5.16b 335*c0909341SAndroid Build Coastguard Worker sub v17.16b, v17.16b, v5.16b 336*c0909341SAndroid Build Coastguard 
Worker sub v20.16b, v20.16b, v5.16b 337*c0909341SAndroid Build Coastguard Worker.endif 338*c0909341SAndroid Build Coastguard Worker.ifc \type, put 339*c0909341SAndroid Build Coastguard Worker b.eq 82f 340*c0909341SAndroid Build Coastguard Worker.endif 341*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 342*c0909341SAndroid Build Coastguard Worker8: 343*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 344*c0909341SAndroid Build Coastguard Worker ldr d18, [\src] 345*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 346*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 347*c0909341SAndroid Build Coastguard Worker ldr d24, [\src, \s_strd] 348*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 349*c0909341SAndroid Build Coastguard Worker movi v2.4s, #0 350*c0909341SAndroid Build Coastguard Worker movi v3.4s, #0 351*c0909341SAndroid Build Coastguard Worker mov v21.8b, v18.8b 352*c0909341SAndroid Build Coastguard Worker mov v27.8b, v24.8b 353*c0909341SAndroid Build Coastguard Worker.else // neon_dotprod 354*c0909341SAndroid Build Coastguard Worker ldr d21, [\src] 355*c0909341SAndroid Build Coastguard Worker ldr d27, [\src, \s_strd] 356*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 357*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 358*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 359*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 360*c0909341SAndroid Build Coastguard Worker mov v3.16b, v4.16b 361*c0909341SAndroid Build Coastguard Worker sub v18.16b, v21.16b, v5.16b 362*c0909341SAndroid Build Coastguard Worker sub v21.16b, v21.16b, v5.16b 363*c0909341SAndroid Build Coastguard Worker sub v24.16b, v27.16b, v5.16b 364*c0909341SAndroid Build Coastguard Worker sub v27.16b, v27.16b, v5.16b 365*c0909341SAndroid Build Coastguard Worker.endif 366*c0909341SAndroid Build Coastguard Worker tbl v22.16b, {v16.16b, v17.16b}, v6.16b 367*c0909341SAndroid Build 
Coastguard Worker tbl v25.16b, {v19.16b, v20.16b}, v6.16b 368*c0909341SAndroid Build Coastguard Worker tbl v23.16b, {v17.16b, v18.16b}, v28.16b 369*c0909341SAndroid Build Coastguard Worker tbl v26.16b, {v20.16b, v21.16b}, v29.16b 370*c0909341SAndroid Build Coastguard Worker 371*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 372*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 373*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 374*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 375*c0909341SAndroid Build Coastguard Worker 376*c0909341SAndroid Build Coastguard Worker tbl v16.16b, {v22.16b, v23.16b}, v6.16b 377*c0909341SAndroid Build Coastguard Worker tbl v19.16b, {v25.16b, v26.16b}, v6.16b 378*c0909341SAndroid Build Coastguard Worker tbl v17.16b, {v23.16b, v24.16b}, v28.16b 379*c0909341SAndroid Build Coastguard Worker tbl v20.16b, {v26.16b, v27.16b}, v29.16b 380*c0909341SAndroid Build Coastguard Worker 381*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v22.16b, v7.4b[0] 382*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v23.16b, v7.4b[1] 383*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v25.16b, v7.4b[0] 384*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v26.16b, v7.4b[1] 385*c0909341SAndroid Build Coastguard Worker 386*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 387*c0909341SAndroid Build Coastguard Worker uzp1 v0.8h, v0.8h, v1.8h 388*c0909341SAndroid Build Coastguard Worker uzp1 v2.8h, v2.8h, v3.8h 389*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 390*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 391*c0909341SAndroid Build Coastguard Worker srshr v0.8h, v0.8h, #2 392*c0909341SAndroid Build Coastguard Worker srshr v1.8h, v2.8h, #2 393*c0909341SAndroid Build Coastguard Worker .else 394*c0909341SAndroid Build Coastguard Worker sshr v0.8h, v0.8h, #2 395*c0909341SAndroid Build Coastguard Worker sshr v1.8h, v2.8h, 
#2 396*c0909341SAndroid Build Coastguard Worker .endif 397*c0909341SAndroid Build Coastguard Worker stp q0, q1, [\dst], #32 398*c0909341SAndroid Build Coastguard Worker.else // put 399*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 400*c0909341SAndroid Build Coastguard Worker sqrshrun v1.8b, v2.8h, #6 401*c0909341SAndroid Build Coastguard Worker str d0, [\dst] 402*c0909341SAndroid Build Coastguard Worker str d1, [\dst, \d_strd] 403*c0909341SAndroid Build Coastguard Worker add \dst, \dst, \d_strd, lsl #1 404*c0909341SAndroid Build Coastguard Worker.endif 405*c0909341SAndroid Build Coastguard Worker b.gt 8b 406*c0909341SAndroid Build Coastguard Worker 407*c0909341SAndroid Build Coastguard Worker.ifc \type, put 408*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 409*c0909341SAndroid Build Coastguard Worker82: 410*c0909341SAndroid Build Coastguard Worker.endif 411*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 412*c0909341SAndroid Build Coastguard Worker ldr d18, [\src] 413*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 414*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 415*c0909341SAndroid Build Coastguard Worker movi v2.4s, #0 416*c0909341SAndroid Build Coastguard Worker movi v3.4s, #0 417*c0909341SAndroid Build Coastguard Worker mov v21.8b, v18.8b 418*c0909341SAndroid Build Coastguard Worker.else // neon_dotprod 419*c0909341SAndroid Build Coastguard Worker ldr d21, [\src] 420*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 421*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 422*c0909341SAndroid Build Coastguard Worker mov v2.16b, v4.16b 423*c0909341SAndroid Build Coastguard Worker mov v3.16b, v4.16b 424*c0909341SAndroid Build Coastguard Worker sub v18.16b, v21.16b, v5.16b 425*c0909341SAndroid Build Coastguard Worker sub v21.16b, v21.16b, v5.16b 426*c0909341SAndroid Build Coastguard Worker.endif 427*c0909341SAndroid Build Coastguard Worker tbl v22.16b, {v16.16b, v17.16b}, v6.16b 
428*c0909341SAndroid Build Coastguard Worker tbl v25.16b, {v19.16b, v20.16b}, v6.16b 429*c0909341SAndroid Build Coastguard Worker tbl v23.16b, {v17.16b, v18.16b}, v28.16b 430*c0909341SAndroid Build Coastguard Worker tbl v26.16b, {v20.16b, v21.16b}, v29.16b 431*c0909341SAndroid Build Coastguard Worker 432*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 433*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 434*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 435*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 436*c0909341SAndroid Build Coastguard Worker 437*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v22.16b, v7.4b[0] 438*c0909341SAndroid Build Coastguard Worker \dot v2.4s, v23.16b, v7.4b[1] 439*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v25.16b, v7.4b[0] 440*c0909341SAndroid Build Coastguard Worker \dot v3.4s, v26.16b, v7.4b[1] 441*c0909341SAndroid Build Coastguard Worker 442*c0909341SAndroid Build Coastguard Worker uzp1 v0.8h, v0.8h, v1.8h 443*c0909341SAndroid Build Coastguard Worker uzp1 v2.8h, v2.8h, v3.8h 444*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 445*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 446*c0909341SAndroid Build Coastguard Worker srshr v0.8h, v0.8h, #2 447*c0909341SAndroid Build Coastguard Worker srshr v1.8h, v2.8h, #2 448*c0909341SAndroid Build Coastguard Worker .else 449*c0909341SAndroid Build Coastguard Worker sshr v0.8h, v0.8h, #2 450*c0909341SAndroid Build Coastguard Worker sshr v1.8h, v2.8h, #2 451*c0909341SAndroid Build Coastguard Worker .endif 452*c0909341SAndroid Build Coastguard Worker stp q0, q1, [\dst] 453*c0909341SAndroid Build Coastguard Worker.else // put 454*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 455*c0909341SAndroid Build Coastguard Worker sqrshrun v1.8b, v2.8h, #6 456*c0909341SAndroid Build Coastguard Worker str d0, [\dst] 457*c0909341SAndroid Build Coastguard Worker 
str d1, [\dst, \d_strd] 458*c0909341SAndroid Build Coastguard Worker.endif 459*c0909341SAndroid Build Coastguard Worker ret 460*c0909341SAndroid Build Coastguard Worker 461*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 462*c0909341SAndroid Build Coastguard Worker40: // V - 4xN or 2xN (put only) 463*c0909341SAndroid Build Coastguard Worker.ifc \type, put 464*c0909341SAndroid Build Coastguard Worker cmp \w, #2 465*c0909341SAndroid Build Coastguard Worker b.eq 20f 466*c0909341SAndroid Build Coastguard Worker.endif 467*c0909341SAndroid Build Coastguard Worker ldr s16, [\src] 468*c0909341SAndroid Build Coastguard Worker ldr s17, [\src, \s_strd] 469*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 470*c0909341SAndroid Build Coastguard Worker ldr s18, [\src] 471*c0909341SAndroid Build Coastguard Worker ldr s19, [\src, \s_strd] 472*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 473*c0909341SAndroid Build Coastguard Worker 474*c0909341SAndroid Build Coastguard Worker ldr s20, [\src] 475*c0909341SAndroid Build Coastguard Worker ldr s21, [\src, \s_strd] 476*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 477*c0909341SAndroid Build Coastguard Worker ldr s22, [\src] 478*c0909341SAndroid Build Coastguard Worker ldr s23, [\src, \s_strd] 479*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 480*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 // for prep: sub is enough 481*c0909341SAndroid Build Coastguard Worker 482*c0909341SAndroid Build Coastguard Worker zip1 v0.8b, v16.8b, v17.8b 483*c0909341SAndroid Build Coastguard Worker zip1 v2.8b, v18.8b, v19.8b 484*c0909341SAndroid Build Coastguard Worker zip1 v18.8b, v20.8b, v21.8b 485*c0909341SAndroid Build Coastguard Worker zip1 v24.8b, v22.8b, v23.8b 486*c0909341SAndroid Build Coastguard Worker 487*c0909341SAndroid Build Coastguard Worker zip1 v16.8h, v0.8h, v2.8h 488*c0909341SAndroid Build Coastguard Worker zip1 
v17.8h, v18.8h, v24.8h 489*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 490*c0909341SAndroid Build Coastguard Worker sub v16.16b, v16.16b, v5.16b 491*c0909341SAndroid Build Coastguard Worker sub v17.16b, v17.16b, v5.16b 492*c0909341SAndroid Build Coastguard Worker.endif 493*c0909341SAndroid Build Coastguard Worker.ifc \type, put 494*c0909341SAndroid Build Coastguard Worker b.eq 42f 495*c0909341SAndroid Build Coastguard Worker.endif 496*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 497*c0909341SAndroid Build Coastguard Worker4: 498*c0909341SAndroid Build Coastguard Worker ldr s18, [\src] 499*c0909341SAndroid Build Coastguard Worker ldr s21, [\src, \s_strd] 500*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 501*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 502*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 503*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 504*c0909341SAndroid Build Coastguard Worker.else // neon_dotprod 505*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 506*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 507*c0909341SAndroid Build Coastguard Worker sub v18.16b, v18.16b, v5.16b 508*c0909341SAndroid Build Coastguard Worker sub v21.16b, v21.16b, v5.16b 509*c0909341SAndroid Build Coastguard Worker.endif 510*c0909341SAndroid Build Coastguard Worker tbl v19.16b, {v16.16b, v17.16b}, v6.16b 511*c0909341SAndroid Build Coastguard Worker tbl v20.16b, {v17.16b, v18.16b}, v28.16b 512*c0909341SAndroid Build Coastguard Worker 513*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 514*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 515*c0909341SAndroid Build Coastguard Worker 516*c0909341SAndroid Build Coastguard Worker tbl v16.16b, {v19.16b, v20.16b}, v6.16b 517*c0909341SAndroid Build Coastguard Worker tbl v17.16b, {v20.16b, v21.16b}, v28.16b 518*c0909341SAndroid Build Coastguard Worker 519*c0909341SAndroid 
Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 520*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 521*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 522*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 523*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 524*c0909341SAndroid Build Coastguard Worker rshrn v0.4h, v0.4s, #2 525*c0909341SAndroid Build Coastguard Worker rshrn2 v0.8h, v1.4s, #2 526*c0909341SAndroid Build Coastguard Worker .else 527*c0909341SAndroid Build Coastguard Worker shrn v0.4h, v0.4s, #2 528*c0909341SAndroid Build Coastguard Worker shrn2 v0.8h, v1.4s, #2 529*c0909341SAndroid Build Coastguard Worker .endif 530*c0909341SAndroid Build Coastguard Worker str q0, [\dst], #16 531*c0909341SAndroid Build Coastguard Worker.else 532*c0909341SAndroid Build Coastguard Worker uzp1 v0.8h, v0.8h, v1.8h 533*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 534*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 535*c0909341SAndroid Build Coastguard Worker fmov x8, d0 536*c0909341SAndroid Build Coastguard Worker lsr x9, x8, #32 537*c0909341SAndroid Build Coastguard Worker str w8, [\dst] 538*c0909341SAndroid Build Coastguard Worker str w9, [\dst, \d_strd] 539*c0909341SAndroid Build Coastguard Worker add \dst, \dst, \d_strd, lsl #1 540*c0909341SAndroid Build Coastguard Worker.endif 541*c0909341SAndroid Build Coastguard Worker b.gt 4b 542*c0909341SAndroid Build Coastguard Worker 543*c0909341SAndroid Build Coastguard Worker.ifc \type, put 544*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 545*c0909341SAndroid Build Coastguard Worker42: 546*c0909341SAndroid Build Coastguard Worker.endif 547*c0909341SAndroid Build Coastguard Worker ldr s18, [\src] 548*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 549*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 550*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 551*c0909341SAndroid Build Coastguard Worker.else 
// neon_dotprod 552*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 553*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 554*c0909341SAndroid Build Coastguard Worker sub v18.16b, v18.16b, v5.16b 555*c0909341SAndroid Build Coastguard Worker.endif 556*c0909341SAndroid Build Coastguard Worker tbl v19.16b, {v16.16b, v17.16b}, v6.16b 557*c0909341SAndroid Build Coastguard Worker tbl v20.16b, {v17.16b, v18.16b}, v28.16b 558*c0909341SAndroid Build Coastguard Worker 559*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 560*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 561*c0909341SAndroid Build Coastguard Worker 562*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 563*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 564*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 565*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 566*c0909341SAndroid Build Coastguard Worker rshrn v0.4h, v0.4s, #2 567*c0909341SAndroid Build Coastguard Worker rshrn2 v0.8h, v1.4s, #2 568*c0909341SAndroid Build Coastguard Worker .else 569*c0909341SAndroid Build Coastguard Worker shrn v0.4h, v0.4s, #2 570*c0909341SAndroid Build Coastguard Worker shrn2 v0.8h, v1.4s, #2 571*c0909341SAndroid Build Coastguard Worker .endif 572*c0909341SAndroid Build Coastguard Worker str q0, [\dst] 573*c0909341SAndroid Build Coastguard Worker.else 574*c0909341SAndroid Build Coastguard Worker uzp1 v0.8h, v0.8h, v1.8h 575*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 576*c0909341SAndroid Build Coastguard Worker fmov x8, d0 577*c0909341SAndroid Build Coastguard Worker lsr x9, x8, #32 578*c0909341SAndroid Build Coastguard Worker str w8, [\dst] 579*c0909341SAndroid Build Coastguard Worker str w9, [\dst, \d_strd] 580*c0909341SAndroid Build Coastguard Worker.endif 581*c0909341SAndroid Build Coastguard Worker ret 582*c0909341SAndroid Build Coastguard Worker 583*c0909341SAndroid 
Build Coastguard Worker.ifc \type, put 584*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 585*c0909341SAndroid Build Coastguard Worker20: // V - 2xN 586*c0909341SAndroid Build Coastguard Worker ldr h16, [\src] 587*c0909341SAndroid Build Coastguard Worker ldr h17, [\src, \s_strd] 588*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 589*c0909341SAndroid Build Coastguard Worker ldr h18, [\src] 590*c0909341SAndroid Build Coastguard Worker ldr h19, [\src, \s_strd] 591*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 592*c0909341SAndroid Build Coastguard Worker 593*c0909341SAndroid Build Coastguard Worker ldr h20, [\src] 594*c0909341SAndroid Build Coastguard Worker ldr h21, [\src, \s_strd] 595*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 596*c0909341SAndroid Build Coastguard Worker ldr h22, [\src] 597*c0909341SAndroid Build Coastguard Worker ldr h23, [\src, \s_strd] 598*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 599*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 600*c0909341SAndroid Build Coastguard Worker 601*c0909341SAndroid Build Coastguard Worker zip1 v0.8b, v16.8b, v17.8b 602*c0909341SAndroid Build Coastguard Worker zip1 v2.8b, v18.8b, v19.8b 603*c0909341SAndroid Build Coastguard Worker zip1 v18.8b, v20.8b, v21.8b 604*c0909341SAndroid Build Coastguard Worker zip1 v24.8b, v22.8b, v23.8b 605*c0909341SAndroid Build Coastguard Worker 606*c0909341SAndroid Build Coastguard Worker zip1 v16.4h, v0.4h, v2.4h 607*c0909341SAndroid Build Coastguard Worker zip1 v17.4h, v18.4h, v24.4h 608*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_dotprod 609*c0909341SAndroid Build Coastguard Worker sub v16.8b, v16.8b, v5.8b 610*c0909341SAndroid Build Coastguard Worker sub v17.8b, v17.8b, v5.8b 611*c0909341SAndroid Build Coastguard Worker .endif 612*c0909341SAndroid Build Coastguard Worker b.eq 22f 613*c0909341SAndroid Build Coastguard Worker 
614*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 615*c0909341SAndroid Build Coastguard Worker2: 616*c0909341SAndroid Build Coastguard Worker ldr h18, [\src] 617*c0909341SAndroid Build Coastguard Worker ldr h21, [\src, \s_strd] 618*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 619*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 620*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 621*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 622*c0909341SAndroid Build Coastguard Worker .else // neon_dotprod 623*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 624*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 625*c0909341SAndroid Build Coastguard Worker sub v18.8b, v18.8b, v5.8b 626*c0909341SAndroid Build Coastguard Worker sub v21.8b, v21.8b, v5.8b 627*c0909341SAndroid Build Coastguard Worker .endif 628*c0909341SAndroid Build Coastguard Worker tbl v19.16b, {v16.16b, v17.16b}, v6.16b 629*c0909341SAndroid Build Coastguard Worker tbl v20.16b, {v17.16b, v18.16b}, v28.16b 630*c0909341SAndroid Build Coastguard Worker 631*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 632*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 633*c0909341SAndroid Build Coastguard Worker 634*c0909341SAndroid Build Coastguard Worker tbl v16.16b, {v19.16b, v20.16b}, v6.16b 635*c0909341SAndroid Build Coastguard Worker tbl v17.16b, {v20.16b, v21.16b}, v28.16b 636*c0909341SAndroid Build Coastguard Worker 637*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 638*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 639*c0909341SAndroid Build Coastguard Worker 640*c0909341SAndroid Build Coastguard Worker uzp1 v0.8h, v0.8h, v1.8h 641*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 642*c0909341SAndroid Build Coastguard Worker 643*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 644*c0909341SAndroid Build Coastguard Worker
fmov x8, d0 645*c0909341SAndroid Build Coastguard Worker lsr x9, x8, #32 646*c0909341SAndroid Build Coastguard Worker strh w8, [\dst] 647*c0909341SAndroid Build Coastguard Worker strh w9, [\dst, \d_strd] 648*c0909341SAndroid Build Coastguard Worker add \dst, \dst, \d_strd, lsl #1 649*c0909341SAndroid Build Coastguard Worker b.gt 2b 650*c0909341SAndroid Build Coastguard Worker 651*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 652*c0909341SAndroid Build Coastguard Worker22: 653*c0909341SAndroid Build Coastguard Worker ldr h18, [\src] 654*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 655*c0909341SAndroid Build Coastguard Worker movi v0.4s, #0 656*c0909341SAndroid Build Coastguard Worker movi v1.4s, #0 657*c0909341SAndroid Build Coastguard Worker .else // neon_dotprod 658*c0909341SAndroid Build Coastguard Worker mov v0.16b, v4.16b 659*c0909341SAndroid Build Coastguard Worker mov v1.16b, v4.16b 660*c0909341SAndroid Build Coastguard Worker sub v18.8b, v18.8b, v5.8b 661*c0909341SAndroid Build Coastguard Worker .endif 662*c0909341SAndroid Build Coastguard Worker tbl v19.16b, {v16.16b, v17.16b}, v6.16b 663*c0909341SAndroid Build Coastguard Worker tbl v20.16b, {v17.16b, v18.16b}, v28.16b 664*c0909341SAndroid Build Coastguard Worker 665*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v16.16b, v7.4b[0] 666*c0909341SAndroid Build Coastguard Worker \dot v0.4s, v17.16b, v7.4b[1] 667*c0909341SAndroid Build Coastguard Worker 668*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v19.16b, v7.4b[0] 669*c0909341SAndroid Build Coastguard Worker \dot v1.4s, v20.16b, v7.4b[1] 670*c0909341SAndroid Build Coastguard Worker 671*c0909341SAndroid Build Coastguard Worker uzp1 v0.8h, v0.8h, v1.8h 672*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #6 673*c0909341SAndroid Build Coastguard Worker 674*c0909341SAndroid Build Coastguard Worker fmov x8, d0 675*c0909341SAndroid Build Coastguard Worker lsr x9, x8, #32 676*c0909341SAndroid Build Coastguard
Worker strh w8, [\dst] 677*c0909341SAndroid Build Coastguard Worker strh w9, [\dst, \d_strd] 678*c0909341SAndroid Build Coastguard Worker ret 679*c0909341SAndroid Build Coastguard Worker.endif 680*c0909341SAndroid Build Coastguard Worker 681*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 682*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_h_hv_\isa): 683*c0909341SAndroid Build Coastguard Worker madd \mx, \mx, w11, w9 684*c0909341SAndroid Build Coastguard Worker madd w14, \my, w11, w10 // for HV 685*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 686*c0909341SAndroid Build Coastguard Worker mov w13, #0x2002 // FILTER_WEIGHT * 128 + rounding 687*c0909341SAndroid Build Coastguard Worker dup v27.4s, w13 // put H overrides this 688*c0909341SAndroid Build Coastguard Worker.endif 689*c0909341SAndroid Build Coastguard Worker movrel x13, h_tbl_neon_dotprod 690*c0909341SAndroid Build Coastguard Worker sub \src, \src, #3 // src - 3 691*c0909341SAndroid Build Coastguard Worker ldr q28, [x13] // for 4-tap & 8-tap H filters 692*c0909341SAndroid Build Coastguard Worker ubfx w15, \mx, #7, #7 693*c0909341SAndroid Build Coastguard Worker and \mx, \mx, #0x7F 694*c0909341SAndroid Build Coastguard Worker ubfx w11, w14, #7, #7 // for HV 695*c0909341SAndroid Build Coastguard Worker and w14, w14, #0x7F // for HV 696*c0909341SAndroid Build Coastguard Worker cmp \w, #4 697*c0909341SAndroid Build Coastguard Worker csel \mx, \mx, w15, le 698*c0909341SAndroid Build Coastguard Worker add \xmx, x12, \xmx, lsl #3 // subpel H filter address 699*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 700*c0909341SAndroid Build Coastguard Worker movi v24.16b, #128 701*c0909341SAndroid Build Coastguard Worker.endif 702*c0909341SAndroid Build Coastguard Worker cbz \my, L(\type\()_8tap_h_\isa) 703*c0909341SAndroid Build Coastguard Worker 704*c0909341SAndroid Build Coastguard Worker // HV cases 705*c0909341SAndroid Build Coastguard Worker cmp \h, #4 
706*c0909341SAndroid Build Coastguard Worker csel w14, w14, w11, le 707*c0909341SAndroid Build Coastguard Worker sub \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 3 708*c0909341SAndroid Build Coastguard Worker add \xmy, x12, x14, lsl #3 // subpel V filter address 709*c0909341SAndroid Build Coastguard Worker mov x15, x30 710*c0909341SAndroid Build Coastguard Worker ldr d7, [\xmy] 711*c0909341SAndroid Build Coastguard Worker.ifc \type, put 712*c0909341SAndroid Build Coastguard Worker ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion 713*c0909341SAndroid Build Coastguard Worker.endif // of 32b values to 8b 714*c0909341SAndroid Build Coastguard Worker sxtl v7.8h, v7.8b 715*c0909341SAndroid Build Coastguard Worker cmp w10, #SHARP1 716*c0909341SAndroid Build Coastguard Worker b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 717*c0909341SAndroid Build Coastguard Worker 718*c0909341SAndroid Build Coastguard Worker // HV 8-tap cases 719*c0909341SAndroid Build Coastguard Worker sub \src, \src, \s_strd // src - s_strd * 3 - 3 720*c0909341SAndroid Build Coastguard Worker cmp \w, #4 721*c0909341SAndroid Build Coastguard Worker b.eq 40f 722*c0909341SAndroid Build Coastguard Worker.ifc \type, put 723*c0909341SAndroid Build Coastguard Worker b.lt 20f 724*c0909341SAndroid Build Coastguard Worker.endif 725*c0909341SAndroid Build Coastguard Worker 726*c0909341SAndroid Build Coastguard Worker // .align JUMP_ALIGN // fallthrough 727*c0909341SAndroid Build Coastguard Worker80: // HV8 - 8xN+ 728*c0909341SAndroid Build Coastguard Worker ldp q29, q30, [x13, #16] 729*c0909341SAndroid Build Coastguard Worker ldr d26, [\xmx] 730*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 731*c0909341SAndroid Build Coastguard Worker add \wd_strd, \w, \w 732*c0909341SAndroid Build Coastguard Worker.endif 733*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 734*c0909341SAndroid Build Coastguard Worker81: 735*c0909341SAndroid Build Coastguard Worker mov \lsrc, \src 
736*c0909341SAndroid Build Coastguard Worker mov \ldst, \dst 737*c0909341SAndroid Build Coastguard Worker mov w8, \h 738*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 739*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 740*c0909341SAndroid Build Coastguard Worker srshr v16.8h, v22.8h, #2 741*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 742*c0909341SAndroid Build Coastguard Worker srshr v17.8h, v22.8h, #2 743*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 744*c0909341SAndroid Build Coastguard Worker srshr v18.8h, v22.8h, #2 745*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 746*c0909341SAndroid Build Coastguard Worker srshr v19.8h, v22.8h, #2 747*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 748*c0909341SAndroid Build Coastguard Worker srshr v20.8h, v22.8h, #2 749*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 750*c0909341SAndroid Build Coastguard Worker srshr v21.8h, v22.8h, #2 751*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 752*c0909341SAndroid Build Coastguard Worker srshr v22.8h, v22.8h, #2 753*c0909341SAndroid Build Coastguard Worker.else 754*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 755*c0909341SAndroid Build Coastguard Worker sshr v16.8h, v22.8h, #2 756*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 757*c0909341SAndroid Build Coastguard Worker sshr v17.8h, v22.8h, #2 758*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 759*c0909341SAndroid Build Coastguard Worker sshr v18.8h, v22.8h, #2 760*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 761*c0909341SAndroid Build Coastguard Worker sshr v19.8h, v22.8h, #2 762*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 763*c0909341SAndroid Build Coastguard Worker sshr v20.8h, v22.8h, #2 764*c0909341SAndroid 
Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 765*c0909341SAndroid Build Coastguard Worker sshr v21.8h, v22.8h, #2 766*c0909341SAndroid Build Coastguard Worker bl L(\type\()_hv_filter8_\isa) 767*c0909341SAndroid Build Coastguard Worker sshr v22.8h, v22.8h, #2 768*c0909341SAndroid Build Coastguard Worker.endif 769*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 770*c0909341SAndroid Build Coastguard Worker8: 771*c0909341SAndroid Build Coastguard Worker ldr q23, [\lsrc] 772*c0909341SAndroid Build Coastguard Worker add \lsrc, \lsrc, \s_strd 773*c0909341SAndroid Build Coastguard Worker 774*c0909341SAndroid Build Coastguard Worker smull v0.4s, v16.4h, v7.h[0] 775*c0909341SAndroid Build Coastguard Worker smull2 v1.4s, v16.8h, v7.h[0] 776*c0909341SAndroid Build Coastguard Worker mov v16.16b, v17.16b 777*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 778*c0909341SAndroid Build Coastguard Worker movi v5.4s, #0 779*c0909341SAndroid Build Coastguard Worker movi v6.4s, #0 780*c0909341SAndroid Build Coastguard Worker tbl v2.16b, {v23.16b}, v28.16b 781*c0909341SAndroid Build Coastguard Worker tbl v3.16b, {v23.16b}, v29.16b 782*c0909341SAndroid Build Coastguard Worker.else // neon_dotprod 783*c0909341SAndroid Build Coastguard Worker sub v23.16b, v23.16b, v24.16b 784*c0909341SAndroid Build Coastguard Worker mov v5.16b, v27.16b 785*c0909341SAndroid Build Coastguard Worker mov v6.16b, v27.16b 786*c0909341SAndroid Build Coastguard Worker.endif 787*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v17.4h, v7.h[1] 788*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v17.8h, v7.h[1] 789*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 790*c0909341SAndroid Build Coastguard Worker tbl v4.16b, {v23.16b}, v30.16b 791*c0909341SAndroid Build Coastguard Worker mov v17.16b, v18.16b 792*c0909341SAndroid Build Coastguard Worker.else // neon_dotprod 793*c0909341SAndroid Build Coastguard Worker mov v17.16b, v18.16b 794*c0909341SAndroid Build 
Coastguard Worker tbl v2.16b, {v23.16b}, v28.16b 795*c0909341SAndroid Build Coastguard Worker tbl v3.16b, {v23.16b}, v29.16b 796*c0909341SAndroid Build Coastguard Worker tbl v4.16b, {v23.16b}, v30.16b 797*c0909341SAndroid Build Coastguard Worker.endif 798*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v18.4h, v7.h[2] 799*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v18.8h, v7.h[2] 800*c0909341SAndroid Build Coastguard Worker mov v18.16b, v19.16b 801*c0909341SAndroid Build Coastguard Worker 802*c0909341SAndroid Build Coastguard Worker \dot v5.4s, v2.16b, v26.4b[0] 803*c0909341SAndroid Build Coastguard Worker \dot v6.4s, v3.16b, v26.4b[0] 804*c0909341SAndroid Build Coastguard Worker 805*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v19.4h, v7.h[3] 806*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v19.8h, v7.h[3] 807*c0909341SAndroid Build Coastguard Worker mov v19.16b, v20.16b 808*c0909341SAndroid Build Coastguard Worker 809*c0909341SAndroid Build Coastguard Worker \dot v5.4s, v3.16b, v26.4b[1] 810*c0909341SAndroid Build Coastguard Worker \dot v6.4s, v4.16b, v26.4b[1] 811*c0909341SAndroid Build Coastguard Worker 812*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v20.4h, v7.h[4] 813*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v20.8h, v7.h[4] 814*c0909341SAndroid Build Coastguard Worker mov v20.16b, v21.16b 815*c0909341SAndroid Build Coastguard Worker 816*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v21.4h, v7.h[5] 817*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v21.8h, v7.h[5] 818*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 819*c0909341SAndroid Build Coastguard Worker uzp1 v23.8h, v5.8h, v6.8h 820*c0909341SAndroid Build Coastguard Worker.endif 821*c0909341SAndroid Build Coastguard Worker mov v21.16b, v22.16b 822*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v22.4h, v7.h[6] 823*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v22.8h, v7.h[6] 824*c0909341SAndroid Build 
Coastguard Worker.ifc \isa, neon_i8mm 825*c0909341SAndroid Build Coastguard Worker subs w8, w8, #1 826*c0909341SAndroid Build Coastguard Worker.endif 827*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 828*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 829*c0909341SAndroid Build Coastguard Worker srshr v22.8h, v23.8h, #2 830*c0909341SAndroid Build Coastguard Worker .else 831*c0909341SAndroid Build Coastguard Worker sshr v22.8h, v23.8h, #2 832*c0909341SAndroid Build Coastguard Worker .endif 833*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v22.4h, v7.h[7] 834*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v22.8h, v7.h[7] 835*c0909341SAndroid Build Coastguard Worker rshrn v0.4h, v0.4s, #6 836*c0909341SAndroid Build Coastguard Worker rshrn2 v0.8h, v1.4s, #6 837*c0909341SAndroid Build Coastguard Worker.else // put 838*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 839*c0909341SAndroid Build Coastguard Worker rshrn v22.4h, v5.4s, #2 840*c0909341SAndroid Build Coastguard Worker rshrn2 v22.8h, v6.4s, #2 841*c0909341SAndroid Build Coastguard Worker .else 842*c0909341SAndroid Build Coastguard Worker shrn v22.4h, v5.4s, #2 843*c0909341SAndroid Build Coastguard Worker shrn2 v22.8h, v6.4s, #2 844*c0909341SAndroid Build Coastguard Worker .endif 845*c0909341SAndroid Build Coastguard Worker smlal v0.4s, v22.4h, v7.h[7] 846*c0909341SAndroid Build Coastguard Worker smlal2 v1.4s, v22.8h, v7.h[7] 847*c0909341SAndroid Build Coastguard Worker tbl v0.16b, {v0.16b, v1.16b}, v25.16b 848*c0909341SAndroid Build Coastguard Worker sqrshrun v0.8b, v0.8h, #2 849*c0909341SAndroid Build Coastguard Worker.endif 850*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_dotprod 851*c0909341SAndroid Build Coastguard Worker subs w8, w8, #1 852*c0909341SAndroid Build Coastguard Worker.endif 853*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 854*c0909341SAndroid Build Coastguard Worker st1 {v0.8h}, [\ldst], \d_strd 855*c0909341SAndroid 
// Tail of the 8xN+ row loop that begins above this point: close the current
// 8-column strip, then move on to the next strip while w - 8 is still > 0.
        b.gt            8b
        add             \dst, \dst, #16         // this branch steps dst by 16 bytes per strip
.else
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8          // this branch steps dst by 8 bytes per strip
.endif
        add             \src, \src, #8          // next 8 source columns
        subs            \w, \w, #8
        b.gt            81b
        ret             x15                     // x15 presumably holds the saved return address (set outside this view)

// HV8 4xN: s26 is loaded from xmx + 2 and src is advanced by 2 bytes. The row
// window v16-v22 is primed with seven rows: each helper call filters one
// source row horizontally into v22.4s, and shrn by 2 narrows it to 16 bit.
        .align JUMP_ALIGN
40:     // HV8 - 4xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

// One output row per iteration: v0 accumulates v16..v22 times v7.h[0..6]
// while the window is shifted down by one register; the freshly loaded row
// is filtered horizontally into v22 and contributes the v7.h[7] term.
        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b // dotprod path: subtract v24 from the loaded bytes
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0               // rounding is done by rshrn below
.else
        mov             v5.16b, v27.16b         // accumulator starts from v27 (set up outside this view)
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]  // uses the old v22 before it is replaced
.ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
.else
        shrn            v22.4h, v5.4s, #2
.endif
        smlal           v0.4s, v22.4h, v7.h[7]
.ifc \type, prep
        // prep: rounding narrow by 6, store 4 halfwords contiguously
        rshrn           v0.4h, v0.4s, #6
        str             d0, [\dst], #8
        subs            \h, \h, #1
.else
        // put: byte shuffle through v25, saturating rounding narrow by 2,
        // store 4 bytes and step dst by d_strd
        subs            \h, \h, #1
        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15

// HV8 2xN (put only): same window priming and row loop as the 4xN path
// above, but only 2 bytes (h0) are stored per output row.
.ifc \type, put
        .align JUMP_ALIGN
20:     // HV8 - 2xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
    .endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b
    .endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b

        smlal           v0.4s, v22.4h, v7.h[6]
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        subs            \h, \h, #1

        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2

        str             h0, [\dst]
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif

// Entry of the 6-tap HV variant: w == 4 goes to the next 40 label, w < 4
// (put only) goes to the next 20 label, anything else falls through to the
// 80 label that follows.
        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp             \w, #4
        b.eq            40f
.ifc \type, put
        b.lt            20f
.endif

// HV6 8xN+: reached by fallthrough from the width dispatch directly above.
// d26 is loaded from xmx; for prep, wd_strd is set to 2 * w.
        // .align JUMP_ALIGN    // fallthrough
80:     // HV6 - 8xN+
        ldr             d26, [\xmx]
.ifc \type, prep
        add             \wd_strd, \w, \w
.endif
.ifc \isa, neon_i8mm
        cmp             w9, #SHARP1
        b.eq            88f             // horizontal == SHARP1

        // USMMLA path: q29/q30 are loaded from x13 + OFFSET_USMMLA; the upper
        // half of v26 becomes the lower half rotated by one byte (ext by 7).
        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        // Per 8-column strip: work on local copies of src, dst and h, and
        // prime the five-row window v16-v20 (rounding shift right by 2).
        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v20.8h, v22.8h, #2

        // One output row of 8 pixels per iteration: v0/v1 accumulate
        // v16..v20 times v7.h[1..5]; the new row is filtered into v20 and
        // contributes the v7.h[6] term.
        .align LOOP_ALIGN
8:
        ld1             {v23.16b}, [\lsrc], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
        mov             v16.16b, v17.16b
        movi            v5.4s, #0
        movi            v6.4s, #0
        tbl             v2.16b, {v23.16b}, v29.16b
        tbl             v3.16b, {v23.16b}, v30.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b

        usmmla          v5.4s, v2.16b, v26.16b
        usmmla          v6.4s, v3.16b, v26.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b
        subs            w8, w8, #1      // flags consumed by the b.gt 8b after the store

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        uzp1            v23.8h, v5.8h, v6.8h
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
        srshr           v20.8h, v23.8h, #2
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
    .ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
    .else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun        v0.8b, v0.8h, #2
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
    .endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
        // Path using the 8-tap horizontal helper: taken by neon_i8mm when
        // w9 == SHARP1 and always by the other isa. q29/q30 come from x13 + 16.
        ldp             q29, q30, [x13, #16]

        .align LOOP_ALIGN
81:
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
.else
        // non-i8mm: plain arithmetic shift; the helper starts its
        // accumulators from v27
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:
        ldr             q23, [\lsrc]
        add             \lsrc, \lsrc, \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
.ifc \isa, neon_dotprod
        sub             v23.16b, v23.16b, v24.16b
.endif
        mov             v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
.else
        mov             v5.16b, v27.16b
        mov             v6.16b, v27.16b
.endif
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b

        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b
        uzp1            v23.8h, v5.8h, v6.8h

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
.ifc \isa, neon_i8mm
        srshr           v20.8h, v23.8h, #2
.else
        sshr            v20.8h, v23.8h, #2
.endif
        subs            w8, w8, #1
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
.else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun        v0.8b, v0.8h, #2
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15

// Helper: horizontal 8-tap filter of one 16-byte row.
//   in:  lsrc (post-incremented by s_strd), v26.4b[0..1], v28-v30 as tbl
//        indices; non-i8mm also reads v24 (subtracted from the input bytes)
//        and v27 (initial accumulator value)
//   out: v22.8h = even halfwords of the 32-bit sums in v22 and v23
//   clobbers: v2, v3, v4, v22, v23
        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        ld1             {v4.16b}, [\lsrc], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #0
        movi            v23.4s, #0
.else   // neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        tbl             v3.16b, {v4.16b}, v29.16b
        tbl             v4.16b, {v4.16b}, v30.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]
        uzp1            v22.8h, v22.8h, v23.8h
        ret

// Helper (neon_i8mm only): horizontal filter of one 16-byte row using two
// USMMLA operations against v26, with v29/v30 as tbl indices.
//   in:  lsrc (post-incremented by s_strd)
//   out: v22.8h; clobbers v2, v3, v4, v23
.ifc \isa, neon_i8mm
        .align FUNC_ALIGN
L(\type\()_hv_filter6_neon_i8mm):
        ld1             {v4.16b}, [\lsrc], \s_strd
        movi            v22.4s, #0
        movi            v23.4s, #0
        tbl             v2.16b, {v4.16b}, v29.16b
        tbl             v3.16b, {v4.16b}, v30.16b
        usmmla          v22.4s, v2.16b, v26.16b
        usmmla          v23.4s, v3.16b, v26.16b
        uzp1            v22.8h, v22.8h, v23.8h
        ret
.endif

// Helper: horizontal 4-tap filter of one 8-byte row. Unlike the two helpers
// above it reads and post-increments src, not lsrc.
//   out: v22.4s (32-bit sums; the callers narrow them with shrn by 2)
//   i8mm starts the accumulator at 2, so that shrn by 2 rounds to nearest;
//   the other isa starts from v27 and subtracts v24 from the input bytes.
        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        ld1             {v4.8b}, [\src], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #2
.else
        mov             v22.16b, v27.16b
        sub             v4.16b, v4.16b, v24.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        ret

// HV6 4xN: s26 is loaded from xmx + 2, src is advanced by 2 bytes, and the
// five-row window v16-v20 is primed through the 4-tap horizontal helper.
        .align JUMP_ALIGN
40:     // HV6 - 4xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        // One output row per iteration: v0 accumulates v16..v20 times
        // v7.h[1..5]; the new row lands in v20 and adds the v7.h[6] term.
        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
.endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
.else
        shrn            v20.4h, v5.4s, #2
.endif
        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6
        str             d0, [\dst], #8
.else
        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15

// HV6 2xN (put only): same as the HV6 4xN path, storing 2 bytes per row.
.ifc \type, put
        .align JUMP_ALIGN
20:     // HV6 - 2xN
        ldur            s26, [\xmx, #2]
        add             \src, \src, #2

        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b
    .endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b
    .endif

        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
    .ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
    .else
        shrn            v20.4h, v5.4s, #2
    .endif

        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]

        tbl             v0.8b, {v0.16b}, v25.8b
        sqrshrun        v0.8b, v0.8h, #2

        str             h0, [\dst]
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif

// Horizontal-only entry: x8 indexes a table of signed 32-bit offsets; the
// branch target is the table address plus the loaded offset. For put, v27 is
// preset per 32-bit lane to 34 (i8mm) or 0x2022 (other isa).
        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        movrel          x11, \type\()_8tap_h_\isa\()_tbl
        ldrsw           x8, [x11, x8, lsl #2]
.ifc \type, put
    .ifc \isa, neon_i8mm
        movi            v27.4s, #34     // special rounding
    .else
        mov             w10, #0x2022    // 64 * 128 + 34, bias and rounding for SDOT
        dup             v27.4s, w10
    .endif
.endif
        add             x11, x11, x8
        br              x11

// H 2xN (put only): two rows per iteration, 2 bytes stored per row.
.ifc \type, put
        .align JUMP_ALIGN
20:     // H - 2xN
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2
        ldur            s26, [\xmx, #2]

        .align LOOP_ALIGN
2:
        ldr             d0, [\src]
        ldr             d1, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b
        mov             v5.16b, v27.16b

        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]

        uzp1            v4.8h, v4.8h, v5.8h
        sqshrun         v4.8b, v4.8h, #6

        subs            \h, \h, #2
        fmov            x8, d4          // low 32 bits: first row, high 32 bits: second row
        lsr             x9, x8, #32
        strh            w8, [\dst]
        strh            w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
        b.gt            2b
        ret
.endif

// H 4xN: two rows per iteration. prep stores 8 halfwords contiguously,
// put stores 4 bytes to each of two dst rows.
        .align JUMP_ALIGN
40:     // H - 4xN
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2
        ldur            s26, [\xmx, #2]

        .align LOOP_ALIGN
4:
        ldr             d0, [\src]
        ldr             d1, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0
        movi            v5.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b
        mov             v5.16b, v27.16b
.endif
        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]
.ifc \type, prep
        subs            \h, \h, #2
    .ifc \isa, neon_i8mm
        uzp1            v4.8h, v4.8h, v5.8h
        srshr           v4.8h, v4.8h, #2
    .else
        shrn            v4.4h, v4.4s, #2
        shrn2           v4.8h, v5.4s, #2
    .endif
        str             q4, [\dst], #16
.else   // put
        uzp1            v4.8h, v4.8h, v5.8h
        sqshrun         v4.8b, v4.8h, #6
        subs            \h, \h, #2
        fmov            x8, d4
        lsr             x9, x8, #32
        str             w8, [\dst]
        str             w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret

        .align JUMP_ALIGN
80:     // H - 8xN
        AARCH64_VALID_JUMP_TARGET
// NOTE(review): the operands of the ldr below are on the next line of the
// file, outside this block; the mnemonic is kept as the last token here.
        ldr
d26, [\xmx] 1473*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 1474*c0909341SAndroid Build Coastguard Worker cmp w9, #SHARP1 1475*c0909341SAndroid Build Coastguard Worker b.eq 88f // horizontal == SHARP1 1476*c0909341SAndroid Build Coastguard Worker 1477*c0909341SAndroid Build Coastguard Worker ldp q29, q30, [x13, #(OFFSET_USMMLA)] 1478*c0909341SAndroid Build Coastguard Worker ext v0.8b, v26.8b, v26.8b, #7 1479*c0909341SAndroid Build Coastguard Worker ins v26.d[1], v0.d[0] 1480*c0909341SAndroid Build Coastguard Worker 1481*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 1482*c0909341SAndroid Build Coastguard Worker8: 1483*c0909341SAndroid Build Coastguard Worker ldr q0, [\src] 1484*c0909341SAndroid Build Coastguard Worker ldr q16, [\src, \s_strd] 1485*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd, lsl #1 1486*c0909341SAndroid Build Coastguard Worker .ifc \type, prep 1487*c0909341SAndroid Build Coastguard Worker movi v4.4s, #0 1488*c0909341SAndroid Build Coastguard Worker movi v5.4s, #0 1489*c0909341SAndroid Build Coastguard Worker movi v20.4s, #0 1490*c0909341SAndroid Build Coastguard Worker movi v21.4s, #0 1491*c0909341SAndroid Build Coastguard Worker .else 1492*c0909341SAndroid Build Coastguard Worker mov v4.16b, v27.16b 1493*c0909341SAndroid Build Coastguard Worker mov v5.16b, v27.16b 1494*c0909341SAndroid Build Coastguard Worker mov v20.16b, v27.16b 1495*c0909341SAndroid Build Coastguard Worker mov v21.16b, v27.16b 1496*c0909341SAndroid Build Coastguard Worker .endif 1497*c0909341SAndroid Build Coastguard Worker tbl v1.16b, {v0.16b}, v29.16b 1498*c0909341SAndroid Build Coastguard Worker tbl v2.16b, {v0.16b}, v30.16b 1499*c0909341SAndroid Build Coastguard Worker tbl v17.16b, {v16.16b}, v29.16b 1500*c0909341SAndroid Build Coastguard Worker tbl v18.16b, {v16.16b}, v30.16b 1501*c0909341SAndroid Build Coastguard Worker 1502*c0909341SAndroid Build Coastguard Worker usmmla v4.4s, v1.16b, v26.16b 1503*c0909341SAndroid Build 
Coastguard Worker usmmla v5.4s, v2.16b, v26.16b 1504*c0909341SAndroid Build Coastguard Worker usmmla v20.4s, v17.16b, v26.16b 1505*c0909341SAndroid Build Coastguard Worker usmmla v21.4s, v18.16b, v26.16b 1506*c0909341SAndroid Build Coastguard Worker 1507*c0909341SAndroid Build Coastguard Worker uzp1 v4.8h, v4.8h, v5.8h 1508*c0909341SAndroid Build Coastguard Worker uzp1 v20.8h, v20.8h, v21.8h 1509*c0909341SAndroid Build Coastguard Worker .ifc \type, prep 1510*c0909341SAndroid Build Coastguard Worker srshr v4.8h, v4.8h, #2 1511*c0909341SAndroid Build Coastguard Worker srshr v20.8h, v20.8h, #2 1512*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 1513*c0909341SAndroid Build Coastguard Worker stp q4, q20, [\dst], #32 1514*c0909341SAndroid Build Coastguard Worker .else // put 1515*c0909341SAndroid Build Coastguard Worker sqshrun v4.8b, v4.8h, #6 1516*c0909341SAndroid Build Coastguard Worker sqshrun v20.8b, v20.8h, #6 1517*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 1518*c0909341SAndroid Build Coastguard Worker str d4, [\dst] 1519*c0909341SAndroid Build Coastguard Worker str d20, [\dst, \d_strd] 1520*c0909341SAndroid Build Coastguard Worker add \dst, \dst, \d_strd, lsl #1 1521*c0909341SAndroid Build Coastguard Worker .endif 1522*c0909341SAndroid Build Coastguard Worker b.gt 8b 1523*c0909341SAndroid Build Coastguard Worker ret 1524*c0909341SAndroid Build Coastguard Worker 1525*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 1526*c0909341SAndroid Build Coastguard Worker88: 1527*c0909341SAndroid Build Coastguard Worker.endif // neon_i8mm 1528*c0909341SAndroid Build Coastguard Worker ldp q29, q30, [x13, #16] 1529*c0909341SAndroid Build Coastguard Worker 1530*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 1531*c0909341SAndroid Build Coastguard Worker8: 1532*c0909341SAndroid Build Coastguard Worker ldr q0, [\src] 1533*c0909341SAndroid Build Coastguard Worker ldr q16, [\src, \s_strd] 1534*c0909341SAndroid Build Coastguard Worker add 
\src, \src, \s_strd, lsl #1 1535*c0909341SAndroid Build Coastguard Worker.ifc \type\()_\isa, prep_neon_i8mm 1536*c0909341SAndroid Build Coastguard Worker movi v4.4s, #0 1537*c0909341SAndroid Build Coastguard Worker movi v5.4s, #0 1538*c0909341SAndroid Build Coastguard Worker movi v20.4s, #0 1539*c0909341SAndroid Build Coastguard Worker movi v21.4s, #0 1540*c0909341SAndroid Build Coastguard Worker.else 1541*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_dotprod 1542*c0909341SAndroid Build Coastguard Worker sub v0.16b, v0.16b, v24.16b 1543*c0909341SAndroid Build Coastguard Worker sub v16.16b, v16.16b, v24.16b 1544*c0909341SAndroid Build Coastguard Worker .endif 1545*c0909341SAndroid Build Coastguard Worker mov v4.16b, v27.16b 1546*c0909341SAndroid Build Coastguard Worker mov v5.16b, v27.16b 1547*c0909341SAndroid Build Coastguard Worker mov v20.16b, v27.16b 1548*c0909341SAndroid Build Coastguard Worker mov v21.16b, v27.16b 1549*c0909341SAndroid Build Coastguard Worker.endif 1550*c0909341SAndroid Build Coastguard Worker tbl v1.16b, {v0.16b}, v28.16b 1551*c0909341SAndroid Build Coastguard Worker tbl v2.16b, {v0.16b}, v29.16b 1552*c0909341SAndroid Build Coastguard Worker tbl v3.16b, {v0.16b}, v30.16b 1553*c0909341SAndroid Build Coastguard Worker tbl v17.16b, {v16.16b}, v28.16b 1554*c0909341SAndroid Build Coastguard Worker tbl v18.16b, {v16.16b}, v29.16b 1555*c0909341SAndroid Build Coastguard Worker tbl v19.16b, {v16.16b}, v30.16b 1556*c0909341SAndroid Build Coastguard Worker 1557*c0909341SAndroid Build Coastguard Worker \dot v4.4s, v1.16b, v26.4b[0] 1558*c0909341SAndroid Build Coastguard Worker \dot v5.4s, v2.16b, v26.4b[0] 1559*c0909341SAndroid Build Coastguard Worker \dot v20.4s, v17.16b, v26.4b[0] 1560*c0909341SAndroid Build Coastguard Worker \dot v21.4s, v18.16b, v26.4b[0] 1561*c0909341SAndroid Build Coastguard Worker \dot v4.4s, v2.16b, v26.4b[1] 1562*c0909341SAndroid Build Coastguard Worker \dot v5.4s, v3.16b, v26.4b[1] 1563*c0909341SAndroid Build 
Coastguard Worker \dot v20.4s, v18.16b, v26.4b[1] 1564*c0909341SAndroid Build Coastguard Worker \dot v21.4s, v19.16b, v26.4b[1] 1565*c0909341SAndroid Build Coastguard Worker 1566*c0909341SAndroid Build Coastguard Worker uzp1 v4.8h, v4.8h, v5.8h 1567*c0909341SAndroid Build Coastguard Worker uzp1 v20.8h, v20.8h, v21.8h 1568*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 1569*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 1570*c0909341SAndroid Build Coastguard Worker srshr v4.8h, v4.8h, #2 1571*c0909341SAndroid Build Coastguard Worker srshr v20.8h, v20.8h, #2 1572*c0909341SAndroid Build Coastguard Worker .else 1573*c0909341SAndroid Build Coastguard Worker sshr v4.8h, v4.8h, #2 1574*c0909341SAndroid Build Coastguard Worker sshr v20.8h, v20.8h, #2 1575*c0909341SAndroid Build Coastguard Worker .endif 1576*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 1577*c0909341SAndroid Build Coastguard Worker stp q4, q20, [\dst], #32 1578*c0909341SAndroid Build Coastguard Worker.else // put 1579*c0909341SAndroid Build Coastguard Worker sqshrun v4.8b, v4.8h, #6 1580*c0909341SAndroid Build Coastguard Worker sqshrun v20.8b, v20.8h, #6 1581*c0909341SAndroid Build Coastguard Worker subs \h, \h, #2 1582*c0909341SAndroid Build Coastguard Worker str d4, [\dst] 1583*c0909341SAndroid Build Coastguard Worker str d20, [\dst, \d_strd] 1584*c0909341SAndroid Build Coastguard Worker add \dst, \dst, \d_strd, lsl #1 1585*c0909341SAndroid Build Coastguard Worker.endif 1586*c0909341SAndroid Build Coastguard Worker b.gt 8b 1587*c0909341SAndroid Build Coastguard Worker ret 1588*c0909341SAndroid Build Coastguard Worker 1589*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 1590*c0909341SAndroid Build Coastguard Worker160: // H - 16xN 1591*c0909341SAndroid Build Coastguard Worker AARCH64_VALID_JUMP_TARGET 1592*c0909341SAndroid Build Coastguard Worker ldr d26, [\xmx] 1593*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 1594*c0909341SAndroid Build 
Coastguard Worker cmp w9, #SHARP1 1595*c0909341SAndroid Build Coastguard Worker b.eq 168f // horizontal == SHARP1 1596*c0909341SAndroid Build Coastguard Worker 1597*c0909341SAndroid Build Coastguard Worker ldp q29, q30, [x13, #(OFFSET_USMMLA)] 1598*c0909341SAndroid Build Coastguard Worker ext v0.8b, v26.8b, v26.8b, #7 1599*c0909341SAndroid Build Coastguard Worker ins v26.d[1], v0.d[0] 1600*c0909341SAndroid Build Coastguard Worker 1601*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 1602*c0909341SAndroid Build Coastguard Worker16: 1603*c0909341SAndroid Build Coastguard Worker ldr q16, [\src] 1604*c0909341SAndroid Build Coastguard Worker ldur q17, [\src, #8] // avoid 2 register TBL for small cores 1605*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd 1606*c0909341SAndroid Build Coastguard Worker .ifc \type, prep 1607*c0909341SAndroid Build Coastguard Worker movi v6.4s, #0 1608*c0909341SAndroid Build Coastguard Worker movi v7.4s, #0 1609*c0909341SAndroid Build Coastguard Worker movi v22.4s, #0 1610*c0909341SAndroid Build Coastguard Worker movi v23.4s, #0 1611*c0909341SAndroid Build Coastguard Worker .else 1612*c0909341SAndroid Build Coastguard Worker mov v6.16b, v27.16b 1613*c0909341SAndroid Build Coastguard Worker mov v7.16b, v27.16b 1614*c0909341SAndroid Build Coastguard Worker mov v22.16b, v27.16b 1615*c0909341SAndroid Build Coastguard Worker mov v23.16b, v27.16b 1616*c0909341SAndroid Build Coastguard Worker .endif 1617*c0909341SAndroid Build Coastguard Worker tbl v0.16b, {v16.16b}, v29.16b 1618*c0909341SAndroid Build Coastguard Worker tbl v1.16b, {v16.16b}, v30.16b 1619*c0909341SAndroid Build Coastguard Worker tbl v2.16b, {v17.16b}, v29.16b 1620*c0909341SAndroid Build Coastguard Worker tbl v3.16b, {v17.16b}, v30.16b 1621*c0909341SAndroid Build Coastguard Worker 1622*c0909341SAndroid Build Coastguard Worker usmmla v6.4s, v0.16b, v26.16b 1623*c0909341SAndroid Build Coastguard Worker usmmla v7.4s, v1.16b, v26.16b 1624*c0909341SAndroid Build 
Coastguard Worker usmmla v22.4s, v2.16b, v26.16b 1625*c0909341SAndroid Build Coastguard Worker usmmla v23.4s, v3.16b, v26.16b 1626*c0909341SAndroid Build Coastguard Worker 1627*c0909341SAndroid Build Coastguard Worker uzp1 v6.8h, v6.8h, v7.8h 1628*c0909341SAndroid Build Coastguard Worker uzp1 v22.8h, v22.8h, v23.8h 1629*c0909341SAndroid Build Coastguard Worker .ifc \type, prep 1630*c0909341SAndroid Build Coastguard Worker srshr v6.8h, v6.8h, #2 1631*c0909341SAndroid Build Coastguard Worker srshr v22.8h, v22.8h, #2 1632*c0909341SAndroid Build Coastguard Worker subs \h, \h, #1 1633*c0909341SAndroid Build Coastguard Worker stp q6, q22, [\dst], #32 1634*c0909341SAndroid Build Coastguard Worker .else // put 1635*c0909341SAndroid Build Coastguard Worker sqshrun v6.8b, v6.8h, #6 1636*c0909341SAndroid Build Coastguard Worker sqshrun2 v6.16b, v22.8h, #6 1637*c0909341SAndroid Build Coastguard Worker subs \h, \h, #1 1638*c0909341SAndroid Build Coastguard Worker st1 {v6.16b}, [\dst], \d_strd 1639*c0909341SAndroid Build Coastguard Worker .endif 1640*c0909341SAndroid Build Coastguard Worker b.gt 16b 1641*c0909341SAndroid Build Coastguard Worker ret 1642*c0909341SAndroid Build Coastguard Worker 1643*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 1644*c0909341SAndroid Build Coastguard Worker168: 1645*c0909341SAndroid Build Coastguard Worker.endif // neon_i8mm 1646*c0909341SAndroid Build Coastguard Worker ldp q29, q30, [x13, #16] 1647*c0909341SAndroid Build Coastguard Worker 1648*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 1649*c0909341SAndroid Build Coastguard Worker16: 1650*c0909341SAndroid Build Coastguard Worker ldr q16, [\src] 1651*c0909341SAndroid Build Coastguard Worker ldur q17, [\src, #12] // avoid 2 register TBL for small cores 1652*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd 1653*c0909341SAndroid Build Coastguard Worker.ifc \type\()_\isa, prep_neon_i8mm 1654*c0909341SAndroid Build Coastguard Worker movi v6.4s, #0 
1655*c0909341SAndroid Build Coastguard Worker movi v7.4s, #0 1656*c0909341SAndroid Build Coastguard Worker movi v22.4s, #0 1657*c0909341SAndroid Build Coastguard Worker movi v23.4s, #0 1658*c0909341SAndroid Build Coastguard Worker.else 1659*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_dotprod 1660*c0909341SAndroid Build Coastguard Worker sub v16.16b, v16.16b, v24.16b 1661*c0909341SAndroid Build Coastguard Worker sub v17.16b, v17.16b, v24.16b 1662*c0909341SAndroid Build Coastguard Worker .endif 1663*c0909341SAndroid Build Coastguard Worker mov v6.16b, v27.16b 1664*c0909341SAndroid Build Coastguard Worker mov v7.16b, v27.16b 1665*c0909341SAndroid Build Coastguard Worker mov v22.16b, v27.16b 1666*c0909341SAndroid Build Coastguard Worker mov v23.16b, v27.16b 1667*c0909341SAndroid Build Coastguard Worker.endif 1668*c0909341SAndroid Build Coastguard Worker tbl v0.16b, {v16.16b}, v28.16b 1669*c0909341SAndroid Build Coastguard Worker tbl v1.16b, {v16.16b}, v29.16b 1670*c0909341SAndroid Build Coastguard Worker tbl v2.16b, {v16.16b}, v30.16b 1671*c0909341SAndroid Build Coastguard Worker tbl v3.16b, {v17.16b}, v28.16b 1672*c0909341SAndroid Build Coastguard Worker tbl v4.16b, {v17.16b}, v29.16b 1673*c0909341SAndroid Build Coastguard Worker 1674*c0909341SAndroid Build Coastguard Worker \dot v6.4s, v0.16b, v26.4b[0] 1675*c0909341SAndroid Build Coastguard Worker \dot v7.4s, v1.16b, v26.4b[0] 1676*c0909341SAndroid Build Coastguard Worker \dot v22.4s, v2.16b, v26.4b[0] 1677*c0909341SAndroid Build Coastguard Worker \dot v23.4s, v3.16b, v26.4b[0] 1678*c0909341SAndroid Build Coastguard Worker \dot v6.4s, v1.16b, v26.4b[1] 1679*c0909341SAndroid Build Coastguard Worker \dot v7.4s, v2.16b, v26.4b[1] 1680*c0909341SAndroid Build Coastguard Worker \dot v22.4s, v3.16b, v26.4b[1] 1681*c0909341SAndroid Build Coastguard Worker \dot v23.4s, v4.16b, v26.4b[1] 1682*c0909341SAndroid Build Coastguard Worker 1683*c0909341SAndroid Build Coastguard Worker uzp1 v6.8h, v6.8h, v7.8h 
1684*c0909341SAndroid Build Coastguard Worker uzp1 v22.8h, v22.8h, v23.8h 1685*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 1686*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 1687*c0909341SAndroid Build Coastguard Worker srshr v6.8h, v6.8h, #2 1688*c0909341SAndroid Build Coastguard Worker srshr v22.8h, v22.8h, #2 1689*c0909341SAndroid Build Coastguard Worker .else 1690*c0909341SAndroid Build Coastguard Worker sshr v6.8h, v6.8h, #2 1691*c0909341SAndroid Build Coastguard Worker sshr v22.8h, v22.8h, #2 1692*c0909341SAndroid Build Coastguard Worker .endif 1693*c0909341SAndroid Build Coastguard Worker subs \h, \h, #1 1694*c0909341SAndroid Build Coastguard Worker stp q6, q22, [\dst], #32 1695*c0909341SAndroid Build Coastguard Worker.else // put 1696*c0909341SAndroid Build Coastguard Worker sqshrun v6.8b, v6.8h, #6 1697*c0909341SAndroid Build Coastguard Worker sqshrun2 v6.16b, v22.8h, #6 1698*c0909341SAndroid Build Coastguard Worker subs \h, \h, #1 1699*c0909341SAndroid Build Coastguard Worker st1 {v6.16b}, [\dst], \d_strd 1700*c0909341SAndroid Build Coastguard Worker.endif 1701*c0909341SAndroid Build Coastguard Worker b.gt 16b 1702*c0909341SAndroid Build Coastguard Worker ret 1703*c0909341SAndroid Build Coastguard Worker 1704*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 1705*c0909341SAndroid Build Coastguard Worker320: // H - 32xN+ 1706*c0909341SAndroid Build Coastguard Worker640: 1707*c0909341SAndroid Build Coastguard Worker1280: 1708*c0909341SAndroid Build Coastguard Worker AARCH64_VALID_JUMP_TARGET 1709*c0909341SAndroid Build Coastguard Worker ldr d26, [\xmx] 1710*c0909341SAndroid Build Coastguard Worker.ifc \type, put 1711*c0909341SAndroid Build Coastguard Worker sub \d_strd, \d_strd, \w, uxtw 1712*c0909341SAndroid Build Coastguard Worker.endif 1713*c0909341SAndroid Build Coastguard Worker sub \s_strd, \s_strd, \w, uxtw 1714*c0909341SAndroid Build Coastguard Worker mov w8, \w 1715*c0909341SAndroid Build Coastguard Worker 
1716*c0909341SAndroid Build Coastguard Worker.ifc \isa, neon_i8mm 1717*c0909341SAndroid Build Coastguard Worker cmp w9, #SHARP1 1718*c0909341SAndroid Build Coastguard Worker b.eq 328f // horizontal == SHARP1 1719*c0909341SAndroid Build Coastguard Worker 1720*c0909341SAndroid Build Coastguard Worker ldp q29, q30, [x13, #(OFFSET_USMMLA)] 1721*c0909341SAndroid Build Coastguard Worker ext v0.8b, v26.8b, v26.8b, #7 1722*c0909341SAndroid Build Coastguard Worker ins v26.d[1], v0.d[0] 1723*c0909341SAndroid Build Coastguard Worker 1724*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 1725*c0909341SAndroid Build Coastguard Worker32: 1726*c0909341SAndroid Build Coastguard Worker ldr q16, [\src] 1727*c0909341SAndroid Build Coastguard Worker ldur q17, [\src, #8] // avoid 2 register TBL for small cores 1728*c0909341SAndroid Build Coastguard Worker add \src, \src, #16 1729*c0909341SAndroid Build Coastguard Worker .ifc \type, prep 1730*c0909341SAndroid Build Coastguard Worker movi v6.4s, #0 1731*c0909341SAndroid Build Coastguard Worker movi v7.4s, #0 1732*c0909341SAndroid Build Coastguard Worker movi v22.4s, #0 1733*c0909341SAndroid Build Coastguard Worker movi v23.4s, #0 1734*c0909341SAndroid Build Coastguard Worker .else 1735*c0909341SAndroid Build Coastguard Worker mov v6.16b, v27.16b 1736*c0909341SAndroid Build Coastguard Worker mov v7.16b, v27.16b 1737*c0909341SAndroid Build Coastguard Worker mov v22.16b, v27.16b 1738*c0909341SAndroid Build Coastguard Worker mov v23.16b, v27.16b 1739*c0909341SAndroid Build Coastguard Worker .endif 1740*c0909341SAndroid Build Coastguard Worker tbl v0.16b, {v16.16b}, v29.16b 1741*c0909341SAndroid Build Coastguard Worker tbl v1.16b, {v16.16b}, v30.16b 1742*c0909341SAndroid Build Coastguard Worker tbl v2.16b, {v17.16b}, v29.16b 1743*c0909341SAndroid Build Coastguard Worker tbl v3.16b, {v17.16b}, v30.16b 1744*c0909341SAndroid Build Coastguard Worker 1745*c0909341SAndroid Build Coastguard Worker usmmla v6.4s, v0.16b, v26.16b 
1746*c0909341SAndroid Build Coastguard Worker usmmla v7.4s, v1.16b, v26.16b 1747*c0909341SAndroid Build Coastguard Worker usmmla v22.4s, v2.16b, v26.16b 1748*c0909341SAndroid Build Coastguard Worker usmmla v23.4s, v3.16b, v26.16b 1749*c0909341SAndroid Build Coastguard Worker 1750*c0909341SAndroid Build Coastguard Worker uzp1 v6.8h, v6.8h, v7.8h 1751*c0909341SAndroid Build Coastguard Worker uzp1 v22.8h, v22.8h, v23.8h 1752*c0909341SAndroid Build Coastguard Worker .ifc \type, prep 1753*c0909341SAndroid Build Coastguard Worker srshr v6.8h, v6.8h, #2 1754*c0909341SAndroid Build Coastguard Worker srshr v22.8h, v22.8h, #2 1755*c0909341SAndroid Build Coastguard Worker subs w8, w8, #16 1756*c0909341SAndroid Build Coastguard Worker stp q6, q22, [\dst], #32 1757*c0909341SAndroid Build Coastguard Worker .else // put 1758*c0909341SAndroid Build Coastguard Worker sqshrun v6.8b, v6.8h, #6 1759*c0909341SAndroid Build Coastguard Worker sqshrun2 v6.16b, v22.8h, #6 1760*c0909341SAndroid Build Coastguard Worker subs w8, w8, #16 1761*c0909341SAndroid Build Coastguard Worker str q6, [\dst], #16 1762*c0909341SAndroid Build Coastguard Worker .endif 1763*c0909341SAndroid Build Coastguard Worker b.gt 32b 1764*c0909341SAndroid Build Coastguard Worker 1765*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd 1766*c0909341SAndroid Build Coastguard Worker .ifc \type, put 1767*c0909341SAndroid Build Coastguard Worker add \dst, \dst, \d_strd 1768*c0909341SAndroid Build Coastguard Worker .endif 1769*c0909341SAndroid Build Coastguard Worker mov w8, \w 1770*c0909341SAndroid Build Coastguard Worker subs \h, \h, #1 1771*c0909341SAndroid Build Coastguard Worker b.gt 32b 1772*c0909341SAndroid Build Coastguard Worker ret 1773*c0909341SAndroid Build Coastguard Worker 1774*c0909341SAndroid Build Coastguard Worker .align JUMP_ALIGN 1775*c0909341SAndroid Build Coastguard Worker328: 1776*c0909341SAndroid Build Coastguard Worker.endif // neon_i8mm 1777*c0909341SAndroid Build Coastguard Worker ldp 
q29, q30, [x13, #16] 1778*c0909341SAndroid Build Coastguard Worker 1779*c0909341SAndroid Build Coastguard Worker .align LOOP_ALIGN 1780*c0909341SAndroid Build Coastguard Worker32: 1781*c0909341SAndroid Build Coastguard Worker ldr q16, [\src] 1782*c0909341SAndroid Build Coastguard Worker ldur q17, [\src, #12] // avoid 2 register TBL for small cores 1783*c0909341SAndroid Build Coastguard Worker add \src, \src, #16 1784*c0909341SAndroid Build Coastguard Worker.ifc \type\()_\isa, prep_neon_i8mm 1785*c0909341SAndroid Build Coastguard Worker movi v6.4s, #0 1786*c0909341SAndroid Build Coastguard Worker movi v7.4s, #0 1787*c0909341SAndroid Build Coastguard Worker movi v22.4s, #0 1788*c0909341SAndroid Build Coastguard Worker movi v23.4s, #0 1789*c0909341SAndroid Build Coastguard Worker.else 1790*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_dotprod 1791*c0909341SAndroid Build Coastguard Worker sub v16.16b, v16.16b, v24.16b 1792*c0909341SAndroid Build Coastguard Worker sub v17.16b, v17.16b, v24.16b 1793*c0909341SAndroid Build Coastguard Worker .endif 1794*c0909341SAndroid Build Coastguard Worker mov v6.16b, v27.16b 1795*c0909341SAndroid Build Coastguard Worker mov v7.16b, v27.16b 1796*c0909341SAndroid Build Coastguard Worker mov v22.16b, v27.16b 1797*c0909341SAndroid Build Coastguard Worker mov v23.16b, v27.16b 1798*c0909341SAndroid Build Coastguard Worker.endif 1799*c0909341SAndroid Build Coastguard Worker tbl v0.16b, {v16.16b}, v28.16b 1800*c0909341SAndroid Build Coastguard Worker tbl v1.16b, {v16.16b}, v29.16b 1801*c0909341SAndroid Build Coastguard Worker tbl v2.16b, {v16.16b}, v30.16b 1802*c0909341SAndroid Build Coastguard Worker tbl v3.16b, {v17.16b}, v28.16b 1803*c0909341SAndroid Build Coastguard Worker tbl v4.16b, {v17.16b}, v29.16b 1804*c0909341SAndroid Build Coastguard Worker 1805*c0909341SAndroid Build Coastguard Worker \dot v6.4s, v0.16b, v26.4b[0] 1806*c0909341SAndroid Build Coastguard Worker \dot v7.4s, v1.16b, v26.4b[0] 1807*c0909341SAndroid Build 
Coastguard Worker \dot v22.4s, v2.16b, v26.4b[0] 1808*c0909341SAndroid Build Coastguard Worker \dot v23.4s, v3.16b, v26.4b[0] 1809*c0909341SAndroid Build Coastguard Worker \dot v6.4s, v1.16b, v26.4b[1] 1810*c0909341SAndroid Build Coastguard Worker \dot v7.4s, v2.16b, v26.4b[1] 1811*c0909341SAndroid Build Coastguard Worker \dot v22.4s, v3.16b, v26.4b[1] 1812*c0909341SAndroid Build Coastguard Worker \dot v23.4s, v4.16b, v26.4b[1] 1813*c0909341SAndroid Build Coastguard Worker 1814*c0909341SAndroid Build Coastguard Worker uzp1 v6.8h, v6.8h, v7.8h 1815*c0909341SAndroid Build Coastguard Worker uzp1 v22.8h, v22.8h, v23.8h 1816*c0909341SAndroid Build Coastguard Worker.ifc \type, prep 1817*c0909341SAndroid Build Coastguard Worker .ifc \isa, neon_i8mm 1818*c0909341SAndroid Build Coastguard Worker srshr v6.8h, v6.8h, #2 1819*c0909341SAndroid Build Coastguard Worker srshr v22.8h, v22.8h, #2 1820*c0909341SAndroid Build Coastguard Worker .else 1821*c0909341SAndroid Build Coastguard Worker sshr v6.8h, v6.8h, #2 1822*c0909341SAndroid Build Coastguard Worker sshr v22.8h, v22.8h, #2 1823*c0909341SAndroid Build Coastguard Worker .endif 1824*c0909341SAndroid Build Coastguard Worker subs w8, w8, #16 1825*c0909341SAndroid Build Coastguard Worker stp q6, q22, [\dst], #32 1826*c0909341SAndroid Build Coastguard Worker.else // put 1827*c0909341SAndroid Build Coastguard Worker sqshrun v6.8b, v6.8h, #6 1828*c0909341SAndroid Build Coastguard Worker sqshrun2 v6.16b, v22.8h, #6 1829*c0909341SAndroid Build Coastguard Worker subs w8, w8, #16 1830*c0909341SAndroid Build Coastguard Worker str q6, [\dst], #16 1831*c0909341SAndroid Build Coastguard Worker.endif 1832*c0909341SAndroid Build Coastguard Worker b.gt 32b 1833*c0909341SAndroid Build Coastguard Worker 1834*c0909341SAndroid Build Coastguard Worker add \src, \src, \s_strd 1835*c0909341SAndroid Build Coastguard Worker.ifc \type, put 1836*c0909341SAndroid Build Coastguard Worker add \dst, \dst, \d_strd 1837*c0909341SAndroid Build Coastguard 
Worker.endif 1838*c0909341SAndroid Build Coastguard Worker mov w8, \w 1839*c0909341SAndroid Build Coastguard Worker subs \h, \h, #1 1840*c0909341SAndroid Build Coastguard Worker b.gt 32b 1841*c0909341SAndroid Build Coastguard Worker ret 1842*c0909341SAndroid Build Coastguard Workerendfunc 1843*c0909341SAndroid Build Coastguard Worker 1844*c0909341SAndroid Build Coastguard Workerjumptable \type\()_8tap_h_\isa\()_tbl 1845*c0909341SAndroid Build Coastguard Worker .word 1280b - \type\()_8tap_h_\isa\()_tbl 1846*c0909341SAndroid Build Coastguard Worker .word 640b - \type\()_8tap_h_\isa\()_tbl 1847*c0909341SAndroid Build Coastguard Worker .word 320b - \type\()_8tap_h_\isa\()_tbl 1848*c0909341SAndroid Build Coastguard Worker .word 160b - \type\()_8tap_h_\isa\()_tbl 1849*c0909341SAndroid Build Coastguard Worker .word 80b - \type\()_8tap_h_\isa\()_tbl 1850*c0909341SAndroid Build Coastguard Worker .word 40b - \type\()_8tap_h_\isa\()_tbl 1851*c0909341SAndroid Build Coastguard Worker.ifc \type, put 1852*c0909341SAndroid Build Coastguard Worker .word 20b - \type\()_8tap_h_\isa\()_tbl 1853*c0909341SAndroid Build Coastguard Worker.endif 1854*c0909341SAndroid Build Coastguard Workerendjumptable 1855*c0909341SAndroid Build Coastguard Worker.endm 1856*c0909341SAndroid Build Coastguard Worker 1857*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) 1858*c0909341SAndroid Build Coastguard Worker// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) 1859*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 1860*c0909341SAndroid Build Coastguard Worker 1861*c0909341SAndroid Build Coastguard Worker// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) 1862*c0909341SAndroid Build Coastguard Worker// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) 1863*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn put, sdot, neon_dotprod, x0, x1, 
x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 1864*c0909341SAndroid Build Coastguard Worker 1865*c0909341SAndroid Build Coastguard Worker#if HAVE_I8MM 1866*c0909341SAndroid Build Coastguard WorkerENABLE_I8MM 1867*c0909341SAndroid Build Coastguard Worker 1868*c0909341SAndroid Build Coastguard Worker// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) 1869*c0909341SAndroid Build Coastguard Worker// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) 1870*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 1871*c0909341SAndroid Build Coastguard Worker 1872*c0909341SAndroid Build Coastguard Worker// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) 1873*c0909341SAndroid Build Coastguard Worker// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) 1874*c0909341SAndroid Build Coastguard Workerfilter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 1875*c0909341SAndroid Build Coastguard Worker 1876*c0909341SAndroid Build Coastguard WorkerDISABLE_I8MM 1877*c0909341SAndroid Build Coastguard Worker#endif // HAVE_I8MM 1878*c0909341SAndroid Build Coastguard Worker 1879*c0909341SAndroid Build Coastguard WorkerDISABLE_DOTPROD 1880*c0909341SAndroid Build Coastguard Worker#endif // HAVE_DOTPROD 1881