/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Janne Grunau
 * Copyright © 2024, Martin Storsjo
 * Copyright © 2024, Arm Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"


#if HAVE_DOTPROD
ENABLE_DOTPROD

// No spaces in these expressions, due to gas-preprocessor. Each field is
// offset by -1 so that the 1-based subpel filter index can be added directly
// when computing the address into `mc_subpel_filters`.
#define REGULAR1 (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1  (((1*15-1)<<7)|(4*15-1))
#define SHARP1   (((2*15-1)<<7)|(3*15-1))

#define FUNC_ALIGN 2
#define JUMP_ALIGN 2
#define LOOP_ALIGN 2


const h_tbl_neon_dotprod, align=4
        // Shuffle indices to permute horizontal samples in preparation for
        // input to SDOT instructions. The 8-tap horizontal convolution uses
        // sample indices in the interval of [-3, 4] relative to the current
        // sample position.
        .byte  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
        .byte  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
        .byte  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

        // Shuffle indices to permute horizontal samples in preparation for
        // input to USMMLA instructions.
#define OFFSET_USMMLA 48
        .byte  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
        .byte  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13

        // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
#define OFFSET_CVT_32_8 80
        .byte  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
endconst

const v_tbl_neon_dotprod, align=4
        // Vertical convolutions also use SDOT instructions, where a 128-bit
        // register contains a transposed 4x4 matrix of values. Subsequent
        // iterations of the vertical convolution can reuse the 3x4 sub-matrix
        // from the previous loop iteration. These shuffle indices shift and
        // merge this 4x4 matrix with the values of a new line.
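        // For example, when TBL operates on a register pair {matrix, new},
        // the first index row below keeps bytes 1..3 of each 4-byte column of
        // the old matrix (dropping the oldest sample) and appends bytes 0, 4,
        // 8 and 12 of the new line (indices 16, 20, 24 and 28). The remaining
        // rows append four consecutive bytes of the new line (16..19, 20..23,
        // ...), one per column, which the wider paths use to merge a 16-pixel
        // line into four such matrices.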
        .byte  1,  2,  3, 16,  5,  6,  7, 20,  9, 10, 11, 24, 13, 14, 15, 28
        .byte  1,  2,  3, 16,  5,  6,  7, 17,  9, 10, 11, 18, 13, 14, 15, 19
        .byte  1,  2,  3, 20,  5,  6,  7, 21,  9, 10, 11, 22, 13, 14, 15, 23
        .byte  1,  2,  3, 24,  5,  6,  7, 25,  9, 10, 11, 26, 13, 14, 15, 27
        .byte  1,  2,  3, 28,  5,  6,  7, 29,  9, 10, 11, 30, 13, 14, 15, 31
endconst


.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
        mov     x9, \type_h
        mov     x10, \type_v
.if \jump
        b       \op\()_8tap_\isa
.endif
endfunc
.endm

.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0

function \type\()_8tap_\isa, align=FUNC_ALIGN
        clz     w8, \w
        mov     w11, #0x4081            // (1 << 14) | (1 << 7) | (1 << 0)
        sub     w8, w8, #24             // for jump tables
        movrel  x12, X(mc_subpel_filters)
        cbnz    \mx, L(\type\()_8tap_h_hv_\isa)
        cbnz    \my, L(\type\()_8tap_v_\isa)
.ifc \type, prep
        add     \wd_strd, \w, \w        // prep_neon needs w * 2 as stride
.endif
        b       X(\type\()_neon)

        .align JUMP_ALIGN
L(\type\()_8tap_v_\isa):
        madd    \my, \my, w11, w10
        movrel  x13, v_tbl_neon_dotprod
        sub     \src, \src, \s_strd
.ifc \isa, neon_dotprod
    .ifc \type, prep
        mov     w8, #0x2002             // FILTER_WEIGHT * 128 + rounding
        dup     v4.4s, w8
    .else
        movi    v4.4s, #32, lsl #8      // FILTER_WEIGHT * 128, bias for SDOT
    .endif
.endif
        ubfx    w11, \my, #7, #7
        and     \my, \my, #0x7F
        ldp     q6, q28, [x13]
        cmp     \h, #4
        csel    \my, \my, w11, le
        sub     \src, \src, \s_strd, lsl #1     // src - s_strd * 3
        add     \xmy, x12, \xmy, lsl #3         // subpel V filter address
        ldr     q29, [x13, #32]
.ifc \isa, neon_dotprod
        movi    v5.16b, #128
.endif
        ldr     d7, [\xmy]
        cmp     \w, #8
        b.eq    80f
        b.lt    40f

        // .align JUMP_ALIGN // fallthrough
160:    // V - 16xN+
        ldp     q30, q31, [x13, #48]
.ifc \type, prep
        add     \wd_strd, \w, \w
.endif
        .align LOOP_ALIGN
161:
        mov     \lsrc, \src
        mov     \ldst, \dst
        sub     w8, \h, #1

        ldr     q16, [\lsrc]
        ldr     q17, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1
        ldr     q18, [\lsrc]
        ldr     q19, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1

        zip1    v0.16b, v16.16b, v17.16b
        zip2    v1.16b, v16.16b, v17.16b
        zip1    v2.16b, v18.16b, v19.16b
        zip2    v3.16b, v18.16b, v19.16b

        ldr     q20, [\lsrc]
        ldr     q21, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1
        ldr     q22, [\lsrc]
        ldr     q23, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1

        zip1    v18.16b, v20.16b, v21.16b
        zip2    v21.16b, v20.16b, v21.16b
        zip1    v24.16b, v22.16b, v23.16b
        zip2    v27.16b, v22.16b, v23.16b

        zip1    v16.8h, v0.8h, v2.8h
        zip2    v19.8h, v0.8h, v2.8h
        zip1    v22.8h, v1.8h, v3.8h
        zip2    v25.8h, v1.8h, v3.8h

        zip1    v17.8h, v18.8h, v24.8h
        zip2    v20.8h, v18.8h, v24.8h
        zip1    v23.8h, v21.8h, v27.8h
        zip2    v26.8h, v21.8h, v27.8h
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v5.16b
        sub     v19.16b, v19.16b, v5.16b
        sub     v22.16b, v22.16b, v5.16b
        sub     v25.16b, v25.16b, v5.16b

        sub     v17.16b, v17.16b, v5.16b
        sub     v20.16b, v20.16b, v5.16b
        sub     v23.16b, v23.16b, v5.16b
        sub     v26.16b, v26.16b, v5.16b
.endif
        .align LOOP_ALIGN
16:
.ifc \isa, neon_i8mm
        ld1     {v18.16b}, [\lsrc], \s_strd
        movi    v0.4s, #0
        movi    v1.4s, #0
        movi    v2.4s, #0
        movi    v3.4s, #0
        mov     v21.16b, v18.16b
        mov     v24.16b, v18.16b
        mov     v27.16b, v18.16b
.else   // neon_dotprod
        ld1     {v27.16b}, [\lsrc], \s_strd
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
        sub     v18.16b, v27.16b, v5.16b
        sub     v21.16b, v27.16b, v5.16b
        sub     v24.16b, v27.16b, v5.16b
        sub     v27.16b, v27.16b, v5.16b
.endif
        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v3.4s, v25.16b, v7.4b[0]

        tbl     v16.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v19.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v22.16b, {v22.16b, v23.16b}, v6.16b
        tbl     v25.16b, {v25.16b, v26.16b}, v6.16b

        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v20.16b, v7.4b[1]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v26.16b, v7.4b[1]

        tbl     v17.16b, {v17.16b, v18.16b}, v28.16b
        tbl     v20.16b, {v20.16b, v21.16b}, v29.16b
        tbl     v23.16b, {v23.16b, v24.16b}, v30.16b
        tbl     v26.16b, {v26.16b, v27.16b}, v31.16b

        subs    w8, w8, #1
        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
    .else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
    .endif
        st1     {v0.8h, v1.8h}, [\ldst], \d_strd
.else   // put
        sqrshrun  v0.8b, v0.8h, #6
        sqrshrun2 v0.16b, v2.8h, #6
        st1     {v0.16b}, [\ldst], \d_strd
.endif
        b.gt    16b

.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
        movi    v2.4s, #0
        movi    v3.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
.endif
        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v3.4s, v25.16b, v7.4b[0]

        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v20.16b, v7.4b[1]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v26.16b, v7.4b[1]

        subs    \w, \w, #16
        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
    .else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
    .endif
        stp     q0, q1, [\ldst]
        add     \dst, \dst, #32
.else   // put
        sqrshrun  v0.8b, v0.8h, #6
        sqrshrun2 v0.16b, v2.8h, #6
        str     q0, [\ldst]
        add     \dst, \dst, #16
.endif
        add     \src, \src, #16
        b.gt    161b
        ret

        .align JUMP_ALIGN
80:     // V - 8xN
        ldr     d16, [\src]
        ldr     d17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d18, [\src]
        ldr     d19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

        ldr     d20, [\src]
        ldr     d21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d22, [\src]
        ldr     d23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        subs    \h, \h, #2      // for prep: sub is enough

        zip1    v0.16b, v16.16b, v17.16b
        zip1    v2.16b, v18.16b, v19.16b
        zip1    v18.16b, v20.16b, v21.16b
        zip1    v24.16b, v22.16b, v23.16b

        zip1    v16.8h, v0.8h, v2.8h
        zip2    v19.8h, v0.8h, v2.8h
        zip1    v17.8h, v18.8h, v24.8h
        zip2    v20.8h, v18.8h, v24.8h
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v5.16b
        sub     v19.16b, v19.16b, v5.16b
        sub     v17.16b, v17.16b, v5.16b
        sub     v20.16b, v20.16b, v5.16b
.endif
.ifc \type, put
        b.eq    82f
.endif
        .align LOOP_ALIGN
8:
.ifc \isa, neon_i8mm
        ldr     d18, [\src]
        movi    v0.4s, #0
        movi    v1.4s, #0
        ldr     d24, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        movi    v2.4s, #0
        movi    v3.4s, #0
        mov     v21.8b, v18.8b
        mov     v27.8b, v24.8b
.else   // neon_dotprod
        ldr     d21, [\src]
        ldr     d27, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
        sub     v18.16b, v21.16b, v5.16b
        sub     v21.16b, v21.16b, v5.16b
        sub     v24.16b, v27.16b, v5.16b
        sub     v27.16b, v27.16b, v5.16b
.endif
        tbl     v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl     v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]

        tbl     v16.16b, {v22.16b, v23.16b}, v6.16b
        tbl     v19.16b, {v25.16b, v26.16b}, v6.16b
        tbl     v17.16b, {v23.16b, v24.16b}, v28.16b
        tbl     v20.16b, {v26.16b, v27.16b}, v29.16b

        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v25.16b, v7.4b[0]
        \dot    v3.4s, v26.16b, v7.4b[1]

        subs    \h, \h, #2
        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
    .else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
    .endif
        stp     q0, q1, [\dst], #32
.else   // put
        sqrshrun  v0.8b, v0.8h, #6
        sqrshrun  v1.8b, v2.8h, #6
        str     d0, [\dst]
        str     d1, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    8b

.ifc \type, put
        .align JUMP_ALIGN
82:
.endif
.ifc \isa, neon_i8mm
        ldr     d18, [\src]
        movi    v0.4s, #0
        movi    v1.4s, #0
        movi    v2.4s, #0
        movi    v3.4s, #0
        mov     v21.8b, v18.8b
.else   // neon_dotprod
        ldr     d21, [\src]
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
        sub     v18.16b, v21.16b, v5.16b
        sub     v21.16b, v21.16b, v5.16b
.endif
        tbl     v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl     v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]

        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v25.16b, v7.4b[0]
        \dot    v3.4s, v26.16b, v7.4b[1]

        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
    .else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
    .endif
        stp     q0, q1, [\dst]
.else   // put
        sqrshrun  v0.8b, v0.8h, #6
        sqrshrun  v1.8b, v2.8h, #6
        str     d0, [\dst]
        str     d1, [\dst, \d_strd]
.endif
        ret

        .align JUMP_ALIGN
40:     // V - 4xN or 2xN (put only)
.ifc \type, put
        cmp     \w, #2
        b.eq    20f
.endif
        ldr     s16, [\src]
        ldr     s17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     s18, [\src]
        ldr     s19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

        ldr     s20, [\src]
        ldr     s21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     s22, [\src]
        ldr     s23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        subs    \h, \h, #2      // for prep: sub is enough

        zip1    v0.8b, v16.8b, v17.8b
        zip1    v2.8b, v18.8b, v19.8b
        zip1    v18.8b, v20.8b, v21.8b
        zip1    v24.8b, v22.8b, v23.8b

        zip1    v16.8h, v0.8h, v2.8h
        zip1    v17.8h, v18.8h, v24.8h
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v5.16b
        sub     v17.16b, v17.16b, v5.16b
.endif
.ifc \type, put
        b.eq    42f
.endif
        .align LOOP_ALIGN
4:
        ldr     s18, [\src]
        ldr     s21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.16b, v18.16b, v5.16b
        sub     v21.16b, v21.16b, v5.16b
.endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]

        tbl     v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
        subs    \h, \h, #2
    .ifc \isa, neon_i8mm
        rshrn   v0.4h, v0.4s, #2
        rshrn2  v0.8h, v1.4s, #2
    .else
        shrn    v0.4h, v0.4s, #2
        shrn2   v0.8h, v1.4s, #2
    .endif
        str     q0, [\dst], #16
.else
        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun  v0.8b, v0.8h, #6
        subs    \h, \h, #2
        fmov    x8, d0
        lsr     x9, x8, #32
        str     w8, [\dst]
        str     w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    4b

.ifc \type, put
        .align JUMP_ALIGN
42:
.endif
        ldr     s18, [\src]
.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.16b, v18.16b, v5.16b
.endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]

        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
    .ifc \isa, neon_i8mm
        rshrn   v0.4h, v0.4s, #2
        rshrn2  v0.8h, v1.4s, #2
    .else
        shrn    v0.4h, v0.4s, #2
        shrn2   v0.8h, v1.4s, #2
    .endif
        str     q0, [\dst]
.else
        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun  v0.8b, v0.8h, #6
        fmov    x8, d0
        lsr     x9, x8, #32
        str     w8, [\dst]
        str     w9, [\dst, \d_strd]
.endif
        ret

.ifc \type, put
        .align JUMP_ALIGN
20:     // V - 2xN
        ldr     h16, [\src]
        ldr     h17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     h18, [\src]
        ldr     h19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

        ldr     h20, [\src]
        ldr     h21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     h22, [\src]
        ldr     h23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        subs    \h, \h, #2

        zip1    v0.8b, v16.8b, v17.8b
        zip1    v2.8b, v18.8b, v19.8b
        zip1    v18.8b, v20.8b, v21.8b
        zip1    v24.8b, v22.8b, v23.8b

        zip1    v16.4h, v0.4h, v2.4h
        zip1    v17.4h, v18.4h, v24.4h
    .ifc \isa, neon_dotprod
        sub     v16.8b, v16.8b, v5.8b
        sub     v17.8b, v17.8b, v5.8b
    .endif
        b.eq    22f

        .align LOOP_ALIGN
2:
        ldr     h18, [\src]
        ldr     h21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
    .else   // put
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.8b, v18.8b, v5.8b
        sub     v21.8b, v21.8b, v5.8b
    .endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]

        tbl     v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]

        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun  v0.8b, v0.8h, #6

        subs    \h, \h, #2
        fmov    x8, d0
        lsr     x9, x8, #32
        strh    w8, [\dst]
        strh    w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
        b.gt    2b

        .align JUMP_ALIGN
22:
        ldr     h18, [\src]
    .ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
    .else   // put
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.8b, v18.8b, v5.8b
    .endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]

        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]

        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun  v0.8b, v0.8h, #6

        fmov    x8, d0
        lsr     x9, x8, #32
        strh    w8, [\dst]
        strh    w9, [\dst, \d_strd]
        ret
.endif

        .align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
        madd    \mx, \mx, w11, w9
        madd    w14, \my, w11, w10      // for HV
.ifc \isa, neon_dotprod
        mov     w13, #0x2002            // FILTER_WEIGHT * 128 + rounding
        dup     v27.4s, w13             // put H overrides this
.endif
        movrel  x13, h_tbl_neon_dotprod
        sub     \src, \src, #3          // src - 3
        ldr     q28, [x13]              // for 4-tap & 8-tap H filters
        ubfx    w15, \mx, #7, #7
        and     \mx, \mx, #0x7F
        ubfx    w11, w14, #7, #7        // for HV
        and     w14, w14, #0x7F         // for HV
        cmp     \w, #4
        csel    \mx, \mx, w15, le
        add     \xmx, x12, \xmx, lsl #3 // subpel H filter address
.ifc \isa, neon_dotprod
        movi    v24.16b, #128
.endif
        cbz     \my, L(\type\()_8tap_h_\isa)

        // HV cases
        cmp     \h, #4
        csel    w14, w14, w11, le
        sub     \src, \src, \s_strd, lsl #1     // src - s_strd * 2 - 3
        add     \xmy, x12, x14, lsl #3          // subpel V filter address
        mov     x15, x30
        ldr     d7, [\xmy]
.ifc \type, put
        ldr     q25, [x13, #(OFFSET_CVT_32_8)]  // LUT to help conversion
.endif                                          // of 32b values to 8b
        sxtl    v7.8h, v7.8b
        cmp     w10, #SHARP1
        b.ne    L(\type\()_6tap_hv_\isa)        // vertical != SHARP1

        // HV 8-tap cases
        sub     \src, \src, \s_strd             // src - s_strd * 3 - 3
        cmp     \w, #4
        b.eq    40f
.ifc \type, put
        b.lt    20f
.endif

        // .align JUMP_ALIGN // fallthrough
80:     // HV8 - 8xN+
        ldp     q29, q30, [x13, #16]
        ldr     d26, [\xmx]
.ifc \type, prep
        add     \wd_strd, \w, \w
.endif
        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8, \h
.ifc \isa, neon_i8mm
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v20.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v21.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v22.8h, v22.8h, #2
.else
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v20.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v21.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v22.8h, v22.8h, #2
.endif
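
        // v16-v22 now hold the last seven horizontally filtered rows (.8h).
        // Each pass of the loop below filters one new source row horizontally
        // (\dot against the subpel filter in v26), shifts it into this
        // register pipeline as the 8th tap and accumulates the vertical 8-tap
        // convolution at 32-bit precision with smull/smlal.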
        .align LOOP_ALIGN
8:
        ldr     q23, [\lsrc]
        add     \lsrc, \lsrc, \s_strd

        smull   v0.4s, v16.4h, v7.h[0]
        smull2  v1.4s, v16.8h, v7.h[0]
        mov     v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
        movi    v6.4s, #0
        tbl     v2.16b, {v23.16b}, v28.16b
        tbl     v3.16b, {v23.16b}, v29.16b
.else   // neon_dotprod
        sub     v23.16b, v23.16b, v24.16b
        mov     v5.16b, v27.16b
        mov     v6.16b, v27.16b
.endif
        smlal   v0.4s, v17.4h, v7.h[1]
        smlal2  v1.4s, v17.8h, v7.h[1]
.ifc \isa, neon_i8mm
        tbl     v4.16b, {v23.16b}, v30.16b
        mov     v17.16b, v18.16b
.else   // neon_dotprod
        mov     v17.16b, v18.16b
        tbl     v2.16b, {v23.16b}, v28.16b
        tbl     v3.16b, {v23.16b}, v29.16b
        tbl     v4.16b, {v23.16b}, v30.16b
.endif
        smlal   v0.4s, v18.4h, v7.h[2]
        smlal2  v1.4s, v18.8h, v7.h[2]
        mov     v18.16b, v19.16b

        \dot    v5.4s, v2.16b, v26.4b[0]
        \dot    v6.4s, v3.16b, v26.4b[0]

        smlal   v0.4s, v19.4h, v7.h[3]
        smlal2  v1.4s, v19.8h, v7.h[3]
        mov     v19.16b, v20.16b

        \dot    v5.4s, v3.16b, v26.4b[1]
        \dot    v6.4s, v4.16b, v26.4b[1]

        smlal   v0.4s, v20.4h, v7.h[4]
        smlal2  v1.4s, v20.8h, v7.h[4]
        mov     v20.16b, v21.16b

        smlal   v0.4s, v21.4h, v7.h[5]
        smlal2  v1.4s, v21.8h, v7.h[5]
.ifc \type, prep
        uzp1    v23.8h, v5.8h, v6.8h
.endif
        mov     v21.16b, v22.16b
        smlal   v0.4s, v22.4h, v7.h[6]
        smlal2  v1.4s, v22.8h, v7.h[6]
.ifc \isa, neon_i8mm
        subs    w8, w8, #1
.endif
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v22.8h, v23.8h, #2
    .else
        sshr    v22.8h, v23.8h, #2
    .endif
        smlal   v0.4s, v22.4h, v7.h[7]
        smlal2  v1.4s, v22.8h, v7.h[7]
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
.else   // put
    .ifc \isa, neon_i8mm
        rshrn   v22.4h, v5.4s, #2
        rshrn2  v22.8h, v6.4s, #2
    .else
        shrn    v22.4h, v5.4s, #2
        shrn2   v22.8h, v6.4s, #2
    .endif
        smlal   v0.4s, v22.4h, v7.h[7]
        smlal2  v1.4s, v22.8h, v7.h[7]
        tbl     v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun  v0.8b, v0.8h, #2
.endif
.ifc \isa, neon_dotprod
        subs    w8, w8, #1
.endif
.ifc \type, prep
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #16
.else
        st1     {v0.8b}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #8
.endif
        add     \src, \src, #8
        subs    \w, \w, #8
        b.gt    81b
        ret     x15

        .align JUMP_ALIGN
40:     // HV8 - 4xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v21.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v22.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[0]
        smlal   v0.4s, v17.4h, v7.h[1]
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
.endif
        smlal   v0.4s, v18.4h, v7.h[2]
        smlal   v0.4s, v19.4h, v7.h[3]
        tbl     v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
.else
        mov     v5.16b, v27.16b
.endif
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b

        smlal   v0.4s, v20.4h, v7.h[4]
        smlal   v0.4s, v21.4h, v7.h[5]

        \dot    v5.4s, v2.16b, v26.4b[0]
        mov     v20.16b, v21.16b
        mov     v21.16b, v22.16b
        smlal   v0.4s, v22.4h, v7.h[6]
.ifc \isa, neon_i8mm
        rshrn   v22.4h, v5.4s, #2
.else
        shrn    v22.4h, v5.4s, #2
.endif
        smlal   v0.4s, v22.4h, v7.h[7]
.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        str     d0, [\dst], #8
        subs    \h, \h, #1
.else
        subs    \h, \h, #1
        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun  v0.8b, v0.8h, #2
        str     s0, [\dst]
        add     \dst, \dst, \d_strd
.endif
        b.gt    4b
        ret     x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV8 - 2xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v21.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v22.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[0]
        smlal   v0.4s, v17.4h, v7.h[1]
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b
    .ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
    .endif
        smlal   v0.4s, v18.4h, v7.h[2]
        smlal   v0.4s, v19.4h, v7.h[3]
        tbl     v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi    v5.4s, #0
    .else
        mov     v5.16b, v27.16b
    .endif
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b

        smlal   v0.4s, v20.4h, v7.h[4]
        smlal   v0.4s, v21.4h, v7.h[5]

        \dot    v5.4s, v2.16b, v26.4b[0]
        mov     v20.16b, v21.16b
        mov     v21.16b, v22.16b

        smlal   v0.4s, v22.4h, v7.h[6]
    .ifc \isa, neon_i8mm
        rshrn   v22.4h, v5.4s, #2
    .else
        shrn    v22.4h, v5.4s, #2
    .endif
        smlal   v0.4s, v22.4h, v7.h[7]
        subs    \h, \h, #1

        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun  v0.8b, v0.8h, #2

        str     h0, [\dst]
        add     \dst, \dst, \d_strd
        b.gt    2b
        ret     x15
.endif

        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp     \w, #4
        b.eq    40f
.ifc \type, put
        b.lt    20f
.endif

        // .align JUMP_ALIGN // fallthrough
80:     // HV6 - 8xN+
        ldr     d26, [\xmx]
.ifc \type, prep
        add     \wd_strd, \w, \w
.endif
.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    88f             // horizontal == SHARP1

        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8, \h

        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v20.8h, v22.8h, #2

        .align LOOP_ALIGN
8:
        ld1     {v23.16b}, [\lsrc], \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smull2  v1.4s, v16.8h, v7.h[1]
        mov     v16.16b, v17.16b
        movi    v5.4s, #0
        movi    v6.4s, #0
        tbl     v2.16b, {v23.16b}, v29.16b
        tbl     v3.16b, {v23.16b}, v30.16b

        smlal   v0.4s, v17.4h, v7.h[2]
        smlal2  v1.4s, v17.8h, v7.h[2]
        mov     v17.16b, v18.16b

        usmmla  v5.4s, v2.16b, v26.16b
        usmmla  v6.4s, v3.16b, v26.16b

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal2  v1.4s, v18.8h, v7.h[3]
        mov     v18.16b, v19.16b
        subs    w8, w8, #1

        smlal   v0.4s, v19.4h, v7.h[4]
        smlal2  v1.4s, v19.8h, v7.h[4]
        uzp1    v23.8h, v5.8h, v6.8h
        mov     v19.16b, v20.16b

        smlal   v0.4s, v20.4h, v7.h[5]
        smlal2  v1.4s, v20.8h, v7.h[5]
        srshr   v20.8h, v23.8h, #2
        smlal   v0.4s, v20.4h, v7.h[6]
        smlal2  v1.4s, v20.8h, v7.h[6]
    .ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #16
    .else
        tbl     v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun  v0.8b, v0.8h, #2
        st1     {v0.8b}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #8
    .endif
        add     \src, \src, #8
        subs    \w, \w, #8
        b.gt    81b
        ret     x15

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8, \h
.ifc \isa, neon_i8mm
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v20.8h, v22.8h, #2
.else
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v20.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:
        ldr     q23, [\lsrc]
        add     \lsrc, \lsrc, \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smull2  v1.4s, v16.8h, v7.h[1]
.ifc \isa, neon_dotprod
        sub     v23.16b, v23.16b, v24.16b
.endif
        mov     v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
        movi    v6.4s, #0
.else
        mov     v5.16b, v27.16b
        mov     v6.16b, v27.16b
.endif
        tbl     v2.16b, {v23.16b}, v28.16b
        tbl     v3.16b, {v23.16b}, v29.16b

        smlal   v0.4s, v17.4h, v7.h[2]
        smlal2  v1.4s, v17.8h, v7.h[2]
        tbl     v4.16b, {v23.16b}, v30.16b
        mov     v17.16b, v18.16b

        \dot    v5.4s, v2.16b, v26.4b[0]
        \dot    v6.4s, v3.16b, v26.4b[0]

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal2  v1.4s, v18.8h, v7.h[3]
        mov     v18.16b, v19.16b

        \dot    v5.4s, v3.16b, v26.4b[1]
        \dot    v6.4s, v4.16b, v26.4b[1]

        smlal   v0.4s, v19.4h, v7.h[4]
        smlal2  v1.4s, v19.8h, v7.h[4]
        mov     v19.16b, v20.16b
        uzp1    v23.8h, v5.8h, v6.8h

        smlal   v0.4s, v20.4h, v7.h[5]
        smlal2  v1.4s, v20.8h, v7.h[5]
.ifc \isa, neon_i8mm
        srshr   v20.8h, v23.8h, #2
.else
        sshr    v20.8h, v23.8h, #2
.endif
        subs    w8, w8, #1
        smlal   v0.4s, v20.4h, v7.h[6]
        smlal2  v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #16
.else
        tbl     v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun  v0.8b, v0.8h, #2
        st1     {v0.8b}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #8
.endif
        add     \src, \src, #8
        subs    \w, \w, #8
        b.gt    81b
        ret     x15

        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        ld1     {v4.16b}, [\lsrc], \s_strd
.ifc \isa, neon_i8mm
        movi    v22.4s, #0
        movi    v23.4s, #0
.else   // neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v2.16b, {v4.16b}, v28.16b
        tbl     v3.16b, {v4.16b}, v29.16b
        tbl     v4.16b, {v4.16b}, v30.16b
        \dot    v22.4s, v2.16b, v26.4b[0]
        \dot    v23.4s, v3.16b, v26.4b[0]
        \dot    v22.4s, v3.16b, v26.4b[1]
        \dot    v23.4s, v4.16b, v26.4b[1]
        uzp1    v22.8h, v22.8h, v23.8h
        ret

.ifc \isa, neon_i8mm
        .align FUNC_ALIGN
L(\type\()_hv_filter6_neon_i8mm):
        ld1     {v4.16b}, [\lsrc], \s_strd
        movi    v22.4s, #0
        movi    v23.4s, #0
        tbl     v2.16b, {v4.16b}, v29.16b
        tbl     v3.16b, {v4.16b}, v30.16b
        usmmla  v22.4s, v2.16b, v26.16b
        usmmla  v23.4s, v3.16b, v26.16b
        uzp1    v22.8h, v22.8h, v23.8h
        ret
.endif

        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        ld1     {v4.8b}, [\src], \s_strd
.ifc \isa, neon_i8mm
        movi    v22.4s, #2
.else
        mov     v22.16b, v27.16b
        sub     v4.16b, v4.16b, v24.16b
.endif
        tbl     v2.16b, {v4.16b}, v28.16b
        \dot    v22.4s, v2.16b, v26.4b[0]
        ret

        .align JUMP_ALIGN
40:     // HV6 - 4xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smlal   v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
.endif
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal   v0.4s, v19.4h, v7.h[4]
        tbl     v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
.else
        mov     v5.16b, v27.16b
.endif
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b
        \dot    v5.4s, v2.16b, v26.4b[0]

        smlal   v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn   v20.4h, v5.4s, #2
.else
        shrn    v20.4h, v5.4s, #2
.endif
        subs    \h, \h, #1
        smlal   v0.4s, v20.4h, v7.h[6]
.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        str     d0, [\dst], #8
.else
        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun  v0.8b, v0.8h, #2
        str     s0, [\dst]
        add     \dst, \dst, \d_strd
.endif
        b.gt    4b
        ret     x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV6 - 2xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smlal   v0.4s, v17.4h, v7.h[2]
    .ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
    .endif
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal   v0.4s, v19.4h, v7.h[4]
        tbl     v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi    v5.4s, #0
    .else
        mov     v5.16b, v27.16b
    .endif

        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b
        \dot    v5.4s, v2.16b, v26.4b[0]

        smlal   v0.4s, v20.4h, v7.h[5]
    .ifc \isa, neon_i8mm
        rshrn   v20.4h, v5.4s, #2
    .else
        shrn    v20.4h, v5.4s, #2
    .endif

        subs    \h, \h, #1
        smlal   v0.4s, v20.4h, v7.h[6]

        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun  v0.8b, v0.8h, #2

        str     h0, [\dst]
        add     \dst, \dst, \d_strd
        b.gt    2b
        ret     x15
.endif

        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        movrel  x11, \type\()_8tap_h_\isa\()_tbl
        ldrsw   x8, [x11, x8, lsl #2]
.ifc \type, put
    .ifc \isa, neon_i8mm
        movi    v27.4s, #34             // special rounding
    .else
        mov     w10, #0x2022            // 64 * 128 + 34, bias and rounding for SDOT
        dup     v27.4s, w10
    .endif
.endif
        add     x11, x11, x8
        br      x11

.ifc \type, put
        .align JUMP_ALIGN
20:     // H - 2xN
        AARCH64_VALID_JUMP_TARGET
        add     \src, \src, #2
        ldur    s26, [\xmx, #2]

        .align LOOP_ALIGN
2:
        ldr     d0, [\src]
        ldr     d1, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_dotprod
        sub     v0.8b, v0.8b, v24.8b
        sub     v1.8b, v1.8b, v24.8b
    .endif
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b

        tbl     v2.16b, {v0.16b}, v28.16b
        tbl     v3.16b, {v1.16b}, v28.16b

        \dot    v4.4s, v2.16b, v26.4b[0]
        \dot    v5.4s, v3.16b, v26.4b[0]

        uzp1    v4.8h, v4.8h, v5.8h
        sqshrun v4.8b, v4.8h, #6

        subs    \h, \h, #2
        fmov    x8, d4
        lsr     x9, x8, #32
        strh    w8, [\dst]
        strh    w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
        b.gt    2b
        ret
.endif

        .align JUMP_ALIGN
40:     // H - 4xN
        AARCH64_VALID_JUMP_TARGET
        add     \src, \src, #2
        ldur    s26, [\xmx, #2]

        .align LOOP_ALIGN
4:
        ldr     d0, [\src]
        ldr     d1, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v4.4s, #0
        movi    v5.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub     v0.8b, v0.8b, v24.8b
        sub     v1.8b, v1.8b, v24.8b
    .endif
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b
.endif
        tbl     v2.16b, {v0.16b}, v28.16b
        tbl     v3.16b, {v1.16b}, v28.16b

        \dot    v4.4s, v2.16b, v26.4b[0]
        \dot    v5.4s, v3.16b, v26.4b[0]
.ifc \type, prep
        subs    \h, \h, #2
    .ifc \isa, neon_i8mm
        uzp1    v4.8h, v4.8h, v5.8h
        srshr   v4.8h, v4.8h, #2
    .else
        shrn    v4.4h, v4.4s, #2
        shrn2   v4.8h, v5.4s, #2
    .endif
        str     q4, [\dst], #16
.else   // put
        uzp1    v4.8h, v4.8h, v5.8h
        sqshrun v4.8b, v4.8h, #6
        subs    \h, \h, #2
        fmov    x8, d4
        lsr     x9, x8, #32
        str     w8, [\dst]
        str     w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    4b
        ret

        .align JUMP_ALIGN
80:     // H - 8xN
        AARCH64_VALID_JUMP_TARGET
        ldr     d26, [\xmx]
.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    88f             // horizontal == SHARP1

        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
8:
        ldr     q0, [\src]
        ldr     q16, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
    .ifc \type, prep
        movi    v4.4s, #0
        movi    v5.4s, #0
        movi    v20.4s, #0
        movi    v21.4s, #0
    .else
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b
        mov     v20.16b, v27.16b
        mov     v21.16b, v27.16b
    .endif
        tbl     v1.16b, {v0.16b}, v29.16b
        tbl     v2.16b, {v0.16b}, v30.16b
        tbl     v17.16b, {v16.16b}, v29.16b
        tbl     v18.16b, {v16.16b}, v30.16b

        usmmla  v4.4s, v1.16b, v26.16b
        usmmla  v5.4s, v2.16b, v26.16b
        usmmla  v20.4s, v17.16b, v26.16b
        usmmla  v21.4s, v18.16b, v26.16b

        uzp1    v4.8h, v4.8h, v5.8h
        uzp1    v20.8h, v20.8h, v21.8h
    .ifc \type, prep
        srshr   v4.8h, v4.8h, #2
        srshr   v20.8h, v20.8h, #2
        subs    \h, \h, #2
        stp     q4, q20, [\dst], #32
    .else   // put
        sqshrun v4.8b, v4.8h, #6
        sqshrun v20.8b, v20.8h, #6
        subs    \h, \h, #2
        str     d4, [\dst]
        str     d20, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
    .endif
        b.gt    8b
        ret

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
8:
        ldr     q0, [\src]
        ldr     q16, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v4.4s, #0
        movi    v5.4s, #0
        movi    v20.4s, #0
        movi    v21.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub     v0.16b, v0.16b, v24.16b
        sub     v16.16b, v16.16b, v24.16b
    .endif
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b
        mov     v20.16b, v27.16b
        mov     v21.16b, v27.16b
.endif
        tbl     v1.16b, {v0.16b}, v28.16b
        tbl     v2.16b, {v0.16b}, v29.16b
        tbl     v3.16b, {v0.16b}, v30.16b
        tbl     v17.16b, {v16.16b}, v28.16b
        tbl     v18.16b, {v16.16b}, v29.16b
        tbl     v19.16b, {v16.16b}, v30.16b

        \dot    v4.4s, v1.16b, v26.4b[0]
        \dot    v5.4s, v2.16b, v26.4b[0]
        \dot    v20.4s, v17.16b, v26.4b[0]
        \dot    v21.4s, v18.16b, v26.4b[0]
        \dot    v4.4s, v2.16b, v26.4b[1]
        \dot    v5.4s, v3.16b, v26.4b[1]
        \dot    v20.4s, v18.16b, v26.4b[1]
        \dot    v21.4s, v19.16b, v26.4b[1]

        uzp1    v4.8h, v4.8h, v5.8h
        uzp1    v20.8h, v20.8h, v21.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v4.8h, v4.8h, #2
        srshr   v20.8h, v20.8h, #2
    .else
        sshr    v4.8h, v4.8h, #2
        sshr    v20.8h, v20.8h, #2
    .endif
        subs    \h, \h, #2
        stp     q4, q20, [\dst], #32
.else   // put
        sqshrun v4.8b, v4.8h, #6
        sqshrun v20.8b, v20.8h, #6
        subs    \h, \h, #2
        str     d4, [\dst]
        str     d20, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    8b
        ret

        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET
        ldr     d26, [\xmx]
.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    168f            // horizontal == SHARP1

        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
16:
        ldr     q16, [\src]
        ldur    q17, [\src, #8]         // avoid 2 register TBL for small cores
        add     \src, \src, \s_strd
    .ifc \type, prep
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
    .else
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
    .endif
        tbl     v0.16b, {v16.16b}, v29.16b
        tbl     v1.16b, {v16.16b}, v30.16b
        tbl     v2.16b, {v17.16b}, v29.16b
        tbl     v3.16b, {v17.16b}, v30.16b

        usmmla  v6.4s, v0.16b, v26.16b
        usmmla  v7.4s, v1.16b, v26.16b
        usmmla  v22.4s, v2.16b, v26.16b
        usmmla  v23.4s, v3.16b, v26.16b

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
    .ifc \type, prep
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
        subs    \h, \h, #1
        stp     q6, q22, [\dst], #32
    .else   // put
        sqshrun v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    \h, \h, #1
        st1     {v6.16b}, [\dst], \d_strd
    .endif
        b.gt    16b
        ret

        .align JUMP_ALIGN
168:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
16:
        ldr     q16, [\src]
        ldur    q17, [\src, #12]        // avoid 2 register TBL for small cores
        add     \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v24.16b
        sub     v17.16b, v17.16b, v24.16b
    .endif
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v0.16b, {v16.16b}, v28.16b
        tbl     v1.16b, {v16.16b}, v29.16b
        tbl     v2.16b, {v16.16b}, v30.16b
        tbl     v3.16b, {v17.16b}, v28.16b
        tbl     v4.16b, {v17.16b}, v29.16b

        \dot    v6.4s, v0.16b, v26.4b[0]
        \dot    v7.4s, v1.16b, v26.4b[0]
        \dot    v22.4s, v2.16b, v26.4b[0]
        \dot    v23.4s, v3.16b, v26.4b[0]
        \dot    v6.4s, v1.16b, v26.4b[1]
        \dot    v7.4s, v2.16b, v26.4b[1]
        \dot    v22.4s, v3.16b, v26.4b[1]
        \dot    v23.4s, v4.16b, v26.4b[1]

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
    .else
        sshr    v6.8h, v6.8h, #2
        sshr    v22.8h, v22.8h, #2
    .endif
        subs    \h, \h, #1
        stp     q6, q22, [\dst], #32
.else   // put
        sqshrun v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    \h, \h, #1
        st1     {v6.16b}, [\dst], \d_strd
.endif
        b.gt    16b
        ret

        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ldr     d26, [\xmx]
.ifc \type, put
        sub     \d_strd, \d_strd, \w, uxtw
.endif
        sub     \s_strd, \s_strd, \w, uxtw
        mov     w8, \w

.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    328f            // horizontal == SHARP1

        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
32:
        ldr     q16, [\src]
        ldur    q17, [\src, #8]         // avoid 2 register TBL for small cores
        add     \src, \src, #16
    .ifc \type, prep
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
    .else
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
    .endif
        tbl     v0.16b, {v16.16b}, v29.16b
        tbl     v1.16b, {v16.16b}, v30.16b
        tbl     v2.16b, {v17.16b}, v29.16b
        tbl     v3.16b, {v17.16b}, v30.16b

        usmmla  v6.4s, v0.16b, v26.16b
        usmmla  v7.4s, v1.16b, v26.16b
        usmmla  v22.4s, v2.16b, v26.16b
        usmmla  v23.4s, v3.16b, v26.16b

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
    .ifc \type, prep
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
        subs    w8, w8, #16
        stp     q6, q22, [\dst], #32
    .else   // put
        sqshrun v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    w8, w8, #16
        str     q6, [\dst], #16
    .endif
        b.gt    32b

        add     \src, \src, \s_strd
    .ifc \type, put
        add     \dst, \dst, \d_strd
    .endif
        mov     w8, \w
        subs    \h, \h, #1
        b.gt    32b
        ret

        .align JUMP_ALIGN
328:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
32:
        ldr     q16, [\src]
        ldur    q17, [\src, #12]        // avoid 2 register TBL for small cores
        add     \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v24.16b
        sub     v17.16b, v17.16b, v24.16b
    .endif
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v0.16b, {v16.16b}, v28.16b
        tbl     v1.16b, {v16.16b}, v29.16b
        tbl     v2.16b, {v16.16b}, v30.16b
        tbl     v3.16b, {v17.16b}, v28.16b
        tbl     v4.16b, {v17.16b}, v29.16b

        \dot    v6.4s, v0.16b, v26.4b[0]
        \dot    v7.4s, v1.16b, v26.4b[0]
        \dot    v22.4s, v2.16b, v26.4b[0]
        \dot    v23.4s, v3.16b, v26.4b[0]
        \dot    v6.4s, v1.16b, v26.4b[1]
        \dot    v7.4s, v2.16b, v26.4b[1]
        \dot    v22.4s, v3.16b, v26.4b[1]
        \dot    v23.4s, v4.16b, v26.4b[1]

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
    .else
        sshr    v6.8h, v6.8h, #2
        sshr    v22.8h, v22.8h, #2
    .endif
        subs    w8, w8, #16
        stp     q6, q22, [\dst], #32
.else   // put
        sqshrun v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    w8, w8, #16
        str     q6, [\dst], #16
.endif
        b.gt    32b

        add     \src, \src, \s_strd
.ifc \type, put
        add     \dst, \dst, \d_strd
.endif
        mov     w8, \w
        subs    \h, \h, #1
        b.gt    32b
        ret
endfunc

jumptable \type\()_8tap_h_\isa\()_tbl
        .word 1280b - \type\()_8tap_h_\isa\()_tbl
        .word 640b  - \type\()_8tap_h_\isa\()_tbl
        .word 320b  - \type\()_8tap_h_\isa\()_tbl
        .word 160b  - \type\()_8tap_h_\isa\()_tbl
        .word 80b   - \type\()_8tap_h_\isa\()_tbl
        .word 40b   - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
        .word 20b   - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
.endm

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

#if HAVE_I8MM
ENABLE_I8MM

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

DISABLE_I8MM
#endif // HAVE_I8MM

DISABLE_DOTPROD
#endif // HAVE_DOTPROD