1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Janne Grunau 4 * Copyright © 2018, Martin Storsjo 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, this 11 * list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#include "src/arm/asm.S" 30#include "util.S" 31 32.macro avg dst, t0, t1, t2, t3 33 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 34 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 35 add \t0\().8h, \t0\().8h, \t2\().8h 36 add \t1\().8h, \t1\().8h, \t3\().8h 37 sqrshrun \dst\().8b, \t0\().8h, #5 38 sqrshrun2 \dst\().16b, \t1\().8h, #5 39.endm 40 41.macro w_avg dst, t0, t1, t2, t3 42 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 43 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 44 sub \t0\().8h, \t2\().8h, \t0\().8h 45 sub \t1\().8h, \t3\().8h, \t1\().8h 46 sqdmulh \t0\().8h, \t0\().8h, v30.8h 47 sqdmulh \t1\().8h, \t1\().8h, v30.8h 48 add \t0\().8h, \t2\().8h, \t0\().8h 49 add \t1\().8h, \t3\().8h, \t1\().8h 50 sqrshrun \dst\().8b, \t0\().8h, #4 51 sqrshrun2 \dst\().16b, \t1\().8h, #4 52.endm 53 54.macro mask dst, t0, t1, t2, t3 55 ld1 {v30.16b}, [x6], 16 56 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 57 mul v30.16b, v30.16b, v31.16b 58 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 59 shll v28.8h, v30.8b, #8 60 shll2 v29.8h, v30.16b, #8 61 sub \t0\().8h, \t2\().8h, \t0\().8h 62 sub \t1\().8h, \t3\().8h, \t1\().8h 63 sqdmulh \t0\().8h, \t0\().8h, v28.8h 64 sqdmulh \t1\().8h, \t1\().8h, v29.8h 65 add \t0\().8h, \t2\().8h, \t0\().8h 66 add \t1\().8h, \t3\().8h, \t1\().8h 67 sqrshrun \dst\().8b, \t0\().8h, #4 68 sqrshrun2 \dst\().16b, \t1\().8h, #4 69.endm 70 71.macro bidir_fn type 72function \type\()_8bpc_neon, export=1 73 clz w4, w4 74.ifc \type, w_avg 75 dup v30.8h, w6 76 neg v30.8h, v30.8h 77 shl v30.8h, v30.8h, #11 78.endif 79.ifc \type, mask 80 movi v31.16b, #256-2 81.endif 82 movrel x7, \type\()_tbl 83 sub w4, w4, #24 84 ldrsw x4, [x7, x4, lsl #2] 85 \type v4, v0, v1, v2, v3 86 add x7, x7, x4 87 br x7 8840: 89 AARCH64_VALID_JUMP_TARGET 90 add x7, x0, x1 91 lsl x1, x1, #1 924: 93 cmp w5, #4 94 st1 {v4.s}[0], [x0], x1 95 st1 {v4.s}[1], [x7], x1 96 st1 {v4.s}[2], [x0], x1 97 st1 {v4.s}[3], [x7], x1 98 b.eq 0f 99 \type v5, v0, v1, v2, v3 100 cmp w5, #8 101 st1 {v5.s}[0], [x0], x1 102 st1 {v5.s}[1], [x7], x1 103 st1 {v5.s}[2], [x0], x1 104 st1 {v5.s}[3], [x7], x1 105 b.eq 0f 106 \type v4, v0, v1, v2, v3 107 st1 {v4.s}[0], [x0], x1 108 st1 {v4.s}[1], [x7], x1 109 \type v5, v0, v1, v2, v3 110 st1 {v4.s}[2], [x0], x1 111 st1 {v4.s}[3], [x7], x1 112 st1 {v5.s}[0], [x0], x1 113 st1 {v5.s}[1], [x7], x1 114 st1 {v5.s}[2], [x0], x1 115 st1 {v5.s}[3], [x7], x1 116 ret 11780: 118 AARCH64_VALID_JUMP_TARGET 119 add x7, x0, x1 120 lsl x1, x1, #1 1218: 122 st1 {v4.8b}, [x0], x1 123 \type v5, v0, v1, v2, v3 124 st1 {v4.d}[1], [x7], x1 125 st1 {v5.8b}, [x0], x1 126 subs w5, w5, #4 127 st1 {v5.d}[1], [x7], x1 128 b.le 0f 129 \type v4, v0, v1, v2, v3 130 b 8b 131160: 132 AARCH64_VALID_JUMP_TARGET 13316: 134 \type v5, v0, v1, v2, v3 135 st1 {v4.16b}, [x0], x1 136 \type v6, v0, v1, v2, v3 137 st1 {v5.16b}, [x0], x1 138 \type v7, v0, v1, v2, v3 139 st1 {v6.16b}, [x0], x1 140 subs w5, w5, #4 141 st1 {v7.16b}, [x0], x1 142 b.le 0f 143 \type v4, v0, v1, v2, v3 144 b 16b 145320: 146 AARCH64_VALID_JUMP_TARGET 147 add x7, x0, x1 148 lsl x1, x1, #1 14932: 150 \type v5, v0, v1, v2, v3 151 \type v6, v0, v1, v2, v3 152 st1 {v4.16b,v5.16b}, [x0], x1 153 \type v7, v0, v1, v2, v3 154 subs w5, w5, #2 155 st1 {v6.16b,v7.16b}, [x7], x1 156 b.le 0f 157 \type v4, v0, v1, v2, v3 158 b 32b 159640: 160 AARCH64_VALID_JUMP_TARGET 161 add x7, x0, x1 162 lsl x1, x1, #1 16364: 164 \type v5, v0, v1, v2, v3 165 \type v6, v0, v1, v2, v3 166 \type v7, v0, v1, v2, v3 167 \type v16, v0, v1, v2, v3 168 \type v17, v0, v1, v2, v3 169 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 170 \type v18, v0, v1, v2, v3 171 \type v19, v0, v1, v2, v3 172 subs w5, w5, #2 173 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 174 b.le 0f 175 \type v4, v0, v1, v2, v3 176 b 64b 1771280: 178 AARCH64_VALID_JUMP_TARGET 179 add x7, x0, #64 180128: 181 \type v5, v0, v1, v2, v3 182 \type v6, v0, v1, v2, v3 183 \type v7, v0, v1, v2, v3 184 \type v16, v0, v1, v2, v3 185 \type v17, v0, v1, v2, v3 186 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 187 \type v18, v0, v1, v2, v3 188 \type v19, v0, v1, v2, v3 189 subs w5, w5, #1 190 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 191 b.le 0f 192 \type v4, v0, v1, v2, v3 193 b 128b 1940: 195 ret 196endfunc 197 198jumptable \type\()_tbl 199 .word 1280b - \type\()_tbl 200 .word 640b - \type\()_tbl 201 .word 320b - \type\()_tbl 202 .word 160b - \type\()_tbl 203 .word 80b - \type\()_tbl 204 .word 40b - \type\()_tbl 205endjumptable 206.endm 207 208bidir_fn avg 209bidir_fn w_avg 210bidir_fn mask 211 212 213.macro w_mask_fn type 214function w_mask_\type\()_8bpc_neon, export=1 215 clz w8, w4 216 movrel x9, w_mask_\type\()_tbl 217 sub w8, w8, #24 218 ldrsw x8, [x9, x8, lsl #2] 219 add x9, x9, x8 220 mov w10, #6903 221 dup v0.8h, w10 222.if \type == 444 223 movi v1.16b, #64 224.elseif \type == 422 225 dup v2.8b, w7 226 movi v3.8b, #129 227 sub v3.8b, v3.8b, v2.8b 228.elseif \type == 420 229 dup v2.8h, w7 230 movi v3.8h, #1, lsl #8 231 sub v3.8h, v3.8h, v2.8h 232.endif 233 add x12, x0, x1 234 lsl x1, x1, #1 235 br x9 23640: 237 AARCH64_VALID_JUMP_TARGET 2384: 239 ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) 240 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) 241 subs w5, w5, #4 242 sub v16.8h, v6.8h, v4.8h 243 sub v17.8h, v7.8h, v5.8h 244 sabd v18.8h, v4.8h, v6.8h 245 sabd v19.8h, v5.8h, v7.8h 246 uqsub v18.8h, v0.8h, v18.8h 247 uqsub v19.8h, v0.8h, v19.8h 248 ushr v18.8h, v18.8h, #8 249 ushr v19.8h, v19.8h, #8 250 shl v20.8h, v18.8h, #9 251 shl v21.8h, v19.8h, #9 252 sqdmulh v20.8h, v20.8h, v16.8h 253 sqdmulh v21.8h, v21.8h, v17.8h 254 add v20.8h, v20.8h, v4.8h 255 add v21.8h, v21.8h, v5.8h 256 sqrshrun v22.8b, v20.8h, #4 257 sqrshrun v23.8b, v21.8h, #4 258.if \type == 444 259 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 260 sub v18.16b, v1.16b, v18.16b 261 st1 {v18.16b}, [x6], #16 262.elseif \type == 422 263 addp v18.8h, v18.8h, v19.8h 264 xtn v18.8b, v18.8h 265 uhsub v18.8b, v3.8b, v18.8b 266 st1 {v18.8b}, [x6], #8 267.elseif \type == 420 268 trn1 v24.2d, v18.2d, v19.2d 269 trn2 v25.2d, v18.2d, v19.2d 270 add v24.8h, v24.8h, v25.8h 271 addp v18.8h, v24.8h, v24.8h 272 sub v18.4h, v3.4h, v18.4h 273 rshrn v18.8b, v18.8h, #2 274 str s18, [x6], #4 275.endif 276 st1 {v22.s}[0], [x0], x1 277 st1 {v22.s}[1], [x12], x1 278 st1 {v23.s}[0], [x0], x1 279 st1 {v23.s}[1], [x12], x1 280 b.gt 4b 281 ret 28280: 283 AARCH64_VALID_JUMP_TARGET 2848: 285 ld1 {v4.8h, v5.8h}, [x2], #32 286 ld1 {v6.8h, v7.8h}, [x3], #32 287 subs w5, w5, #2 288 sub v16.8h, v6.8h, v4.8h 289 sub v17.8h, v7.8h, v5.8h 290 sabd v18.8h, v4.8h, v6.8h 291 sabd v19.8h, v5.8h, v7.8h 292 uqsub v18.8h, v0.8h, v18.8h 293 uqsub v19.8h, v0.8h, v19.8h 294 ushr v18.8h, v18.8h, #8 295 ushr v19.8h, v19.8h, #8 296 shl v20.8h, v18.8h, #9 297 shl v21.8h, v19.8h, #9 298 sqdmulh v20.8h, v20.8h, v16.8h 299 sqdmulh v21.8h, v21.8h, v17.8h 300 add v20.8h, v20.8h, v4.8h 301 add v21.8h, v21.8h, v5.8h 302 sqrshrun v22.8b, v20.8h, #4 303 sqrshrun v23.8b, v21.8h, #4 304.if \type == 444 305 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 306 sub v18.16b, v1.16b, v18.16b 307 st1 {v18.16b}, [x6], #16 308.elseif \type == 422 309 addp v18.8h, v18.8h, v19.8h 310 xtn v18.8b, v18.8h 311 uhsub v18.8b, v3.8b, v18.8b 312 st1 {v18.8b}, [x6], #8 313.elseif \type == 420 314 add v18.8h, v18.8h, v19.8h 315 addp v18.8h, v18.8h, v18.8h 316 sub v18.4h, v3.4h, v18.4h 317 rshrn v18.8b, v18.8h, #2 318 str s18, [x6], #4 319.endif 320 st1 {v22.8b}, [x0], x1 321 st1 {v23.8b}, [x12], x1 322 b.gt 8b 323 ret 3241280: 325640: 326320: 327160: 328 AARCH64_VALID_JUMP_TARGET 329 mov w11, w4 330 sub x1, x1, w4, uxtw 331.if \type == 444 332 add x10, x6, w4, uxtw 333.elseif \type == 422 334 add x10, x6, x11, lsr #1 335.endif 336 add x9, x3, w4, uxtw #1 337 add x7, x2, w4, uxtw #1 338161: 339 mov w8, w4 34016: 341 ld1 {v4.8h, v5.8h}, [x2], #32 342 ld1 {v6.8h, v7.8h}, [x3], #32 343 ld1 {v16.8h, v17.8h}, [x7], #32 344 ld1 {v18.8h, v19.8h}, [x9], #32 345 subs w8, w8, #16 346 sub v6.8h, v6.8h, v4.8h 347 sub v7.8h, v7.8h, v5.8h 348 sub v18.8h, v18.8h, v16.8h 349 sub v19.8h, v19.8h, v17.8h 350 abs v20.8h, v6.8h 351 abs v21.8h, v7.8h 352 abs v22.8h, v18.8h 353 abs v23.8h, v19.8h 354 uqsub v20.8h, v0.8h, v20.8h 355 uqsub v21.8h, v0.8h, v21.8h 356 uqsub v22.8h, v0.8h, v22.8h 357 uqsub v23.8h, v0.8h, v23.8h 358 ushr v20.8h, v20.8h, #8 359 ushr v21.8h, v21.8h, #8 360 ushr v22.8h, v22.8h, #8 361 ushr v23.8h, v23.8h, #8 362 shl v24.8h, v20.8h, #9 363 shl v25.8h, v21.8h, #9 364 shl v26.8h, v22.8h, #9 365 shl v27.8h, v23.8h, #9 366 sqdmulh v24.8h, v24.8h, v6.8h 367 sqdmulh v25.8h, v25.8h, v7.8h 368 sqdmulh v26.8h, v26.8h, v18.8h 369 sqdmulh v27.8h, v27.8h, v19.8h 370 add v24.8h, v24.8h, v4.8h 371 add v25.8h, v25.8h, v5.8h 372 add v26.8h, v26.8h, v16.8h 373 add v27.8h, v27.8h, v17.8h 374 sqrshrun v24.8b, v24.8h, #4 375 sqrshrun v25.8b, v25.8h, #4 376 sqrshrun v26.8b, v26.8h, #4 377 sqrshrun v27.8b, v27.8h, #4 378.if \type == 444 379 uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 380 uzp1 v21.16b, v22.16b, v23.16b // Ditto 381 sub v20.16b, v1.16b, v20.16b 382 sub v21.16b, v1.16b, v21.16b 383 st1 {v20.16b}, [x6], #16 384 st1 {v21.16b}, [x10], #16 385.elseif \type == 422 386 addp v20.8h, v20.8h, v21.8h 387 addp v21.8h, v22.8h, v23.8h 388 xtn v20.8b, v20.8h 389 xtn v21.8b, v21.8h 390 uhsub v20.8b, v3.8b, v20.8b 391 uhsub v21.8b, v3.8b, v21.8b 392 st1 {v20.8b}, [x6], #8 393 st1 {v21.8b}, [x10], #8 394.elseif \type == 420 395 add v20.8h, v20.8h, v22.8h 396 add v21.8h, v21.8h, v23.8h 397 addp v20.8h, v20.8h, v21.8h 398 sub v20.8h, v3.8h, v20.8h 399 rshrn v20.8b, v20.8h, #2 400 st1 {v20.8b}, [x6], #8 401.endif 402 st1 {v24.8b, v25.8b}, [x0], #16 403 st1 {v26.8b, v27.8b}, [x12], #16 404 b.gt 16b 405 subs w5, w5, #2 406 add x2, x2, w4, uxtw #1 407 add x3, x3, w4, uxtw #1 408 add x7, x7, w4, uxtw #1 409 add x9, x9, w4, uxtw #1 410.if \type == 444 411 add x6, x6, w4, uxtw 412 add x10, x10, w4, uxtw 413.elseif \type == 422 414 add x6, x6, x11, lsr #1 415 add x10, x10, x11, lsr #1 416.endif 417 add x0, x0, x1 418 add x12, x12, x1 419 b.gt 161b 420 ret 421endfunc 422 423jumptable w_mask_\type\()_tbl 424 .word 1280b - w_mask_\type\()_tbl 425 .word 640b - w_mask_\type\()_tbl 426 .word 320b - w_mask_\type\()_tbl 427 .word 160b - w_mask_\type\()_tbl 428 .word 80b - w_mask_\type\()_tbl 429 .word 40b - w_mask_\type\()_tbl 430endjumptable 431.endm 432 433w_mask_fn 444 434w_mask_fn 422 435w_mask_fn 420 436 437 438function blend_8bpc_neon, export=1 439 movrel x6, blend_tbl 440 clz w3, w3 441 sub w3, w3, #26 442 ldrsw x3, [x6, x3, lsl #2] 443 add x6, x6, x3 444 movi v4.16b, #64 445 add x8, x0, x1 446 lsl x1, x1, #1 447 br x6 44840: 449 AARCH64_VALID_JUMP_TARGET 4504: 451 ld1 {v2.8b}, [x5], #8 452 ldr d1, [x2], #8 453 ldr s0, [x0] 454 subs w4, w4, #2 455 ld1 {v0.s}[1], [x8] 456 sub v3.8b, v4.8b, v2.8b 457 umull v5.8h, v1.8b, v2.8b 458 umlal v5.8h, v0.8b, v3.8b 459 rshrn v6.8b, v5.8h, #6 460 st1 {v6.s}[0], [x0], x1 461 st1 {v6.s}[1], [x8], x1 462 b.gt 4b 463 ret 46480: 465 AARCH64_VALID_JUMP_TARGET 4668: 467 ld1 {v2.16b}, [x5], #16 468 ld1 {v1.16b}, [x2], #16 469 ldr d0, [x0] 470 ld1 {v0.d}[1], [x8] 471 sub v3.16b, v4.16b, v2.16b 472 subs w4, w4, #2 473 umull v5.8h, v1.8b, v2.8b 474 umlal v5.8h, v0.8b, v3.8b 475 umull2 v6.8h, v1.16b, v2.16b 476 umlal2 v6.8h, v0.16b, v3.16b 477 rshrn v7.8b, v5.8h, #6 478 rshrn v16.8b, v6.8h, #6 479 st1 {v7.8b}, [x0], x1 480 st1 {v16.8b}, [x8], x1 481 b.gt 8b 482 ret 483160: 484 AARCH64_VALID_JUMP_TARGET 48516: 486 ld1 {v1.16b, v2.16b}, [x5], #32 487 ld1 {v5.16b, v6.16b}, [x2], #32 488 ld1 {v0.16b}, [x0] 489 subs w4, w4, #2 490 sub v7.16b, v4.16b, v1.16b 491 sub v20.16b, v4.16b, v2.16b 492 ld1 {v3.16b}, [x8] 493 umull v16.8h, v5.8b, v1.8b 494 umlal v16.8h, v0.8b, v7.8b 495 umull2 v17.8h, v5.16b, v1.16b 496 umlal2 v17.8h, v0.16b, v7.16b 497 umull v21.8h, v6.8b, v2.8b 498 umlal v21.8h, v3.8b, v20.8b 499 umull2 v22.8h, v6.16b, v2.16b 500 umlal2 v22.8h, v3.16b, v20.16b 501 rshrn v18.8b, v16.8h, #6 502 rshrn2 v18.16b, v17.8h, #6 503 rshrn v19.8b, v21.8h, #6 504 rshrn2 v19.16b, v22.8h, #6 505 st1 {v18.16b}, [x0], x1 506 st1 {v19.16b}, [x8], x1 507 b.gt 16b 508 ret 509320: 510 AARCH64_VALID_JUMP_TARGET 51132: 512 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 513 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 514 ld1 {v20.16b, v21.16b}, [x0] 515 subs w4, w4, #2 516 ld1 {v22.16b, v23.16b}, [x8] 517 sub v5.16b, v4.16b, v0.16b 518 sub v6.16b, v4.16b, v1.16b 519 sub v30.16b, v4.16b, v2.16b 520 sub v31.16b, v4.16b, v3.16b 521 umull v24.8h, v16.8b, v0.8b 522 umlal v24.8h, v20.8b, v5.8b 523 umull2 v26.8h, v16.16b, v0.16b 524 umlal2 v26.8h, v20.16b, v5.16b 525 umull v28.8h, v17.8b, v1.8b 526 umlal v28.8h, v21.8b, v6.8b 527 umull2 v7.8h, v17.16b, v1.16b 528 umlal2 v7.8h, v21.16b, v6.16b 529 umull v27.8h, v18.8b, v2.8b 530 umlal v27.8h, v22.8b, v30.8b 531 umull2 v1.8h, v18.16b, v2.16b 532 umlal2 v1.8h, v22.16b, v30.16b 533 umull v29.8h, v19.8b, v3.8b 534 umlal v29.8h, v23.8b, v31.8b 535 umull2 v21.8h, v19.16b, v3.16b 536 umlal2 v21.8h, v23.16b, v31.16b 537 rshrn v24.8b, v24.8h, #6 538 rshrn2 v24.16b, v26.8h, #6 539 rshrn v25.8b, v28.8h, #6 540 rshrn2 v25.16b, v7.8h, #6 541 rshrn v27.8b, v27.8h, #6 542 rshrn2 v27.16b, v1.8h, #6 543 rshrn v28.8b, v29.8h, #6 544 rshrn2 v28.16b, v21.8h, #6 545 st1 {v24.16b, v25.16b}, [x0], x1 546 st1 {v27.16b, v28.16b}, [x8], x1 547 b.gt 32b 548 ret 549endfunc 550 551jumptable blend_tbl 552 .word 320b - blend_tbl 553 .word 160b - blend_tbl 554 .word 80b - blend_tbl 555 .word 40b - blend_tbl 556endjumptable 557 558function blend_h_8bpc_neon, export=1 559 movrel x6, blend_h_tbl 560 movrel x5, X(obmc_masks) 561 add x5, x5, w4, uxtw 562 sub w4, w4, w4, lsr #2 563 clz w7, w3 564 movi v4.16b, #64 565 add x8, x0, x1 566 lsl x1, x1, #1 567 sub w7, w7, #24 568 ldrsw x7, [x6, x7, lsl #2] 569 add x6, x6, x7 570 br x6 57120: 572 AARCH64_VALID_JUMP_TARGET 5732: 574 ldr h0, [x5], #2 575 ldr s1, [x2], #4 576 subs w4, w4, #2 577 ldr h2, [x0] 578 zip1 v0.8b, v0.8b, v0.8b 579 sub v3.8b, v4.8b, v0.8b 580 ld1 {v2.h}[1], [x8] 581 umull v5.8h, v1.8b, v0.8b 582 umlal v5.8h, v2.8b, v3.8b 583 rshrn v5.8b, v5.8h, #6 584 st1 {v5.h}[0], [x0], x1 585 st1 {v5.h}[1], [x8], x1 586 b.gt 2b 587 ret 58840: 589 AARCH64_VALID_JUMP_TARGET 5904: 591 ld2r {v0.8b, v1.8b}, [x5], #2 592 ld1 {v2.8b}, [x2], #8 593 subs w4, w4, #2 594 ext v0.8b, v0.8b, v1.8b, #4 595 ldr s3, [x0] 596 sub v5.8b, v4.8b, v0.8b 597 ld1 {v3.s}[1], [x8] 598 umull v6.8h, v2.8b, v0.8b 599 umlal v6.8h, v3.8b, v5.8b 600 rshrn v6.8b, v6.8h, #6 601 st1 {v6.s}[0], [x0], x1 602 st1 {v6.s}[1], [x8], x1 603 b.gt 4b 604 ret 60580: 606 AARCH64_VALID_JUMP_TARGET 6078: 608 ld2r {v0.16b, v1.16b}, [x5], #2 609 ld1 {v2.16b}, [x2], #16 610 ldr d3, [x0] 611 ext v0.16b, v0.16b, v1.16b, #8 612 sub v5.16b, v4.16b, v0.16b 613 ld1 {v3.d}[1], [x8] 614 subs w4, w4, #2 615 umull v6.8h, v0.8b, v2.8b 616 umlal v6.8h, v3.8b, v5.8b 617 umull2 v7.8h, v0.16b, v2.16b 618 umlal2 v7.8h, v3.16b, v5.16b 619 rshrn v16.8b, v6.8h, #6 620 rshrn v17.8b, v7.8h, #6 621 st1 {v16.8b}, [x0], x1 622 st1 {v17.8b}, [x8], x1 623 b.gt 8b 624 ret 625160: 626 AARCH64_VALID_JUMP_TARGET 62716: 628 ld2r {v0.16b, v1.16b}, [x5], #2 629 ld1 {v2.16b, v3.16b}, [x2], #32 630 ld1 {v5.16b}, [x0] 631 sub v7.16b, v4.16b, v0.16b 632 sub v16.16b, v4.16b, v1.16b 633 ld1 {v6.16b}, [x8] 634 subs w4, w4, #2 635 umull v17.8h, v0.8b, v2.8b 636 umlal v17.8h, v5.8b, v7.8b 637 umull2 v18.8h, v0.16b, v2.16b 638 umlal2 v18.8h, v5.16b, v7.16b 639 umull v19.8h, v1.8b, v3.8b 640 umlal v19.8h, v6.8b, v16.8b 641 umull2 v20.8h, v1.16b, v3.16b 642 umlal2 v20.8h, v6.16b, v16.16b 643 rshrn v21.8b, v17.8h, #6 644 rshrn2 v21.16b, v18.8h, #6 645 rshrn v22.8b, v19.8h, #6 646 rshrn2 v22.16b, v20.8h, #6 647 st1 {v21.16b}, [x0], x1 648 st1 {v22.16b}, [x8], x1 649 b.gt 16b 650 ret 6511280: 652640: 653320: 654 AARCH64_VALID_JUMP_TARGET 655 sub x1, x1, w3, uxtw 656 add x7, x2, w3, uxtw 657321: 658 ld2r {v0.16b, v1.16b}, [x5], #2 659 mov w6, w3 660 sub v20.16b, v4.16b, v0.16b 661 sub v21.16b, v4.16b, v1.16b 66232: 663 ld1 {v16.16b, v17.16b}, [x2], #32 664 ld1 {v2.16b, v3.16b}, [x0] 665 subs w6, w6, #32 666 umull v23.8h, v0.8b, v16.8b 667 umlal v23.8h, v2.8b, v20.8b 668 ld1 {v18.16b, v19.16b}, [x7], #32 669 umull2 v27.8h, v0.16b, v16.16b 670 umlal2 v27.8h, v2.16b, v20.16b 671 ld1 {v6.16b, v7.16b}, [x8] 672 umull v24.8h, v0.8b, v17.8b 673 umlal v24.8h, v3.8b, v20.8b 674 umull2 v28.8h, v0.16b, v17.16b 675 umlal2 v28.8h, v3.16b, v20.16b 676 umull v25.8h, v1.8b, v18.8b 677 umlal v25.8h, v6.8b, v21.8b 678 umull2 v5.8h, v1.16b, v18.16b 679 umlal2 v5.8h, v6.16b, v21.16b 680 rshrn v29.8b, v23.8h, #6 681 rshrn2 v29.16b, v27.8h, #6 682 umull v26.8h, v1.8b, v19.8b 683 umlal v26.8h, v7.8b, v21.8b 684 umull2 v31.8h, v1.16b, v19.16b 685 umlal2 v31.8h, v7.16b, v21.16b 686 rshrn v30.8b, v24.8h, #6 687 rshrn2 v30.16b, v28.8h, #6 688 rshrn v23.8b, v25.8h, #6 689 rshrn2 v23.16b, v5.8h, #6 690 rshrn v24.8b, v26.8h, #6 691 st1 {v29.16b, v30.16b}, [x0], #32 692 rshrn2 v24.16b, v31.8h, #6 693 st1 {v23.16b, v24.16b}, [x8], #32 694 b.gt 32b 695 subs w4, w4, #2 696 add x0, x0, x1 697 add x8, x8, x1 698 add x2, x2, w3, uxtw 699 add x7, x7, w3, uxtw 700 b.gt 321b 701 ret 702endfunc 703 704jumptable blend_h_tbl 705 .word 1280b - blend_h_tbl 706 .word 640b - blend_h_tbl 707 .word 320b - blend_h_tbl 708 .word 160b - blend_h_tbl 709 .word 80b - blend_h_tbl 710 .word 40b - blend_h_tbl 711 .word 20b - blend_h_tbl 712endjumptable 713 714function blend_v_8bpc_neon, export=1 715 movrel x6, blend_v_tbl 716 movrel x5, X(obmc_masks) 717 add x5, x5, w3, uxtw 718 clz w3, w3 719 movi v4.16b, #64 720 add x8, x0, x1 721 lsl x1, x1, #1 722 sub w3, w3, #26 723 ldrsw x3, [x6, x3, lsl #2] 724 add x6, x6, x3 725 br x6 72620: 727 AARCH64_VALID_JUMP_TARGET 728 ld1r {v0.8b}, [x5] 729 sub v1.8b, v4.8b, v0.8b 7302: 731 ldr h2, [x2], #2 732 ldr b3, [x0] 733 subs w4, w4, #2 734 ld1 {v2.b}[1], [x2] 735 ld1 {v3.b}[1], [x8] 736 umull v5.8h, v2.8b, v0.8b 737 umlal v5.8h, v3.8b, v1.8b 738 rshrn v5.8b, v5.8h, #6 739 add x2, x2, #2 740 st1 {v5.b}[0], [x0], x1 741 st1 {v5.b}[1], [x8], x1 742 b.gt 2b 743 ret 74440: 745 AARCH64_VALID_JUMP_TARGET 746 ld1r {v0.2s}, [x5] 747 sub x1, x1, #2 748 sub v1.8b, v4.8b, v0.8b 7494: 750 ld1 {v2.8b}, [x2], #8 751 ldr s3, [x0] 752 ld1 {v3.s}[1], [x8] 753 subs w4, w4, #2 754 umull v5.8h, v2.8b, v0.8b 755 umlal v5.8h, v3.8b, v1.8b 756 rshrn v5.8b, v5.8h, #6 757 str h5, [x0], #2 758 st1 {v5.h}[2], [x8], #2 759 st1 {v5.b}[2], [x0], x1 760 st1 {v5.b}[6], [x8], x1 761 b.gt 4b 762 ret 76380: 764 AARCH64_VALID_JUMP_TARGET 765 ld1r {v0.2d}, [x5] 766 sub x1, x1, #4 767 sub v1.16b, v4.16b, v0.16b 768 zip2 v16.2d, v1.2d, v1.2d 7698: 770 ld1 {v2.16b}, [x2], #16 771 ldr d3, [x0] 772 ldr d4, [x8] 773 subs w4, w4, #2 774 umull v5.8h, v0.8b, v2.8b 775 umlal v5.8h, v3.8b, v1.8b 776 umull2 v6.8h, v0.16b, v2.16b 777 umlal v6.8h, v4.8b, v16.8b 778 rshrn v7.8b, v5.8h, #6 779 rshrn v17.8b, v6.8h, #6 780 str s7, [x0], #4 781 str s17, [x8], #4 782 st1 {v7.h}[2], [x0], x1 783 st1 {v17.h}[2], [x8], x1 784 b.gt 8b 785 ret 786160: 787 AARCH64_VALID_JUMP_TARGET 788 ld1 {v0.16b}, [x5] 789 sub x1, x1, #8 790 sub v2.16b, v4.16b, v0.16b 79116: 792 ld1 {v5.16b, v6.16b}, [x2], #32 793 ld1 {v7.16b}, [x0] 794 subs w4, w4, #2 795 ld1 {v16.16b}, [x8] 796 umull v17.8h, v5.8b, v0.8b 797 umlal v17.8h, v7.8b, v2.8b 798 umull2 v18.8h, v5.16b, v0.16b 799 umlal2 v18.8h, v7.16b, v2.16b 800 umull v20.8h, v6.8b, v0.8b 801 umlal v20.8h, v16.8b, v2.8b 802 umull2 v21.8h, v6.16b, v0.16b 803 umlal2 v21.8h, v16.16b, v2.16b 804 rshrn v19.8b, v17.8h, #6 805 rshrn2 v19.16b, v18.8h, #6 806 rshrn v22.8b, v20.8h, #6 807 rshrn2 v22.16b, v21.8h, #6 808 st1 {v19.8b}, [x0], #8 809 st1 {v22.8b}, [x8], #8 810 st1 {v19.s}[2], [x0], x1 811 st1 {v22.s}[2], [x8], x1 812 b.gt 16b 813 ret 814320: 815 AARCH64_VALID_JUMP_TARGET 816 ld1 {v0.16b, v1.16b}, [x5] 817 sub x1, x1, #16 818 sub v2.16b, v4.16b, v0.16b 819 sub v3.8b, v4.8b, v1.8b 82032: 821 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 822 ld1 {v5.16b, v6.16b}, [x0] 823 subs w4, w4, #2 824 ld1 {v20.16b, v21.16b}, [x8] 825 umull v22.8h, v16.8b, v0.8b 826 umlal v22.8h, v5.8b, v2.8b 827 umull2 v23.8h, v16.16b, v0.16b 828 umlal2 v23.8h, v5.16b, v2.16b 829 umull v28.8h, v17.8b, v1.8b 830 umlal v28.8h, v6.8b, v3.8b 831 umull v30.8h, v18.8b, v0.8b 832 umlal v30.8h, v20.8b, v2.8b 833 umull2 v31.8h, v18.16b, v0.16b 834 umlal2 v31.8h, v20.16b, v2.16b 835 umull v25.8h, v19.8b, v1.8b 836 umlal v25.8h, v21.8b, v3.8b 837 rshrn v24.8b, v22.8h, #6 838 rshrn2 v24.16b, v23.8h, #6 839 rshrn v28.8b, v28.8h, #6 840 rshrn v30.8b, v30.8h, #6 841 rshrn2 v30.16b, v31.8h, #6 842 rshrn v27.8b, v25.8h, #6 843 st1 {v24.16b}, [x0], #16 844 st1 {v30.16b}, [x8], #16 845 st1 {v28.8b}, [x0], x1 846 st1 {v27.8b}, [x8], x1 847 b.gt 32b 848 ret 849endfunc 850 851jumptable blend_v_tbl 852 .word 320b - blend_v_tbl 853 .word 160b - blend_v_tbl 854 .word 80b - blend_v_tbl 855 .word 40b - blend_v_tbl 856 .word 20b - blend_v_tbl 857endjumptable 858 859 860// This has got the same signature as the put_8tap functions, 861// and assumes that x8 is set to (clz(w)-24). 862function put_neon, export=1 863 movrel x9, put_tbl 864 ldrsw x8, [x9, x8, lsl #2] 865 add x9, x9, x8 866 br x9 867 86820: 869 AARCH64_VALID_JUMP_TARGET 8702: 871 ldrh w9, [x2] 872 ldrh w10, [x2, x3] 873 add x2, x2, x3, lsl #1 874 subs w5, w5, #2 875 strh w9, [x0] 876 strh w10, [x0, x1] 877 add x0, x0, x1, lsl #1 878 b.gt 2b 879 ret 88040: 881 AARCH64_VALID_JUMP_TARGET 8824: 883 ldr w9, [x2] 884 ldr w10, [x2, x3] 885 add x2, x2, x3, lsl #1 886 subs w5, w5, #2 887 str w9, [x0] 888 str w10, [x0, x1] 889 add x0, x0, x1, lsl #1 890 b.gt 4b 891 ret 89280: 893 AARCH64_VALID_JUMP_TARGET 8948: 895 ldr x9, [x2] 896 ldr x10, [x2, x3] 897 add x2, x2, x3, lsl #1 898 subs w5, w5, #2 899 str x9, [x0] 900 str x10, [x0, x1] 901 add x0, x0, x1, lsl #1 902 b.gt 8b 903 ret 904160: 905 AARCH64_VALID_JUMP_TARGET 90616: 907 ldr q0, [x2] 908 ldr q1, [x2, x3] 909 add x2, x2, x3, lsl #1 910 subs w5, w5, #2 911 str q0, [x0] 912 str q1, [x0, x1] 913 add x0, x0, x1, lsl #1 914 b.gt 16b 915 ret 916320: 917 AARCH64_VALID_JUMP_TARGET 91832: 919 ldp q0, q1, [x2] 920 add x2, x2, x3 921 stp q0, q1, [x0] 922 add x0, x0, x1 923 ldp q2, q3, [x2] 924 add x2, x2, x3 925 stp q2, q3, [x0] 926 subs w5, w5, #2 927 add x0, x0, x1 928 b.gt 32b 929 ret 930640: 931 AARCH64_VALID_JUMP_TARGET 93264: 933 ldp q0, q1, [x2] 934 stp q0, q1, [x0] 935 ldp q2, q3, [x2, #32] 936 add x2, x2, x3 937 stp q2, q3, [x0, #32] 938 subs w5, w5, #1 939 add x0, x0, x1 940 b.gt 64b 941 ret 9421280: 943 AARCH64_VALID_JUMP_TARGET 944128: 945 ldp q0, q1, [x2] 946 stp q0, q1, [x0] 947 ldp q2, q3, [x2, #32] 948 stp q2, q3, [x0, #32] 949 ldp q4, q5, [x2, #64] 950 stp q4, q5, [x0, #64] 951 ldp q6, q7, [x2, #96] 952 add x2, x2, x3 953 stp q6, q7, [x0, #96] 954 subs w5, w5, #1 955 add x0, x0, x1 956 b.gt 128b 957 ret 958endfunc 959 960jumptable put_tbl 961 .word 1280b - put_tbl 962 .word 640b - put_tbl 963 .word 320b - put_tbl 964 .word 160b - put_tbl 965 .word 80b - put_tbl 966 .word 40b - put_tbl 967 .word 20b - put_tbl 968endjumptable 969 970 971// This has got the same signature as the prep_8tap functions, 972// and assumes that x8 is set to (clz(w)-24), and x7 to w*2. 973function prep_neon, export=1 974 movrel x9, prep_tbl 975 ldrsw x8, [x9, x8, lsl #2] 976 movi v24.16b, #16 977 add x9, x9, x8 978 br x9 979 98040: 981 AARCH64_VALID_JUMP_TARGET 9824: 983 ldr s0, [x1] 984 ldr s2, [x1, x2] 985 add x1, x1, x2, lsl #1 986 ldr s1, [x1] 987 ldr s3, [x1, x2] 988 add x1, x1, x2, lsl #1 989 mov v0.s[1], v2.s[0] 990 mov v1.s[1], v3.s[0] 991 ushll v0.8h, v0.8b, #4 992 ushll v1.8h, v1.8b, #4 993 subs w4, w4, #4 994 stp q0, q1, [x0], #32 995 b.gt 4b 996 ret 99780: 998 AARCH64_VALID_JUMP_TARGET 9998: 1000 ldr d0, [x1] 1001 ldr d1, [x1, x2] 1002 add x1, x1, x2, lsl #1 1003 ldr d2, [x1] 1004 ldr d3, [x1, x2] 1005 add x1, x1, x2, lsl #1 1006 ushll v0.8h, v0.8b, #4 1007 ushll v1.8h, v1.8b, #4 1008 umull v2.8h, v2.8b, v24.8b 1009 umull v3.8h, v3.8b, v24.8b 1010 subs w4, w4, #4 1011 stp q0, q1, [x0] 1012 stp q2, q3, [x0, #32] 1013 add x0, x0, #64 1014 b.gt 8b 1015 ret 1016160: 1017 AARCH64_VALID_JUMP_TARGET 101816: 1019 ldr q1, [x1] 1020 ldr q3, [x1, x2] 1021 add x1, x1, x2, lsl #1 1022 ushll v0.8h, v1.8b, #4 1023 ushll2 v1.8h, v1.16b, #4 1024 ldr q5, [x1] 1025 ldr q7, [x1, x2] 1026 add x1, x1, x2, lsl #1 1027 umull v2.8h, v3.8b, v24.8b 1028 umull2 v3.8h, v3.16b, v24.16b 1029 ushll v4.8h, v5.8b, #4 1030 ushll2 v5.8h, v5.16b, #4 1031 umull v6.8h, v7.8b, v24.8b 1032 umull2 v7.8h, v7.16b, v24.16b 1033 subs w4, w4, #4 1034 stp q0, q1, [x0] 1035 stp q2, q3, [x0, #32] 1036 stp q4, q5, [x0, #64] 1037 stp q6, q7, [x0, #96] 1038 add x0, x0, #128 1039 b.gt 16b 1040 ret 1041320: 1042 AARCH64_VALID_JUMP_TARGET 104332: 1044 ldp q4, q5, [x1] 1045 add x1, x1, x2 1046 ldp q6, q7, [x1] 1047 add x1, x1, x2 1048 ushll v0.8h, v4.8b, #4 1049 ushll2 v1.8h, v4.16b, #4 1050 umull v2.8h, v5.8b, v24.8b 1051 umull2 v3.8h, v5.16b, v24.16b 1052 ushll v4.8h, v6.8b, #4 1053 ushll2 v5.8h, v6.16b, #4 1054 umull v6.8h, v7.8b, v24.8b 1055 umull2 v7.8h, v7.16b, v24.16b 1056 subs w4, w4, #2 1057 stp q0, q1, [x0] 1058 stp q2, q3, [x0, #32] 1059 stp q4, q5, [x0, #64] 1060 stp q6, q7, [x0, #96] 1061 add x0, x0, #128 1062 b.gt 32b 1063 ret 1064640: 1065 AARCH64_VALID_JUMP_TARGET 106664: 1067 ldp q4, q5, [x1] 1068 ldp q6, q7, [x1, #32] 1069 add x1, x1, x2 1070 ushll v0.8h, v4.8b, #4 1071 ushll2 v1.8h, v4.16b, #4 1072 umull v2.8h, v5.8b, v24.8b 1073 umull2 v3.8h, v5.16b, v24.16b 1074 ushll v4.8h, v6.8b, #4 1075 ushll2 v5.8h, v6.16b, #4 1076 umull v6.8h, v7.8b, v24.8b 1077 umull2 v7.8h, v7.16b, v24.16b 1078 subs w4, w4, #1 1079 stp q0, q1, [x0] 1080 stp q2, q3, [x0, #32] 1081 stp q4, q5, [x0, #64] 1082 stp q6, q7, [x0, #96] 1083 add x0, x0, #128 1084 b.gt 64b 1085 ret 10861280: 1087 AARCH64_VALID_JUMP_TARGET 1088128: 1089 ldp q28, q29, [x1] 1090 ldp q30, q31, [x1, #32] 1091 ushll v16.8h, v28.8b, #4 1092 ushll2 v17.8h, v28.16b, #4 1093 umull v18.8h, v29.8b, v24.8b 1094 umull2 v19.8h, v29.16b, v24.16b 1095 ushll v20.8h, v30.8b, #4 1096 ushll2 v21.8h, v30.16b, #4 1097 umull v22.8h, v31.8b, v24.8b 1098 umull2 v23.8h, v31.16b, v24.16b 1099 ldp q28, q29, [x1, #64] 1100 ldp q30, q31, [x1, #96] 1101 add x1, x1, x2 1102 stp q16, q17, [x0] 1103 stp q18, q19, [x0, #32] 1104 stp q20, q21, [x0, #64] 1105 stp q22, q23, [x0, #96] 1106 ushll v16.8h, v28.8b, #4 1107 ushll2 v17.8h, v28.16b, #4 1108 umull v18.8h, v29.8b, v24.8b 1109 umull2 v19.8h, v29.16b, v24.16b 1110 ushll v20.8h, v30.8b, #4 1111 ushll2 v21.8h, v30.16b, #4 1112 umull v22.8h, v31.8b, v24.8b 1113 umull2 v23.8h, v31.16b, v24.16b 1114 subs w4, w4, #1 1115 stp q16, q17, [x0, #128] 1116 stp q18, q19, [x0, #160] 1117 stp q20, q21, [x0, #192] 1118 stp q22, q23, [x0, #224] 1119 add x0, x0, #256 1120 b.gt 128b 1121 ret 1122endfunc 1123 1124jumptable prep_tbl 1125 .word 1280b - prep_tbl 1126 .word 640b - prep_tbl 1127 .word 320b - prep_tbl 1128 .word 160b - prep_tbl 1129 .word 80b - prep_tbl 1130 .word 40b - prep_tbl 1131endjumptable 1132 1133 1134.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1135 ld1 {\d0\wd}[0], [\s0], \strd 1136 ld1 {\d1\wd}[0], [\s1], \strd 1137.ifnb \d2 1138 ld1 {\d2\wd}[0], [\s0], \strd 1139 ld1 {\d3\wd}[0], [\s1], \strd 1140.endif 1141.ifnb \d4 1142 ld1 {\d4\wd}[0], [\s0], \strd 1143.endif 1144.ifnb \d5 1145 ld1 {\d5\wd}[0], [\s1], \strd 1146.endif 1147.ifnb \d6 1148 ld1 {\d6\wd}[0], [\s0], \strd 1149.endif 1150.endm 1151.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 1152 ld1 {\d0\wd}, [\s0], \strd 1153 ld1 {\d1\wd}, [\s1], \strd 1154.ifnb \d2 1155 ld1 {\d2\wd}, [\s0], \strd 1156 ld1 {\d3\wd}, [\s1], \strd 1157.endif 1158.ifnb \d4 1159 ld1 {\d4\wd}, [\s0], \strd 1160.endif 1161.ifnb \d5 1162 ld1 {\d5\wd}, [\s1], \strd 1163.endif 1164.ifnb \d6 1165 ld1 {\d6\wd}, [\s0], \strd 1166.endif 1167.endm 1168.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1169 load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1170.endm 1171.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1172 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1173.endm 1174.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1175 load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1176.endm 1177.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 1178 load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 1179.endm 1180.macro interleave_1 wd, r0, r1, r2, r3, r4 1181 trn1 \r0\wd, \r0\wd, \r1\wd 1182 trn1 \r1\wd, \r1\wd, \r2\wd 1183.ifnb \r3 1184 trn1 \r2\wd, \r2\wd, \r3\wd 1185 trn1 \r3\wd, \r3\wd, \r4\wd 1186.endif 1187.endm 1188.macro interleave_1_h r0, r1, r2, r3, r4 1189 interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 1190.endm 1191.macro interleave_1_s r0, r1, r2, r3, r4 1192 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 1193.endm 1194.macro interleave_2 wd, r0, r1, r2, r3, r4, r5 1195 trn1 \r0\wd, \r0\wd, \r2\wd 1196 trn1 \r1\wd, \r1\wd, \r3\wd 1197 trn1 \r2\wd, \r2\wd, \r4\wd 1198 trn1 \r3\wd, \r3\wd, \r5\wd 1199.endm 1200.macro interleave_2_s r0, r1, r2, r3, r4, r5 1201 interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 1202.endm 1203.macro uxtl_b r0, r1, r2, r3, r4, r5, r6 1204 uxtl \r0\().8h, \r0\().8b 1205 uxtl \r1\().8h, \r1\().8b 1206.ifnb \r2 1207 uxtl \r2\().8h, \r2\().8b 1208 uxtl \r3\().8h, \r3\().8b 1209.endif 1210.ifnb \r4 1211 uxtl \r4\().8h, \r4\().8b 1212.endif 1213.ifnb \r5 1214 uxtl \r5\().8h, \r5\().8b 1215.endif 1216.ifnb \r6 1217 uxtl \r6\().8h, \r6\().8b 1218.endif 1219.endm 1220.macro mul_mla_4tap d, s0, s1, s2, s3, wd 1221 mul \d\wd, \s0\wd, v0.h[0] 1222 mla \d\wd, \s1\wd, v0.h[1] 1223 mla \d\wd, \s2\wd, v0.h[2] 1224 mla \d\wd, \s3\wd, v0.h[3] 1225.endm 1226// Interleaving the mul/mla chains actually hurts performance 1227// significantly on Cortex A53, thus keeping mul/mla tightly 1228// chained like this. 1229.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 1230 mul \d0\().4h, \s1\().4h, v0.h[1] 1231 mla \d0\().4h, \s2\().4h, v0.h[2] 1232 mla \d0\().4h, \s3\().4h, v0.h[3] 1233 mla \d0\().4h, \s4\().4h, v0.h[4] 1234 mla \d0\().4h, \s5\().4h, v0.h[5] 1235 mla \d0\().4h, \s6\().4h, v0.h[6] 1236.endm 1237.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 1238 mul \d0\().8h, \s1\().8h, v0.h[1] 1239 mla \d0\().8h, \s2\().8h, v0.h[2] 1240 mla \d0\().8h, \s3\().8h, v0.h[3] 1241 mla \d0\().8h, \s4\().8h, v0.h[4] 1242 mla \d0\().8h, \s5\().8h, v0.h[5] 1243 mla \d0\().8h, \s6\().8h, v0.h[6] 1244.endm 1245.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 1246 mul \d0\().8h, \s1\().8h, v0.h[1] 1247 mla \d0\().8h, \s2\().8h, v0.h[2] 1248 mla \d0\().8h, \s3\().8h, v0.h[3] 1249 mla \d0\().8h, \s4\().8h, v0.h[4] 1250 mla \d0\().8h, \s5\().8h, v0.h[5] 1251 mla \d0\().8h, \s6\().8h, v0.h[6] 1252 mul \d1\().8h, \s2\().8h, v0.h[1] 1253 mla \d1\().8h, \s3\().8h, v0.h[2] 1254 mla \d1\().8h, \s4\().8h, v0.h[3] 1255 mla \d1\().8h, \s5\().8h, v0.h[4] 1256 mla \d1\().8h, \s6\().8h, v0.h[5] 1257 mla \d1\().8h, \s7\().8h, v0.h[6] 1258.endm 1259.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 1260 mul \d0\().8h, \s1\().8h, v0.h[1] 1261 mla \d0\().8h, \s2\().8h, v0.h[2] 1262 mla \d0\().8h, \s3\().8h, v0.h[3] 1263 mla \d0\().8h, \s4\().8h, v0.h[4] 1264 mla \d0\().8h, \s5\().8h, v0.h[5] 1265 mla \d0\().8h, \s6\().8h, v0.h[6] 1266 mul \d1\().8h, \s3\().8h, v0.h[1] 1267 mla \d1\().8h, \s4\().8h, v0.h[2] 1268 mla \d1\().8h, \s5\().8h, v0.h[3] 1269 mla \d1\().8h, \s6\().8h, v0.h[4] 1270 mla \d1\().8h, \s7\().8h, v0.h[5] 1271 mla \d1\().8h, \s8\().8h, v0.h[6] 1272.endm 1273.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 1274 mul \d0\().4h, \s0\().4h, v0.h[0] 1275 mla \d0\().4h, \s1\().4h, v0.h[1] 1276 mla \d0\().4h, \s2\().4h, v0.h[2] 1277 mla \d0\().4h, \s3\().4h, v0.h[3] 1278 mla \d0\().4h, \s4\().4h, v0.h[4] 1279 mla \d0\().4h, \s5\().4h, v0.h[5] 1280 mla \d0\().4h, \s6\().4h, v0.h[6] 1281 mla \d0\().4h, \s7\().4h, v0.h[7] 1282.endm 1283.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 1284 mul \d0\().8h, \s0\().8h, v0.h[0] 1285 mla \d0\().8h, \s1\().8h, v0.h[1] 1286 mla \d0\().8h, \s2\().8h, v0.h[2] 1287 mla \d0\().8h, \s3\().8h, v0.h[3] 1288 mla \d0\().8h, \s4\().8h, v0.h[4] 1289 mla \d0\().8h, \s5\().8h, v0.h[5] 1290 mla \d0\().8h, \s6\().8h, v0.h[6] 1291 mla \d0\().8h, \s7\().8h, v0.h[7] 1292.endm 1293.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 1294 mul \d0\().8h, \s0\().8h, v0.h[0] 1295 mla \d0\().8h, \s1\().8h, v0.h[1] 1296 mla \d0\().8h, \s2\().8h, v0.h[2] 1297 mla \d0\().8h, \s3\().8h, v0.h[3] 1298 mla \d0\().8h, \s4\().8h, v0.h[4] 1299 mla \d0\().8h, \s5\().8h, v0.h[5] 1300 mla \d0\().8h, \s6\().8h, v0.h[6] 1301 mla \d0\().8h, \s7\().8h, v0.h[7] 1302 mul \d1\().8h, \s1\().8h, v0.h[0] 1303 mla \d1\().8h, \s2\().8h, v0.h[1] 1304 mla \d1\().8h, \s3\().8h, v0.h[2] 1305 mla \d1\().8h, \s4\().8h, v0.h[3] 1306 mla \d1\().8h, \s5\().8h, v0.h[4] 1307 mla \d1\().8h, \s6\().8h, v0.h[5] 1308 mla \d1\().8h, \s7\().8h, v0.h[6] 1309 mla \d1\().8h, \s8\().8h, v0.h[7] 1310.endm 1311.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 1312 mul \d0\().8h, \s0\().8h, v0.h[0] 1313 mla \d0\().8h, \s1\().8h, v0.h[1] 1314 mla \d0\().8h, \s2\().8h, v0.h[2] 1315 mla \d0\().8h, \s3\().8h, v0.h[3] 1316 mla \d0\().8h, \s4\().8h, v0.h[4] 1317 mla \d0\().8h, \s5\().8h, v0.h[5] 1318 mla \d0\().8h, \s6\().8h, v0.h[6] 1319 mla \d0\().8h, \s7\().8h, v0.h[7] 1320 mul \d1\().8h, \s2\().8h, v0.h[0] 1321 mla \d1\().8h, \s3\().8h, v0.h[1] 1322 mla \d1\().8h, \s4\().8h, v0.h[2] 1323 mla \d1\().8h, \s5\().8h, v0.h[3] 1324 mla \d1\().8h, \s6\().8h, v0.h[4] 1325 mla \d1\().8h, \s7\().8h, v0.h[5] 1326 mla \d1\().8h, \s8\().8h, v0.h[6] 1327 mla \d1\().8h, \s9\().8h, v0.h[7] 1328.endm 1329.macro sqrshrun_b shift, r0, r1, r2, r3 1330 sqrshrun \r0\().8b, \r0\().8h, #\shift 1331.ifnb \r1 1332 sqrshrun \r1\().8b, \r1\().8h, #\shift 1333.endif 1334.ifnb \r2 1335 sqrshrun \r2\().8b, \r2\().8h, #\shift 1336 sqrshrun \r3\().8b, \r3\().8h, #\shift 1337.endif 1338.endm 1339.macro srshr_h shift, r0, r1, r2, r3 1340 srshr \r0\().8h, \r0\().8h, #\shift 1341.ifnb \r1 1342 srshr \r1\().8h, \r1\().8h, #\shift 1343.endif 1344.ifnb \r2 1345 srshr \r2\().8h, \r2\().8h, #\shift 1346 srshr \r3\().8h, \r3\().8h, #\shift 1347.endif 1348.endm 1349.macro st_h strd, reg, lanes 1350 st1 {\reg\().h}[0], [x0], \strd 1351 st1 {\reg\().h}[1], [x8], \strd 1352.if \lanes > 2 1353 st1 {\reg\().h}[2], [x0], \strd 1354 st1 {\reg\().h}[3], [x8], \strd 1355.endif 1356.endm 1357.macro st_s strd, r0, r1 1358 st1 {\r0\().s}[0], [x0], \strd 1359 st1 {\r0\().s}[1], [x8], \strd 1360.ifnb \r1 1361 st1 {\r1\().s}[0], [x0], \strd 1362 st1 {\r1\().s}[1], [x8], \strd 1363.endif 1364.endm 1365.macro st_d strd, r0, r1 1366 st1 {\r0\().8b}, [x0], \strd 1367 st1 {\r0\().d}[1], [x8], \strd 1368.ifnb \r1 1369 st1 {\r1\().8b}, [x0], \strd 1370 st1 {\r1\().d}[1], [x8], \strd 1371.endif 1372.endm 1373.macro shift_store_4 type, strd, r0, r1 1374.ifc \type, put 1375 sqrshrun_b 6, \r0, \r1 1376 st_s \strd, \r0, \r1 1377.else 1378 srshr_h 2, \r0, \r1 1379 st_d \strd, \r0, \r1 1380.endif 1381.endm 1382.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 1383 st1 {\r0\wd}, [x0], \strd 1384 st1 {\r1\wd}, [x8], \strd 1385.ifnb \r2 1386 st1 {\r2\wd}, [x0], \strd 1387 st1 {\r3\wd}, [x8], \strd 1388.endif 1389.ifnb \r4 1390 st1 {\r4\wd}, [x0], \strd 1391 st1 {\r5\wd}, [x8], \strd 1392 st1 {\r6\wd}, [x0], \strd 1393 st1 {\r7\wd}, [x8], \strd 1394.endif 1395.endm 1396.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 1397 st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 1398.endm 1399.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 1400 st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 1401.endm 1402.macro shift_store_8 type, strd, r0, r1, r2, r3 1403.ifc \type, put 1404 sqrshrun_b 6, \r0, \r1, \r2, \r3 1405 st_8b \strd, \r0, \r1, \r2, \r3 1406.else 1407 srshr_h 2, \r0, \r1, \r2, \r3 1408 st_16b \strd, \r0, \r1, \r2, \r3 1409.endif 1410.endm 1411.macro shift_store_16 type, strd, r0, r1, r2, r3 1412.ifc \type, put 1413 sqrshrun \r0\().8b, \r0\().8h, #6 1414 sqrshrun2 \r0\().16b, \r1\().8h, #6 1415 sqrshrun \r2\().8b, \r2\().8h, #6 1416 sqrshrun2 \r2\().16b, \r3\().8h, #6 1417 st_16b \strd, \r0, \r2 1418.else 1419 srshr_h 2, \r0, \r1, \r2, \r3 1420 st1 {\r0\().8h, \r1\().8h}, [x0], \strd 1421 st1 {\r2\().8h, \r3\().8h}, [x8], \strd 1422.endif 1423.endm 1424 1425.macro make_8tap_fn op, type, type_h, type_v, taps 1426function \op\()_8tap_\type\()_8bpc_neon, export=1 1427 mov x8, \type_h 1428 mov x9, \type_v 1429 b \op\()_\taps\()_neon 1430endfunc 1431.endm 1432 1433// No spaces in these expressions, due to gas-preprocessor. 1434#define REGULAR ((0*15<<7)|3*15) 1435#define SMOOTH ((1*15<<7)|4*15) 1436#define SHARP ((2*15<<7)|3*15) 1437 1438.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps 1439function \type\()_\taps\()_neon 1440 mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) 1441 mul \mx, \mx, w10 1442 mul \my, \my, w10 1443 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h 1444 add \my, \my, w9 // my, 8tap_v, 4tap_v 1445.ifc \type, prep 1446 uxtw \d_strd, \w 1447 lsl \d_strd, \d_strd, #1 1448.endif 1449 1450 clz w8, \w 1451 tst \mx, #(0x7f << 14) 1452 sub w8, w8, #24 1453 movrel x10, X(mc_subpel_filters), -8 1454 b.ne L(\type\()_\taps\()_h) 1455 tst \my, #(0x7f << 14) 1456 b.ne L(\type\()_\taps\()_v) 1457 b \type\()_neon 1458 1459L(\type\()_\taps\()_h): 1460 cmp \w, #4 1461 ubfx w9, \mx, #7, #7 1462 and \mx, \mx, #0x7f 1463 b.le 4f 1464 mov \mx, w9 14654: 1466 tst \my, #(0x7f << 14) 1467 add \xmx, x10, \mx, uxtw #3 1468 b.ne L(\type\()_\taps\()_hv) 1469 1470 movrel x9, \type\()_\taps\()_h_tbl 1471 ldrsw x8, [x9, x8, lsl #2] 1472 add x9, x9, x8 1473 br x9 1474 147520: // 2xN h 1476 AARCH64_VALID_JUMP_TARGET 1477.ifc \type, put 1478 ldur s0, [\xmx, #2] 1479 sub \src, \src, #1 1480 add \ds2, \dst, \d_strd 1481 add \sr2, \src, \s_strd 1482 lsl \d_strd, \d_strd, #1 1483 lsl \s_strd, \s_strd, #1 1484 sxtl v0.8h, v0.8b 14852: 1486 ld1 {v4.8b}, [\src], \s_strd 1487 ld1 {v6.8b}, [\sr2], \s_strd 1488 uxtl v4.8h, v4.8b 1489 uxtl v6.8h, v6.8b 1490 ext v5.16b, v4.16b, v4.16b, #2 1491 ext v7.16b, v6.16b, v6.16b, #2 1492 subs \h, \h, #2 1493 trn1 v3.2s, v4.2s, v6.2s 1494 trn2 v6.2s, v4.2s, v6.2s 1495 trn1 v4.2s, v5.2s, v7.2s 1496 trn2 v7.2s, v5.2s, v7.2s 1497 mul v3.4h, v3.4h, v0.h[0] 1498 mla v3.4h, v4.4h, v0.h[1] 1499 mla v3.4h, v6.4h, v0.h[2] 1500 mla v3.4h, v7.4h, v0.h[3] 1501 srshr v3.4h, v3.4h, #2 1502 sqrshrun v3.8b, v3.8h, #4 1503 st1 {v3.h}[0], [\dst], \d_strd 1504 st1 {v3.h}[1], [\ds2], \d_strd 1505 b.gt 2b 1506 ret 1507.endif 1508 150940: // 4xN h 1510 AARCH64_VALID_JUMP_TARGET 1511 ldur s0, [\xmx, #2] 1512 sub \src, \src, #1 1513 add \ds2, \dst, \d_strd 1514 add \sr2, \src, \s_strd 1515 lsl \d_strd, \d_strd, #1 1516 lsl \s_strd, \s_strd, #1 1517 sxtl v0.8h, v0.8b 15184: 1519 ld1 {v16.8b}, [\src], \s_strd 1520 ld1 {v20.8b}, [\sr2], \s_strd 1521 uxtl v16.8h, v16.8b 1522 uxtl v20.8h, v20.8b 1523 ext v17.16b, v16.16b, v16.16b, #2 1524 ext v18.16b, v16.16b, v16.16b, #4 1525 ext v19.16b, v16.16b, v16.16b, #6 1526 ext v21.16b, v20.16b, v20.16b, #2 1527 ext v22.16b, v20.16b, v20.16b, #4 1528 ext v23.16b, v20.16b, v20.16b, #6 1529 subs \h, \h, #2 1530 mul v16.4h, v16.4h, v0.h[0] 1531 mla v16.4h, v17.4h, v0.h[1] 1532 mla v16.4h, v18.4h, v0.h[2] 1533 mla v16.4h, v19.4h, v0.h[3] 1534 mul v20.4h, v20.4h, v0.h[0] 1535 mla v20.4h, v21.4h, v0.h[1] 1536 mla v20.4h, v22.4h, v0.h[2] 1537 mla v20.4h, v23.4h, v0.h[3] 1538 srshr v16.4h, v16.4h, #2 1539 srshr v20.4h, v20.4h, #2 1540.ifc \type, put 1541 sqrshrun v16.8b, v16.8h, #4 1542 sqrshrun v20.8b, v20.8h, #4 1543 str s16, [\dst] 1544 str s20, [\ds2] 1545 add \dst, \dst, \d_strd 1546 add \ds2, \ds2, \d_strd 1547.else 1548 st1 {v16.4h}, [\dst], \d_strd 1549 st1 {v20.4h}, [\ds2], \d_strd 1550.endif 1551 b.gt 4b 1552 ret 1553 155480: // 8xN h 1555 AARCH64_VALID_JUMP_TARGET 1556 ld1 {v0.8b}, [\xmx] 1557.ifc \taps, 6tap 1558 sub \src, \src, #2 1559.else 1560 sub \src, \src, #3 1561.endif 1562 add \ds2, \dst, \d_strd 1563 add \sr2, \src, \s_strd 1564 lsl \d_strd, \d_strd, #1 1565 lsl \s_strd, \s_strd, #1 1566 sxtl v0.8h, v0.8b 15678: 1568 ld1 {v16.8b, v17.8b}, [\src], \s_strd 1569 ld1 {v20.8b, v21.8b}, [\sr2], \s_strd 1570 uxtl v16.8h, v16.8b 1571 uxtl v17.8h, v17.8b 1572 uxtl v20.8h, v20.8b 1573 uxtl v21.8h, v21.8b 1574 1575.ifc \taps, 6tap 1576 mul v18.8h, v16.8h, v0.h[1] 1577 mul v22.8h, v20.8h, v0.h[1] 1578 .irpc i, 23456 1579 ext v19.16b, v16.16b, v17.16b, #(2*\i-2) 1580 ext v23.16b, v20.16b, v21.16b, #(2*\i-2) 1581 mla v18.8h, v19.8h, v0.h[\i] 1582 mla v22.8h, v23.8h, v0.h[\i] 1583 .endr 1584.else // 8tap 1585 mul v18.8h, v16.8h, v0.h[0] 1586 mul v22.8h, v20.8h, v0.h[0] 1587 .irpc i, 1234567 1588 ext v19.16b, v16.16b, v17.16b, #(2*\i) 1589 ext v23.16b, v20.16b, v21.16b, #(2*\i) 1590 mla v18.8h, v19.8h, v0.h[\i] 1591 mla v22.8h, v23.8h, v0.h[\i] 1592 .endr 1593.endif 1594 subs \h, \h, #2 1595 srshr v18.8h, v18.8h, #2 1596 srshr v22.8h, v22.8h, #2 1597.ifc \type, put 1598 sqrshrun v18.8b, v18.8h, #4 1599 sqrshrun v22.8b, v22.8h, #4 1600 st1 {v18.8b}, [\dst], \d_strd 1601 st1 {v22.8b}, [\ds2], \d_strd 1602.else 1603 st1 {v18.8h}, [\dst], \d_strd 1604 st1 {v22.8h}, [\ds2], \d_strd 1605.endif 1606 b.gt 8b 1607 ret 1608160: 1609320: 1610640: 16111280: // 16xN, 32xN, ... h 1612 AARCH64_VALID_JUMP_TARGET 1613 ld1 {v0.8b}, [\xmx] 1614.ifc \taps, 6tap 1615 sub \src, \src, #2 1616.else 1617 sub \src, \src, #3 1618.endif 1619 add \ds2, \dst, \d_strd 1620 add \sr2, \src, \s_strd 1621 lsl \s_strd, \s_strd, #1 1622 sxtl v0.8h, v0.8b 1623 1624 sub \s_strd, \s_strd, \w, uxtw 1625 sub \s_strd, \s_strd, #8 1626.ifc \type, put 1627 lsl \d_strd, \d_strd, #1 1628 sub \d_strd, \d_strd, \w, uxtw 1629.endif 1630161: 1631 ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 1632 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 1633 mov \mx, \w 1634 uxtl v16.8h, v16.8b 1635 uxtl v17.8h, v17.8b 1636 uxtl v18.8h, v18.8b 1637 uxtl v20.8h, v20.8b 1638 uxtl v21.8h, v21.8b 1639 uxtl v22.8h, v22.8b 1640 164116: 1642.ifc \taps, 6tap 1643 mul v24.8h, v16.8h, v0.h[1] 1644 mul v25.8h, v17.8h, v0.h[1] 1645 mul v26.8h, v20.8h, v0.h[1] 1646 mul v27.8h, v21.8h, v0.h[1] 1647 .irpc i, 23456 1648 ext v28.16b, v16.16b, v17.16b, #(2*\i-2) 1649 ext v29.16b, v17.16b, v18.16b, #(2*\i-2) 1650 ext v30.16b, v20.16b, v21.16b, #(2*\i-2) 1651 ext v31.16b, v21.16b, v22.16b, #(2*\i-2) 1652 mla v24.8h, v28.8h, v0.h[\i] 1653 mla v25.8h, v29.8h, v0.h[\i] 1654 mla v26.8h, v30.8h, v0.h[\i] 1655 mla v27.8h, v31.8h, v0.h[\i] 1656 .endr 1657.else // 8tap 1658 mul v24.8h, v16.8h, v0.h[0] 1659 mul v25.8h, v17.8h, v0.h[0] 1660 mul v26.8h, v20.8h, v0.h[0] 1661 mul v27.8h, v21.8h, v0.h[0] 1662 .irpc i, 1234567 1663 ext v28.16b, v16.16b, v17.16b, #(2*\i) 1664 ext v29.16b, v17.16b, v18.16b, #(2*\i) 1665 ext v30.16b, v20.16b, v21.16b, #(2*\i) 1666 ext v31.16b, v21.16b, v22.16b, #(2*\i) 1667 mla v24.8h, v28.8h, v0.h[\i] 1668 mla v25.8h, v29.8h, v0.h[\i] 1669 mla v26.8h, v30.8h, v0.h[\i] 1670 mla v27.8h, v31.8h, v0.h[\i] 1671 .endr 1672.endif 1673 srshr v24.8h, v24.8h, #2 1674 srshr v25.8h, v25.8h, #2 1675 srshr v26.8h, v26.8h, #2 1676 srshr v27.8h, v27.8h, #2 1677 subs \mx, \mx, #16 1678.ifc \type, put 1679 sqrshrun v24.8b, v24.8h, #4 1680 sqrshrun2 v24.16b, v25.8h, #4 1681 sqrshrun v26.8b, v26.8h, #4 1682 sqrshrun2 v26.16b, v27.8h, #4 1683 st1 {v24.16b}, [\dst], #16 1684 st1 {v26.16b}, [\ds2], #16 1685.else 1686 st1 {v24.8h, v25.8h}, [\dst], #32 1687 st1 {v26.8h, v27.8h}, [\ds2], #32 1688.endif 1689 b.le 9f 1690 1691 mov v16.16b, v18.16b 1692 mov v20.16b, v22.16b 1693 ld1 {v17.8b, v18.8b}, [\src], #16 1694 ld1 {v21.8b, v22.8b}, [\sr2], #16 1695 uxtl v17.8h, v17.8b 1696 uxtl v18.8h, v18.8b 1697 uxtl v21.8h, v21.8b 1698 uxtl v22.8h, v22.8b 1699 b 16b 1700 17019: 1702 add \dst, \dst, \d_strd 1703 add \ds2, \ds2, \d_strd 1704 add \src, \src, \s_strd 1705 add \sr2, \sr2, \s_strd 1706 1707 subs \h, \h, #2 1708 b.gt 161b 1709 ret 1710endfunc 1711 1712jumptable \type\()_\taps\()_h_tbl 1713 .word 1280b - \type\()_\taps\()_h_tbl 1714 .word 640b - \type\()_\taps\()_h_tbl 1715 .word 320b - \type\()_\taps\()_h_tbl 1716 .word 160b - \type\()_\taps\()_h_tbl 1717 .word 80b - \type\()_\taps\()_h_tbl 1718 .word 40b - \type\()_\taps\()_h_tbl 1719 .word 20b - \type\()_\taps\()_h_tbl 1720endjumptable 1721 1722function L(\type\()_\taps\()_v) 1723 cmp \h, #4 1724 ubfx w9, \my, #7, #7 1725 and \my, \my, #0x7f 1726 b.le 4f 1727 mov \my, w9 17284: 1729 add \xmy, x10, \my, uxtw #3 1730 1731 movrel x9, \type\()_\taps\()_v_tbl 1732 ldrsw x8, [x9, x8, lsl #2] 1733 add x9, x9, x8 1734 br x9 1735 173620: // 2xN v 1737 AARCH64_VALID_JUMP_TARGET 1738.ifc \type, put 1739 b.gt 28f 1740 1741 cmp \h, #2 1742 ldur s0, [\xmy, #2] 1743 sub \src, \src, \s_strd 1744 add \ds2, \dst, \d_strd 1745 add \sr2, \src, \s_strd 1746 lsl \s_strd, \s_strd, #1 1747 lsl \d_strd, \d_strd, #1 1748 sxtl v0.8h, v0.8b 1749 1750 // 2x2 v 1751 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1752 interleave_1_h v1, v2, v3, v4, v5 1753 b.gt 24f 1754 uxtl_b v1, v2, v3, v4 1755 mul_mla_4tap v6, v1, v2, v3, v4, .4h 1756 sqrshrun_b 6, v6 1757 st_h \d_strd, v6, 2 1758 ret 1759 176024: // 2x4 v 1761 load_h \sr2, \src, \s_strd, v6, v7 1762 interleave_1_h v5, v6, v7 1763 interleave_2_s v1, v2, v3, v4, v5, v6 1764 uxtl_b v1, v2, v3, v4 1765 mul_mla_4tap v6, v1, v2, v3, v4, .8h 1766 sqrshrun_b 6, v6 1767 st_h \d_strd, v6, 4 1768 ret 1769 177028: // 2x6, 2x8, 2x12, 2x16 v 1771 ld1 {v0.8b}, [\xmy] 1772 sub \sr2, \src, \s_strd, lsl #1 1773 add \ds2, \dst, \d_strd 1774 sub \src, \sr2, \s_strd 1775 lsl \d_strd, \d_strd, #1 1776 lsl \s_strd, \s_strd, #1 1777 sxtl v0.8h, v0.8b 1778 1779 load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 1780 interleave_1_h v1, v2, v3, v4, v5 1781 interleave_1_h v5, v6, v7 1782 interleave_2_s v1, v2, v3, v4, v5, v6 1783 uxtl_b v1, v2, v3, v4 1784216: 1785 subs \h, \h, #4 1786 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 1787 interleave_1_h v7, v16, v17, v18, v19 1788 interleave_2_s v5, v6, v7, v16, v17, v18 1789 uxtl_b v5, v6, v7, v16 1790 mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 1791 sqrshrun_b 6, v30 1792 st_h \d_strd, v30, 4 1793 b.le 0f 1794 cmp \h, #2 1795 mov v1.16b, v5.16b 1796 mov v2.16b, v6.16b 1797 mov v3.16b, v7.16b 1798 mov v4.16b, v16.16b 1799 mov v5.16b, v17.16b 1800 mov v6.16b, v18.16b 1801 mov v7.16b, v19.16b 1802 b.eq 26f 1803 b 216b 180426: 1805 load_h \sr2, \src, \s_strd, v16, v17 1806 interleave_1_h v7, v16, v17 1807 uxtl_b v5, v6, v7, v16 1808 mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 1809 sqrshrun_b 6, v30 1810 st_h \d_strd, v30, 2 18110: 1812 ret 1813.endif 1814 181540: 1816 AARCH64_VALID_JUMP_TARGET 1817 b.gt 480f 1818 1819 // 4x2, 4x4 v 1820 cmp \h, #2 1821 ldur s0, [\xmy, #2] 1822 sub \src, \src, \s_strd 1823 add \ds2, \dst, \d_strd 1824 add \sr2, \src, \s_strd 1825 lsl \s_strd, \s_strd, #1 1826 lsl \d_strd, \d_strd, #1 1827 sxtl v0.8h, v0.8b 1828 1829 load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1830 interleave_1_s v1, v2, v3, v4, v5 1831 uxtl_b v1, v2, v3, v4 1832 mul_mla_4tap v6, v1, v2, v3, v4, .8h 1833 shift_store_4 \type, \d_strd, v6 1834 b.le 0f 1835 load_s \sr2, \src, \s_strd, v6, v7 1836 interleave_1_s v5, v6, v7 1837 uxtl_b v5, v6 1838 mul_mla_4tap v7, v3, v4, v5, v6, .8h 1839 shift_store_4 \type, \d_strd, v7 18400: 1841 ret 1842 1843480: // 4x6, 4x8, 4x12, 4x16 v 1844 ld1 {v0.8b}, [\xmy] 1845 sub \sr2, \src, \s_strd, lsl #1 1846 add \ds2, \dst, \d_strd 1847 sub \src, \sr2, \s_strd 1848 lsl \s_strd, \s_strd, #1 1849 lsl \d_strd, \d_strd, #1 1850 sxtl v0.8h, v0.8b 1851 1852 load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1853 interleave_1_s v16, v17, v18 1854 interleave_1_s v18, v19, v20, v21, v22 1855 uxtl_b v16, v17 1856 uxtl_b v18, v19, v20, v21 1857 185848: 1859 subs \h, \h, #4 1860 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 1861 interleave_1_s v22, v23, v24, v25, v26 1862 uxtl_b v22, v23, v24, v25 1863 mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 1864 shift_store_4 \type, \d_strd, v1, v2 1865 b.le 0f 1866 load_s \sr2, \src, \s_strd, v27, v16 1867 subs \h, \h, #2 1868 interleave_1_s v26, v27, v16 1869 uxtl_b v26, v27 1870 mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 1871 shift_store_4 \type, \d_strd, v1 1872 b.le 0f 1873 load_s \sr2, \src, \s_strd, v17, v18 1874 subs \h, \h, #2 1875 interleave_1_s v16, v17, v18 1876 uxtl_b v16, v17 1877 mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 1878 shift_store_4 \type, \d_strd, v2 1879 b.le 0f 1880 subs \h, \h, #4 1881 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 1882 interleave_1_s v18, v19, v20, v21, v22 1883 uxtl_b v18, v19, v20, v21 1884 mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 1885 shift_store_4 \type, \d_strd, v1, v2 1886 b.gt 48b 18870: 1888 ret 1889 189080: 1891 AARCH64_VALID_JUMP_TARGET 1892 b.gt 880f 1893 1894 // 8x2, 8x4 v 1895 cmp \h, #2 1896 ldur s0, [\xmy, #2] 1897 sub \src, \src, \s_strd 1898 add \ds2, \dst, \d_strd 1899 add \sr2, \src, \s_strd 1900 lsl \s_strd, \s_strd, #1 1901 lsl \d_strd, \d_strd, #1 1902 sxtl v0.8h, v0.8b 1903 1904 load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 1905 uxtl_b v1, v2, v3, v4, v5 1906 mul_mla_4tap v6, v1, v2, v3, v4, .8h 1907 mul_mla_4tap v7, v2, v3, v4, v5, .8h 1908 shift_store_8 \type, \d_strd, v6, v7 1909 b.le 0f 1910 load_8b \sr2, \src, \s_strd, v6, v7 1911 uxtl_b v6, v7 1912 mul_mla_4tap v1, v3, v4, v5, v6, .8h 1913 mul_mla_4tap v2, v4, v5, v6, v7, .8h 1914 shift_store_8 \type, \d_strd, v1, v2 19150: 1916 ret 1917 1918880: // 8x6, 8x8, 8x16, 8x32 v 19191680: // 16x8, 16x16, ... 1920320: // 32x8, 32x16, ... 1921640: 19221280: 1923 AARCH64_VALID_JUMP_TARGET 1924 ld1 {v0.8b}, [\xmy] 1925 sub \src, \src, \s_strd 1926 sub \src, \src, \s_strd, lsl #1 1927 sxtl v0.8h, v0.8b 1928 mov \my, \h 1929168: 1930 add \ds2, \dst, \d_strd 1931 add \sr2, \src, \s_strd 1932 lsl \s_strd, \s_strd, #1 1933 lsl \d_strd, \d_strd, #1 1934 1935 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 1936 uxtl_b v16, v17, v18, v19, v20, v21, v22 1937 193888: 1939 subs \h, \h, #2 1940 load_8b \sr2, \src, \s_strd, v23, v24 1941 uxtl_b v23, v24 1942 mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 1943 shift_store_8 \type, \d_strd, v1, v2 1944 b.le 9f 1945 subs \h, \h, #2 1946 load_8b \sr2, \src, \s_strd, v25, v26 1947 uxtl_b v25, v26 1948 mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 1949 shift_store_8 \type, \d_strd, v3, v4 1950 b.le 9f 1951 subs \h, \h, #2 1952 load_8b \sr2, \src, \s_strd, v27, v16 1953 uxtl_b v27, v16 1954 mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 1955 shift_store_8 \type, \d_strd, v1, v2 1956 b.le 9f 1957 subs \h, \h, #2 1958 load_8b \sr2, \src, \s_strd, v17, v18 1959 uxtl_b v17, v18 1960 mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 1961 shift_store_8 \type, \d_strd, v3, v4 1962 b.le 9f 1963 subs \h, \h, #4 1964 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 1965 uxtl_b v19, v20, v21, v22 1966 mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 1967 mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 1968 shift_store_8 \type, \d_strd, v1, v2, v3, v4 1969 b.gt 88b 19709: 1971 subs \w, \w, #8 1972 b.le 0f 1973 asr \s_strd, \s_strd, #1 1974 asr \d_strd, \d_strd, #1 1975 msub \src, \s_strd, \xmy, \src 1976 msub \dst, \d_strd, \xmy, \dst 1977 sub \src, \src, \s_strd, lsl #3 1978 mov \h, \my 1979 add \src, \src, #8 1980.ifc \type, put 1981 add \dst, \dst, #8 1982.else 1983 add \dst, \dst, #16 1984.endif 1985 b 168b 19860: 1987 ret 1988 1989160: 1990 AARCH64_VALID_JUMP_TARGET 1991 b.gt 1680b 1992 1993 // 16x2, 16x4 v 1994 ldur s0, [\xmy, #2] 1995 sub \src, \src, \s_strd 1996 add \ds2, \dst, \d_strd 1997 add \sr2, \src, \s_strd 1998 lsl \s_strd, \s_strd, #1 1999 lsl \d_strd, \d_strd, #1 2000 sxtl v0.8h, v0.8b 2001 2002 cmp \h, #2 2003 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 2004 uxtl v16.8h, v1.8b 2005 uxtl v17.8h, v2.8b 2006 uxtl v18.8h, v3.8b 2007 uxtl v19.8h, v4.8b 2008 uxtl v20.8h, v5.8b 2009 uxtl2 v23.8h, v1.16b 2010 uxtl2 v24.8h, v2.16b 2011 uxtl2 v25.8h, v3.16b 2012 uxtl2 v26.8h, v4.16b 2013 uxtl2 v27.8h, v5.16b 2014 mul_mla_4tap v1, v16, v17, v18, v19, .8h 2015 mul_mla_4tap v16, v17, v18, v19, v20, .8h 2016 mul_mla_4tap v2, v23, v24, v25, v26, .8h 2017 mul_mla_4tap v17, v24, v25, v26, v27, .8h 2018 shift_store_16 \type, \d_strd, v1, v2, v16, v17 2019 b.le 0f 2020 load_16b \sr2, \src, \s_strd, v6, v7 2021 uxtl v21.8h, v6.8b 2022 uxtl v22.8h, v7.8b 2023 uxtl2 v28.8h, v6.16b 2024 uxtl2 v29.8h, v7.16b 2025 mul_mla_4tap v1, v18, v19, v20, v21, .8h 2026 mul_mla_4tap v3, v19, v20, v21, v22, .8h 2027 mul_mla_4tap v2, v25, v26, v27, v28, .8h 2028 mul_mla_4tap v4, v26, v27, v28, v29, .8h 2029 shift_store_16 \type, \d_strd, v1, v2, v3, v4 20300: 2031 ret 2032endfunc 2033 2034jumptable \type\()_\taps\()_v_tbl 2035 .word 1280b - \type\()_\taps\()_v_tbl 2036 .word 640b - \type\()_\taps\()_v_tbl 2037 .word 320b - \type\()_\taps\()_v_tbl 2038 .word 160b - \type\()_\taps\()_v_tbl 2039 .word 80b - \type\()_\taps\()_v_tbl 2040 .word 40b - \type\()_\taps\()_v_tbl 2041 .word 20b - \type\()_\taps\()_v_tbl 2042endjumptable 2043 2044function L(\type\()_\taps\()_hv) 2045 cmp \h, #4 2046 ubfx w9, \my, #7, #7 2047 and \my, \my, #0x7f 2048 b.le 4f 2049 mov \my, w9 20504: 2051 add \xmy, x10, \my, uxtw #3 2052 2053 movrel x9, \type\()_\taps\()_hv_tbl 2054 ldrsw x8, [x9, x8, lsl #2] 2055 add x9, x9, x8 2056 br x9 2057 205820: 2059 AARCH64_VALID_JUMP_TARGET 2060.ifc \type, put 2061 ldur s0, [\xmx, #2] 2062 b.gt 280f 2063 ldur s1, [\xmy, #2] 2064 2065 // 2x2, 2x4 hv 2066 sub \sr2, \src, #1 2067 sub \src, \sr2, \s_strd 2068 add \ds2, \dst, \d_strd 2069 lsl \s_strd, \s_strd, #1 2070 lsl \d_strd, \d_strd, #1 2071 sxtl v0.8h, v0.8b 2072 sxtl v1.8h, v1.8b 2073 mov x15, x30 2074 2075 ld1 {v28.8b}, [\src], \s_strd 2076 uxtl v28.8h, v28.8b 2077 ext v29.16b, v28.16b, v28.16b, #2 2078 mul v28.4h, v28.4h, v0.4h 2079 mul v29.4h, v29.4h, v0.4h 2080 addp v28.4h, v28.4h, v29.4h 2081 addp v16.4h, v28.4h, v28.4h 2082 srshr v16.4h, v16.4h, #2 2083 bl L(\type\()_\taps\()_filter_2) 2084 2085 trn1 v16.2s, v16.2s, v28.2s 2086 mov v17.8b, v28.8b 2087 20882: 2089 bl L(\type\()_\taps\()_filter_2) 2090 2091 ext v18.8b, v17.8b, v28.8b, #4 2092 smull v2.4s, v16.4h, v1.h[0] 2093 smlal v2.4s, v17.4h, v1.h[1] 2094 smlal v2.4s, v18.4h, v1.h[2] 2095 smlal v2.4s, v28.4h, v1.h[3] 2096 2097 sqrshrn v2.4h, v2.4s, #\shift_hv 2098 sqxtun v2.8b, v2.8h 2099 subs \h, \h, #2 2100 st1 {v2.h}[0], [\dst], \d_strd 2101 st1 {v2.h}[1], [\ds2], \d_strd 2102 b.le 0f 2103 mov v16.8b, v18.8b 2104 mov v17.8b, v28.8b 2105 b 2b 2106 2107280: // 2x8, 2x16, 2x32 hv 2108 ld1 {v1.8b}, [\xmy] 2109 sub \src, \src, #1 2110 sub \sr2, \src, \s_strd, lsl #1 2111 sub \src, \sr2, \s_strd 2112 add \ds2, \dst, \d_strd 2113 lsl \s_strd, \s_strd, #1 2114 lsl \d_strd, \d_strd, #1 2115 sxtl v0.8h, v0.8b 2116 sxtl v1.8h, v1.8b 2117 mov x15, x30 2118 2119 ld1 {v28.8b}, [\src], \s_strd 2120 uxtl v28.8h, v28.8b 2121 ext v29.16b, v28.16b, v28.16b, #2 2122 mul v28.4h, v28.4h, v0.4h 2123 mul v29.4h, v29.4h, v0.4h 2124 addp v28.4h, v28.4h, v29.4h 2125 addp v16.4h, v28.4h, v28.4h 2126 srshr v16.4h, v16.4h, #2 2127 2128 bl L(\type\()_\taps\()_filter_2) 2129 trn1 v16.2s, v16.2s, v28.2s 2130 mov v17.8b, v28.8b 2131 bl L(\type\()_\taps\()_filter_2) 2132 ext v18.8b, v17.8b, v28.8b, #4 2133 mov v19.8b, v28.8b 2134 bl L(\type\()_\taps\()_filter_2) 2135 ext v20.8b, v19.8b, v28.8b, #4 2136 mov v21.8b, v28.8b 2137 213828: 2139 bl L(\type\()_\taps\()_filter_2) 2140 ext v22.8b, v21.8b, v28.8b, #4 2141.ifc \taps, 6tap 2142 smull v2.4s, v17.4h, v1.h[1] 2143 smlal v2.4s, v18.4h, v1.h[2] 2144 smlal v2.4s, v19.4h, v1.h[3] 2145 smlal v2.4s, v20.4h, v1.h[4] 2146 smlal v2.4s, v21.4h, v1.h[5] 2147 smlal v2.4s, v22.4h, v1.h[6] 2148.else // 8tap 2149 smull v2.4s, v16.4h, v1.h[0] 2150 smlal v2.4s, v17.4h, v1.h[1] 2151 smlal v2.4s, v18.4h, v1.h[2] 2152 smlal v2.4s, v19.4h, v1.h[3] 2153 smlal v2.4s, v20.4h, v1.h[4] 2154 smlal v2.4s, v21.4h, v1.h[5] 2155 smlal v2.4s, v22.4h, v1.h[6] 2156 smlal v2.4s, v28.4h, v1.h[7] 2157.endif 2158 2159 sqrshrn v2.4h, v2.4s, #\shift_hv 2160 sqxtun v2.8b, v2.8h 2161 subs \h, \h, #2 2162 st1 {v2.h}[0], [\dst], \d_strd 2163 st1 {v2.h}[1], [\ds2], \d_strd 2164 b.le 0f 2165 mov v16.8b, v18.8b 2166 mov v17.8b, v19.8b 2167 mov v18.8b, v20.8b 2168 mov v19.8b, v21.8b 2169 mov v20.8b, v22.8b 2170 mov v21.8b, v28.8b 2171 b 28b 2172 21730: 2174 ret x15 2175 2176L(\type\()_\taps\()_filter_2): 2177 ld1 {v28.8b}, [\sr2], \s_strd 2178 ld1 {v30.8b}, [\src], \s_strd 2179 uxtl v28.8h, v28.8b 2180 uxtl v30.8h, v30.8b 2181 ext v29.16b, v28.16b, v28.16b, #2 2182 ext v31.16b, v30.16b, v30.16b, #2 2183 trn1 v27.2s, v28.2s, v30.2s 2184 trn2 v30.2s, v28.2s, v30.2s 2185 trn1 v28.2s, v29.2s, v31.2s 2186 trn2 v31.2s, v29.2s, v31.2s 2187 mul v27.4h, v27.4h, v0.h[0] 2188 mla v27.4h, v28.4h, v0.h[1] 2189 mla v27.4h, v30.4h, v0.h[2] 2190 mla v27.4h, v31.4h, v0.h[3] 2191 srshr v28.4h, v27.4h, #2 2192 ret 2193.endif 2194 219540: 2196 AARCH64_VALID_JUMP_TARGET 2197 ldur s0, [\xmx, #2] 2198 b.gt 480f 2199 ldur s1, [\xmy, #2] 2200 sub \sr2, \src, #1 2201 sub \src, \sr2, \s_strd 2202 add \ds2, \dst, \d_strd 2203 lsl \s_strd, \s_strd, #1 2204 lsl \d_strd, \d_strd, #1 2205 sxtl v0.8h, v0.8b 2206 sxtl v1.8h, v1.8b 2207 mov x15, x30 2208 2209 // 4x2, 4x4 hv 2210 ld1 {v26.8b}, [\src], \s_strd 2211 uxtl v26.8h, v26.8b 2212 ext v28.16b, v26.16b, v26.16b, #2 2213 ext v29.16b, v26.16b, v26.16b, #4 2214 ext v30.16b, v26.16b, v26.16b, #6 2215 mul v31.4h, v26.4h, v0.h[0] 2216 mla v31.4h, v28.4h, v0.h[1] 2217 mla v31.4h, v29.4h, v0.h[2] 2218 mla v31.4h, v30.4h, v0.h[3] 2219 srshr v16.4h, v31.4h, #2 2220 2221 bl L(\type\()_\taps\()_filter_4) 2222 mov v17.8b, v28.8b 2223 mov v18.8b, v29.8b 2224 22254: 2226 bl L(\type\()_\taps\()_filter_4) 2227 // Interleaving the mul/mla chains actually hurts performance 2228 // significantly on Cortex A53, thus keeping mul/mla tightly 2229 // chained like this. 2230 smull v2.4s, v16.4h, v1.h[0] 2231 smlal v2.4s, v17.4h, v1.h[1] 2232 smlal v2.4s, v18.4h, v1.h[2] 2233 smlal v2.4s, v28.4h, v1.h[3] 2234 smull v3.4s, v17.4h, v1.h[0] 2235 smlal v3.4s, v18.4h, v1.h[1] 2236 smlal v3.4s, v28.4h, v1.h[2] 2237 smlal v3.4s, v29.4h, v1.h[3] 2238 sqrshrn v2.4h, v2.4s, #\shift_hv 2239 sqrshrn v3.4h, v3.4s, #\shift_hv 2240 subs \h, \h, #2 2241.ifc \type, put 2242 sqxtun v2.8b, v2.8h 2243 sqxtun v3.8b, v3.8h 2244 str s2, [\dst] 2245 str s3, [\ds2] 2246 add \dst, \dst, \d_strd 2247 add \ds2, \ds2, \d_strd 2248.else 2249 st1 {v2.4h}, [\dst], \d_strd 2250 st1 {v3.4h}, [\ds2], \d_strd 2251.endif 2252 b.le 0f 2253 mov v16.8b, v18.8b 2254 mov v17.8b, v28.8b 2255 mov v18.8b, v29.8b 2256 b 4b 2257 2258480: // 4x8, 4x16, 4x32 hv 2259 ld1 {v1.8b}, [\xmy] 2260 sub \src, \src, #1 2261.ifc \taps, 6tap 2262 sub \sr2, \src, \s_strd 2263 sub \src, \src, \s_strd, lsl #1 2264.else 2265 sub \sr2, \src, \s_strd, lsl #1 2266 sub \src, \sr2, \s_strd 2267.endif 2268 add \ds2, \dst, \d_strd 2269 lsl \s_strd, \s_strd, #1 2270 lsl \d_strd, \d_strd, #1 2271 sxtl v0.8h, v0.8b 2272 sxtl v1.8h, v1.8b 2273 mov x15, x30 2274 2275 ld1 {v26.8b}, [\src], \s_strd 2276 uxtl v26.8h, v26.8b 2277 ext v28.16b, v26.16b, v26.16b, #2 2278 ext v29.16b, v26.16b, v26.16b, #4 2279 ext v30.16b, v26.16b, v26.16b, #6 2280 mul v31.4h, v26.4h, v0.h[0] 2281 mla v31.4h, v28.4h, v0.h[1] 2282 mla v31.4h, v29.4h, v0.h[2] 2283 mla v31.4h, v30.4h, v0.h[3] 2284.ifc \taps, 6tap 2285 srshr v18.4h, v31.4h, #2 2286.else 2287 srshr v16.4h, v31.4h, #2 2288 2289 bl L(\type\()_\taps\()_filter_4) 2290 mov v17.8b, v28.8b 2291 mov v18.8b, v29.8b 2292.endif 2293 bl L(\type\()_\taps\()_filter_4) 2294 mov v19.8b, v28.8b 2295 mov v20.8b, v29.8b 2296 bl L(\type\()_\taps\()_filter_4) 2297 mov v21.8b, v28.8b 2298 mov v22.8b, v29.8b 2299 230048: 2301 bl L(\type\()_\taps\()_filter_4) 2302.ifc \taps, 6tap 2303 smull v2.4s, v18.4h, v1.h[1] 2304 smlal v2.4s, v19.4h, v1.h[2] 2305 smlal v2.4s, v20.4h, v1.h[3] 2306 smlal v2.4s, v21.4h, v1.h[4] 2307 smlal v2.4s, v22.4h, v1.h[5] 2308 smlal v2.4s, v28.4h, v1.h[6] 2309 smull v3.4s, v19.4h, v1.h[1] 2310 smlal v3.4s, v20.4h, v1.h[2] 2311 smlal v3.4s, v21.4h, v1.h[3] 2312 smlal v3.4s, v22.4h, v1.h[4] 2313 smlal v3.4s, v28.4h, v1.h[5] 2314 smlal v3.4s, v29.4h, v1.h[6] 2315.else // 8tap 2316 smull v2.4s, v16.4h, v1.h[0] 2317 smlal v2.4s, v17.4h, v1.h[1] 2318 smlal v2.4s, v18.4h, v1.h[2] 2319 smlal v2.4s, v19.4h, v1.h[3] 2320 smlal v2.4s, v20.4h, v1.h[4] 2321 smlal v2.4s, v21.4h, v1.h[5] 2322 smlal v2.4s, v22.4h, v1.h[6] 2323 smlal v2.4s, v28.4h, v1.h[7] 2324 smull v3.4s, v17.4h, v1.h[0] 2325 smlal v3.4s, v18.4h, v1.h[1] 2326 smlal v3.4s, v19.4h, v1.h[2] 2327 smlal v3.4s, v20.4h, v1.h[3] 2328 smlal v3.4s, v21.4h, v1.h[4] 2329 smlal v3.4s, v22.4h, v1.h[5] 2330 smlal v3.4s, v28.4h, v1.h[6] 2331 smlal v3.4s, v29.4h, v1.h[7] 2332.endif 2333 sqrshrn v2.4h, v2.4s, #\shift_hv 2334 sqrshrn v3.4h, v3.4s, #\shift_hv 2335 subs \h, \h, #2 2336.ifc \type, put 2337 sqxtun v2.8b, v2.8h 2338 sqxtun v3.8b, v3.8h 2339 str s2, [\dst] 2340 str s3, [\ds2] 2341 add \dst, \dst, \d_strd 2342 add \ds2, \ds2, \d_strd 2343.else 2344 st1 {v2.4h}, [\dst], \d_strd 2345 st1 {v3.4h}, [\ds2], \d_strd 2346.endif 2347 b.le 0f 2348.ifc \taps, 8tap 2349 mov v16.8b, v18.8b 2350 mov v17.8b, v19.8b 2351.endif 2352 mov v18.8b, v20.8b 2353 mov v19.8b, v21.8b 2354 mov v20.8b, v22.8b 2355 mov v21.8b, v28.8b 2356 mov v22.8b, v29.8b 2357 b 48b 23580: 2359 ret x15 2360 2361L(\type\()_\taps\()_filter_4): 2362 ld1 {v26.8b}, [\sr2], \s_strd 2363 ld1 {v27.8b}, [\src], \s_strd 2364 uxtl v26.8h, v26.8b 2365 uxtl v27.8h, v27.8b 2366 ext v28.16b, v26.16b, v26.16b, #2 2367 ext v29.16b, v26.16b, v26.16b, #4 2368 ext v30.16b, v26.16b, v26.16b, #6 2369 mul v31.4h, v26.4h, v0.h[0] 2370 mla v31.4h, v28.4h, v0.h[1] 2371 mla v31.4h, v29.4h, v0.h[2] 2372 mla v31.4h, v30.4h, v0.h[3] 2373 ext v28.16b, v27.16b, v27.16b, #2 2374 ext v29.16b, v27.16b, v27.16b, #4 2375 ext v30.16b, v27.16b, v27.16b, #6 2376 mul v27.4h, v27.4h, v0.h[0] 2377 mla v27.4h, v28.4h, v0.h[1] 2378 mla v27.4h, v29.4h, v0.h[2] 2379 mla v27.4h, v30.4h, v0.h[3] 2380 srshr v28.4h, v31.4h, #2 2381 srshr v29.4h, v27.4h, #2 2382 ret 2383 238480: 2385160: 2386320: 2387 AARCH64_VALID_JUMP_TARGET 2388 b.gt 880f 2389 ld1 {v0.8b}, [\xmx] 2390 ldur s1, [\xmy, #2] 2391.ifc \taps, 6tap 2392 sub \src, \src, #2 2393.else 2394 sub \src, \src, #3 2395.endif 2396 sub \src, \src, \s_strd 2397 sxtl v0.8h, v0.8b 2398 sxtl v1.8h, v1.8b 2399 mov x15, x30 2400 mov \my, \h 2401 2402164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv 2403 add \ds2, \dst, \d_strd 2404 add \sr2, \src, \s_strd 2405 lsl \d_strd, \d_strd, #1 2406 lsl \s_strd, \s_strd, #1 2407 2408 bl L(\type\()_\taps\()_filter_8_first) 2409 bl L(\type\()_\taps\()_filter_8) 2410 mov v17.16b, v24.16b 2411 mov v18.16b, v25.16b 2412 24138: 2414 smull v2.4s, v16.4h, v1.h[0] 2415 smull2 v3.4s, v16.8h, v1.h[0] 2416 bl L(\type\()_\taps\()_filter_8) 2417 smull v4.4s, v17.4h, v1.h[0] 2418 smull2 v5.4s, v17.8h, v1.h[0] 2419 smlal v2.4s, v17.4h, v1.h[1] 2420 smlal2 v3.4s, v17.8h, v1.h[1] 2421 smlal v4.4s, v18.4h, v1.h[1] 2422 smlal2 v5.4s, v18.8h, v1.h[1] 2423 smlal v2.4s, v18.4h, v1.h[2] 2424 smlal2 v3.4s, v18.8h, v1.h[2] 2425 smlal v4.4s, v24.4h, v1.h[2] 2426 smlal2 v5.4s, v24.8h, v1.h[2] 2427 smlal v2.4s, v24.4h, v1.h[3] 2428 smlal2 v3.4s, v24.8h, v1.h[3] 2429 smlal v4.4s, v25.4h, v1.h[3] 2430 smlal2 v5.4s, v25.8h, v1.h[3] 2431 sqrshrn v2.4h, v2.4s, #\shift_hv 2432 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2433 sqrshrn v4.4h, v4.4s, #\shift_hv 2434 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2435 subs \h, \h, #2 2436.ifc \type, put 2437 sqxtun v2.8b, v2.8h 2438 sqxtun v4.8b, v4.8h 2439 st1 {v2.8b}, [\dst], \d_strd 2440 st1 {v4.8b}, [\ds2], \d_strd 2441.else 2442 st1 {v2.8h}, [\dst], \d_strd 2443 st1 {v4.8h}, [\ds2], \d_strd 2444.endif 2445 b.le 9f 2446 mov v16.16b, v18.16b 2447 mov v17.16b, v24.16b 2448 mov v18.16b, v25.16b 2449 b 8b 24509: 2451 subs \w, \w, #8 2452 b.le 0f 2453 asr \s_strd, \s_strd, #1 2454 asr \d_strd, \d_strd, #1 2455 msub \src, \s_strd, \xmy, \src 2456 msub \dst, \d_strd, \xmy, \dst 2457 sub \src, \src, \s_strd, lsl #2 2458 mov \h, \my 2459 add \src, \src, #8 2460.ifc \type, put 2461 add \dst, \dst, #8 2462.else 2463 add \dst, \dst, #16 2464.endif 2465 b 164b 2466 2467880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 2468640: 24691280: 2470 AARCH64_VALID_JUMP_TARGET 2471 ld1 {v0.8b}, [\xmx] 2472 ld1 {v1.8b}, [\xmy] 2473.ifc \taps, 6tap 2474 sub \src, \src, #2 2475.else 2476 sub \src, \src, #3 2477 sub \src, \src, \s_strd 2478.endif 2479 sub \src, \src, \s_strd, lsl #1 2480 sxtl v0.8h, v0.8b 2481 sxtl v1.8h, v1.8b 2482 mov x15, x30 2483 mov \my, \h 2484 2485168: 2486 add \ds2, \dst, \d_strd 2487 add \sr2, \src, \s_strd 2488 lsl \d_strd, \d_strd, #1 2489 lsl \s_strd, \s_strd, #1 2490 2491 bl L(\type\()_\taps\()_filter_8_first) 2492.ifc \taps, 6tap 2493 mov v18.16b, v16.16b 2494.else 2495 bl L(\type\()_\taps\()_filter_8) 2496 mov v17.16b, v24.16b 2497 mov v18.16b, v25.16b 2498.endif 2499 bl L(\type\()_\taps\()_filter_8) 2500 mov v19.16b, v24.16b 2501 mov v20.16b, v25.16b 2502 bl L(\type\()_\taps\()_filter_8) 2503 mov v21.16b, v24.16b 2504 mov v22.16b, v25.16b 2505 250688: 2507.ifc \taps, 6tap 2508 smull v2.4s, v18.4h, v1.h[1] 2509 smull2 v3.4s, v18.8h, v1.h[1] 2510 bl L(\type\()_\taps\()_filter_8) 2511 smull v4.4s, v19.4h, v1.h[1] 2512 smull2 v5.4s, v19.8h, v1.h[1] 2513 smlal v2.4s, v19.4h, v1.h[2] 2514 smlal2 v3.4s, v19.8h, v1.h[2] 2515 smlal v4.4s, v20.4h, v1.h[2] 2516 smlal2 v5.4s, v20.8h, v1.h[2] 2517 smlal v2.4s, v20.4h, v1.h[3] 2518 smlal2 v3.4s, v20.8h, v1.h[3] 2519 smlal v4.4s, v21.4h, v1.h[3] 2520 smlal2 v5.4s, v21.8h, v1.h[3] 2521 smlal v2.4s, v21.4h, v1.h[4] 2522 smlal2 v3.4s, v21.8h, v1.h[4] 2523 smlal v4.4s, v22.4h, v1.h[4] 2524 smlal2 v5.4s, v22.8h, v1.h[4] 2525 smlal v2.4s, v22.4h, v1.h[5] 2526 smlal2 v3.4s, v22.8h, v1.h[5] 2527 smlal v4.4s, v24.4h, v1.h[5] 2528 smlal2 v5.4s, v24.8h, v1.h[5] 2529 smlal v2.4s, v24.4h, v1.h[6] 2530 smlal2 v3.4s, v24.8h, v1.h[6] 2531 smlal v4.4s, v25.4h, v1.h[6] 2532 smlal2 v5.4s, v25.8h, v1.h[6] 2533.else // 8tap 2534 smull v2.4s, v16.4h, v1.h[0] 2535 smull2 v3.4s, v16.8h, v1.h[0] 2536 bl L(\type\()_\taps\()_filter_8) 2537 smull v4.4s, v17.4h, v1.h[0] 2538 smull2 v5.4s, v17.8h, v1.h[0] 2539 smlal v2.4s, v17.4h, v1.h[1] 2540 smlal2 v3.4s, v17.8h, v1.h[1] 2541 smlal v4.4s, v18.4h, v1.h[1] 2542 smlal2 v5.4s, v18.8h, v1.h[1] 2543 smlal v2.4s, v18.4h, v1.h[2] 2544 smlal2 v3.4s, v18.8h, v1.h[2] 2545 smlal v4.4s, v19.4h, v1.h[2] 2546 smlal2 v5.4s, v19.8h, v1.h[2] 2547 smlal v2.4s, v19.4h, v1.h[3] 2548 smlal2 v3.4s, v19.8h, v1.h[3] 2549 smlal v4.4s, v20.4h, v1.h[3] 2550 smlal2 v5.4s, v20.8h, v1.h[3] 2551 smlal v2.4s, v20.4h, v1.h[4] 2552 smlal2 v3.4s, v20.8h, v1.h[4] 2553 smlal v4.4s, v21.4h, v1.h[4] 2554 smlal2 v5.4s, v21.8h, v1.h[4] 2555 smlal v2.4s, v21.4h, v1.h[5] 2556 smlal2 v3.4s, v21.8h, v1.h[5] 2557 smlal v4.4s, v22.4h, v1.h[5] 2558 smlal2 v5.4s, v22.8h, v1.h[5] 2559 smlal v2.4s, v22.4h, v1.h[6] 2560 smlal2 v3.4s, v22.8h, v1.h[6] 2561 smlal v4.4s, v24.4h, v1.h[6] 2562 smlal2 v5.4s, v24.8h, v1.h[6] 2563 smlal v2.4s, v24.4h, v1.h[7] 2564 smlal2 v3.4s, v24.8h, v1.h[7] 2565 smlal v4.4s, v25.4h, v1.h[7] 2566 smlal2 v5.4s, v25.8h, v1.h[7] 2567.endif 2568 sqrshrn v2.4h, v2.4s, #\shift_hv 2569 sqrshrn2 v2.8h, v3.4s, #\shift_hv 2570 sqrshrn v4.4h, v4.4s, #\shift_hv 2571 sqrshrn2 v4.8h, v5.4s, #\shift_hv 2572 subs \h, \h, #2 2573.ifc \type, put 2574 sqxtun v2.8b, v2.8h 2575 sqxtun v4.8b, v4.8h 2576 st1 {v2.8b}, [\dst], \d_strd 2577 st1 {v4.8b}, [\ds2], \d_strd 2578.else 2579 st1 {v2.8h}, [\dst], \d_strd 2580 st1 {v4.8h}, [\ds2], \d_strd 2581.endif 2582 b.le 9f 2583.ifc \taps, 8tap 2584 mov v16.16b, v18.16b 2585 mov v17.16b, v19.16b 2586.endif 2587 mov v18.16b, v20.16b 2588 mov v19.16b, v21.16b 2589 mov v20.16b, v22.16b 2590 mov v21.16b, v24.16b 2591 mov v22.16b, v25.16b 2592 b 88b 25939: 2594 subs \w, \w, #8 2595 b.le 0f 2596 asr \s_strd, \s_strd, #1 2597 asr \d_strd, \d_strd, #1 2598 msub \src, \s_strd, \xmy, \src 2599 msub \dst, \d_strd, \xmy, \dst 2600 sub \src, \src, \s_strd, lsl #3 2601 mov \h, \my 2602 add \src, \src, #8 2603.ifc \type, put 2604 add \dst, \dst, #8 2605.else 2606 add \dst, \dst, #16 2607.endif 2608.ifc \taps, 6tap 2609 add \src, \src, \s_strd, lsl #1 2610.endif 2611 b 168b 26120: 2613 ret x15 2614 2615L(\type\()_\taps\()_filter_8_first): 2616 ld1 {v28.8b, v29.8b}, [\src], \s_strd 2617 uxtl v28.8h, v28.8b 2618 uxtl v29.8h, v29.8b 2619.ifc \taps, 6tap 2620 mul v16.8h, v28.8h, v0.h[1] 2621 ext v25.16b, v28.16b, v29.16b, #(2*1) 2622 ext v26.16b, v28.16b, v29.16b, #(2*2) 2623 ext v27.16b, v28.16b, v29.16b, #(2*3) 2624 mla v16.8h, v25.8h, v0.h[2] 2625 mla v16.8h, v26.8h, v0.h[3] 2626 mla v16.8h, v27.8h, v0.h[4] 2627 ext v24.16b, v28.16b, v29.16b, #(2*4) 2628 ext v25.16b, v28.16b, v29.16b, #(2*5) 2629 mla v16.8h, v24.8h, v0.h[5] 2630 mla v16.8h, v25.8h, v0.h[6] 2631.else // 8tap 2632 mul v16.8h, v28.8h, v0.h[0] 2633 ext v24.16b, v28.16b, v29.16b, #(2*1) 2634 ext v25.16b, v28.16b, v29.16b, #(2*2) 2635 ext v26.16b, v28.16b, v29.16b, #(2*3) 2636 ext v27.16b, v28.16b, v29.16b, #(2*4) 2637 mla v16.8h, v24.8h, v0.h[1] 2638 mla v16.8h, v25.8h, v0.h[2] 2639 mla v16.8h, v26.8h, v0.h[3] 2640 mla v16.8h, v27.8h, v0.h[4] 2641 ext v24.16b, v28.16b, v29.16b, #(2*5) 2642 ext v25.16b, v28.16b, v29.16b, #(2*6) 2643 ext v26.16b, v28.16b, v29.16b, #(2*7) 2644 mla v16.8h, v24.8h, v0.h[5] 2645 mla v16.8h, v25.8h, v0.h[6] 2646 mla v16.8h, v26.8h, v0.h[7] 2647.endif 2648 srshr v16.8h, v16.8h, #2 2649 ret 2650 2651L(\type\()_\taps\()_filter_8): 2652 ld1 {v28.8b, v29.8b}, [\sr2], \s_strd 2653 ld1 {v30.8b, v31.8b}, [\src], \s_strd 2654 uxtl v28.8h, v28.8b 2655 uxtl v29.8h, v29.8b 2656 uxtl v30.8h, v30.8b 2657 uxtl v31.8h, v31.8b 2658.ifc \taps, 6tap 2659 mul v24.8h, v28.8h, v0.h[1] 2660 mul v25.8h, v30.8h, v0.h[1] 2661 .irpc i, 23456 2662 ext v26.16b, v28.16b, v29.16b, #(2*\i-2) 2663 ext v27.16b, v30.16b, v31.16b, #(2*\i-2) 2664 mla v24.8h, v26.8h, v0.h[\i] 2665 mla v25.8h, v27.8h, v0.h[\i] 2666 .endr 2667.else // 8tap 2668 mul v24.8h, v28.8h, v0.h[0] 2669 mul v25.8h, v30.8h, v0.h[0] 2670 .irpc i, 1234567 2671 ext v26.16b, v28.16b, v29.16b, #(2*\i) 2672 ext v27.16b, v30.16b, v31.16b, #(2*\i) 2673 mla v24.8h, v26.8h, v0.h[\i] 2674 mla v25.8h, v27.8h, v0.h[\i] 2675 .endr 2676.endif 2677 srshr v24.8h, v24.8h, #2 2678 srshr v25.8h, v25.8h, #2 2679 ret 2680endfunc 2681 2682jumptable \type\()_\taps\()_hv_tbl 2683 .word 1280b - \type\()_\taps\()_hv_tbl 2684 .word 640b - \type\()_\taps\()_hv_tbl 2685 .word 320b - \type\()_\taps\()_hv_tbl 2686 .word 160b - \type\()_\taps\()_hv_tbl 2687 .word 80b - \type\()_\taps\()_hv_tbl 2688 .word 40b - \type\()_\taps\()_hv_tbl 2689 .word 20b - \type\()_\taps\()_hv_tbl 2690endjumptable 2691.endm 2692 2693 2694.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv 2695function \type\()_bilin_8bpc_neon, export=1 2696 dup v1.16b, \mx 2697 dup v3.16b, \my 2698 mov w9, #16 2699 sub w8, w9, \mx 2700 sub w9, w9, \my 2701 dup v0.16b, w8 2702 dup v2.16b, w9 2703.ifc \type, prep 2704 uxtw \d_strd, \w 2705 lsl \d_strd, \d_strd, #1 2706.endif 2707 2708 clz w8, \w 2709 sub w8, w8, #24 2710 cbnz \mx, L(\type\()_bilin_h) 2711 cbnz \my, L(\type\()_bilin_v) 2712 b \type\()_neon 2713 2714L(\type\()_bilin_h): 2715 cbnz \my, L(\type\()_bilin_hv) 2716 2717 movrel x9, \type\()_bilin_h_tbl 2718 ldrsw x8, [x9, x8, lsl #2] 2719 add x9, x9, x8 2720 br x9 2721 272220: // 2xN h 2723 AARCH64_VALID_JUMP_TARGET 2724.ifc \type, put 2725 add \ds2, \dst, \d_strd 2726 add \sr2, \src, \s_strd 2727 lsl \d_strd, \d_strd, #1 2728 lsl \s_strd, \s_strd, #1 27292: 2730 ld1r {v4.4s}, [\src], \s_strd 2731 ld1r {v6.4s}, [\sr2], \s_strd 2732 ext v5.8b, v4.8b, v4.8b, #1 2733 ext v7.8b, v6.8b, v6.8b, #1 2734 trn1 v4.4h, v4.4h, v6.4h 2735 trn1 v5.4h, v5.4h, v7.4h 2736 subs \h, \h, #2 2737 umull v4.8h, v4.8b, v0.8b 2738 umlal v4.8h, v5.8b, v1.8b 2739 uqrshrn v4.8b, v4.8h, #4 2740 st1 {v4.h}[0], [\dst], \d_strd 2741 st1 {v4.h}[1], [\ds2], \d_strd 2742 b.gt 2b 2743 ret 2744.endif 2745 274640: // 4xN h 2747 AARCH64_VALID_JUMP_TARGET 2748 add \ds2, \dst, \d_strd 2749 add \sr2, \src, \s_strd 2750 lsl \d_strd, \d_strd, #1 2751 lsl \s_strd, \s_strd, #1 27524: 2753 ld1 {v4.8b}, [\src], \s_strd 2754 ld1 {v6.8b}, [\sr2], \s_strd 2755 ext v5.8b, v4.8b, v4.8b, #1 2756 ext v7.8b, v6.8b, v6.8b, #1 2757 trn1 v4.2s, v4.2s, v6.2s 2758 trn1 v5.2s, v5.2s, v7.2s 2759 subs \h, \h, #2 2760 umull v4.8h, v4.8b, v0.8b 2761 umlal v4.8h, v5.8b, v1.8b 2762.ifc \type, put 2763 uqrshrn v4.8b, v4.8h, #4 2764 st1 {v4.s}[0], [\dst], \d_strd 2765 st1 {v4.s}[1], [\ds2], \d_strd 2766.else 2767 st1 {v4.8b}, [\dst], \d_strd 2768 st1 {v4.d}[1], [\ds2], \d_strd 2769.endif 2770 b.gt 4b 2771 ret 2772 277380: // 8xN h 2774 AARCH64_VALID_JUMP_TARGET 2775 add \ds2, \dst, \d_strd 2776 add \sr2, \src, \s_strd 2777 lsl \d_strd, \d_strd, #1 2778 lsl \s_strd, \s_strd, #1 27798: 2780 ld1 {v4.16b}, [\src], \s_strd 2781 ld1 {v6.16b}, [\sr2], \s_strd 2782 ext v5.16b, v4.16b, v4.16b, #1 2783 ext v7.16b, v6.16b, v6.16b, #1 2784 subs \h, \h, #2 2785 umull v4.8h, v4.8b, v0.8b 2786 umull v6.8h, v6.8b, v0.8b 2787 umlal v4.8h, v5.8b, v1.8b 2788 umlal v6.8h, v7.8b, v1.8b 2789.ifc \type, put 2790 uqrshrn v4.8b, v4.8h, #4 2791 uqrshrn v6.8b, v6.8h, #4 2792 st1 {v4.8b}, [\dst], \d_strd 2793 st1 {v6.8b}, [\ds2], \d_strd 2794.else 2795 st1 {v4.8h}, [\dst], \d_strd 2796 st1 {v6.8h}, [\ds2], \d_strd 2797.endif 2798 b.gt 8b 2799 ret 2800160: 2801320: 2802640: 28031280: // 16xN, 32xN, ... h 2804 AARCH64_VALID_JUMP_TARGET 2805 add \ds2, \dst, \d_strd 2806 add \sr2, \src, \s_strd 2807 lsl \s_strd, \s_strd, #1 2808 2809 sub \s_strd, \s_strd, \w, uxtw 2810 sub \s_strd, \s_strd, #8 2811.ifc \type, put 2812 lsl \d_strd, \d_strd, #1 2813 sub \d_strd, \d_strd, \w, uxtw 2814.endif 2815161: 2816 ld1 {v16.d}[1], [\src], #8 2817 ld1 {v20.d}[1], [\sr2], #8 2818 mov \mx, \w 2819 282016: 2821 ld1 {v18.16b}, [\src], #16 2822 ld1 {v22.16b}, [\sr2], #16 2823 ext v17.16b, v16.16b, v18.16b, #8 2824 ext v19.16b, v16.16b, v18.16b, #9 2825 ext v21.16b, v20.16b, v22.16b, #8 2826 ext v23.16b, v20.16b, v22.16b, #9 2827 umull v16.8h, v17.8b, v0.8b 2828 umull2 v17.8h, v17.16b, v0.16b 2829 umull v20.8h, v21.8b, v0.8b 2830 umull2 v21.8h, v21.16b, v0.16b 2831 umlal v16.8h, v19.8b, v1.8b 2832 umlal2 v17.8h, v19.16b, v1.16b 2833 umlal v20.8h, v23.8b, v1.8b 2834 umlal2 v21.8h, v23.16b, v1.16b 2835 subs \mx, \mx, #16 2836.ifc \type, put 2837 uqrshrn v16.8b, v16.8h, #4 2838 uqrshrn2 v16.16b, v17.8h, #4 2839 uqrshrn v20.8b, v20.8h, #4 2840 uqrshrn2 v20.16b, v21.8h, #4 2841 st1 {v16.16b}, [\dst], #16 2842 st1 {v20.16b}, [\ds2], #16 2843.else 2844 st1 {v16.8h, v17.8h}, [\dst], #32 2845 st1 {v20.8h, v21.8h}, [\ds2], #32 2846.endif 2847 b.le 9f 2848 2849 mov v16.16b, v18.16b 2850 mov v20.16b, v22.16b 2851 b 16b 2852 28539: 2854 add \dst, \dst, \d_strd 2855 add \ds2, \ds2, \d_strd 2856 add \src, \src, \s_strd 2857 add \sr2, \sr2, \s_strd 2858 2859 subs \h, \h, #2 2860 b.gt 161b 2861 ret 2862endfunc 2863 2864jumptable \type\()_bilin_h_tbl 2865 .word 1280b - \type\()_bilin_h_tbl 2866 .word 640b - \type\()_bilin_h_tbl 2867 .word 320b - \type\()_bilin_h_tbl 2868 .word 160b - \type\()_bilin_h_tbl 2869 .word 80b - \type\()_bilin_h_tbl 2870 .word 40b - \type\()_bilin_h_tbl 2871 .word 20b - \type\()_bilin_h_tbl 2872endjumptable 2873 2874 2875function L(\type\()_bilin_v) 2876 cmp \h, #4 2877 movrel x9, \type\()_bilin_v_tbl 2878 ldrsw x8, [x9, x8, lsl #2] 2879 add x9, x9, x8 2880 br x9 2881 288220: // 2xN v 2883 AARCH64_VALID_JUMP_TARGET 2884.ifc \type, put 2885 cmp \h, #2 2886 add \ds2, \dst, \d_strd 2887 add \sr2, \src, \s_strd 2888 lsl \s_strd, \s_strd, #1 2889 lsl \d_strd, \d_strd, #1 2890 2891 // 2x2 v 2892 ld1r {v16.8h}, [\src], \s_strd 2893 b.gt 24f 289422: 2895 ld1r {v17.8h}, [\sr2], \s_strd 2896 ld1r {v18.8h}, [\src], \s_strd 2897 trn1 v16.4h, v16.4h, v17.4h 2898 trn1 v17.4h, v17.4h, v18.4h 2899 umull v4.8h, v16.8b, v2.8b 2900 umlal v4.8h, v17.8b, v3.8b 2901 uqrshrn v4.8b, v4.8h, #4 2902 str h4, [\dst] 2903 st1 {v4.h}[1], [\ds2] 2904 ret 290524: // 2x4, 2x6, 2x8, ... v 2906 ld1r {v17.8h}, [\sr2], \s_strd 2907 ld1r {v18.8h}, [\src], \s_strd 2908 ld1r {v19.8h}, [\sr2], \s_strd 2909 ld1r {v20.8h}, [\src], \s_strd 2910 sub \h, \h, #4 2911 trn1 v16.4h, v16.4h, v17.4h 2912 trn1 v17.4h, v17.4h, v18.4h 2913 trn1 v18.4h, v18.4h, v19.4h 2914 trn1 v19.4h, v19.4h, v20.4h 2915 trn1 v16.2s, v16.2s, v18.2s 2916 trn1 v17.2s, v17.2s, v19.2s 2917 umull v4.8h, v16.8b, v2.8b 2918 umlal v4.8h, v17.8b, v3.8b 2919 cmp \h, #2 2920 uqrshrn v4.8b, v4.8h, #4 2921 st1 {v4.h}[0], [\dst], \d_strd 2922 st1 {v4.h}[1], [\ds2], \d_strd 2923 st1 {v4.h}[2], [\dst], \d_strd 2924 st1 {v4.h}[3], [\ds2], \d_strd 2925 b.lt 0f 2926 mov v16.8b, v20.8b 2927 b.eq 22b 2928 b 24b 29290: 2930 ret 2931.endif 2932 293340: // 4xN v 2934 AARCH64_VALID_JUMP_TARGET 2935 add \ds2, \dst, \d_strd 2936 add \sr2, \src, \s_strd 2937 lsl \s_strd, \s_strd, #1 2938 lsl \d_strd, \d_strd, #1 2939 ld1r {v16.4s}, [\src], \s_strd 29404: 2941 ld1r {v17.4s}, [\sr2], \s_strd 2942 ld1r {v18.4s}, [\src], \s_strd 2943 trn1 v16.2s, v16.2s, v17.2s 2944 trn1 v17.2s, v17.2s, v18.2s 2945 umull v4.8h, v16.8b, v2.8b 2946 umlal v4.8h, v17.8b, v3.8b 2947 subs \h, \h, #2 2948.ifc \type, put 2949 uqrshrn v4.8b, v4.8h, #4 2950 st1 {v4.s}[0], [\dst], \d_strd 2951 st1 {v4.s}[1], [\ds2], \d_strd 2952.else 2953 st1 {v4.8b}, [\dst], \d_strd 2954 st1 {v4.d}[1], [\ds2], \d_strd 2955.endif 2956 b.le 0f 2957 mov v16.8b, v18.8b 2958 b 4b 29590: 2960 ret 2961 296280: // 8xN v 2963 AARCH64_VALID_JUMP_TARGET 2964 add \ds2, \dst, \d_strd 2965 add \sr2, \src, \s_strd 2966 lsl \s_strd, \s_strd, #1 2967 lsl \d_strd, \d_strd, #1 2968 ld1 {v16.8b}, [\src], \s_strd 29698: 2970 ld1 {v17.8b}, [\sr2], \s_strd 2971 ld1 {v18.8b}, [\src], \s_strd 2972 umull v4.8h, v16.8b, v2.8b 2973 umull v5.8h, v17.8b, v2.8b 2974 umlal v4.8h, v17.8b, v3.8b 2975 umlal v5.8h, v18.8b, v3.8b 2976 subs \h, \h, #2 2977.ifc \type, put 2978 uqrshrn v4.8b, v4.8h, #4 2979 uqrshrn v5.8b, v5.8h, #4 2980 st1 {v4.8b}, [\dst], \d_strd 2981 st1 {v5.8b}, [\ds2], \d_strd 2982.else 2983 st1 {v4.8h}, [\dst], \d_strd 2984 st1 {v5.8h}, [\ds2], \d_strd 2985.endif 2986 b.le 0f 2987 mov v16.8b, v18.8b 2988 b 8b 29890: 2990 ret 2991 2992160: // 16xN, 32xN, ... 2993320: 2994640: 29951280: 2996 AARCH64_VALID_JUMP_TARGET 2997 mov \my, \h 29981: 2999 add \ds2, \dst, \d_strd 3000 add \sr2, \src, \s_strd 3001 lsl \s_strd, \s_strd, #1 3002 lsl \d_strd, \d_strd, #1 3003 3004 ld1 {v16.16b}, [\src], \s_strd 30052: 3006 ld1 {v17.16b}, [\sr2], \s_strd 3007 ld1 {v18.16b}, [\src], \s_strd 3008 umull v4.8h, v16.8b, v2.8b 3009 umull2 v5.8h, v16.16b, v2.16b 3010 umull v6.8h, v17.8b, v2.8b 3011 umull2 v7.8h, v17.16b, v2.16b 3012 umlal v4.8h, v17.8b, v3.8b 3013 umlal2 v5.8h, v17.16b, v3.16b 3014 umlal v6.8h, v18.8b, v3.8b 3015 umlal2 v7.8h, v18.16b, v3.16b 3016 subs \h, \h, #2 3017.ifc \type, put 3018 uqrshrn v4.8b, v4.8h, #4 3019 uqrshrn2 v4.16b, v5.8h, #4 3020 uqrshrn v6.8b, v6.8h, #4 3021 uqrshrn2 v6.16b, v7.8h, #4 3022 st1 {v4.16b}, [\dst], \d_strd 3023 st1 {v6.16b}, [\ds2], \d_strd 3024.else 3025 st1 {v4.8h, v5.8h}, [\dst], \d_strd 3026 st1 {v6.8h, v7.8h}, [\ds2], \d_strd 3027.endif 3028 b.le 9f 3029 mov v16.16b, v18.16b 3030 b 2b 30319: 3032 subs \w, \w, #16 3033 b.le 0f 3034 asr \s_strd, \s_strd, #1 3035 asr \d_strd, \d_strd, #1 3036 msub \src, \s_strd, \xmy, \src 3037 msub \dst, \d_strd, \xmy, \dst 3038 sub \src, \src, \s_strd, lsl #1 3039 mov \h, \my 3040 add \src, \src, #16 3041.ifc \type, put 3042 add \dst, \dst, #16 3043.else 3044 add \dst, \dst, #32 3045.endif 3046 b 1b 30470: 3048 ret 3049endfunc 3050 3051jumptable \type\()_bilin_v_tbl 3052 .word 1280b - \type\()_bilin_v_tbl 3053 .word 640b - \type\()_bilin_v_tbl 3054 .word 320b - \type\()_bilin_v_tbl 3055 .word 160b - \type\()_bilin_v_tbl 3056 .word 80b - \type\()_bilin_v_tbl 3057 .word 40b - \type\()_bilin_v_tbl 3058 .word 20b - \type\()_bilin_v_tbl 3059endjumptable 3060 3061function L(\type\()_bilin_hv) 3062 uxtl v2.8h, v2.8b 3063 uxtl v3.8h, v3.8b 3064 movrel x9, \type\()_bilin_hv_tbl 3065 ldrsw x8, [x9, x8, lsl #2] 3066 add x9, x9, x8 3067 br x9 3068 306920: // 2xN hv 3070 AARCH64_VALID_JUMP_TARGET 3071.ifc \type, put 3072 add \sr2, \src, \s_strd 3073 add \ds2, \dst, \d_strd 3074 lsl \s_strd, \s_strd, #1 3075 lsl \d_strd, \d_strd, #1 3076 3077 ld1r {v28.4s}, [\src], \s_strd 3078 ext v29.8b, v28.8b, v28.8b, #1 3079 umull v16.8h, v28.8b, v0.8b 3080 umlal v16.8h, v29.8b, v1.8b 3081 30822: 3083 ld1r {v28.4s}, [\sr2], \s_strd 3084 ld1r {v30.4s}, [\src], \s_strd 3085 ext v29.8b, v28.8b, v28.8b, #1 3086 ext v31.8b, v30.8b, v30.8b, #1 3087 trn1 v28.4h, v28.4h, v30.4h 3088 trn1 v29.4h, v29.4h, v31.4h 3089 umull v17.8h, v28.8b, v0.8b 3090 umlal v17.8h, v29.8b, v1.8b 3091 3092 trn1 v16.2s, v16.2s, v17.2s 3093 3094 mul v4.4h, v16.4h, v2.4h 3095 mla v4.4h, v17.4h, v3.4h 3096 uqrshrn v4.8b, v4.8h, #8 3097 subs \h, \h, #2 3098 st1 {v4.h}[0], [\dst], \d_strd 3099 st1 {v4.h}[1], [\ds2], \d_strd 3100 b.le 0f 3101 trn2 v16.2s, v17.2s, v17.2s 3102 b 2b 31030: 3104 ret 3105.endif 3106 310740: // 4xN hv 3108 AARCH64_VALID_JUMP_TARGET 3109 add \sr2, \src, \s_strd 3110 add \ds2, \dst, \d_strd 3111 lsl \s_strd, \s_strd, #1 3112 lsl \d_strd, \d_strd, #1 3113 3114 ld1 {v28.8b}, [\src], \s_strd 3115 ext v29.8b, v28.8b, v28.8b, #1 3116 umull v16.8h, v28.8b, v0.8b 3117 umlal v16.8h, v29.8b, v1.8b 3118 31194: 3120 ld1 {v28.8b}, [\sr2], \s_strd 3121 ld1 {v30.8b}, [\src], \s_strd 3122 ext v29.8b, v28.8b, v28.8b, #1 3123 ext v31.8b, v30.8b, v30.8b, #1 3124 trn1 v28.2s, v28.2s, v30.2s 3125 trn1 v29.2s, v29.2s, v31.2s 3126 umull v17.8h, v28.8b, v0.8b 3127 umlal v17.8h, v29.8b, v1.8b 3128 3129 trn1 v16.2d, v16.2d, v17.2d 3130 3131 mul v4.8h, v16.8h, v2.8h 3132 mla v4.8h, v17.8h, v3.8h 3133 subs \h, \h, #2 3134.ifc \type, put 3135 uqrshrn v4.8b, v4.8h, #8 3136 st1 {v4.s}[0], [\dst], \d_strd 3137 st1 {v4.s}[1], [\ds2], \d_strd 3138.else 3139 urshr v4.8h, v4.8h, #4 3140 st1 {v4.8b}, [\dst], \d_strd 3141 st1 {v4.d}[1], [\ds2], \d_strd 3142.endif 3143 b.le 0f 3144 trn2 v16.2d, v17.2d, v17.2d 3145 b 4b 31460: 3147 ret 3148 314980: // 8xN, 16xN, ... hv 3150160: 3151320: 3152640: 31531280: 3154 AARCH64_VALID_JUMP_TARGET 3155 mov \my, \h 3156 31571: 3158 add \sr2, \src, \s_strd 3159 add \ds2, \dst, \d_strd 3160 lsl \s_strd, \s_strd, #1 3161 lsl \d_strd, \d_strd, #1 3162 3163 ld1 {v28.16b}, [\src], \s_strd 3164 ext v29.16b, v28.16b, v28.16b, #1 3165 umull v16.8h, v28.8b, v0.8b 3166 umlal v16.8h, v29.8b, v1.8b 3167 31682: 3169 ld1 {v28.16b}, [\sr2], \s_strd 3170 ld1 {v30.16b}, [\src], \s_strd 3171 ext v29.16b, v28.16b, v28.16b, #1 3172 ext v31.16b, v30.16b, v30.16b, #1 3173 umull v17.8h, v28.8b, v0.8b 3174 umlal v17.8h, v29.8b, v1.8b 3175 umull v18.8h, v30.8b, v0.8b 3176 umlal v18.8h, v31.8b, v1.8b 3177 3178 mul v4.8h, v16.8h, v2.8h 3179 mla v4.8h, v17.8h, v3.8h 3180 mul v5.8h, v17.8h, v2.8h 3181 mla v5.8h, v18.8h, v3.8h 3182 subs \h, \h, #2 3183.ifc \type, put 3184 uqrshrn v4.8b, v4.8h, #8 3185 uqrshrn v5.8b, v5.8h, #8 3186 st1 {v4.8b}, [\dst], \d_strd 3187 st1 {v5.8b}, [\ds2], \d_strd 3188.else 3189 urshr v4.8h, v4.8h, #4 3190 urshr v5.8h, v5.8h, #4 3191 st1 {v4.8h}, [\dst], \d_strd 3192 st1 {v5.8h}, [\ds2], \d_strd 3193.endif 3194 b.le 9f 3195 mov v16.16b, v18.16b 3196 b 2b 31979: 3198 subs \w, \w, #8 3199 b.le 0f 3200 asr \s_strd, \s_strd, #1 3201 asr \d_strd, \d_strd, #1 3202 msub \src, \s_strd, \xmy, \src 3203 msub \dst, \d_strd, \xmy, \dst 3204 sub \src, \src, \s_strd, lsl #1 3205 mov \h, \my 3206 add \src, \src, #8 3207.ifc \type, put 3208 add \dst, \dst, #8 3209.else 3210 add \dst, \dst, #16 3211.endif 3212 b 1b 32130: 3214 ret 3215endfunc 3216 3217jumptable \type\()_bilin_hv_tbl 3218 .word 1280b - \type\()_bilin_hv_tbl 3219 .word 640b - \type\()_bilin_hv_tbl 3220 .word 320b - \type\()_bilin_hv_tbl 3221 .word 160b - \type\()_bilin_hv_tbl 3222 .word 80b - \type\()_bilin_hv_tbl 3223 .word 40b - \type\()_bilin_hv_tbl 3224 .word 20b - \type\()_bilin_hv_tbl 3225endjumptable 3226.endm 3227 3228make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap 3229make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap 3230make_8tap_fn put, sharp, SHARP, SHARP, 8tap 3231make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap 3232make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap 3233filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap 3234 3235make_8tap_fn put, regular, REGULAR, REGULAR, 6tap 3236make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap 3237make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap 3238make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap 3239filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap 3240filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 3241 3242make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap 3243make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap 3244make_8tap_fn prep, sharp, SHARP, SHARP, 8tap 3245make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap 3246make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap 3247filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap 3248 3249make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap 3250make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap 3251make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap 3252make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap 3253filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap 3254filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 3255 3256 3257.macro load_filter_row dst, src, inc 3258 asr w13, \src, #10 3259 add \src, \src, \inc 3260 ldr \dst, [x11, w13, sxtw #3] 3261.endm 3262 3263function warp_filter_horz_neon 3264 add w12, w5, #512 3265 3266 ld1 {v16.8b, v17.8b}, [x2], x3 3267 3268 load_filter_row d0, w12, w7 3269 load_filter_row d1, w12, w7 3270 load_filter_row d2, w12, w7 3271 load_filter_row d3, w12, w7 3272 load_filter_row d4, w12, w7 3273 load_filter_row d5, w12, w7 3274 load_filter_row d6, w12, w7 3275 // subtract by 128 to allow using smull 3276 eor v16.8b, v16.8b, v22.8b 3277 eor v17.8b, v17.8b, v22.8b 3278 load_filter_row d7, w12, w7 3279 3280 ext v18.8b, v16.8b, v17.8b, #1 3281 ext v19.8b, v16.8b, v17.8b, #2 3282 smull v0.8h, v0.8b, v16.8b 3283 smull v1.8h, v1.8b, v18.8b 3284 ext v18.8b, v16.8b, v17.8b, #3 3285 ext v20.8b, v16.8b, v17.8b, #4 3286 smull v2.8h, v2.8b, v19.8b 3287 smull v3.8h, v3.8b, v18.8b 3288 ext v18.8b, v16.8b, v17.8b, #5 3289 ext v19.8b, v16.8b, v17.8b, #6 3290 smull v4.8h, v4.8b, v20.8b 3291 smull v5.8h, v5.8b, v18.8b 3292 ext v18.8b, v16.8b, v17.8b, #7 3293 smull v6.8h, v6.8b, v19.8b 3294 smull v7.8h, v7.8b, v18.8b 3295 3296 addp v0.8h, v0.8h, v1.8h 3297 addp v2.8h, v2.8h, v3.8h 3298 addp v4.8h, v4.8h, v5.8h 3299 addp v6.8h, v6.8h, v7.8h 3300 3301 addp v0.8h, v0.8h, v2.8h 3302 addp v4.8h, v4.8h, v6.8h 3303 3304 addp v0.8h, v0.8h, v4.8h 3305 3306 add w5, w5, w8 3307 3308 ret 3309endfunc 3310 3311// void dav1d_warp_affine_8x8_8bpc_neon( 3312// pixel *dst, const ptrdiff_t dst_stride, 3313// const pixel *src, const ptrdiff_t src_stride, 3314// const int16_t *const abcd, int mx, int my) 3315.macro warp t, shift 3316function warp_affine_8x8\t\()_8bpc_neon, export=1 3317 ldr x4, [x4] 3318 sbfx x7, x4, #0, #16 3319 sbfx x8, x4, #16, #16 3320 sbfx x9, x4, #32, #16 3321 sbfx x4, x4, #48, #16 3322 mov w10, #8 3323 sub x2, x2, x3, lsl #1 3324 sub x2, x2, x3 3325 sub x2, x2, #3 3326 movrel x11, X(mc_warp_filter), 64*8 3327 mov x15, x30 3328.ifnb \t 3329 lsl x1, x1, #1 3330.endif 3331 3332 movi v22.8b, #128 3333.ifb \t 3334 movi v23.8h, #128 3335.else 3336 movi v23.8h, #8, lsl #8 3337.endif 3338 3339 bl warp_filter_horz_neon 3340 srshr v24.8h, v0.8h, #3 3341 bl warp_filter_horz_neon 3342 srshr v25.8h, v0.8h, #3 3343 bl warp_filter_horz_neon 3344 srshr v26.8h, v0.8h, #3 3345 bl warp_filter_horz_neon 3346 srshr v27.8h, v0.8h, #3 3347 bl warp_filter_horz_neon 3348 srshr v28.8h, v0.8h, #3 3349 bl warp_filter_horz_neon 3350 srshr v29.8h, v0.8h, #3 3351 bl warp_filter_horz_neon 3352 srshr v30.8h, v0.8h, #3 3353 33541: 3355 add w14, w6, #512 3356 bl warp_filter_horz_neon 3357 srshr v31.8h, v0.8h, #3 3358 3359 load_filter_row d0, w14, w9 3360 load_filter_row d1, w14, w9 3361 load_filter_row d2, w14, w9 3362 load_filter_row d3, w14, w9 3363 load_filter_row d4, w14, w9 3364 load_filter_row d5, w14, w9 3365 load_filter_row d6, w14, w9 3366 load_filter_row d7, w14, w9 3367 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl 3368 3369 // This ordering of smull/smlal/smull2/smlal2 is highly 3370 // beneficial for Cortex A53 here. 3371 smull v16.4s, v24.4h, v0.4h 3372 smlal v16.4s, v25.4h, v1.4h 3373 smlal v16.4s, v26.4h, v2.4h 3374 smlal v16.4s, v27.4h, v3.4h 3375 smlal v16.4s, v28.4h, v4.4h 3376 smlal v16.4s, v29.4h, v5.4h 3377 smlal v16.4s, v30.4h, v6.4h 3378 smlal v16.4s, v31.4h, v7.4h 3379 smull2 v17.4s, v24.8h, v0.8h 3380 smlal2 v17.4s, v25.8h, v1.8h 3381 smlal2 v17.4s, v26.8h, v2.8h 3382 smlal2 v17.4s, v27.8h, v3.8h 3383 smlal2 v17.4s, v28.8h, v4.8h 3384 smlal2 v17.4s, v29.8h, v5.8h 3385 smlal2 v17.4s, v30.8h, v6.8h 3386 smlal2 v17.4s, v31.8h, v7.8h 3387 3388 mov v24.16b, v25.16b 3389 mov v25.16b, v26.16b 3390 sqrshrn v16.4h, v16.4s, #\shift 3391 mov v26.16b, v27.16b 3392 sqrshrn2 v16.8h, v17.4s, #\shift 3393 mov v27.16b, v28.16b 3394 mov v28.16b, v29.16b 3395 add v16.8h, v16.8h, v23.8h 3396.ifb \t 3397 sqxtun v16.8b, v16.8h 3398.endif 3399 mov v29.16b, v30.16b 3400 mov v30.16b, v31.16b 3401 subs w10, w10, #1 3402.ifnb \t 3403 st1 {v16.8h}, [x0], x1 3404.else 3405 st1 {v16.8b}, [x0], x1 3406.endif 3407 3408 add w6, w6, w4 3409 b.gt 1b 3410 3411 ret x15 3412endfunc 3413.endm 3414 3415warp , 11 3416warp t, 7 3417 3418// void dav1d_emu_edge_8bpc_neon( 3419// const intptr_t bw, const intptr_t bh, 3420// const intptr_t iw, const intptr_t ih, 3421// const intptr_t x, const intptr_t y, 3422// pixel *dst, const ptrdiff_t dst_stride, 3423// const pixel *ref, const ptrdiff_t ref_stride) 3424function emu_edge_8bpc_neon, export=1 3425 ldp x8, x9, [sp] 3426 3427 // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) 3428 // ref += iclip(x, 0, iw - 1) 3429 sub x12, x3, #1 // ih - 1 3430 cmp x5, x3 3431 sub x13, x2, #1 // iw - 1 3432 csel x12, x12, x5, ge // min(y, ih - 1) 3433 cmp x4, x2 3434 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) 3435 csel x13, x13, x4, ge // min(x, iw - 1) 3436 bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) 3437 madd x8, x12, x9, x8 // ref += iclip() * stride 3438 add x8, x8, x13 // ref += iclip() 3439 3440 // bottom_ext = iclip(y + bh - ih, 0, bh - 1) 3441 // top_ext = iclip(-y, 0, bh - 1) 3442 add x10, x5, x1 // y + bh 3443 neg x5, x5 // -y 3444 sub x10, x10, x3 // y + bh - ih 3445 sub x12, x1, #1 // bh - 1 3446 cmp x10, x1 3447 bic x5, x5, x5, asr #63 // max(-y, 0) 3448 csel x10, x10, x12, lt // min(y + bh - ih, bh-1) 3449 cmp x5, x1 3450 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) 3451 csel x5, x5, x12, lt // min(max(-y, 0), bh-1) 3452 3453 // right_ext = iclip(x + bw - iw, 0, bw - 1) 3454 // left_ext = iclip(-x, 0, bw - 1) 3455 add x11, x4, x0 // x + bw 3456 neg x4, x4 // -x 3457 sub x11, x11, x2 // x + bw - iw 3458 sub x13, x0, #1 // bw - 1 3459 cmp x11, x0 3460 bic x4, x4, x4, asr #63 // max(-x, 0) 3461 csel x11, x11, x13, lt // min(x + bw - iw, bw-1) 3462 cmp x4, x0 3463 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) 3464 csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) 3465 3466 // center_h = bh - top_ext - bottom_ext 3467 // dst += top_ext * PXSTRIDE(dst_stride) 3468 // center_w = bw - left_ext - right_ext 3469 sub x1, x1, x5 // bh - top_ext 3470 madd x6, x5, x7, x6 3471 sub x2, x0, x4 // bw - left_ext 3472 sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext 3473 sub x2, x2, x11 // center_w = bw - left_ext - right_ext 3474 3475 mov x14, x6 // backup of dst 3476 3477.macro v_loop need_left, need_right 34780: 3479.if \need_left 3480 ld1r {v0.16b}, [x8] 3481 mov x12, x6 // out = dst 3482 mov x3, x4 34831: 3484 subs x3, x3, #16 3485 st1 {v0.16b}, [x12], #16 3486 b.gt 1b 3487.endif 3488 mov x13, x8 3489 add x12, x6, x4 // out = dst + left_ext 3490 mov x3, x2 34911: 3492 ld1 {v0.16b, v1.16b}, [x13], #32 3493 subs x3, x3, #32 3494 st1 {v0.16b, v1.16b}, [x12], #32 3495 b.gt 1b 3496.if \need_right 3497 add x3, x8, x2 // in + center_w 3498 sub x3, x3, #1 // in + center_w - 1 3499 add x12, x6, x4 // dst + left_ext 3500 ld1r {v0.16b}, [x3] 3501 add x12, x12, x2 // out = dst + left_ext + center_w 3502 mov x3, x11 35031: 3504 subs x3, x3, #16 3505 st1 {v0.16b}, [x12], #16 3506 b.gt 1b 3507.endif 3508 3509 subs x1, x1, #1 // center_h-- 3510 add x6, x6, x7 3511 add x8, x8, x9 3512 b.gt 0b 3513.endm 3514 3515 cbz x4, 2f 3516 // need_left 3517 cbz x11, 3f 3518 // need_left + need_right 3519 v_loop 1, 1 3520 b 5f 3521 35222: 3523 // !need_left 3524 cbz x11, 4f 3525 // !need_left + need_right 3526 v_loop 0, 1 3527 b 5f 3528 35293: 3530 // need_left + !need_right 3531 v_loop 1, 0 3532 b 5f 3533 35344: 3535 // !need_left + !need_right 3536 v_loop 0, 0 3537 35385: 3539 3540 cbz x10, 3f 3541 // need_bottom 3542 sub x8, x6, x7 // ref = dst - stride 3543 mov x4, x0 35441: 3545 ld1 {v0.16b, v1.16b}, [x8], #32 3546 mov x3, x10 35472: 3548 subs x3, x3, #1 3549 st1 {v0.16b, v1.16b}, [x6], x7 3550 b.gt 2b 3551 msub x6, x7, x10, x6 // dst -= bottom_ext * stride 3552 subs x4, x4, #32 // bw -= 32 3553 add x6, x6, #32 // dst += 32 3554 b.gt 1b 3555 35563: 3557 cbz x5, 3f 3558 // need_top 3559 msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 35601: 3561 ld1 {v0.16b, v1.16b}, [x14], #32 3562 mov x3, x5 35632: 3564 subs x3, x3, #1 3565 st1 {v0.16b, v1.16b}, [x6], x7 3566 b.gt 2b 3567 msub x6, x7, x5, x6 // dst -= top_ext * stride 3568 subs x0, x0, #32 // bw -= 32 3569 add x6, x6, #32 // dst += 32 3570 b.gt 1b 3571 35723: 3573 ret 3574endfunc 3575