1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2019, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "src/arm/asm.S" 29#include "util.S" 30 31// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, 32// const pixel *const topleft, 33// const int width, const int height, const int a, 34// const int max_width, const int max_height); 35function ipred_dc_128_8bpc_neon, export=1 36 clz w3, w3 37 movrel x5, ipred_dc_128_tbl 38 sub w3, w3, #25 39 ldrsw x3, [x5, w3, uxtw #2] 40 movi v0.16b, #128 41 add x5, x5, x3 42 add x6, x0, x1 43 lsl x1, x1, #1 44 br x5 4540: 46 AARCH64_VALID_JUMP_TARGET 474: 48 st1 {v0.s}[0], [x0], x1 49 st1 {v0.s}[0], [x6], x1 50 subs w4, w4, #4 51 st1 {v0.s}[0], [x0], x1 52 st1 {v0.s}[0], [x6], x1 53 b.gt 4b 54 ret 5580: 56 AARCH64_VALID_JUMP_TARGET 578: 58 st1 {v0.8b}, [x0], x1 59 st1 {v0.8b}, [x6], x1 60 subs w4, w4, #4 61 st1 {v0.8b}, [x0], x1 62 st1 {v0.8b}, [x6], x1 63 b.gt 8b 64 ret 65160: 66 AARCH64_VALID_JUMP_TARGET 6716: 68 st1 {v0.16b}, [x0], x1 69 st1 {v0.16b}, [x6], x1 70 subs w4, w4, #4 71 st1 {v0.16b}, [x0], x1 72 st1 {v0.16b}, [x6], x1 73 b.gt 16b 74 ret 75320: 76 AARCH64_VALID_JUMP_TARGET 77 movi v1.16b, #128 7832: 79 st1 {v0.16b, v1.16b}, [x0], x1 80 st1 {v0.16b, v1.16b}, [x6], x1 81 subs w4, w4, #4 82 st1 {v0.16b, v1.16b}, [x0], x1 83 st1 {v0.16b, v1.16b}, [x6], x1 84 b.gt 32b 85 ret 86640: 87 AARCH64_VALID_JUMP_TARGET 88 movi v1.16b, #128 89 movi v2.16b, #128 90 movi v3.16b, #128 9164: 92 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 93 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 94 subs w4, w4, #4 95 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 96 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 97 b.gt 64b 98 ret 99endfunc 100 101jumptable ipred_dc_128_tbl 102 .word 640b - ipred_dc_128_tbl 103 .word 320b - ipred_dc_128_tbl 104 .word 160b - ipred_dc_128_tbl 105 .word 80b - ipred_dc_128_tbl 106 .word 40b - ipred_dc_128_tbl 107endjumptable 108 109// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, 110// const pixel *const topleft, 111// const int width, const int height, const int a, 112// const int max_width, const int max_height); 113function ipred_v_8bpc_neon, export=1 114 clz w3, w3 115 movrel x5, ipred_v_tbl 116 sub w3, w3, #25 117 ldrsw x3, [x5, w3, uxtw #2] 118 add x2, x2, #1 119 add x5, x5, x3 120 add x6, x0, x1 121 lsl x1, x1, #1 122 br x5 12340: 124 AARCH64_VALID_JUMP_TARGET 125 ld1 {v0.s}[0], [x2] 1264: 127 st1 {v0.s}[0], [x0], x1 128 st1 {v0.s}[0], [x6], x1 129 subs w4, w4, #4 130 st1 {v0.s}[0], [x0], x1 131 st1 {v0.s}[0], [x6], x1 132 b.gt 4b 133 ret 13480: 135 AARCH64_VALID_JUMP_TARGET 136 ld1 {v0.8b}, [x2] 1378: 138 st1 {v0.8b}, [x0], x1 139 st1 {v0.8b}, [x6], x1 140 subs w4, w4, #4 141 st1 {v0.8b}, [x0], x1 142 st1 {v0.8b}, [x6], x1 143 b.gt 8b 144 ret 145160: 146 AARCH64_VALID_JUMP_TARGET 147 ld1 {v0.16b}, [x2] 14816: 149 st1 {v0.16b}, [x0], x1 150 st1 {v0.16b}, [x6], x1 151 subs w4, w4, #4 152 st1 {v0.16b}, [x0], x1 153 st1 {v0.16b}, [x6], x1 154 b.gt 16b 155 ret 156320: 157 AARCH64_VALID_JUMP_TARGET 158 ld1 {v0.16b, v1.16b}, [x2] 15932: 160 st1 {v0.16b, v1.16b}, [x0], x1 161 st1 {v0.16b, v1.16b}, [x6], x1 162 subs w4, w4, #4 163 st1 {v0.16b, v1.16b}, [x0], x1 164 st1 {v0.16b, v1.16b}, [x6], x1 165 b.gt 32b 166 ret 167640: 168 AARCH64_VALID_JUMP_TARGET 169 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 17064: 171 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 172 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 173 subs w4, w4, #4 174 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 175 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 176 b.gt 64b 177 ret 178endfunc 179 180jumptable ipred_v_tbl 181 .word 640b - ipred_v_tbl 182 .word 320b - ipred_v_tbl 183 .word 160b - ipred_v_tbl 184 .word 80b - ipred_v_tbl 185 .word 40b - ipred_v_tbl 186endjumptable 187 188// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, 189// const pixel *const topleft, 190// const int width, const int height, const int a, 191// const int max_width, const int max_height); 192function ipred_h_8bpc_neon, export=1 193 clz w3, w3 194 movrel x5, ipred_h_tbl 195 sub w3, w3, #25 196 ldrsw x3, [x5, w3, uxtw #2] 197 sub x2, x2, #4 198 add x5, x5, x3 199 mov x7, #-4 200 add x6, x0, x1 201 lsl x1, x1, #1 202 br x5 20340: 204 AARCH64_VALID_JUMP_TARGET 2054: 206 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 207 st1 {v3.s}[0], [x0], x1 208 st1 {v2.s}[0], [x6], x1 209 subs w4, w4, #4 210 st1 {v1.s}[0], [x0], x1 211 st1 {v0.s}[0], [x6], x1 212 b.gt 4b 213 ret 21480: 215 AARCH64_VALID_JUMP_TARGET 2168: 217 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 218 st1 {v3.8b}, [x0], x1 219 st1 {v2.8b}, [x6], x1 220 subs w4, w4, #4 221 st1 {v1.8b}, [x0], x1 222 st1 {v0.8b}, [x6], x1 223 b.gt 8b 224 ret 225160: 226 AARCH64_VALID_JUMP_TARGET 22716: 228 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 229 st1 {v3.16b}, [x0], x1 230 st1 {v2.16b}, [x6], x1 231 subs w4, w4, #4 232 st1 {v1.16b}, [x0], x1 233 st1 {v0.16b}, [x6], x1 234 b.gt 16b 235 ret 236320: 237 AARCH64_VALID_JUMP_TARGET 23832: 239 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 240 str q3, [x0, #16] 241 str q2, [x6, #16] 242 st1 {v3.16b}, [x0], x1 243 st1 {v2.16b}, [x6], x1 244 subs w4, w4, #4 245 str q1, [x0, #16] 246 str q0, [x6, #16] 247 st1 {v1.16b}, [x0], x1 248 st1 {v0.16b}, [x6], x1 249 b.gt 32b 250 ret 251640: 252 AARCH64_VALID_JUMP_TARGET 25364: 254 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 255 str q3, [x0, #16] 256 str q2, [x6, #16] 257 stp q3, q3, [x0, #32] 258 stp q2, q2, [x6, #32] 259 st1 {v3.16b}, [x0], x1 260 st1 {v2.16b}, [x6], x1 261 subs w4, w4, #4 262 str q1, [x0, #16] 263 str q0, [x6, #16] 264 stp q1, q1, [x0, #32] 265 stp q0, q0, [x6, #32] 266 st1 {v1.16b}, [x0], x1 267 st1 {v0.16b}, [x6], x1 268 b.gt 64b 269 ret 270endfunc 271 272jumptable ipred_h_tbl 273 .word 640b - ipred_h_tbl 274 .word 320b - ipred_h_tbl 275 .word 160b - ipred_h_tbl 276 .word 80b - ipred_h_tbl 277 .word 40b - ipred_h_tbl 278endjumptable 279 280// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, 281// const pixel *const topleft, 282// const int width, const int height, const int a, 283// const int max_width, const int max_height); 284function ipred_dc_top_8bpc_neon, export=1 285 clz w3, w3 286 movrel x5, ipred_dc_top_tbl 287 sub w3, w3, #25 288 ldrsw x3, [x5, w3, uxtw #2] 289 add x2, x2, #1 290 add x5, x5, x3 291 add x6, x0, x1 292 lsl x1, x1, #1 293 br x5 29440: 295 AARCH64_VALID_JUMP_TARGET 296 ld1r {v0.2s}, [x2] 297 uaddlv h0, v0.8b 298 rshrn v0.8b, v0.8h, #3 299 dup v0.8b, v0.b[0] 3004: 301 st1 {v0.s}[0], [x0], x1 302 st1 {v0.s}[0], [x6], x1 303 subs w4, w4, #4 304 st1 {v0.s}[0], [x0], x1 305 st1 {v0.s}[0], [x6], x1 306 b.gt 4b 307 ret 30880: 309 AARCH64_VALID_JUMP_TARGET 310 ld1 {v0.8b}, [x2] 311 uaddlv h0, v0.8b 312 rshrn v0.8b, v0.8h, #3 313 dup v0.8b, v0.b[0] 3148: 315 st1 {v0.8b}, [x0], x1 316 st1 {v0.8b}, [x6], x1 317 subs w4, w4, #4 318 st1 {v0.8b}, [x0], x1 319 st1 {v0.8b}, [x6], x1 320 b.gt 8b 321 ret 322160: 323 AARCH64_VALID_JUMP_TARGET 324 ld1 {v0.16b}, [x2] 325 uaddlv h0, v0.16b 326 rshrn v0.8b, v0.8h, #4 327 dup v0.16b, v0.b[0] 32816: 329 st1 {v0.16b}, [x0], x1 330 st1 {v0.16b}, [x6], x1 331 subs w4, w4, #4 332 st1 {v0.16b}, [x0], x1 333 st1 {v0.16b}, [x6], x1 334 b.gt 16b 335 ret 336320: 337 AARCH64_VALID_JUMP_TARGET 338 ld1 {v0.16b, v1.16b}, [x2] 339 uaddlv h0, v0.16b 340 uaddlv h1, v1.16b 341 add v2.4h, v0.4h, v1.4h 342 rshrn v2.8b, v2.8h, #5 343 dup v0.16b, v2.b[0] 344 dup v1.16b, v2.b[0] 34532: 346 st1 {v0.16b, v1.16b}, [x0], x1 347 st1 {v0.16b, v1.16b}, [x6], x1 348 subs w4, w4, #4 349 st1 {v0.16b, v1.16b}, [x0], x1 350 st1 {v0.16b, v1.16b}, [x6], x1 351 b.gt 32b 352 ret 353640: 354 AARCH64_VALID_JUMP_TARGET 355 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 356 uaddlv h0, v0.16b 357 uaddlv h1, v1.16b 358 uaddlv h2, v2.16b 359 uaddlv h3, v3.16b 360 add v4.4h, v0.4h, v1.4h 361 add v5.4h, v2.4h, v3.4h 362 add v4.4h, v4.4h, v5.4h 363 rshrn v4.8b, v4.8h, #6 364 dup v0.16b, v4.b[0] 365 dup v1.16b, v4.b[0] 366 dup v2.16b, v4.b[0] 367 dup v3.16b, v4.b[0] 36864: 369 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 370 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 371 subs w4, w4, #4 372 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 373 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 374 b.gt 64b 375 ret 376endfunc 377 378jumptable ipred_dc_top_tbl 379 .word 640b - ipred_dc_top_tbl 380 .word 320b - ipred_dc_top_tbl 381 .word 160b - ipred_dc_top_tbl 382 .word 80b - ipred_dc_top_tbl 383 .word 40b - ipred_dc_top_tbl 384endjumptable 385 386// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, 387// const pixel *const topleft, 388// const int width, const int height, const int a, 389// const int max_width, const int max_height); 390function ipred_dc_left_8bpc_neon, export=1 391 sub x2, x2, w4, uxtw 392 clz w3, w3 393 clz w7, w4 394 movrel x5, ipred_dc_left_tbl 395 sub w3, w3, #20 // 25 leading bits, minus table offset 5 396 sub w7, w7, #25 397 ldrsw x3, [x5, w3, uxtw #2] 398 ldrsw x7, [x5, w7, uxtw #2] 399 add x3, x5, x3 400 add x5, x5, x7 401 add x6, x0, x1 402 lsl x1, x1, #1 403 br x5 404 405L(ipred_dc_left_h4): 406 AARCH64_VALID_JUMP_TARGET 407 ld1r {v0.2s}, [x2] 408 uaddlv h0, v0.8b 409 rshrn v0.8b, v0.8h, #3 410 dup v0.16b, v0.b[0] 411 br x3 412L(ipred_dc_left_w4): 413 AARCH64_VALID_JUMP_TARGET 4141: 415 st1 {v0.s}[0], [x0], x1 416 st1 {v0.s}[0], [x6], x1 417 subs w4, w4, #4 418 st1 {v0.s}[0], [x0], x1 419 st1 {v0.s}[0], [x6], x1 420 b.gt 1b 421 ret 422 423L(ipred_dc_left_h8): 424 AARCH64_VALID_JUMP_TARGET 425 ld1 {v0.8b}, [x2] 426 uaddlv h0, v0.8b 427 rshrn v0.8b, v0.8h, #3 428 dup v0.16b, v0.b[0] 429 br x3 430L(ipred_dc_left_w8): 431 AARCH64_VALID_JUMP_TARGET 4321: 433 st1 {v0.8b}, [x0], x1 434 st1 {v0.8b}, [x6], x1 435 subs w4, w4, #4 436 st1 {v0.8b}, [x0], x1 437 st1 {v0.8b}, [x6], x1 438 b.gt 1b 439 ret 440 441L(ipred_dc_left_h16): 442 AARCH64_VALID_JUMP_TARGET 443 ld1 {v0.16b}, [x2] 444 uaddlv h0, v0.16b 445 rshrn v0.8b, v0.8h, #4 446 dup v0.16b, v0.b[0] 447 br x3 448L(ipred_dc_left_w16): 449 AARCH64_VALID_JUMP_TARGET 4501: 451 st1 {v0.16b}, [x0], x1 452 st1 {v0.16b}, [x6], x1 453 subs w4, w4, #4 454 st1 {v0.16b}, [x0], x1 455 st1 {v0.16b}, [x6], x1 456 b.gt 1b 457 ret 458 459L(ipred_dc_left_h32): 460 AARCH64_VALID_JUMP_TARGET 461 ld1 {v0.16b, v1.16b}, [x2] 462 uaddlv h0, v0.16b 463 uaddlv h1, v1.16b 464 add v0.4h, v0.4h, v1.4h 465 rshrn v0.8b, v0.8h, #5 466 dup v0.16b, v0.b[0] 467 br x3 468L(ipred_dc_left_w32): 469 AARCH64_VALID_JUMP_TARGET 470 mov v1.16b, v0.16b 4711: 472 st1 {v0.16b, v1.16b}, [x0], x1 473 st1 {v0.16b, v1.16b}, [x6], x1 474 subs w4, w4, #4 475 st1 {v0.16b, v1.16b}, [x0], x1 476 st1 {v0.16b, v1.16b}, [x6], x1 477 b.gt 1b 478 ret 479 480L(ipred_dc_left_h64): 481 AARCH64_VALID_JUMP_TARGET 482 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 483 uaddlv h0, v0.16b 484 uaddlv h1, v1.16b 485 uaddlv h2, v2.16b 486 uaddlv h3, v3.16b 487 add v0.4h, v0.4h, v1.4h 488 add v2.4h, v2.4h, v3.4h 489 add v0.4h, v0.4h, v2.4h 490 rshrn v0.8b, v0.8h, #6 491 dup v0.16b, v0.b[0] 492 br x3 493L(ipred_dc_left_w64): 494 AARCH64_VALID_JUMP_TARGET 495 mov v1.16b, v0.16b 496 mov v2.16b, v0.16b 497 mov v3.16b, v0.16b 4981: 499 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 500 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 501 subs w4, w4, #4 502 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 503 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 504 b.gt 1b 505 ret 506endfunc 507 508jumptable ipred_dc_left_tbl 509 .word L(ipred_dc_left_h64) - ipred_dc_left_tbl 510 .word L(ipred_dc_left_h32) - ipred_dc_left_tbl 511 .word L(ipred_dc_left_h16) - ipred_dc_left_tbl 512 .word L(ipred_dc_left_h8) - ipred_dc_left_tbl 513 .word L(ipred_dc_left_h4) - ipred_dc_left_tbl 514 .word L(ipred_dc_left_w64) - ipred_dc_left_tbl 515 .word L(ipred_dc_left_w32) - ipred_dc_left_tbl 516 .word L(ipred_dc_left_w16) - ipred_dc_left_tbl 517 .word L(ipred_dc_left_w8) - ipred_dc_left_tbl 518 .word L(ipred_dc_left_w4) - ipred_dc_left_tbl 519endjumptable 520 521// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, 522// const pixel *const topleft, 523// const int width, const int height, const int a, 524// const int max_width, const int max_height); 525function ipred_dc_8bpc_neon, export=1 526 sub x2, x2, w4, uxtw 527 add w7, w3, w4 // width + height 528 clz w3, w3 529 clz w6, w4 530 dup v16.8h, w7 // width + height 531 movrel x5, ipred_dc_tbl 532 rbit w7, w7 // rbit(width + height) 533 sub w3, w3, #20 // 25 leading bits, minus table offset 5 534 sub w6, w6, #25 535 clz w7, w7 // ctz(width + height) 536 ldrsw x3, [x5, w3, uxtw #2] 537 ldrsw x6, [x5, w6, uxtw #2] 538 neg w7, w7 // -ctz(width + height) 539 add x3, x5, x3 540 add x5, x5, x6 541 ushr v16.8h, v16.8h, #1 // (width + height) >> 1 542 dup v17.8h, w7 // -ctz(width + height) 543 add x6, x0, x1 544 lsl x1, x1, #1 545 br x5 546 547L(ipred_dc_h4): 548 AARCH64_VALID_JUMP_TARGET 549 ld1 {v0.s}[0], [x2], #4 550 ins v0.s[1], wzr 551 uaddlv h0, v0.8b 552 add x2, x2, #1 553 br x3 554L(ipred_dc_w4): 555 AARCH64_VALID_JUMP_TARGET 556 ld1 {v1.s}[0], [x2] 557 ins v1.s[1], wzr 558 add v0.4h, v0.4h, v16.4h 559 uaddlv h1, v1.8b 560 cmp w4, #4 561 add v0.4h, v0.4h, v1.4h 562 ushl v0.4h, v0.4h, v17.4h 563 b.eq 1f 564 // h = 8/16 565 mov w16, #(0x3334/2) 566 movk w16, #(0x5556/2), lsl #16 567 add w17, w4, w4 // w17 = 2*h = 16 or 32 568 lsr w16, w16, w17 569 dup v16.4h, w16 570 sqdmulh v0.4h, v0.4h, v16.4h 5711: 572 dup v0.8b, v0.b[0] 5732: 574 st1 {v0.s}[0], [x0], x1 575 st1 {v0.s}[0], [x6], x1 576 subs w4, w4, #4 577 st1 {v0.s}[0], [x0], x1 578 st1 {v0.s}[0], [x6], x1 579 b.gt 2b 580 ret 581 582L(ipred_dc_h8): 583 AARCH64_VALID_JUMP_TARGET 584 ld1 {v0.8b}, [x2], #8 585 uaddlv h0, v0.8b 586 add x2, x2, #1 587 br x3 588L(ipred_dc_w8): 589 AARCH64_VALID_JUMP_TARGET 590 ld1 {v1.8b}, [x2] 591 add v0.4h, v0.4h, v16.4h 592 uaddlv h1, v1.8b 593 cmp w4, #8 594 add v0.4h, v0.4h, v1.4h 595 ushl v0.4h, v0.4h, v17.4h 596 b.eq 1f 597 // h = 4/16/32 598 cmp w4, #32 599 mov w16, #(0x3334/2) 600 mov w17, #(0x5556/2) 601 csel w16, w16, w17, eq 602 dup v16.4h, w16 603 sqdmulh v0.4h, v0.4h, v16.4h 6041: 605 dup v0.8b, v0.b[0] 6062: 607 st1 {v0.8b}, [x0], x1 608 st1 {v0.8b}, [x6], x1 609 subs w4, w4, #4 610 st1 {v0.8b}, [x0], x1 611 st1 {v0.8b}, [x6], x1 612 b.gt 2b 613 ret 614 615L(ipred_dc_h16): 616 AARCH64_VALID_JUMP_TARGET 617 ld1 {v0.16b}, [x2], #16 618 uaddlv h0, v0.16b 619 add x2, x2, #1 620 br x3 621L(ipred_dc_w16): 622 AARCH64_VALID_JUMP_TARGET 623 ld1 {v1.16b}, [x2] 624 add v0.4h, v0.4h, v16.4h 625 uaddlv h1, v1.16b 626 cmp w4, #16 627 add v0.4h, v0.4h, v1.4h 628 ushl v0.4h, v0.4h, v17.4h 629 b.eq 1f 630 // h = 4/8/32/64 631 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask 632 mov w16, #(0x3334/2) 633 mov w17, #(0x5556/2) 634 csel w16, w16, w17, eq 635 dup v16.4h, w16 636 sqdmulh v0.4h, v0.4h, v16.4h 6371: 638 dup v0.16b, v0.b[0] 6392: 640 st1 {v0.16b}, [x0], x1 641 st1 {v0.16b}, [x6], x1 642 subs w4, w4, #4 643 st1 {v0.16b}, [x0], x1 644 st1 {v0.16b}, [x6], x1 645 b.gt 2b 646 ret 647 648L(ipred_dc_h32): 649 AARCH64_VALID_JUMP_TARGET 650 ld1 {v0.16b, v1.16b}, [x2], #32 651 uaddlv h0, v0.16b 652 uaddlv h1, v1.16b 653 add x2, x2, #1 654 add v0.4h, v0.4h, v1.4h 655 br x3 656L(ipred_dc_w32): 657 AARCH64_VALID_JUMP_TARGET 658 ld1 {v1.16b, v2.16b}, [x2] 659 add v0.4h, v0.4h, v16.4h 660 uaddlv h1, v1.16b 661 uaddlv h2, v2.16b 662 cmp w4, #32 663 add v0.4h, v0.4h, v1.4h 664 add v0.4h, v0.4h, v2.4h 665 ushl v4.4h, v0.4h, v17.4h 666 b.eq 1f 667 // h = 8/16/64 668 cmp w4, #8 669 mov w16, #(0x3334/2) 670 mov w17, #(0x5556/2) 671 csel w16, w16, w17, eq 672 dup v16.4h, w16 673 sqdmulh v4.4h, v4.4h, v16.4h 6741: 675 dup v0.16b, v4.b[0] 676 dup v1.16b, v4.b[0] 6772: 678 st1 {v0.16b, v1.16b}, [x0], x1 679 st1 {v0.16b, v1.16b}, [x6], x1 680 subs w4, w4, #4 681 st1 {v0.16b, v1.16b}, [x0], x1 682 st1 {v0.16b, v1.16b}, [x6], x1 683 b.gt 2b 684 ret 685 686L(ipred_dc_h64): 687 AARCH64_VALID_JUMP_TARGET 688 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 689 uaddlv h0, v0.16b 690 uaddlv h1, v1.16b 691 uaddlv h2, v2.16b 692 uaddlv h3, v3.16b 693 add v0.4h, v0.4h, v1.4h 694 add v2.4h, v2.4h, v3.4h 695 add x2, x2, #1 696 add v0.4h, v0.4h, v2.4h 697 br x3 698L(ipred_dc_w64): 699 AARCH64_VALID_JUMP_TARGET 700 ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] 701 add v0.4h, v0.4h, v16.4h 702 uaddlv h1, v1.16b 703 uaddlv h2, v2.16b 704 uaddlv h3, v3.16b 705 uaddlv h4, v4.16b 706 add v1.4h, v1.4h, v2.4h 707 add v3.4h, v3.4h, v4.4h 708 cmp w4, #64 709 add v0.4h, v0.4h, v1.4h 710 add v0.4h, v0.4h, v3.4h 711 ushl v4.4h, v0.4h, v17.4h 712 b.eq 1f 713 // h = 16/32 714 mov w16, #(0x5556/2) 715 movk w16, #(0x3334/2), lsl #16 716 lsr w16, w16, w4 717 dup v16.4h, w16 718 sqdmulh v4.4h, v4.4h, v16.4h 7191: 720 dup v0.16b, v4.b[0] 721 dup v1.16b, v4.b[0] 722 dup v2.16b, v4.b[0] 723 dup v3.16b, v4.b[0] 7242: 725 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 726 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 727 subs w4, w4, #4 728 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 729 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 730 b.gt 2b 731 ret 732endfunc 733 734jumptable ipred_dc_tbl 735 .word L(ipred_dc_h64) - ipred_dc_tbl 736 .word L(ipred_dc_h32) - ipred_dc_tbl 737 .word L(ipred_dc_h16) - ipred_dc_tbl 738 .word L(ipred_dc_h8) - ipred_dc_tbl 739 .word L(ipred_dc_h4) - ipred_dc_tbl 740 .word L(ipred_dc_w64) - ipred_dc_tbl 741 .word L(ipred_dc_w32) - ipred_dc_tbl 742 .word L(ipred_dc_w16) - ipred_dc_tbl 743 .word L(ipred_dc_w8) - ipred_dc_tbl 744 .word L(ipred_dc_w4) - ipred_dc_tbl 745endjumptable 746 747// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, 748// const pixel *const topleft, 749// const int width, const int height, const int a, 750// const int max_width, const int max_height); 751function ipred_paeth_8bpc_neon, export=1 752 clz w9, w3 753 movrel x5, ipred_paeth_tbl 754 sub w9, w9, #25 755 ldrsw x9, [x5, w9, uxtw #2] 756 ld1r {v4.16b}, [x2] 757 add x8, x2, #1 758 sub x2, x2, #4 759 add x5, x5, x9 760 mov x7, #-4 761 add x6, x0, x1 762 lsl x1, x1, #1 763 br x5 76440: 765 AARCH64_VALID_JUMP_TARGET 766 ld1r {v5.4s}, [x8] 767 usubl v6.8h, v5.8b, v4.8b // top - topleft 7684: 769 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 770 zip1 v0.2s, v0.2s, v1.2s 771 zip1 v2.2s, v2.2s, v3.2s 772 uaddw v16.8h, v6.8h, v0.8b 773 uaddw v17.8h, v6.8h, v2.8b 774 sqxtun v16.8b, v16.8h // base 775 sqxtun2 v16.16b, v17.8h 776 zip1 v0.2d, v0.2d, v2.2d 777 uabd v20.16b, v5.16b, v16.16b // tdiff 778 uabd v22.16b, v4.16b, v16.16b // tldiff 779 uabd v16.16b, v0.16b, v16.16b // ldiff 780 umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) 781 cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff 782 cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff 783 bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 784 bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ... 785 st1 {v20.s}[3], [x0], x1 786 st1 {v20.s}[2], [x6], x1 787 subs w4, w4, #4 788 st1 {v20.s}[1], [x0], x1 789 st1 {v20.s}[0], [x6], x1 790 b.gt 4b 791 ret 79280: 793 AARCH64_VALID_JUMP_TARGET 794 ld1r {v5.2d}, [x8] 795 usubl v6.8h, v5.8b, v4.8b // top - topleft 7968: 797 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 798 uaddw v16.8h, v6.8h, v0.8b 799 uaddw v17.8h, v6.8h, v1.8b 800 uaddw v18.8h, v6.8h, v2.8b 801 uaddw v19.8h, v6.8h, v3.8b 802 sqxtun v16.8b, v16.8h // base 803 sqxtun2 v16.16b, v17.8h 804 sqxtun v18.8b, v18.8h 805 sqxtun2 v18.16b, v19.8h 806 zip1 v2.2d, v2.2d, v3.2d 807 zip1 v0.2d, v0.2d, v1.2d 808 uabd v21.16b, v5.16b, v18.16b // tdiff 809 uabd v20.16b, v5.16b, v16.16b 810 uabd v23.16b, v4.16b, v18.16b // tldiff 811 uabd v22.16b, v4.16b, v16.16b 812 uabd v17.16b, v2.16b, v18.16b // ldiff 813 uabd v16.16b, v0.16b, v16.16b 814 umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) 815 umin v18.16b, v20.16b, v22.16b 816 cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff 817 cmhs v20.16b, v22.16b, v20.16b 818 cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff 819 cmhs v16.16b, v18.16b, v16.16b 820 bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 821 bsl v20.16b, v5.16b, v4.16b 822 bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 823 bit v20.16b, v0.16b, v16.16b 824 st1 {v21.d}[1], [x0], x1 825 st1 {v21.d}[0], [x6], x1 826 subs w4, w4, #4 827 st1 {v20.d}[1], [x0], x1 828 st1 {v20.d}[0], [x6], x1 829 b.gt 8b 830 ret 831160: 832320: 833640: 834 AARCH64_VALID_JUMP_TARGET 835 ld1 {v5.16b}, [x8], #16 836 mov w9, w3 837 // Set up pointers for four rows in parallel; x0, x6, x5, x10 838 add x5, x0, x1 839 add x10, x6, x1 840 lsl x1, x1, #1 841 sub x1, x1, w3, uxtw 8421: 843 ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 8442: 845 usubl v6.8h, v5.8b, v4.8b // top - topleft 846 usubl2 v7.8h, v5.16b, v4.16b 847 uaddw v24.8h, v6.8h, v0.8b 848 uaddw v25.8h, v7.8h, v0.8b 849 uaddw v26.8h, v6.8h, v1.8b 850 uaddw v27.8h, v7.8h, v1.8b 851 uaddw v28.8h, v6.8h, v2.8b 852 uaddw v29.8h, v7.8h, v2.8b 853 uaddw v30.8h, v6.8h, v3.8b 854 uaddw v31.8h, v7.8h, v3.8b 855 sqxtun v17.8b, v26.8h // base 856 sqxtun2 v17.16b, v27.8h 857 sqxtun v16.8b, v24.8h 858 sqxtun2 v16.16b, v25.8h 859 sqxtun v19.8b, v30.8h 860 sqxtun2 v19.16b, v31.8h 861 sqxtun v18.8b, v28.8h 862 sqxtun2 v18.16b, v29.8h 863 uabd v23.16b, v5.16b, v19.16b // tdiff 864 uabd v22.16b, v5.16b, v18.16b 865 uabd v21.16b, v5.16b, v17.16b 866 uabd v20.16b, v5.16b, v16.16b 867 uabd v27.16b, v4.16b, v19.16b // tldiff 868 uabd v26.16b, v4.16b, v18.16b 869 uabd v25.16b, v4.16b, v17.16b 870 uabd v24.16b, v4.16b, v16.16b 871 uabd v19.16b, v3.16b, v19.16b // ldiff 872 uabd v18.16b, v2.16b, v18.16b 873 uabd v17.16b, v1.16b, v17.16b 874 uabd v16.16b, v0.16b, v16.16b 875 umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) 876 umin v30.16b, v22.16b, v26.16b 877 umin v29.16b, v21.16b, v25.16b 878 umin v28.16b, v20.16b, v24.16b 879 cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff 880 cmhs v22.16b, v26.16b, v22.16b 881 cmhs v21.16b, v25.16b, v21.16b 882 cmhs v20.16b, v24.16b, v20.16b 883 cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff 884 cmhs v18.16b, v30.16b, v18.16b 885 cmhs v17.16b, v29.16b, v17.16b 886 cmhs v16.16b, v28.16b, v16.16b 887 bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft 888 bsl v22.16b, v5.16b, v4.16b 889 bsl v21.16b, v5.16b, v4.16b 890 bsl v20.16b, v5.16b, v4.16b 891 bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 892 bit v22.16b, v2.16b, v18.16b 893 bit v21.16b, v1.16b, v17.16b 894 bit v20.16b, v0.16b, v16.16b 895 subs w3, w3, #16 896 st1 {v23.16b}, [x0], #16 897 st1 {v22.16b}, [x6], #16 898 st1 {v21.16b}, [x5], #16 899 st1 {v20.16b}, [x10], #16 900 b.le 8f 901 ld1 {v5.16b}, [x8], #16 902 b 2b 9038: 904 subs w4, w4, #4 905 b.le 9f 906 // End of horizontal loop, move pointers to next four rows 907 sub x8, x8, w9, uxtw 908 add x0, x0, x1 909 add x6, x6, x1 910 // Load the top row as early as possible 911 ld1 {v5.16b}, [x8], #16 912 add x5, x5, x1 913 add x10, x10, x1 914 mov w3, w9 915 b 1b 9169: 917 ret 918endfunc 919 920jumptable ipred_paeth_tbl 921 .word 640b - ipred_paeth_tbl 922 .word 320b - ipred_paeth_tbl 923 .word 160b - ipred_paeth_tbl 924 .word 80b - ipred_paeth_tbl 925 .word 40b - ipred_paeth_tbl 926endjumptable 927 928// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, 929// const pixel *const topleft, 930// const int width, const int height, const int a, 931// const int max_width, const int max_height); 932function ipred_smooth_8bpc_neon, export=1 933 movrel x10, X(sm_weights) 934 add x11, x10, w4, uxtw 935 add x10, x10, w3, uxtw 936 clz w9, w3 937 movrel x5, ipred_smooth_tbl 938 sub x12, x2, w4, uxtw 939 sub w9, w9, #25 940 ldrsw x9, [x5, w9, uxtw #2] 941 ld1r {v4.16b}, [x12] // bottom 942 add x8, x2, #1 943 add x5, x5, x9 944 add x6, x0, x1 945 lsl x1, x1, #1 946 br x5 94740: 948 AARCH64_VALID_JUMP_TARGET 949 ld1r {v6.2s}, [x8] // top 950 ld1r {v7.2s}, [x10] // weights_hor 951 sub x2, x2, #4 952 mov x7, #-4 953 dup v5.16b, v6.b[3] // right 954 usubl v6.8h, v6.8b, v4.8b // top-bottom 955 uxtl v7.8h, v7.8b // weights_hor 9564: 957 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left 958 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 959 shll v20.8h, v5.8b, #8 // right*256 960 shll v21.8h, v5.8b, #8 961 zip1 v1.2s, v1.2s, v0.2s // left, flipped 962 zip1 v0.2s, v3.2s, v2.2s 963 zip1 v16.2s, v16.2s, v17.2s // weights_ver 964 zip1 v18.2s, v18.2s, v19.2s 965 shll v22.8h, v4.8b, #8 // bottom*256 966 shll v23.8h, v4.8b, #8 967 usubl v0.8h, v0.8b, v5.8b // left-right 968 usubl v1.8h, v1.8b, v5.8b 969 uxtl v16.8h, v16.8b // weights_ver 970 uxtl v18.8h, v18.8b 971 mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor 972 mla v21.8h, v1.8h, v7.8h 973 mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 974 mla v23.8h, v6.8h, v18.8h 975 uhadd v20.8h, v20.8h, v22.8h 976 uhadd v21.8h, v21.8h, v23.8h 977 rshrn v20.8b, v20.8h, #8 978 rshrn v21.8b, v21.8h, #8 979 st1 {v20.s}[0], [x0], x1 980 st1 {v20.s}[1], [x6], x1 981 subs w4, w4, #4 982 st1 {v21.s}[0], [x0], x1 983 st1 {v21.s}[1], [x6], x1 984 b.gt 4b 985 ret 98680: 987 AARCH64_VALID_JUMP_TARGET 988 ld1 {v6.8b}, [x8] // top 989 ld1 {v7.8b}, [x10] // weights_hor 990 sub x2, x2, #4 991 mov x7, #-4 992 dup v5.16b, v6.b[7] // right 993 usubl v6.8h, v6.8b, v4.8b // top-bottom 994 uxtl v7.8h, v7.8b // weights_hor 9958: 996 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left 997 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver 998 shll v20.8h, v5.8b, #8 // right*256 999 shll v21.8h, v5.8b, #8 1000 shll v22.8h, v5.8b, #8 1001 shll v23.8h, v5.8b, #8 1002 usubl v0.8h, v0.8b, v5.8b // left-right 1003 usubl v1.8h, v1.8b, v5.8b 1004 usubl v2.8h, v2.8b, v5.8b 1005 usubl v3.8h, v3.8b, v5.8b 1006 shll v24.8h, v4.8b, #8 // bottom*256 1007 shll v25.8h, v4.8b, #8 1008 shll v26.8h, v4.8b, #8 1009 shll v27.8h, v4.8b, #8 1010 uxtl v16.8h, v16.8b // weights_ver 1011 uxtl v17.8h, v17.8b 1012 uxtl v18.8h, v18.8b 1013 uxtl v19.8h, v19.8b 1014 mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor 1015 mla v21.8h, v2.8h, v7.8h // (left flipped) 1016 mla v22.8h, v1.8h, v7.8h 1017 mla v23.8h, v0.8h, v7.8h 1018 mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1019 mla v25.8h, v6.8h, v17.8h 1020 mla v26.8h, v6.8h, v18.8h 1021 mla v27.8h, v6.8h, v19.8h 1022 uhadd v20.8h, v20.8h, v24.8h 1023 uhadd v21.8h, v21.8h, v25.8h 1024 uhadd v22.8h, v22.8h, v26.8h 1025 uhadd v23.8h, v23.8h, v27.8h 1026 rshrn v20.8b, v20.8h, #8 1027 rshrn v21.8b, v21.8h, #8 1028 rshrn v22.8b, v22.8h, #8 1029 rshrn v23.8b, v23.8h, #8 1030 st1 {v20.8b}, [x0], x1 1031 st1 {v21.8b}, [x6], x1 1032 subs w4, w4, #4 1033 st1 {v22.8b}, [x0], x1 1034 st1 {v23.8b}, [x6], x1 1035 b.gt 8b 1036 ret 1037160: 1038320: 1039640: 1040 AARCH64_VALID_JUMP_TARGET 1041 add x12, x2, w3, uxtw 1042 sub x2, x2, #2 1043 mov x7, #-2 1044 ld1r {v5.16b}, [x12] // right 1045 sub x1, x1, w3, uxtw 1046 mov w9, w3 1047 10481: 1049 ld2r {v0.8b, v1.8b}, [x2], x7 // left 1050 ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver 1051 usubl v0.8h, v0.8b, v5.8b // left-right 1052 usubl v1.8h, v1.8b, v5.8b 1053 uxtl v16.8h, v16.8b // weights_ver 1054 uxtl v17.8h, v17.8b 10552: 1056 ld1 {v7.16b}, [x10], #16 // weights_hor 1057 ld1 {v3.16b}, [x8], #16 // top 1058 shll v20.8h, v5.8b, #8 // right*256 1059 shll v21.8h, v5.8b, #8 1060 shll v22.8h, v5.8b, #8 1061 shll v23.8h, v5.8b, #8 1062 uxtl v6.8h, v7.8b // weights_hor 1063 uxtl2 v7.8h, v7.16b 1064 usubl v2.8h, v3.8b, v4.8b // top-bottom 1065 usubl2 v3.8h, v3.16b, v4.16b 1066 mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor 1067 mla v21.8h, v1.8h, v7.8h // (left flipped) 1068 mla v22.8h, v0.8h, v6.8h 1069 mla v23.8h, v0.8h, v7.8h 1070 shll v24.8h, v4.8b, #8 // bottom*256 1071 shll v25.8h, v4.8b, #8 1072 shll v26.8h, v4.8b, #8 1073 shll v27.8h, v4.8b, #8 1074 mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1075 mla v25.8h, v3.8h, v16.8h 1076 mla v26.8h, v2.8h, v17.8h 1077 mla v27.8h, v3.8h, v17.8h 1078 uhadd v20.8h, v20.8h, v24.8h 1079 uhadd v21.8h, v21.8h, v25.8h 1080 uhadd v22.8h, v22.8h, v26.8h 1081 uhadd v23.8h, v23.8h, v27.8h 1082 rshrn v20.8b, v20.8h, #8 1083 rshrn2 v20.16b, v21.8h, #8 1084 rshrn v22.8b, v22.8h, #8 1085 rshrn2 v22.16b, v23.8h, #8 1086 subs w3, w3, #16 1087 st1 {v20.16b}, [x0], #16 1088 st1 {v22.16b}, [x6], #16 1089 b.gt 2b 1090 subs w4, w4, #2 1091 b.le 9f 1092 sub x8, x8, w9, uxtw 1093 sub x10, x10, w9, uxtw 1094 add x0, x0, x1 1095 add x6, x6, x1 1096 mov w3, w9 1097 b 1b 10989: 1099 ret 1100endfunc 1101 1102jumptable ipred_smooth_tbl 1103 .word 640b - ipred_smooth_tbl 1104 .word 320b - ipred_smooth_tbl 1105 .word 160b - ipred_smooth_tbl 1106 .word 80b - ipred_smooth_tbl 1107 .word 40b - ipred_smooth_tbl 1108endjumptable 1109 1110// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, 1111// const pixel *const topleft, 1112// const int width, const int height, const int a, 1113// const int max_width, const int max_height); 1114function ipred_smooth_v_8bpc_neon, export=1 1115 movrel x7, X(sm_weights) 1116 add x7, x7, w4, uxtw 1117 clz w9, w3 1118 movrel x5, ipred_smooth_v_tbl 1119 sub x8, x2, w4, uxtw 1120 sub w9, w9, #25 1121 ldrsw x9, [x5, w9, uxtw #2] 1122 ld1r {v4.16b}, [x8] // bottom 1123 add x2, x2, #1 1124 add x5, x5, x9 1125 add x6, x0, x1 1126 lsl x1, x1, #1 1127 br x5 112840: 1129 AARCH64_VALID_JUMP_TARGET 1130 ld1r {v6.2s}, [x2] // top 1131 usubl v6.8h, v6.8b, v4.8b // top-bottom 11324: 1133 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1134 shll v22.8h, v4.8b, #8 // bottom*256 1135 shll v23.8h, v4.8b, #8 1136 zip1 v16.2s, v16.2s, v17.2s // weights_ver 1137 zip1 v18.2s, v18.2s, v19.2s 1138 uxtl v16.8h, v16.8b // weights_ver 1139 uxtl v18.8h, v18.8b 1140 mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1141 mla v23.8h, v6.8h, v18.8h 1142 rshrn v22.8b, v22.8h, #8 1143 rshrn v23.8b, v23.8h, #8 1144 st1 {v22.s}[0], [x0], x1 1145 st1 {v22.s}[1], [x6], x1 1146 subs w4, w4, #4 1147 st1 {v23.s}[0], [x0], x1 1148 st1 {v23.s}[1], [x6], x1 1149 b.gt 4b 1150 ret 115180: 1152 AARCH64_VALID_JUMP_TARGET 1153 ld1 {v6.8b}, [x2] // top 1154 usubl v6.8h, v6.8b, v4.8b // top-bottom 11558: 1156 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1157 shll v24.8h, v4.8b, #8 // bottom*256 1158 shll v25.8h, v4.8b, #8 1159 shll v26.8h, v4.8b, #8 1160 shll v27.8h, v4.8b, #8 1161 uxtl v16.8h, v16.8b // weights_ver 1162 uxtl v17.8h, v17.8b 1163 uxtl v18.8h, v18.8b 1164 uxtl v19.8h, v19.8b 1165 mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1166 mla v25.8h, v6.8h, v17.8h 1167 mla v26.8h, v6.8h, v18.8h 1168 mla v27.8h, v6.8h, v19.8h 1169 rshrn v24.8b, v24.8h, #8 1170 rshrn v25.8b, v25.8h, #8 1171 rshrn v26.8b, v26.8h, #8 1172 rshrn v27.8b, v27.8h, #8 1173 st1 {v24.8b}, [x0], x1 1174 st1 {v25.8b}, [x6], x1 1175 subs w4, w4, #4 1176 st1 {v26.8b}, [x0], x1 1177 st1 {v27.8b}, [x6], x1 1178 b.gt 8b 1179 ret 1180160: 1181320: 1182640: 1183 AARCH64_VALID_JUMP_TARGET 1184 // Set up pointers for four rows in parallel; x0, x6, x5, x8 1185 add x5, x0, x1 1186 add x8, x6, x1 1187 lsl x1, x1, #1 1188 sub x1, x1, w3, uxtw 1189 mov w9, w3 1190 11911: 1192 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver 1193 uxtl v16.8h, v16.8b // weights_ver 1194 uxtl v17.8h, v17.8b 1195 uxtl v18.8h, v18.8b 1196 uxtl v19.8h, v19.8b 11972: 1198 ld1 {v3.16b}, [x2], #16 // top 1199 shll v20.8h, v4.8b, #8 // bottom*256 1200 shll v21.8h, v4.8b, #8 1201 shll v22.8h, v4.8b, #8 1202 shll v23.8h, v4.8b, #8 1203 shll v24.8h, v4.8b, #8 1204 shll v25.8h, v4.8b, #8 1205 shll v26.8h, v4.8b, #8 1206 shll v27.8h, v4.8b, #8 1207 usubl v2.8h, v3.8b, v4.8b // top-bottom 1208 usubl2 v3.8h, v3.16b, v4.16b 1209 mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver 1210 mla v21.8h, v3.8h, v16.8h 1211 mla v22.8h, v2.8h, v17.8h 1212 mla v23.8h, v3.8h, v17.8h 1213 mla v24.8h, v2.8h, v18.8h 1214 mla v25.8h, v3.8h, v18.8h 1215 mla v26.8h, v2.8h, v19.8h 1216 mla v27.8h, v3.8h, v19.8h 1217 rshrn v20.8b, v20.8h, #8 1218 rshrn2 v20.16b, v21.8h, #8 1219 rshrn v22.8b, v22.8h, #8 1220 rshrn2 v22.16b, v23.8h, #8 1221 rshrn v24.8b, v24.8h, #8 1222 rshrn2 v24.16b, v25.8h, #8 1223 rshrn v26.8b, v26.8h, #8 1224 rshrn2 v26.16b, v27.8h, #8 1225 subs w3, w3, #16 1226 st1 {v20.16b}, [x0], #16 1227 st1 {v22.16b}, [x6], #16 1228 st1 {v24.16b}, [x5], #16 1229 st1 {v26.16b}, [x8], #16 1230 b.gt 2b 1231 subs w4, w4, #4 1232 b.le 9f 1233 sub x2, x2, w9, uxtw 1234 add x0, x0, x1 1235 add x6, x6, x1 1236 add x5, x5, x1 1237 add x8, x8, x1 1238 mov w3, w9 1239 b 1b 12409: 1241 ret 1242endfunc 1243 1244jumptable ipred_smooth_v_tbl 1245 .word 640b - ipred_smooth_v_tbl 1246 .word 320b - ipred_smooth_v_tbl 1247 .word 160b - ipred_smooth_v_tbl 1248 .word 80b - ipred_smooth_v_tbl 1249 .word 40b - ipred_smooth_v_tbl 1250endjumptable 1251 1252// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, 1253// const pixel *const topleft, 1254// const int width, const int height, const int a, 1255// const int max_width, const int max_height); 1256function ipred_smooth_h_8bpc_neon, export=1 1257 movrel x8, X(sm_weights) 1258 add x8, x8, w3, uxtw 1259 clz w9, w3 1260 movrel x5, ipred_smooth_h_tbl 1261 add x12, x2, w3, uxtw 1262 sub w9, w9, #25 1263 ldrsw x9, [x5, w9, uxtw #2] 1264 ld1r {v5.16b}, [x12] // right 1265 add x5, x5, x9 1266 add x6, x0, x1 1267 lsl x1, x1, #1 1268 br x5 126940: 1270 AARCH64_VALID_JUMP_TARGET 1271 ld1r {v7.2s}, [x8] // weights_hor 1272 sub x2, x2, #4 1273 mov x7, #-4 1274 uxtl v7.8h, v7.8b // weights_hor 12754: 1276 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left 1277 shll v20.8h, v5.8b, #8 // right*256 1278 shll v21.8h, v5.8b, #8 1279 zip1 v1.2s, v1.2s, v0.2s // left, flipped 1280 zip1 v0.2s, v3.2s, v2.2s 1281 usubl v0.8h, v0.8b, v5.8b // left-right 1282 usubl v1.8h, v1.8b, v5.8b 1283 mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor 1284 mla v21.8h, v1.8h, v7.8h 1285 rshrn v20.8b, v20.8h, #8 1286 rshrn v21.8b, v21.8h, #8 1287 st1 {v20.s}[0], [x0], x1 1288 st1 {v20.s}[1], [x6], x1 1289 subs w4, w4, #4 1290 st1 {v21.s}[0], [x0], x1 1291 st1 {v21.s}[1], [x6], x1 1292 b.gt 4b 1293 ret 129480: 1295 AARCH64_VALID_JUMP_TARGET 1296 ld1 {v7.8b}, [x8] // weights_hor 1297 sub x2, x2, #4 1298 mov x7, #-4 1299 uxtl v7.8h, v7.8b // weights_hor 13008: 1301 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left 1302 shll v20.8h, v5.8b, #8 // right*256 1303 shll v21.8h, v5.8b, #8 1304 shll v22.8h, v5.8b, #8 1305 shll v23.8h, v5.8b, #8 1306 usubl v3.8h, v3.8b, v5.8b // left-right 1307 usubl v2.8h, v2.8b, v5.8b 1308 usubl v1.8h, v1.8b, v5.8b 1309 usubl v0.8h, v0.8b, v5.8b 1310 mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor 1311 mla v21.8h, v2.8h, v7.8h // (left flipped) 1312 mla v22.8h, v1.8h, v7.8h 1313 mla v23.8h, v0.8h, v7.8h 1314 rshrn v20.8b, v20.8h, #8 1315 rshrn v21.8b, v21.8h, #8 1316 rshrn v22.8b, v22.8h, #8 1317 rshrn v23.8b, v23.8h, #8 1318 st1 {v20.8b}, [x0], x1 1319 st1 {v21.8b}, [x6], x1 1320 subs w4, w4, #4 1321 st1 {v22.8b}, [x0], x1 1322 st1 {v23.8b}, [x6], x1 1323 b.gt 8b 1324 ret 1325160: 1326320: 1327640: 1328 AARCH64_VALID_JUMP_TARGET 1329 sub x2, x2, #4 1330 mov x7, #-4 1331 // Set up pointers for four rows in parallel; x0, x6, x5, x10 1332 add x5, x0, x1 1333 add x10, x6, x1 1334 lsl x1, x1, #1 1335 sub x1, x1, w3, uxtw 1336 mov w9, w3 1337 13381: 1339 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left 1340 usubl v0.8h, v0.8b, v5.8b // left-right 1341 usubl v1.8h, v1.8b, v5.8b 1342 usubl v2.8h, v2.8b, v5.8b 1343 usubl v3.8h, v3.8b, v5.8b 13442: 1345 ld1 {v7.16b}, [x8], #16 // weights_hor 1346 shll v20.8h, v5.8b, #8 // right*256 1347 shll v21.8h, v5.8b, #8 1348 shll v22.8h, v5.8b, #8 1349 shll v23.8h, v5.8b, #8 1350 shll v24.8h, v5.8b, #8 1351 shll v25.8h, v5.8b, #8 1352 shll v26.8h, v5.8b, #8 1353 shll v27.8h, v5.8b, #8 1354 uxtl v6.8h, v7.8b // weights_hor 1355 uxtl2 v7.8h, v7.16b 1356 mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor 1357 mla v21.8h, v3.8h, v7.8h // (left flipped) 1358 mla v22.8h, v2.8h, v6.8h 1359 mla v23.8h, v2.8h, v7.8h 1360 mla v24.8h, v1.8h, v6.8h 1361 mla v25.8h, v1.8h, v7.8h 1362 mla v26.8h, v0.8h, v6.8h 1363 mla v27.8h, v0.8h, v7.8h 1364 rshrn v20.8b, v20.8h, #8 1365 rshrn2 v20.16b, v21.8h, #8 1366 rshrn v22.8b, v22.8h, #8 1367 rshrn2 v22.16b, v23.8h, #8 1368 rshrn v24.8b, v24.8h, #8 1369 rshrn2 v24.16b, v25.8h, #8 1370 rshrn v26.8b, v26.8h, #8 1371 rshrn2 v26.16b, v27.8h, #8 1372 subs w3, w3, #16 1373 st1 {v20.16b}, [x0], #16 1374 st1 {v22.16b}, [x6], #16 1375 st1 {v24.16b}, [x5], #16 1376 st1 {v26.16b}, [x10], #16 1377 b.gt 2b 1378 subs w4, w4, #4 1379 b.le 9f 1380 sub x8, x8, w9, uxtw 1381 add x0, x0, x1 1382 add x6, x6, x1 1383 add x5, x5, x1 1384 add x10, x10, x1 1385 mov w3, w9 1386 b 1b 13879: 1388 ret 1389endfunc 1390 1391jumptable ipred_smooth_h_tbl 1392 .word 640b - ipred_smooth_h_tbl 1393 .word 320b - ipred_smooth_h_tbl 1394 .word 160b - ipred_smooth_h_tbl 1395 .word 80b - ipred_smooth_h_tbl 1396 .word 40b - ipred_smooth_h_tbl 1397endjumptable 1398 1399const padding_mask_buf 1400 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1401 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1402 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1403 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1404padding_mask: 1405 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1406 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1407 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1408 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1409endconst 1410 1411// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz, 1412// const pixel *const in, const int end); 1413function ipred_z1_upsample_edge_8bpc_neon, export=1 1414 movrel x4, padding_mask 1415 ld1 {v0.16b}, [x2] // in[] 1416 add x5, x2, w3, uxtw // in[end] 1417 sub x4, x4, w3, uxtw 1418 1419 ld1r {v1.16b}, [x5] // padding 1420 ld1 {v3.16b}, [x4] // padding_mask 1421 1422 movi v31.8h, #9 1423 1424 bit v0.16b, v1.16b, v3.16b // padded in[] 1425 1426 ext v4.16b, v0.16b, v1.16b, #1 1427 ext v5.16b, v0.16b, v1.16b, #2 1428 ext v6.16b, v0.16b, v1.16b, #3 1429 1430 uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2] 1431 uaddl2 v17.8h, v4.16b, v5.16b 1432 uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3] 1433 uaddl2 v19.8h, v0.16b, v6.16b 1434 mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) 1435 mul v17.8h, v17.8h, v31.8h 1436 sub v16.8h, v16.8h, v18.8h 1437 sub v17.8h, v17.8h, v19.8h 1438 1439 sqrshrun v16.8b, v16.8h, #4 1440 sqrshrun2 v16.16b, v17.8h, #4 1441 1442 zip1 v0.16b, v4.16b, v16.16b 1443 zip2 v1.16b, v4.16b, v16.16b 1444 1445 st1 {v0.16b, v1.16b}, [x0] 1446 1447 ret 1448endfunc 1449 1450// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz, 1451// const pixel *const in); 1452function ipred_z2_upsample_edge_8bpc_neon, export=1 1453 // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. 1454 movrel x4, padding_mask 1455 ld1 {v0.16b}, [x2] // in[] 1456 add x5, x2, w1, uxtw // in[sz] 1457 sub x4, x4, w1, uxtw 1458 1459 ld1r {v2.16b}, [x2] // in[0] for padding 1460 ld1r {v1.16b}, [x5] // padding 1461 ld1 {v3.16b}, [x4] // padding_mask 1462 1463 movi v31.8h, #9 1464 1465 bit v0.16b, v1.16b, v3.16b // padded in[] 1466 1467 ext v4.16b, v2.16b, v0.16b, #15 1468 ext v5.16b, v0.16b, v1.16b, #1 1469 ext v6.16b, v0.16b, v1.16b, #2 1470 1471 uaddl v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1] 1472 uaddl v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2] 1473 mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) 1474 sub v16.8h, v16.8h, v18.8h 1475 1476 sqrshrun v16.8b, v16.8h, #4 1477 1478 add x5, x0, #16 1479 1480 zip1 v2.16b, v0.16b, v16.16b 1481 1482 st1 {v1.b}[0], [x5] 1483 // In case sz=8, output one single pixel in out[16]. 1484 st1 {v2.16b}, [x0] 1485 1486 ret 1487endfunc 1488 1489const edge_filter 1490 .byte 0, 4, 8, 0 1491 .byte 0, 5, 6, 0 1492// Leaving out the coeffs for strength=3 1493// .byte 2, 4, 4, 0 1494endconst 1495 1496// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz, 1497// const pixel *const in, const int end, 1498// const int strength); 1499function ipred_z1_filter_edge_8bpc_neon, export=1 1500 cmp w4, #3 1501 b.eq L(fivetap) // if (strength == 3) goto fivetap 1502 1503 movrel x5, edge_filter, -3 1504 add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1 1505 1506 ld1 {v31.h}[0], [x5] // kernel[1-2] 1507 1508 ld1 {v0.16b}, [x2], #16 1509 1510 dup v30.16b, v31.b[0] 1511 dup v31.16b, v31.b[1] 15121: 1513 // in[end], is the last valid pixel. We produce 16 pixels out by 1514 // using 18 pixels in - the last pixel used is [17] of the ones 1515 // read/buffered. 1516 cmp w3, #17 1517 ld1 {v1.16b}, [x2], #16 1518 b.lt 2f 1519 ext v2.16b, v0.16b, v1.16b, #1 1520 ext v3.16b, v0.16b, v1.16b, #2 1521 umull v4.8h, v0.8b, v30.8b 1522 umlal v4.8h, v2.8b, v31.8b 1523 umlal v4.8h, v3.8b, v30.8b 1524 umull2 v5.8h, v0.16b, v30.16b 1525 umlal2 v5.8h, v2.16b, v31.16b 1526 umlal2 v5.8h, v3.16b, v30.16b 1527 subs w1, w1, #16 1528 mov v0.16b, v1.16b 1529 rshrn v4.8b, v4.8h, #4 1530 rshrn2 v4.16b, v5.8h, #4 1531 sub w3, w3, #16 1532 st1 {v4.16b}, [x0], #16 1533 b.gt 1b 1534 ret 15352: 1536 // Right padding 1537 1538 // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead) 1539 movrel x5, padding_mask 1540 sub w6, w3, #32 1541 sub x5, x5, w3, uxtw 1542 add x6, x2, w6, sxtw 1543 1544 ld1 {v2.16b}, [x5] // padding_mask 1545 1546 ld1r {v1.16b}, [x6] 1547 bit v0.16b, v1.16b, v2.16b // Pad v0-v1 1548 1549 // Filter one block 1550 ext v2.16b, v0.16b, v1.16b, #1 1551 ext v3.16b, v0.16b, v1.16b, #2 1552 umull v4.8h, v0.8b, v30.8b 1553 umlal v4.8h, v2.8b, v31.8b 1554 umlal v4.8h, v3.8b, v30.8b 1555 umull2 v5.8h, v0.16b, v30.16b 1556 umlal2 v5.8h, v2.16b, v31.16b 1557 umlal2 v5.8h, v3.16b, v30.16b 1558 subs w1, w1, #16 1559 rshrn v4.8b, v4.8h, #4 1560 rshrn2 v4.16b, v5.8h, #4 1561 st1 {v4.16b}, [x0], #16 1562 b.le 9f 15635: 1564 // After one block, any remaining output would only be filtering 1565 // padding - thus just store the padding. 1566 subs w1, w1, #16 1567 st1 {v1.16b}, [x0], #16 1568 b.gt 5b 15699: 1570 ret 1571 1572L(fivetap): 1573 sub x2, x2, #1 // topleft -= 1 1574 movi v29.16b, #2 1575 ld1 {v0.16b}, [x2], #16 1576 movi v30.16b, #4 1577 movi v31.16b, #4 1578 ins v0.b[0], v0.b[1] 15791: 1580 // in[end+1], is the last valid pixel. We produce 16 pixels out by 1581 // using 20 pixels in - the last pixel used is [19] of the ones 1582 // read/buffered. 1583 cmp w3, #18 1584 ld1 {v1.16b}, [x2], #16 1585 b.lt 2f // if (end + 1 < 19) 1586 ext v2.16b, v0.16b, v1.16b, #1 1587 ext v3.16b, v0.16b, v1.16b, #2 1588 ext v4.16b, v0.16b, v1.16b, #3 1589 ext v5.16b, v0.16b, v1.16b, #4 1590 umull v6.8h, v0.8b, v29.8b 1591 umlal v6.8h, v2.8b, v30.8b 1592 umlal v6.8h, v3.8b, v31.8b 1593 umlal v6.8h, v4.8b, v30.8b 1594 umlal v6.8h, v5.8b, v29.8b 1595 umull2 v7.8h, v0.16b, v29.16b 1596 umlal2 v7.8h, v2.16b, v30.16b 1597 umlal2 v7.8h, v3.16b, v31.16b 1598 umlal2 v7.8h, v4.16b, v30.16b 1599 umlal2 v7.8h, v5.16b, v29.16b 1600 subs w1, w1, #16 1601 mov v0.16b, v1.16b 1602 rshrn v6.8b, v6.8h, #4 1603 rshrn2 v6.16b, v7.8h, #4 1604 sub w3, w3, #16 1605 st1 {v6.16b}, [x0], #16 1606 b.gt 1b 1607 ret 16082: 1609 // Right padding 1610 1611 // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead) 1612 movrel x5, padding_mask, -1 1613 sub w6, w3, #31 1614 sub x5, x5, w3, uxtw 1615 add x6, x2, w6, sxtw 1616 1617 ld1 {v2.16b, v3.16b}, [x5] // padding_mask 1618 1619 ld1r {v28.16b}, [x6] 1620 bit v0.16b, v28.16b, v2.16b // Pad v0-v1 1621 bit v1.16b, v28.16b, v3.16b 16224: 1623 // Filter one block 1624 ext v2.16b, v0.16b, v1.16b, #1 1625 ext v3.16b, v0.16b, v1.16b, #2 1626 ext v4.16b, v0.16b, v1.16b, #3 1627 ext v5.16b, v0.16b, v1.16b, #4 1628 umull v6.8h, v0.8b, v29.8b 1629 umlal v6.8h, v2.8b, v30.8b 1630 umlal v6.8h, v3.8b, v31.8b 1631 umlal v6.8h, v4.8b, v30.8b 1632 umlal v6.8h, v5.8b, v29.8b 1633 umull2 v7.8h, v0.16b, v29.16b 1634 umlal2 v7.8h, v2.16b, v30.16b 1635 umlal2 v7.8h, v3.16b, v31.16b 1636 umlal2 v7.8h, v4.16b, v30.16b 1637 umlal2 v7.8h, v5.16b, v29.16b 1638 subs w1, w1, #16 1639 mov v0.16b, v1.16b 1640 mov v1.16b, v28.16b 1641 rshrn v6.8b, v6.8h, #4 1642 rshrn2 v6.16b, v7.8h, #4 1643 sub w3, w3, #16 1644 st1 {v6.16b}, [x0], #16 1645 b.le 9f 1646 // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to 1647 // filter properly once more - aka (w3 >= 0). 1648 cmp w3, #0 1649 b.ge 4b 16505: 1651 // When w3 <= 0, all remaining pixels in v0-v1 are equal to the 1652 // last valid pixel - thus just output that without filtering. 1653 subs w1, w1, #16 1654 st1 {v1.16b}, [x0], #16 1655 b.gt 5b 16569: 1657 ret 1658endfunc 1659 1660// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px, 1661// const int n); 1662function ipred_pixel_set_8bpc_neon, export=1 1663 dup v0.16b, w1 16641: 1665 subs w2, w2, #16 1666 st1 {v0.16b}, [x0], #16 1667 b.gt 1b 1668 ret 1669endfunc 1670 1671// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, 1672// const pixel *const top, 1673// const int width, const int height, 1674// const int dx, const int max_base_x); 1675function ipred_z1_fill1_8bpc_neon, export=1 1676 clz w9, w3 1677 movrel x8, ipred_z1_fill1_tbl 1678 sub w9, w9, #25 1679 ldrsw x9, [x8, w9, uxtw #2] 1680 add x10, x2, w6, uxtw // top[max_base_x] 1681 add x8, x8, x9 1682 ld1r {v31.16b}, [x10] // padding 1683 mov w7, w5 1684 mov w15, #64 1685 br x8 168640: 1687 AARCH64_VALID_JUMP_TARGET 16884: 1689 lsr w8, w7, #6 // base 1690 and w9, w7, #0x3e // frac 1691 add w7, w7, w5 // xpos += dx 1692 cmp w8, w6 // base >= max_base_x 1693 lsr w10, w7, #6 // base 1694 and w11, w7, #0x3e // frac 1695 b.ge 49f 1696 ldr d0, [x2, w8, uxtw] // top[base] 1697 ldr d2, [x2, w10, uxtw] 1698 dup v4.4h, w9 // frac 1699 dup v5.4h, w11 1700 ext v1.8b, v0.8b, v0.8b, #1 // top[base+1] 1701 ext v3.8b, v2.8b, v2.8b, #1 1702 usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] 1703 usubl v7.8h, v3.8b, v2.8b 1704 ushll v16.8h, v0.8b, #6 // top[base]*64 1705 ushll v17.8h, v2.8b, #6 1706 mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac 1707 mla v17.4h, v7.4h, v5.4h 1708 rshrn v16.8b, v16.8h, #6 1709 rshrn v17.8b, v17.8h, #6 1710 st1 {v16.s}[0], [x0], x1 1711 add w7, w7, w5 // xpos += dx 1712 subs w4, w4, #2 1713 st1 {v17.s}[0], [x0], x1 1714 b.gt 4b 1715 ret 1716 171749: 1718 st1 {v31.s}[0], [x0], x1 1719 subs w4, w4, #2 1720 st1 {v31.s}[0], [x0], x1 1721 b.gt 49b 1722 ret 1723 172480: 1725 AARCH64_VALID_JUMP_TARGET 17268: 1727 lsr w8, w7, #6 // base 1728 and w9, w7, #0x3e // frac 1729 add w7, w7, w5 // xpos += dx 1730 cmp w8, w6 // base >= max_base_x 1731 lsr w10, w7, #6 // base 1732 and w11, w7, #0x3e // frac 1733 b.ge 89f 1734 ldr q0, [x2, w8, uxtw] // top[base] 1735 ldr q2, [x2, w10, uxtw] 1736 dup v4.8b, w9 // frac 1737 dup v5.8b, w11 1738 sub w9, w15, w9 // 64 - frac 1739 sub w11, w15, w11 1740 dup v6.8b, w9 // 64 - frac 1741 dup v7.8b, w11 1742 ext v1.16b, v0.16b, v0.16b, #1 // top[base+1] 1743 ext v3.16b, v2.16b, v2.16b, #1 1744 umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac) 1745 umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac 1746 umull v17.8h, v2.8b, v7.8b 1747 umlal v17.8h, v3.8b, v5.8b 1748 rshrn v16.8b, v16.8h, #6 1749 rshrn v17.8b, v17.8h, #6 1750 st1 {v16.8b}, [x0], x1 1751 add w7, w7, w5 // xpos += dx 1752 subs w4, w4, #2 1753 st1 {v17.8b}, [x0], x1 1754 b.gt 8b 1755 ret 1756 175789: 1758 st1 {v31.8b}, [x0], x1 1759 subs w4, w4, #2 1760 st1 {v31.8b}, [x0], x1 1761 b.gt 89b 1762 ret 1763 1764160: 1765320: 1766640: 1767 AARCH64_VALID_JUMP_TARGET 1768 1769 mov w12, w3 1770 1771 add x13, x0, x1 1772 lsl x1, x1, #1 1773 sub x1, x1, w3, uxtw 17741: 1775 lsr w8, w7, #6 // base 1776 and w9, w7, #0x3e // frac 1777 add w7, w7, w5 // xpos += dx 1778 cmp w8, w6 // base >= max_base_x 1779 lsr w10, w7, #6 // base 1780 and w11, w7, #0x3e // frac 1781 b.ge 169f 1782 add x8, x2, w8, uxtw 1783 add x10, x2, w10, uxtw 1784 dup v4.16b, w9 // frac 1785 dup v5.16b, w11 1786 ld1 {v0.16b, v1.16b}, [x8], #32 // top[base] 1787 ld1 {v2.16b, v3.16b}, [x10], #32 1788 sub w9, w15, w9 // 64 - frac 1789 sub w11, w15, w11 1790 dup v6.16b, w9 // 64 - frac 1791 dup v7.16b, w11 1792 add w7, w7, w5 // xpos += dx 17932: 1794 ext v16.16b, v0.16b, v1.16b, #1 // top[base+1] 1795 ext v17.16b, v2.16b, v3.16b, #1 1796 subs w3, w3, #16 1797 umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac) 1798 umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac 1799 umull2 v19.8h, v0.16b, v6.16b 1800 umlal2 v19.8h, v16.16b, v4.16b 1801 umull v20.8h, v2.8b, v7.8b 1802 umlal v20.8h, v17.8b, v5.8b 1803 umull2 v21.8h, v2.16b, v7.16b 1804 umlal2 v21.8h, v17.16b, v5.16b 1805 rshrn v16.8b, v18.8h, #6 1806 rshrn2 v16.16b, v19.8h, #6 1807 rshrn v17.8b, v20.8h, #6 1808 rshrn2 v17.16b, v21.8h, #6 1809 st1 {v16.16b}, [x0], #16 1810 st1 {v17.16b}, [x13], #16 1811 b.le 3f 1812 mov v0.16b, v1.16b 1813 ld1 {v1.16b}, [x8], #16 // top[base] 1814 mov v2.16b, v3.16b 1815 ld1 {v3.16b}, [x10], #16 1816 b 2b 1817 18183: 1819 subs w4, w4, #2 1820 b.le 9f 1821 add x0, x0, x1 1822 add x13, x13, x1 1823 mov w3, w12 1824 b 1b 18259: 1826 ret 1827 1828169: 1829 st1 {v31.16b}, [x0], #16 1830 subs w3, w3, #16 1831 st1 {v31.16b}, [x13], #16 1832 b.gt 169b 1833 subs w4, w4, #2 1834 b.le 9b 1835 add x0, x0, x1 1836 add x13, x13, x1 1837 mov w3, w12 1838 b 169b 1839endfunc 1840 1841jumptable ipred_z1_fill1_tbl 1842 .word 640b - ipred_z1_fill1_tbl 1843 .word 320b - ipred_z1_fill1_tbl 1844 .word 160b - ipred_z1_fill1_tbl 1845 .word 80b - ipred_z1_fill1_tbl 1846 .word 40b - ipred_z1_fill1_tbl 1847endjumptable 1848 1849function ipred_z1_fill2_8bpc_neon, export=1 1850 cmp w3, #8 1851 add x10, x2, w6, uxtw // top[max_base_x] 1852 ld1r {v31.16b}, [x10] // padding 1853 mov w7, w5 1854 mov w15, #64 1855 b.eq 8f 1856 18574: // w == 4 1858 lsr w8, w7, #6 // base 1859 and w9, w7, #0x3e // frac 1860 add w7, w7, w5 // xpos += dx 1861 cmp w8, w6 // base >= max_base_x 1862 lsr w10, w7, #6 // base 1863 and w11, w7, #0x3e // frac 1864 b.ge 49f 1865 ldr d0, [x2, w8, uxtw] // top[base] 1866 ldr d2, [x2, w10, uxtw] 1867 dup v4.4h, w9 // frac 1868 dup v5.4h, w11 1869 uzp2 v1.8b, v0.8b, v0.8b // top[base+1] 1870 uzp1 v0.8b, v0.8b, v0.8b // top[base] 1871 uzp2 v3.8b, v2.8b, v2.8b 1872 uzp1 v2.8b, v2.8b, v2.8b 1873 usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] 1874 usubl v7.8h, v3.8b, v2.8b 1875 ushll v16.8h, v0.8b, #6 // top[base]*64 1876 ushll v17.8h, v2.8b, #6 1877 mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac 1878 mla v17.4h, v7.4h, v5.4h 1879 rshrn v16.8b, v16.8h, #6 1880 rshrn v17.8b, v17.8h, #6 1881 st1 {v16.s}[0], [x0], x1 1882 add w7, w7, w5 // xpos += dx 1883 subs w4, w4, #2 1884 st1 {v17.s}[0], [x0], x1 1885 b.gt 4b 1886 ret 1887 188849: 1889 st1 {v31.s}[0], [x0], x1 1890 subs w4, w4, #2 1891 st1 {v31.s}[0], [x0], x1 1892 b.gt 49b 1893 ret 1894 18958: // w == 8 1896 lsr w8, w7, #6 // base 1897 and w9, w7, #0x3e // frac 1898 add w7, w7, w5 // xpos += dx 1899 cmp w8, w6 // base >= max_base_x 1900 lsr w10, w7, #6 // base 1901 and w11, w7, #0x3e // frac 1902 b.ge 89f 1903 ldr q0, [x2, w8, uxtw] // top[base] 1904 ldr q2, [x2, w10, uxtw] 1905 dup v4.8b, w9 // frac 1906 dup v5.8b, w11 1907 sub w9, w15, w9 // 64 - frac 1908 sub w11, w15, w11 1909 dup v6.8b, w9 // 64 - frac 1910 dup v7.8b, w11 1911 uzp2 v1.16b, v0.16b, v0.16b // top[base+1] 1912 uzp1 v0.16b, v0.16b, v0.16b // top[base] 1913 uzp2 v3.16b, v2.16b, v2.16b 1914 uzp1 v2.16b, v2.16b, v2.16b 1915 umull v16.8h, v1.8b, v4.8b // top[base+1]*frac 1916 umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac) 1917 umull v17.8h, v3.8b, v5.8b 1918 umlal v17.8h, v2.8b, v7.8b 1919 rshrn v16.8b, v16.8h, #6 1920 rshrn v17.8b, v17.8h, #6 1921 st1 {v16.8b}, [x0], x1 1922 add w7, w7, w5 // xpos += dx 1923 subs w4, w4, #2 1924 st1 {v17.8b}, [x0], x1 1925 b.gt 8b 1926 ret 1927 192889: 1929 st1 {v31.8b}, [x0], x1 1930 subs w4, w4, #2 1931 st1 {v31.8b}, [x0], x1 1932 b.gt 89b 1933 ret 1934endfunc 1935 1936// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src, 1937// const int n); 1938function ipred_reverse_8bpc_neon, export=1 1939 sub x1, x1, #16 1940 add x3, x0, #8 1941 mov x4, #16 19421: 1943 ld1 {v0.16b}, [x1] 1944 subs w2, w2, #16 1945 rev64 v0.16b, v0.16b 1946 sub x1, x1, #16 1947 st1 {v0.d}[1], [x0], x4 1948 st1 {v0.d}[0], [x3], x4 1949 b.gt 1b 1950 ret 1951endfunc 1952 1953const increments 1954 .short 0, 1, 2, 3, 4, 5, 6, 7 1955 .short 8, 9, 10, 11, 12, 13, 14, 15 1956endconst 1957 1958// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, 1959// const pixel *const top, 1960// const pixel *const left, 1961// const int width, const int height, 1962// const int dx, const int dy); 1963function ipred_z2_fill1_8bpc_neon, export=1 1964 clz w10, w4 1965 movrel x9, ipred_z2_fill1_tbl 1966 sub w10, w10, #25 1967 ldrsw x10, [x9, w10, uxtw #2] 1968 mov w8, #(1 << 6) // xpos = 1 << 6 1969 add x9, x9, x10 1970 sub w8, w8, w6 // xpos -= dx 1971 1972 movrel x11, increments 1973 ld1 {v31.8h}, [x11] // increments 1974 neg w7, w7 // -dy 1975 1976 br x9 197740: 1978 AARCH64_VALID_JUMP_TARGET 1979 1980 dup v30.4h, w7 // -dy 1981 movi v17.8b, #1 1982 1983 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy 1984 movi v25.16b, #0x3e 1985 add v30.4h, v16.4h, v30.4h // -= dy 1986 1987 xtn v31.8b, v31.8h // {0,1,2,3} 1988 1989 // Worst case height for w=4 is 16, but we need at least h+1 elements 1990 ld1 {v0.16b, v1.16b}, [x3] // left[] 1991 1992 movi v26.16b, #64 1993 movi v19.16b, #2 1994 1995 xtn v27.8b, v30.8h // (uint8_t)ypos 1996 shrn v29.8b, v30.8h, #6 // ypos >> 6 1997 and v27.8b, v27.8b, v25.8b // frac_y 1998 1999 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2000 2001 add v30.8b, v29.8b, v17.8b // base_y + 1 2002 add v28.8b, v29.8b, v19.8b // base_y + 2 2003 2004 tbl v16.8b, {v0.16b}, v29.8b // left[base_y] 2005 2006 trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 2007 2008 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2009 2010 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} 2011 2012 trn1 v27.2s, v27.2s, v27.2s // frac_y 2013 trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y 2014 2015 movi v29.8b, #2 20164: 2017 asr w9, w8, #6 // base_x 2018 dup v6.4h, w8 // xpos 2019 sub w8, w8, w6 // xpos -= dx 2020 cmp w9, #-4 // base_x <= -4 2021 asr w11, w8, #6 // base_x 2022 b.le 49f 2023 2024 dup v7.4h, w8 // xpos 2025 2026 ldr d2, [x2, w9, sxtw] // top[base_x] 2027 ldr d4, [x2, w11, sxtw] 2028 2029 trn1 v6.2d, v6.2d, v7.2d // xpos 2030 2031 // Cut corners here; only doing tbl over v0 here; we only 2032 // seem to need the last pixel, from v1, after skipping to the 2033 // left-only codepath below. 2034 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] 2035 2036 shrn v20.8b, v6.8h, #6 // first base_x for each row 2037 xtn v6.8b, v6.8h // (uint8_t)xpos 2038 2039 ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] 2040 ext v5.8b, v4.8b, v4.8b, #1 2041 2042 and v6.8b, v6.8b, v25.8b // frac_x 2043 2044 trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] 2045 2046 trn1 v2.2s, v2.2s, v4.2s // top[base_x] 2047 trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] 2048 2049 sub v7.8b, v26.8b, v6.8b // 64 - frac_x 2050 2051 add v20.8b, v20.8b, v31.8b // actual base_x 2052 2053 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) 2054 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 2055 2056 umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) 2057 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x 2058 2059 cmge v20.8b, v20.8b, #0 2060 2061 rshrn v16.8b, v16.8h, #6 2062 rshrn v22.8b, v22.8h, #6 2063 2064 bit v16.8b, v22.8b, v20.8b 2065 2066 st1 {v16.s}[0], [x0], x1 2067 sub w8, w8, w6 // xpos -= dx 2068 subs w5, w5, #2 2069 st1 {v16.s}[1], [x0], x1 2070 b.le 9f 2071 2072 ext v16.8b, v17.8b, v17.8b, #4 2073 add v30.8b, v30.8b, v29.8b // base_y += 2 2074 b 4b 2075 207649: 2077 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2] 2078 2079 trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] 2080 2081 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) 2082 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 2083 rshrn v18.8b, v18.8h, #6 2084 2085 st1 {v18.s}[0], [x0], x1 2086 subs w5, w5, #2 2087 st1 {v18.s}[1], [x0], x1 2088 b.le 9f 2089 2090 ext v16.8b, v17.8b, v17.8b, #4 2091 add v30.8b, v30.8b, v29.8b // base_y += 2 2092 b 49b 2093 20949: 2095 ret 2096 209780: 2098 AARCH64_VALID_JUMP_TARGET 2099 2100 dup v30.8h, w7 // -dy 2101 movi v17.8b, #1 2102 2103 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy 2104 movi v25.16b, #0x3e 2105 add v30.8h, v16.8h, v30.8h // -= dy 2106 2107 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 2108 2109 // Worst case height for w=8 is 32, but we need at least h+1 elements 2110 ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] 2111 2112 movi v26.16b, #64 2113 movi v19.16b, #2 2114 2115 xtn v27.8b, v30.8h // (uint8_t)ypos 2116 shrn v29.8b, v30.8h, #6 // ypos >> 6 2117 and v27.8b, v27.8b, v25.8b // frac_y 2118 2119 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2120 2121 // Cut corners here; for the first row we don't expect to need to 2122 // read outside of v0. 2123 tbl v18.8b, {v0.16b}, v29.8b // left[base_y] 2124 2125 add v30.8b, v29.8b, v19.8b // base_y + 2 2126 add v29.8b, v29.8b, v17.8b // base_y + 1 2127 2128 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2129 2130 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} 2131 2132 movi v24.8b, #2 // 2 21338: 2134 asr w9, w8, #6 // base_x 2135 dup v16.8h, w8 // xpos 2136 sub w8, w8, w6 // xpos -= dx 2137 cmp w9, #-8 // base_x <= -8 2138 asr w11, w8, #6 // base_x 2139 b.le 89f 2140 2141 dup v17.8h, w8 // xpos 2142 2143 ldr q4, [x2, w9, sxtw] // top[base_x] 2144 ldr q6, [x2, w11, sxtw] 2145 2146 // Cut corners here; only doing tbl over v0-v1 here; we only 2147 // seem to need the last pixel, from v2, after skipping to the 2148 // left-only codepath below. 2149 tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1] 2150 2151 shrn v21.8b, v16.8h, #6 // first base_x 2152 shrn2 v21.16b, v17.8h, #6 2153 xtn v16.8b, v16.8h // (uint8_t)xpos 2154 xtn2 v16.16b, v17.8h 2155 2156 tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2] 2157 2158 ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] 2159 ext v7.16b, v6.16b, v6.16b, #1 2160 2161 and v16.16b, v16.16b, v25.16b // frac_x 2162 2163 trn1 v4.2d, v4.2d, v6.2d // top[base_x] 2164 trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] 2165 2166 sub v7.16b, v26.16b, v16.16b // 64 - frac_x 2167 2168 add v21.16b, v21.16b, v31.16b // actual base_x 2169 2170 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2171 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2172 umull v17.8h, v19.8b, v28.8b 2173 umlal v17.8h, v20.8b, v27.8b 2174 2175 umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) 2176 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 2177 umull2 v23.8h, v4.16b, v7.16b 2178 umlal2 v23.8h, v5.16b, v16.16b 2179 2180 cmge v21.16b, v21.16b, #0 2181 2182 rshrn v6.8b, v6.8h, #6 2183 rshrn2 v6.16b, v17.8h, #6 2184 rshrn v22.8b, v22.8h, #6 2185 rshrn2 v22.16b, v23.8h, #6 2186 2187 bit v6.16b, v22.16b, v21.16b 2188 2189 st1 {v6.d}[0], [x0], x1 2190 sub w8, w8, w6 // xpos -= dx 2191 subs w5, w5, #2 2192 st1 {v6.d}[1], [x0], x1 2193 b.le 9f 2194 2195 mov v18.8b, v20.8b 2196 add v29.8b, v29.8b, v24.8b // base_y += 2 2197 add v30.8b, v30.8b, v24.8b // base_y += 2 2198 b 8b 2199 220089: 2201 tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1] 2202 tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2] 2203 2204 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2205 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2206 umull v17.8h, v19.8b, v28.8b 2207 umlal v17.8h, v20.8b, v27.8b 2208 2209 rshrn v6.8b, v6.8h, #6 2210 rshrn2 v6.16b, v17.8h, #6 2211 2212 st1 {v6.d}[0], [x0], x1 2213 subs w5, w5, #2 2214 st1 {v6.d}[1], [x0], x1 2215 b.le 9f 2216 2217 mov v18.8b, v20.8b 2218 add v29.8b, v29.8b, v24.8b // base_y += 2 2219 add v30.8b, v30.8b, v24.8b // base_y += 2 2220 b 89b 2221 22229: 2223 ret 2224 2225160: 2226 AARCH64_VALID_JUMP_TARGET 2227 2228 stp d8, d9, [sp, #-0x40]! 2229 stp d10, d11, [sp, #0x10] 2230 stp d12, d13, [sp, #0x20] 2231 stp d14, d15, [sp, #0x30] 2232 2233 add x11, x11, #16 // increments 2234 2235 dup v18.8h, w7 // -dy 2236 movi v17.16b, #1 2237 add x3, x3, #1 // Skip past left[0] 2238 2239 ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} 2240 2241 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy 2242 mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy 2243 movi v25.16b, #0x3e 2244 add v16.8h, v16.8h, v18.8h // -= dy 2245 add v18.8h, v19.8h, v18.8h 2246 2247 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 2248 xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} 2249 2250 // Worst case height is 64. 2251 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] 2252 ld1r {v15.16b}, [x2] // left[0] == top[0] 2253 2254 movi v26.16b, #64 2255 movi v19.16b, #2 2256 2257 xtn v27.8b, v16.8h // (uint8_t)ypos 2258 xtn2 v27.16b, v18.8h 2259 shrn v29.8b, v16.8h, #6 // ypos >> 6 2260 shrn2 v29.16b, v18.8h, #6 2261 mov v18.16b, v15.16b // left[0] 2262 and v27.16b, v27.16b, v25.16b // frac_y 2263 2264 // Cut corners here; for the first row we don't expect to need to 2265 // read outside of v0. 2266 tbx v18.16b, {v0.16b}, v29.16b // left[base_y] 2267 2268 add v30.16b, v29.16b, v19.16b // base_y + 2 2269 add v29.16b, v29.16b, v17.16b // base_y + 1 2270 2271 sub v28.16b, v26.16b, v27.16b // 64 - frac_y 2272 2273 movi v24.16b, #2 // 2 227416: 2275 asr w9, w8, #6 // base_x 2276 dup v16.8h, w8 // xpos 2277 sub w8, w8, w6 // xpos -= dx 2278 cmp w9, #-16 // base_x <= -16 2279 asr w11, w8, #6 // base_x 2280 b.le 169f 2281 2282 dup v17.8h, w8 // xpos 2283 2284 add x9, x2, w9, sxtw 2285 add x11, x2, w11, sxtw 2286 2287 ld1 {v4.16b, v5.16b}, [x9] // top[base_x] 2288 mov v19.16b, v15.16b // left[0] 2289 ld1 {v6.16b, v7.16b}, [x11] 2290 2291 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2292 2293 mov v20.16b, v15.16b // left[0] 2294 2295 shrn v21.8b, v16.8h, #6 // first base_x 2296 shrn v22.8b, v17.8h, #6 2297 xtn v16.8b, v16.8h // (uint8_t)xpos 2298 xtn v17.8b, v17.8h 2299 2300 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2301 2302 trn1 v21.2d, v21.2d, v21.2d // first base_x 2303 trn1 v22.2d, v22.2d, v22.2d 2304 trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos 2305 trn1 v17.2d, v17.2d, v17.2d 2306 2307 ext v5.16b, v4.16b, v5.16b, #1 // top[base_x+1] 2308 ext v7.16b, v6.16b, v7.16b, #1 2309 2310 and v16.16b, v16.16b, v25.16b // frac_x 2311 and v17.16b, v17.16b, v25.16b 2312 2313 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2314 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2315 2316 sub v8.16b, v26.16b, v16.16b // 64 - frac_x 2317 sub v9.16b, v26.16b, v17.16b 2318 2319 umull2 v11.8h, v18.16b, v28.16b 2320 umlal2 v11.8h, v19.16b, v27.16b 2321 2322 add v21.16b, v21.16b, v31.16b // actual base_x 2323 add v22.16b, v22.16b, v31.16b 2324 2325 umull v12.8h, v19.8b, v28.8b 2326 umlal v12.8h, v20.8b, v27.8b 2327 umull2 v13.8h, v19.16b, v28.16b 2328 umlal2 v13.8h, v20.16b, v27.16b 2329 2330 rshrn v10.8b, v10.8h, #6 2331 rshrn2 v10.16b, v11.8h, #6 2332 rshrn v11.8b, v12.8h, #6 2333 rshrn2 v11.16b, v13.8h, #6 2334 2335 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) 2336 umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 2337 umull2 v13.8h, v4.16b, v8.16b 2338 umlal2 v13.8h, v5.16b, v16.16b 2339 umull v14.8h, v6.8b, v9.8b 2340 umlal v14.8h, v7.8b, v17.8b 2341 umull2 v18.8h, v6.16b, v9.16b 2342 umlal2 v18.8h, v7.16b, v17.16b 2343 2344 cmge v21.16b, v21.16b, #0 2345 cmge v22.16b, v22.16b, #0 2346 2347 rshrn v12.8b, v12.8h, #6 2348 rshrn2 v12.16b, v13.8h, #6 2349 rshrn v13.8b, v14.8h, #6 2350 rshrn2 v13.16b, v18.8h, #6 2351 2352 bit v10.16b, v12.16b, v21.16b 2353 bit v11.16b, v13.16b, v22.16b 2354 2355 st1 {v10.16b}, [x0], x1 2356 subs w5, w5, #2 2357 sub w8, w8, w6 // xpos -= dx 2358 st1 {v11.16b}, [x0], x1 2359 b.le 9f 2360 2361 mov v18.16b, v20.16b 2362 add v29.16b, v29.16b, v24.16b // base_y += 2 2363 add v30.16b, v30.16b, v24.16b // base_y += 2 2364 b 16b 2365 2366169: 2367 mov v19.16b, v15.16b 2368 mov v20.16b, v15.16b 2369 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2370 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2371 2372 umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2373 umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2374 umull2 v5.8h, v18.16b, v28.16b 2375 umlal2 v5.8h, v19.16b, v27.16b 2376 umull v6.8h, v19.8b, v28.8b 2377 umlal v6.8h, v20.8b, v27.8b 2378 umull2 v7.8h, v19.16b, v28.16b 2379 umlal2 v7.8h, v20.16b, v27.16b 2380 2381 rshrn v4.8b, v4.8h, #6 2382 rshrn2 v4.16b, v5.8h, #6 2383 rshrn v5.8b, v6.8h, #6 2384 rshrn2 v5.16b, v7.8h, #6 2385 2386 st1 {v4.16b}, [x0], x1 2387 subs w5, w5, #2 2388 st1 {v5.16b}, [x0], x1 2389 b.le 9f 2390 2391 mov v18.16b, v20.16b 2392 add v29.16b, v29.16b, v24.16b // base_y += 2 2393 add v30.16b, v30.16b, v24.16b // base_y += 2 2394 b 169b 2395 23969: 2397 ldp d14, d15, [sp, #0x30] 2398 ldp d12, d13, [sp, #0x20] 2399 ldp d10, d11, [sp, #0x10] 2400 ldp d8, d9, [sp], 0x40 2401 ret 2402 2403320: 2404640: 2405 AARCH64_VALID_JUMP_TARGET 2406 2407 stp d8, d9, [sp, #-0x40]! 2408 stp d10, d11, [sp, #0x10] 2409 stp d12, d13, [sp, #0x20] 2410 stp d14, d15, [sp, #0x30] 2411 2412 add x11, x11, #16 // increments 2413 2414 dup v25.8h, w7 // -dy 2415 add x3, x3, #1 // Skip past left[0] 2416 2417 ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} 2418 2419 add x13, x0, x1 // alternating row 2420 lsl x1, x1, #1 // stride *= 2 2421 sub x1, x1, w4, uxtw // stride -= width 2422 2423 movi v11.8h, #8 2424 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy 2425 add v26.8h, v26.8h, v25.8h // -= dy 2426 mul v25.8h, v25.8h, v11.8h // -8*dy 2427 2428 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 2429 xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} 2430 2431 // Worst case height is 64. 2432 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] 2433 ld1r {v15.16b}, [x2] // left[0] == top[0] 2434 2435 mov w12, w4 // orig w 2436 neg w14, w4 // -w 2437 24381: 2439 mov v23.16b, v26.16b // reset ypos 2440 2441 asr w9, w8, #6 // base_x 2442 dup v16.8h, w8 // xpos 2443 sub w8, w8, w6 // xpos -= dx 2444 cmp w9, w14 // base_x <= -w 2445 asr w11, w8, #6 // base_x 2446 b.le 329f 2447 2448 dup v17.8h, w8 // xpos 2449 sub w8, w8, w6 // xpos -= dx 2450 2451 add x9, x2, w9, sxtw 2452 add x11, x2, w11, sxtw 2453 2454 sqshrn v21.8b, v16.8h, #6 // first base_x 2455 sqshrn v22.8b, v17.8h, #6 2456 xtn v16.8b, v16.8h // (uint8_t)xpos 2457 xtn v17.8b, v17.8h 2458 2459 ld1 {v4.16b}, [x9], #16 // top[base_x] 2460 ld1 {v6.16b}, [x11], #16 2461 2462 trn1 v21.2d, v21.2d, v21.2d // first base_x 2463 trn1 v22.2d, v22.2d, v22.2d 2464 trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos 2465 trn1 v17.2d, v17.2d, v17.2d 2466 2467 movi v10.16b, #0x3e 2468 movi v11.16b, #64 2469 2470 and v16.16b, v16.16b, v10.16b // frac_x 2471 and v17.16b, v17.16b, v10.16b 2472 2473 sub v8.16b, v11.16b, v16.16b // 64 - frac_x 2474 sub v9.16b, v11.16b, v17.16b 2475 2476 add v21.16b, v21.16b, v31.16b // actual base_x 2477 add v22.16b, v22.16b, v31.16b 2478 24792: 2480 add v13.8h, v23.8h, v25.8h // ypos -= 8*dy 2481 movi v12.16b, #64 2482 movi v20.16b, #2 2483 movi v10.16b, #0x3e 2484 2485 smov w10, v22.b[0] 2486 2487 xtn v27.8b, v23.8h // (uint8_t)ypos 2488 xtn2 v27.16b, v13.8h 2489 shrn v29.8b, v23.8h, #6 // ypos >> 6 2490 shrn2 v29.16b, v13.8h, #6 2491 cmp w10, #0 // base_x (bottom left) >= 0 2492 and v27.16b, v27.16b, v10.16b // frac_y 2493 2494 mov v18.16b, v15.16b // left[0] 2495 2496 b.ge 4f 2497 2498 add v23.8h, v13.8h, v25.8h // ypos -= 8*dy 2499 movi v13.16b, #1 2500 2501 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2502 add v29.16b, v29.16b, v13.16b // base_y + 1 2503 mov v19.16b, v15.16b // left[0] 2504 2505 sub v28.16b, v12.16b, v27.16b // 64 - frac_y 2506 2507 ld1 {v5.16b}, [x9], #16 // top[base_x] 2508 ld1 {v7.16b}, [x11], #16 2509 2510 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2511 add v29.16b, v29.16b, v13.16b // base_y + 2 2512 2513 mov v20.16b, v15.16b // left[0] 2514 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2515 2516 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2517 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2518 umull2 v11.8h, v18.16b, v28.16b 2519 umlal2 v11.8h, v19.16b, v27.16b 2520 umull v12.8h, v19.8b, v28.8b 2521 umlal v12.8h, v20.8b, v27.8b 2522 umull2 v13.8h, v19.16b, v28.16b 2523 umlal2 v13.8h, v20.16b, v27.16b 2524 2525 ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] 2526 ext v19.16b, v6.16b, v7.16b, #1 2527 2528 rshrn v10.8b, v10.8h, #6 2529 rshrn2 v10.16b, v11.8h, #6 2530 rshrn v11.8b, v12.8h, #6 2531 rshrn2 v11.16b, v13.8h, #6 2532 2533 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) 2534 umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x 2535 umull2 v13.8h, v4.16b, v8.16b 2536 umlal2 v13.8h, v18.16b, v16.16b 2537 umull v14.8h, v6.8b, v9.8b 2538 umlal v14.8h, v19.8b, v17.8b 2539 umull2 v20.8h, v6.16b, v9.16b 2540 umlal2 v20.8h, v19.16b, v17.16b 2541 2542 cmge v18.16b, v21.16b, #0 2543 cmge v19.16b, v22.16b, #0 2544 2545 rshrn v12.8b, v12.8h, #6 2546 rshrn2 v12.16b, v13.8h, #6 2547 rshrn v13.8b, v14.8h, #6 2548 rshrn2 v13.16b, v20.8h, #6 2549 2550 bit v10.16b, v12.16b, v18.16b 2551 bit v11.16b, v13.16b, v19.16b 2552 2553 st1 {v10.16b}, [x0], #16 2554 subs w4, w4, #16 2555 st1 {v11.16b}, [x13], #16 2556 b.le 3f 2557 2558 movi v10.16b, #16 2559 mov v4.16b, v5.16b 2560 mov v6.16b, v7.16b 2561 add v21.16b, v21.16b, v10.16b // base_x += 16 2562 add v22.16b, v22.16b, v10.16b 2563 b 2b 2564 25653: 2566 subs w5, w5, #2 2567 b.le 9f 2568 movi v10.8h, #128 2569 add x0, x0, x1 2570 add x13, x13, x1 2571 mov w4, w12 // reset w 2572 add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) 2573 b 1b 2574 25754: // The rest of the row only predicted from top[] 2576 ld1 {v5.16b}, [x9], #16 // top[base_x] 2577 ld1 {v7.16b}, [x11], #16 2578 2579 ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] 2580 ext v19.16b, v6.16b, v7.16b, #1 2581 2582 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) 2583 umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x 2584 umull2 v13.8h, v4.16b, v8.16b 2585 umlal2 v13.8h, v18.16b, v16.16b 2586 umull v14.8h, v6.8b, v9.8b 2587 umlal v14.8h, v19.8b, v17.8b 2588 umull2 v20.8h, v6.16b, v9.16b 2589 umlal2 v20.8h, v19.16b, v17.16b 2590 2591 rshrn v12.8b, v12.8h, #6 2592 rshrn2 v12.16b, v13.8h, #6 2593 rshrn v13.8b, v14.8h, #6 2594 rshrn2 v13.16b, v20.8h, #6 2595 2596 st1 {v12.16b}, [x0], #16 2597 subs w4, w4, #16 2598 st1 {v13.16b}, [x13], #16 2599 b.le 3b 2600 2601 mov v4.16b, v5.16b 2602 mov v6.16b, v7.16b 2603 b 4b 2604 2605329: // The rest of the block only predicted from left[] 2606 add x1, x1, w4, uxtw // restore stride 2607 mov w12, w5 // orig remaining h 26081: 2609 add v13.8h, v23.8h, v25.8h // ypos -= 8*dy 2610 movi v12.16b, #64 2611 movi v10.16b, #0x3e 2612 2613 xtn v27.8b, v23.8h // (uint8_t)ypos 2614 xtn2 v27.16b, v13.8h 2615 shrn v29.8b, v23.8h, #6 // ypos >> 6 2616 shrn2 v29.16b, v13.8h, #6 2617 and v27.16b, v27.16b, v10.16b // frac_y 2618 2619 mov v18.16b, v15.16b // left[0] 2620 add v23.8h, v13.8h, v25.8h // ypos -= 8*dy 2621 movi v21.16b, #1 2622 2623 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2624 add v29.16b, v29.16b, v21.16b // base_y + 1 2625 2626 sub v28.16b, v12.16b, v27.16b // 64 - frac_y 26272: 2628 mov v19.16b, v15.16b // left[0] 2629 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2630 add v29.16b, v29.16b, v21.16b // base_y + 2 2631 mov v20.16b, v15.16b // left[0] 2632 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2633 add v29.16b, v29.16b, v21.16b // next base_y 2634 2635 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2636 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2637 umull2 v11.8h, v18.16b, v28.16b 2638 umlal2 v11.8h, v19.16b, v27.16b 2639 umull v12.8h, v19.8b, v28.8b 2640 umlal v12.8h, v20.8b, v27.8b 2641 umull2 v13.8h, v19.16b, v28.16b 2642 umlal2 v13.8h, v20.16b, v27.16b 2643 2644 rshrn v10.8b, v10.8h, #6 2645 rshrn2 v10.16b, v11.8h, #6 2646 rshrn v11.8b, v12.8h, #6 2647 rshrn2 v11.16b, v13.8h, #6 2648 2649 st1 {v10.16b}, [x0], x1 2650 subs w5, w5, #2 2651 st1 {v11.16b}, [x13], x1 2652 b.le 3f 2653 mov v18.16b, v20.16b 2654 b 2b 2655 26563: 2657 subs w4, w4, #16 2658 b.le 9f 2659 2660 lsr x1, x1, #1 2661 msub x0, x1, x12, x0 // ptr -= h * stride 2662 msub x13, x1, x12, x13 2663 lsl x1, x1, #1 2664 add x0, x0, #16 2665 add x13, x13, #16 2666 mov w5, w12 // reset h 2667 b 1b 2668 26699: 2670 ldp d14, d15, [sp, #0x30] 2671 ldp d12, d13, [sp, #0x20] 2672 ldp d10, d11, [sp, #0x10] 2673 ldp d8, d9, [sp], 0x40 2674 ret 2675endfunc 2676 2677jumptable ipred_z2_fill1_tbl 2678 .word 640b - ipred_z2_fill1_tbl 2679 .word 320b - ipred_z2_fill1_tbl 2680 .word 160b - ipred_z2_fill1_tbl 2681 .word 80b - ipred_z2_fill1_tbl 2682 .word 40b - ipred_z2_fill1_tbl 2683endjumptable 2684 2685function ipred_z2_fill2_8bpc_neon, export=1 2686 cmp w4, #8 2687 mov w8, #(2 << 6) // xpos = 2 << 6 2688 sub w8, w8, w6 // xpos -= dx 2689 2690 movrel x11, increments 2691 ld1 {v31.8h}, [x11] // increments 2692 neg w7, w7 // -dy 2693 b.eq 80f 2694 269540: 2696 dup v30.4h, w7 // -dy 2697 movi v17.8b, #1 2698 2699 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy 2700 movi v25.16b, #0x3e 2701 add v30.4h, v16.4h, v30.4h // -= dy 2702 2703 xtn v31.8b, v31.8h // {0,1,2,3} 2704 2705 // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements 2706 // from left. 2707 ld1 {v0.16b}, [x3] // left[] 2708 2709 movi v26.16b, #64 2710 movi v19.16b, #2 2711 2712 xtn v27.8b, v30.8h // (uint8_t)ypos 2713 shrn v29.8b, v30.8h, #6 // ypos >> 6 2714 and v27.8b, v27.8b, v25.8b // frac_y 2715 2716 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2717 2718 add v30.8b, v29.8b, v17.8b // base_y + 1 2719 add v28.8b, v29.8b, v19.8b // base_y + 2 2720 2721 tbl v16.8b, {v0.16b}, v29.8b // left[base_y] 2722 2723 trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 2724 2725 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2726 2727 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} 2728 2729 trn1 v27.2s, v27.2s, v27.2s // frac_y 2730 trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y 2731 2732 movi v29.8b, #2 2733 add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6} 27344: 2735 asr w9, w8, #6 // base_x 2736 dup v6.4h, w8 // xpos 2737 sub w8, w8, w6 // xpos -= dx 2738 cmp w9, #-8 // base_x <= -8 2739 asr w11, w8, #6 // base_x 2740 b.le 49f 2741 2742 dup v7.4h, w8 // xpos 2743 2744 ldr d2, [x2, w9, sxtw] // top[base_x] 2745 ldr d4, [x2, w11, sxtw] 2746 2747 trn1 v6.2d, v6.2d, v7.2d // xpos 2748 2749 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] 2750 2751 shrn v20.8b, v6.8h, #6 // first base_x for each row 2752 xtn v6.8b, v6.8h // (uint8_t)xpos 2753 2754 uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1] 2755 uzp1 v2.8b, v2.8b, v4.8b // top[base_x] 2756 2757 and v6.8b, v6.8b, v25.8b // frac_x 2758 2759 trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] 2760 2761 sub v7.8b, v26.8b, v6.8b // 64 - frac_x 2762 2763 add v20.8b, v20.8b, v31.8b // actual base_x 2764 2765 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) 2766 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 2767 2768 umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) 2769 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x 2770 2771 cmge v20.8b, v20.8b, #0 2772 2773 rshrn v16.8b, v16.8h, #6 2774 rshrn v22.8b, v22.8h, #6 2775 2776 bit v16.8b, v22.8b, v20.8b 2777 2778 st1 {v16.s}[0], [x0], x1 2779 sub w8, w8, w6 // xpos -= dx 2780 subs w5, w5, #2 2781 st1 {v16.s}[1], [x0], x1 2782 b.le 9f 2783 2784 ext v16.8b, v17.8b, v17.8b, #4 2785 add v30.8b, v30.8b, v29.8b // base_y += 2 2786 b 4b 2787 278849: 2789 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] 2790 2791 trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] 2792 2793 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) 2794 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 2795 rshrn v18.8b, v18.8h, #6 2796 2797 st1 {v18.s}[0], [x0], x1 2798 subs w5, w5, #2 2799 st1 {v18.s}[1], [x0], x1 2800 b.le 9f 2801 2802 ext v16.8b, v17.8b, v17.8b, #4 2803 add v30.8b, v30.8b, v29.8b // base_y += 2 2804 b 49b 2805 28069: 2807 ret 2808 280980: 2810 dup v30.8h, w7 // -dy 2811 movi v17.8b, #1 2812 2813 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy 2814 movi v25.16b, #0x3e 2815 add v30.8h, v16.8h, v30.8h // -= dy 2816 2817 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 2818 2819 // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements 2820 // from left. 2821 ld1 {v0.16b}, [x3] // left[] 2822 2823 movi v26.16b, #64 2824 movi v19.16b, #2 2825 2826 xtn v27.8b, v30.8h // (uint8_t)ypos 2827 shrn v29.8b, v30.8h, #6 // ypos >> 6 2828 and v27.8b, v27.8b, v25.8b // frac_y 2829 2830 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2831 2832 tbl v18.8b, {v0.16b}, v29.8b // left[base_y] 2833 2834 add v30.8b, v29.8b, v19.8b // base_y + 2 2835 add v29.8b, v29.8b, v17.8b // base_y + 1 2836 2837 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2838 2839 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} 2840 2841 movi v24.8b, #2 // 2 2842 add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} 28438: 2844 asr w9, w8, #6 // base_x 2845 dup v16.8h, w8 // xpos 2846 sub w8, w8, w6 // xpos -= dx 2847 cmp w9, #-16 // base_x <= -16 2848 asr w11, w8, #6 // base_x 2849 b.le 89f 2850 2851 dup v17.8h, w8 // xpos 2852 2853 ldr q4, [x2, w9, sxtw] // top[base_x] 2854 ldr q6, [x2, w11, sxtw] 2855 2856 tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] 2857 2858 shrn v21.8b, v16.8h, #6 // first base_x 2859 shrn2 v21.16b, v17.8h, #6 2860 xtn v16.8b, v16.8h // (uint8_t)xpos 2861 xtn2 v16.16b, v17.8h 2862 2863 tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] 2864 2865 uzp2 v5.16b, v4.16b, v6.16b // top[base_x+1] 2866 uzp1 v4.16b, v4.16b, v6.16b // top[base_x] 2867 2868 and v16.16b, v16.16b, v25.16b // frac_x 2869 2870 sub v7.16b, v26.16b, v16.16b // 64 - frac_x 2871 2872 add v21.16b, v21.16b, v31.16b // actual base_x 2873 2874 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2875 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2876 umull v17.8h, v19.8b, v28.8b 2877 umlal v17.8h, v20.8b, v27.8b 2878 2879 umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) 2880 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 2881 umull2 v23.8h, v4.16b, v7.16b 2882 umlal2 v23.8h, v5.16b, v16.16b 2883 2884 cmge v21.16b, v21.16b, #0 2885 2886 rshrn v6.8b, v6.8h, #6 2887 rshrn2 v6.16b, v17.8h, #6 2888 rshrn v22.8b, v22.8h, #6 2889 rshrn2 v22.16b, v23.8h, #6 2890 2891 bit v6.16b, v22.16b, v21.16b 2892 2893 st1 {v6.d}[0], [x0], x1 2894 sub w8, w8, w6 // xpos -= dx 2895 subs w5, w5, #2 2896 st1 {v6.d}[1], [x0], x1 2897 b.le 9f 2898 2899 mov v18.8b, v20.8b 2900 add v29.8b, v29.8b, v24.8b // base_y += 2 2901 add v30.8b, v30.8b, v24.8b // base_y += 2 2902 b 8b 2903 290489: 2905 tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] 2906 tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] 2907 2908 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 2909 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 2910 umull v17.8h, v19.8b, v28.8b 2911 umlal v17.8h, v20.8b, v27.8b 2912 2913 rshrn v6.8b, v6.8h, #6 2914 rshrn2 v6.16b, v17.8h, #6 2915 2916 st1 {v6.d}[0], [x0], x1 2917 subs w5, w5, #2 2918 st1 {v6.d}[1], [x0], x1 2919 b.le 9f 2920 2921 mov v18.8b, v20.8b 2922 add v29.8b, v29.8b, v24.8b // base_y += 2 2923 add v30.8b, v30.8b, v24.8b // base_y += 2 2924 b 89b 2925 29269: 2927 ret 2928endfunc 2929 2930function ipred_z2_fill3_8bpc_neon, export=1 2931 cmp w4, #8 2932 mov w8, #(1 << 6) // xpos = 1 << 6 2933 sub w8, w8, w6 // xpos -= dx 2934 2935 movrel x11, increments 2936 ld1 {v31.8h}, [x11] // increments 2937 neg w7, w7 // -dy 2938 b.eq 80f 2939 294040: 2941 dup v30.4h, w7 // -dy 2942 movi v17.8b, #1 2943 2944 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy 2945 movi v25.16b, #0x3e 2946 add v30.4h, v16.4h, v30.4h // -= dy 2947 2948 xtn v31.8b, v31.8h // {0,1,2,3} 2949 2950 // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 2951 ld1 {v0.16b, v1.16b}, [x3] // left[] 2952 2953 movi v26.16b, #64 2954 movi v19.16b, #2 2955 2956 xtn v27.8b, v30.8h // (uint8_t)ypos 2957 shrn v29.8b, v30.8h, #6 // ypos >> 6 2958 and v27.8b, v27.8b, v25.8b // frac_y 2959 2960 add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 2961 2962 add v30.8b, v29.8b, v17.8b // base_y + 1 2963 add v28.8b, v29.8b, v19.8b // base_y + 2 2964 2965 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} 2966 2967 add v24.8b, v30.8b, v19.8b // base_y + 3 2968 2969 trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2 2970 trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3 2971 2972 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 2973 2974 trn1 v27.2s, v27.2s, v27.2s // frac_y 2975 trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y 2976 2977 movi v24.8b, #4 29784: 2979 asr w9, w8, #6 // base_x 2980 dup v6.4h, w8 // xpos 2981 sub w8, w8, w6 // xpos -= dx 2982 cmp w9, #-4 // base_x <= -4 2983 asr w11, w8, #6 // base_x 2984 b.le 49f 2985 2986 dup v7.4h, w8 // xpos 2987 2988 ldr d2, [x2, w9, sxtw] // top[base_x] 2989 ldr d4, [x2, w11, sxtw] 2990 2991 trn1 v6.2d, v6.2d, v7.2d // xpos 2992 2993 tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] 2994 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] 2995 2996 shrn v20.8b, v6.8h, #6 // first base_x for each row 2997 xtn v6.8b, v6.8h // (uint8_t)xpos 2998 2999 ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] 3000 ext v5.8b, v4.8b, v4.8b, #1 3001 3002 and v6.8b, v6.8b, v25.8b // frac_x 3003 3004 trn1 v2.2s, v2.2s, v4.2s // top[base_x] 3005 trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] 3006 3007 sub v7.8b, v26.8b, v6.8b // 64 - frac_x 3008 3009 add v20.8b, v20.8b, v31.8b // actual base_x 3010 3011 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) 3012 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 3013 3014 umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) 3015 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x 3016 3017 cmge v20.8b, v20.8b, #0 3018 3019 rshrn v16.8b, v16.8h, #6 3020 rshrn v22.8b, v22.8h, #6 3021 3022 bit v16.8b, v22.8b, v20.8b 3023 3024 st1 {v16.s}[0], [x0], x1 3025 sub w8, w8, w6 // xpos -= dx 3026 subs w5, w5, #2 3027 st1 {v16.s}[1], [x0], x1 3028 b.le 9f 3029 3030 add v29.8b, v29.8b, v24.8b // base_y += 4 3031 add v30.8b, v30.8b, v24.8b // base_y += 4 3032 b 4b 3033 303449: 3035 tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] 3036 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] 3037 3038 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) 3039 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y 3040 rshrn v18.8b, v18.8h, #6 3041 3042 st1 {v18.s}[0], [x0], x1 3043 subs w5, w5, #2 3044 st1 {v18.s}[1], [x0], x1 3045 b.le 9f 3046 3047 add v29.8b, v29.8b, v24.8b // base_y += 4 3048 add v30.8b, v30.8b, v24.8b // base_y += 4 3049 b 49b 3050 30519: 3052 ret 3053 305480: 3055 dup v30.8h, w7 // -dy 3056 movi v17.8b, #1 3057 3058 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy 3059 movi v25.16b, #0x3e 3060 add v30.8h, v16.8h, v30.8h // -= dy 3061 3062 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} 3063 3064 // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 3065 ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] 3066 3067 movi v26.16b, #64 3068 movi v19.16b, #2 3069 3070 xtn v27.8b, v30.8h // (uint8_t)ypos 3071 shrn v29.8b, v30.8h, #6 // ypos >> 6 3072 and v27.8b, v27.8b, v25.8b // frac_y 3073 3074 add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 3075 3076 add v28.8b, v29.8b, v17.8b // base_y + 1 3077 add v30.8b, v29.8b, v19.8b // base_y + 2 3078 3079 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} 3080 add v24.8b, v28.8b, v19.8b // base_y + 3 3081 3082 trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2 3083 trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3 3084 3085 sub v28.8b, v26.8b, v27.8b // 64 - frac_y 3086 3087 movi v24.16b, #4 3088 3089 trn1 v27.2d, v27.2d, v27.2d // frac_y 3090 trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y 30918: 3092 asr w9, w8, #6 // base_x 3093 dup v16.8h, w8 // xpos 3094 sub w8, w8, w6 // xpos -= dx 3095 cmp w9, #-8 // base_x <= -8 3096 asr w11, w8, #6 // base_x 3097 b.le 89f 3098 3099 dup v17.8h, w8 // xpos 3100 3101 ldr q4, [x2, w9, sxtw] // top[base_x] 3102 ldr q6, [x2, w11, sxtw] 3103 3104 tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] 3105 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] 3106 3107 shrn v21.8b, v16.8h, #6 // first base_x 3108 shrn2 v21.16b, v17.8h, #6 3109 xtn v16.8b, v16.8h // (uint8_t)xpos 3110 xtn2 v16.16b, v17.8h 3111 3112 ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] 3113 ext v7.16b, v6.16b, v6.16b, #1 3114 3115 and v16.16b, v16.16b, v25.16b // frac_x 3116 3117 trn1 v4.2d, v4.2d, v6.2d // top[base_x] 3118 trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] 3119 3120 sub v7.16b, v26.16b, v16.16b // 64 - frac_x 3121 3122 add v21.16b, v21.16b, v31.16b // actual base_x 3123 3124 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 3125 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 3126 umull2 v17.8h, v18.16b, v28.16b 3127 umlal2 v17.8h, v19.16b, v27.16b 3128 3129 umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) 3130 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x 3131 umull2 v23.8h, v4.16b, v7.16b 3132 umlal2 v23.8h, v5.16b, v16.16b 3133 3134 cmge v21.16b, v21.16b, #0 3135 3136 rshrn v6.8b, v6.8h, #6 3137 rshrn2 v6.16b, v17.8h, #6 3138 rshrn v22.8b, v22.8h, #6 3139 rshrn2 v22.16b, v23.8h, #6 3140 3141 bit v6.16b, v22.16b, v21.16b 3142 3143 st1 {v6.d}[0], [x0], x1 3144 sub w8, w8, w6 // xpos -= dx 3145 subs w5, w5, #2 3146 st1 {v6.d}[1], [x0], x1 3147 b.le 9f 3148 3149 add v29.16b, v29.16b, v24.16b // base_y += 4 3150 add v30.16b, v30.16b, v24.16b // base_y += 4 3151 b 8b 3152 315389: 3154 tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] 3155 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] 3156 3157 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) 3158 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y 3159 umull2 v17.8h, v18.16b, v28.16b 3160 umlal2 v17.8h, v19.16b, v27.16b 3161 3162 rshrn v6.8b, v6.8h, #6 3163 rshrn2 v6.16b, v17.8h, #6 3164 3165 st1 {v6.d}[0], [x0], x1 3166 subs w5, w5, #2 3167 st1 {v6.d}[1], [x0], x1 3168 b.le 9f 3169 3170 add v29.16b, v29.16b, v24.16b // base_y += 4 3171 add v30.16b, v30.16b, v24.16b // base_y += 4 3172 b 89b 3173 31749: 3175 ret 3176endfunc 3177 3178 3179// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, 3180// const pixel *const left, 3181// const int width, const int height, 3182// const int dy, const int max_base_y); 3183function ipred_z3_fill1_8bpc_neon, export=1 3184 cmp w6, #64 3185 clz w9, w3 3186 movrel x8, ipred_z3_fill1_tbl 3187 sub w9, w9, #25 3188 ldrsw x9, [x8, w9, uxtw #2] 3189 add x10, x2, w6, uxtw // left[max_base_y] 3190 add x8, x8, x9 3191 movrel x11, increments 3192 ld1r {v31.16b}, [x10] // padding 3193 ld1 {v30.8h}, [x11] // increments 3194 mov w7, w5 3195 b.gt L(ipred_z3_fill1_large_h16) 3196 br x8 3197 319840: 3199 AARCH64_VALID_JUMP_TARGET 3200 dup v29.4h, w5 // dy 3201 3202 mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy 3203 movi v23.16b, #0x3e 3204 3205 // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 3206 ld1 {v0.16b, v1.16b}, [x2] // left[] 3207 add v30.4h, v29.4h, v30.4h // ypos 3208 3209 movi v22.16b, #64 3210 movi v20.16b, #1 3211 movi v21.16b, #2 3212 3213 xtn v24.8b, v30.8h // (uint8_t)ypos 3214 uqshrn v26.8b, v30.8h, #6 // base 3215 and v24.8b, v24.8b, v23.8b // frac 3216 3217 mov v4.8b, v31.8b 3218 uqadd v27.8b, v26.8b, v20.8b // base + 1 3219 uqadd v28.8b, v26.8b, v21.8b // base + 2 3220 sub v25.8b, v22.8b, v24.8b // 64 - frac 3221 3222 tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] 3223 3224 trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2 3225 trn1 v24.2s, v24.2s, v24.2s // frac 3226 trn1 v25.2s, v25.2s, v25.2s // 64 - frac 32271: 3228 mov v5.8b, v31.8b 3229 tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] 3230 3231 trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1] 3232 3233 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3234 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3235 rshrn v16.8b, v16.8h, #6 3236 st1 {v16.s}[0], [x0], x1 3237 subs w4, w4, #2 3238 st1 {v16.s}[1], [x0], x1 3239 b.le 9f 3240 3241 ext v4.8b, v5.8b, v5.8b, #4 3242 uqadd v27.8b, v27.8b, v21.8b // base += 2 3243 b 1b 3244 32459: 3246 ret 3247 324880: 3249 AARCH64_VALID_JUMP_TARGET 3250 dup v29.8h, w5 // dy 3251 3252 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy 3253 movi v23.16b, #0x3e 3254 3255 // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 3256 ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] 3257 add v30.8h, v29.8h, v30.8h // ypos 3258 3259 movi v22.16b, #64 3260 movi v20.16b, #1 3261 movi v21.16b, #2 3262 3263 xtn v24.8b, v30.8h // (uint8_t)ypos 3264 uqshrn v26.8b, v30.8h, #6 // base 3265 and v24.8b, v24.8b, v23.8b // frac 3266 3267 mov v4.8b, v31.8b 3268 uqadd v27.8b, v26.8b, v20.8b // base + 1 3269 uqadd v28.8b, v26.8b, v21.8b // base + 2 3270 sub v25.8b, v22.8b, v24.8b // 64 - frac 3271 3272 tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] 32731: 3274 mov v5.8b, v31.8b 3275 mov v6.8b, v31.8b 3276 tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] 3277 tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] 3278 3279 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3280 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3281 umull v17.8h, v5.8b, v25.8b 3282 umlal v17.8h, v6.8b, v24.8b 3283 rshrn v16.8b, v16.8h, #6 3284 rshrn v17.8b, v17.8h, #6 3285 st1 {v16.8b}, [x0], x1 3286 subs w4, w4, #2 3287 st1 {v17.8b}, [x0], x1 3288 b.le 9f 3289 3290 mov v4.8b, v6.8b 3291 uqadd v27.8b, v27.8b, v21.8b // base += 2 3292 uqadd v28.8b, v28.8b, v21.8b // base += 2 3293 b 1b 3294 32959: 3296 ret 3297 3298160: 3299 AARCH64_VALID_JUMP_TARGET 3300 dup v28.8h, w5 // dy 3301 3302 shl v29.8h, v28.8h, #3 // 8*dy 3303 mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy 3304 movi v23.16b, #0x3e 3305 3306 // This is only executed if we've checked that max_base_y <= 64. 3307 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] 3308 add v28.8h, v28.8h, v30.8h // ypos 3309 3310 movi v22.16b, #64 3311 movi v20.16b, #1 3312 movi v21.16b, #2 3313 3314 add v29.8h, v28.8h, v29.8h // ypos + 8*dy 3315 3316 xtn v24.8b, v28.8h // (uint8_t)ypos 3317 xtn2 v24.16b, v29.8h 3318 uqshrn v26.8b, v28.8h, #6 // base 3319 uqshrn2 v26.16b, v29.8h, #6 3320 and v24.16b, v24.16b, v23.16b // frac 3321 3322 mov v4.16b, v31.16b 3323 uqadd v27.16b, v26.16b, v20.16b // base + 1 3324 uqadd v28.16b, v26.16b, v21.16b // base + 2 3325 sub v25.16b, v22.16b, v24.16b // 64 - frac 3326 3327 tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] 33281: 3329 mov v5.16b, v31.16b 3330 mov v6.16b, v31.16b 3331 tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] 3332 tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] 3333 3334 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3335 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3336 umull2 v17.8h, v4.16b, v25.16b 3337 umlal2 v17.8h, v5.16b, v24.16b 3338 umull v18.8h, v5.8b, v25.8b 3339 umlal v18.8h, v6.8b, v24.8b 3340 umull2 v19.8h, v5.16b, v25.16b 3341 umlal2 v19.8h, v6.16b, v24.16b 3342 rshrn v16.8b, v16.8h, #6 3343 rshrn2 v16.16b, v17.8h, #6 3344 rshrn v17.8b, v18.8h, #6 3345 rshrn2 v17.16b, v19.8h, #6 3346 st1 {v16.16b}, [x0], x1 3347 subs w4, w4, #2 3348 st1 {v17.16b}, [x0], x1 3349 b.le 9f 3350 3351 mov v4.16b, v6.16b 3352 uqadd v27.16b, v27.16b, v21.16b // base += 2 3353 uqadd v28.16b, v28.16b, v21.16b // base += 2 3354 b 1b 3355 33569: 3357 ret 3358320: 3359640: 3360 AARCH64_VALID_JUMP_TARGET 3361 dup v28.8h, w5 // dy 3362 mov w12, w3 3363 3364 add x13, x0, x1 3365 3366 shl v29.8h, v28.8h, #3 // 8*dy 3367 mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy 3368 movi v23.16b, #0x3e 3369 3370 lsl x1, x1, #1 3371 sub x1, x1, w3, uxtw 3372 add v30.8h, v28.8h, v30.8h // ypos 3373 3374 // This is only executed if we've checked that max_base_y <= 64. 3375 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] 3376 3377 movi v22.16b, #64 3378 movi v20.16b, #1 3379 movi v21.16b, #2 3380 33811: 3382 mov v26.16b, v30.16b // reset ypos 3383 33842: 3385 add v27.8h, v26.8h, v29.8h // ypos + 8*dy 3386 uqshrn v16.8b, v26.8h, #6 // base 3387 uqshrn2 v16.16b, v27.8h, #6 3388 xtn v24.8b, v26.8h // (uint8_t)ypos 3389 xtn2 v24.16b, v27.8h 3390 umov w14, v16.b[0] 3391 and v24.16b, v24.16b, v23.16b // frac 3392 3393 uqadd v17.16b, v16.16b, v20.16b // base + 1 3394 cmp w14, w6 // base >= max_base_y 3395 uqadd v18.16b, v16.16b, v21.16b // base + 2 3396 sub v25.16b, v22.16b, v24.16b // 64 - frac 3397 3398 b.ge 4f 3399 3400 mov v4.16b, v31.16b 3401 mov v5.16b, v31.16b 3402 mov v6.16b, v31.16b 3403 tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] 3404 tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] 3405 tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] 3406 3407 subs w3, w3, #16 3408 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3409 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3410 umull2 v17.8h, v4.16b, v25.16b 3411 umlal2 v17.8h, v5.16b, v24.16b 3412 umull v18.8h, v5.8b, v25.8b 3413 umlal v18.8h, v6.8b, v24.8b 3414 umull2 v19.8h, v5.16b, v25.16b 3415 umlal2 v19.8h, v6.16b, v24.16b 3416 rshrn v16.8b, v16.8h, #6 3417 rshrn2 v16.16b, v17.8h, #6 3418 rshrn v17.8b, v18.8h, #6 3419 rshrn2 v17.16b, v19.8h, #6 3420 st1 {v16.16b}, [x0], #16 3421 st1 {v17.16b}, [x13], #16 3422 b.le 3f 3423 add v26.8h, v27.8h, v29.8h // ypos += 16*dy 3424 b 2b 3425 34263: 3427 subs w4, w4, #2 3428 b.le 9f 3429 movi v16.8h, #128 3430 add x0, x0, x1 3431 add x13, x13, x1 3432 add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2 3433 mov w3, w12 3434 b 1b 3435 34364: 3437 subs w3, w3, #16 3438 st1 {v31.16b}, [x0], #16 3439 st1 {v31.16b}, [x13], #16 3440 b.gt 4b 3441 b 3b 3442 34439: 3444 ret 3445 3446L(ipred_z3_fill1_large_h16): 3447 // Fallback case for max_base_y > 64; similar to the z1 3448 // implementation. This does the filtering vertically, filling out 3449 // a 2x pixel column at a time. 3450 mov w15, #64 3451 add x13, x0, x1 3452 lsl x1, x1, #1 3453 3454 mov w12, w4 34551: 3456 lsr w8, w7, #6 // base 3457 and w9, w7, #0x3e // frac 3458 add w7, w7, w5 // ypos += dy 3459 cmp w8, w6 // base >= max_base_y 3460 lsr w10, w7, #6 // base 3461 and w11, w7, #0x3e // frac 3462 b.ge ipred_z3_fill_padding_neon 3463 add x8, x2, w8, uxtw 3464 add x10, x2, w10, uxtw 3465 dup v4.16b, w9 // frac 3466 dup v5.16b, w11 3467 ld1 {v0.16b, v1.16b}, [x8], #32 // left[base] 3468 ld1 {v2.16b, v3.16b}, [x10], #32 3469 sub w9, w15, w9 // 64 - frac 3470 sub w11, w15, w11 3471 dup v6.16b, w9 // 64 - frac 3472 dup v7.16b, w11 3473 add w7, w7, w5 // ypos += dy 34742: 3475 ext v16.16b, v0.16b, v1.16b, #1 // left[base+1] 3476 ext v17.16b, v2.16b, v3.16b, #1 3477 subs w4, w4, #16 3478 umull v18.8h, v16.8b, v4.8b // left[base+1]*frac 3479 umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac) 3480 umull2 v19.8h, v16.16b, v4.16b 3481 umlal2 v19.8h, v0.16b, v6.16b 3482 umull v20.8h, v17.8b, v5.8b 3483 umlal v20.8h, v2.8b, v7.8b 3484 umull2 v21.8h, v17.16b, v5.16b 3485 umlal2 v21.8h, v2.16b, v7.16b 3486 rshrn v16.8b, v18.8h, #6 3487 rshrn2 v16.16b, v19.8h, #6 3488 rshrn v17.8b, v20.8h, #6 3489 rshrn2 v17.16b, v21.8h, #6 3490 zip1 v18.16b, v16.16b, v17.16b 3491 zip2 v19.16b, v16.16b, v17.16b 3492 st1 {v18.h}[0], [x0], x1 3493 st1 {v18.h}[1], [x13], x1 3494 st1 {v18.h}[2], [x0], x1 3495 st1 {v18.h}[3], [x13], x1 3496 st1 {v18.h}[4], [x0], x1 3497 st1 {v18.h}[5], [x13], x1 3498 st1 {v18.h}[6], [x0], x1 3499 st1 {v18.h}[7], [x13], x1 3500 st1 {v19.h}[0], [x0], x1 3501 st1 {v19.h}[1], [x13], x1 3502 st1 {v19.h}[2], [x0], x1 3503 st1 {v19.h}[3], [x13], x1 3504 st1 {v19.h}[4], [x0], x1 3505 st1 {v19.h}[5], [x13], x1 3506 st1 {v19.h}[6], [x0], x1 3507 st1 {v19.h}[7], [x13], x1 3508 b.le 3f 3509 mov v0.16b, v1.16b 3510 ld1 {v1.16b}, [x8], #16 // left[base] 3511 mov v2.16b, v3.16b 3512 ld1 {v3.16b}, [x10], #16 3513 b 2b 3514 35153: 3516 subs w3, w3, #2 3517 b.le 9f 3518 lsr x1, x1, #1 3519 msub x0, x1, x12, x0 // ptr -= h * stride 3520 msub x13, x1, x12, x13 3521 lsl x1, x1, #1 3522 add x0, x0, #2 3523 add x13, x13, #2 3524 mov w4, w12 3525 b 1b 35269: 3527 ret 3528endfunc 3529 3530jumptable ipred_z3_fill1_tbl 3531 .word 640b - ipred_z3_fill1_tbl 3532 .word 320b - ipred_z3_fill1_tbl 3533 .word 160b - ipred_z3_fill1_tbl 3534 .word 80b - ipred_z3_fill1_tbl 3535 .word 40b - ipred_z3_fill1_tbl 3536endjumptable 3537 3538function ipred_z3_fill_padding_neon, export=0 3539 cmp w3, #16 3540 movrel x8, ipred_z3_fill_padding_tbl 3541 b.gt ipred_z3_fill_padding_wide 3542 // w3 = remaining width, w4 = constant height 3543 mov w12, w4 3544 35451: 3546 // Fill a WxH rectangle with padding. W can be any number; 3547 // this fills the exact width by filling in the largest 3548 // power of two in the remaining width, and repeating. 3549 clz w9, w3 3550 sub w9, w9, #25 3551 ldrsw x9, [x8, w9, uxtw #2] 3552 add x9, x8, x9 3553 br x9 3554 355520: 3556 AARCH64_VALID_JUMP_TARGET 35572: 3558 st1 {v31.h}[0], [x0], x1 3559 subs w4, w4, #4 3560 st1 {v31.h}[0], [x13], x1 3561 st1 {v31.h}[0], [x0], x1 3562 st1 {v31.h}[0], [x13], x1 3563 b.gt 2b 3564 subs w3, w3, #2 3565 lsr x1, x1, #1 3566 msub x0, x1, x12, x0 // ptr -= h * stride 3567 msub x13, x1, x12, x13 3568 b.le 9f 3569 lsl x1, x1, #1 3570 add x0, x0, #2 3571 add x13, x13, #2 3572 mov w4, w12 3573 b 1b 3574 357540: 3576 AARCH64_VALID_JUMP_TARGET 35774: 3578 st1 {v31.s}[0], [x0], x1 3579 subs w4, w4, #4 3580 st1 {v31.s}[0], [x13], x1 3581 st1 {v31.s}[0], [x0], x1 3582 st1 {v31.s}[0], [x13], x1 3583 b.gt 4b 3584 subs w3, w3, #4 3585 lsr x1, x1, #1 3586 msub x0, x1, x12, x0 // ptr -= h * stride 3587 msub x13, x1, x12, x13 3588 b.le 9f 3589 lsl x1, x1, #1 3590 add x0, x0, #4 3591 add x13, x13, #4 3592 mov w4, w12 3593 b 1b 3594 359580: 3596 AARCH64_VALID_JUMP_TARGET 35978: 3598 st1 {v31.8b}, [x0], x1 3599 subs w4, w4, #4 3600 st1 {v31.8b}, [x13], x1 3601 st1 {v31.8b}, [x0], x1 3602 st1 {v31.8b}, [x13], x1 3603 b.gt 8b 3604 subs w3, w3, #8 3605 lsr x1, x1, #1 3606 msub x0, x1, x12, x0 // ptr -= h * stride 3607 msub x13, x1, x12, x13 3608 b.le 9f 3609 lsl x1, x1, #1 3610 add x0, x0, #8 3611 add x13, x13, #8 3612 mov w4, w12 3613 b 1b 3614 3615160: 3616320: 3617640: 3618 AARCH64_VALID_JUMP_TARGET 361916: 3620 st1 {v31.16b}, [x0], x1 3621 subs w4, w4, #4 3622 st1 {v31.16b}, [x13], x1 3623 st1 {v31.16b}, [x0], x1 3624 st1 {v31.16b}, [x13], x1 3625 b.gt 16b 3626 subs w3, w3, #16 3627 lsr x1, x1, #1 3628 msub x0, x1, x12, x0 // ptr -= h * stride 3629 msub x13, x1, x12, x13 3630 b.le 9f 3631 lsl x1, x1, #1 3632 add x0, x0, #16 3633 add x13, x13, #16 3634 mov w4, w12 3635 b 1b 3636 36379: 3638 ret 3639endfunc 3640 3641jumptable ipred_z3_fill_padding_tbl 3642 .word 640b - ipred_z3_fill_padding_tbl 3643 .word 320b - ipred_z3_fill_padding_tbl 3644 .word 160b - ipred_z3_fill_padding_tbl 3645 .word 80b - ipred_z3_fill_padding_tbl 3646 .word 40b - ipred_z3_fill_padding_tbl 3647 .word 20b - ipred_z3_fill_padding_tbl 3648endjumptable 3649 3650function ipred_z3_fill_padding_wide 3651 // Fill a WxH rectangle with padding, with W > 16. 3652 lsr x1, x1, #1 3653 mov w12, w3 3654 sub x1, x1, w3, uxtw 36551: 3656 ands w5, w3, #15 3657 b.eq 2f 3658 // If the width isn't aligned to 16, first do one 16 byte write 3659 // and align the start pointer. 3660 sub w3, w3, w5 3661 st1 {v31.16b}, [x0] 3662 add x0, x0, w5, uxtw 36632: 3664 // Fill the rest of the line with aligned 16 byte writes. 3665 subs w3, w3, #16 3666 st1 {v31.16b}, [x0], #16 3667 b.gt 2b 3668 subs w4, w4, #1 3669 add x0, x0, x1 3670 b.le 9f 3671 mov w3, w12 3672 b 1b 36739: 3674 ret 3675endfunc 3676 3677function ipred_z3_fill2_8bpc_neon, export=1 3678 cmp w3, #8 3679 add x10, x2, w6, uxtw // left[max_base_y] 3680 movrel x11, increments 3681 ld1r {v31.16b}, [x10] // padding 3682 ld1 {v30.8h}, [x11] // increments 3683 b.eq 80f 3684 368540: // w == 4 3686 dup v29.4h, w5 // dy 3687 3688 mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy 3689 movi v23.16b, #0x3e 3690 3691 // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, 3692 // so max_base_y <= 32. 3693 ld1 {v0.16b, v1.16b}, [x2] // left[] 3694 add v30.4h, v29.4h, v30.4h // ypos 3695 3696 movi v22.16b, #64 3697 movi v20.16b, #1 3698 movi v21.16b, #2 3699 3700 xtn v24.8b, v30.8h // (uint8_t)ypos 3701 uqshrn v26.8b, v30.8h, #6 // base 3702 and v24.8b, v24.8b, v23.8b // frac 3703 3704 uqadd v27.8b, v26.8b, v20.8b // base + 1 3705 uqadd v28.8b, v26.8b, v21.8b // base + 2 3706 sub v25.8b, v22.8b, v24.8b // 64 - frac 3707 uqadd v29.8b, v27.8b, v21.8b // base + 3 3708 3709 trn1 v24.2s, v24.2s, v24.2s // frac 3710 trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2 3711 trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3 3712 trn1 v25.2s, v25.2s, v25.2s // 64 - frac 3713 3714 movi v21.16b, #4 37151: 3716 mov v4.8b, v31.8b 3717 mov v5.8b, v31.8b 3718 tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2] 3719 tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3] 3720 3721 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3722 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3723 rshrn v16.8b, v16.8h, #6 3724 st1 {v16.s}[0], [x0], x1 3725 subs w4, w4, #2 3726 st1 {v16.s}[1], [x0], x1 3727 b.le 9f 3728 3729 uqadd v26.8b, v26.8b, v21.8b // base += 4 3730 uqadd v27.8b, v27.8b, v21.8b // base += 4 3731 b 1b 3732 37339: 3734 ret 3735 373680: // w == 8 3737 dup v29.8h, w5 // dy 3738 3739 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy 3740 movi v23.16b, #0x3e 3741 3742 // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, 3743 // so max_base_y <= 32. 3744 ld1 {v0.16b, v1.16b}, [x2] // left[] 3745 add v30.8h, v29.8h, v30.8h // ypos 3746 3747 movi v22.16b, #64 3748 movi v20.16b, #1 3749 movi v21.16b, #2 3750 3751 xtn v24.8b, v30.8h // (uint8_t)ypos 3752 uqshrn v26.8b, v30.8h, #6 // base 3753 and v24.8b, v24.8b, v23.8b // frac 3754 3755 uqadd v27.8b, v26.8b, v20.8b // base + 1 3756 uqadd v28.8b, v26.8b, v21.8b // base + 2 3757 sub v25.8b, v22.8b, v24.8b // 64 - frac 3758 uqadd v29.8b, v27.8b, v21.8b // base + 3 3759 3760 trn1 v24.2d, v24.2d, v24.2d // frac 3761 trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2 3762 trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3 3763 trn1 v25.2d, v25.2d, v25.2d // 64 - frac 3764 3765 movi v21.16b, #4 37661: 3767 mov v4.16b, v31.16b 3768 mov v5.16b, v31.16b 3769 tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base], left[base+2] 3770 tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1], left[base+3] 3771 3772 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) 3773 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac 3774 umull2 v17.8h, v4.16b, v25.16b 3775 umlal2 v17.8h, v5.16b, v24.16b 3776 rshrn v16.8b, v16.8h, #6 3777 rshrn v17.8b, v17.8h, #6 3778 st1 {v16.8b}, [x0], x1 3779 subs w4, w4, #2 3780 st1 {v17.8b}, [x0], x1 3781 b.le 9f 3782 3783 uqadd v26.16b, v26.16b, v21.16b // base += 4 3784 uqadd v27.16b, v27.16b, v21.16b // base += 4 3785 b 1b 3786 37879: 3788 ret 3789endfunc 3790 3791 3792// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, 3793// const pixel *const topleft, 3794// const int width, const int height, const int filt_idx, 3795// const int max_width, const int max_height); 3796function ipred_filter_8bpc_neon, export=1 3797 and w5, w5, #511 3798 movrel x6, X(filter_intra_taps) 3799 lsl w5, w5, #6 3800 add x6, x6, w5, uxtw 3801 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 3802 clz w9, w3 3803 movrel x5, ipred_filter_tbl 3804 ld1 {v20.8b, v21.8b, v22.8b}, [x6] 3805 sub w9, w9, #26 3806 ldrsw x9, [x5, w9, uxtw #2] 3807 sxtl v16.8h, v16.8b 3808 sxtl v17.8h, v17.8b 3809 add x5, x5, x9 3810 sxtl v18.8h, v18.8b 3811 sxtl v19.8h, v19.8b 3812 add x6, x0, x1 3813 lsl x1, x1, #1 3814 sxtl v20.8h, v20.8b 3815 sxtl v21.8h, v21.8b 3816 sxtl v22.8h, v22.8b 3817 br x5 381840: 3819 AARCH64_VALID_JUMP_TARGET 3820 ldur s0, [x2, #1] // top (0-3) 3821 sub x2, x2, #2 3822 mov x7, #-2 3823 uxtl v0.8h, v0.8b // top (0-3) 38244: 3825 ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) 3826 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) 3827 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 3828 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 3829 uxtl v1.8h, v1.8b // left (0-1) + topleft (2) 3830 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 3831 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) 3832 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 3833 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 3834 sqrshrun v2.8b, v2.8h, #4 3835 subs w4, w4, #2 3836 st1 {v2.s}[0], [x0], x1 3837 uxtl v0.8h, v2.8b 3838 st1 {v2.s}[1], [x6], x1 3839 ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3] 3840 b.gt 4b 3841 ret 384280: 3843 AARCH64_VALID_JUMP_TARGET 3844 ldur d0, [x2, #1] // top (0-7) 3845 sub x2, x2, #2 3846 mov x7, #-2 3847 uxtl v0.8h, v0.8b // top (0-7) 38488: 3849 ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) 3850 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) 3851 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) 3852 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) 3853 uxtl v1.8h, v1.8b // left (0-1) + topleft (2) 3854 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) 3855 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) 3856 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) 3857 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) 3858 mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) 3859 mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) 3860 mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) 3861 sqrshrun v2.8b, v2.8h, #4 3862 uxtl v1.8h, v2.8b // first block, in 16 bit 3863 mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) 3864 mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) 3865 mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5) 3866 mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6) 3867 sqrshrun v3.8b, v3.8h, #4 3868 subs w4, w4, #2 3869 st2 {v2.s, v3.s}[0], [x0], x1 3870 zip2 v0.2s, v2.2s, v3.2s 3871 st2 {v2.s, v3.s}[1], [x6], x1 3872 uxtl v0.8h, v0.8b 3873 b.gt 8b 3874 ret 3875160: 3876320: 3877 AARCH64_VALID_JUMP_TARGET 3878 add x8, x2, #1 3879 sub x2, x2, #2 3880 mov x7, #-2 3881 sub x1, x1, w3, uxtw 3882 mov w9, w3 3883 38841: 3885 ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2) 3886 uxtl v0.8h, v0.8b // left (0-1) + topleft (2) 38872: 3888 ld1 {v2.16b}, [x8], #16 // top(0-15) 3889 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) 3890 mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) 3891 uxtl v1.8h, v2.8b // top(0-7) 3892 uxtl2 v2.8h, v2.16b // top(8-15) 3893 mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) 3894 mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) 3895 mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) 3896 mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) 3897 mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) 3898 3899 mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) 3900 mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) 3901 mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) 3902 sqrshrun v3.8b, v3.8h, #4 3903 uxtl v0.8h, v3.8b // first block, in 16 bit 3904 mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) 3905 mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) 3906 mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) 3907 mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) 3908 3909 mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) 3910 mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) 3911 mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) 3912 sqrshrun v4.8b, v4.8h, #4 3913 uxtl v0.8h, v4.8b // second block, in 16 bit 3914 mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) 3915 mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) 3916 mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) 3917 mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) 3918 3919 mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) 3920 mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) 3921 mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) 3922 sqrshrun v5.8b, v5.8h, #4 3923 uxtl v0.8h, v5.8b // third block, in 16 bit 3924 mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) 3925 mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) 3926 mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) 3927 mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) 3928 3929 subs w3, w3, #16 3930 sqrshrun v6.8b, v6.8h, #4 3931 3932 st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 3933 st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 3934 b.le 8f 3935 ins v0.h[2], v2.h[7] 3936 ins v0.b[0], v6.b[7] 3937 ins v0.b[2], v6.b[3] 3938 b 2b 39398: 3940 subs w4, w4, #2 3941 b.le 9f 3942 sub x8, x6, w9, uxtw 3943 add x0, x0, x1 3944 add x6, x6, x1 3945 mov w3, w9 3946 b 1b 39479: 3948 ret 3949endfunc 3950 3951jumptable ipred_filter_tbl 3952 .word 320b - ipred_filter_tbl 3953 .word 160b - ipred_filter_tbl 3954 .word 80b - ipred_filter_tbl 3955 .word 40b - ipred_filter_tbl 3956endjumptable 3957 3958// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, 3959// const pixel *const pal, const uint8_t *idx, 3960// const int w, const int h); 3961function pal_pred_8bpc_neon, export=1 3962 ld1 {v0.8b}, [x2] 3963 clz w9, w4 3964 movrel x6, pal_pred_tbl 3965 sub w9, w9, #25 3966 movi v31.16b, #7 3967 ldrsw x9, [x6, w9, uxtw #2] 3968 add x6, x6, x9 3969 add x2, x0, x1 3970 lsl x1, x1, #1 3971 br x6 397240: 3973 AARCH64_VALID_JUMP_TARGET 39744: 3975 ld1 {v1.8b}, [x3], #8 3976 subs w5, w5, #4 3977 ushr v3.8b, v1.8b, #4 3978 and v2.8b, v1.8b, v31.8b 3979 zip1 v1.16b, v2.16b, v3.16b 3980 tbl v1.16b, {v0.16b}, v1.16b 3981 st1 {v1.s}[0], [x0], x1 3982 st1 {v1.s}[1], [x2], x1 3983 st1 {v1.s}[2], [x0], x1 3984 st1 {v1.s}[3], [x2], x1 3985 b.gt 4b 3986 ret 398780: 3988 AARCH64_VALID_JUMP_TARGET 39898: 3990 ld1 {v1.16b}, [x3], #16 3991 subs w5, w5, #4 3992 ushr v4.16b, v1.16b, #4 3993 and v3.16b, v1.16b, v31.16b 3994 zip1 v1.16b, v3.16b, v4.16b 3995 zip2 v2.16b, v3.16b, v4.16b 3996 tbl v1.16b, {v0.16b}, v1.16b 3997 st1 {v1.d}[0], [x0], x1 3998 tbl v2.16b, {v0.16b}, v2.16b 3999 st1 {v1.d}[1], [x2], x1 4000 st1 {v2.d}[0], [x0], x1 4001 st1 {v2.d}[1], [x2], x1 4002 b.gt 8b 4003 ret 4004160: 4005 AARCH64_VALID_JUMP_TARGET 400616: 4007 ld1 {v1.16b, v2.16b}, [x3], #32 4008 subs w5, w5, #4 4009 ushr v5.16b, v1.16b, #4 4010 and v4.16b, v1.16b, v31.16b 4011 ushr v7.16b, v2.16b, #4 4012 and v6.16b, v2.16b, v31.16b 4013 zip1 v1.16b, v4.16b, v5.16b 4014 zip2 v2.16b, v4.16b, v5.16b 4015 zip1 v3.16b, v6.16b, v7.16b 4016 tbl v1.16b, {v0.16b}, v1.16b 4017 zip2 v4.16b, v6.16b, v7.16b 4018 tbl v2.16b, {v0.16b}, v2.16b 4019 st1 {v1.16b}, [x0], x1 4020 tbl v3.16b, {v0.16b}, v3.16b 4021 st1 {v2.16b}, [x2], x1 4022 tbl v4.16b, {v0.16b}, v4.16b 4023 st1 {v3.16b}, [x0], x1 4024 st1 {v4.16b}, [x2], x1 4025 b.gt 16b 4026 ret 4027320: 4028 AARCH64_VALID_JUMP_TARGET 402932: 4030 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 4031 subs w5, w5, #4 4032 ushr v21.16b, v16.16b, #4 4033 and v20.16b, v16.16b, v31.16b 4034 ushr v23.16b, v17.16b, #4 4035 and v22.16b, v17.16b, v31.16b 4036 ushr v25.16b, v18.16b, #4 4037 and v24.16b, v18.16b, v31.16b 4038 ushr v27.16b, v19.16b, #4 4039 and v26.16b, v19.16b, v31.16b 4040 zip1 v16.16b, v20.16b, v21.16b 4041 zip2 v17.16b, v20.16b, v21.16b 4042 zip1 v18.16b, v22.16b, v23.16b 4043 zip2 v19.16b, v22.16b, v23.16b 4044 zip1 v20.16b, v24.16b, v25.16b 4045 zip2 v21.16b, v24.16b, v25.16b 4046 tbl v16.16b, {v0.16b}, v16.16b 4047 zip1 v22.16b, v26.16b, v27.16b 4048 tbl v17.16b, {v0.16b}, v17.16b 4049 zip2 v23.16b, v26.16b, v27.16b 4050 tbl v18.16b, {v0.16b}, v18.16b 4051 tbl v19.16b, {v0.16b}, v19.16b 4052 tbl v20.16b, {v0.16b}, v20.16b 4053 st1 {v16.16b, v17.16b}, [x0], x1 4054 tbl v21.16b, {v0.16b}, v21.16b 4055 st1 {v18.16b, v19.16b}, [x2], x1 4056 tbl v22.16b, {v0.16b}, v22.16b 4057 st1 {v20.16b, v21.16b}, [x0], x1 4058 tbl v23.16b, {v0.16b}, v23.16b 4059 st1 {v22.16b, v23.16b}, [x2], x1 4060 b.gt 32b 4061 ret 4062640: 4063 AARCH64_VALID_JUMP_TARGET 406464: 4065 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 4066 subs w5, w5, #2 4067 ushr v21.16b, v16.16b, #4 4068 and v20.16b, v16.16b, v31.16b 4069 ushr v23.16b, v17.16b, #4 4070 and v22.16b, v17.16b, v31.16b 4071 ushr v25.16b, v18.16b, #4 4072 and v24.16b, v18.16b, v31.16b 4073 ushr v27.16b, v19.16b, #4 4074 and v26.16b, v19.16b, v31.16b 4075 zip1 v16.16b, v20.16b, v21.16b 4076 zip2 v17.16b, v20.16b, v21.16b 4077 zip1 v18.16b, v22.16b, v23.16b 4078 zip2 v19.16b, v22.16b, v23.16b 4079 zip1 v20.16b, v24.16b, v25.16b 4080 zip2 v21.16b, v24.16b, v25.16b 4081 tbl v16.16b, {v0.16b}, v16.16b 4082 zip1 v22.16b, v26.16b, v27.16b 4083 tbl v17.16b, {v0.16b}, v17.16b 4084 zip2 v23.16b, v26.16b, v27.16b 4085 tbl v18.16b, {v0.16b}, v18.16b 4086 tbl v19.16b, {v0.16b}, v19.16b 4087 st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 4088 tbl v20.16b, {v0.16b}, v20.16b 4089 tbl v21.16b, {v0.16b}, v21.16b 4090 tbl v22.16b, {v0.16b}, v22.16b 4091 tbl v23.16b, {v0.16b}, v23.16b 4092 st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 4093 b.gt 64b 4094 ret 4095endfunc 4096 4097jumptable pal_pred_tbl 4098 .word 640b - pal_pred_tbl 4099 .word 320b - pal_pred_tbl 4100 .word 160b - pal_pred_tbl 4101 .word 80b - pal_pred_tbl 4102 .word 40b - pal_pred_tbl 4103endjumptable 4104 4105// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, 4106// const pixel *const topleft, 4107// const int width, const int height, 4108// const int16_t *ac, const int alpha); 4109function ipred_cfl_128_8bpc_neon, export=1 4110 clz w9, w3 4111 movrel x7, ipred_cfl_128_tbl 4112 sub w9, w9, #26 4113 ldrsw x9, [x7, w9, uxtw #2] 4114 movi v0.8h, #128 // dc 4115 dup v1.8h, w6 // alpha 4116 add x7, x7, x9 4117 add x6, x0, x1 4118 lsl x1, x1, #1 4119 br x7 4120L(ipred_cfl_splat_w4): 4121 AARCH64_VALID_JUMP_TARGET 41221: 4123 ld1 {v2.8h, v3.8h}, [x5], #32 4124 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha 4125 mul v3.8h, v3.8h, v1.8h 4126 cmlt v4.8h, v2.8h, #0 // sign 4127 cmlt v5.8h, v3.8h, #0 4128 add v2.8h, v2.8h, v4.8h // diff + sign 4129 add v3.8h, v3.8h, v5.8h 4130 srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() 4131 srshr v3.8h, v3.8h, #6 4132 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 4133 add v3.8h, v3.8h, v0.8h 4134 sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) 4135 sqxtun v3.8b, v3.8h 4136 st1 {v2.s}[0], [x0], x1 4137 st1 {v2.s}[1], [x6], x1 4138 subs w4, w4, #4 4139 st1 {v3.s}[0], [x0], x1 4140 st1 {v3.s}[1], [x6], x1 4141 b.gt 1b 4142 ret 4143L(ipred_cfl_splat_w8): 4144 AARCH64_VALID_JUMP_TARGET 41451: 4146 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 4147 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha 4148 mul v3.8h, v3.8h, v1.8h 4149 mul v4.8h, v4.8h, v1.8h 4150 mul v5.8h, v5.8h, v1.8h 4151 cmlt v16.8h, v2.8h, #0 // sign 4152 cmlt v17.8h, v3.8h, #0 4153 cmlt v18.8h, v4.8h, #0 4154 cmlt v19.8h, v5.8h, #0 4155 add v2.8h, v2.8h, v16.8h // diff + sign 4156 add v3.8h, v3.8h, v17.8h 4157 add v4.8h, v4.8h, v18.8h 4158 add v5.8h, v5.8h, v19.8h 4159 srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() 4160 srshr v3.8h, v3.8h, #6 4161 srshr v4.8h, v4.8h, #6 4162 srshr v5.8h, v5.8h, #6 4163 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 4164 add v3.8h, v3.8h, v0.8h 4165 add v4.8h, v4.8h, v0.8h 4166 add v5.8h, v5.8h, v0.8h 4167 sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) 4168 sqxtun v3.8b, v3.8h 4169 sqxtun v4.8b, v4.8h 4170 sqxtun v5.8b, v5.8h 4171 st1 {v2.8b}, [x0], x1 4172 st1 {v3.8b}, [x6], x1 4173 subs w4, w4, #4 4174 st1 {v4.8b}, [x0], x1 4175 st1 {v5.8b}, [x6], x1 4176 b.gt 1b 4177 ret 4178L(ipred_cfl_splat_w16): 4179 AARCH64_VALID_JUMP_TARGET 4180 add x7, x5, w3, uxtw #1 4181 sub x1, x1, w3, uxtw 4182 mov w9, w3 41831: 4184 ld1 {v2.8h, v3.8h}, [x5], #32 4185 ld1 {v4.8h, v5.8h}, [x7], #32 4186 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha 4187 mul v3.8h, v3.8h, v1.8h 4188 mul v4.8h, v4.8h, v1.8h 4189 mul v5.8h, v5.8h, v1.8h 4190 cmlt v16.8h, v2.8h, #0 // sign 4191 cmlt v17.8h, v3.8h, #0 4192 cmlt v18.8h, v4.8h, #0 4193 cmlt v19.8h, v5.8h, #0 4194 add v2.8h, v2.8h, v16.8h // diff + sign 4195 add v3.8h, v3.8h, v17.8h 4196 add v4.8h, v4.8h, v18.8h 4197 add v5.8h, v5.8h, v19.8h 4198 srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() 4199 srshr v3.8h, v3.8h, #6 4200 srshr v4.8h, v4.8h, #6 4201 srshr v5.8h, v5.8h, #6 4202 add v2.8h, v2.8h, v0.8h // dc + apply_sign() 4203 add v3.8h, v3.8h, v0.8h 4204 add v4.8h, v4.8h, v0.8h 4205 add v5.8h, v5.8h, v0.8h 4206 sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) 4207 sqxtun v3.8b, v3.8h 4208 sqxtun v4.8b, v4.8h 4209 sqxtun v5.8b, v5.8h 4210 subs w3, w3, #16 4211 st1 {v2.8b, v3.8b}, [x0], #16 4212 st1 {v4.8b, v5.8b}, [x6], #16 4213 b.gt 1b 4214 subs w4, w4, #2 4215 add x5, x5, w9, uxtw #1 4216 add x7, x7, w9, uxtw #1 4217 add x0, x0, x1 4218 add x6, x6, x1 4219 mov w3, w9 4220 b.gt 1b 4221 ret 4222endfunc 4223 4224jumptable ipred_cfl_128_tbl 4225ipred_cfl_splat_tbl: 4226 .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl 4227 .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl 4228 .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl 4229 .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl 4230endjumptable 4231 4232// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, 4233// const pixel *const topleft, 4234// const int width, const int height, 4235// const int16_t *ac, const int alpha); 4236function ipred_cfl_top_8bpc_neon, export=1 4237 clz w9, w3 4238 movrel x7, ipred_cfl_top_tbl 4239 sub w9, w9, #26 4240 ldrsw x9, [x7, w9, uxtw #2] 4241 dup v1.8h, w6 // alpha 4242 add x2, x2, #1 4243 add x7, x7, x9 4244 add x6, x0, x1 4245 lsl x1, x1, #1 4246 br x7 42474: 4248 AARCH64_VALID_JUMP_TARGET 4249 ld1r {v0.2s}, [x2] 4250 uaddlv h0, v0.8b 4251 urshr v0.4h, v0.4h, #3 4252 dup v0.8h, v0.h[0] 4253 b L(ipred_cfl_splat_w4) 42548: 4255 AARCH64_VALID_JUMP_TARGET 4256 ld1 {v0.8b}, [x2] 4257 uaddlv h0, v0.8b 4258 urshr v0.4h, v0.4h, #3 4259 dup v0.8h, v0.h[0] 4260 b L(ipred_cfl_splat_w8) 426116: 4262 AARCH64_VALID_JUMP_TARGET 4263 ld1 {v0.16b}, [x2] 4264 uaddlv h0, v0.16b 4265 urshr v0.4h, v0.4h, #4 4266 dup v0.8h, v0.h[0] 4267 b L(ipred_cfl_splat_w16) 426832: 4269 AARCH64_VALID_JUMP_TARGET 4270 ld1 {v2.16b, v3.16b}, [x2] 4271 uaddlv h2, v2.16b 4272 uaddlv h3, v3.16b 4273 add v2.4h, v2.4h, v3.4h 4274 urshr v2.4h, v2.4h, #5 4275 dup v0.8h, v2.h[0] 4276 b L(ipred_cfl_splat_w16) 4277endfunc 4278 4279jumptable ipred_cfl_top_tbl 4280 .word 32b - ipred_cfl_top_tbl 4281 .word 16b - ipred_cfl_top_tbl 4282 .word 8b - ipred_cfl_top_tbl 4283 .word 4b - ipred_cfl_top_tbl 4284endjumptable 4285 4286// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, 4287// const pixel *const topleft, 4288// const int width, const int height, 4289// const int16_t *ac, const int alpha); 4290function ipred_cfl_left_8bpc_neon, export=1 4291 sub x2, x2, w4, uxtw 4292 clz w9, w3 4293 clz w8, w4 4294 movrel x10, ipred_cfl_splat_tbl 4295 movrel x7, ipred_cfl_left_tbl 4296 sub w9, w9, #26 4297 sub w8, w8, #26 4298 ldrsw x9, [x10, w9, uxtw #2] 4299 ldrsw x8, [x7, w8, uxtw #2] 4300 dup v1.8h, w6 // alpha 4301 add x9, x10, x9 4302 add x7, x7, x8 4303 add x6, x0, x1 4304 lsl x1, x1, #1 4305 br x7 4306 4307L(ipred_cfl_left_h4): 4308 AARCH64_VALID_JUMP_TARGET 4309 ld1r {v0.2s}, [x2] 4310 uaddlv h0, v0.8b 4311 urshr v0.4h, v0.4h, #3 4312 dup v0.8h, v0.h[0] 4313 br x9 4314 4315L(ipred_cfl_left_h8): 4316 AARCH64_VALID_JUMP_TARGET 4317 ld1 {v0.8b}, [x2] 4318 uaddlv h0, v0.8b 4319 urshr v0.4h, v0.4h, #3 4320 dup v0.8h, v0.h[0] 4321 br x9 4322 4323L(ipred_cfl_left_h16): 4324 AARCH64_VALID_JUMP_TARGET 4325 ld1 {v0.16b}, [x2] 4326 uaddlv h0, v0.16b 4327 urshr v0.4h, v0.4h, #4 4328 dup v0.8h, v0.h[0] 4329 br x9 4330 4331L(ipred_cfl_left_h32): 4332 AARCH64_VALID_JUMP_TARGET 4333 ld1 {v2.16b, v3.16b}, [x2] 4334 uaddlv h2, v2.16b 4335 uaddlv h3, v3.16b 4336 add v2.4h, v2.4h, v3.4h 4337 urshr v2.4h, v2.4h, #5 4338 dup v0.8h, v2.h[0] 4339 br x9 4340endfunc 4341 4342jumptable ipred_cfl_left_tbl 4343 .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl 4344 .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl 4345 .word L(ipred_cfl_left_h8) - ipred_cfl_left_tbl 4346 .word L(ipred_cfl_left_h4) - ipred_cfl_left_tbl 4347endjumptable 4348 4349// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, 4350// const pixel *const topleft, 4351// const int width, const int height, 4352// const int16_t *ac, const int alpha); 4353function ipred_cfl_8bpc_neon, export=1 4354 sub x2, x2, w4, uxtw 4355 add w8, w3, w4 // width + height 4356 dup v1.8h, w6 // alpha 4357 clz w9, w3 4358 clz w6, w4 4359 dup v16.8h, w8 // width + height 4360 movrel x7, ipred_cfl_tbl 4361 rbit w8, w8 // rbit(width + height) 4362 sub w9, w9, #22 // 26 leading bits, minus table offset 4 4363 sub w6, w6, #26 4364 clz w8, w8 // ctz(width + height) 4365 ldrsw x9, [x7, w9, uxtw #2] 4366 ldrsw x6, [x7, w6, uxtw #2] 4367 neg w8, w8 // -ctz(width + height) 4368 add x9, x7, x9 4369 add x7, x7, x6 4370 ushr v16.8h, v16.8h, #1 // (width + height) >> 1 4371 dup v17.8h, w8 // -ctz(width + height) 4372 add x6, x0, x1 4373 lsl x1, x1, #1 4374 br x7 4375 4376L(ipred_cfl_h4): 4377 AARCH64_VALID_JUMP_TARGET 4378 ld1 {v0.s}[0], [x2], #4 4379 ins v0.s[1], wzr 4380 add x2, x2, #1 4381 uaddlv h0, v0.8b 4382 br x9 4383L(ipred_cfl_w4): 4384 AARCH64_VALID_JUMP_TARGET 4385 ld1 {v2.s}[0], [x2] 4386 ins v2.s[1], wzr 4387 add v0.4h, v0.4h, v16.4h 4388 uaddlv h2, v2.8b 4389 cmp w4, #4 4390 add v0.4h, v0.4h, v2.4h 4391 ushl v0.4h, v0.4h, v17.4h 4392 b.eq 1f 4393 // h = 8/16 4394 mov w16, #(0x3334/2) 4395 movk w16, #(0x5556/2), lsl #16 4396 add w17, w4, w4 // w17 = 2*h = 16 or 32 4397 lsr w16, w16, w17 4398 dup v16.4h, w16 4399 sqdmulh v0.4h, v0.4h, v16.4h 44001: 4401 dup v0.8h, v0.h[0] 4402 b L(ipred_cfl_splat_w4) 4403 4404L(ipred_cfl_h8): 4405 AARCH64_VALID_JUMP_TARGET 4406 ld1 {v0.8b}, [x2], #8 4407 uaddlv h0, v0.8b 4408 add x2, x2, #1 4409 br x9 4410L(ipred_cfl_w8): 4411 AARCH64_VALID_JUMP_TARGET 4412 ld1 {v2.8b}, [x2] 4413 add v0.4h, v0.4h, v16.4h 4414 uaddlv h2, v2.8b 4415 cmp w4, #8 4416 add v0.4h, v0.4h, v2.4h 4417 ushl v0.4h, v0.4h, v17.4h 4418 b.eq 1f 4419 // h = 4/16/32 4420 cmp w4, #32 4421 mov w16, #(0x3334/2) 4422 mov w17, #(0x5556/2) 4423 csel w16, w16, w17, eq 4424 dup v16.4h, w16 4425 sqdmulh v0.4h, v0.4h, v16.4h 44261: 4427 dup v0.8h, v0.h[0] 4428 b L(ipred_cfl_splat_w8) 4429 4430L(ipred_cfl_h16): 4431 AARCH64_VALID_JUMP_TARGET 4432 ld1 {v0.16b}, [x2], #16 4433 uaddlv h0, v0.16b 4434 add x2, x2, #1 4435 br x9 4436L(ipred_cfl_w16): 4437 AARCH64_VALID_JUMP_TARGET 4438 ld1 {v2.16b}, [x2] 4439 add v0.4h, v0.4h, v16.4h 4440 uaddlv h2, v2.16b 4441 cmp w4, #16 4442 add v0.4h, v0.4h, v2.4h 4443 ushl v0.4h, v0.4h, v17.4h 4444 b.eq 1f 4445 // h = 4/8/32 4446 cmp w4, #4 4447 mov w16, #(0x3334/2) 4448 mov w17, #(0x5556/2) 4449 csel w16, w16, w17, eq 4450 dup v16.4h, w16 4451 sqdmulh v0.4h, v0.4h, v16.4h 44521: 4453 dup v0.8h, v0.h[0] 4454 b L(ipred_cfl_splat_w16) 4455 4456L(ipred_cfl_h32): 4457 AARCH64_VALID_JUMP_TARGET 4458 ld1 {v2.16b, v3.16b}, [x2], #32 4459 uaddlv h2, v2.16b 4460 uaddlv h3, v3.16b 4461 add x2, x2, #1 4462 add v0.4h, v2.4h, v3.4h 4463 br x9 4464L(ipred_cfl_w32): 4465 AARCH64_VALID_JUMP_TARGET 4466 ld1 {v2.16b, v3.16b}, [x2] 4467 add v0.4h, v0.4h, v16.4h 4468 uaddlv h2, v2.16b 4469 uaddlv h3, v3.16b 4470 cmp w4, #32 4471 add v0.4h, v0.4h, v2.4h 4472 add v0.4h, v0.4h, v3.4h 4473 ushl v0.4h, v0.4h, v17.4h 4474 b.eq 1f 4475 // h = 8/16 4476 mov w16, #(0x5556/2) 4477 movk w16, #(0x3334/2), lsl #16 4478 add w17, w4, w4 // w17 = 2*h = 16 or 32 4479 lsr w16, w16, w17 4480 dup v16.4h, w16 4481 sqdmulh v0.4h, v0.4h, v16.4h 44821: 4483 dup v0.8h, v0.h[0] 4484 b L(ipred_cfl_splat_w16) 4485endfunc 4486 4487jumptable ipred_cfl_tbl 4488 .word L(ipred_cfl_h32) - ipred_cfl_tbl 4489 .word L(ipred_cfl_h16) - ipred_cfl_tbl 4490 .word L(ipred_cfl_h8) - ipred_cfl_tbl 4491 .word L(ipred_cfl_h4) - ipred_cfl_tbl 4492 .word L(ipred_cfl_w32) - ipred_cfl_tbl 4493 .word L(ipred_cfl_w16) - ipred_cfl_tbl 4494 .word L(ipred_cfl_w8) - ipred_cfl_tbl 4495 .word L(ipred_cfl_w4) - ipred_cfl_tbl 4496endjumptable 4497 4498// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, 4499// const ptrdiff_t stride, const int w_pad, 4500// const int h_pad, const int cw, const int ch); 4501function ipred_cfl_ac_420_8bpc_neon, export=1 4502 clz w8, w5 4503 lsl w4, w4, #2 4504 movrel x7, ipred_cfl_ac_420_tbl 4505 sub w8, w8, #27 4506 ldrsw x8, [x7, w8, uxtw #2] 4507 movi v16.8h, #0 4508 movi v17.8h, #0 4509 movi v18.8h, #0 4510 movi v19.8h, #0 4511 add x7, x7, x8 4512 sub w8, w6, w4 // height - h_pad 4513 rbit w9, w5 // rbit(width) 4514 rbit w10, w6 // rbit(height) 4515 clz w9, w9 // ctz(width) 4516 clz w10, w10 // ctz(height) 4517 add w9, w9, w10 // log2sz 4518 add x10, x1, x2 4519 dup v31.4s, w9 4520 lsl x2, x2, #1 4521 neg v31.4s, v31.4s // -log2sz 4522 br x7 4523 4524L(ipred_cfl_ac_420_w4): 4525 AARCH64_VALID_JUMP_TARGET 45261: // Copy and subsample input 4527 ld1 {v0.8b}, [x1], x2 4528 ld1 {v1.8b}, [x10], x2 4529 ld1 {v0.d}[1], [x1], x2 4530 ld1 {v1.d}[1], [x10], x2 4531 uaddlp v0.8h, v0.16b 4532 uaddlp v1.8h, v1.16b 4533 add v0.8h, v0.8h, v1.8h 4534 shl v0.8h, v0.8h, #1 4535 subs w8, w8, #2 4536 st1 {v0.8h}, [x0], #16 4537 add v16.8h, v16.8h, v0.8h 4538 b.gt 1b 4539 trn2 v1.2d, v0.2d, v0.2d 4540 trn2 v0.2d, v0.2d, v0.2d 4541L(ipred_cfl_ac_420_w4_hpad): 4542 cbz w4, 3f 45432: // Vertical padding (h_pad > 0) 4544 subs w4, w4, #4 4545 st1 {v0.8h, v1.8h}, [x0], #32 4546 add v16.8h, v16.8h, v0.8h 4547 add v17.8h, v17.8h, v1.8h 4548 b.gt 2b 45493: 4550 // Aggregate the sums 4551 add v0.8h, v16.8h, v17.8h 4552 uaddlv s0, v0.8h // sum 4553 sub x0, x0, w6, uxtw #3 4554 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 4555 dup v4.8h, v4.h[0] 45566: // Subtract dc from ac 4557 ld1 {v0.8h, v1.8h}, [x0] 4558 subs w6, w6, #4 4559 sub v0.8h, v0.8h, v4.8h 4560 sub v1.8h, v1.8h, v4.8h 4561 st1 {v0.8h, v1.8h}, [x0], #32 4562 b.gt 6b 4563 ret 4564 4565L(ipred_cfl_ac_420_w8): 4566 AARCH64_VALID_JUMP_TARGET 4567 cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 45681: // Copy and subsample input, without padding 4569 ld1 {v0.16b}, [x1], x2 4570 ld1 {v1.16b}, [x10], x2 4571 ld1 {v2.16b}, [x1], x2 4572 uaddlp v0.8h, v0.16b 4573 ld1 {v3.16b}, [x10], x2 4574 uaddlp v1.8h, v1.16b 4575 uaddlp v2.8h, v2.16b 4576 uaddlp v3.8h, v3.16b 4577 add v0.8h, v0.8h, v1.8h 4578 add v2.8h, v2.8h, v3.8h 4579 shl v0.8h, v0.8h, #1 4580 shl v1.8h, v2.8h, #1 4581 subs w8, w8, #2 4582 st1 {v0.8h, v1.8h}, [x0], #32 4583 add v16.8h, v16.8h, v0.8h 4584 add v17.8h, v17.8h, v1.8h 4585 b.gt 1b 4586 mov v0.16b, v1.16b 4587 b L(ipred_cfl_ac_420_w8_hpad) 4588 4589L(ipred_cfl_ac_420_w8_wpad): 45901: // Copy and subsample input, padding 4 4591 ld1 {v0.8b}, [x1], x2 4592 ld1 {v1.8b}, [x10], x2 4593 ld1 {v0.d}[1], [x1], x2 4594 ld1 {v1.d}[1], [x10], x2 4595 uaddlp v0.8h, v0.16b 4596 uaddlp v1.8h, v1.16b 4597 add v0.8h, v0.8h, v1.8h 4598 shl v0.8h, v0.8h, #1 4599 dup v1.4h, v0.h[3] 4600 dup v3.4h, v0.h[7] 4601 trn2 v2.2d, v0.2d, v0.2d 4602 subs w8, w8, #2 4603 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 4604 add v16.4h, v16.4h, v0.4h 4605 add v17.4h, v17.4h, v1.4h 4606 add v18.4h, v18.4h, v2.4h 4607 add v19.4h, v19.4h, v3.4h 4608 b.gt 1b 4609 trn1 v0.2d, v2.2d, v3.2d 4610 trn1 v1.2d, v2.2d, v3.2d 4611 4612L(ipred_cfl_ac_420_w8_hpad): 4613 cbz w4, 3f 46142: // Vertical padding (h_pad > 0) 4615 subs w4, w4, #4 4616 st1 {v0.8h, v1.8h}, [x0], #32 4617 add v16.8h, v16.8h, v0.8h 4618 add v17.8h, v17.8h, v1.8h 4619 st1 {v0.8h, v1.8h}, [x0], #32 4620 add v18.8h, v18.8h, v0.8h 4621 add v19.8h, v19.8h, v1.8h 4622 b.gt 2b 46233: 4624 4625L(ipred_cfl_ac_420_w8_calc_subtract_dc): 4626 // Aggregate the sums 4627 add v0.8h, v16.8h, v17.8h 4628 add v2.8h, v18.8h, v19.8h 4629 uaddlp v0.4s, v0.8h 4630 uaddlp v2.4s, v2.8h 4631 add v0.4s, v0.4s, v2.4s 4632 addv s0, v0.4s // sum 4633 sub x0, x0, w6, uxtw #4 4634 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 4635 dup v4.8h, v4.h[0] 4636L(ipred_cfl_ac_420_w8_subtract_dc): 46376: // Subtract dc from ac 4638 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] 4639 subs w6, w6, #4 4640 sub v0.8h, v0.8h, v4.8h 4641 sub v1.8h, v1.8h, v4.8h 4642 sub v2.8h, v2.8h, v4.8h 4643 sub v3.8h, v3.8h, v4.8h 4644 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4645 b.gt 6b 4646 ret 4647 4648L(ipred_cfl_ac_420_w16): 4649 AARCH64_VALID_JUMP_TARGET 4650 movrel x7, ipred_cfl_ac_420_w16_tbl 4651 ldrsw x3, [x7, w3, uxtw #2] 4652 add x7, x7, x3 4653 br x7 4654 4655L(ipred_cfl_ac_420_w16_wpad0): 4656 AARCH64_VALID_JUMP_TARGET 46571: // Copy and subsample input, without padding 4658 ld1 {v0.16b, v1.16b}, [x1], x2 4659 ld1 {v2.16b, v3.16b}, [x10], x2 4660 uaddlp v0.8h, v0.16b 4661 ld1 {v4.16b, v5.16b}, [x1], x2 4662 uaddlp v1.8h, v1.16b 4663 ld1 {v6.16b, v7.16b}, [x10], x2 4664 uaddlp v2.8h, v2.16b 4665 uaddlp v3.8h, v3.16b 4666 uaddlp v4.8h, v4.16b 4667 uaddlp v5.8h, v5.16b 4668 uaddlp v6.8h, v6.16b 4669 uaddlp v7.8h, v7.16b 4670 add v0.8h, v0.8h, v2.8h 4671 add v1.8h, v1.8h, v3.8h 4672 add v4.8h, v4.8h, v6.8h 4673 add v5.8h, v5.8h, v7.8h 4674 shl v0.8h, v0.8h, #1 4675 shl v1.8h, v1.8h, #1 4676 shl v2.8h, v4.8h, #1 4677 shl v3.8h, v5.8h, #1 4678 subs w8, w8, #2 4679 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4680 add v16.8h, v16.8h, v0.8h 4681 add v17.8h, v17.8h, v1.8h 4682 add v18.8h, v18.8h, v2.8h 4683 add v19.8h, v19.8h, v3.8h 4684 b.gt 1b 4685 mov v0.16b, v2.16b 4686 mov v1.16b, v3.16b 4687 b L(ipred_cfl_ac_420_w16_hpad) 4688 4689L(ipred_cfl_ac_420_w16_wpad1): 4690 AARCH64_VALID_JUMP_TARGET 46911: // Copy and subsample input, padding 4 4692 ldr d1, [x1, #16] 4693 ld1 {v0.16b}, [x1], x2 4694 ldr d3, [x10, #16] 4695 ld1 {v2.16b}, [x10], x2 4696 uaddlp v1.4h, v1.8b 4697 ldr d5, [x1, #16] 4698 uaddlp v0.8h, v0.16b 4699 ld1 {v4.16b}, [x1], x2 4700 uaddlp v3.4h, v3.8b 4701 ldr d7, [x10, #16] 4702 uaddlp v2.8h, v2.16b 4703 ld1 {v6.16b}, [x10], x2 4704 uaddlp v5.4h, v5.8b 4705 uaddlp v4.8h, v4.16b 4706 uaddlp v7.4h, v7.8b 4707 uaddlp v6.8h, v6.16b 4708 add v1.4h, v1.4h, v3.4h 4709 add v0.8h, v0.8h, v2.8h 4710 add v5.4h, v5.4h, v7.4h 4711 add v4.8h, v4.8h, v6.8h 4712 shl v1.4h, v1.4h, #1 4713 shl v0.8h, v0.8h, #1 4714 shl v3.4h, v5.4h, #1 4715 shl v2.8h, v4.8h, #1 4716 dup v4.4h, v1.h[3] 4717 dup v5.4h, v3.h[3] 4718 trn1 v1.2d, v1.2d, v4.2d 4719 trn1 v3.2d, v3.2d, v5.2d 4720 subs w8, w8, #2 4721 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4722 add v16.8h, v16.8h, v0.8h 4723 add v17.8h, v17.8h, v1.8h 4724 add v18.8h, v18.8h, v2.8h 4725 add v19.8h, v19.8h, v3.8h 4726 b.gt 1b 4727 mov v0.16b, v2.16b 4728 mov v1.16b, v3.16b 4729 b L(ipred_cfl_ac_420_w16_hpad) 4730 4731L(ipred_cfl_ac_420_w16_wpad2): 4732 AARCH64_VALID_JUMP_TARGET 47331: // Copy and subsample input, padding 8 4734 ld1 {v0.16b}, [x1], x2 4735 ld1 {v2.16b}, [x10], x2 4736 ld1 {v4.16b}, [x1], x2 4737 uaddlp v0.8h, v0.16b 4738 ld1 {v6.16b}, [x10], x2 4739 uaddlp v2.8h, v2.16b 4740 uaddlp v4.8h, v4.16b 4741 uaddlp v6.8h, v6.16b 4742 add v0.8h, v0.8h, v2.8h 4743 add v4.8h, v4.8h, v6.8h 4744 shl v0.8h, v0.8h, #1 4745 shl v2.8h, v4.8h, #1 4746 dup v1.8h, v0.h[7] 4747 dup v3.8h, v2.h[7] 4748 subs w8, w8, #2 4749 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4750 add v16.8h, v16.8h, v0.8h 4751 add v17.8h, v17.8h, v1.8h 4752 add v18.8h, v18.8h, v2.8h 4753 add v19.8h, v19.8h, v3.8h 4754 b.gt 1b 4755 mov v0.16b, v2.16b 4756 mov v1.16b, v3.16b 4757 b L(ipred_cfl_ac_420_w16_hpad) 4758 4759L(ipred_cfl_ac_420_w16_wpad3): 4760 AARCH64_VALID_JUMP_TARGET 47611: // Copy and subsample input, padding 12 4762 ld1 {v0.8b}, [x1], x2 4763 ld1 {v2.8b}, [x10], x2 4764 ld1 {v4.8b}, [x1], x2 4765 uaddlp v0.4h, v0.8b 4766 ld1 {v6.8b}, [x10], x2 4767 uaddlp v2.4h, v2.8b 4768 uaddlp v4.4h, v4.8b 4769 uaddlp v6.4h, v6.8b 4770 add v0.4h, v0.4h, v2.4h 4771 add v4.4h, v4.4h, v6.4h 4772 shl v0.4h, v0.4h, #1 4773 shl v2.4h, v4.4h, #1 4774 dup v1.8h, v0.h[3] 4775 dup v3.8h, v2.h[3] 4776 trn1 v0.2d, v0.2d, v1.2d 4777 trn1 v2.2d, v2.2d, v3.2d 4778 subs w8, w8, #2 4779 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4780 add v16.8h, v16.8h, v0.8h 4781 add v17.8h, v17.8h, v1.8h 4782 add v18.8h, v18.8h, v2.8h 4783 add v19.8h, v19.8h, v3.8h 4784 b.gt 1b 4785 mov v0.16b, v2.16b 4786 mov v1.16b, v3.16b 4787 4788L(ipred_cfl_ac_420_w16_hpad): 4789 cbz w4, 3f 47902: // Vertical padding (h_pad > 0) 4791 subs w4, w4, #4 4792 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4793 add v16.8h, v16.8h, v0.8h 4794 add v17.8h, v17.8h, v1.8h 4795 add v18.8h, v18.8h, v2.8h 4796 add v19.8h, v19.8h, v3.8h 4797 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4798 add v16.8h, v16.8h, v0.8h 4799 add v17.8h, v17.8h, v1.8h 4800 add v18.8h, v18.8h, v2.8h 4801 add v19.8h, v19.8h, v3.8h 4802 b.gt 2b 48033: 4804 4805 // Double the height and reuse the w8 summing/subtracting 4806 lsl w6, w6, #1 4807 b L(ipred_cfl_ac_420_w8_calc_subtract_dc) 4808endfunc 4809 4810jumptable ipred_cfl_ac_420_tbl 4811 .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl 4812 .word L(ipred_cfl_ac_420_w8) - ipred_cfl_ac_420_tbl 4813 .word L(ipred_cfl_ac_420_w4) - ipred_cfl_ac_420_tbl 4814endjumptable 4815 4816jumptable ipred_cfl_ac_420_w16_tbl 4817 .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl 4818 .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl 4819 .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl 4820 .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl 4821endjumptable 4822 4823// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, 4824// const ptrdiff_t stride, const int w_pad, 4825// const int h_pad, const int cw, const int ch); 4826function ipred_cfl_ac_422_8bpc_neon, export=1 4827 clz w8, w5 4828 lsl w4, w4, #2 4829 movrel x7, ipred_cfl_ac_422_tbl 4830 sub w8, w8, #27 4831 ldrsw x8, [x7, w8, uxtw #2] 4832 movi v16.8h, #0 4833 movi v17.8h, #0 4834 movi v18.8h, #0 4835 movi v19.8h, #0 4836 add x7, x7, x8 4837 sub w8, w6, w4 // height - h_pad 4838 rbit w9, w5 // rbit(width) 4839 rbit w10, w6 // rbit(height) 4840 clz w9, w9 // ctz(width) 4841 clz w10, w10 // ctz(height) 4842 add w9, w9, w10 // log2sz 4843 add x10, x1, x2 4844 dup v31.4s, w9 4845 lsl x2, x2, #1 4846 neg v31.4s, v31.4s // -log2sz 4847 br x7 4848 4849L(ipred_cfl_ac_422_w4): 4850 AARCH64_VALID_JUMP_TARGET 48511: // Copy and subsample input 4852 ld1 {v0.8b}, [x1], x2 4853 ld1 {v0.d}[1], [x10], x2 4854 ld1 {v1.8b}, [x1], x2 4855 ld1 {v1.d}[1], [x10], x2 4856 uaddlp v0.8h, v0.16b 4857 uaddlp v1.8h, v1.16b 4858 shl v0.8h, v0.8h, #2 4859 shl v1.8h, v1.8h, #2 4860 subs w8, w8, #4 4861 add v16.8h, v16.8h, v0.8h 4862 add v17.8h, v17.8h, v1.8h 4863 st1 {v0.8h, v1.8h}, [x0], #32 4864 b.gt 1b 4865 trn2 v0.2d, v1.2d, v1.2d 4866 trn2 v1.2d, v1.2d, v1.2d 4867 b L(ipred_cfl_ac_420_w4_hpad) 4868 4869L(ipred_cfl_ac_422_w8): 4870 AARCH64_VALID_JUMP_TARGET 4871 cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 48721: // Copy and subsample input, without padding 4873 ld1 {v0.16b}, [x1], x2 4874 ld1 {v1.16b}, [x10], x2 4875 ld1 {v2.16b}, [x1], x2 4876 uaddlp v0.8h, v0.16b 4877 ld1 {v3.16b}, [x10], x2 4878 uaddlp v1.8h, v1.16b 4879 uaddlp v2.8h, v2.16b 4880 uaddlp v3.8h, v3.16b 4881 shl v0.8h, v0.8h, #2 4882 shl v1.8h, v1.8h, #2 4883 shl v2.8h, v2.8h, #2 4884 shl v3.8h, v3.8h, #2 4885 subs w8, w8, #4 4886 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4887 add v16.8h, v16.8h, v0.8h 4888 add v17.8h, v17.8h, v1.8h 4889 add v18.8h, v18.8h, v2.8h 4890 add v19.8h, v19.8h, v3.8h 4891 b.gt 1b 4892 mov v0.16b, v3.16b 4893 mov v1.16b, v3.16b 4894 b L(ipred_cfl_ac_420_w8_hpad) 4895 4896L(ipred_cfl_ac_422_w8_wpad): 48971: // Copy and subsample input, padding 4 4898 ld1 {v0.8b}, [x1], x2 4899 ld1 {v0.d}[1], [x10], x2 4900 ld1 {v2.8b}, [x1], x2 4901 ld1 {v2.d}[1], [x10], x2 4902 uaddlp v0.8h, v0.16b 4903 uaddlp v2.8h, v2.16b 4904 shl v0.8h, v0.8h, #2 4905 shl v2.8h, v2.8h, #2 4906 dup v4.4h, v0.h[3] 4907 dup v5.8h, v0.h[7] 4908 dup v6.4h, v2.h[3] 4909 dup v7.8h, v2.h[7] 4910 trn2 v1.2d, v0.2d, v5.2d 4911 trn1 v0.2d, v0.2d, v4.2d 4912 trn2 v3.2d, v2.2d, v7.2d 4913 trn1 v2.2d, v2.2d, v6.2d 4914 subs w8, w8, #4 4915 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4916 add v16.8h, v16.8h, v0.8h 4917 add v17.8h, v17.8h, v1.8h 4918 add v18.8h, v18.8h, v2.8h 4919 add v19.8h, v19.8h, v3.8h 4920 b.gt 1b 4921 mov v0.16b, v3.16b 4922 mov v1.16b, v3.16b 4923 b L(ipred_cfl_ac_420_w8_hpad) 4924 4925L(ipred_cfl_ac_422_w16): 4926 AARCH64_VALID_JUMP_TARGET 4927 movrel x7, ipred_cfl_ac_422_w16_tbl 4928 ldrsw x3, [x7, w3, uxtw #2] 4929 add x7, x7, x3 4930 br x7 4931 4932L(ipred_cfl_ac_422_w16_wpad0): 4933 AARCH64_VALID_JUMP_TARGET 49341: // Copy and subsample input, without padding 4935 ld1 {v0.16b, v1.16b}, [x1], x2 4936 ld1 {v2.16b, v3.16b}, [x10], x2 4937 uaddlp v0.8h, v0.16b 4938 uaddlp v1.8h, v1.16b 4939 uaddlp v2.8h, v2.16b 4940 uaddlp v3.8h, v3.16b 4941 shl v0.8h, v0.8h, #2 4942 shl v1.8h, v1.8h, #2 4943 shl v2.8h, v2.8h, #2 4944 shl v3.8h, v3.8h, #2 4945 subs w8, w8, #2 4946 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4947 add v16.8h, v16.8h, v0.8h 4948 add v17.8h, v17.8h, v1.8h 4949 add v18.8h, v18.8h, v2.8h 4950 add v19.8h, v19.8h, v3.8h 4951 b.gt 1b 4952 mov v0.16b, v2.16b 4953 mov v1.16b, v3.16b 4954 b L(ipred_cfl_ac_420_w16_hpad) 4955 4956L(ipred_cfl_ac_422_w16_wpad1): 4957 AARCH64_VALID_JUMP_TARGET 49581: // Copy and subsample input, padding 4 4959 ldr d1, [x1, #16] 4960 ld1 {v0.16b}, [x1], x2 4961 ldr d3, [x10, #16] 4962 ld1 {v2.16b}, [x10], x2 4963 uaddlp v1.4h, v1.8b 4964 uaddlp v0.8h, v0.16b 4965 uaddlp v3.4h, v3.8b 4966 uaddlp v2.8h, v2.16b 4967 shl v1.4h, v1.4h, #2 4968 shl v0.8h, v0.8h, #2 4969 shl v3.4h, v3.4h, #2 4970 shl v2.8h, v2.8h, #2 4971 dup v4.4h, v1.h[3] 4972 dup v5.4h, v3.h[3] 4973 trn1 v1.2d, v1.2d, v4.2d 4974 trn1 v3.2d, v3.2d, v5.2d 4975 subs w8, w8, #2 4976 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4977 add v16.8h, v16.8h, v0.8h 4978 add v17.8h, v17.8h, v1.8h 4979 add v18.8h, v18.8h, v2.8h 4980 add v19.8h, v19.8h, v3.8h 4981 b.gt 1b 4982 mov v0.16b, v2.16b 4983 mov v1.16b, v3.16b 4984 b L(ipred_cfl_ac_420_w16_hpad) 4985 4986L(ipred_cfl_ac_422_w16_wpad2): 4987 AARCH64_VALID_JUMP_TARGET 49881: // Copy and subsample input, padding 8 4989 ld1 {v0.16b}, [x1], x2 4990 ld1 {v2.16b}, [x10], x2 4991 uaddlp v0.8h, v0.16b 4992 uaddlp v2.8h, v2.16b 4993 shl v0.8h, v0.8h, #2 4994 shl v2.8h, v2.8h, #2 4995 dup v1.8h, v0.h[7] 4996 dup v3.8h, v2.h[7] 4997 subs w8, w8, #2 4998 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 4999 add v16.8h, v16.8h, v0.8h 5000 add v17.8h, v17.8h, v1.8h 5001 add v18.8h, v18.8h, v2.8h 5002 add v19.8h, v19.8h, v3.8h 5003 b.gt 1b 5004 mov v0.16b, v2.16b 5005 mov v1.16b, v3.16b 5006 b L(ipred_cfl_ac_420_w16_hpad) 5007 5008L(ipred_cfl_ac_422_w16_wpad3): 5009 AARCH64_VALID_JUMP_TARGET 50101: // Copy and subsample input, padding 12 5011 ld1 {v0.8b}, [x1], x2 5012 ld1 {v2.8b}, [x10], x2 5013 uaddlp v0.4h, v0.8b 5014 uaddlp v2.4h, v2.8b 5015 shl v0.4h, v0.4h, #2 5016 shl v2.4h, v2.4h, #2 5017 dup v1.8h, v0.h[3] 5018 dup v3.8h, v2.h[3] 5019 trn1 v0.2d, v0.2d, v1.2d 5020 trn1 v2.2d, v2.2d, v3.2d 5021 subs w8, w8, #2 5022 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5023 add v16.8h, v16.8h, v0.8h 5024 add v17.8h, v17.8h, v1.8h 5025 add v18.8h, v18.8h, v2.8h 5026 add v19.8h, v19.8h, v3.8h 5027 b.gt 1b 5028 mov v0.16b, v2.16b 5029 mov v1.16b, v3.16b 5030 b L(ipred_cfl_ac_420_w16_hpad) 5031endfunc 5032 5033jumptable ipred_cfl_ac_422_tbl 5034 .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl 5035 .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl 5036 .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl 5037endjumptable 5038 5039jumptable ipred_cfl_ac_422_w16_tbl 5040 .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl 5041 .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl 5042 .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl 5043 .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl 5044endjumptable 5045 5046// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, 5047// const ptrdiff_t stride, const int w_pad, 5048// const int h_pad, const int cw, const int ch); 5049function ipred_cfl_ac_444_8bpc_neon, export=1 5050 clz w8, w5 5051 lsl w4, w4, #2 5052 movrel x7, ipred_cfl_ac_444_tbl 5053 sub w8, w8, #26 5054 ldrsw x8, [x7, w8, uxtw #2] 5055 movi v16.8h, #0 5056 movi v17.8h, #0 5057 movi v18.8h, #0 5058 movi v19.8h, #0 5059 add x7, x7, x8 5060 sub w8, w6, w4 // height - h_pad 5061 rbit w9, w5 // rbit(width) 5062 rbit w10, w6 // rbit(height) 5063 clz w9, w9 // ctz(width) 5064 clz w10, w10 // ctz(height) 5065 add w9, w9, w10 // log2sz 5066 add x10, x1, x2 5067 dup v31.4s, w9 5068 lsl x2, x2, #1 5069 neg v31.4s, v31.4s // -log2sz 5070 br x7 5071 5072L(ipred_cfl_ac_444_w4): 5073 AARCH64_VALID_JUMP_TARGET 50741: // Copy and expand input 5075 ld1 {v0.s}[0], [x1], x2 5076 ld1 {v0.s}[1], [x10], x2 5077 ld1 {v1.s}[0], [x1], x2 5078 ld1 {v1.s}[1], [x10], x2 5079 ushll v0.8h, v0.8b, #3 5080 ushll v1.8h, v1.8b, #3 5081 subs w8, w8, #4 5082 add v16.8h, v16.8h, v0.8h 5083 add v17.8h, v17.8h, v1.8h 5084 st1 {v0.8h, v1.8h}, [x0], #32 5085 b.gt 1b 5086 trn2 v0.2d, v1.2d, v1.2d 5087 trn2 v1.2d, v1.2d, v1.2d 5088 b L(ipred_cfl_ac_420_w4_hpad) 5089 5090L(ipred_cfl_ac_444_w8): 5091 AARCH64_VALID_JUMP_TARGET 50921: // Copy and expand input 5093 ld1 {v0.8b}, [x1], x2 5094 ld1 {v1.8b}, [x10], x2 5095 ld1 {v2.8b}, [x1], x2 5096 ushll v0.8h, v0.8b, #3 5097 ld1 {v3.8b}, [x10], x2 5098 ushll v1.8h, v1.8b, #3 5099 ushll v2.8h, v2.8b, #3 5100 ushll v3.8h, v3.8b, #3 5101 subs w8, w8, #4 5102 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5103 add v16.8h, v16.8h, v0.8h 5104 add v17.8h, v17.8h, v1.8h 5105 add v18.8h, v18.8h, v2.8h 5106 add v19.8h, v19.8h, v3.8h 5107 b.gt 1b 5108 mov v0.16b, v3.16b 5109 mov v1.16b, v3.16b 5110 b L(ipred_cfl_ac_420_w8_hpad) 5111 5112L(ipred_cfl_ac_444_w16): 5113 AARCH64_VALID_JUMP_TARGET 5114 cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 51151: // Copy and expand input, without padding 5116 ld1 {v0.16b}, [x1], x2 5117 ld1 {v2.16b}, [x10], x2 5118 ld1 {v4.16b}, [x1], x2 5119 ushll2 v1.8h, v0.16b, #3 5120 ushll v0.8h, v0.8b, #3 5121 ld1 {v6.16b}, [x10], x2 5122 ushll2 v3.8h, v2.16b, #3 5123 ushll v2.8h, v2.8b, #3 5124 ushll2 v5.8h, v4.16b, #3 5125 ushll v4.8h, v4.8b, #3 5126 ushll2 v7.8h, v6.16b, #3 5127 ushll v6.8h, v6.8b, #3 5128 subs w8, w8, #4 5129 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5130 add v16.8h, v16.8h, v0.8h 5131 add v17.8h, v17.8h, v1.8h 5132 add v18.8h, v18.8h, v2.8h 5133 add v19.8h, v19.8h, v3.8h 5134 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5135 add v16.8h, v16.8h, v4.8h 5136 add v17.8h, v17.8h, v5.8h 5137 add v18.8h, v18.8h, v6.8h 5138 add v19.8h, v19.8h, v7.8h 5139 b.gt 1b 5140 mov v0.16b, v6.16b 5141 mov v1.16b, v7.16b 5142 mov v2.16b, v6.16b 5143 mov v3.16b, v7.16b 5144 b L(ipred_cfl_ac_420_w16_hpad) 5145 5146L(ipred_cfl_ac_444_w16_wpad): 51471: // Copy and expand input, padding 8 5148 ld1 {v0.8b}, [x1], x2 5149 ld1 {v2.8b}, [x10], x2 5150 ld1 {v4.8b}, [x1], x2 5151 ld1 {v6.8b}, [x10], x2 5152 ushll v0.8h, v0.8b, #3 5153 ushll v2.8h, v2.8b, #3 5154 ushll v4.8h, v4.8b, #3 5155 ushll v6.8h, v6.8b, #3 5156 dup v1.8h, v0.h[7] 5157 dup v3.8h, v2.h[7] 5158 dup v5.8h, v4.h[7] 5159 dup v7.8h, v6.h[7] 5160 subs w8, w8, #4 5161 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5162 add v16.8h, v16.8h, v0.8h 5163 add v17.8h, v17.8h, v1.8h 5164 add v18.8h, v18.8h, v2.8h 5165 add v19.8h, v19.8h, v3.8h 5166 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5167 add v16.8h, v16.8h, v4.8h 5168 add v17.8h, v17.8h, v5.8h 5169 add v18.8h, v18.8h, v6.8h 5170 add v19.8h, v19.8h, v7.8h 5171 b.gt 1b 5172 mov v0.16b, v6.16b 5173 mov v1.16b, v7.16b 5174 mov v2.16b, v6.16b 5175 mov v3.16b, v7.16b 5176 b L(ipred_cfl_ac_420_w16_hpad) 5177 5178L(ipred_cfl_ac_444_w32): 5179 AARCH64_VALID_JUMP_TARGET 5180 movrel x7, ipred_cfl_ac_444_w32_tbl 5181 lsr w3, w3, #1 5182 ldrsw x3, [x7, w3, uxtw #2] 5183 add x7, x7, x3 5184 br x7 5185 5186L(ipred_cfl_ac_444_w32_wpad0): 5187 AARCH64_VALID_JUMP_TARGET 51881: // Copy and expand input, without padding 5189 ld1 {v2.16b, v3.16b}, [x1], x2 5190 ld1 {v6.16b, v7.16b}, [x10], x2 5191 ushll v0.8h, v2.8b, #3 5192 ushll2 v1.8h, v2.16b, #3 5193 ushll v2.8h, v3.8b, #3 5194 ushll2 v3.8h, v3.16b, #3 5195 ushll v4.8h, v6.8b, #3 5196 ushll2 v5.8h, v6.16b, #3 5197 ushll v6.8h, v7.8b, #3 5198 ushll2 v7.8h, v7.16b, #3 5199 subs w8, w8, #2 5200 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5201 add v16.8h, v16.8h, v0.8h 5202 add v17.8h, v17.8h, v1.8h 5203 add v18.8h, v18.8h, v2.8h 5204 add v19.8h, v19.8h, v3.8h 5205 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5206 add v16.8h, v16.8h, v4.8h 5207 add v17.8h, v17.8h, v5.8h 5208 add v18.8h, v18.8h, v6.8h 5209 add v19.8h, v19.8h, v7.8h 5210 b.gt 1b 5211 b L(ipred_cfl_ac_444_w32_hpad) 5212 5213L(ipred_cfl_ac_444_w32_wpad2): 5214 AARCH64_VALID_JUMP_TARGET 52151: // Copy and expand input, padding 8 5216 ldr d2, [x1, #16] 5217 ld1 {v1.16b}, [x1], x2 5218 ldr d6, [x10, #16] 5219 ld1 {v5.16b}, [x10], x2 5220 ushll v2.8h, v2.8b, #3 5221 ushll v0.8h, v1.8b, #3 5222 ushll2 v1.8h, v1.16b, #3 5223 ushll v6.8h, v6.8b, #3 5224 ushll v4.8h, v5.8b, #3 5225 ushll2 v5.8h, v5.16b, #3 5226 dup v3.8h, v2.h[7] 5227 dup v7.8h, v6.h[7] 5228 subs w8, w8, #2 5229 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5230 add v16.8h, v16.8h, v0.8h 5231 add v17.8h, v17.8h, v1.8h 5232 add v18.8h, v18.8h, v2.8h 5233 add v19.8h, v19.8h, v3.8h 5234 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5235 add v16.8h, v16.8h, v4.8h 5236 add v17.8h, v17.8h, v5.8h 5237 add v18.8h, v18.8h, v6.8h 5238 add v19.8h, v19.8h, v7.8h 5239 b.gt 1b 5240 b L(ipred_cfl_ac_444_w32_hpad) 5241 5242L(ipred_cfl_ac_444_w32_wpad4): 5243 AARCH64_VALID_JUMP_TARGET 52441: // Copy and expand input, padding 16 5245 ld1 {v1.16b}, [x1], x2 5246 ld1 {v5.16b}, [x10], x2 5247 ushll v0.8h, v1.8b, #3 5248 ushll2 v1.8h, v1.16b, #3 5249 ushll v4.8h, v5.8b, #3 5250 ushll2 v5.8h, v5.16b, #3 5251 dup v2.8h, v1.h[7] 5252 dup v3.8h, v1.h[7] 5253 dup v6.8h, v5.h[7] 5254 dup v7.8h, v5.h[7] 5255 subs w8, w8, #2 5256 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5257 add v16.8h, v16.8h, v0.8h 5258 add v17.8h, v17.8h, v1.8h 5259 add v18.8h, v18.8h, v2.8h 5260 add v19.8h, v19.8h, v3.8h 5261 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5262 add v16.8h, v16.8h, v4.8h 5263 add v17.8h, v17.8h, v5.8h 5264 add v18.8h, v18.8h, v6.8h 5265 add v19.8h, v19.8h, v7.8h 5266 b.gt 1b 5267 b L(ipred_cfl_ac_444_w32_hpad) 5268 5269L(ipred_cfl_ac_444_w32_wpad6): 5270 AARCH64_VALID_JUMP_TARGET 52711: // Copy and expand input, padding 24 5272 ld1 {v0.8b}, [x1], x2 5273 ld1 {v4.8b}, [x10], x2 5274 ushll v0.8h, v0.8b, #3 5275 ushll v4.8h, v4.8b, #3 5276 dup v1.8h, v0.h[7] 5277 dup v2.8h, v0.h[7] 5278 dup v3.8h, v0.h[7] 5279 dup v5.8h, v4.h[7] 5280 dup v6.8h, v4.h[7] 5281 dup v7.8h, v4.h[7] 5282 subs w8, w8, #2 5283 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 5284 add v16.8h, v16.8h, v0.8h 5285 add v17.8h, v17.8h, v1.8h 5286 add v18.8h, v18.8h, v2.8h 5287 add v19.8h, v19.8h, v3.8h 5288 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5289 add v16.8h, v16.8h, v4.8h 5290 add v17.8h, v17.8h, v5.8h 5291 add v18.8h, v18.8h, v6.8h 5292 add v19.8h, v19.8h, v7.8h 5293 b.gt 1b 5294 5295L(ipred_cfl_ac_444_w32_hpad): 5296 cbz w4, 3f 52972: // Vertical padding (h_pad > 0) 5298 subs w4, w4, #2 5299 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5300 add v16.8h, v16.8h, v4.8h 5301 add v17.8h, v17.8h, v5.8h 5302 add v18.8h, v18.8h, v6.8h 5303 add v19.8h, v19.8h, v7.8h 5304 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 5305 add v16.8h, v16.8h, v4.8h 5306 add v17.8h, v17.8h, v5.8h 5307 add v18.8h, v18.8h, v6.8h 5308 add v19.8h, v19.8h, v7.8h 5309 b.gt 2b 53103: 5311 5312 // Quadruple the height and reuse the w8 subtracting 5313 lsl w6, w6, #2 5314 // Aggregate the sums, with wider intermediates earlier than in 5315 // ipred_cfl_ac_420_w8_calc_subtract_dc. 5316 uaddlp v0.4s, v16.8h 5317 uaddlp v1.4s, v17.8h 5318 uaddlp v2.4s, v18.8h 5319 uaddlp v3.4s, v19.8h 5320 add v0.4s, v0.4s, v1.4s 5321 add v2.4s, v2.4s, v3.4s 5322 add v0.4s, v0.4s, v2.4s 5323 addv s0, v0.4s // sum 5324 sub x0, x0, w6, uxtw #4 5325 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz 5326 dup v4.8h, v4.h[0] 5327 b L(ipred_cfl_ac_420_w8_subtract_dc) 5328endfunc 5329 5330jumptable ipred_cfl_ac_444_tbl 5331 .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl 5332 .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl 5333 .word L(ipred_cfl_ac_444_w8) - ipred_cfl_ac_444_tbl 5334 .word L(ipred_cfl_ac_444_w4) - ipred_cfl_ac_444_tbl 5335endjumptable 5336 5337jumptable ipred_cfl_ac_444_w32_tbl 5338 .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl 5339 .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl 5340 .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl 5341 .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl 5342endjumptable 5343