1/* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2019, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
// DC prediction with no usable neighbours: fill the whole block with the
// mid-range value (bitdepth_max + 1) >> 1.
function ipred_dc_128_16bpc_neon, export=1
        ldr             w8,  [sp]                  // bitdepth_max (9th arg, passed on the stack)
        clz             w3,  w3
        movrel          x5,  ipred_dc_128_tbl
        sub             w3,  w3,  #25              // clz(width)-25: 0 for w=64 .. 4 for w=4
        ldrsw           x3,  [x5, w3, uxtw #2]
        dup             v0.8h,  w8
        add             x5,  x5,  x3
        add             x6,  x0,  x1               // x6 = second output row pointer
        lsl             x1,  x1,  #1               // stride *= 2 (two rows written per pointer)
        urshr           v0.8h,  v0.8h,  #1         // (bitdepth_max + 1) >> 1 = mid-gray
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
4:
        // Four rows per iteration, interleaved over x0/x6.
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b
16:
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
        sub             x1,  x1,  #64              // compensate for the 64-byte post-increment below
64:
        // 64 pixels = 128 bytes per row: two 64-byte stores per pointer.
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            64b
        ret
endfunc

jumptable ipred_dc_128_tbl
        .word 640b - ipred_dc_128_tbl
        .word 320b - ipred_dc_128_tbl
        .word 160b - ipred_dc_128_tbl
        .word  80b - ipred_dc_128_tbl
        .word  40b - ipred_dc_128_tbl
endjumptable

// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Vertical prediction: copy the top-neighbour row to every output row.
function ipred_v_16bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_v_tbl
        sub             w3,  w3,  #25              // clz(width)-25: 0 for w=64 .. 4 for w=4
        ldrsw           x3,  [x5, w3, uxtw #2]
        add             x2,  x2,  #2               // skip topleft; x2 now points at the top row
        add             x5,  x5,  x3
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h},  [x2]
16:
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        // Top row is 128 bytes; keep it in v0-v7 across the whole loop.
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], #64
        sub             x1,  x1,  #64
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x2]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x6], x1
        b.gt            64b
        ret
endfunc

jumptable ipred_v_tbl
        .word 640b - ipred_v_tbl
        .word 320b - ipred_v_tbl
        .word 160b - ipred_v_tbl
        .word  80b - ipred_v_tbl
        .word  40b - ipred_v_tbl
endjumptable

// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Horizontal prediction: replicate each left-neighbour pixel across its row.
// Left pixels sit below topleft in memory, so x2 walks backwards (x7 = -8);
// each ld4r picks up four of them, with v3 belonging to the topmost row.
function ipred_h_16bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_h_tbl
        sub             w3,  w3,  #25
        ldrsw           x3,  [x5, w3, uxtw #2]
        sub             x2,  x2,  #8               // point at the 4 left pixels above the cursor
        add             x5,  x5,  x3
        mov             x7,  #-8                   // step upwards through the left edge
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.4h},  [x0], x1
        st1             {v2.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]             // second half of the row via plain stores,
        str             q2,  [x6, #16]             // first half via st1 so the pointer advances
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        // Offsets 16/32/64/96 plus the st1 at offset 0 cover the full
        // 128-byte (64-pixel) row.
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        stp             q3,  q3,  [x0, #64]
        stp             q2,  q2,  [x6, #64]
        stp             q3,  q3,  [x0, #96]
        stp             q2,  q2,  [x6, #96]
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        stp             q1,  q1,  [x0, #64]
        stp             q0,  q0,  [x6, #64]
        stp             q1,  q1,  [x0, #96]
        stp             q0,  q0,  [x6, #96]
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            64b
        ret
endfunc

jumptable ipred_h_tbl
        .word 640b - ipred_h_tbl
        .word 320b - ipred_h_tbl
        .word 160b - ipred_h_tbl
        .word  80b - ipred_h_tbl
        .word  40b - ipred_h_tbl
endjumptable

// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
// DC prediction from the top edge only: fill the block with the rounded
// average of the top row (shift = log2(width)).
function ipred_dc_top_16bpc_neon, export=1
        clz             w3,  w3
        movrel          x5,  ipred_dc_top_tbl
        sub             w3,  w3,  #25
        ldrsw           x3,  [x5, w3, uxtw #2]
        add             x2,  x2,  #2               // x2 = top row
        add             x5,  x5,  x3
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,  v0.4h                 // sum of 4 top pixels
        urshr           v0.4h,  v0.4h,  #2         // rounded average
        dup             v0.4h,  v0.h[0]
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,  v0.8h
        urshr           v0.4h,  v0.4h,  #3
        dup             v0.8h,  v0.h[0]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h},  [x2]
        addp            v0.8h,  v0.8h,  v1.8h      // pairwise reduce 16 -> 8
        addv            h0,  v0.8h
        urshr           v2.4h,  v0.4h,  #4
        dup             v0.8h,  v2.h[0]
        dup             v1.8h,  v2.h[0]
16:
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2]
        addp            v0.8h,  v0.8h,  v1.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v0.8h,  v0.8h,  v2.8h
        uaddlv          s0,  v0.8h                 // widen to 32 bit; sum can exceed 16 bits
        rshrn           v4.4h,  v0.4s,  #5
        dup             v0.8h,  v4.h[0]
        dup             v1.8h,  v4.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v4.h[0]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], #64
        addp            v0.8h,  v0.8h,  v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x2]
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v4.8h,  v4.8h,  v5.8h
        addp            v6.8h,  v6.8h,  v7.8h
        addp            v0.8h,  v0.8h,  v2.8h
        addp            v4.8h,  v4.8h,  v6.8h
        addp            v0.8h,  v0.8h,  v4.8h
        uaddlv          s0,  v0.8h
        rshrn           v4.4h,  v0.4s,  #6
        sub             x1,  x1,  #64
        dup             v0.8h,  v4.h[0]
        dup             v1.8h,  v4.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v4.h[0]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            64b
        ret
endfunc

jumptable ipred_dc_top_tbl
        .word 640b - ipred_dc_top_tbl
        .word 320b - ipred_dc_top_tbl
        .word 160b - ipred_dc_top_tbl
        .word  80b - ipred_dc_top_tbl
        .word  40b - ipred_dc_top_tbl
endjumptable

// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
// DC prediction from the left edge only. Two-stage dispatch through one
// table: br x5 jumps to the height-specific averaging code, which then
// jumps via br x3 to the width-specific fill loop.
function ipred_dc_left_16bpc_neon, export=1
        sub             x2,  x2,  w4,  uxtw #1     // x2 = start of the left column
        clz             w3,  w3
        clz             w7,  w4
        movrel          x5,  ipred_dc_left_tbl
        sub             w3,  w3,  #20              // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrsw           x3,  [x5, w3, uxtw #2]
        ldrsw           x7,  [x5, w7, uxtw #2]
        add             x3,  x5,  x3               // x3 = width-specific fill loop
        add             x5,  x5,  x7               // x5 = height-specific average code
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,  v0.4h
        urshr           v0.4h,  v0.4h,  #2
        dup             v0.8h,  v0.h[0]
        br              x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
1:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,  v0.8h
        urshr           v0.4h,  v0.4h,  #3
        dup             v0.8h,  v0.h[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
1:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h},  [x2]
        addp            v0.8h,  v0.8h,  v1.8h
        addv            h0,  v0.8h
        urshr           v2.4h,  v0.4h,  #4
        dup             v0.8h,  v2.h[0]
        dup             v1.8h,  v2.h[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b
1:
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2]
        addp            v0.8h,  v0.8h,  v1.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v0.8h,  v0.8h,  v2.8h
        uaddlp          v0.4s,  v0.8h              // widen before final reduce to avoid overflow
        addv            s0,  v0.4s
        rshrn           v4.4h,  v0.4s,  #5
        dup             v0.8h,  v4.h[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], #64
        addp            v0.8h,  v0.8h,  v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x2]
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v4.8h,  v4.8h,  v5.8h
        addp            v6.8h,  v6.8h,  v7.8h
        addp            v0.8h,  v0.8h,  v2.8h
        addp            v4.8h,  v4.8h,  v6.8h
        addp            v0.8h,  v0.8h,  v4.8h
        uaddlv          s0,  v0.8h
        rshrn           v4.4h,  v0.4s,  #6
        dup             v0.8h,  v4.h[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
        sub             x1,  x1,  #64
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            1b
        ret
endfunc

jumptable ipred_dc_left_tbl
        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
endjumptable

// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
// Full DC prediction: average of left column and top row. Uses the same
// two-stage dispatch as dc_left: height part (hN) sums the left edge, then
// the width part (wN) adds the top edge and divides by (width + height).
// The division is split into a power-of-two shift (ushl by -ctz(w+h)) and,
// when w+h is not a power of two, a fixed-point multiply:
// 0xAAAB/2^17 ~= 1/3, 0x6667/2^17 ~= 1/5.
function ipred_dc_16bpc_neon, export=1
        sub             x2,  x2,  w4,  uxtw #1     // x2 = start of the left column
        add             w7,  w3,  w4               // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.4s, w7                 // width + height
        movrel          x5,  ipred_dc_tbl
        rbit            w7,  w7                    // rbit(width + height)
        sub             w3,  w3,  #20              // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                    // ctz(width + height)
        ldrsw           x3,  [x5, w3, uxtw #2]
        ldrsw           x6,  [x5, w6, uxtw #2]
        neg             w7,  w7                    // -ctz(width + height)
        add             x3,  x5,  x3               // x3 = width-specific tail
        add             x5,  x5,  x6               // x5 = height-specific head
        ushr            v16.4s, v16.4s, #1         // (width + height) >> 1 (rounding bias)
        dup             v17.4s, w7                 // -ctz(width + height)
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,  v0.4h                 // s0 = sum of left edge
        add             x2,  x2,  #2               // skip topleft; x2 now at the top row
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.4h},  [x2]
        add             v0.2s,  v0.2s,  v16.2s     // left sum + rounding bias
        uaddlv          s1,  v1.4h                 // top sum
        cmp             w4,  #4
        add             v0.2s,  v0.2s,  v1.2s
        ushl            v0.2s,  v0.2s,  v17.2s     // >> ctz(w+h)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq          // h=16: w+h=20 -> /5, h=8: 12 -> /3
        dup             v16.2s, w16
        mul             v0.2s,  v0.2s,  v16.2s
        ushr            v0.2s,  v0.2s,  #17
1:
        dup             v0.4h,  v0.h[0]
2:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,  v0.8h
        add             x2,  x2,  #2
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h},  [x2]
        add             v0.2s,  v0.2s,  v16.2s
        uaddlv          s1,  v1.8h
        cmp             w4,  #8
        add             v0.2s,  v0.2s,  v1.2s
        ushl            v0.2s,  v0.2s,  v17.2s
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq          // h=32: w+h=40 -> /5, else /3
        dup             v16.2s, w16
        mul             v0.2s,  v0.2s,  v16.2s
        ushr            v0.2s,  v0.2s,  #17
1:
        dup             v0.8h,  v0.h[0]
2:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h},  [x2], #32
        addp            v0.8h,  v0.8h,  v1.8h
        add             x2,  x2,  #2
        uaddlv          s0,  v0.8h
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h},  [x2]
        add             v0.2s,  v0.2s,  v16.2s
        addp            v1.8h,  v1.8h,  v2.8h
        uaddlv          s1,  v1.8h
        cmp             w4,  #16
        add             v0.2s,  v0.2s,  v1.2s
        ushl            v4.2s,  v0.2s,  v17.2s
        b.eq            1f
        // h = 4/8/32/64
        tst             w4,  #(32+16+8)            // 16 added to make a consecutive bitmask
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq          // h=4/64 -> /5, h=8/32 -> /3
        dup             v16.2s, w16
        mul             v4.2s,  v4.2s,  v16.2s
        ushr            v4.2s,  v4.2s,  #17
1:
        dup             v0.8h,  v4.h[0]
        dup             v1.8h,  v4.h[0]
2:
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h},  [x0], x1
        st1             {v0.8h, v1.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], #64
        addp            v0.8h,  v0.8h,  v1.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v0.8h,  v0.8h,  v2.8h
        add             x2,  x2,  #2
        uaddlv          s0,  v0.8h
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h},  [x2]
        add             v0.2s,  v0.2s,  v16.2s
        addp            v1.8h,  v1.8h,  v2.8h
        addp            v3.8h,  v3.8h,  v4.8h
        addp            v1.8h,  v1.8h,  v3.8h
        uaddlv          s1,  v1.8h
        cmp             w4,  #32
        add             v0.2s,  v0.2s,  v1.2s
        ushl            v4.2s,  v0.2s,  v17.2s
        b.eq            1f
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq          // h=8: w+h=40 -> /5, else /3
        dup             v16.2s, w16
        mul             v4.2s,  v4.2s,  v16.2s
        ushr            v4.2s,  v4.2s,  #17
1:
        dup             v0.8h,  v4.h[0]
        dup             v1.8h,  v4.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], #64
        addp            v0.8h,  v0.8h,  v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x2], #64
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v4.8h,  v4.8h,  v5.8h
        addp            v6.8h,  v6.8h,  v7.8h
        addp            v0.8h,  v0.8h,  v2.8h
        addp            v4.8h,  v4.8h,  v6.8h
        addp            v0.8h,  v0.8h,  v4.8h
        add             x2,  x2,  #2
        uaddlv          s0,  v0.8h
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h},  [x2], #64
        add             v0.2s,  v0.2s,  v16.2s
        addp            v1.8h,  v1.8h,  v2.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h},  [x2]
        addp            v3.8h,  v3.8h,  v4.8h
        addp            v20.8h, v20.8h, v21.8h
        addp            v22.8h, v22.8h, v23.8h
        addp            v1.8h,  v1.8h,  v3.8h
        addp            v20.8h, v20.8h, v22.8h
        addp            v1.8h,  v1.8h,  v20.8h
        uaddlv          s1,  v1.8h
        cmp             w4,  #64
        add             v0.2s,  v0.2s,  v1.2s
        ushl            v4.2s,  v0.2s,  v17.2s
        b.eq            1f
        // h = 16/32
        cmp             w4,  #16
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq          // h=16: w+h=80 -> /5, h=32: 96 -> /3
        dup             v16.2s, w16
        mul             v4.2s,  v4.2s,  v16.2s
        ushr            v4.2s,  v4.2s,  #17
1:
        sub             x1,  x1,  #64
        dup             v0.8h,  v4.h[0]
        dup             v1.8h,  v4.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x6], x1
        b.gt            2b
        ret
endfunc

jumptable ipred_dc_tbl
        .word L(ipred_dc_h64) - ipred_dc_tbl
        .word L(ipred_dc_h32) - ipred_dc_tbl
        .word L(ipred_dc_h16) - ipred_dc_tbl
        .word L(ipred_dc_h8)  - ipred_dc_tbl
        .word L(ipred_dc_h4)  - ipred_dc_tbl
        .word L(ipred_dc_w64) - ipred_dc_tbl
        .word L(ipred_dc_w32) - ipred_dc_tbl
        .word L(ipred_dc_w16) - ipred_dc_tbl
        .word L(ipred_dc_w8)  - ipred_dc_tbl
        .word L(ipred_dc_w4)  - ipred_dc_tbl
endjumptable

// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
// Paeth prediction: per pixel, base = left + top - topleft; pick whichever
// of left/top/topleft is closest to base (left wins ties over top, top over
// topleft), implemented branchlessly with sabd/cmge/bsl/bit.
function ipred_paeth_16bpc_neon, export=1
        clz             w9,  w3
        movrel          x5,  ipred_paeth_tbl
        sub             w9,  w9,  #25
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.8h},  [x2]             // v4 = topleft, broadcast
        add             x8,  x2,  #2               // x8 = top row
        sub             x2,  x2,  #8               // x2 walks up the left edge
        add             x5,  x5,  x9
        mov             x7,  #-8
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.2d},  [x8]             // top 4 pixels, duplicated in both halves
        sub             v6.8h,  v5.8h,  v4.8h      // top - topleft
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
        // Pack two rows' left values per register (rows in reverse order).
        zip1            v0.2d,  v0.2d,  v1.2d
        zip1            v2.2d,  v2.2d,  v3.2d
        add             v16.8h, v6.8h,  v0.8h      // base
        add             v17.8h, v6.8h,  v2.8h
        sabd            v20.8h, v5.8h,  v16.8h     // tdiff
        sabd            v21.8h, v5.8h,  v17.8h
        sabd            v22.8h, v4.8h,  v16.8h     // tldiff
        sabd            v23.8h, v4.8h,  v17.8h
        sabd            v16.8h, v0.8h,  v16.8h     // ldiff
        sabd            v17.8h, v2.8h,  v17.8h
        umin            v18.8h, v20.8h, v22.8h     // min(tdiff, tldiff)
        umin            v19.8h, v21.8h, v23.8h
        cmge            v20.8h, v22.8h, v20.8h     // tldiff >= tdiff
        cmge            v21.8h, v23.8h, v21.8h
        cmge            v16.8h, v18.8h, v16.8h     // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h, v19.8h, v17.8h
        bsl             v21.16b, v5.16b, v4.16b    // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b, v4.16b
        bit             v21.16b, v2.16b, v17.16b   // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b, v16.16b
        st1             {v21.d}[1],  [x0], x1
        st1             {v21.d}[0],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v20.d}[1],  [x0], x1
        st1             {v20.d}[0],  [x6], x1
        b.gt            4b
        ret
80:
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v5.8h},  [x8], #16        // first 8 top pixels
        mov             w9,  w3                    // save width for row-group restarts
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw #1     // back up over the row written by #16 steps
1:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
2:
        sub             v6.8h,  v5.8h,  v4.8h      // top - topleft
        add             v16.8h, v6.8h,  v0.8h      // base
        add             v17.8h, v6.8h,  v1.8h
        add             v18.8h, v6.8h,  v2.8h
        add             v19.8h, v6.8h,  v3.8h
        sabd            v20.8h, v5.8h,  v16.8h     // tdiff
        sabd            v21.8h, v5.8h,  v17.8h
        sabd            v22.8h, v5.8h,  v18.8h
        sabd            v23.8h, v5.8h,  v19.8h
        sabd            v24.8h, v4.8h,  v16.8h     // tldiff
        sabd            v25.8h, v4.8h,  v17.8h
        sabd            v26.8h, v4.8h,  v18.8h
        sabd            v27.8h, v4.8h,  v19.8h
        sabd            v16.8h, v0.8h,  v16.8h     // ldiff
        sabd            v17.8h, v1.8h,  v17.8h
        sabd            v18.8h, v2.8h,  v18.8h
        sabd            v19.8h, v3.8h,  v19.8h
        umin            v28.8h, v20.8h, v24.8h     // min(tdiff, tldiff)
        umin            v29.8h, v21.8h, v25.8h
        umin            v30.8h, v22.8h, v26.8h
        umin            v31.8h, v23.8h, v27.8h
        cmge            v20.8h, v24.8h, v20.8h     // tldiff >= tdiff
        cmge            v21.8h, v25.8h, v21.8h
        cmge            v22.8h, v26.8h, v22.8h
        cmge            v23.8h, v27.8h, v23.8h
        cmge            v16.8h, v28.8h, v16.8h     // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h, v29.8h, v17.8h
        cmge            v18.8h, v30.8h, v18.8h
        cmge            v19.8h, v31.8h, v19.8h
        bsl             v23.16b, v5.16b, v4.16b    // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b, v4.16b
        bsl             v21.16b, v5.16b, v4.16b
        bsl             v20.16b, v5.16b, v4.16b
        bit             v23.16b, v3.16b, v19.16b   // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b, v18.16b
        bit             v21.16b, v1.16b, v17.16b
        bit             v20.16b, v0.16b, v16.16b
        st1             {v23.8h},  [x0], #16
        st1             {v22.8h},  [x6], #16
        subs            w3,  w3,  #8
        st1             {v21.8h},  [x5], #16
        st1             {v20.8h},  [x10], #16
        b.le            8f
        ld1             {v5.8h},  [x8], #16
        b               2b
8:
        subs            w4,  w4,  #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8,  w9,  uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        // Load the top row as early as possible
        ld1             {v5.8h},  [x8], #16
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9
        b               1b
9:
        ret
endfunc

jumptable ipred_paeth_tbl
        .word 640b - ipred_paeth_tbl
        .word 320b - ipred_paeth_tbl
        .word 160b - ipred_paeth_tbl
        .word  80b - ipred_paeth_tbl
        .word  40b - ipred_paeth_tbl
endjumptable

// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
// Smooth prediction: distance-weighted blend of the top and left edges.
// pred = ((bottom+right)*256 + (left-right)*w_hor + (top-bottom)*w_ver
//         + 256) >> 9, with the weights taken from the sm_weights table.
function ipred_smooth_16bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4,  uxtw        // x11 = vertical weights (indexed by height)
        add             x10, x10, w3,  uxtw        // x10 = horizontal weights (indexed by width)
        clz             w9,  w3
        movrel          x5,  ipred_smooth_tbl
        sub             x12, x2,  w4,  uxtw #1
        sub             w9,  w9,  #25
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.8h},  [x12]            // bottom
        add             x8,  x2,  #2
        add             x5,  x5,  x9
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d},  [x8]             // top
        ld1r            {v7.2s},  [x10]            // weights_hor
        sub             x2,  x2,  #8
        mov             x7,  #-8
        dup             v5.8h,  v6.h[3]            // right
        sub             v6.8h,  v6.8h,  v4.8h      // top-bottom
        uxtl            v7.8h,  v7.8b              // weights_hor
        add             v31.4h, v4.4h,  v5.4h      // bottom+right
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s, v31.4h, #8         // (bottom+right)*256
        ushll           v21.4s, v31.4h, #8
        ushll           v22.4s, v31.4h, #8
        ushll           v23.4s, v31.4h, #8
        zip1            v1.2d,  v1.2d,  v0.2d      // left, flipped
        zip1            v0.2d,  v3.2d,  v2.2d
        zip1            v16.2s, v16.2s, v17.2s     // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        sub             v0.8h,  v0.8h,  v5.8h      // left-right
        sub             v1.8h,  v1.8h,  v5.8h
        uxtl            v16.8h, v16.8b             // weights_ver
        uxtl            v18.8h, v18.8b
        smlal           v20.4s, v0.4h,  v7.4h      // += (left-right)*weights_hor
        smlal2          v21.4s, v0.8h,  v7.8h
        smlal           v22.4s, v1.4h,  v7.4h
        smlal2          v23.4s, v1.8h,  v7.8h
        smlal           v20.4s, v6.4h,  v16.4h     // += (top-bottom)*weights_ver
        smlal2          v21.4s, v6.8h,  v16.8h
        smlal           v22.4s, v6.4h,  v18.4h
        smlal2          v23.4s, v6.8h,  v18.8h
        rshrn           v20.4h, v20.4s, #9         // round and narrow to pixels
        rshrn           v21.4h, v21.4s, #9
        rshrn           v22.4h, v22.4s, #9
        rshrn           v23.4h, v23.4s, #9
        st1             {v20.4h},  [x0], x1
        st1             {v21.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.4h},  [x0], x1
        st1             {v23.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h},  [x8]             // top
        ld1             {v7.8b},  [x10]            // weights_hor
        sub             x2,  x2,  #8
        mov             x7,  #-8
        dup             v5.8h,  v6.h[7]            // right
        sub             v6.8h,  v6.8h,  v4.8h      // top-bottom
        uxtl            v7.8h,  v7.8b              // weights_hor
        add             v31.4h, v4.4h,  v5.4h      // bottom+right
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s, v31.4h, #8         // (bottom+right)*256
        ushll           v21.4s, v31.4h, #8
        ushll           v22.4s, v31.4h, #8
        ushll           v23.4s, v31.4h, #8
        ushll           v24.4s, v31.4h, #8
        ushll           v25.4s, v31.4h, #8
        ushll           v26.4s, v31.4h, #8
        ushll           v27.4s, v31.4h, #8
        sub             v0.8h,  v0.8h,  v5.8h      // left-right
        sub             v1.8h,  v1.8h,  v5.8h
        sub             v2.8h,  v2.8h,  v5.8h
        sub             v3.8h,  v3.8h,  v5.8h
        uxtl            v16.8h, v16.8b             // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        smlal           v20.4s, v3.4h,  v7.4h      // += (left-right)*weights_hor
        smlal2          v21.4s, v3.8h,  v7.8h      // (left flipped)
        smlal           v22.4s, v2.4h,  v7.4h
        smlal2          v23.4s, v2.8h,  v7.8h
        smlal           v24.4s, v1.4h,  v7.4h
        smlal2          v25.4s, v1.8h,  v7.8h
        smlal           v26.4s, v0.4h,  v7.4h
        smlal2          v27.4s, v0.8h,  v7.8h
        smlal           v20.4s, v6.4h,  v16.4h     // += (top-bottom)*weights_ver
        smlal2          v21.4s, v6.8h,  v16.8h
        smlal           v22.4s, v6.4h,  v17.4h
        smlal2          v23.4s, v6.8h,  v17.8h
        smlal           v24.4s, v6.4h,  v18.4h
        smlal2          v25.4s, v6.8h,  v18.8h
        smlal           v26.4s, v6.4h,  v19.4h
        smlal2          v27.4s, v6.8h,  v19.8h
        rshrn           v20.4h, v20.4s, #9
        rshrn2          v20.8h, v21.4s, #9
        rshrn           v21.4h, v22.4s, #9
        rshrn2          v21.8h, v23.4s, #9
        rshrn           v22.4h, v24.4s, #9
        rshrn2          v22.8h, v25.4s, #9
        rshrn           v23.4h, v26.4s, #9
        rshrn2          v23.8h, v27.4s, #9
        st1             {v20.8h},  [x0], x1
        st1             {v21.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h},  [x0], x1
        st1             {v23.8h},  [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3,  uxtw #1
        sub             x1,  x1,  w3,  uxtw #1
        ld1r            {v5.8h},  [x12]            // right
        sub             x2,  x2,  #4
        mov             x7,  #-4
        mov             w9,  w3
        add             v31.4h, v4.4h,  v5.4h      // bottom+right

1:
        // Two rows at a time; the inner loop walks 16 columns per pass.
        ld2r            {v0.8h, v1.8h},  [x2], x7  // left
        ld2r            {v16.8b, v17.8b},  [x11], #2 // weights_ver
        sub             v0.8h,  v0.8h,  v5.8h      // left-right
        sub             v1.8h,  v1.8h,  v5.8h
        uxtl            v16.8h, v16.8b             // weights_ver
        uxtl            v17.8h, v17.8b
2:
        ld1             {v7.16b},  [x10], #16      // weights_hor
        ld1             {v2.8h, v3.8h},  [x8], #32 // top
        ushll           v20.4s, v31.4h, #8         // (bottom+right)*256
        ushll           v21.4s, v31.4h, #8
        ushll           v22.4s, v31.4h, #8
        ushll           v23.4s, v31.4h, #8
        ushll           v24.4s, v31.4h, #8
        ushll           v25.4s, v31.4h, #8
        ushll           v26.4s, v31.4h, #8
        ushll           v27.4s, v31.4h, #8
        uxtl            v6.8h,  v7.8b              // weights_hor
        uxtl2           v7.8h,  v7.16b
        sub             v2.8h,  v2.8h,  v4.8h      // top-bottom
        sub             v3.8h,  v3.8h,  v4.8h
        smlal           v20.4s, v1.4h,  v6.4h      // += (left-right)*weights_hor
        smlal2          v21.4s, v1.8h,  v6.8h      // (left flipped)
        smlal           v22.4s, v1.4h,  v7.4h
        smlal2          v23.4s, v1.8h,  v7.8h
        smlal           v24.4s, v0.4h,  v6.4h
        smlal2          v25.4s, v0.8h,  v6.8h
        smlal           v26.4s, v0.4h,  v7.4h
        smlal2          v27.4s, v0.8h,  v7.8h
        smlal           v20.4s, v2.4h,  v16.4h     // += (top-bottom)*weights_ver
        smlal2          v21.4s, v2.8h,  v16.8h
        smlal           v22.4s, v3.4h,  v16.4h
        smlal2          v23.4s, v3.8h,  v16.8h
        smlal           v24.4s, v2.4h,  v17.4h
        smlal2          v25.4s, v2.8h,  v17.8h
        smlal           v26.4s, v3.4h,  v17.4h
        smlal2          v27.4s, v3.8h,  v17.8h
        rshrn           v20.4h, v20.4s, #9
        rshrn2          v20.8h, v21.4s, #9
        rshrn           v21.4h, v22.4s, #9
        rshrn2          v21.8h, v23.4s, #9
        rshrn           v22.4h, v24.4s, #9
        rshrn2          v22.8h, v25.4s, #9
        rshrn           v23.4h, v26.4s, #9
        rshrn2          v23.8h, v27.4s, #9
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h},  [x0], #32
        st1             {v22.8h, v23.8h},  [x6], #32
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9,  uxtw #1     // rewind top pointer
        sub             x10, x10, w9,  uxtw       // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret
endfunc

jumptable ipred_smooth_tbl
        .word 640b - ipred_smooth_tbl
        .word 320b - ipred_smooth_tbl
        .word 160b - ipred_smooth_tbl
        .word  80b - ipred_smooth_tbl
        .word  40b - ipred_smooth_tbl
endjumptable

// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
// Vertical-only smooth prediction:
// pred = bottom + (((top-bottom)*w_ver + 128) >> 8).
// The >>8 with rounding is done via sqrdmulh against weights pre-shifted
// left by 7: sqrdmulh(a, w<<7) == (a*w + 128) >> 8 for these ranges.
function ipred_smooth_v_16bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4,  uxtw        // vertical weights, indexed by height
        clz             w9,  w3
        movrel          x5,  ipred_smooth_v_tbl
        sub             x8,  x2,  w4,  uxtw #1
        sub             w9,  w9,  #25
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.8h},  [x8]             // bottom
        add             x2,  x2,  #2
        add             x5,  x5,  x9
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d},  [x2]             // top
        sub             v6.8h,  v6.8h,  v4.8h      // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        zip1            v16.2s, v16.2s, v17.2s     // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        ushll           v16.8h, v16.8b, #7         // weights_ver << 7
        ushll           v18.8h, v18.8b, #7
        sqrdmulh        v20.8h, v6.8h,  v16.8h     // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h, v6.8h,  v18.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v4.8h
        st1             {v20.d}[0],  [x0], x1
        st1             {v20.d}[1],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0],  [x0], x1
        st1             {v21.d}[1],  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h},  [x2]             // top
        sub             v6.8h,  v6.8h,  v4.8h      // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h, v16.8b, #7         // weights_ver << 7
        ushll           v17.8h, v17.8b, #7
        ushll           v18.8h, v18.8b, #7
        ushll           v19.8h, v19.8b, #7
        sqrdmulh        v20.8h, v6.8h,  v16.8h     // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h, v6.8h,  v17.8h
        sqrdmulh        v22.8h, v6.8h,  v18.8h
        sqrdmulh        v23.8h, v6.8h,  v19.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v4.8h
        add             v22.8h, v22.8h, v4.8h
        add             v23.8h, v23.8h, v4.8h
        st1             {v20.8h},  [x0], x1
        st1             {v21.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h},  [x0], x1
        st1             {v23.8h},  [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw #1
        mov             w9,  w3

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h, v16.8b, #7         // weights_ver << 7
        ushll           v17.8h, v17.8b, #7
        ushll           v18.8h, v18.8b, #7
        ushll           v19.8h, v19.8b, #7
2:
        ld1             {v2.8h, v3.8h},  [x2], #32 // top
        sub             v2.8h,  v2.8h,  v4.8h      // top-bottom
        sub             v3.8h,  v3.8h,  v4.8h
        sqrdmulh        v20.8h, v2.8h,  v16.8h     // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h, v3.8h,  v16.8h
        sqrdmulh        v22.8h, v2.8h,  v17.8h
        sqrdmulh        v23.8h, v3.8h,  v17.8h
        sqrdmulh        v24.8h, v2.8h,  v18.8h
        sqrdmulh        v25.8h, v3.8h,  v18.8h
        sqrdmulh        v26.8h, v2.8h,  v19.8h
        sqrdmulh        v27.8h, v3.8h,  v19.8h
        add             v20.8h, v20.8h, v4.8h
        add             v21.8h, v21.8h, v4.8h
        add             v22.8h, v22.8h, v4.8h
        add             v23.8h, v23.8h, v4.8h
        add             v24.8h, v24.8h, v4.8h
        add             v25.8h, v25.8h, v4.8h
        add             v26.8h, v26.8h, v4.8h
        add             v27.8h, v27.8h, v4.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h},  [x0], #32
        st1             {v22.8h, v23.8h},  [x6], #32
        st1             {v24.8h, v25.8h},  [x5], #32
        st1             {v26.8h, v27.8h},  [x8], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9,  uxtw #1     // rewind top pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9
        b               1b
9:
        ret
endfunc

jumptable ipred_smooth_v_tbl
        .word 640b - ipred_smooth_v_tbl
        .word 320b - ipred_smooth_v_tbl
        .word 160b - ipred_smooth_v_tbl
        .word  80b - ipred_smooth_v_tbl
        .word  40b - ipred_smooth_v_tbl
endjumptable

// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height, const int a,
//                                const int max_width, const int max_height);
// Horizontal-only smooth prediction:
// pred = right + (((left-right)*w_hor + 128) >> 8), same sqrdmulh trick
// as ipred_smooth_v.
function ipred_smooth_h_16bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3,  uxtw        // horizontal weights, indexed by width
        clz             w9,  w3
        movrel          x5,  ipred_smooth_h_tbl
        add             x12, x2,  w3,  uxtw #1
        sub             w9,  w9,  #25
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v5.8h},  [x12]            // right
        add             x5,  x5,  x9
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s},  [x8]             // weights_hor
        sub             x2,  x2,  #8
        mov             x7,  #-8
        ushll           v7.8h,  v7.8b,  #7         // weights_hor << 7
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        zip1            v1.2d,  v1.2d,  v0.2d      // left, flipped
        zip1            v0.2d,  v3.2d,  v2.2d
        sub             v0.8h,  v0.8h,  v5.8h      // left-right
        sub             v1.8h,  v1.8h,  v5.8h
        sqrdmulh        v20.8h, v0.8h,  v7.8h
// ((left-right)*weights_hor + 128) >> 8 1324 sqrdmulh v21.8h, v1.8h, v7.8h 1325 add v20.8h, v20.8h, v5.8h 1326 add v21.8h, v21.8h, v5.8h 1327 st1 {v20.d}[0], [x0], x1 1328 st1 {v20.d}[1], [x6], x1 1329 subs w4, w4, #4 1330 st1 {v21.d}[0], [x0], x1 1331 st1 {v21.d}[1], [x6], x1 1332 b.gt 4b 1333 ret 133480: 1335 AARCH64_VALID_JUMP_TARGET 1336 ld1 {v7.8b}, [x8] // weights_hor 1337 sub x2, x2, #8 1338 mov x7, #-8 1339 ushll v7.8h, v7.8b, #7 // weights_hor << 7 13408: 1341 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1342 sub v3.8h, v3.8h, v5.8h // left-right 1343 sub v2.8h, v2.8h, v5.8h 1344 sub v1.8h, v1.8h, v5.8h 1345 sub v0.8h, v0.8h, v5.8h 1346 sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 1347 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) 1348 sqrdmulh v22.8h, v1.8h, v7.8h 1349 sqrdmulh v23.8h, v0.8h, v7.8h 1350 add v20.8h, v20.8h, v5.8h 1351 add v21.8h, v21.8h, v5.8h 1352 add v22.8h, v22.8h, v5.8h 1353 add v23.8h, v23.8h, v5.8h 1354 st1 {v20.8h}, [x0], x1 1355 st1 {v21.8h}, [x6], x1 1356 subs w4, w4, #4 1357 st1 {v22.8h}, [x0], x1 1358 st1 {v23.8h}, [x6], x1 1359 b.gt 8b 1360 ret 1361160: 1362320: 1363640: 1364 AARCH64_VALID_JUMP_TARGET 1365 sub x2, x2, #8 1366 mov x7, #-8 1367 // Set up pointers for four rows in parallel; x0, x6, x5, x10 1368 add x5, x0, x1 1369 add x10, x6, x1 1370 lsl x1, x1, #1 1371 sub x1, x1, w3, uxtw #1 1372 mov w9, w3 1373 13741: 1375 ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left 1376 sub v0.8h, v0.8h, v5.8h // left-right 1377 sub v1.8h, v1.8h, v5.8h 1378 sub v2.8h, v2.8h, v5.8h 1379 sub v3.8h, v3.8h, v5.8h 13802: 1381 ld1 {v7.16b}, [x8], #16 // weights_hor 1382 ushll v6.8h, v7.8b, #7 // weights_hor << 7 1383 ushll2 v7.8h, v7.16b, #7 1384 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 1385 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) 1386 sqrdmulh v22.8h, v2.8h, v6.8h 1387 sqrdmulh v23.8h, v2.8h, v7.8h 1388 sqrdmulh v24.8h, v1.8h, v6.8h 1389 sqrdmulh v25.8h, v1.8h, v7.8h 
1390 sqrdmulh v26.8h, v0.8h, v6.8h 1391 sqrdmulh v27.8h, v0.8h, v7.8h 1392 add v20.8h, v20.8h, v5.8h 1393 add v21.8h, v21.8h, v5.8h 1394 add v22.8h, v22.8h, v5.8h 1395 add v23.8h, v23.8h, v5.8h 1396 add v24.8h, v24.8h, v5.8h 1397 add v25.8h, v25.8h, v5.8h 1398 add v26.8h, v26.8h, v5.8h 1399 add v27.8h, v27.8h, v5.8h 1400 subs w3, w3, #16 1401 st1 {v20.8h, v21.8h}, [x0], #32 1402 st1 {v22.8h, v23.8h}, [x6], #32 1403 st1 {v24.8h, v25.8h}, [x5], #32 1404 st1 {v26.8h, v27.8h}, [x10], #32 1405 b.gt 2b 1406 subs w4, w4, #4 1407 b.le 9f 1408 sub x8, x8, w9, uxtw 1409 add x0, x0, x1 1410 add x6, x6, x1 1411 add x5, x5, x1 1412 add x10, x10, x1 1413 mov w3, w9 1414 b 1b 14159: 1416 ret 1417endfunc 1418 1419jumptable ipred_smooth_h_tbl 1420 .word 640b - ipred_smooth_h_tbl 1421 .word 320b - ipred_smooth_h_tbl 1422 .word 160b - ipred_smooth_h_tbl 1423 .word 80b - ipred_smooth_h_tbl 1424 .word 40b - ipred_smooth_h_tbl 1425endjumptable 1426 1427const padding_mask_buf 1428 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1429 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1430 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1431 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1432 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1433 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1434padding_mask: 1435 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1436 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1437 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1438 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1439 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1440 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 1441endconst 1442 1443// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, 1444// const pixel *const in, const int end, 1445// const int bitdepth_max); 1446function ipred_z1_upsample_edge_16bpc_neon, export=1 1447 dup v30.8h, w4 // bitdepth_max 1448 movrel x4, padding_mask 1449 ld1 {v0.8h, v1.8h}, 
[x2] // in[] 1450 add x5, x2, w3, uxtw #1 // in[end] 1451 sub x4, x4, w3, uxtw #1 1452 1453 ld1r {v2.8h}, [x5] // padding 1454 ld1 {v3.8h, v4.8h}, [x4] // padding_mask 1455 1456 movi v31.8h, #9 1457 1458 bit v0.16b, v2.16b, v3.16b // padded in[] 1459 bit v1.16b, v2.16b, v4.16b 1460 1461 ext v4.16b, v0.16b, v1.16b, #2 1462 ext v5.16b, v1.16b, v2.16b, #2 1463 ext v6.16b, v0.16b, v1.16b, #4 1464 ext v7.16b, v1.16b, v2.16b, #4 1465 ext v16.16b, v0.16b, v1.16b, #6 1466 ext v17.16b, v1.16b, v2.16b, #6 1467 1468 add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] 1469 add v19.8h, v5.8h, v7.8h 1470 add v20.8h, v0.8h, v16.8h 1471 add v21.8h, v1.8h, v17.8h 1472 umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) 1473 umull2 v23.4s, v18.8h, v31.8h 1474 umull v24.4s, v19.4h, v31.4h 1475 umull2 v25.4s, v19.8h, v31.8h 1476 usubw v22.4s, v22.4s, v20.4h 1477 usubw2 v23.4s, v23.4s, v20.8h 1478 usubw v24.4s, v24.4s, v21.4h 1479 usubw2 v25.4s, v25.4s, v21.8h 1480 1481 sqrshrun v16.4h, v22.4s, #4 1482 sqrshrun2 v16.8h, v23.4s, #4 1483 sqrshrun v17.4h, v24.4s, #4 1484 sqrshrun2 v17.8h, v25.4s, #4 1485 1486 smin v16.8h, v16.8h, v30.8h 1487 smin v17.8h, v17.8h, v30.8h 1488 1489 zip1 v0.8h, v4.8h, v16.8h 1490 zip2 v1.8h, v4.8h, v16.8h 1491 zip1 v2.8h, v5.8h, v17.8h 1492 zip2 v3.8h, v5.8h, v17.8h 1493 1494 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] 1495 1496 ret 1497endfunc 1498 1499// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz, 1500// const pixel *const in, 1501// const int bitdepth_max); 1502function ipred_z2_upsample_edge_16bpc_neon, export=1 1503 dup v30.8h, w3 // bitdepth_max 1504 // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. 
1505 movrel x4, padding_mask 1506 ld1 {v0.8h, v1.8h}, [x2] // in[] 1507 add x5, x2, w1, uxtw #1 // in[sz] 1508 sub x4, x4, w1, uxtw #1 1509 1510 ld1r {v3.8h}, [x2] // in[0] for padding 1511 ld1r {v2.8h}, [x5] // padding 1512 ld1 {v4.8h, v5.8h}, [x4] // padding_mask 1513 1514 movi v31.8h, #9 1515 1516 bit v0.16b, v2.16b, v4.16b // padded in[] 1517 bit v1.16b, v2.16b, v5.16b 1518 1519 ext v4.16b, v3.16b, v0.16b, #14 1520 ext v5.16b, v0.16b, v1.16b, #2 1521 ext v6.16b, v0.16b, v1.16b, #4 1522 1523 add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1] 1524 add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2] 1525 umull v18.4s, v16.4h, v31.4h // 9*(in[i+1] + in[i+2]) 1526 umull2 v19.4s, v16.8h, v31.8h 1527 usubw v18.4s, v18.4s, v17.4h 1528 usubw2 v19.4s, v19.4s, v17.8h 1529 1530 sqrshrun v16.4h, v18.4s, #4 1531 sqrshrun2 v16.8h, v19.4s, #4 1532 1533 add x5, x0, #2*16 1534 1535 smin v16.8h, v16.8h, v30.8h 1536 1537 zip1 v4.8h, v0.8h, v16.8h 1538 zip2 v5.8h, v0.8h, v16.8h 1539 1540 st1 {v2.h}[0], [x5] 1541 // In case sz=8, output one single pixel in out[16]. 1542 st1 {v4.8h, v5.8h}, [x0] 1543 1544 ret 1545endfunc 1546 1547const edge_filter 1548 .short 0, 4, 8, 0 1549 .short 0, 5, 6, 0 1550// Leaving out the coeffs for strength=3 1551// .byte 2, 4, 4, 0 1552endconst 1553 1554// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, 1555// const pixel *const in, const int end, 1556// const int strength); 1557function ipred_z1_filter_edge_16bpc_neon, export=1 1558 cmp w4, #3 1559 b.eq L(fivetap) // if (strength == 3) goto fivetap 1560 1561 movrel x5, edge_filter, -6 1562 add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) 1563 1564 ld1 {v31.s}[0], [x5] // kernel[1-2] 1565 1566 ld1 {v0.8h}, [x2], #16 1567 1568 dup v30.8h, v31.h[0] 1569 dup v31.8h, v31.h[1] 15701: 1571 // in[end], is the last valid pixel. We produce 16 pixels out by 1572 // using 18 pixels in - the last pixel used is [17] of the ones 1573 // read/buffered. 
1574 cmp w3, #17 1575 ld1 {v1.8h, v2.8h}, [x2], #32 1576 b.lt 2f 1577 ext v3.16b, v0.16b, v1.16b, #2 1578 ext v4.16b, v1.16b, v2.16b, #2 1579 ext v5.16b, v0.16b, v1.16b, #4 1580 ext v6.16b, v1.16b, v2.16b, #4 1581 mul v16.8h, v0.8h, v30.8h 1582 mla v16.8h, v3.8h, v31.8h 1583 mla v16.8h, v5.8h, v30.8h 1584 mul v17.8h, v1.8h, v30.8h 1585 mla v17.8h, v4.8h, v31.8h 1586 mla v17.8h, v6.8h, v30.8h 1587 subs w1, w1, #16 1588 mov v0.16b, v2.16b 1589 urshr v16.8h, v16.8h, #4 1590 urshr v17.8h, v17.8h, #4 1591 sub w3, w3, #16 1592 st1 {v16.8h, v17.8h}, [x0], #32 1593 b.gt 1b 1594 ret 15952: 1596 // Right padding 1597 1598 // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) 1599 movrel x5, padding_mask 1600 sub w6, w3, #24 1601 sub x5, x5, w3, uxtw #1 1602 add x6, x2, w6, sxtw #1 1603 1604 ld1 {v3.8h, v4.8h}, [x5] // padding_mask 1605 1606 ld1r {v2.8h}, [x6] 1607 bit v0.16b, v2.16b, v3.16b // Pad v0-v1 1608 bit v1.16b, v2.16b, v4.16b 1609 1610 // Filter one block 1611 ext v3.16b, v0.16b, v1.16b, #2 1612 ext v4.16b, v1.16b, v2.16b, #2 1613 ext v5.16b, v0.16b, v1.16b, #4 1614 ext v6.16b, v1.16b, v2.16b, #4 1615 mul v16.8h, v0.8h, v30.8h 1616 mla v16.8h, v3.8h, v31.8h 1617 mla v16.8h, v5.8h, v30.8h 1618 mul v17.8h, v1.8h, v30.8h 1619 mla v17.8h, v4.8h, v31.8h 1620 mla v17.8h, v6.8h, v30.8h 1621 subs w1, w1, #16 1622 urshr v16.8h, v16.8h, #4 1623 urshr v17.8h, v17.8h, #4 1624 st1 {v16.8h, v17.8h}, [x0], #32 1625 b.le 9f 16265: 1627 // After one block, any remaining output would only be filtering 1628 // padding - thus just store the padding. 1629 subs w1, w1, #16 1630 st1 {v2.16b}, [x0], #16 1631 b.gt 5b 16329: 1633 ret 1634 1635L(fivetap): 1636 sub x2, x2, #2 // topleft -= 1 pixel 1637 movi v29.8h, #2 1638 ld1 {v0.8h}, [x2], #16 1639 movi v30.8h, #4 1640 movi v31.8h, #4 1641 ins v0.h[0], v0.h[1] 16421: 1643 // in[end+1], is the last valid pixel. We produce 16 pixels out by 1644 // using 20 pixels in - the last pixel used is [19] of the ones 1645 // read/buffered. 
        cmp             w3,  #18
        ld1             {v1.8h, v2.8h}, [x2],  #32
        b.lt            2f                       // if (end + 1 < 19)
        ext             v3.16b,  v0.16b,  v1.16b, #2
        ext             v4.16b,  v1.16b,  v2.16b, #2
        ext             v5.16b,  v0.16b,  v1.16b, #4
        ext             v6.16b,  v1.16b,  v2.16b, #4
        ext             v16.16b, v0.16b,  v1.16b, #6
        ext             v17.16b, v1.16b,  v2.16b, #6
        ext             v18.16b, v0.16b,  v1.16b, #8
        ext             v19.16b, v1.16b,  v2.16b, #8
        mul             v20.8h,  v0.8h,   v29.8h // 2*in[i]
        mla             v20.8h,  v3.8h,   v30.8h // + 4*in[i+1]
        mla             v20.8h,  v5.8h,   v31.8h // + 4*in[i+2]
        mla             v20.8h,  v16.8h,  v30.8h // + 4*in[i+3]
        mla             v20.8h,  v18.8h,  v29.8h // + 2*in[i+4]
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask, -2
        sub             w6,  w3,  #23
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask

        ld1r            {v28.8h}, [x6]
        bit             v0.16b,  v28.16b, v3.16b // Pad v0-v2
        bit             v1.16b,  v28.16b, v4.16b
        bit             v2.16b,  v28.16b, v5.16b
4:
        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b, #2
        ext             v4.16b,  v1.16b,  v2.16b, #2
        ext             v5.16b,  v0.16b,  v1.16b, #4
        ext             v6.16b,  v1.16b,  v2.16b, #4
        ext             v16.16b, v0.16b,  v1.16b, #6
        ext             v17.16b, v1.16b,  v2.16b, #6
        ext             v18.16b, v0.16b,  v1.16b, #8
        ext             v19.16b, v1.16b,  v2.16b, #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v28.16b
        mov             v2.16b,  v28.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #8
        st1             {v28.8h}, [x0],  #16
        b.gt            5b
9:
        ret
endfunc

// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
//                                 const int n);
// Fills n pixels with the value px; n is assumed to be a multiple of 8.
function ipred_pixel_set_16bpc_neon, export=1
        dup             v0.8h,   w1
1:
        subs            w2,  w2,  #8
        st1             {v0.8h},  [x0],  #16
        b.gt            1b
        ret
endfunc

// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const int width, const int height,
//                                const int dx, const int max_base_x);
// Z1 (top-only) directional prediction without edge upsampling; each row
// interpolates between top[base] and top[base+1] with a 6-bit fraction,
// switching to plain padding once base >= max_base_x.
function ipred_z1_fill1_16bpc_neon, export=1
        clz             w9,  w3
        movrel          x8,  ipred_z1_fill1_tbl
        sub             w9,  w9,  #25
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6,  uxtw #1   // top[max_base_x]
        add             x8,  x8,  x9
        ld1r            {v31.8h}, [x10]          // padding
        mov             w7,  w5                  // xpos = dx (first row)
        mov             w15, #64
        br              x8
40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6             // base
        and             w9,  w7,  #0x3e          // frac
        add             w7,  w7,  w5             // xpos += dx
        cmp             w8,  w6                  // base >= max_base_x
        lsr             w10, w7,  #6             // base
        and             w11, w7,  #0x3e          // frac
        b.ge            49f
        lsl             w8,  w8,  #1
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]      // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9              // frac
        dup             v5.4h,   w11
        ext             v1.16b,  v0.16b,  v0.16b, #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b, #2
        sub             v6.4h,   v1.4h,   v0.4h  // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6     // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h  // + (top[base+1]-top[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5             // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:     // Entirely past the edge; just store padding.
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6             // base
        and             w9,  w7,  #0x3e          // frac
        add             w7,  w7,  w5             // xpos += dx
        cmp             w8,  w6                  // base >= max_base_x
        lsr             w10, w7,  #6             // base
        and             w11, w7,  #0x3e          // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9              // frac
        dup             v5.8h,   w11
        ld1             {v0.8h},  [x8]           // top[base]
        ld1             {v2.8h},  [x10]
        sub             w9,  w15, w9             // 64 - frac
        sub             w11, w15, w11
        ldr             h1,  [x8,  #16]
        ldr             h3,  [x10, #16]
        dup             v6.8h,   w9              // 64 - frac
        dup             v7.8h,   w11
        ext             v1.16b,  v0.16b,  v1.16b, #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v3.16b, #2
        umull           v16.4s,  v0.4h,   v6.4h  // top[base]*(64-frac)
        umlal           v16.4s,  v1.4h,   v4.4h  // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v1.8h,   v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v3.4h,   v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v3.8h,   v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5             // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:     // Entirely past the edge; just store padding.
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                  // remember the original width

        add             x13, x0,  x1             // process two rows in parallel
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw #1
1:
        lsr             w8,  w7,  #6             // base
        and             w9,  w7,  #0x3e          // frac
        add             w7,  w7,  w5             // xpos += dx
        cmp             w8,  w6                  // base >= max_base_x
lsr w10, w7, #6 // base 1865 and w11, w7, #0x3e // frac 1866 b.ge 169f 1867 add x8, x2, w8, uxtw #1 1868 add x10, x2, w10, uxtw #1 1869 dup v6.8h, w9 // frac 1870 dup v7.8h, w11 1871 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] 1872 ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 1873 sub w9, w15, w9 // 64 - frac 1874 sub w11, w15, w11 1875 dup v16.8h, w9 // 64 - frac 1876 dup v17.8h, w11 1877 add w7, w7, w5 // xpos += dx 18782: 1879 ext v18.16b, v0.16b, v1.16b, #2 // top[base+1] 1880 ext v19.16b, v1.16b, v2.16b, #2 1881 ext v20.16b, v3.16b, v4.16b, #2 1882 ext v21.16b, v4.16b, v5.16b, #2 1883 subs w3, w3, #16 1884 umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac) 1885 umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac 1886 umull2 v23.4s, v0.8h, v16.8h 1887 umlal2 v23.4s, v18.8h, v6.8h 1888 umull v24.4s, v1.4h, v16.4h 1889 umlal v24.4s, v19.4h, v6.4h 1890 umull2 v25.4s, v1.8h, v16.8h 1891 umlal2 v25.4s, v19.8h, v6.8h 1892 umull v26.4s, v3.4h, v17.4h 1893 umlal v26.4s, v20.4h, v7.4h 1894 umull2 v27.4s, v3.8h, v17.8h 1895 umlal2 v27.4s, v20.8h, v7.8h 1896 umull v28.4s, v4.4h, v17.4h 1897 umlal v28.4s, v21.4h, v7.4h 1898 umull2 v29.4s, v4.8h, v17.8h 1899 umlal2 v29.4s, v21.8h, v7.8h 1900 rshrn v22.4h, v22.4s, #6 1901 rshrn2 v22.8h, v23.4s, #6 1902 rshrn v23.4h, v24.4s, #6 1903 rshrn2 v23.8h, v25.4s, #6 1904 rshrn v24.4h, v26.4s, #6 1905 rshrn2 v24.8h, v27.4s, #6 1906 rshrn v25.4h, v28.4s, #6 1907 rshrn2 v25.8h, v29.4s, #6 1908 st1 {v22.8h, v23.8h}, [x0], #32 1909 st1 {v24.8h, v25.8h}, [x13], #32 1910 b.le 3f 1911 mov v0.16b, v2.16b 1912 ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] 1913 mov v3.16b, v5.16b 1914 ld1 {v4.8h, v5.8h}, [x10], #32 1915 b 2b 1916 19173: 1918 subs w4, w4, #2 1919 b.le 9f 1920 add x0, x0, x1 1921 add x13, x13, x1 1922 mov w3, w12 1923 b 1b 19249: 1925 ret 1926 1927169: 1928 st1 {v31.8h}, [x0], #16 1929 subs w3, w3, #8 1930 st1 {v31.8h}, [x13], #16 1931 b.gt 169b 1932 subs w4, w4, #2 1933 b.le 9b 1934 add x0, x0, x1 1935 add x13, x13, x1 1936 mov w3, 
w12 1937 b 169b 1938endfunc 1939 1940jumptable ipred_z1_fill1_tbl 1941 .word 640b - ipred_z1_fill1_tbl 1942 .word 320b - ipred_z1_fill1_tbl 1943 .word 160b - ipred_z1_fill1_tbl 1944 .word 80b - ipred_z1_fill1_tbl 1945 .word 40b - ipred_z1_fill1_tbl 1946endjumptable 1947 1948function ipred_z1_fill2_16bpc_neon, export=1 1949 cmp w3, #8 1950 add x10, x2, w6, uxtw // top[max_base_x] 1951 ld1r {v31.16b}, [x10] // padding 1952 mov w7, w5 1953 mov w15, #64 1954 b.eq 8f 1955 19564: // w == 4 1957 lsr w8, w7, #6 // base 1958 and w9, w7, #0x3e // frac 1959 add w7, w7, w5 // xpos += dx 1960 cmp w8, w6 // base >= max_base_x 1961 lsr w10, w7, #6 // base 1962 and w11, w7, #0x3e // frac 1963 b.ge 49f 1964 lsl w8, w8, #1 1965 lsl w10, w10, #1 1966 ldr q0, [x2, w8, uxtw] // top[base] 1967 ldr q2, [x2, w10, uxtw] 1968 dup v4.4h, w9 // frac 1969 dup v5.4h, w11 1970 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] 1971 uzp1 v0.8h, v0.8h, v0.8h // top[base] 1972 uzp2 v3.8h, v2.8h, v2.8h 1973 uzp1 v2.8h, v2.8h, v2.8h 1974 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] 1975 sub v7.4h, v3.4h, v2.4h 1976 ushll v16.4s, v0.4h, #6 // top[base]*64 1977 ushll v17.4s, v2.4h, #6 1978 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac 1979 smlal v17.4s, v7.4h, v5.4h 1980 rshrn v16.4h, v16.4s, #6 1981 rshrn v17.4h, v17.4s, #6 1982 st1 {v16.4h}, [x0], x1 1983 add w7, w7, w5 // xpos += dx 1984 subs w4, w4, #2 1985 st1 {v17.4h}, [x0], x1 1986 b.gt 4b 1987 ret 1988 198949: 1990 st1 {v31.4h}, [x0], x1 1991 subs w4, w4, #2 1992 st1 {v31.4h}, [x0], x1 1993 b.gt 49b 1994 ret 1995 19968: // w == 8 1997 lsr w8, w7, #6 // base 1998 and w9, w7, #0x3e // frac 1999 add w7, w7, w5 // xpos += dx 2000 cmp w8, w6 // base >= max_base_x 2001 lsr w10, w7, #6 // base 2002 and w11, w7, #0x3e // frac 2003 b.ge 89f 2004 add x8, x2, w8, uxtw #1 2005 add x10, x2, w10, uxtw #1 2006 dup v4.8h, w9 // frac 2007 dup v5.8h, w11 2008 ld1 {v0.8h, v1.8h}, [x8] // top[base] 2009 ld1 {v2.8h, v3.8h}, [x10] 2010 sub w9, w15, w9 // 64 - frac 2011 
sub w11, w15, w11 2012 dup v6.8h, w9 // 64 - frac 2013 dup v7.8h, w11 2014 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] 2015 uzp1 v0.8h, v0.8h, v1.8h // top[base] 2016 uzp2 v21.8h, v2.8h, v3.8h 2017 uzp1 v2.8h, v2.8h, v3.8h 2018 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) 2019 umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac 2020 umull2 v17.4s, v0.8h, v6.8h 2021 umlal2 v17.4s, v20.8h, v4.8h 2022 umull v18.4s, v2.4h, v7.4h 2023 umlal v18.4s, v21.4h, v5.4h 2024 umull2 v19.4s, v2.8h, v7.8h 2025 umlal2 v19.4s, v21.8h, v5.8h 2026 rshrn v16.4h, v16.4s, #6 2027 rshrn2 v16.8h, v17.4s, #6 2028 rshrn v17.4h, v18.4s, #6 2029 rshrn2 v17.8h, v19.4s, #6 2030 st1 {v16.8h}, [x0], x1 2031 add w7, w7, w5 // xpos += dx 2032 subs w4, w4, #2 2033 st1 {v17.8h}, [x0], x1 2034 b.gt 8b 2035 ret 2036 203789: 2038 st1 {v31.8h}, [x0], x1 2039 subs w4, w4, #2 2040 st1 {v31.8h}, [x0], x1 2041 b.gt 89b 2042 ret 2043endfunc 2044 2045// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, 2046// const int n); 2047function ipred_reverse_16bpc_neon, export=1 2048 sub x1, x1, #16 2049 add x3, x0, #8 2050 mov x4, #16 20511: 2052 ld1 {v0.8h}, [x1] 2053 subs w2, w2, #8 2054 rev64 v0.8h, v0.8h 2055 sub x1, x1, #16 2056 st1 {v0.d}[1], [x0], x4 2057 st1 {v0.d}[0], [x3], x4 2058 b.gt 1b 2059 ret 2060endfunc 2061 2062const increments 2063 .short 0, 1, 2, 3, 4, 5, 6, 7 2064endconst 2065 2066// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, 2067// const pixel *const top, 2068// const pixel *const left, 2069// const int width, const int height, 2070// const int dx, const int dy); 2071function ipred_z2_fill1_16bpc_neon, export=1 2072 clz w10, w4 2073 movrel x9, ipred_z2_fill1_tbl 2074 sub w10, w10, #25 2075 ldrsw x10, [x9, w10, uxtw #2] 2076 mov w8, #(1 << 6) // xpos = 1 << 6 2077 add x9, x9, x10 2078 sub w8, w8, w6 // xpos -= dx 2079 2080 movrel x11, increments 2081 ld1 {v31.8h}, [x11] // increments 2082 neg w7, w7 // -dy 2083 2084 br x9 208540: 2086 AARCH64_VALID_JUMP_TARGET 
2087 2088 dup v30.4h, w7 // -dy 2089 movi v17.8b, #1 2090 2091 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy 2092 movi v25.8h, #0x3e 2093 add v30.4h, v16.4h, v30.4h // -= dy 2094 2095 // Worst case height for w=4 is 16, but we need at least h+1 elements 2096 ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] 2097 2098 movi v26.8h, #64 2099 movi v19.16b, #4 2100 2101 shrn v29.8b, v30.8h, #6 // ypos >> 6 2102 and v27.8b, v30.8b, v25.8b // frac_y 2103 2104 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 2105 2106 movi v23.4h, #1, lsl #8 2107 shl v29.8b, v29.8b, #1 // 2*base_y 2108 zip1 v29.8b, v29.8b, v29.8b // duplicate elements 2109 movi v17.8b, #2 2110 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... 2111 2112 add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) 2113 add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) 2114 2115 tbl v18.8b, {v0.16b}, v29.8b // left[base_y] 2116 2117 trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 2118 2119 sub v28.4h, v26.4h, v27.4h // 64 - frac_y 2120 2121 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} 2122 2123 trn1 v27.2d, v27.2d, v27.2d // frac_y 2124 trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y 2125 2126 movi v29.16b, #4 21274: 2128 asr w9, w8, #6 // base_x 2129 dup v16.4h, w8 // xpos 2130 sub w8, w8, w6 // xpos -= dx 2131 cmp w9, #-4 // base_x <= -4 2132 asr w11, w8, #6 // base_x 2133 b.le 49f 2134 2135 lsl w9, w9, #1 2136 lsl w11, w11, #1 2137 2138 dup v17.4h, w8 // xpos 2139 2140 ldr q4, [x2, w9, sxtw] // top[base_x] 2141 ldr q6, [x2, w11, sxtw] 2142 2143 trn1 v16.2d, v16.2d, v17.2d // xpos 2144 2145 // Cut corners here; only doing tbl over v0-v1 here; we only 2146 // seem to need the last pixel, from v2, after skipping to the 2147 // left-only codepath below. 
2148 tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] 2149 2150 sshr v20.8h, v16.8h, #6 // first base_x for each row 2151 2152 ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] 2153 ext v7.16b, v6.16b, v6.16b, #2 2154 2155 and v16.16b, v16.16b, v25.16b // frac_x 2156 2157 trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] 2158 2159 trn1 v4.2d, v4.2d, v6.2d // top[base_x] 2160 trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] 2161 2162 sub v17.8h, v26.8h, v16.8h // 64 - frac_x 2163 2164 add v20.8h, v20.8h, v31.8h // actual base_x 2165 2166 umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2167 umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2168 umull2 v22.4s, v18.8h, v28.8h 2169 umlal2 v22.4s, v19.8h, v27.8h 2170 2171 umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) 2172 umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x 2173 umull2 v24.4s, v4.8h, v17.8h 2174 umlal2 v24.4s, v5.8h, v16.8h 2175 2176 cmge v20.8h, v20.8h, #0 2177 2178 rshrn v21.4h, v21.4s, #6 2179 rshrn2 v21.8h, v22.4s, #6 2180 rshrn v22.4h, v23.4s, #6 2181 rshrn2 v22.8h, v24.4s, #6 2182 2183 bit v21.16b, v22.16b, v20.16b 2184 2185 st1 {v21.d}[0], [x0], x1 2186 sub w8, w8, w6 // xpos -= dx 2187 subs w5, w5, #2 2188 st1 {v21.d}[1], [x0], x1 2189 b.le 9f 2190 2191 ext v18.16b, v19.16b, v19.16b, #8 2192 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) 2193 b 4b 2194 219549: 2196 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2] 2197 2198 trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] 2199 2200 umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2201 umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2202 umull2 v21.4s, v18.8h, v28.8h 2203 umlal2 v21.4s, v19.8h, v27.8h 2204 2205 rshrn v20.4h, v20.4s, #6 2206 rshrn2 v20.8h, v21.4s, #6 2207 2208 st1 {v20.d}[0], [x0], x1 2209 subs w5, w5, #2 2210 st1 {v20.d}[1], [x0], x1 2211 b.le 9f 2212 2213 ext v18.16b, v19.16b, v19.16b, #8 2214 add v30.16b, v30.16b, 
v29.16b // base_y += 2 (*2) 2215 b 49b 2216 22179: 2218 ret 2219 222080: 2221 AARCH64_VALID_JUMP_TARGET 2222 2223 stp d8, d9, [sp, #-0x40]! 2224 stp d10, d11, [sp, #0x10] 2225 stp d12, d13, [sp, #0x20] 2226 stp d14, d15, [sp, #0x30] 2227 2228 dup v18.8h, w7 // -dy 2229 add x3, x3, #2 // Skip past left[0] 2230 2231 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy 2232 movi v25.8h, #0x3e 2233 add v16.8h, v16.8h, v18.8h // -= dy 2234 2235 // Worst case height for w=8 is 32. 2236 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] 2237 ld1r {v15.8h}, [x2] // left[0] == top[0] 2238 2239 movi v26.8h, #64 2240 movi v19.16b, #4 2241 2242 shrn v29.8b, v16.8h, #6 // ypos >> 6 2243 and v27.16b, v16.16b, v25.16b // frac_y 2244 2245 movi v23.8h, #1, lsl #8 2246 shl v29.8b, v29.8b, #1 // 2*base_y 2247 mov v18.16b, v15.16b // left[0] 2248 zip1 v29.16b, v29.16b, v29.16b // duplicate elements 2249 movi v17.16b, #2 2250 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... 2251 2252 // Cut corners here; for the first row we don't expect to need to 2253 // read outside of v0. 
2254 tbx v18.16b, {v0.16b}, v29.16b // left[base_y] 2255 2256 add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) 2257 add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) 2258 2259 sub v28.8h, v26.8h, v27.8h // 64 - frac_y 2260 2261 movi v24.16b, #4 22628: 2263 asr w9, w8, #6 // base_x 2264 dup v16.8h, w8 // xpos 2265 sub w8, w8, w6 // xpos -= dx 2266 cmp w9, #-16 // base_x <= -16 2267 asr w11, w8, #6 // base_x 2268 b.le 89f 2269 2270 dup v17.8h, w8 // xpos 2271 2272 add x9, x2, w9, sxtw #1 2273 add x11, x2, w11, sxtw #1 2274 2275 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] 2276 mov v19.16b, v15.16b // left[0] 2277 ld1 {v6.8h, v7.8h}, [x11] 2278 2279 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2280 2281 mov v20.16b, v15.16b // left[0] 2282 2283 sshr v21.8h, v16.8h, #6 // first base_x 2284 sshr v22.8h, v17.8h, #6 2285 2286 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2287 2288 ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] 2289 ext v7.16b, v6.16b, v7.16b, #2 2290 2291 and v16.16b, v16.16b, v25.16b // frac_x 2292 and v17.16b, v17.16b, v25.16b 2293 2294 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2295 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2296 2297 sub v8.8h, v26.8h, v16.8h // 64 - frac_x 2298 sub v9.8h, v26.8h, v17.8h 2299 2300 umull2 v11.4s, v18.8h, v28.8h 2301 umlal2 v11.4s, v19.8h, v27.8h 2302 2303 add v21.8h, v21.8h, v31.8h // actual base_x 2304 add v22.8h, v22.8h, v31.8h 2305 2306 umull v12.4s, v19.4h, v28.4h 2307 umlal v12.4s, v20.4h, v27.4h 2308 umull2 v13.4s, v19.8h, v28.8h 2309 umlal2 v13.4s, v20.8h, v27.8h 2310 2311 rshrn v10.4h, v10.4s, #6 2312 rshrn2 v10.8h, v11.4s, #6 2313 rshrn v11.4h, v12.4s, #6 2314 rshrn2 v11.8h, v13.4s, #6 2315 2316 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) 2317 umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x 2318 umull2 v13.4s, v4.8h, v8.8h 2319 umlal2 v13.4s, v5.8h, v16.8h 2320 umull v14.4s, v6.4h, v9.4h 2321 umlal v14.4s, v7.4h, 
v17.4h 2322 umull2 v18.4s, v6.8h, v9.8h 2323 umlal2 v18.4s, v7.8h, v17.8h 2324 2325 cmge v21.8h, v21.8h, #0 2326 cmge v22.8h, v22.8h, #0 2327 2328 rshrn v12.4h, v12.4s, #6 2329 rshrn2 v12.8h, v13.4s, #6 2330 rshrn v13.4h, v14.4s, #6 2331 rshrn2 v13.8h, v18.4s, #6 2332 2333 bit v10.16b, v12.16b, v21.16b 2334 bit v11.16b, v13.16b, v22.16b 2335 2336 st1 {v10.8h}, [x0], x1 2337 subs w5, w5, #2 2338 sub w8, w8, w6 // xpos -= dx 2339 st1 {v11.8h}, [x0], x1 2340 b.le 9f 2341 2342 mov v18.16b, v20.16b 2343 add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) 2344 add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) 2345 b 8b 2346 234789: 2348 mov v19.16b, v15.16b 2349 mov v20.16b, v15.16b 2350 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2351 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] 2352 2353 umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2354 umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2355 umull2 v5.4s, v18.8h, v28.8h 2356 umlal2 v5.4s, v19.8h, v27.8h 2357 umull v6.4s, v19.4h, v28.4h 2358 umlal v6.4s, v20.4h, v27.4h 2359 umull2 v7.4s, v19.8h, v28.8h 2360 umlal2 v7.4s, v20.8h, v27.8h 2361 2362 rshrn v4.4h, v4.4s, #6 2363 rshrn2 v4.8h, v5.4s, #6 2364 rshrn v5.4h, v6.4s, #6 2365 rshrn2 v5.8h, v7.4s, #6 2366 2367 st1 {v4.8h}, [x0], x1 2368 subs w5, w5, #2 2369 st1 {v5.8h}, [x0], x1 2370 b.le 9f 2371 2372 mov v18.16b, v20.16b 2373 add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) 2374 add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) 2375 b 89b 2376 23779: 2378 ldp d14, d15, [sp, #0x30] 2379 ldp d12, d13, [sp, #0x20] 2380 ldp d10, d11, [sp, #0x10] 2381 ldp d8, d9, [sp], 0x40 2382 ret 2383 2384160: 2385320: 2386640: 2387 AARCH64_VALID_JUMP_TARGET 2388 2389 stp d8, d9, [sp, #-0x40]! 
2390 stp d10, d11, [sp, #0x10] 2391 stp d12, d13, [sp, #0x20] 2392 stp d14, d15, [sp, #0x30] 2393 2394 dup v25.8h, w7 // -dy 2395 add x3, x3, #2 // Skip past left[0] 2396 2397 add x13, x0, x1 // alternating row 2398 lsl x1, x1, #1 // stride *= 2 2399 sub x1, x1, w4, uxtw #1 // stride -= width 2400 2401 movi v11.8h, #8 2402 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy 2403 add v26.8h, v26.8h, v25.8h // -= dy 2404 mul v25.8h, v25.8h, v11.8h // -8*dy 2405 2406 // Worst case height is 64, but we can only fit 32 pixels into 2407 // v0-v3 usable within one tbx instruction. As long as base_y is 2408 // up to 32, we use tbx. 2409 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] 2410 ld1r {v15.8h}, [x2] // left[0] == top[0] 2411 2412 mov w12, w4 // orig w 2413 neg w14, w4 // -w 2414 24151: 2416 mov v23.16b, v26.16b // reset ypos 2417 2418 asr w9, w8, #6 // base_x 2419 dup v16.8h, w8 // xpos 2420 sub w8, w8, w6 // xpos -= dx 2421 cmp w9, w14 // base_x <= -2*w 2422 asr w11, w8, #6 // base_x 2423 b.le 169f 2424 2425 dup v17.8h, w8 // xpos 2426 sub w8, w8, w6 // xpos -= dx 2427 2428 add x9, x2, w9, sxtw #1 2429 add x11, x2, w11, sxtw #1 2430 2431 sshr v21.8h, v16.8h, #6 // first base_x 2432 sshr v22.8h, v17.8h, #6 2433 2434 ld1 {v4.8h}, [x9], #16 // top[base_x] 2435 ld1 {v6.8h}, [x11], #16 2436 2437 movi v10.8h, #0x3e 2438 movi v11.8h, #64 2439 2440 and v16.16b, v16.16b, v10.16b // frac_x 2441 and v17.16b, v17.16b, v10.16b 2442 2443 sub v8.8h, v11.8h, v16.8h // 64 - frac_x 2444 sub v9.8h, v11.8h, v17.8h 2445 2446 add v21.8h, v21.8h, v31.8h // actual base_x 2447 add v22.8h, v22.8h, v31.8h 2448 24492: 2450 smov w10, v22.h[0] 2451 2452 shrn v29.8b, v23.8h, #6 // ypos >> 6 2453 movi v12.8h, #64 2454 cmp w10, #0 // base_x (bottom left) >= 0 2455 smov w10, v29.b[0] // base_y[0] 2456 movi v10.8h, #0x3e 2457 2458 b.ge 4f 2459 and v27.16b, v23.16b, v10.16b // frac_y 2460 cmp w10, #(32-3) 2461 2462 mov v18.16b, v15.16b // left[0] 2463 sub v28.8h, v12.8h, v27.8h // 64 - frac_y 
2464 b.gt 22f 2465 246621: 2467 // base_y < 32, using tbx 2468 shl v29.8b, v29.8b, #1 // 2*base_y 2469 movi v11.8h, #1, lsl #8 2470 zip1 v29.16b, v29.16b, v29.16b // duplicate elements 2471 add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... 2472 2473 movi v13.16b, #2 2474 2475 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2476 2477 add v29.16b, v29.16b, v13.16b // base_y + 1 (*2) 2478 mov v19.16b, v15.16b // left[0] 2479 2480 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2481 2482 add v29.16b, v29.16b, v13.16b // base_y + 2 (*2) 2483 mov v20.16b, v15.16b // left[0] 2484 2485 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2486 2487 b 23f 2488 248922: 2490 // base_y >= 32, using separate loads. 2491 smov w15, v29.b[1] 2492 smov w16, v29.b[2] 2493 add x10, x3, w10, sxtw #1 2494 smov w17, v29.b[3] 2495 add x15, x3, w15, sxtw #1 2496 ld3 {v18.h, v19.h, v20.h}[0], [x10] 2497 smov w10, v29.b[4] 2498 add x16, x3, w16, sxtw #1 2499 ld3 {v18.h, v19.h, v20.h}[1], [x15] 2500 smov w15, v29.b[5] 2501 add x17, x3, w17, sxtw #1 2502 ld3 {v18.h, v19.h, v20.h}[2], [x16] 2503 smov w16, v29.b[6] 2504 add x10, x3, w10, sxtw #1 2505 ld3 {v18.h, v19.h, v20.h}[3], [x17] 2506 smov w17, v29.b[7] 2507 add x15, x3, w15, sxtw #1 2508 add x16, x3, w16, sxtw #1 2509 ld3 {v18.h, v19.h, v20.h}[4], [x10] 2510 add x17, x3, w17, sxtw #1 2511 ld3 {v18.h, v19.h, v20.h}[5], [x15] 2512 ld3 {v18.h, v19.h, v20.h}[6], [x16] 2513 ld3 {v18.h, v19.h, v20.h}[7], [x17] 2514 251523: 2516 2517 ld1 {v5.8h}, [x9], #16 // top[base_x] 2518 ld1 {v7.8h}, [x11], #16 2519 2520 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy 2521 2522 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2523 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2524 umull2 v11.4s, v18.8h, v28.8h 2525 umlal2 v11.4s, v19.8h, v27.8h 2526 umull v12.4s, v19.4h, v28.4h 2527 umlal v12.4s, v20.4h, v27.4h 2528 umull2 v13.4s, v19.8h, v28.8h 2529 umlal2 v13.4s, v20.8h, 
v27.8h 2530 2531 ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] 2532 ext v19.16b, v6.16b, v7.16b, #2 2533 2534 rshrn v10.4h, v10.4s, #6 2535 rshrn2 v10.8h, v11.4s, #6 2536 rshrn v11.4h, v12.4s, #6 2537 rshrn2 v11.8h, v13.4s, #6 2538 2539 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) 2540 umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x 2541 umull2 v13.4s, v4.8h, v8.8h 2542 umlal2 v13.4s, v18.8h, v16.8h 2543 umull v14.4s, v6.4h, v9.4h 2544 umlal v14.4s, v19.4h, v17.4h 2545 umull2 v20.4s, v6.8h, v9.8h 2546 umlal2 v20.4s, v19.8h, v17.8h 2547 2548 cmge v18.8h, v21.8h, #0 2549 cmge v19.8h, v22.8h, #0 2550 2551 rshrn v12.4h, v12.4s, #6 2552 rshrn2 v12.8h, v13.4s, #6 2553 rshrn v13.4h, v14.4s, #6 2554 rshrn2 v13.8h, v20.4s, #6 2555 2556 bit v10.16b, v12.16b, v18.16b 2557 bit v11.16b, v13.16b, v19.16b 2558 2559 st1 {v10.8h}, [x0], #16 2560 subs w4, w4, #8 2561 st1 {v11.8h}, [x13], #16 2562 b.le 3f 2563 2564 movi v10.8h, #8 2565 mov v4.16b, v5.16b 2566 mov v6.16b, v7.16b 2567 add v21.8h, v21.8h, v10.8h // base_x += 8 2568 add v22.8h, v22.8h, v10.8h 2569 b 2b 2570 25713: 2572 subs w5, w5, #2 2573 b.le 9f 2574 movi v10.8h, #128 2575 add x0, x0, x1 2576 add x13, x13, x1 2577 mov w4, w12 // reset w 2578 add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) 2579 b 1b 2580 25814: // The rest of the row only predicted from top[] 2582 ld1 {v5.8h}, [x9], #16 // top[base_x] 2583 ld1 {v7.8h}, [x11], #16 2584 2585 ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] 2586 ext v19.16b, v6.16b, v7.16b, #2 2587 2588 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) 2589 umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x 2590 umull2 v13.4s, v4.8h, v8.8h 2591 umlal2 v13.4s, v18.8h, v16.8h 2592 umull v14.4s, v6.4h, v9.4h 2593 umlal v14.4s, v19.4h, v17.4h 2594 umull2 v20.4s, v6.8h, v9.8h 2595 umlal2 v20.4s, v19.8h, v17.8h 2596 2597 rshrn v12.4h, v12.4s, #6 2598 rshrn2 v12.8h, v13.4s, #6 2599 rshrn v13.4h, v14.4s, #6 2600 rshrn2 v13.8h, v20.4s, #6 2601 2602 st1 {v12.8h}, [x0], 
#16 2603 subs w4, w4, #8 2604 st1 {v13.8h}, [x13], #16 2605 b.le 3b 2606 2607 mov v4.16b, v5.16b 2608 mov v6.16b, v7.16b 2609 b 4b 2610 2611169: // The rest of the block only predicted from left[] 2612 add x1, x1, w4, uxtw #1 // restore stride 2613 mov w12, w5 // orig remaining h 26141: 2615 movi v12.8h, #64 2616 movi v10.8h, #0x3e 2617 2618 shrn v29.8b, v23.8h, #6 // ypos >> 6 2619 and v27.16b, v23.16b, v10.16b // frac_y 2620 2621 smov w10, v29.b[0] // base_y[0] 2622 2623 shl v29.8b, v29.8b, #1 // 2*base_y 2624 movi v11.8h, #1, lsl #8 2625 zip1 v29.16b, v29.16b, v29.16b // duplicate elements 2626 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy 2627 add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... 2628 2629 cmp w10, #(32-1) 2630 2631 mov v18.16b, v15.16b // left[0] 2632 movi v21.16b, #2 2633 2634 sub v28.8h, v12.8h, v27.8h // 64 - frac_y 2635 2636 b.gt 31f 2637 2638 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] 2639 add v29.16b, v29.16b, v21.16b // base_y + 1 (*2) 2640 26412: 2642 // base_y < 32, using tbx. 
2643 smov w10, v29.b[0] // base_y[0] 2644 mov v19.16b, v15.16b // left[0] 2645 cmp w10, #(64-4) 2646 b.gt 32f 2647 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] 2648 add v29.16b, v29.16b, v21.16b // base_y + 2 (*2) 2649 mov v20.16b, v15.16b // left[0] 2650 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] 2651 add v29.16b, v29.16b, v21.16b // next base_y 2652 2653 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2654 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2655 umull2 v11.4s, v18.8h, v28.8h 2656 umlal2 v11.4s, v19.8h, v27.8h 2657 umull v12.4s, v19.4h, v28.4h 2658 umlal v12.4s, v20.4h, v27.4h 2659 umull2 v13.4s, v19.8h, v28.8h 2660 umlal2 v13.4s, v20.8h, v27.8h 2661 2662 rshrn v10.4h, v10.4s, #6 2663 rshrn2 v10.8h, v11.4s, #6 2664 rshrn v11.4h, v12.4s, #6 2665 rshrn2 v11.8h, v13.4s, #6 2666 2667 st1 {v10.8h}, [x0], x1 2668 subs w5, w5, #2 2669 st1 {v11.8h}, [x13], x1 2670 b.le 4f 2671 mov v18.16b, v20.16b 2672 b 2b 2673 267431: // base_y >= 32, using separate loads, loading v18 if we had to bail 2675 // in the prologue. 2676 smov w10, v29.b[0] 2677 smov w15, v29.b[2] 2678 movi v21.16b, #2 2679 smov w16, v29.b[4] 2680 add x10, x3, w10, sxtw 2681 smov w17, v29.b[6] 2682 add x15, x3, w15, sxtw 2683 ld1 {v18.h}[0], [x10] 2684 smov w10, v29.b[8] 2685 add x16, x3, w16, sxtw 2686 ld1 {v18.h}[1], [x15] 2687 smov w15, v29.b[10] 2688 add x17, x3, w17, sxtw 2689 ld1 {v18.h}[2], [x16] 2690 smov w16, v29.b[12] 2691 add x10, x3, w10, sxtw 2692 ld1 {v18.h}[3], [x17] 2693 smov w17, v29.b[14] 2694 add x15, x3, w15, sxtw 2695 add x16, x3, w16, sxtw 2696 ld1 {v18.h}[4], [x10] 2697 add x17, x3, w17, sxtw 2698 ld1 {v18.h}[5], [x15] 2699 add v29.16b, v29.16b, v21.16b // next base_y 2700 ld1 {v18.h}[6], [x16] 2701 ld1 {v18.h}[7], [x17] 2702 270332: // base_y >= 32, using separate loads. 2704 cmp w5, #4 2705 b.lt 34f 270633: // h >= 4, preserving v18 from the previous round, loading v19-v22. 
2707 smov w10, v29.b[0] 2708 subs w5, w5, #4 2709 smov w15, v29.b[2] 2710 movi v10.16b, #8 2711 smov w16, v29.b[4] 2712 add x10, x3, w10, sxtw 2713 smov w17, v29.b[6] 2714 add x15, x3, w15, sxtw 2715 ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10] 2716 smov w10, v29.b[8] 2717 add x16, x3, w16, sxtw 2718 ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15] 2719 smov w15, v29.b[10] 2720 add x17, x3, w17, sxtw 2721 ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16] 2722 smov w16, v29.b[12] 2723 add x10, x3, w10, sxtw 2724 ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17] 2725 smov w17, v29.b[14] 2726 add x15, x3, w15, sxtw 2727 add x16, x3, w16, sxtw 2728 ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10] 2729 add x17, x3, w17, sxtw 2730 ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15] 2731 ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16] 2732 add v29.16b, v29.16b, v10.16b // next base_y 2733 ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17] 2734 2735 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2736 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2737 umull2 v11.4s, v18.8h, v28.8h 2738 umlal2 v11.4s, v19.8h, v27.8h 2739 umull v12.4s, v19.4h, v28.4h 2740 umlal v12.4s, v20.4h, v27.4h 2741 umull2 v13.4s, v19.8h, v28.8h 2742 umlal2 v13.4s, v20.8h, v27.8h 2743 2744 rshrn v10.4h, v10.4s, #6 2745 rshrn2 v10.8h, v11.4s, #6 2746 rshrn v11.4h, v12.4s, #6 2747 rshrn2 v11.8h, v13.4s, #6 2748 2749 umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y) 2750 umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y 2751 umull2 v13.4s, v20.8h, v28.8h 2752 umlal2 v13.4s, v21.8h, v27.8h 2753 umull v14.4s, v21.4h, v28.4h 2754 umlal v14.4s, v22.4h, v27.4h 2755 umull2 v18.4s, v21.8h, v28.8h 2756 umlal2 v18.4s, v22.8h, v27.8h 2757 2758 rshrn v12.4h, v12.4s, #6 2759 rshrn2 v12.8h, v13.4s, #6 2760 rshrn v13.4h, v14.4s, #6 2761 rshrn2 v13.8h, v18.4s, #6 2762 2763 st1 {v10.8h}, [x0], x1 2764 cmp w5, #2 2765 st1 {v11.8h}, [x13], x1 2766 st1 {v12.8h}, [x0], x1 2767 st1 {v13.8h}, [x13], x1 2768 b.lt 4f 2769 mov v18.16b, v22.16b 
2770 b.gt 33b 2771 277234: // h == 2, preserving v18 from the previous round, loading v19-v20. 2773 smov w10, v29.b[0] 2774 smov w15, v29.b[2] 2775 movi v21.16b, #4 2776 smov w16, v29.b[4] 2777 add x10, x3, w10, sxtw 2778 smov w17, v29.b[6] 2779 add x15, x3, w15, sxtw 2780 ld2 {v19.h, v20.h}[0], [x10] 2781 smov w10, v29.b[8] 2782 add x16, x3, w16, sxtw 2783 ld2 {v19.h, v20.h}[1], [x15] 2784 smov w15, v29.b[10] 2785 add x17, x3, w17, sxtw 2786 ld2 {v19.h, v20.h}[2], [x16] 2787 smov w16, v29.b[12] 2788 add x10, x3, w10, sxtw 2789 ld2 {v19.h, v20.h}[3], [x17] 2790 smov w17, v29.b[14] 2791 add x15, x3, w15, sxtw 2792 add x16, x3, w16, sxtw 2793 ld2 {v19.h, v20.h}[4], [x10] 2794 add x17, x3, w17, sxtw 2795 ld2 {v19.h, v20.h}[5], [x15] 2796 ld2 {v19.h, v20.h}[6], [x16] 2797 add v29.16b, v29.16b, v21.16b // next base_y 2798 ld2 {v19.h, v20.h}[7], [x17] 2799 2800 umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 2801 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 2802 umull2 v11.4s, v18.8h, v28.8h 2803 umlal2 v11.4s, v19.8h, v27.8h 2804 umull v12.4s, v19.4h, v28.4h 2805 umlal v12.4s, v20.4h, v27.4h 2806 umull2 v13.4s, v19.8h, v28.8h 2807 umlal2 v13.4s, v20.8h, v27.8h 2808 2809 rshrn v10.4h, v10.4s, #6 2810 rshrn2 v10.8h, v11.4s, #6 2811 rshrn v11.4h, v12.4s, #6 2812 rshrn2 v11.8h, v13.4s, #6 2813 2814 st1 {v10.8h}, [x0], x1 2815 st1 {v11.8h}, [x13], x1 2816 // The h==2 case only happens once at the end, if at all. 
4:
        // Advance to the next 8-wide column strip: rewind dst by the rows
        // just written (x12 = original h) and step 16 bytes (8 pixels) right.
        subs            w4,  w4,  #8
        b.le            9f

        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0       // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w5,  w12                // reset h
        b               1b

9:
        // Restore callee-saved SIMD regs d8-d15 (AAPCS64) and return.
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// Width-dispatch table for ipred_z2_fill1; entries are offsets of the
// 640b/320b/160b/80b/40b cases relative to the table base, indexed by clz(w).
jumptable ipred_z2_fill1_tbl
        .word 640b - ipred_z2_fill1_tbl
        .word 320b - ipred_z2_fill1_tbl
        .word 160b - ipred_z2_fill1_tbl
        .word 80b - ipred_z2_fill1_tbl
        .word 40b - ipred_z2_fill1_tbl
endjumptable

// z2 fill variant used when the top edge is upsampled: x-steps over top[]
// are doubled (xpos starts at 2 << 6; top is sampled at even indices via
// uzp1/uzp2 below). Register use as seen in this body: x0=dst, x1=stride,
// x2=top, x3=left, w4=width, w5=height, w6=dx, w7=dy. Only w==4 and w==8
// are handled here (b.eq 80f dispatches w==8; the fall-through is w==4) --
// presumably because upsampling limits the block size; confirm at call site.
function ipred_z2_fill2_16bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(2 << 6)          // xpos = 2 << 6
        sub             w8,  w8,  w6            // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h}, [x11]         // increments
        neg             w7,  w7                 // -dy
        b.eq            80f

40:
        dup             v30.4h,  w7             // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h, v30.4h // {0,1,2,3}* -dy
        movi            v25.8h,  #0x3e
        add             v30.4h,  v16.4h, v30.4h // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.8h, v1.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #4

        shrn            v29.8b,  v30.8h, #6     // ypos >> 6
        and             v27.8b,  v30.8b, v25.8b // frac_y

        add             v29.8b,  v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        movi            v23.4h,  #1, lsl #8
        shl             v29.8b,  v29.8b, #1     // 2*base_y
        zip1            v29.8b,  v29.8b, v29.8b // duplicate elements
        movi            v17.8b,  #2
        // Turn 16-bit element indices into per-byte tbl indices:
        // each pair becomes {2*base, 2*base+1}.
        add             v29.8b,  v29.8b, v23.8b // 2*base, 2*base+1, ...
        add             v30.8b,  v29.8b, v17.8b // base_y + 1 (*2)
        add             v28.8b,  v29.8b, v19.8b // base_y + 2 (*2)

        tbl             v18.8b, {v0.16b}, v29.8b // left[base_y]

        trn1            v30.2d,  v30.2d,  v28.2d // base_y + 1, base_y + 2

        sub             v28.4h,  v26.4h,  v27.4h // 64 - frac_y

        trn1            v31.2d,  v31.2d,  v31.2d // {0,1,2,3,0,1,2,3}

        trn1            v27.2d,  v27.2d,  v27.2d // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d // 64 - frac_y

        movi            v29.16b, #4
        // Double the x increments: upsampled top advances 2 source pixels
        // per output pixel.
        add             v31.8h,  v31.8h,  v31.8h // {0,2,4,6,0,2,4,6}
4:
        // Main w=4 loop: two output rows per iteration, blending a
        // top[]-predicted value with a left[]-predicted value per pixel,
        // selected by the sign of the per-pixel base_x.
        asr             w9,  w8, #6             // base_x
        dup             v16.4h,  w8             // xpos
        sub             w8,  w8,  w6            // xpos -= dx
        cmp             w9,  #-8                // base_x <= -8
        asr             w11, w8, #6             // base_x
        b.le            49f

        lsl             w9,  w9,  #1
        lsl             w11, w11, #1

        dup             v17.4h,  w8             // xpos

        ldr             q4, [x2, w9, sxtw]      // top[base_x]
        ldr             q6, [x2, w11, sxtw]

        trn1            v16.2d,  v16.2d,  v17.2d // xpos

        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        sshr            v20.8h,  v16.8h,  #6    // first base_x for each row

        // De-interleave the upsampled top edge: even lanes are top[base_x],
        // odd lanes top[base_x+1].
        uzp2            v5.8h,   v4.8h,   v6.8h // top[base_x+1]
        uzp1            v4.8h,   v4.8h,   v6.8h // top[base_x]

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v18.2d,  v18.2d,  v19.2d // left[base_y], left[base_y+1]

        sub             v17.8h,  v26.8h,  v16.8h // 64 - frac_x

        add             v20.8h,  v20.8h,  v31.8h // actual base_x

        umull           v21.4s,  v18.4h,  v28.4h // left[base_y]*(64-frac_y)
        umlal           v21.4s,  v19.4h,  v27.4h // + left[base_y+1]*frac_y
        umull2          v22.4s,  v18.8h,  v28.8h
        umlal2          v22.4s,  v19.8h,  v27.8h

        umull           v23.4s,  v4.4h,   v17.4h // top[base_x]*(64-frac_x)
        umlal           v23.4s,  v5.4h,   v16.4h // + top[base_x+1]*frac_x
        umull2          v24.4s,  v4.8h,   v17.8h
        umlal2          v24.4s,  v5.8h,   v16.8h

        cmge            v20.8h,  v20.8h,  #0    // mask: base_x >= 0 -> use top

        rshrn           v21.4h,  v21.4s,  #6
        rshrn2          v21.8h,  v22.4s,  #6
        rshrn           v22.4h,  v23.4s,  #6
        rshrn2          v22.8h,  v24.4s,  #6

        bit             v21.16b, v22.16b, v20.16b // pick top- or left-derived

        st1             {v21.d}[0], [x0], x1
        sub             w8,  w8,  w6            // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        ext             v18.16b, v19.16b, v19.16b, #8
        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b               4b

49:
        // Remainder of the block is predicted from left[] only
        // (base_x has gone fully negative).
        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        trn1            v18.2d,  v18.2d,  v19.2d // left[base_y], left[base_y+1]

        umull           v20.4s,  v18.4h,  v28.4h // left[base_y]*(64-frac_y)
        umlal           v20.4s,  v19.4h,  v27.4h // + left[base_y+1]*frac_y
        umull2          v21.4s,  v18.8h,  v28.8h
        umlal2          v21.4s,  v19.8h,  v27.8h

        rshrn           v20.4h,  v20.4s,  #6
        rshrn2          v20.8h,  v21.4s,  #6

        st1             {v20.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        ext             v18.16b, v19.16b, v19.16b, #8
        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b               49b

9:
        ret

80:
        // w == 8 path; needs d8-d15, so save the callee-saved SIMD regs
        // (AAPCS64: low 64 bits of v8-v15 are callee-saved).
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        dup             v18.8h,  w7             // -dy
        movi            v17.8b,  #1

        mul             v16.8h,  v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.8h,  #0x3e
        add             v16.8h,  v16.8h, v18.8h // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.8h, v1.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #4

        shrn            v29.8b,  v16.8h, #6     // ypos >> 6
        and             v27.16b, v16.16b, v25.16b // frac_y

        add             v29.8b,  v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        movi            v23.8h,  #1, lsl #8
        shl             v29.8b,  v29.8b, #1     // 2*base_y
        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
        movi            v17.16b, #2
        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...

        // Cut corners here; for the first row we don't expect to need to
        // read outside of v0.
        tbl             v18.16b, {v0.16b}, v29.16b // left[base_y]

        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub             v28.8h,  v26.8h, v27.8h   // 64 - frac_y

        movi            v24.16b, #4
        // Double the x increments: upsampled top advances 2 source pixels
        // per output pixel.
        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
8:
        // Main w=8 loop: two output rows per iteration; per-pixel select
        // between a top[]-derived and a left[]-derived prediction.
        asr             w9,  w8,  #6            // base_x
        dup             v16.8h,  w8             // xpos
        sub             w8,  w8,  w6            // xpos -= dx
        cmp             w9,  #-16               // base_x <= -16
        asr             w11, w8,  #6            // base_x
        b.le            89f

        dup             v17.8h,  w8             // xpos

        add             x9,  x2,  w9,  sxtw #1
        add             x11, x2,  w11, sxtw #1

        ld1             {v4.8h, v5.8h}, [x9]    // top[base_x]
        ld1             {v6.8h, v7.8h}, [x11]

        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]

        sshr            v21.8h,  v16.8h, #6     // first base_x
        sshr            v22.8h,  v17.8h, #6

        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        // De-interleave the upsampled top edge for both rows.
        uzp2            v2.8h,   v4.8h,  v5.8h  // top[base_x+1]
        uzp1            v4.8h,   v4.8h,  v5.8h  // top[base_x]
        uzp2            v3.8h,   v6.8h,  v7.8h
        uzp1            v6.8h,   v6.8h,  v7.8h
        mov             v5.16b,  v2.16b
        mov             v7.16b,  v3.16b

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.4s,  v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal           v10.4s,  v19.4h, v27.4h // + left[base_y+1]*frac_y

        sub             v8.8h,   v26.8h, v16.8h // 64 - frac_x
        sub             v9.8h,   v26.8h, v17.8h

        umull2          v11.4s,  v18.8h, v28.8h
        umlal2          v11.4s,  v19.8h, v27.8h

        add             v21.8h,  v21.8h, v31.8h // actual base_x
        add             v22.8h,  v22.8h, v31.8h

        umull           v12.4s,  v19.4h, v28.4h
        umlal           v12.4s,  v20.4h, v27.4h
        umull2          v13.4s,  v19.8h, v28.8h
        umlal2          v13.4s,  v20.8h, v27.8h

        rshrn           v10.4h,  v10.4s, #6
        rshrn2          v10.8h,  v11.4s, #6
        rshrn           v11.4h,  v12.4s, #6
        rshrn2          v11.8h,  v13.4s, #6

        umull           v12.4s,  v4.4h,  v8.4h  // top[base_x]*(64-frac_x)
        umlal           v12.4s,  v5.4h,  v16.4h // + top[base_x+1]*frac_x
        umull2          v13.4s,  v4.8h,  v8.8h
        umlal2          v13.4s,  v5.8h,  v16.8h
        umull           v14.4s,  v6.4h,  v9.4h
        umlal           v14.4s,  v7.4h,  v17.4h
        umull2          v18.4s,  v6.8h,  v9.8h
        umlal2          v18.4s,  v7.8h,  v17.8h

        cmge            v21.8h,  v21.8h, #0     // mask: base_x >= 0 -> use top
        cmge            v22.8h,  v22.8h, #0

        rshrn           v12.4h,  v12.4s, #6
        rshrn2          v12.8h,  v13.4s, #6
        rshrn           v13.4h,  v14.4s, #6
        rshrn2          v13.8h,  v18.4s, #6

        bit             v10.16b, v12.16b, v21.16b
        bit             v11.16b, v13.16b, v22.16b

        st1             {v10.8h}, [x0], x1
        subs            w5,  w5,  #2
        sub             w8,  w8,  w6            // xpos -= dx
        st1             {v11.8h}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b               8b

89:
        // Remainder of the block is predicted from left[] only.
        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        umull           v4.4s,   v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal           v4.4s,   v19.4h, v27.4h // + left[base_y+1]*frac_y
        umull2          v5.4s,   v18.8h, v28.8h
        umlal2          v5.4s,   v19.8h, v27.8h
        umull           v6.4s,   v19.4h, v28.4h
        umlal           v6.4s,   v20.4h, v27.4h
        umull2          v7.4s,   v19.8h, v28.8h
        umlal2          v7.4s,   v20.8h, v27.8h

        rshrn           v4.4h,   v4.4s,  #6
        rshrn2          v4.8h,   v5.4s,  #6
        rshrn           v5.4h,   v6.4s,  #6
        rshrn2          v5.8h,   v7.4s,  #6

        st1             {v4.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b               89b

9:
        // Restore callee-saved SIMD regs and return.
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// z2 fill variant used when the left edge is upsampled: y-steps over left[]
// are doubled (base_y starts at (ypos >> 6) + 2 and advances by 4 per two
// rows below). Register use as seen in this body: x0=dst, x1=stride, x2=top,
// x3=left, w4=width, w5=height, w6=dx, w7=dy. Only w==4 and w==8 are handled
// (b.eq 80f dispatches w==8; fall-through is w==4).
function ipred_z2_fill3_16bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(1 << 6)          // xpos = 1 << 6
        sub             w8,  w8,  w6            // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h}, [x11]         // increments
        neg             w7,  w7                 // -dy
        b.eq            80f

40:
        dup             v30.4h,  w7             // -dy
        movi            v17.8b,  #1
        mul             v16.4h,  v31.4h, v30.4h // {0,1,2,3}* -dy
        movi            v25.8h,  #0x3e
        add             v30.4h,  v16.4h, v30.4h // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3] // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #2

        shrn            v29.8b,  v30.8h, #6     // ypos >> 6
        and             v27.8b,  v30.8b, v25.8b // frac_y

        add             v29.8b,  v29.8b, v19.8b // base_y = (ypos >> 6) + 2

        movi            v23.4h,  #1, lsl #8
        shl             v29.8b,  v29.8b, #1     // 2*base_y
        movi            v19.16b, #4
        zip1            v29.8b,  v29.8b, v29.8b // duplicate elements
        movi            v17.8b,  #2
        // Per-byte tbl indices for 16-bit elements: {2*base, 2*base+1}.
        add             v29.8b,  v29.8b, v23.8b // 2*base, 2*base+1, ...

        add             v30.8b,  v29.8b, v17.8b // base_y + 1 (*2)
        add             v28.8b,  v29.8b, v19.8b // base_y + 2 (*2)

        trn1            v31.2d,  v31.2d, v31.2d // {0,1,2,3,0,1,2,3}

        add             v24.8b,  v30.8b, v19.8b // base_y + 3 (*2)

        // Upsampled left: even/odd source rows are paired so the two
        // interpolation taps sit in separate index vectors.
        trn1            v29.2d,  v29.2d, v28.2d // base_y + 0, base_y + 2
        trn1            v30.2d,  v30.2d, v24.2d // base_y + 1, base_y + 3

        sub             v28.4h,  v26.4h, v27.4h // 64 - frac_y

        trn1            v27.2d,  v27.2d, v27.2d // frac_y
        trn1            v28.2d,  v28.2d, v28.2d // 64 - frac_y

        movi            v24.16b, #8
4:
        // Main w=4 loop: two output rows per iteration; per-pixel select
        // between a top[]-derived and a left[]-derived prediction.
        asr             w9,  w8,  #6            // base_x
        dup             v16.4h,  w8             // xpos
        sub             w8,  w8,  w6            // xpos -= dx
        cmp             w9,  #-4                // base_x <= -4
        asr             w11, w8,  #6            // base_x
        b.le            49f

        lsl             w9,  w9,  #1
        lsl             w11, w11, #1

        dup             v17.4h,  w8             // xpos

        ldr             q4, [x2, w9, sxtw]      // top[base_x]
        ldr             q6, [x2, w11, sxtw]

        trn1            v16.2d,  v16.2d, v17.2d // xpos

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        sshr            v20.8h,  v16.8h, #6     // first base_x for each row

        ext             v5.16b,  v4.16b, v4.16b, #2 // top[base_x+1]
        ext             v7.16b,  v6.16b, v6.16b, #2

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d,   v4.2d,  v6.2d  // top[base_x]
        trn1            v5.2d,   v5.2d,  v7.2d  // top[base_x+1]

        sub             v17.8h,  v26.8h, v16.8h // 64 - frac_x

        add             v20.8h,  v20.8h, v31.8h // actual base_x

        umull           v21.4s,  v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal           v21.4s,  v19.4h, v27.4h // + left[base_y+1]*frac_y
        umull2          v22.4s,  v18.8h, v28.8h
        umlal2          v22.4s,  v19.8h, v27.8h

        umull           v23.4s,  v4.4h,  v17.4h // top[base_x]*(64-frac_x)
        umlal           v23.4s,  v5.4h,  v16.4h // + top[base_x+1]*frac_x
        umull2          v24.4s,  v4.8h,  v17.8h
        umlal2          v24.4s,  v5.8h,  v16.8h

        cmge            v20.8h,  v20.8h, #0     // mask: base_x >= 0 -> use top

        rshrn           v21.4h,  v21.4s, #6
        rshrn2          v21.8h,  v22.4s, #6
        rshrn           v22.4h,  v23.4s, #6
        rshrn2          v22.8h,  v24.4s, #6

        movi            v24.16b, #8             // reload increment (v24 was clobbered above)

        bit             v21.16b, v22.16b, v20.16b

        st1             {v21.d}[0], [x0], x1
        sub             w8,  w8,  w6            // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               4b

49:
        // Remainder of the block is predicted from left[] only.
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v20.4s,  v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal           v20.4s,  v19.4h, v27.4h // + left[base_y+1]*frac_y
        umull2          v21.4s,  v18.8h, v28.8h
        umlal2          v21.4s,  v19.8h, v27.8h

        rshrn           v20.4h,  v20.4s, #6
        rshrn2          v20.8h,  v21.4s, #6

        st1             {v20.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               49b

9:
        ret

80:
        // w == 8 path; save callee-saved SIMD regs d8-d15 (AAPCS64).
        stp             d8,  d9,  [sp, #-0x40]!
3294 stp d10, d11, [sp, #0x10] 3295 stp d12, d13, [sp, #0x20] 3296 stp d14, d15, [sp, #0x30] 3297 3298 dup v18.8h, w7 // -dy 3299 movi v17.16b, #2 3300 3301 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy 3302 movi v25.8h, #0x3e 3303 add v16.8h, v16.8h, v18.8h // -= dy 3304 3305 // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 3306 ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] 3307 3308 movi v26.8h, #64 3309 movi v19.16b, #4 3310 3311 shrn v29.8b, v16.8h, #6 // ypos >> 6 3312 and v27.16b, v16.16b, v25.16b // frac_y 3313 3314 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 2 3315 3316 movi v23.8h, #1, lsl #8 3317 shl v29.8b, v29.8b, #1 // 2*base_y 3318 mov v18.16b, v15.16b // left[0] 3319 zip1 v29.16b, v29.16b, v29.16b // duplicate elements 3320 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... 3321 3322 add v30.16b, v29.16b, v17.16b // base_y + 1 (*2) 3323 3324 sub v28.8h, v26.8h, v27.8h // 64 - frac_y 3325 3326 movi v24.16b, #4 33278: 3328 asr w9, w8, #6 // base_x 3329 dup v16.8h, w8 // xpos 3330 sub w8, w8, w6 // xpos -= dx 3331 cmp w9, #-16 // base_x <= -16 3332 asr w11, w8, #6 // base_x 3333 b.le 89f 3334 3335 dup v17.8h, w8 // xpos 3336 3337 add x9, x2, w9, sxtw #1 3338 add x11, x2, w11, sxtw #1 3339 3340 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] 3341 ld1 {v6.8h, v7.8h}, [x11] 3342 3343 tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] 3344 add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) 3345 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] 3346 add v30.16b, v30.16b, v24.16b 3347 3348 sshr v22.8h, v16.8h, #6 // first base_x 3349 tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] 3350 sshr v23.8h, v17.8h, #6 3351 tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] 3352 3353 ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] 3354 ext v7.16b, v6.16b, v7.16b, #2 3355 3356 and v16.16b, v16.16b, v25.16b // frac_x 3357 and v17.16b, v17.16b, v25.16b 3358 3359 umull v10.4s, 
v18.4h, v28.4h // left[base_y]*(64-frac_y) 3360 umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 3361 3362 sub v8.8h, v26.8h, v16.8h // 64 - frac_x 3363 sub v9.8h, v26.8h, v17.8h 3364 3365 umull2 v11.4s, v18.8h, v28.8h 3366 umlal2 v11.4s, v19.8h, v27.8h 3367 3368 add v22.8h, v22.8h, v31.8h // actual base_x 3369 add v23.8h, v23.8h, v31.8h 3370 3371 umull v12.4s, v20.4h, v28.4h 3372 umlal v12.4s, v21.4h, v27.4h 3373 umull2 v13.4s, v20.8h, v28.8h 3374 umlal2 v13.4s, v21.8h, v27.8h 3375 3376 rshrn v10.4h, v10.4s, #6 3377 rshrn2 v10.8h, v11.4s, #6 3378 rshrn v11.4h, v12.4s, #6 3379 rshrn2 v11.8h, v13.4s, #6 3380 3381 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) 3382 umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x 3383 umull2 v13.4s, v4.8h, v8.8h 3384 umlal2 v13.4s, v5.8h, v16.8h 3385 umull v14.4s, v6.4h, v9.4h 3386 umlal v14.4s, v7.4h, v17.4h 3387 umull2 v18.4s, v6.8h, v9.8h 3388 umlal2 v18.4s, v7.8h, v17.8h 3389 3390 cmge v22.8h, v22.8h, #0 3391 cmge v23.8h, v23.8h, #0 3392 3393 rshrn v12.4h, v12.4s, #6 3394 rshrn2 v12.8h, v13.4s, #6 3395 rshrn v13.4h, v14.4s, #6 3396 rshrn2 v13.8h, v18.4s, #6 3397 3398 bit v10.16b, v12.16b, v22.16b 3399 bit v11.16b, v13.16b, v23.16b 3400 3401 st1 {v10.8h}, [x0], x1 3402 subs w5, w5, #2 3403 sub w8, w8, w6 // xpos -= dx 3404 st1 {v11.8h}, [x0], x1 3405 b.le 9f 3406 3407 add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) 3408 add v30.16b, v30.16b, v24.16b 3409 b 8b 3410 341189: 3412 tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] 3413 add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) 3414 tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] 3415 add v30.16b, v30.16b, v24.16b 3416 tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] 3417 tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] 3418 3419 umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) 3420 umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y 3421 umull2 v5.4s, v18.8h, v28.8h 3422 umlal2 
// NOTE(review): the lines below are the tail of a function whose head lies
// before this chunk (a z1/z2 interpolation epilogue: final multiply-accumulate,
// narrowing, store, and callee-saved SIMD register restore). The first line's
// mnemonic was lost in extraction; it is reconstructed as umlal2 from the
// surrounding umull/umlal/umull2/umlal2 pairing pattern — TODO confirm against
// the unmangled file.
        umlal2          v5.4s,  v19.8h, v27.8h
        umull           v6.4s,  v20.4h, v28.4h
        umlal           v6.4s,  v21.4h, v27.4h
        umull2          v7.4s,  v20.8h, v28.8h
        umlal2          v7.4s,  v21.8h, v27.8h

        rshrn           v4.4h,  v4.4s,  #6
        rshrn2          v4.8h,  v5.4s,  #6
        rshrn           v5.4h,  v6.4s,  #6
        rshrn2          v5.8h,  v7.4s,  #6

        st1             {v4.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               89b

9:
        // Restore callee-saved SIMD regs d8-d15 (AAPCS64: low 64 bits of
        // v8-v15 are callee-saved) and release the 0x40-byte frame.
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const left,
//                                const int width, const int height,
//                                const int dy, const int max_base_y);
//
// Z3 (bottom-left) directional prediction, 16 bpc. Walks down the left edge
// in 6.6 fixed point (w7 = ypos, step w5 = dy), linearly interpolating
// between left[base] and left[base+1] with frac = ypos & 0x3e (6-bit
// fraction, already doubled). Output is produced transposed: each iteration
// computes two interpolated columns and scatters them with zip + lane
// stores. Once base reaches max_base_y (w6) the remaining area is filled
// with the replicated last left pixel via ipred_z3_fill_padding_neon.
// The size dispatch is on the HEIGHT (w4), since z3 iterates columns.
function ipred_z3_fill1_16bpc_neon, export=1
        clz             w9,  w4
        movrel          x8,  ipred_z3_fill1_tbl
        sub             w9,  w9,  #25
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6, uxtw #1   // left[max_base_y] (2 bytes/pixel)
        add             x8,  x8,  x9
        ld1r            {v31.8h}, [x10]         // padding
        mov             w7,  w5                 // ypos = dy (first step)
        mov             w15, #64
        add             x13, x0,  x1
        lsl             x1,  x1,  #1
        br              x8

40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // ypos += dy
        cmp             w8,  w6                 // base >= max_base_y
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]     // left[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8h,  w9              // frac
        dup             v5.8h,  w11
        ext             v1.16b, v0.16b, v0.16b, #2 // left[base+1]
        ext             v3.16b, v2.16b, v2.16b, #2
        sub             v6.4h,  v1.4h,  v0.4h   // left[base+1]-left[base]
        sub             v7.4h,  v3.4h,  v2.4h
        ushll           v16.4s, v0.4h,  #6      // left[base]*64
        ushll           v17.4s, v2.4h,  #6
        smlal           v16.4s, v6.4h,  v4.4h   // + (left[base+1]-left[base])*frac
        smlal           v17.4s, v7.4h,  v5.4h
        rshrn           v16.4h, v16.4s, #6
        rshrn           v17.4h, v17.4s, #6
        subs            w3,  w3,  #2
        zip1            v18.8h, v16.8h, v17.8h  // interleave two columns
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        add             w7,  w7,  w5            // ypos += dy
        st1             {v18.s}[2], [x0]
        st1             {v18.s}[3], [x13]
        b.le            9f
        sub             x0,  x0,  x1            // ptr -= 4 * (2*stride)
        sub             x13, x13, x1
        add             x0,  x0,  #4            // advance two columns
        add             x13, x13, #4
        b               4b
9:
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // ypos += dy
        cmp             w8,  w6                 // base >= max_base_y
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,  w9              // frac
        dup             v5.8h,  w11
        ld1             {v0.8h}, [x8]           // left[base]
        ld1             {v2.8h}, [x10]
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        ldr             h1,  [x8,  #16]
        ldr             h3,  [x10, #16]
        dup             v6.8h,  w9              // 64 - frac
        dup             v7.8h,  w11
        ext             v1.16b, v0.16b, v1.16b, #2 // left[base+1]
        ext             v3.16b, v2.16b, v3.16b, #2
        umull           v16.4s, v0.4h,  v6.4h   // left[base]*(64-frac)
        umlal           v16.4s, v1.4h,  v4.4h   // + left[base+1]*frac
        umull2          v17.4s, v0.8h,  v6.8h
        umlal2          v17.4s, v1.8h,  v4.8h
        umull           v18.4s, v2.4h,  v7.4h
        umlal           v18.4s, v3.4h,  v5.4h
        umull2          v19.4s, v2.8h,  v7.8h
        umlal2          v19.4s, v3.8h,  v5.8h
        rshrn           v16.4h, v16.4s, #6
        rshrn2          v16.8h, v17.4s, #6
        rshrn           v17.4h, v18.4s, #6
        rshrn2          v17.8h, v19.4s, #6
        subs            w3,  w3,  #2
        zip1            v18.8h, v16.8h, v17.8h  // interleave two columns
        zip2            v19.8h, v16.8h, v17.8h
        add             w7,  w7,  w5            // ypos += dy
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        b.le            9f
        sub             x0,  x0,  x1, lsl #2    // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4            // advance two columns
        add             x13, x13, #4
        b               8b
9:
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        mov             w12, w4                 // remember full height
1:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // ypos += dy
        cmp             w8,  w6                 // base >= max_base_y
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v6.8h,  w9              // frac
        dup             v7.8h,  w11
        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // left[base]
        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v16.8h, w9              // 64 - frac
        dup             v17.8h, w11
        add             w7,  w7,  w5            // ypos += dy
2:
        ext             v18.16b, v0.16b, v1.16b, #2 // left[base+1]
        ext             v19.16b, v1.16b, v2.16b, #2
        ext             v20.16b, v3.16b, v4.16b, #2
        ext             v21.16b, v4.16b, v5.16b, #2
        subs            w4,  w4,  #16
        umull           v22.4s, v0.4h,  v16.4h  // left[base]*(64-frac)
        umlal           v22.4s, v18.4h, v6.4h   // + left[base+1]*frac
        umull2          v23.4s, v0.8h,  v16.8h
        umlal2          v23.4s, v18.8h, v6.8h
        umull           v24.4s, v1.4h,  v16.4h
        umlal           v24.4s, v19.4h, v6.4h
        umull2          v25.4s, v1.8h,  v16.8h
        umlal2          v25.4s, v19.8h, v6.8h
        umull           v26.4s, v3.4h,  v17.4h
        umlal           v26.4s, v20.4h, v7.4h
        umull2          v27.4s, v3.8h,  v17.8h
        umlal2          v27.4s, v20.8h, v7.8h
        umull           v28.4s, v4.4h,  v17.4h
        umlal           v28.4s, v21.4h, v7.4h
        umull2          v29.4s, v4.8h,  v17.8h
        umlal2          v29.4s, v21.8h, v7.8h
        rshrn           v22.4h, v22.4s, #6
        rshrn2          v22.8h, v23.4s, #6
        rshrn           v23.4h, v24.4s, #6
        rshrn2          v23.8h, v25.4s, #6
        rshrn           v24.4h, v26.4s, #6
        rshrn2          v24.8h, v27.4s, #6
        rshrn           v25.4h, v28.4s, #6
        rshrn2          v25.8h, v29.4s, #6
        zip1            v18.8h, v22.8h, v24.8h  // interleave the two columns
        zip2            v19.8h, v22.8h, v24.8h
        zip1            v20.8h, v23.8h, v25.8h
        zip2            v21.8h, v23.8h, v25.8h
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        st1             {v20.s}[0], [x0],  x1
        st1             {v20.s}[1], [x13], x1
        st1             {v20.s}[2], [x0],  x1
        st1             {v20.s}[3], [x13], x1
        st1             {v21.s}[0], [x0],  x1
        st1             {v21.s}[1], [x13], x1
        st1             {v21.s}[2], [x0],  x1
        st1             {v21.s}[3], [x13], x1
        b.le            3f
        // Shift the sliding window and load the next 16 left pixels.
        mov             v0.16b, v2.16b
        ld1             {v1.8h, v2.8h}, [x8],  #32 // left[base]
        mov             v3.16b, v5.16b
        ld1             {v4.8h, v5.8h}, [x10], #32
        b               2b

3:
        subs            w3,  w3,  #2
        b.le            9f
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0       // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4            // advance two columns
        add             x13, x13, #4
        mov             w4,  w12                // reset height counter
        b               1b
9:
        ret
endfunc

jumptable ipred_z3_fill1_tbl
        .word 640b - ipred_z3_fill1_tbl
        .word 320b - ipred_z3_fill1_tbl
        .word 160b - ipred_z3_fill1_tbl
        .word 80b  - ipred_z3_fill1_tbl
        .word 40b  - ipred_z3_fill1_tbl
endjumptable

// Shared fallback for the z3 fill functions: fill the remaining WxH area
// with the replicated padding pixel held in v31 (set up by the caller).
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #8
        movrel          x8,  ipred_z3_fill_padding_tbl
        b.gt            ipred_z3_fill_padding_wide
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
3682 clz w9, w3 3683 sub w9, w9, #25 3684 ldrsw x9, [x8, w9, uxtw #2] 3685 add x9, x8, x9 3686 br x9 3687 368820: 3689 AARCH64_VALID_JUMP_TARGET 36902: 3691 st1 {v31.s}[0], [x0], x1 3692 subs w4, w4, #4 3693 st1 {v31.s}[0], [x13], x1 3694 st1 {v31.s}[0], [x0], x1 3695 st1 {v31.s}[0], [x13], x1 3696 b.gt 2b 3697 subs w3, w3, #2 3698 lsr x1, x1, #1 3699 msub x0, x1, x12, x0 // ptr -= h * stride 3700 msub x13, x1, x12, x13 3701 b.le 9f 3702 lsl x1, x1, #1 3703 add x0, x0, #4 3704 add x13, x13, #4 3705 mov w4, w12 3706 b 1b 3707 370840: 3709 AARCH64_VALID_JUMP_TARGET 37104: 3711 st1 {v31.4h}, [x0], x1 3712 subs w4, w4, #4 3713 st1 {v31.4h}, [x13], x1 3714 st1 {v31.4h}, [x0], x1 3715 st1 {v31.4h}, [x13], x1 3716 b.gt 4b 3717 subs w3, w3, #4 3718 lsr x1, x1, #1 3719 msub x0, x1, x12, x0 // ptr -= h * stride 3720 msub x13, x1, x12, x13 3721 b.le 9f 3722 lsl x1, x1, #1 3723 add x0, x0, #8 3724 add x13, x13, #8 3725 mov w4, w12 3726 b 1b 3727 372880: 3729160: 3730320: 3731640: 3732 AARCH64_VALID_JUMP_TARGET 37338: 3734 st1 {v31.8h}, [x0], x1 3735 subs w4, w4, #4 3736 st1 {v31.8h}, [x13], x1 3737 st1 {v31.8h}, [x0], x1 3738 st1 {v31.8h}, [x13], x1 3739 b.gt 8b 3740 subs w3, w3, #8 3741 lsr x1, x1, #1 3742 msub x0, x1, x12, x0 // ptr -= h * stride 3743 msub x13, x1, x12, x13 3744 b.le 9f 3745 lsl x1, x1, #1 3746 add x0, x0, #16 3747 add x13, x13, #16 3748 mov w4, w12 3749 b 1b 3750 37519: 3752 ret 3753endfunc 3754 3755jumptable ipred_z3_fill_padding_tbl 3756 .word 640b - ipred_z3_fill_padding_tbl 3757 .word 320b - ipred_z3_fill_padding_tbl 3758 .word 160b - ipred_z3_fill_padding_tbl 3759 .word 80b - ipred_z3_fill_padding_tbl 3760 .word 40b - ipred_z3_fill_padding_tbl 3761 .word 20b - ipred_z3_fill_padding_tbl 3762endjumptable 3763 3764function ipred_z3_fill_padding_wide 3765 // Fill a WxH rectangle with padding, with W > 8. 
3766 lsr x1, x1, #1 3767 mov w12, w3 3768 sub x1, x1, w3, uxtw #1 37691: 3770 ands w5, w3, #7 3771 b.eq 2f 3772 // If the width isn't aligned to 8, first do one 8 pixel write 3773 // and align the start pointer. 3774 sub w3, w3, w5 3775 st1 {v31.8h}, [x0] 3776 add x0, x0, w5, uxtw #1 37772: 3778 // Fill the rest of the line with aligned 8 pixel writes. 3779 subs w3, w3, #8 3780 st1 {v31.8h}, [x0], #16 3781 b.gt 2b 3782 subs w4, w4, #1 3783 add x0, x0, x1 3784 b.le 9f 3785 mov w3, w12 3786 b 1b 37879: 3788 ret 3789endfunc 3790 3791function ipred_z3_fill2_16bpc_neon, export=1 3792 cmp w4, #8 3793 add x10, x2, w6, uxtw // left[max_base_y] 3794 ld1r {v31.16b}, [x10] // padding 3795 mov w7, w5 3796 mov w15, #64 3797 add x13, x0, x1 3798 lsl x1, x1, #1 3799 b.eq 8f 3800 38014: // h == 4 3802 lsr w8, w7, #6 // base 3803 and w9, w7, #0x3e // frac 3804 add w7, w7, w5 // xpos += dx 3805 cmp w8, w6 // base >= max_base_x 3806 lsr w10, w7, #6 // base 3807 and w11, w7, #0x3e // frac 3808 b.ge ipred_z3_fill_padding_neon 3809 lsl w8, w8, #1 3810 lsl w10, w10, #1 3811 ldr q0, [x2, w8, uxtw] // top[base] 3812 ldr q2, [x2, w10, uxtw] 3813 dup v4.4h, w9 // frac 3814 dup v5.4h, w11 3815 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] 3816 uzp1 v0.8h, v0.8h, v0.8h // top[base] 3817 uzp2 v3.8h, v2.8h, v2.8h 3818 uzp1 v2.8h, v2.8h, v2.8h 3819 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] 3820 sub v7.4h, v3.4h, v2.4h 3821 ushll v16.4s, v0.4h, #6 // top[base]*64 3822 ushll v17.4s, v2.4h, #6 3823 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac 3824 smlal v17.4s, v7.4h, v5.4h 3825 rshrn v16.4h, v16.4s, #6 3826 rshrn v17.4h, v17.4s, #6 3827 subs w3, w3, #2 3828 zip1 v18.8h, v16.8h, v17.8h 3829 st1 {v18.s}[0], [x0], x1 3830 st1 {v18.s}[1], [x13], x1 3831 add w7, w7, w5 // xpos += dx 3832 st1 {v18.s}[2], [x0] 3833 st1 {v18.s}[3], [x13] 3834 b.le 9f 3835 sub x0, x0, x1 // ptr -= 4 * (2*stride) 3836 sub x13, x13, x1 3837 add x0, x0, #4 3838 add x13, x13, #4 3839 b 4b 38409: 3841 ret 3842 38438: // h 
== 8 3844 lsr w8, w7, #6 // base 3845 and w9, w7, #0x3e // frac 3846 add w7, w7, w5 // xpos += dx 3847 cmp w8, w6 // base >= max_base_x 3848 lsr w10, w7, #6 // base 3849 and w11, w7, #0x3e // frac 3850 b.ge ipred_z3_fill_padding_neon 3851 add x8, x2, w8, uxtw #1 3852 add x10, x2, w10, uxtw #1 3853 dup v4.8h, w9 // frac 3854 dup v5.8h, w11 3855 ld1 {v0.8h, v1.8h}, [x8] // top[base] 3856 ld1 {v2.8h, v3.8h}, [x10] 3857 sub w9, w15, w9 // 64 - frac 3858 sub w11, w15, w11 3859 dup v6.8h, w9 // 64 - frac 3860 dup v7.8h, w11 3861 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] 3862 uzp1 v0.8h, v0.8h, v1.8h // top[base] 3863 uzp2 v21.8h, v2.8h, v3.8h 3864 uzp1 v2.8h, v2.8h, v3.8h 3865 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) 3866 umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac 3867 umull2 v17.4s, v0.8h, v6.8h 3868 umlal2 v17.4s, v20.8h, v4.8h 3869 umull v18.4s, v2.4h, v7.4h 3870 umlal v18.4s, v21.4h, v5.4h 3871 umull2 v19.4s, v2.8h, v7.8h 3872 umlal2 v19.4s, v21.8h, v5.8h 3873 rshrn v16.4h, v16.4s, #6 3874 rshrn2 v16.8h, v17.4s, #6 3875 rshrn v17.4h, v18.4s, #6 3876 rshrn2 v17.8h, v19.4s, #6 3877 subs w3, w3, #2 3878 zip1 v18.8h, v16.8h, v17.8h 3879 zip2 v19.8h, v16.8h, v17.8h 3880 add w7, w7, w5 // xpos += dx 3881 st1 {v18.s}[0], [x0], x1 3882 st1 {v18.s}[1], [x13], x1 3883 st1 {v18.s}[2], [x0], x1 3884 st1 {v18.s}[3], [x13], x1 3885 st1 {v19.s}[0], [x0], x1 3886 st1 {v19.s}[1], [x13], x1 3887 st1 {v19.s}[2], [x0], x1 3888 st1 {v19.s}[3], [x13], x1 3889 b.le 9f 3890 sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) 3891 sub x13, x13, x1, lsl #2 3892 add x0, x0, #4 3893 add x13, x13, #4 3894 b 8b 38959: 3896 ret 3897endfunc 3898 3899 3900// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, 3901// const pixel *const topleft, 3902// const int width, const int height, const int filt_idx, 3903// const int max_width, const int max_height, 3904// const int bitdepth_max); 3905.macro filter_fn bpc 3906function ipred_filter_\bpc\()bpc_neon 3907 and w5, w5, #511 
        // Load the 7 filter-tap vectors for filt_idx (64 bytes per filter
        // set) and widen them from i8 to i16.
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        movrel          x5,  ipred_filter\bpc\()_tbl
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26
        ldrsw           x9,  [x5, w9, uxtw #2]
        sxtl            v16.8h, v16.8b
        sxtl            v17.8h, v17.8b
        add             x5,  x5,  x9
        sxtl            v18.8h, v18.8b
        sxtl            v19.8h, v19.8b
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        sxtl            v20.8h, v20.8b
        sxtl            v21.8h, v21.8b
        sxtl            v22.8h, v22.8b
        dup             v31.8h, w8              // clamp ceiling (bitdepth_max)
.if \bpc == 10
        // 10 bpc accumulates in 16 bit; needs an explicit floor of 0.
        movi            v30.8h, #0
.endif
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #2]           // top (0-3)
        sub             x2,  x2,  #4
        mov             x7,  #-4
4:
        ld1             {v1.4h}, [x2], x7       // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h, v17.8h, v0.h[0]  // p1(top[0]) * filter(1)
        mla             v2.8h, v18.8h, v0.h[1]  // p2(top[1]) * filter(2)
        mla             v2.8h, v19.8h, v0.h[2]  // p3(top[2]) * filter(3)
        mla             v2.8h, v20.8h, v0.h[3]  // p4(top[3]) * filter(4)
        mla             v2.8h, v16.8h, v1.h[2]  // p0(topleft) * filter(0)
        mla             v2.8h, v21.8h, v1.h[1]  // p5(left[0]) * filter(5)
        mla             v2.8h, v22.8h, v1.h[0]  // p6(left[1]) * filter(6)
        srshr           v2.8h, v2.8h, #4
        smax            v2.8h, v2.8h, v30.8h
.else
        // 12 bpc: widen to 32 bit to avoid overflow, then saturating-narrow.
        smull           v2.4s, v17.4h, v0.h[0]  // p1(top[0]) * filter(1)
        smlal           v2.4s, v18.4h, v0.h[1]  // p2(top[1]) * filter(2)
        smlal           v2.4s, v19.4h, v0.h[2]  // p3(top[2]) * filter(3)
        smlal           v2.4s, v20.4h, v0.h[3]  // p4(top[3]) * filter(4)
        smlal           v2.4s, v16.4h, v1.h[2]  // p0(topleft) * filter(0)
        smlal           v2.4s, v21.4h, v1.h[1]  // p5(left[0]) * filter(5)
        smlal           v2.4s, v22.4h, v1.h[0]  // p6(left[1]) * filter(6)
        smull2          v3.4s, v17.8h, v0.h[0]  // p1(top[0]) * filter(1)
        smlal2          v3.4s, v18.8h, v0.h[1]  // p2(top[1]) * filter(2)
        smlal2          v3.4s, v19.8h, v0.h[2]  // p3(top[2]) * filter(3)
        smlal2          v3.4s, v20.8h, v0.h[3]  // p4(top[3]) * filter(4)
        smlal2          v3.4s, v16.8h, v1.h[2]  // p0(topleft) * filter(0)
        smlal2          v3.4s, v21.8h, v1.h[1]  // p5(left[0]) * filter(5)
        smlal2          v3.4s, v22.8h, v1.h[0]  // p6(left[1]) * filter(6)
        sqrshrun        v2.4h, v2.4s, #4
        sqrshrun2       v2.8h, v3.4s, #4
.endif
        smin            v2.8h, v2.8h, v31.8h
        subs            w4,  w4,  #2
        st1             {v2.d}[0], [x0], x1
        ext             v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
        st1             {v2.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ldur            q0,  [x2, #2]           // top (0-7)
        sub             x2,  x2,  #4
        mov             x7,  #-4
8:
        ld1             {v1.4h}, [x2], x7       // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h, v17.8h, v0.h[0]  // p1(top[0]) * filter(1)
        mla             v2.8h, v18.8h, v0.h[1]  // p2(top[1]) * filter(2)
        mla             v2.8h, v19.8h, v0.h[2]  // p3(top[2]) * filter(3)
        mla             v2.8h, v20.8h, v0.h[3]  // p4(top[3]) * filter(4)
        mla             v2.8h, v16.8h, v1.h[2]  // p0(topleft) * filter(0)
        mla             v2.8h, v21.8h, v1.h[1]  // p5(left[0]) * filter(5)
        mla             v2.8h, v22.8h, v1.h[0]  // p6(left[1]) * filter(6)
        mul             v3.8h, v17.8h, v0.h[4]  // p1(top[0]) * filter(1)
        mla             v3.8h, v18.8h, v0.h[5]  // p2(top[1]) * filter(2)
        mla             v3.8h, v19.8h, v0.h[6]  // p3(top[2]) * filter(3)
        srshr           v2.8h, v2.8h, #4
        smax            v2.8h, v2.8h, v30.8h
        smin            v2.8h, v2.8h, v31.8h
        // The right 4x2 block feeds on the just-computed left half
        // (v2 lanes 3/7 act as its left/topleft neighbours).
        mla             v3.8h, v20.8h, v0.h[7]  // p4(top[3]) * filter(4)
        mla             v3.8h, v16.8h, v0.h[3]  // p0(topleft) * filter(0)
        mla             v3.8h, v21.8h, v2.h[3]  // p5(left[0]) * filter(5)
        mla             v3.8h, v22.8h, v2.h[7]  // p6(left[1]) * filter(6)
        srshr           v3.8h, v3.8h, #4
        smax            v3.8h, v3.8h, v30.8h
.else
        smull           v2.4s, v17.4h, v0.h[0]  // p1(top[0]) * filter(1)
        smlal           v2.4s, v18.4h, v0.h[1]  // p2(top[1]) * filter(2)
        smlal           v2.4s, v19.4h, v0.h[2]  // p3(top[2]) * filter(3)
        smlal           v2.4s, v20.4h, v0.h[3]  // p4(top[3]) * filter(4)
        smlal           v2.4s, v16.4h, v1.h[2]  // p0(topleft) * filter(0)
        smlal           v2.4s, v21.4h, v1.h[1]  // p5(left[0]) * filter(5)
        smlal           v2.4s, v22.4h, v1.h[0]  // p6(left[1]) * filter(6)
        smull2          v3.4s, v17.8h, v0.h[0]  // p1(top[0]) * filter(1)
        smlal2          v3.4s, v18.8h, v0.h[1]  // p2(top[1]) * filter(2)
        smlal2          v3.4s, v19.8h, v0.h[2]  // p3(top[2]) * filter(3)
        smlal2          v3.4s, v20.8h, v0.h[3]  // p4(top[3]) * filter(4)
        smlal2          v3.4s, v16.8h, v1.h[2]  // p0(topleft) * filter(0)
        smlal2          v3.4s, v21.8h, v1.h[1]  // p5(left[0]) * filter(5)
        smlal2          v3.4s, v22.8h, v1.h[0]  // p6(left[1]) * filter(6)
        smull           v4.4s, v17.4h, v0.h[4]  // p1(top[0]) * filter(1)
        smlal           v4.4s, v18.4h, v0.h[5]  // p2(top[1]) * filter(2)
        smlal           v4.4s, v19.4h, v0.h[6]  // p3(top[2]) * filter(3)
        sqrshrun        v2.4h, v2.4s, #4
        sqrshrun2       v2.8h, v3.4s, #4
        smin            v2.8h, v2.8h, v31.8h
        smlal           v4.4s, v20.4h, v0.h[7]  // p4(top[3]) * filter(4)
        smlal           v4.4s, v16.4h, v0.h[3]  // p0(topleft) * filter(0)
        smlal           v4.4s, v21.4h, v2.h[3]  // p5(left[0]) * filter(5)
        smlal           v4.4s, v22.4h, v2.h[7]  // p6(left[1]) * filter(6)
        smull2          v5.4s, v17.8h, v0.h[4]  // p1(top[0]) * filter(1)
        smlal2          v5.4s, v18.8h, v0.h[5]  // p2(top[1]) * filter(2)
        smlal2          v5.4s, v19.8h, v0.h[6]  // p3(top[2]) * filter(3)
        smlal2          v5.4s, v20.8h, v0.h[7]  // p4(top[3]) * filter(4)
        smlal2          v5.4s, v16.8h, v0.h[3]  // p0(topleft) * filter(0)
        smlal2          v5.4s, v21.8h, v2.h[3]  // p5(left[0]) * filter(5)
        smlal2          v5.4s, v22.8h, v2.h[7]  // p6(left[1]) * filter(6)
        sqrshrun        v3.4h, v4.4s, #4
        sqrshrun2       v3.8h, v5.4s, #4
.endif
        smin            v3.8h, v3.8h, v31.8h
        subs            w4,  w4,  #2
        st2             {v2.d, v3.d}[0], [x0], x1
        zip2            v0.2d, v2.2d, v3.2d     // bottom row becomes next top
        st2             {v2.d, v3.d}[1], [x6], x1
        b.gt            8b
        ret
160:
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #2
        sub             x2,  x2,  #4
        mov             x7,  #-4
        sub             x1,  x1,  w3, uxtw #1
        mov             w9,  w3                 // remember full width

1:
        ld1             {v0.4h}, [x2], x7       // left (0-1) + topleft (2)
2:
        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
        // Four 4x2 blocks per iteration; each block's left/topleft inputs
        // come from the previous block's clamped result (lanes 3/7).
        mul             v3.8h, v16.8h, v0.h[2]  // p0(topleft) * filter(0)
        mla             v3.8h, v21.8h, v0.h[1]  // p5(left[0]) * filter(5)
        mla             v3.8h, v22.8h, v0.h[0]  // p6(left[1]) * filter(6)
        mla             v3.8h, v17.8h, v1.h[0]  // p1(top[0]) * filter(1)
        mla             v3.8h, v18.8h, v1.h[1]  // p2(top[1]) * filter(2)
        mla             v3.8h, v19.8h, v1.h[2]  // p3(top[2]) * filter(3)
        mla             v3.8h, v20.8h, v1.h[3]  // p4(top[3]) * filter(4)

        mul             v4.8h, v17.8h, v1.h[4]  // p1(top[0]) * filter(1)
        mla             v4.8h, v18.8h, v1.h[5]  // p2(top[1]) * filter(2)
        mla             v4.8h, v19.8h, v1.h[6]  // p3(top[2]) * filter(3)
        srshr           v3.8h, v3.8h, #4
        smax            v3.8h, v3.8h, v30.8h
        smin            v3.8h, v3.8h, v31.8h
        mla             v4.8h, v20.8h, v1.h[7]  // p4(top[3]) * filter(4)
        mla             v4.8h, v16.8h, v1.h[3]  // p0(topleft) * filter(0)
        mla             v4.8h, v21.8h, v3.h[3]  // p5(left[0]) * filter(5)
        mla             v4.8h, v22.8h, v3.h[7]  // p6(left[1]) * filter(6)

        mul             v5.8h, v17.8h, v2.h[0]  // p1(top[0]) * filter(1)
        mla             v5.8h, v18.8h, v2.h[1]  // p2(top[1]) * filter(2)
        mla             v5.8h, v19.8h, v2.h[2]  // p3(top[2]) * filter(3)
        srshr           v4.8h, v4.8h, #4
        smax            v4.8h, v4.8h, v30.8h
        smin            v4.8h, v4.8h, v31.8h
        mla             v5.8h, v20.8h, v2.h[3]  // p4(top[3]) * filter(4)
        mla             v5.8h, v16.8h, v1.h[7]  // p0(topleft) * filter(0)
        mla             v5.8h, v21.8h, v4.h[3]  // p5(left[0]) * filter(5)
        mla             v5.8h, v22.8h, v4.h[7]  // p6(left[1]) * filter(6)

        mul             v6.8h, v17.8h, v2.h[4]  // p1(top[0]) * filter(1)
        mla             v6.8h, v18.8h, v2.h[5]  // p2(top[1]) * filter(2)
        mla             v6.8h, v19.8h, v2.h[6]  // p3(top[2]) * filter(3)
        srshr           v5.8h, v5.8h, #4
        smax            v5.8h, v5.8h, v30.8h
        smin            v5.8h, v5.8h, v31.8h
        mla             v6.8h, v20.8h, v2.h[7]  // p4(top[3]) * filter(4)
        mla             v6.8h, v16.8h, v2.h[3]  // p0(topleft) * filter(0)
        mla             v6.8h, v21.8h, v5.h[3]  // p5(left[0]) * filter(5)
        mla             v6.8h, v22.8h, v5.h[7]  // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        srshr           v6.8h, v6.8h, #4
        smax            v6.8h, v6.8h, v30.8h
.else
        smull           v3.4s, v16.4h, v0.h[2]  // p0(topleft) * filter(0)
        smlal           v3.4s, v21.4h, v0.h[1]  // p5(left[0]) * filter(5)
        smlal           v3.4s, v22.4h, v0.h[0]  // p6(left[1]) * filter(6)
        smlal           v3.4s, v17.4h, v1.h[0]  // p1(top[0]) * filter(1)
        smlal           v3.4s, v18.4h, v1.h[1]  // p2(top[1]) * filter(2)
        smlal           v3.4s, v19.4h, v1.h[2]  // p3(top[2]) * filter(3)
        smlal           v3.4s, v20.4h, v1.h[3]  // p4(top[3]) * filter(4)
        smull2          v4.4s, v16.8h, v0.h[2]  // p0(topleft) * filter(0)
        smlal2          v4.4s, v21.8h, v0.h[1]  // p5(left[0]) * filter(5)
        smlal2          v4.4s, v22.8h, v0.h[0]  // p6(left[1]) * filter(6)
        smlal2          v4.4s, v17.8h, v1.h[0]  // p1(top[0]) * filter(1)
        smlal2          v4.4s, v18.8h, v1.h[1]  // p2(top[1]) * filter(2)
        smlal2          v4.4s, v19.8h, v1.h[2]  // p3(top[2]) * filter(3)
        smlal2          v4.4s, v20.8h, v1.h[3]  // p4(top[3]) * filter(4)

        smull           v5.4s, v17.4h, v1.h[4]  // p1(top[0]) * filter(1)
        smlal           v5.4s, v18.4h, v1.h[5]  // p2(top[1]) * filter(2)
        smlal           v5.4s, v19.4h, v1.h[6]  // p3(top[2]) * filter(3)
        sqrshrun        v3.4h, v3.4s, #4
        sqrshrun2       v3.8h, v4.4s, #4
        smin            v3.8h, v3.8h, v31.8h
        smlal           v5.4s, v20.4h, v1.h[7]  // p4(top[3]) * filter(4)
        smlal           v5.4s, v16.4h, v1.h[3]  // p0(topleft) * filter(0)
        smlal           v5.4s, v21.4h, v3.h[3]  // p5(left[0]) * filter(5)
        smlal           v5.4s, v22.4h, v3.h[7]  // p6(left[1]) * filter(6)
        smull2          v6.4s, v17.8h, v1.h[4]  // p1(top[0]) * filter(1)
        smlal2          v6.4s, v18.8h, v1.h[5]  // p2(top[1]) * filter(2)
        smlal2          v6.4s, v19.8h, v1.h[6]  // p3(top[2]) * filter(3)
        smlal2          v6.4s, v20.8h, v1.h[7]  // p4(top[3]) * filter(4)
        smlal2          v6.4s, v16.8h, v1.h[3]  // p0(topleft) * filter(0)
        smlal2          v6.4s, v21.8h, v3.h[3]  // p5(left[0]) * filter(5)
        smlal2          v6.4s, v22.8h, v3.h[7]  // p6(left[1]) * filter(6)

        smull           v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
        smlal           v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
        smlal           v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.4h, v5.4s, #4
        sqrshrun2       v4.8h, v6.4s, #4
        smin            v4.8h, v4.8h, v31.8h
        smlal           v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
        smlal           v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
        smlal           v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
        smlal           v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
        smull2          v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
        smlal2          v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
        smlal2          v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
        smlal2          v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
        smlal2          v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
        smlal2          v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
        smlal2          v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)

        smull           v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
        smlal           v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
        smlal           v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.4h, v24.4s, #4
        sqrshrun2       v5.8h, v25.4s, #4
        smin            v5.8h, v5.8h, v31.8h
        smlal           v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
        smlal           v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
        smlal           v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
        smlal           v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
        smull2          v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
        smlal2          v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
        smlal2          v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
        smlal2          v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
        smlal2          v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
        smlal2          v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
        smlal2          v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.4h, v26.4s, #4
        sqrshrun2       v6.8h, v27.4s, #4
.endif
        smin            v6.8h, v6.8h, v31.8h

        // Stash next iteration's left (0-1) + topleft (2) from the
        // rightmost results, then scatter the four 4x2 blocks.
        ins             v0.h[2], v2.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins             v0.h[0], v6.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins             v0.h[1], v6.h[3]
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw #1   // top for next 2 rows = row above
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                 // reset width counter
        b               1b
9:
        ret
endfunc

jumptable ipred_filter\bpc\()_tbl
        .word 320b - ipred_filter\bpc\()_tbl
        .word 160b - ipred_filter\bpc\()_tbl
        .word 80b  - ipred_filter\bpc\()_tbl
        .word 40b  - ipred_filter\bpc\()_tbl
endjumptable
.endm

filter_fn 10
filter_fn 12

// Dispatch on bitdepth_max (9th arg, on the stack): <= 0x3ff means 10 bpc.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]
        cmp             w8,  0x3ff
        b.le            ipred_filter_10bpc_neon
        b               ipred_filter_12bpc_neon
endfunc

// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const pal, const uint8_t *idx,
//                          const int w, const int h);
//
// Palette prediction: idx packs two 3-bit palette indices per byte.
// Each index is expanded to a pair of byte offsets (2*i, 2*i+1) so that
// tbl on the 8x16-bit palette in v30 gathers full 16-bit pixels
// (v31 = 0x0100 per lane adds the +1 to the high byte's offset).
function pal_pred_16bpc_neon, export=1
        ld1             {v30.8h}, [x2]          // palette (8 x u16)
        clz             w9,  w4
        movrel          x6,  pal_pred_tbl
        sub             w9,  w9,  #25
        movi            v29.16b, #7             // low-nibble index mask
        ldrsw           x9,  [x6, w9, uxtw #2]
        movi            v31.8h, #1, lsl #8      // 0x0100: +1 for odd byte lanes
        add             x6,  x6,  x9
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
4:
        ld1             {v1.8b}, [x3], #8
        subs            w5,  w5,  #4
        ushr            v3.8b, v1.8b, #4        // odd indices (high nibble)
        and             v2.8b, v1.8b, v29.8b    // even indices (low nibble)
        zip1            v1.16b, v2.16b, v3.16b
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        add             v1.16b, v1.16b, v1.16b  // 2*idx (byte offset of u16)
        zip1            v0.16b, v1.16b, v1.16b
        zip2            v1.16b, v1.16b, v1.16b
        add             v0.8h,  v0.8h,  v31.8h  // (2*idx, 2*idx+1) byte pairs
        add             v1.8h,  v1.8h,  v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b // gather u16 pixels from palette
        st1             {v0.d}[0], [x0], x1
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.d}[1], [x2], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x2], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
8:
        ld1             {v2.16b}, [x3], #16
        subs            w5,  w5,  #4
        ushr            v4.16b, v2.16b, #4      // odd indices (high nibble)
        and             v3.16b, v2.16b, v29.16b // even indices (low nibble)
        zip1            v2.16b, v3.16b, v4.16b
        zip2            v3.16b, v3.16b, v4.16b
        add             v2.16b, v2.16b, v2.16b  // 2*idx
        add             v3.16b, v3.16b, v3.16b
        zip1            v0.16b, v2.16b, v2.16b
        zip2            v1.16b, v2.16b, v2.16b
        zip1            v2.16b, v3.16b, v3.16b
        zip2            v3.16b, v3.16b, v3.16b
        add             v0.8h,  v0.8h,  v31.8h  // (2*idx, 2*idx+1) byte pairs
        add             v1.8h,  v1.8h,  v31.8h
        add             v2.8h,  v2.8h,  v31.8h
        add             v3.8h,  v3.8h,  v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.8h}, [x0], x1
        tbl             v2.16b, {v30.16b}, v2.16b
        st1             {v1.8h}, [x2], x1
        tbl             v3.16b, {v30.16b}, v3.16b
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x2], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
16:
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #4
        ushr            v7.16b, v4.16b, #4      // odd indices (high nibble)
        and             v6.16b, v4.16b, v29.16b // even indices (low nibble)
        ushr            v3.16b, v5.16b, #4
        and             v2.16b, v5.16b, v29.16b
        zip1            v4.16b, v6.16b, v7.16b
        zip2            v5.16b, v6.16b, v7.16b
        zip1            v6.16b, v2.16b, v3.16b
        zip2            v7.16b, v2.16b, v3.16b
        add             v4.16b, v4.16b, v4.16b  // 2*idx
        add             v5.16b, v5.16b, v5.16b
        add             v6.16b, v6.16b, v6.16b
        add             v7.16b, v7.16b, v7.16b
        zip1            v0.16b, v4.16b, v4.16b
        zip2            v1.16b, v4.16b, v4.16b
        zip1            v2.16b, v5.16b, v5.16b
        zip2            v3.16b, v5.16b, v5.16b
        zip1            v4.16b, v6.16b, v6.16b
        zip2            v5.16b, v6.16b, v6.16b
        zip1            v6.16b, v7.16b, v7.16b
        zip2            v7.16b, v7.16b, v7.16b
        add             v0.8h,  v0.8h,  v31.8h  // (2*idx, 2*idx+1) byte pairs
        add             v1.8h,  v1.8h,  v31.8h
        add             v2.8h,  v2.8h,  v31.8h
        add             v3.8h,  v3.8h,  v31.8h
        add             v4.8h,  v4.8h,  v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,  v5.8h,  v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,  v6.8h,  v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,  v7.8h,  v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        st1             {v2.8h, v3.8h}, [x2], x1
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h}, [x0], x1
        st1             {v6.8h, v7.8h}, [x2], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
32:
        // Same expansion as the 16-wide case, but the 32 pixels cover one
        // row per pointer, so only 2 rows are produced per iteration.
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #2
        ushr            v7.16b, v4.16b, #4      // odd indices (high nibble)
        and             v6.16b, v4.16b, v29.16b // even indices (low nibble)
        ushr            v3.16b, v5.16b, #4
        and             v2.16b, v5.16b, v29.16b
        zip1            v4.16b, v6.16b, v7.16b
        zip2            v5.16b, v6.16b, v7.16b
        zip1            v6.16b, v2.16b, v3.16b
        zip2            v7.16b, v2.16b, v3.16b
        add             v4.16b, v4.16b, v4.16b  // 2*idx
        add             v5.16b, v5.16b, v5.16b
        add             v6.16b, v6.16b, v6.16b
        add             v7.16b, v7.16b, v7.16b
        zip1            v0.16b, v4.16b, v4.16b
        zip2            v1.16b, v4.16b, v4.16b
        zip1            v2.16b, v5.16b, v5.16b
        zip2            v3.16b, v5.16b, v5.16b
        zip1            v4.16b, v6.16b, v6.16b
        zip2            v5.16b, v6.16b, v6.16b
        zip1            v6.16b, v7.16b, v7.16b
        zip2            v7.16b, v7.16b, v7.16b
        add             v0.8h,  v0.8h,  v31.8h  // (2*idx, 2*idx+1) byte pairs
        add             v1.8h,  v1.8h,  v31.8h
        add             v2.8h,  v2.8h,  v31.8h
        add             v3.8h,  v3.8h,  v31.8h
        add             v4.8h,  v4.8h,  v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,  v5.8h,  v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,  v6.8h,  v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,  v7.8h,  v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  #64           // second half of the same row
64:
        // One 64-pixel row per iteration; x0/x2 cover its two halves.
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #1
        ushr            v7.16b, v4.16b, #4      // odd indices (high nibble)
        and             v6.16b, v4.16b, v29.16b // even indices (low nibble)
        ushr            v3.16b, v5.16b, #4
        and             v2.16b, v5.16b, v29.16b
        zip1            v4.16b, v6.16b, v7.16b
        zip2            v5.16b, v6.16b, v7.16b
        zip1            v6.16b, v2.16b, v3.16b
        zip2            v7.16b, v2.16b, v3.16b
        add             v4.16b, v4.16b, v4.16b  // 2*idx
        add             v5.16b, v5.16b, v5.16b
        add             v6.16b, v6.16b, v6.16b
        add             v7.16b, v7.16b, v7.16b
        zip1            v0.16b, v4.16b, v4.16b
        zip2            v1.16b, v4.16b, v4.16b
        zip1            v2.16b, v5.16b, v5.16b
        zip2            v3.16b, v5.16b, v5.16b
        zip1            v4.16b, v6.16b, v6.16b
        zip2            v5.16b, v6.16b, v6.16b
        zip1            v6.16b, v7.16b, v7.16b
        zip2            v7.16b, v7.16b, v7.16b
        add             v0.8h,  v0.8h,  v31.8h  // (2*idx, 2*idx+1) byte pairs
        add             v1.8h,  v1.8h,  v31.8h
        add             v2.8h,  v2.8h,  v31.8h
        add             v3.8h,  v3.8h,  v31.8h
        add             v4.8h,  v4.8h,  v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,  v5.8h,  v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,  v6.8h,  v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,  v7.8h,  v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            64b
        ret
endfunc

jumptable pal_pred_tbl
        .word 640b - pal_pred_tbl
        .word 320b - pal_pred_tbl
        .word 160b - pal_pred_tbl
        .word 80b  - pal_pred_tbl
        .word 40b  - pal_pred_tbl
endjumptable

// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
// CfL prediction with an implicit mid-gray DC ((bitdepth_max + 1) >> 1);
// no edge pixels are read.  Sets up dc (v0), alpha (v1), 0 (v30) and
// bitdepth_max (v31), then dispatches to the shared splat writers below.
function ipred_cfl_128_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        movrel          x7,  ipred_cfl_128_tbl
        sub             w9,  w9,  #26
        ldrsw           x9,  [x7, w9, uxtw #2]
        urshr           v0.8h,   v31.8h,  #1      // dc = (bitdepth_max + 1) >> 1
        dup             v1.8h,   w6   // alpha
        add             x7,  x7,  x9
        add             x6,  x0,  x1              // x6 = dst + stride (second row)
        lsl             x1,  x1,  #1              // step two rows per iteration
        movi            v30.8h,  #0               // lower clamp bound
        br              x7
// Shared CfL writers: dst = clamp(dc + ((ac * alpha, rounded) >> 6), 0, bitdepth_max).
// Expect: v0 = dc splat, v1 = alpha, v30 = 0, v31 = bitdepth_max,
// x5 = ac buffer, w3 = width, w4 = height, x0/x6 = two row pointers.
// Also reached from the top/left/full cfl entry points via ipred_cfl_splat_tbl.
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // four rows of 4 per iteration (16 ac entries)
        ld1             {v4.8h, v5.8h},  [x5], #32
        subs            w4,  w4,  #4
        smull           v2.4s,   v4.4h,  v1.4h    // diff = ac * alpha
        smull2          v3.4s,   v4.8h,  v1.8h
        smull           v4.4s,   v5.4h,  v1.4h
        smull2          v5.4s,   v5.8h,  v1.8h
        cmlt            v16.4s,  v2.4s,  #0       // sign
        cmlt            v17.4s,  v3.4s,  #0
        cmlt            v18.4s,  v4.4s,  #0
        cmlt            v19.4s,  v5.4s,  #0
        add             v2.4s,   v2.4s,  v16.4s   // diff + sign
        add             v3.4s,   v3.4s,  v17.4s
        add             v4.4s,   v4.4s,  v18.4s
        add             v5.4s,   v5.4s,  v19.4s
        rshrn           v2.4h,   v2.4s,  #6       // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,  #6
        rshrn           v3.4h,   v4.4s,  #6
        rshrn2          v3.8h,   v5.4s,  #6
        add             v2.8h,   v2.8h,  v0.8h    // dc + apply_sign()
        add             v3.8h,   v3.8h,  v0.8h
        smax            v2.8h,   v2.8h,  v30.8h   // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,  v30.8h
        smin            v2.8h,   v2.8h,  v31.8h
        smin            v3.8h,   v3.8h,  v31.8h
        st1             {v2.d}[0],  [x0], x1
        st1             {v2.d}[1],  [x6], x1
        st1             {v3.d}[0],  [x0], x1
        st1             {v3.d}[1],  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
1:      // two rows of 8 per iteration
        ld1             {v4.8h, v5.8h},  [x5], #32
        subs            w4,  w4,  #2
        smull           v2.4s,   v4.4h,  v1.4h    // diff = ac * alpha
        smull2          v3.4s,   v4.8h,  v1.8h
        smull           v4.4s,   v5.4h,  v1.4h
        smull2          v5.4s,   v5.8h,  v1.8h
        cmlt            v16.4s,  v2.4s,  #0       // sign
        cmlt            v17.4s,  v3.4s,  #0
        cmlt            v18.4s,  v4.4s,  #0
        cmlt            v19.4s,  v5.4s,  #0
        add             v2.4s,   v2.4s,  v16.4s   // diff + sign
        add             v3.4s,   v3.4s,  v17.4s
        add             v4.4s,   v4.4s,  v18.4s
        add             v5.4s,   v5.4s,  v19.4s
        rshrn           v2.4h,   v2.4s,  #6       // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,  #6
        rshrn           v3.4h,   v4.4s,  #6
        rshrn2          v3.8h,   v5.4s,  #6
        add             v2.8h,   v2.8h,  v0.8h    // dc + apply_sign()
        add             v3.8h,   v3.8h,  v0.8h
        smax            v2.8h,   v2.8h,  v30.8h
        smax            v3.8h,   v3.8h,  v30.8h
        smin            v2.8h,   v2.8h,  v31.8h
        smin            v3.8h,   v3.8h,  v31.8h
        st1             {v2.8h},  [x0], x1
        st1             {v3.8h},  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        // w >= 16: inner loop over the width (w3), outer loop over row
        // pairs (w4); x5/x7 walk two ac rows, x0/x6 two dst rows.
        add             x7,  x5,  w3, uxtw #1     // x7 = second ac row
        sub             x1,  x1,  w3, uxtw #1     // stride minus bytes written per row
        mov             w9,  w3                   // remember width for reload
1:
        ld1             {v2.8h, v3.8h},  [x5], #32
        ld1             {v4.8h, v5.8h},  [x7], #32
        subs            w3,  w3,  #16
        smull           v16.4s,  v2.4h,  v1.4h    // diff = ac * alpha
        smull2          v17.4s,  v2.8h,  v1.8h
        smull           v18.4s,  v3.4h,  v1.4h
        smull2          v19.4s,  v3.8h,  v1.8h
        smull           v2.4s,   v4.4h,  v1.4h
        smull2          v3.4s,   v4.8h,  v1.8h
        smull           v4.4s,   v5.4h,  v1.4h
        smull2          v5.4s,   v5.8h,  v1.8h
        cmlt            v20.4s,  v16.4s, #0       // sign
        cmlt            v21.4s,  v17.4s, #0
        cmlt            v22.4s,  v18.4s, #0
        cmlt            v23.4s,  v19.4s, #0
        cmlt            v24.4s,  v2.4s,  #0
        cmlt            v25.4s,  v3.4s,  #0
        cmlt            v26.4s,  v4.4s,  #0
        cmlt            v27.4s,  v5.4s,  #0
        add             v16.4s,  v16.4s, v20.4s   // diff + sign
        add             v17.4s,  v17.4s, v21.4s
        add             v18.4s,  v18.4s, v22.4s
        add             v19.4s,  v19.4s, v23.4s
        add             v2.4s,   v2.4s,  v24.4s
        add             v3.4s,   v3.4s,  v25.4s
        add             v4.4s,   v4.4s,  v26.4s
        add             v5.4s,   v5.4s,  v27.4s
        rshrn           v16.4h,  v16.4s, #6       // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v16.8h,  v17.4s, #6
        rshrn           v17.4h,  v18.4s, #6
        rshrn2          v17.8h,  v19.4s, #6
        rshrn           v6.4h,   v2.4s,  #6
        rshrn2          v6.8h,   v3.4s,  #6
        rshrn           v7.4h,   v4.4s,  #6
        rshrn2          v7.8h,   v5.4s,  #6
        add             v2.8h,   v16.8h, v0.8h    // dc + apply_sign()
        add             v3.8h,   v17.8h, v0.8h
        add             v4.8h,   v6.8h,  v0.8h
        add             v5.8h,   v7.8h,  v0.8h
        smax            v2.8h,   v2.8h,  v30.8h
        smax            v3.8h,   v3.8h,  v30.8h
        smax            v4.8h,   v4.8h,  v30.8h
        smax            v5.8h,   v5.8h,  v30.8h
        smin            v2.8h,   v2.8h,  v31.8h
        smin            v3.8h,   v3.8h,  v31.8h
        smin            v4.8h,   v4.8h,  v31.8h
        smin            v5.8h,   v5.8h,  v31.8h
        st1             {v2.8h, v3.8h},  [x0], #32
        st1             {v4.8h, v5.8h},  [x6], #32
        b.gt            1b
        // advance to the next row pair
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1     // skip the ac row x7 consumed
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // reload width counter
        b.gt            1b
        ret
endfunc

jumptable ipred_cfl_128_tbl
ipred_cfl_splat_tbl:
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w8)  - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w4)  - ipred_cfl_128_tbl
endjumptable

// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
// CfL prediction where dc is the rounded average of the top edge only,
// then falls through into the shared splat writers above.
function ipred_cfl_top_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        movrel          x7,  ipred_cfl_top_tbl
        sub             w9,  w9,  #26
        ldrsw           x9,  [x7, w9, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #2              // skip topleft to the top row
        add             x7,  x7,  x9
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7
4:      // width 4: dc = (sum of 4 + 2) >> 2
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,  v0.4h
        urshr           v0.4h,   v0.4h,  #2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:      // width 8
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,  v0.8h
        urshr           v0.4h,   v0.4h,  #3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:     // width 16
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h},  [x2]
        addp            v0.8h,   v2.8h,  v3.8h
        addv            h0,  v0.8h
        urshr           v0.4h,   v0.4h,  #4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:     // width 32: sum can exceed 16 bits, widen before rounding
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h},  [x2]
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v0.8h,   v2.8h,  v4.8h
        uaddlv          s0,  v0.8h
        rshrn           v0.4h,   v0.4s,  #5
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
endfunc

jumptable ipred_cfl_top_tbl
        .word 32b - ipred_cfl_top_tbl
        .word 16b - ipred_cfl_top_tbl
        .word 8b  - ipred_cfl_top_tbl
        .word 4b  - ipred_cfl_top_tbl
endjumptable

// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
// CfL prediction where dc is the rounded average of the left edge only.
// Dispatches twice: first on height (dc computation), then via x9 on
// width (the shared splat writers).
function ipred_cfl_left_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1     // point at start of left column
        clz             w9,  w3
        clz             w8,  w4
        movrel          x10, ipred_cfl_splat_tbl
        movrel          x7,  ipred_cfl_left_tbl
        sub             w9,  w9,  #26
        sub             w8,  w8,  #26
        ldrsw           x9,  [x10, w9, uxtw #2]
        ldrsw           x8,  [x7,  w8, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x9,  x10, x9              // x9 = splat writer for this width
        add             x7,  x7,  x8
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,  v0.4h
        urshr           v0.4h,   v0.4h,  #2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,  v0.8h
        urshr           v0.4h,   v0.4h,  #3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h},  [x2]
        addp            v0.8h,   v2.8h,  v3.8h
        addv            h0,  v0.8h
        urshr           v0.4h,   v0.4h,  #4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h},  [x2]
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v0.8h,   v2.8h,  v4.8h
        uaddlv          s0,  v0.8h                // widen: 32-element sum may overflow 16 bits
        rshrn           v0.4h,   v0.4s,  #5
        dup             v0.8h,   v0.h[0]
        br              x9
endfunc

jumptable ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
endjumptable

// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                           const pixel
//                           *const topleft,
//                           const int width, const int height,
//                           const int16_t *ac, const int alpha,
//                           const int bitdepth_max);
// Full CfL prediction: dc = rounded average of both the top and left
// edges, i.e. (sum_top + sum_left + (w+h)/2) / (w+h).  When w == h the
// division is a shift (v17 = -ctz(w+h)); otherwise the shift result is
// corrected by a fixed-point reciprocal multiply (0x6667 / 0xAAAB, >>17)
// selected per the 1:4 / 1:2 width:height ratio.
// Dispatches twice through ipred_cfl_tbl: first on height (h* sum of the
// left edge), then via x9 on width (w* adds the top edge and finishes).
function ipred_cfl_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1     // point at start of left column
        add             w8,  w3,  w4  // width + height
        dup             v1.8h,   w6   // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.4s,  w8   // width + height
        movrel          x7,  ipred_cfl_tbl
        rbit            w8,  w8       // rbit(width + height)
        sub             w9,  w9,  #22 // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8       // ctz(width + height)
        ldrsw           x9,  [x7, w9, uxtw #2]
        ldrsw           x6,  [x7, w6, uxtw #2]
        neg             w8,  w8       // -ctz(width + height)
        add             x9,  x7,  x9              // x9 = w* handler for this width
        add             x7,  x7,  x6              // x7 = h* handler for this height
        ushr            v16.4s,  v16.4s, #1       // (width + height) >> 1 (rounding bias)
        dup             v17.4s,  w8   // -ctz(width + height)
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,  v0.4h                // s0 = sum of left edge
        add             x2,  x2,  #2              // step over topleft to the top row
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.4h},  [x2]
        add             v0.2s,   v0.2s,  v16.2s   // sum += (w+h)/2
        uaddlv          s2,  v2.4h                // sum of top edge
        cmp             w4,  #4
        add             v0.2s,   v0.2s,  v2.2s
        ushl            v0.2s,   v0.2s,  v17.2s   // >>= ctz(w+h)
        b.eq            1f
        // h = 8/16: correct the power-of-two shift with a reciprocal multiply
        cmp             w4,  #16
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,  v16.2s
        ushr            v0.2s,   v0.2s,  #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,  v0.8h
        add             x2,  x2,  #2
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h},  [x2]
        add             v0.2s,   v0.2s,  v16.2s
        uaddlv          s2,  v2.8h
        cmp             w4,  #8
        add             v0.2s,   v0.2s,  v2.2s
        ushl            v0.2s,   v0.2s,  v17.2s
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,  v16.2s
        ushr            v0.2s,   v0.2s,  #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h},  [x2], #32
        addp            v0.8h,   v2.8h,  v3.8h
        add             x2,  x2,  #2
        uaddlv          s0,  v0.8h
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h},  [x2]
        add             v0.2s,   v0.2s,  v16.2s
        addp            v2.8h,   v2.8h,  v3.8h
        uaddlv          s2,  v2.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,  v2.2s
        ushl            v0.2s,   v0.2s,  v17.2s
        b.eq            1f
        // h = 4/8/32
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,  v16.2s
        ushr            v0.2s,   v0.2s,  #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h},  [x2], #64
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v0.8h,   v2.8h,  v4.8h
        add             x2,  x2,  #2
        uaddlv          s0,  v0.8h
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h},  [x2]
        add             v0.4s,   v0.4s,  v16.4s
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v2.8h,   v2.8h,  v4.8h
        cmp             w4,  #32
        uaddlv          s2,  v2.8h
        add             v0.2s,   v0.2s,  v2.2s
        ushl            v0.2s,   v0.2s,  v17.2s
        b.eq            1f
        // h = 8/16
        cmp             w4,  #8
        mov             w16, #0x6667
        mov             w17, #0xAAAB
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,  v16.2s
        ushr            v0.2s,   v0.2s,  #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
endfunc

jumptable ipred_cfl_tbl
        .word L(ipred_cfl_h32) - ipred_cfl_tbl
        .word L(ipred_cfl_h16) - ipred_cfl_tbl
        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
        .word L(ipred_cfl_w32) - ipred_cfl_tbl
        .word L(ipred_cfl_w16) - ipred_cfl_tbl
        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
endjumptable

// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                            const ptrdiff_t stride, const int w_pad,
//                            const int h_pad, const int cw, const int ch);
// Builds the CfL AC buffer for 4:2:0: each output is the 2x2 luma sum << 1
// (i.e. the average scaled by 8), right/bottom padded by replication, with
// the buffer mean subtracted at the end.  Running sums are kept widened in
// v24-v27 so the final dc can be computed without a second pass of adds.
function ipred_cfl_ac_420_16bpc_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2              // h_pad in output rows
        movrel          x7,  ipred_cfl_ac_420_tbl
        sub             w8,  w8,  #27
        ldrsw           x8,  [x7, w8, uxtw #2]
        movi            v24.4s,  #0               // running sums for the dc
        movi            v25.4s,  #0
        movi            v26.4s,  #0
        movi            v27.4s,  #0
        add             x7,  x7,  x8
        sub             w8,  w6,  w4  // height - h_pad
        rbit            w9,  w5       // rbit(width)
        rbit            w10, w6       // rbit(height)
        clz             w9,  w9       // ctz(width)
        clz             w10, w10      // ctz(height)
        add             w9,  w9,  w10 // log2sz
        add             x10, x1,  x2              // x10 = second luma row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1              // step two luma rows at a time
        neg             v31.4s,  v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8h},  [x1],  x2
        ld1             {v1.8h},  [x10], x2
        ld1             {v2.8h},  [x1],  x2
        ld1             {v3.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v1.8h    // horizontal pairwise sums
        addp            v1.8h,   v1.8h,  v3.8h
        add             v0.8h,   v0.8h,  v1.8h
        shl             v0.8h,   v0.8h,  #1       // 2x2 sum * 2 = avg * 8
        subs            w8,  w8,  #2
        st1             {v0.8h},  [x0], #16
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        b.gt            1b
        // replicate the last output row into both halves for hpad
        trn2            v1.2d,   v0.2d,  v0.2d
        trn2            v0.2d,   v0.2d,  v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h},  [x0], #32
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        b.gt            2b
3:
L(ipred_cfl_ac_420_w4_calc_subtract_dc):
        // Aggregate the sums
        add             v24.4s,  v24.4s, v25.4s
        add             v26.4s,  v26.4s, v27.4s
        add             v0.4s,   v24.4s, v26.4s
        addv            s0,  v0.4s    // sum
        sub             x0,  x0,  w6, uxtw #3     // rewind to buffer start
        urshl           v4.2s,   v0.2s,  v31.2s   // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h},  [x0]
        subs            w6,  w6,  #4
        sub             v0.8h,   v0.8h,  v4.8h
        sub             v1.8h,   v1.8h,  v4.8h
        st1             {v0.8h, v1.8h},  [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.8h, v1.8h},  [x1],  x2
        ld1             {v2.8h, v3.8h},  [x10], x2
        ld1             {v4.8h, v5.8h},  [x1],  x2
        addp            v0.8h,   v0.8h,  v1.8h
        ld1             {v6.8h, v7.8h},  [x10], x2
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v6.8h,   v6.8h,  v7.8h
        add             v0.8h,   v0.8h,  v2.8h
        add             v4.8h,   v4.8h,  v6.8h
        shl             v0.8h,   v0.8h,  #1
        shl             v1.8h,   v4.8h,  #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h},  [x0], #32
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        b.gt            1b
        mov             v0.16b,  v1.16b           // last row for hpad
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8h},  [x1],  x2
        ld1             {v1.8h},  [x10], x2
        ld1             {v2.8h},  [x1],  x2
        ld1             {v3.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v2.8h
        addp            v1.8h,   v1.8h,  v3.8h
        add             v0.8h,   v0.8h,  v1.8h
        shl             v0.8h,   v0.8h,  #1
        dup             v1.4h,   v0.h[3]          // replicate last valid column
        dup             v3.4h,   v0.h[7]
        trn2            v2.2d,   v0.2d,  v0.2d
        subs            w8,  w8,  #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h},  [x0], #32
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw           v25.4s,  v25.4s, v1.4h
        uaddw           v26.4s,  v26.4s, v2.4h
        uaddw           v27.4s,  v27.4s, v3.4h
        b.gt            1b
        trn1            v0.2d,   v2.2d,  v3.2d    // last padded row for hpad
        trn1            v1.2d,   v2.2d,  v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h},  [x0], #32
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        st1             {v0.8h, v1.8h},  [x0], #32
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        b.gt            2b
3:

        // Double the height and reuse the w4 summing/subtracting
        lsl             w6,  w6,  #1
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)

L(ipred_cfl_ac_420_w16):
        AARCH64_VALID_JUMP_TARGET
        movrel          x7,  ipred_cfl_ac_420_w16_tbl
        ldrsw           x3,  [x7, w3, uxtw #2]    // dispatch on w_pad
        add             x7,  x7,  x3
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x1],  x2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v1.8h
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v6.8h,   v6.8h,  v7.8h
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h},  [x1],  x2
        add             v0.8h,   v0.8h,  v4.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h},  [x10], x2
        add             v2.8h,   v2.8h,  v6.8h
        addp            v16.8h,  v16.8h, v17.8h
        addp            v18.8h,  v18.8h, v19.8h
        addp            v20.8h,  v20.8h, v21.8h
        addp            v22.8h,  v22.8h, v23.8h
        add             v16.8h,  v16.8h, v20.8h
        add             v18.8h,  v18.8h, v22.8h
        shl             v0.8h,   v0.8h,  #1
        shl             v1.8h,   v2.8h,  #1
        shl             v2.8h,   v16.8h, #1
        shl             v3.8h,   v18.8h, #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             q2,  [x1, #32]
        ld1             {v0.8h, v1.8h},  [x1],  x2
        ldr             q5,  [x10, #32]
        ld1             {v3.8h, v4.8h},  [x10], x2
        addp            v2.8h,   v2.8h,  v2.8h
        addp            v0.8h,   v0.8h,  v1.8h
        addp            v5.8h,   v5.8h,  v5.8h
        addp            v3.8h,   v3.8h,  v4.8h
        ldr             q18, [x1, #32]
        add             v2.4h,   v2.4h,  v5.4h
        ld1             {v16.8h, v17.8h},  [x1],  x2
        add             v0.8h,   v0.8h,  v3.8h
        ldr             q21, [x10, #32]
        ld1             {v19.8h, v20.8h},  [x10], x2
        addp            v18.8h,  v18.8h, v18.8h
        addp            v16.8h,  v16.8h, v17.8h
        addp            v21.8h,  v21.8h, v21.8h
        addp            v19.8h,  v19.8h, v20.8h
        add             v18.4h,  v18.4h, v21.4h
        add             v16.8h,  v16.8h, v19.8h
        shl             v1.4h,   v2.4h,  #1
        shl             v0.8h,   v0.8h,  #1
        shl             v3.4h,   v18.4h, #1
        shl             v2.8h,   v16.8h, #1
        dup             v4.4h,   v1.h[3]          // pad the last 4 columns
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,  v4.2d
        trn1            v3.2d,   v3.2d,  v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.8h, v1.8h},  [x1],  x2
        ld1             {v2.8h, v3.8h},  [x10], x2
        ld1             {v4.8h, v5.8h},  [x1],  x2
        addp            v0.8h,   v0.8h,  v1.8h
        ld1             {v6.8h, v7.8h},  [x10], x2
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v6.8h,   v6.8h,  v7.8h
        add             v0.8h,   v0.8h,  v2.8h
        add             v4.8h,   v4.8h,  v6.8h
        shl             v0.8h,   v0.8h,  #1
        shl             v2.8h,   v4.8h,  #1
        dup             v1.8h,   v0.h[7]          // pad the right half
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8h},  [x1],  x2
        ld1             {v2.8h},  [x10], x2
        ld1             {v4.8h},  [x1],  x2
        ld1             {v6.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v4.8h
        addp            v2.8h,   v2.8h,  v6.8h
        add             v0.8h,   v0.8h,  v2.8h
        shl             v0.8h,   v0.8h,  #1
        dup             v1.8h,   v0.h[3]          // pad columns 4..15 from the last valid one
        dup             v3.8h,   v0.h[7]
        trn2            v2.2d,   v0.2d,  v3.2d
        trn1            v0.2d,   v0.2d,  v1.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            2b
3:

        // Quadruple the height and reuse the w4 summing/subtracting
        lsl             w6,  w6,  #2
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
endfunc

jumptable ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
endjumptable
jumptable ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
endjumptable

// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                            const ptrdiff_t stride, const int w_pad,
//                            const int h_pad, const int cw, const int ch);
// Builds the CfL AC buffer for 4:2:2: each output is a horizontal luma
// pair sum << 2 (average scaled by 8; only horizontal subsampling).
// Reuses the 420 variant's hpad and dc-subtraction tails, which only
// depend on the register/stack layout set up by the common prologue.
function ipred_cfl_ac_422_16bpc_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2              // h_pad in output rows
        movrel          x7,  ipred_cfl_ac_422_tbl
        sub             w8,  w8,  #27
        ldrsw           x8,  [x7, w8, uxtw #2]
        movi            v24.4s,  #0               // running sums for the dc
        movi            v25.4s,  #0
        movi            v26.4s,  #0
        movi            v27.4s,  #0
        add             x7,  x7,  x8
        sub             w8,  w6,  w4  // height - h_pad
        rbit            w9,  w5       // rbit(width)
        rbit            w10, w6       // rbit(height)
        clz             w9,  w9       // ctz(width)
        clz             w10, w10      // ctz(height)
        add             w9,  w9,  w10 // log2sz
        add             x10, x1,  x2              // x10 = second luma row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1
        neg             v31.4s,  v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8h},  [x1],  x2
        ld1             {v1.8h},  [x10], x2
        ld1             {v2.8h},  [x1],  x2
        ld1             {v3.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v1.8h    // horizontal pair sums, 2 rows per reg
        addp            v2.8h,   v2.8h,  v3.8h
        shl             v0.8h,   v0.8h,  #2       // pair sum * 4 = avg * 8
        shl             v1.8h,   v2.8h,  #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h},  [x0], #32
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        b.gt            1b
        // replicate the last output row for the shared hpad tail
        trn2            v0.2d,   v1.2d,  v1.2d
        trn2            v1.2d,   v1.2d,  v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.8h, v1.8h},  [x1],  x2
        ld1             {v2.8h, v3.8h},  [x10], x2
        ld1             {v4.8h, v5.8h},  [x1],  x2
        addp            v0.8h,   v0.8h,  v1.8h
        ld1             {v6.8h, v7.8h},  [x10], x2
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v6.8h,   v6.8h,  v7.8h
        shl             v0.8h,   v0.8h,  #2
        shl             v1.8h,   v2.8h,  #2
        shl             v2.8h,   v4.8h,  #2
        shl             v3.8h,   v6.8h,  #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b           // last row, duplicated for hpad
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8h},  [x1],  x2
        ld1             {v1.8h},  [x10], x2
        ld1             {v2.8h},  [x1],  x2
        ld1             {v3.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v1.8h
        addp            v2.8h,   v2.8h,  v3.8h
        shl             v0.8h,   v0.8h,  #2
        shl             v2.8h,   v2.8h,  #2
        dup             v4.4h,   v0.h[3]          // per-row right padding
        dup             v5.8h,   v0.h[7]
        dup             v6.4h,   v2.h[3]
        dup             v7.8h,   v2.h[7]
        trn2            v1.2d,   v0.2d,  v5.2d
        trn1            v0.2d,   v0.2d,  v4.2d
        trn2            v3.2d,   v2.2d,  v7.2d
        trn1            v2.2d,   v2.2d,  v6.2d
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        movrel          x7,  ipred_cfl_ac_422_w16_tbl
        ldrsw           x3,  [x7, w3, uxtw #2]    // dispatch on w_pad
        add             x7,  x7,  x3
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x1],  x2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v1.8h
        addp            v2.8h,   v2.8h,  v3.8h
        addp            v4.8h,   v4.8h,  v5.8h
        addp            v6.8h,   v6.8h,  v7.8h
        shl             v0.8h,   v0.8h,  #2
        shl             v1.8h,   v2.8h,  #2
        shl             v2.8h,   v4.8h,  #2
        shl             v3.8h,   v6.8h,  #2
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             q2,  [x1, #32]
        ld1             {v0.8h, v1.8h},  [x1],  x2
        ldr             q6,  [x10, #32]
        ld1             {v4.8h, v5.8h},  [x10], x2
        addp            v2.8h,   v2.8h,  v2.8h
        addp            v0.8h,   v0.8h,  v1.8h
        addp            v6.8h,   v6.8h,  v6.8h
        addp            v4.8h,   v4.8h,  v5.8h
        shl             v1.4h,   v2.4h,  #2
        shl             v0.8h,   v0.8h,  #2
        shl             v3.4h,   v6.4h,  #2
        shl             v2.8h,   v4.8h,  #2
        dup             v4.4h,   v1.h[3]          // pad the last 4 columns
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,  v4.2d
        trn1            v3.2d,   v3.2d,  v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.8h, v1.8h},  [x1],  x2
        ld1             {v2.8h, v3.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v1.8h
        addp            v2.8h,   v2.8h,  v3.8h
        shl             v0.8h,   v0.8h,  #2
        shl             v2.8h,   v2.8h,  #2
        dup             v1.8h,   v0.h[7]          // pad the right half
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8h},  [x1],  x2
        ld1             {v2.8h},  [x10], x2
        addp            v0.8h,   v0.8h,  v0.8h
        addp            v2.8h,   v2.8h,  v2.8h
        shl             v0.4h,   v0.4h,  #2
        shl             v2.4h,   v2.4h,  #2
        dup             v1.8h,   v0.h[3]          // pad columns 4..15
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,  v1.2d
        trn1            v2.2d,   v2.2d,  v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)
endfunc

jumptable ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w8)  - ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w4)  - ipred_cfl_ac_422_tbl
endjumptable

jumptable ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
endjumptable

// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                            const ptrdiff_t stride, const int w_pad,
//                            const int h_pad, const int cw, const int ch);
// Builds the CfL AC buffer for 4:4:4: no subsampling, each output is the
// luma pixel << 3 (the same avg * 8 scale as the 420/422 variants), with
// right/bottom replication padding.  Reuses the 420 variant's hpad and
// dc-subtraction tails for w <= 16, and its own hpad tail for w == 32.
function ipred_cfl_ac_444_16bpc_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2              // h_pad in output rows
        movrel          x7,  ipred_cfl_ac_444_tbl
        sub             w8,  w8,  #26             // table also has a w32 entry
        ldrsw           x8,  [x7, w8, uxtw #2]
        movi            v24.4s,  #0               // running sums for the dc
        movi            v25.4s,  #0
        movi            v26.4s,  #0
        movi            v27.4s,  #0
        add             x7,  x7,  x8
        sub             w8,  w6,  w4  // height - h_pad
        rbit            w9,  w5       // rbit(width)
        rbit            w10, w6       // rbit(height)
        clz             w9,  w9       // ctz(width)
        clz             w10, w10      // ctz(height)
        add             w9,  w9,  w10 // log2sz
        add             x10, x1,  x2              // x10 = second luma row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1              // step two luma rows at a time
        neg             v31.4s,  v31.4s // -log2sz
        br              x7

L(ipred_cfl_ac_444_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input
        ld1             {v0.4h},    [x1],  x2
        ld1             {v0.d}[1],  [x10], x2     // pack two rows per register
        ld1             {v1.4h},    [x1],  x2
        ld1             {v1.d}[1],  [x10], x2
        shl             v0.8h,   v0.8h,  #3       // px * 8
        shl             v1.8h,   v1.8h,  #3
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h},  [x0], #32
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        b.gt            1b
        // replicate the last output row for the shared hpad tail
        trn2            v0.2d,   v1.2d,  v1.2d
        trn2            v1.2d,   v1.2d,  v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_444_w8):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input
        ld1             {v0.8h},  [x1],  x2
        ld1             {v1.8h},  [x10], x2
        ld1             {v2.8h},  [x1],  x2
        shl             v0.8h,   v0.8h,  #3
        ld1             {v3.8h},  [x10], x2
        shl             v1.8h,   v1.8h,  #3
        shl             v2.8h,   v2.8h,  #3
        shl             v3.8h,   v3.8h,  #3
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b           // last row, duplicated for hpad
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_444_w16):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
1:      // Copy and expand input, without padding
        ld1             {v0.8h, v1.8h},  [x1],  x2
        ld1             {v2.8h, v3.8h},  [x10], x2
        shl             v0.8h,   v0.8h,  #3
        shl             v1.8h,   v1.8h,  #3
        shl             v2.8h,   v2.8h,  #3
        shl             v3.8h,   v3.8h,  #3
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_444_w16_wpad):
1:      // Copy and expand input, padding 8
        ld1             {v0.8h},  [x1],  x2
        ld1             {v2.8h},  [x10], x2
        shl             v0.8h,   v0.8h,  #3
        shl             v2.8h,   v2.8h,  #3
        dup             v1.8h,   v0.h[7]          // pad the right half
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_444_w32):
        AARCH64_VALID_JUMP_TARGET
        movrel          x7,  ipred_cfl_ac_444_w32_tbl
        lsr             w3,  w3,  #1              // w_pad is even here; halve for the table
        ldrsw           x3,  [x7, w3, uxtw #2]
        lsr             x2,  x2,  #1 // Restore the stride to one line increments
        add             x7,  x7,  x3
        br              x7

L(ipred_cfl_ac_444_w32_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, without padding
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x1], x2
        shl             v0.8h,   v0.8h,  #3
        shl             v1.8h,   v1.8h,  #3
        shl             v2.8h,   v2.8h,  #3
        shl             v3.8h,   v3.8h,  #3
        subs            w8,  w8,  #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 8
        ld1             {v0.8h, v1.8h, v2.8h},  [x1], x2
        shl             v2.8h,   v2.8h,  #3
        shl             v0.8h,   v0.8h,  #3
        shl             v1.8h,   v1.8h,  #3
        dup             v3.8h,   v2.h[7]          // pad the last 8 columns
        subs            w8,  w8,  #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 16
        ld1             {v0.8h, v1.8h},  [x1], x2
        shl             v1.8h,   v1.8h,  #3
        shl             v0.8h,   v0.8h,  #3
        dup             v2.8h,   v1.h[7]          // pad the right half
        dup             v3.8h,   v1.h[7]
        subs            w8,  w8,  #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad6):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 24
        ld1             {v0.8h},  [x1], x2
        shl             v0.8h,   v0.8h,  #3
        dup             v1.8h,   v0.h[7]          // pad the last 24 columns
        dup             v2.8h,   v0.h[7]
        dup             v3.8h,   v0.h[7]
        subs            w8,  w8,  #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            1b

L(ipred_cfl_ac_444_w32_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], #64
        uaddw           v24.4s,  v24.4s, v0.4h
        uaddw2          v25.4s,  v25.4s, v0.8h
        uaddw           v26.4s,  v26.4s, v1.4h
        uaddw2          v27.4s,  v27.4s, v1.8h
        uaddw           v24.4s,  v24.4s, v2.4h
        uaddw2          v25.4s,  v25.4s, v2.8h
        uaddw           v26.4s,  v26.4s, v3.4h
        uaddw2          v27.4s,  v27.4s, v3.8h
        b.gt            2b
3:

        // Multiply the height by eight and reuse the w4 subtracting
        lsl             w6,  w6,  #3
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
endfunc

jumptable ipred_cfl_ac_444_tbl
        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
endjumptable

jumptable ipred_cfl_ac_444_w32_tbl
        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
endjumptable