; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_r_ext:     times 16 db 1
               times 16 db 9
sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
               db  9, -1, 10, -1, 11, -1, 12, -1

pb_m5:     times 4 db -5
pb_3:      times 4 db 3
pw_5_6:    dw 5, 6
pw_164_24: dw 164, 24
pw_455_24: dw 455, 24
pw_256:    times 2 dw 256
pw_2056:   times 2 dw 2056
pw_m16380: times 2 dw -16380
pd_25:     dd 25
pd_34816:  dd 34816
pd_m4096:  dd -4096
pf_256:    dd 256.0

cextern pb_0to63

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

INIT_YMM avx2
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt
    mov             fltq, r6mp
    movifnidn         hd, hm
    mov            edged, r7m
    mov               wd, wm
    vbroadcasti128    m6, [wiener_shufA]
    vpbroadcastb     m11, [fltq+ 0] ; x0 x0
    vbroadcasti128    m7, [wiener_shufB]
    vpbroadcastd     m12, [fltq+ 2]
    vbroadcasti128    m8, [wiener_shufC]
    packsswb         m12, m12       ; x1 x2
    vpbroadcastw     m13, [fltq+ 6] ; x3
    vbroadcasti128    m9, [sgr_shuf+6]
    add             lpfq, wq
    vpbroadcastd     m10, [pw_m16380]
    vpbroadcastd     m14, [fltq+16] ; y0 y1
    add             dstq, wq
    vpbroadcastd     m15, [fltq+20] ; y2 y3
    lea               t1, [rsp+wq*2+16]
    psllw            m14, 5
    neg               wq
    psllw            m15, 5
    test           edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add             lpfq, strideq
    mov               t6, t1
    mov               t5, t1
    add               t1, 384*2
    call .h_top
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    mov               t4, t1
    add               t1, 384*2
    add              r10, strideq
    mov            [rsp], r10 ; below
    call .h
    mov               t3, t1
    mov               t2, t1
    dec               hd
    jz .v1
    add             lpfq, strideq
    add               t1, 384*2
    call .h
    mov               t2, t1
    dec               hd
    jz .v2
    add             lpfq, strideq
    add               t1, 384*2
    call .h
    dec               hd
    jz .v3
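; Rows of filtered intermediates are kept in a ring buffer addressed
; through t0-t6 (384*2 bytes per row). When .hv rotates the pointers
; below, t0 ends up aliasing t6, so the oldest row is overwritten as
; it is consumed; six row buffers therefore suffice for the 7-tap
; vertical filter.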
.main:
    lea               t0, [t1+384*2]
.main_loop:
    call .hv
    dec               hd
    jnz .main_loop
    test           edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov             lpfq, [rsp]
    call .hv_bottom
    add             lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea              r10, [r10+strideq*2]
    mov            [rsp], r10
    call .h
    mov               t6, t1
    mov               t5, t1
    mov               t4, t1
    mov               t3, t1
    mov               t2, t1
    dec               hd
    jz .v1
    add             lpfq, strideq
    add               t1, 384*2
    call .h
    mov               t2, t1
    dec               hd
    jz .v2
    add             lpfq, strideq
    add               t1, 384*2
    call .h
    dec               hd
    jz .v3
    lea               t0, [t1+384*2]
    call .hv
    dec               hd
    jz .v3
    add               t0, 384*8
    call .hv
    dec               hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.extend_right:
    movd             xm2, r10d
    vpbroadcastd      m0, [pb_3]
    vpbroadcastd      m1, [pb_m5]
    vpbroadcastb      m2, xm2
    mova              m3, [pb_0to63]
    psubb             m0, m2
    psubb             m1, m2
    pminub            m0, m3
    pminub            m1, m3
    pshufb            m4, m0
    pshufb            m5, m1
    ret
.h:
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd             xm4, [leftq]
    vpblendd          m4, [lpfq+r10-4], 0xfe
    add            leftq, 4
    jmp .h_main
.h_extend_left:
    vbroadcasti128    m5, [lpfq+r10] ; avoid accessing memory located
    mova              m4, [lpfq+r10] ; before the start of the buffer
    palignr           m4, m5, 12
    pshufb            m4, [wiener_l_shuf]
    jmp .h_main
.h_top:
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu              m4, [lpfq+r10-4]
.h_main:
    movu              m5, [lpfq+r10+4]
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb            m0, m4, m6
    pmaddubsw         m0, m11
    pshufb            m1, m5, m6
    pmaddubsw         m1, m11
    pshufb            m2, m4, m7
    pmaddubsw         m2, m12
    pshufb            m3, m5, m7
    pmaddubsw         m3, m12
    paddw             m0, m2
    pshufb            m2, m4, m8
    pmaddubsw         m2, m12
    paddw             m1, m3
    pshufb            m3, m5, m8
    pmaddubsw         m3, m12
    pshufb            m4, m9
    paddw             m0, m2
    pmullw            m2, m4, m13
    pshufb            m5, m9
    paddw             m1, m3
    pmullw            m3, m5, m13
    psllw             m4, 7
    psllw             m5, 7
    paddw             m4, m10
    paddw             m5, m10
    paddw             m0, m2
    vpbroadcastd      m2, [pw_2056]
    paddw             m1, m3
    paddsw            m0, m4
    paddsw            m1, m5
    psraw             m0, 3
    psraw             m1, 3
    paddw             m0, m2
    paddw             m1, m2
    mova [t1+r10*2+ 0], m0
    mova [t1+r10*2+32], m1
    add              r10, 32
    jl .h_loop
    ret
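; The horizontal pass folds the center tap in as (px << 7) - 16380
; with a saturating add, then shifts the 7-tap sum down by 3 and
; rebiases it by 2056, which keeps the intermediate rows within
; 16-bit range for the vertical pass.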
ALIGN function_align
.hv:
    add             lpfq, strideq
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd             xm4, [leftq]
    vpblendd          m4, [lpfq+r10-4], 0xfe
    add            leftq, 4
    jmp .hv_main
.hv_extend_left:
    movu              m4, [lpfq+r10-4]
    pshufb            m4, [wiener_l_shuf]
    jmp .hv_main
.hv_bottom:
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu              m4, [lpfq+r10-4]
.hv_main:
    movu              m5, [lpfq+r10+4]
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb            m0, m4, m6
    pmaddubsw         m0, m11
    pshufb            m1, m5, m6
    pmaddubsw         m1, m11
    pshufb            m2, m4, m7
    pmaddubsw         m2, m12
    pshufb            m3, m5, m7
    pmaddubsw         m3, m12
    paddw             m0, m2
    pshufb            m2, m4, m8
    pmaddubsw         m2, m12
    paddw             m1, m3
    pshufb            m3, m5, m8
    pmaddubsw         m3, m12
    pshufb            m4, m9
    paddw             m0, m2
    pmullw            m2, m4, m13
    pshufb            m5, m9
    paddw             m1, m3
    pmullw            m3, m5, m13
    psllw             m4, 7
    psllw             m5, 7
    paddw             m4, m10
    paddw             m5, m10
    paddw             m0, m2
    paddw             m1, m3
    mova              m2, [t4+r10*2]
    paddw             m2, [t2+r10*2]
    mova              m3, [t3+r10*2]
    paddsw            m0, m4
    vpbroadcastd      m4, [pw_2056]
    paddsw            m1, m5
    mova              m5, [t5+r10*2]
    paddw             m5, [t1+r10*2]
    psraw             m0, 3
    psraw             m1, 3
    paddw             m0, m4
    paddw             m1, m4
    paddw             m4, m0, [t6+r10*2]
    mova    [t0+r10*2], m0
    punpcklwd         m0, m2, m3
    pmaddwd           m0, m15
    punpckhwd         m2, m3
    pmaddwd           m2, m15
    punpcklwd         m3, m4, m5
    pmaddwd           m3, m14
    punpckhwd         m4, m5
    pmaddwd           m4, m14
    paddd             m0, m3
    paddd             m4, m2
    mova              m2, [t4+r10*2+32]
    paddw             m2, [t2+r10*2+32]
    mova              m3, [t3+r10*2+32]
    mova              m5, [t5+r10*2+32]
    paddw             m5, [t1+r10*2+32]
    packuswb          m0, m4
    paddw             m4, m1, [t6+r10*2+32]
    mova [t0+r10*2+32], m1
    punpcklwd         m1, m2, m3
    pmaddwd           m1, m15
    punpckhwd         m2, m3
    pmaddwd           m2, m15
    punpcklwd         m3, m4, m5
    pmaddwd           m3, m14
    punpckhwd         m4, m5
    pmaddwd           m4, m14
    paddd             m1, m3
    paddd             m2, m4
    packuswb          m1, m2
    psrlw             m0, 8
    psrlw             m1, 8
    packuswb          m0, m1
    mova      [dstq+r10], m0
    add              r10, 32
    jl .hv_loop
    mov               t6, t5
    mov               t5, t4
    mov               t4, t3
    mov               t3, t2
    mov               t2, t1
    mov               t1, t0
    mov               t0, t6
    add             dstq, strideq
    ret
.v:
    mov              r10, wq
.v_loop:
    mova              m2, [t4+r10*2+ 0]
    paddw             m2, [t2+r10*2+ 0]
    mova              m4, [t3+r10*2+ 0]
    mova              m6, [t1+r10*2+ 0]
    paddw             m8, m6, [t6+r10*2+ 0]
    paddw             m6, [t5+r10*2+ 0]
    mova              m3, [t4+r10*2+32]
    paddw             m3, [t2+r10*2+32]
    mova              m5, [t3+r10*2+32]
    mova              m7, [t1+r10*2+32]
    paddw             m9, m7, [t6+r10*2+32]
    paddw             m7, [t5+r10*2+32]
    punpcklwd         m0, m2, m4
    pmaddwd           m0, m15
    punpckhwd         m2, m4
    pmaddwd           m2, m15
    punpcklwd         m4, m8, m6
    pmaddwd           m4, m14
    punpckhwd         m6, m8, m6
    pmaddwd           m6, m14
    punpcklwd         m1, m3, m5
    pmaddwd           m1, m15
    punpckhwd         m3, m5
    pmaddwd           m3, m15
    punpcklwd         m5, m9, m7
    pmaddwd           m5, m14
    punpckhwd         m7, m9, m7
    pmaddwd           m7, m14
    paddd             m0, m4
    paddd             m2, m6
    paddd             m1, m5
    paddd             m3, m7
    packuswb          m0, m2
    packuswb          m1, m3
    psrlw             m0, 8
    psrlw             m1, 8
    packuswb          m0, m1
    mova      [dstq+r10], m0
    add              r10, 32
    jl .v_loop
    mov               t6, t5
    mov               t5, t4
    mov               t4, t3
    mov               t3, t2
    mov               t2, t1
    add             dstq, strideq
    ret
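; wiener_filter5 is the 5-tap variant of the above: the same two-pass
; structure with a shallower ring buffer (t0-t4) and without the
; outermost x0/y0 taps; right-edge padding reuses wiener_filter7's
; .extend_right.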
cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                  w, h, edge, flt
    mov             fltq, r6mp
    movifnidn         hd, hm
    mov            edged, r7m
    mov               wd, wm
    vbroadcasti128    m6, [wiener_shufB]
    vpbroadcastd     m12, [fltq+ 2]
    vbroadcasti128    m7, [wiener_shufC]
    packsswb         m12, m12       ; x1 x2
    vpbroadcastw     m13, [fltq+ 6] ; x3
    vbroadcasti128    m8, [sgr_shuf+6]
    add             lpfq, wq
    vpbroadcastd      m9, [pw_m16380]
    vpbroadcastd     m10, [pw_2056]
    mova             m11, [wiener_l_shuf]
    vpbroadcastd     m14, [fltq+16] ; __ y1
    add             dstq, wq
    vpbroadcastd     m15, [fltq+20] ; y2 y3
    lea               t1, [rsp+wq*2+16]
    psllw            m14, 5
    neg               wq
    psllw            m15, 5
    test           edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add             lpfq, strideq
    mov               t4, t1
    add               t1, 384*2
    call .h_top
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    mov               t3, t1
    add               t1, 384*2
    add              r10, strideq
    mov            [rsp], r10 ; below
    call .h
    mov               t2, t1
    dec               hd
    jz .v1
    add             lpfq, strideq
    add               t1, 384*2
    call .h
    dec               hd
    jz .v2
.main:
    mov               t0, t4
.main_loop:
    call .hv
    dec               hd
    jnz .main_loop
    test           edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov             lpfq, [rsp]
    call .hv_bottom
    add             lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea              r10, [r10+strideq*2]
    mov            [rsp], r10
    call .h
    mov               t4, t1
    mov               t3, t1
    mov               t2, t1
    dec               hd
    jz .v1
    add             lpfq, strideq
    add               t1, 384*2
    call .h
    dec               hd
    jz .v2
    lea               t0, [t1+384*2]
    call .hv
    dec               hd
    jz .v2
    add               t0, 384*6
    call .hv
    dec               hd
    jnz .main
.v2:
    call .v
    mov               t4, t3
    mov               t3, t2
    mov               t2, t1
    add             dstq, strideq
.v1:
    call .v
    jmp .end
.h:
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd             xm4, [leftq]
    vpblendd          m4, [lpfq+r10-4], 0xfe
    add            leftq, 4
    jmp .h_main
.h_extend_left:
    vbroadcasti128    m5, [lpfq+r10] ; avoid accessing memory located
    mova              m4, [lpfq+r10] ; before the start of the buffer
    palignr           m4, m5, 12
    pshufb            m4, m11
    jmp .h_main
.h_top:
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu              m4, [lpfq+r10-4]
.h_main:
    movu              m5, [lpfq+r10+4]
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -33
    jl .h_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.h_have_right:
    pshufb            m0, m4, m6
    pmaddubsw         m0, m12
    pshufb            m1, m5, m6
    pmaddubsw         m1, m12
    pshufb            m2, m4, m7
    pmaddubsw         m2, m12
    pshufb            m3, m5, m7
    pmaddubsw         m3, m12
    pshufb            m4, m8
    paddw             m0, m2
    pmullw            m2, m4, m13
    pshufb            m5, m8
    paddw             m1, m3
    pmullw            m3, m5, m13
    psllw             m4, 7
    psllw             m5, 7
    paddw             m4, m9
    paddw             m5, m9
    paddw             m0, m2
    paddw             m1, m3
    paddsw            m0, m4
    paddsw            m1, m5
    psraw             m0, 3
    psraw             m1, 3
    paddw             m0, m10
    paddw             m1, m10
    mova [t1+r10*2+ 0], m0
    mova [t1+r10*2+32], m1
    add              r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add             lpfq, strideq
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd             xm4, [leftq]
    vpblendd          m4, [lpfq+r10-4], 0xfe
    add            leftq, 4
    jmp .hv_main
.hv_extend_left:
    movu              m4, [lpfq+r10-4]
    pshufb            m4, m11
    jmp .hv_main
.hv_bottom:
    mov              r10, wq
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu              m4, [lpfq+r10-4]
.hv_main:
    movu              m5, [lpfq+r10+4]
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             r10d, -33
    jl .hv_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.hv_have_right:
    pshufb            m0, m4, m6
    pmaddubsw         m0, m12
    pshufb            m1, m5, m6
    pmaddubsw         m1, m12
    pshufb            m2, m4, m7
    pmaddubsw         m2, m12
    pshufb            m3, m5, m7
    pmaddubsw         m3, m12
    pshufb            m4, m8
    paddw             m0, m2
    pmullw            m2, m4, m13
    pshufb            m5, m8
    paddw             m1, m3
    pmullw            m3, m5, m13
    psllw             m4, 7
    psllw             m5, 7
    paddw             m4, m9
    paddw             m5, m9
    paddw             m0, m2
    paddw             m1, m3
    mova              m2, [t3+r10*2]
    paddw             m2, [t1+r10*2]
    mova              m3, [t2+r10*2]
    paddsw            m0, m4
    paddsw            m1, m5
    psraw             m0, 3
    psraw             m1, 3
    paddw             m0, m10
    paddw             m1, m10
    paddw             m4, m0, [t4+r10*2]
    mova    [t0+r10*2], m0
    punpcklwd         m0, m2, m3
    pmaddwd           m0, m15
    punpckhwd         m2, m3
    pmaddwd           m2, m15
    punpcklwd         m3, m4, m4
    pmaddwd           m3, m14
    punpckhwd         m4, m4
    pmaddwd           m4, m14
    paddd             m0, m3
    paddd             m4, m2
    mova              m2, [t3+r10*2+32]
    paddw             m2, [t1+r10*2+32]
    mova              m3, [t2+r10*2+32]
    packuswb          m0, m4
    paddw             m4, m1, [t4+r10*2+32]
    mova [t0+r10*2+32], m1
    punpcklwd         m1, m2, m3
    pmaddwd           m1, m15
    punpckhwd         m2, m3
    pmaddwd           m2, m15
    punpcklwd         m3, m4, m4
    pmaddwd           m3, m14
    punpckhwd         m4, m4
    pmaddwd           m4, m14
    paddd             m1, m3
    paddd             m2, m4
    packuswb          m1, m2
    psrlw             m0, 8
    psrlw             m1, 8
    packuswb          m0, m1
    mova      [dstq+r10], m0
    add              r10, 32
    jl .hv_loop
    mov               t4, t3
    mov               t3, t2
    mov               t2, t1
    mov               t1, t0
    mov               t0, t4
    add             dstq, strideq
    ret
.v:
    mov              r10, wq
    psrld            m13, m14, 16 ; y1 __
.v_loop:
    mova              m6, [t1+r10*2+ 0]
    paddw             m2, m6, [t3+r10*2+ 0]
    mova              m4, [t2+r10*2+ 0]
    mova              m7, [t1+r10*2+32]
    paddw             m3, m7, [t3+r10*2+32]
    mova              m5, [t2+r10*2+32]
    paddw             m6, [t4+r10*2+ 0]
    paddw             m7, [t4+r10*2+32]
    punpcklwd         m0, m2, m4
    pmaddwd           m0, m15
    punpckhwd         m2, m4
    pmaddwd           m2, m15
    punpcklwd         m1, m3, m5
    pmaddwd           m1, m15
    punpckhwd         m3, m5
    pmaddwd           m3, m15
    punpcklwd         m5, m7, m6
    pmaddwd           m4, m5, m14
    punpckhwd         m7, m6
    pmaddwd           m6, m7, m14
    pmaddwd           m5, m13
    pmaddwd           m7, m13
    paddd             m0, m4
    paddd             m2, m6
    paddd             m1, m5
    paddd             m3, m7
    packuswb          m0, m2
    packuswb          m1, m3
    psrlw             m0, 8
    psrlw             m1, 8
    packuswb          m0, m1
    mova      [dstq+r10], m0
    add              r10, 32
    jl .v_loop
    ret

cglobal sgr_filter_5x5_8bpc, 4, 12, 16, 400*24+16, dst, stride, left, lpf, \
                                                   w, h, edge, params
    mov          paramsq, r6mp
    mov               wd, wm
    movifnidn         hd, hm
    vbroadcasti128    m8, [sgr_shuf+0]
    mov            edged, r7m
    vbroadcasti128    m9, [sgr_shuf+8]
    add             lpfq, wq
    vbroadcasti128   m10, [sgr_shuf+2]
    add             dstq, wq
    vbroadcasti128   m11, [sgr_shuf+6]
    lea               t3, [rsp+wq*4+16+400*12]
    vpbroadcastw      m7, [paramsq+8] ; w0
    pxor              m6, m6
    vpbroadcastd     m12, [paramsq+0] ; s0
    lea               t1, [rsp+wq*2+20]
    vpbroadcastd     m13, [pw_164_24]
    neg               wq
    vbroadcastss     m14, [pf_256]
    psllw             m7, 4
    vpbroadcastd     m15, [pd_m4096]
    test           edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add             lpfq, strideq
    mov               t2, t1
    call .top_fixup
    add               t1, 400*6
    call .h_top
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    add              r10, strideq
    mov            [rsp], r10 ; below
    mov               t0, t2
    dec               hd
    jz .height1
    or             edged, 16
    call .h
.main:
    add             lpfq, strideq
    call .hv
    call .prep_n
    sub               hd, 2
    jl .extend_bottom
.main_loop:
    add             lpfq, strideq
    test              hd, hd
    jz .odd_height
    call .h
    add             lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub               hd, 2
    jge .main_loop
    test           edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov             lpfq, [rsp]
    call .h_top
    add             lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea              r10, [r10+strideq*2]
    mov            [rsp], r10
    call .h
    lea               t2, [t1+400*6]
    call .top_fixup
    dec               hd
    jz .no_top_height1
    or             edged, 16
    mov               t0, t1
    mov               t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    movd             xm2, r10d
    mova              m0, [sgr_r_ext]
    vpbroadcastb      m2, xm2
    psubb             m0, m2
    pminub            m0, [pb_0to63]
    pshufb            m5, m0
    ret
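; Box sums for each row are stored in three planes: 16-bit sums at
; +400*0 and the squared sums, widened to dwords, at +400*2 (low
; half) and +400*4 (high half).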
.h: ; horizontal boxsum
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd     xm0, [leftq]
    mova             xm5, [lpfq+wq]
    palignr          xm5, xm0, 12
    add            leftq, 4
    jmp .h_main
.h_extend_left:
    mova             xm5, [lpfq+wq]
    pshufb           xm5, [sgr_l_shuf]
    jmp .h_main
.h_top:
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu             xm5, [lpfq+r10-2]
.h_main:
    vinserti128       m5, [lpfq+r10+6], 1
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb            m3, m5, m8
    pmullw            m4, m3, m3
    pshufb            m2, m5, m9
    paddw             m0, m3, m2
    shufps            m3, m2, q2121
    paddw             m0, m3
    punpcklwd         m1, m2, m3
    pmaddwd           m1, m1
    punpckhwd         m2, m3
    pmaddwd           m2, m2
    punpcklwd         m3, m4, m6
    paddd             m1, m3
    punpckhwd         m4, m6
    paddd             m2, m4
    pshufb            m4, m5, m10
    paddw             m0, m4
    pshufb            m5, m11
    paddw             m0, m5 ; sum
    punpcklwd         m3, m4, m5
    pmaddwd           m3, m3
    punpckhwd         m4, m5
    pmaddwd           m4, m4
    test           edgeb, 16 ; y > 0
    jz .h_loop_end
    paddw             m0, [t1+r10*2+400*0]
    paddd             m1, [t1+r10*2+400*2]
    paddd             m2, [t1+r10*2+400*4]
.h_loop_end:
    paddd             m1, m3 ; sumsq
    paddd             m2, m4
    mova [t1+r10*2+400*0], m0
    mova [t1+r10*2+400*2], m1
    mova [t1+r10*2+400*4], m2
    add              r10, 16
    jl .h_loop
    ret
.top_fixup:
    lea              r10, [wq-2]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova              m0, [t1+r10*2+400*0]
    mova              m1, [t1+r10*2+400*2]
    mova              m2, [t1+r10*2+400*4]
    paddw             m0, m0
    paddd             m1, m1
    paddd             m2, m2
    mova [t2+r10*2+400*0], m0
    mova [t2+r10*2+400*2], m1
    mova [t2+r10*2+400*4], m2
    add              r10, 16
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastd     xm0, [leftq]
    mova             xm5, [lpfq+wq]
    palignr          xm5, xm0, 12
    add            leftq, 4
    jmp .hv_main
.hv_extend_left:
    mova             xm5, [lpfq+wq]
    pshufb           xm5, [sgr_l_shuf]
    jmp .hv_main
.hv_bottom:
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu             xm5, [lpfq+r10-2]
.hv_main:
    vinserti128       m5, [lpfq+r10+6], 1
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             r10d, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb            m1, m5, m8
    pmullw            m4, m1, m1
    pshufb            m3, m5, m9
    paddw             m0, m1, m3
    shufps            m1, m3, q2121
    paddw             m0, m1
    punpcklwd         m2, m3, m1
    pmaddwd           m2, m2
    punpckhwd         m3, m1
    pmaddwd           m3, m3
    punpcklwd         m1, m4, m6
    paddd             m2, m1
    punpckhwd         m4, m6
    paddd             m3, m4
    pshufb            m1, m5, m10
    paddw             m0, m1
    pshufb            m5, m11
    paddw             m0, m5 ; h sum
    punpcklwd         m4, m5, m1
    pmaddwd           m4, m4
    punpckhwd         m5, m1
    pmaddwd           m5, m5
    paddw             m1, m0, [t1+r10*2+400*0]
    paddd             m2, m4 ; h sumsq
    paddd             m3, m5
    paddd             m4, m2, [t1+r10*2+400*2]
    paddd             m5, m3, [t1+r10*2+400*4]
    test              hd, hd
    jz .hv_last_row
.hv_main2:
    paddw             m1, [t2+r10*2+400*0] ; hv sum
    paddd             m4, [t2+r10*2+400*2] ; hv sumsq
    paddd             m5, [t2+r10*2+400*4]
    mova [t0+r10*2+400*0], m0
    mova [t0+r10*2+400*2], m2
    mova [t0+r10*2+400*4], m3
    vpbroadcastd      m2, [pd_25]
    punpcklwd         m0, m1, m6 ; b
    punpckhwd         m1, m6
    pmulld            m4, m2 ; a * 25
    pmulld            m5, m2
    pmaddwd           m2, m0, m0 ; b * b
    pmaddwd           m3, m1, m1
    psubd             m4, m2 ; p
    psubd             m5, m3
    pmulld            m4, m12 ; p * s
    pmulld            m5, m12
    pmaddwd           m0, m13 ; b * 164
    pmaddwd           m1, m13
    paddw             m4, m13
    paddw             m5, m13
    psrld             m4, 20 ; z + 1
    psrld             m5, 20
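; x = 256 / (z + 1) is computed in float without a table lookup:
; rcpps provides an approximate reciprocal which is scaled by 256.0,
; and the pcmpgtd/psrld pair builds a (z < 255 ? 255 : 0) mask that
; clamps the cvtps2dq result, effectively matching the C code's
; dav1d_sgr_x_by_x[umin(z, 255)] lookup.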
    cvtdq2ps          m4, m4
    cvtdq2ps          m5, m5
    rcpps             m2, m4 ; 1 / (z + 1)
    rcpps             m3, m5
    pcmpgtd           m4, m14, m4
    pcmpgtd           m5, m14, m5
    mulps             m2, m14 ; 256 / (z + 1)
    mulps             m3, m14
    psrld             m4, 24 ; z < 255 ? 255 : 0
    psrld             m5, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m4 ; x
    pminsw            m3, m5
    vpbroadcastd      m4, [pd_34816]
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m4 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd             m1, m4
    pand              m0, m15
    pand              m1, m15
    por               m0, m2 ; a | (b << 12)
    por               m1, m3
    mova [t3+r10*4+ 8], xm0            ; The neighbor calculations require
    vextracti128 [t3+r10*4+40], m0, 1  ; 13 bits for a and 21 bits for b.
    mova [t3+r10*4+24], xm1            ; Packing them allows for 12+20, but
    vextracti128 [t3+r10*4+56], m1, 1  ; that gets us most of the way.
    add              r10, 16
    jl .hv_loop
    mov               t2, t1
    mov               t1, t0
    mov               t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10*2+400*0], m1
    paddw             m1, m0
    mova [t1+r10*2+400*2], m4
    paddd             m4, m2
    mova [t1+r10*2+400*4], m5
    paddd             m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea              r10, [wq-2]
.v_loop:
    mova              m0, [t1+r10*2+400*0]
    mova              m2, [t1+r10*2+400*2]
    mova              m3, [t1+r10*2+400*4]
    paddw             m1, m0, [t2+r10*2+400*0]
    paddd             m4, m2, [t2+r10*2+400*2]
    paddd             m5, m3, [t2+r10*2+400*4]
    paddw             m0, m0
    paddd             m2, m2
    paddd             m3, m3
    paddw             m1, m0 ; hv sum
    paddd             m4, m2 ; hv sumsq
    paddd             m5, m3
    vpbroadcastd      m2, [pd_25]
    punpcklwd         m0, m1, m6 ; b
    punpckhwd         m1, m6
    pmulld            m4, m2 ; a * 25
    pmulld            m5, m2
    pmaddwd           m2, m0, m0 ; b * b
    pmaddwd           m3, m1, m1
    psubd             m4, m2 ; p
    psubd             m5, m3
    pmulld            m4, m12 ; p * s
    pmulld            m5, m12
    pmaddwd           m0, m13 ; b * 164
    pmaddwd           m1, m13
    paddw             m4, m13
    paddw             m5, m13
    psrld             m4, 20 ; z + 1
    psrld             m5, 20
    cvtdq2ps          m4, m4
    cvtdq2ps          m5, m5
    rcpps             m2, m4 ; 1 / (z + 1)
    rcpps             m3, m5
    pcmpgtd           m4, m14, m4
    pcmpgtd           m5, m14, m5
    mulps             m2, m14 ; 256 / (z + 1)
    mulps             m3, m14
    psrld             m4, 24 ; z < 255 ? 255 : 0
    psrld             m5, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m4 ; x
    pminsw            m3, m5
    vpbroadcastd      m4, [pd_34816]
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m4 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd             m1, m4
    pand              m0, m15
    pand              m1, m15
    por               m0, m2 ; a | (b << 12)
    por               m1, m3
    mova [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add              r10, 16
    jl .v_loop
    ret
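; The 5x5 filter only produces ab values on every other row, so the
; neighbor sums below weight the packed ab columns 5/6/5 ("565")
; horizontally and share each weighted row between an even and an
; odd output row.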
.prep_n: ; initial neighbor setup
    mov              r10, wq
.prep_n_loop:
    movu              m0, [t3+r10*4+ 4]
    movu              m1, [t3+r10*4+36]
    paddd             m2, m0, [t3+r10*4+ 0]
    paddd             m3, m1, [t3+r10*4+32]
    paddd             m2, [t3+r10*4+ 8]
    paddd             m3, [t3+r10*4+40]
    paddd             m0, m2
    pslld             m2, 2
    paddd             m1, m3
    pslld             m3, 2
    paddd             m2, m0 ; ab 565
    paddd             m3, m1
    pandn             m0, m15, m2 ; a
    psrld             m2, 12 ; b
    pandn             m1, m15, m3
    psrld             m3, 12
    mova [t3+r10*4+400*4+ 0], m0
    mova [t3+r10*4+400*8+ 0], m2
    mova [t3+r10*4+400*4+32], m1
    mova [t3+r10*4+400*8+32], m3
    add              r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov              r10, wq
.n0_loop:
    movu              m0, [t3+r10*4+ 4]
    movu              m1, [t3+r10*4+36]
    paddd             m2, m0, [t3+r10*4+ 0]
    paddd             m3, m1, [t3+r10*4+32]
    paddd             m2, [t3+r10*4+ 8]
    paddd             m3, [t3+r10*4+40]
    paddd             m0, m2
    pslld             m2, 2
    paddd             m1, m3
    pslld             m3, 2
    paddd             m2, m0
    paddd             m3, m1
    pandn             m0, m15, m2
    psrld             m2, 12
    pandn             m1, m15, m3
    psrld             m3, 12
    paddd             m4, m0, [t3+r10*4+400*4+ 0] ; a
    paddd             m5, m1, [t3+r10*4+400*4+32]
    mova [t3+r10*4+400*4+ 0], m0
    mova [t3+r10*4+400*4+32], m1
    paddd             m0, m2, [t3+r10*4+400*8+ 0] ; b
    paddd             m1, m3, [t3+r10*4+400*8+32]
    mova [t3+r10*4+400*8+ 0], m2
    mova [t3+r10*4+400*8+32], m3
    pmovzxbd          m2, [dstq+r10+0]
    pmovzxbd          m3, [dstq+r10+8]
    pmaddwd           m4, m2 ; a * src
    pmaddwd           m5, m3
    packssdw          m2, m3
    psubd             m0, m4 ; b - a * src + (1 << 8)
    psubd             m1, m5
    psrad             m0, 9
    psrad             m1, 9
    packssdw          m0, m1
    pmulhrsw          m0, m7
    paddw             m0, m2
    vextracti128     xm1, m0, 1
    packuswb         xm0, xm1
    pshufd           xm0, xm0, q3120
    mova      [dstq+r10], xm0
    add              r10, 16
    jl .n0_loop
    add             dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov              r10, wq
.n1_loop:
    pmovzxbd          m2, [dstq+r10+0]
    pmovzxbd          m3, [dstq+r10+8]
    pmaddwd           m4, m2, [t3+r10*4+400*4+ 0] ; a * src
    pmaddwd           m5, m3, [t3+r10*4+400*4+32]
    mova              m0, [t3+r10*4+400*8+ 0] ; b
    mova              m1, [t3+r10*4+400*8+32]
    packssdw          m2, m3
    psubd             m0, m4 ; b - a * src + (1 << 7)
    psubd             m1, m5
    psrad             m0, 8
    psrad             m1, 8
    packssdw          m0, m1
    pmulhrsw          m0, m7
    paddw             m0, m2
    vextracti128     xm1, m0, 1
    packuswb         xm0, xm1
    pshufd           xm0, xm0, q3120
    mova      [dstq+r10], xm0
    add              r10, 16
    jl .n1_loop
    add             dstq, strideq
    ret
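; sgr_filter_3x3 follows the same scheme as the 5x5 filter above,
; with a 3x3 box: a * 9 instead of a * 25, 455 instead of 164 as the
; b scale factor, and the s1/w1 parameters. Every row is an output
; row, so there is no even/odd split.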
cglobal sgr_filter_3x3_8bpc, 4, 14, 16, -400*28-16, dst, stride, left, lpf, \
                                                    w, h, edge, params
    mov          paramsq, r6mp
    mov               wd, wm
    movifnidn         hd, hm
    vbroadcasti128    m8, [sgr_shuf+2]
    mov            edged, r7m
    vbroadcasti128    m9, [sgr_shuf+4]
    add             lpfq, wq
    vbroadcasti128   m10, [sgr_shuf+6]
    add             dstq, wq
    vpbroadcastw      m7, [paramsq+10] ; w1
    lea               t3, [rsp+wq*4+16+400*12]
    vpbroadcastd     m11, [paramsq+ 4] ; s1
    pxor              m6, m6
    vpbroadcastd     m12, [pw_455_24]
    lea               t1, [rsp+wq*2+20]
    vbroadcastss     m13, [pf_256]
    neg               wq
    vpbroadcastd     m14, [pd_34816] ; (1 << 11) + (1 << 15)
    psllw             m7, 4
    vpbroadcastd     m15, [pd_m4096]
    test           edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add             lpfq, strideq
    mov               t2, t1
    add               t1, 400*6
    call .h_top
    lea               t4, [lpfq+strideq*4]
    mov             lpfq, dstq
    add               t4, strideq
    mov            [rsp], t4 ; below
    mov               t0, t2
    call .hv
.main:
    mov               t5, t3
    add               t3, 400*4
    dec               hd
    jz .height1
    add             lpfq, strideq
    call .hv
    call .prep_n
    dec               hd
    jz .extend_bottom
.main_loop:
    add             lpfq, strideq
    call .hv
    call .n
    dec               hd
    jnz .main_loop
    test           edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov             lpfq, [rsp]
    call .hv_bottom
    call .n
    add             lpfq, strideq
    call .hv_bottom
.end:
    call .n
    RET
.height1:
    call .v
    call .prep_n
    mov               t2, t1
    call .v
    jmp .end
.extend_bottom:
    call .v
    call .n
    mov               t2, t1
    call .v
    jmp .end
.no_top:
    lea               t4, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea               t4, [t4+strideq*2]
    mov            [rsp], t4
    call .h
    lea               t0, [t1+400*6]
    mov               t2, t1
    call .v
    jmp .main
.h: ; horizontal boxsum
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd     xm0, [leftq]
    mova             xm5, [lpfq+wq]
    palignr          xm5, xm0, 12
    add            leftq, 4
    jmp .h_main
.h_extend_left:
    mova             xm5, [lpfq+wq]
    pshufb           xm5, [sgr_l_shuf]
    jmp .h_main
.h_top:
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu             xm5, [lpfq+r10-2]
.h_main:
    vinserti128       m5, [lpfq+r10+6], 1
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -17
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
    pshufb            m0, m5, m8
    pmullw            m2, m0, m0
    pshufb            m4, m5, m9
    paddw             m0, m4
    pshufb            m5, m10
    paddw             m0, m5 ; sum
    punpcklwd         m3, m4, m5
    pmaddwd           m3, m3
    punpckhwd         m4, m5
    pmaddwd           m4, m4
    punpcklwd         m1, m2, m6
    punpckhwd         m2, m6
    mova [t1+r10*2+400*0], m0
    paddd             m1, m3 ; sumsq
    paddd             m2, m4
    mova [t1+r10*2+400*2], m1
    mova [t1+r10*2+400*4], m2
    add              r10, 16
    jl .h_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastd     xm0, [leftq]
    mova             xm5, [lpfq+wq]
    palignr          xm5, xm0, 12
    add            leftq, 4
    jmp .hv_main
.hv_extend_left:
    mova             xm5, [lpfq+wq]
    pshufb           xm5, [sgr_l_shuf]
    jmp .hv_main
.hv_bottom:
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu             xm5, [lpfq+r10-2]
.hv_main:
    vinserti128       m5, [lpfq+r10+6], 1
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             r10d, -17
    jl .hv_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv_have_right:
    pshufb            m0, m5, m8
    pmullw            m3, m0, m0
    pshufb            m1, m5, m9
    paddw             m0, m1
    pshufb            m5, m10
    paddw             m0, m5 ; h sum
    punpcklwd         m4, m5, m1
    pmaddwd           m4, m4
    punpckhwd         m5, m1
    pmaddwd           m5, m5
    paddw             m1, m0, [t2+r10*2+400*0]
    paddw             m1, [t1+r10*2+400*0] ; hv sum
    punpcklwd         m2, m3, m6
    punpckhwd         m3, m6
    paddd             m4, m2 ; h sumsq
    paddd             m5, m3
    paddd             m2, m4, [t2+r10*2+400*2]
    paddd             m3, m5, [t2+r10*2+400*4]
    paddd             m2, [t1+r10*2+400*2] ; hv sumsq
    paddd             m3, [t1+r10*2+400*4]
    mova [t0+r10*2+400*0], m0
    punpcklwd         m0, m1, m6 ; b
    punpckhwd         m1, m6
    mova [t0+r10*2+400*2], m4
    pslld             m4, m2, 3
    mova [t0+r10*2+400*4], m5
    pslld             m5, m3, 3
    paddd             m4, m2 ; a * 9
    pmaddwd           m2, m0, m0 ; b * b
    paddd             m5, m3
    pmaddwd           m3, m1, m1
    psubd             m4, m2 ; p
    psubd             m5, m3
    pmulld            m4, m11 ; p * s
    pmulld            m5, m11
    pmaddwd           m0, m12 ; b * 455
    pmaddwd           m1, m12
    paddw             m4, m12
    paddw             m5, m12
    psrld             m4, 20 ; z + 1
    psrld             m5, 20
    cvtdq2ps          m4, m4
    cvtdq2ps          m5, m5
    rcpps             m2, m4 ; 1 / (z + 1)
    rcpps             m3, m5
    pcmpgtd           m4, m13, m4
    pcmpgtd           m5, m13, m5
    mulps             m2, m13 ; 256 / (z + 1)
    mulps             m3, m13
    psrld             m4, 24 ; z < 255 ? 255 : 0
    psrld             m5, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m4 ; x
    pminsw            m3, m5
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd             m1, m14
    pand              m0, m15
    pand              m1, m15
    por               m0, m2 ; a | (b << 12)
    por               m1, m3
    mova [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add              r10, 16
    jl .hv_loop
    mov               t2, t1
    mov               t1, t0
    mov               t0, t2
    ret
.v: ; vertical boxsum + ab
    lea              r10, [wq-2]
.v_loop:
    mova              m1, [t1+r10*2+400*0]
    paddw             m1, m1
    paddw             m1, [t2+r10*2+400*0] ; hv sum
    mova              m2, [t1+r10*2+400*2]
    mova              m3, [t1+r10*2+400*4]
    paddd             m2, m2
    paddd             m3, m3
    paddd             m2, [t2+r10*2+400*2] ; hv sumsq
    paddd             m3, [t2+r10*2+400*4]
    punpcklwd         m0, m1, m6 ; b
    punpckhwd         m1, m6
    pslld             m4, m2, 3
    pslld             m5, m3, 3
    paddd             m4, m2 ; a * 9
    pmaddwd           m2, m0, m0 ; b * b
    paddd             m5, m3
    pmaddwd           m3, m1, m1
    psubd             m4, m2 ; p
    psubd             m5, m3
    pmulld            m4, m11 ; p * s
    pmulld            m5, m11
    pmaddwd           m0, m12 ; b * 455
    pmaddwd           m1, m12
    paddw             m4, m12
    paddw             m5, m12
    psrld             m4, 20 ; z + 1
    psrld             m5, 20
    cvtdq2ps          m4, m4
    cvtdq2ps          m5, m5
    rcpps             m2, m4 ; 1 / (z + 1)
    rcpps             m3, m5
    pcmpgtd           m4, m13, m4
    pcmpgtd           m5, m13, m5
    mulps             m2, m13 ; 256 / (z + 1)
    mulps             m3, m13
    psrld             m4, 24 ; z < 255 ? 255 : 0
    psrld             m5, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m4 ; x
    pminsw            m3, m5
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd             m1, m14
    pand              m0, m15
    pand              m1, m15
    por               m0, m2 ; a | (b << 12)
    por               m1, m3
    mova [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add              r10, 16
    jl .v_loop
    ret
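; Every row is an output row here. Each output sums two halves,
; ab[ 0] 222 + ab[-1] 343 and ab[ 0] 222 + ab[+1] 343, where 222 and
; 343 denote 2/2/2 and 3/4/3 horizontal weightings of the packed ab
; values, kept in rolling buffers that swap each row.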
.prep_n: ; initial neighbor setup
    mov              r10, wq
    mov               t4, t3
    add               t3, 400*4
.prep_n_loop:
    mova              m2, [t5+r10*4+0]
    mova              m3, [t4+r10*4+0]
    paddd             m2, [t5+r10*4+8]
    paddd             m3, [t4+r10*4+8]
    paddd             m0, m2, [t5+r10*4+4]
    paddd             m1, m3, [t4+r10*4+4]
    pslld             m0, 2
    paddd             m1, m1 ; ab[ 0] 222
    psubd             m0, m2 ; ab[-1] 343
    mova [t3+r10*4+400*4], m1
    paddd             m1, m1
    mova       [t5+r10*4], m0
    psubd             m1, m3 ; ab[ 0] 343
    mova       [t4+r10*4], m1
    add              r10, 8
    jl .prep_n_loop
    ret
; a+b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
ALIGN function_align
.n: ; neighbor + output
    mov              r10, wq
.n_loop:
    mova              m4, [t3+r10*4+ 0]
    paddd             m4, [t3+r10*4+ 8]
    paddd             m5, m4, [t3+r10*4+ 4]
    paddd             m5, m5 ; ab[+1] 222
    mova              m2, [t3+r10*4+400*4+ 0]
    paddd             m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
    mova              m3, [t3+r10*4+400*4+32]
    paddd             m1, m3, [t5+r10*4+32]
    mova [t3+r10*4+400*4+ 0], m5
    paddd             m5, m5
    psubd             m5, m4 ; ab[+1] 343
    mova    [t5+r10*4+ 0], m5
    paddd             m2, m5 ; ab[ 0] 222 + ab[+1] 343
    mova              m4, [t3+r10*4+32]
    paddd             m4, [t3+r10*4+40]
    paddd             m5, m4, [t3+r10*4+36]
    paddd             m5, m5
    mova [t3+r10*4+400*4+32], m5
    paddd             m5, m5
    psubd             m5, m4
    mova    [t5+r10*4+32], m5
    pandn             m4, m15, m0
    psrld             m0, 12
    paddd             m3, m5
    pandn             m5, m15, m2
    psrld             m2, 12
    paddd             m4, m5 ; a
    pandn             m5, m15, m1
    psrld             m1, 12
    paddd             m0, m2 ; b + (1 << 8)
    pandn             m2, m15, m3
    psrld             m3, 12
    paddd             m5, m2
    pmovzxbd          m2, [dstq+r10+0]
    paddd             m1, m3
    pmovzxbd          m3, [dstq+r10+8]
    pmaddwd           m4, m2 ; a * src
    pmaddwd           m5, m3
    packssdw          m2, m3
    psubd             m0, m4 ; b - a * src + (1 << 8)
    psubd             m1, m5
    psrad             m0, 9
    psrad             m1, 9
    packssdw          m0, m1
    pmulhrsw          m0, m7
    paddw             m0, m2
    vextracti128     xm1, m0, 1
    packuswb         xm0, xm1
    pshufd           xm0, xm0, q3120
    mova      [dstq+r10], xm0
    add              r10, 16
    jl .n_loop
    mov              r10, t5
    mov               t5, t4
    mov               t4, r10
    add             dstq, strideq
    ret
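; sgr_filter_mix runs the 5x5 and 3x3 box filters together: per
; pixel, the two (b - a * src) terms are interleaved as words with
; pblendw and weighted in a single pmaddwd against the packed w0/w1
; pair, which is pre-shifted by 2 so that pd_m4096 can double as the
; rounding offset (see the psllw in the prologue below).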
cglobal sgr_filter_mix_8bpc, 4, 12, 16, 400*56+8, dst, stride, left, lpf, \
                                                  w, h, edge, params
    mov          paramsq, r6mp
    mov               wd, wm
    movifnidn         hd, hm
    mov            edged, r7m
    vbroadcasti128    m9, [sgr_shuf+0]
    vbroadcasti128   m10, [sgr_shuf+8]
    add             lpfq, wq
    vbroadcasti128   m11, [sgr_shuf+2]
    vbroadcasti128   m12, [sgr_shuf+6]
    add             dstq, wq
    vpbroadcastd     m15, [paramsq+8] ; w0 w1
    lea               t3, [rsp+wq*4+400*24+8]
    vpbroadcastd     m13, [paramsq+0] ; s0
    pxor              m7, m7
    vpbroadcastd     m14, [paramsq+4] ; s1
    lea               t1, [rsp+wq*2+12]
    neg               wq
    psllw            m15, 2 ; to reuse existing pd_m4096 register for rounding
    test           edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add             lpfq, strideq
    mov               t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
    add               t1, 400*12
    call .h_top
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    add              r10, strideq
    mov            [rsp], r10 ; below
    call .hv0
.main:
    dec               hd
    jz .height1
    add             lpfq, strideq
    call .hv1
    call .prep_n
    sub               hd, 2
    jl .extend_bottom
.main_loop:
    add             lpfq, strideq
    call .hv0
    test              hd, hd
    jz .odd_height
    add             lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub               hd, 2
    jge .main_loop
    test           edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov             lpfq, [rsp]
    call .hv0_bottom
    add             lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea              r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea              r10, [r10+strideq*2]
    mov            [rsp], r10
    call .h
    lea               t2, [t1+400*12]
    lea              r10, [wq-2]
.top_fixup_loop:
    mova              m0, [t1+r10*2+400* 0]
    mova              m1, [t1+r10*2+400* 2]
    mova              m2, [t1+r10*2+400* 4]
    paddw             m0, m0
    mova              m3, [t1+r10*2+400* 6]
    paddd             m1, m1
    mova              m4, [t1+r10*2+400* 8]
    paddd             m2, m2
    mova              m5, [t1+r10*2+400*10]
    mova [t2+r10*2+400* 0], m0
    mova [t2+r10*2+400* 2], m1
    mova [t2+r10*2+400* 4], m2
    mova [t2+r10*2+400* 6], m3
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    add              r10, 16
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsums
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd     xm0, [leftq]
    mova             xm5, [lpfq+wq]
    palignr          xm5, xm0, 12
    add            leftq, 4
    jmp .h_main
.h_extend_left:
    mova             xm5, [lpfq+wq]
    pshufb           xm5, [sgr_l_shuf]
    jmp .h_main
.h_top:
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu             xm5, [lpfq+r10-2]
.h_main:
    vinserti128       m5, [lpfq+r10+6], 1
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -18
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
    pshufb            m6, m5, m9
    pshufb            m4, m5, m10
    paddw             m8, m6, m4
    shufps            m0, m6, m4, q2121
    pmullw            m3, m0, m0
    pshufb            m2, m5, m11
    paddw             m0, m2
    pshufb            m5, m12
    paddw             m0, m5 ; sum3
    punpcklwd         m1, m2, m5
    pmaddwd           m1, m1
    punpckhwd         m2, m5
    pmaddwd           m2, m2
    punpcklwd         m5, m6, m4
    pmaddwd           m5, m5
    punpckhwd         m6, m4
    pmaddwd           m6, m6
    punpcklwd         m4, m3, m7
    paddd             m1, m4 ; sumsq3
    punpckhwd         m3, m7
    paddd             m2, m3
    mova [t1+r10*2+400* 6], m0
    mova [t1+r10*2+400* 8], m1
    mova [t1+r10*2+400*10], m2
    paddw             m8, m0 ; sum5
    paddd             m5, m1 ; sumsq5
    paddd             m6, m2
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    add              r10, 16
    jl .h_loop
    ret
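; Even rows (.hv0) only produce the ab values for the 3x3 filter and
; accumulate the 5x5 box sums; odd rows (.hv1) complete the two-row
; 5x5 sums and produce both the ab3 and ab5 values.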
ALIGN function_align
.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastd     xm0, [leftq]
    mova             xm5, [lpfq+wq]
    palignr          xm5, xm0, 12
    add            leftq, 4
    jmp .hv0_main
.hv0_extend_left:
    mova             xm5, [lpfq+wq]
    pshufb           xm5, [sgr_l_shuf]
    jmp .hv0_main
.hv0_bottom:
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu             xm5, [lpfq+r10-2]
.hv0_main:
    vinserti128       m5, [lpfq+r10+6], 1
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp             r10d, -18
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv0_have_right:
    pshufb            m6, m5, m9
    pshufb            m4, m5, m10
    paddw             m8, m6, m4
    shufps            m1, m6, m4, q2121
    pmullw            m0, m1, m1
    pshufb            m3, m5, m11
    paddw             m1, m3
    pshufb            m5, m12
    paddw             m1, m5 ; sum3
    punpcklwd         m2, m3, m5
    pmaddwd           m2, m2
    punpckhwd         m3, m5
    pmaddwd           m3, m3
    punpcklwd         m5, m6, m4
    pmaddwd           m5, m5
    punpckhwd         m6, m4
    pmaddwd           m6, m6
    punpcklwd         m4, m0, m7
    paddd             m2, m4 ; sumsq3
    punpckhwd         m0, m7
    paddd             m3, m0
    paddw             m8, m1 ; sum5
    paddd             m5, m2 ; sumsq5
    paddd             m6, m3
    mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
    mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*4+400*0+40], m6
    paddw             m8, [t1+r10*2+400* 0]
    paddd             m5, [t1+r10*2+400* 2]
    paddd             m6, [t1+r10*2+400* 4]
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    paddw             m0, m1, [t1+r10*2+400* 6]
    paddd             m4, m2, [t1+r10*2+400* 8]
    paddd             m5, m3, [t1+r10*2+400*10]
    mova [t1+r10*2+400* 6], m1
    mova [t1+r10*2+400* 8], m2
    mova [t1+r10*2+400*10], m3
    paddw             m1, m0, [t2+r10*2+400* 6]
    paddd             m2, m4, [t2+r10*2+400* 8]
    paddd             m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    vpbroadcastd      m8, [pw_455_24]
    punpcklwd         m0, m1, m7 ; b3
    vbroadcastss      m6, [pf_256]
    punpckhwd         m1, m7
    pslld             m4, m2, 3
    pslld             m5, m3, 3
    paddd             m4, m2 ; a3 * 9
    pmaddwd           m2, m0, m0 ; b3 * b3
    paddd             m5, m3
    pmaddwd           m3, m1, m1
    psubd             m4, m2 ; p3
    psubd             m5, m3
    pmulld            m4, m14 ; p3 * s1
    pmulld            m5, m14
    pmaddwd           m0, m8 ; b3 * 455
    pmaddwd           m1, m8
    paddw             m4, m8
    paddw             m5, m8
    vpbroadcastd      m8, [pd_34816]
    psrld             m4, 20 ; z3 + 1
    psrld             m5, 20
    cvtdq2ps          m4, m4
    cvtdq2ps          m5, m5
    rcpps             m2, m4 ; 1 / (z3 + 1)
    rcpps             m3, m5
    pcmpgtd           m4, m6, m4
    pcmpgtd           m5, m6, m5
    mulps             m2, m6 ; 256 / (z3 + 1)
    mulps             m3, m6
    vpbroadcastd      m6, [pd_m4096]
    psrld             m4, 24 ; z3 < 255 ? 255 : 0
    psrld             m5, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m4 ; x3
    pminsw            m3, m5
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m8 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd             m1, m8
    pand              m0, m6
    pand              m1, m6
    por               m0, m2 ; a3 | (b3 << 12)
    por               m1, m3
    mova [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add              r10, 16
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastd     xm0, [leftq]
    mova             xm5, [lpfq+wq]
    palignr          xm5, xm0, 12
    add            leftq, 4
    jmp .hv1_main
.hv1_extend_left:
    mova             xm5, [lpfq+wq]
    pshufb           xm5, [sgr_l_shuf]
    jmp .hv1_main
.hv1_bottom:
    lea              r10, [wq-2]
    test           edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu             xm5, [lpfq+r10-2]
.hv1_main:
    vinserti128       m5, [lpfq+r10+6], 1
    test           edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp             r10d, -18
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv1_have_right:
    pshufb            m6, m5, m9
    pshufb            m3, m5, m10
    paddw             m8, m6, m3
    shufps            m2, m6, m3, q2121
    pmullw            m1, m2, m2
    pshufb            m0, m5, m11
    paddw             m2, m0
    pshufb            m5, m12
    paddw             m2, m5 ; sum3
    punpcklwd         m4, m5, m0
    pmaddwd           m4, m4
    punpckhwd         m5, m0
    pmaddwd           m5, m5
    punpcklwd         m0, m6, m3
    pmaddwd           m0, m0
    punpckhwd         m6, m3
    pmaddwd           m6, m6
    punpcklwd         m3, m1, m7
    paddd             m4, m3 ; sumsq3
    punpckhwd         m1, m7
    paddd             m5, m1
    paddw             m1, m2, [t2+r10*2+400* 6]
    mova [t2+r10*2+400* 6], m2
    paddw             m8, m2 ; sum5
    paddd             m2, m4, [t2+r10*2+400* 8]
    paddd             m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    vpbroadcastd      m9, [pw_455_24]
    paddd             m4, m0 ; sumsq5
    paddd             m5, m6
    punpcklwd         m0, m1, m7 ; b3
    punpckhwd         m1, m7
    pslld             m6, m2, 3
    pslld             m7, m3, 3
    paddd             m6, m2 ; a3 * 9
    pmaddwd           m2, m0, m0 ; b3 * b3
    paddd             m7, m3
    pmaddwd           m3, m1, m1
    psubd             m6, m2 ; p3
    psubd             m7, m3
    pmulld            m6, m14 ; p3 * s1
    pmulld            m7, m14
    pmaddwd           m0, m9 ; b3 * 455
    pmaddwd           m1, m9
    paddw             m6, m9
    paddw             m7, m9
    vbroadcastss      m9, [pf_256]
    psrld             m6, 20 ; z3 + 1
    psrld             m7, 20
    cvtdq2ps          m6, m6
    cvtdq2ps          m7, m7
    rcpps             m2, m6 ; 1 / (z3 + 1)
    rcpps             m3, m7
    pcmpgtd           m6, m9, m6
    pcmpgtd           m7, m9, m7
    mulps             m2, m9 ; 256 / (z3 + 1)
    mulps             m3, m9
    vpbroadcastd      m9, [pd_34816]
    psrld             m6, 24 ; z3 < 255 ? 255 : 0
    psrld             m7, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m6 ; x3
    vpbroadcastd      m6, [pd_m4096]
    pminsw            m3, m7
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m9 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd             m1, m9
    pand              m0, m6
    pand              m7, m6, m1
    por               m0, m2 ; a3 | (b3 << 12)
    por               m7, m3
    paddw             m1, m8, [t2+r10*2+400*0]
    paddd             m2, m4, [t2+r10*2+400*2]
    paddd             m3, m5, [t2+r10*2+400*4]
    paddw             m1, [t1+r10*2+400*0]
    paddd             m2, [t1+r10*2+400*2]
    paddd             m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m8
    mova [t2+r10*2+400*2], m4
    mova [t2+r10*2+400*4], m5
    mova [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova [t3+r10*4+400*8+24], xm7
    vextracti128 [t3+r10*4+400*8+56], m7, 1
    vpbroadcastd      m4, [pd_25]
    pxor              m7, m7
    vpbroadcastd      m8, [pw_164_24]
    punpcklwd         m0, m1, m7 ; b5
    punpckhwd         m1, m7
    pmulld            m2, m4 ; a5 * 25
    pmulld            m3, m4
    pmaddwd           m4, m0, m0 ; b5 * b5
    pmaddwd           m5, m1, m1
    psubd             m2, m4 ; p5
    psubd             m3, m5
    pmulld            m2, m13 ; p5 * s0
    pmulld            m3, m13
    pmaddwd           m0, m8 ; b5 * 164
    pmaddwd           m1, m8
    paddw             m2, m8
    paddw             m3, m8
    vbroadcastss      m8, [pf_256]
    psrld             m2, 20 ; z5 + 1
    psrld             m3, 20
    cvtdq2ps          m2, m2
    cvtdq2ps          m3, m3
    rcpps             m4, m2 ; 1 / (z5 + 1)
    rcpps             m5, m3
    pcmpgtd           m2, m8, m2
    pcmpgtd           m3, m8, m3
    mulps             m4, m8 ; 256 / (z5 + 1)
    mulps             m5, m8
    psrld             m2, 24 ; z5 < 255 ? 255 : 0
    psrld             m3, 24
    cvtps2dq          m4, m4
    cvtps2dq          m5, m5
    pminsw            m4, m2 ; x5
    pminsw            m5, m3
    pmulld            m0, m4
    pmulld            m1, m5
    paddd             m0, m9 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd             m1, m9
    vbroadcasti128    m9, [sgr_shuf]
    pand              m0, m6
    pand              m1, m6
    por               m0, m4 ; a5 | (b5 << 12)
    por               m1, m5
    mova [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add              r10, 16
    jl .hv1_loop
    mov              r10, t2
    mov               t2, t1
    mov               t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea              r10, [wq-2]
    vpbroadcastd      m6, [pd_34816]
.v0_loop:
    mova              m0, [t1+r10*2+400* 6]
    mova              m4, [t1+r10*2+400* 8]
    mova              m5, [t1+r10*2+400*10]
    paddw             m0, m0
    paddd             m4, m4
    paddd             m5, m5
    paddw             m1, m0, [t2+r10*2+400* 6]
    paddd             m2, m4, [t2+r10*2+400* 8]
    paddd             m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    vpbroadcastd      m8, [pw_455_24]
    punpcklwd         m0, m1, m7 ; b3
    punpckhwd         m1, m7
    pslld             m4, m2, 3
    pslld             m5, m3, 3
    paddd             m4, m2 ; a3 * 9
    pmaddwd           m2, m0, m0 ; b3 * b3
    paddd             m5, m3
    pmaddwd           m3, m1, m1
    psubd             m4, m2 ; p3
    psubd             m5, m3
    pmulld            m4, m14 ; p3 * s1
    pmulld            m5, m14
    pmaddwd           m0, m8 ; b3 * 455
    pmaddwd           m1, m8
    paddw             m4, m8
    paddw             m5, m8
    vbroadcastss      m8, [pf_256]
    psrld             m4, 20 ; z3 + 1
    psrld             m5, 20
    cvtdq2ps          m4, m4
    cvtdq2ps          m5, m5
    rcpps             m2, m4 ; 1 / (z3 + 1)
    rcpps             m3, m5
    pcmpgtd           m4, m8, m4
    pcmpgtd           m5, m8, m5
    mulps             m2, m8 ; 256 / (z3 + 1)
    mulps             m3, m8
    vpbroadcastd      m8, [pd_m4096]
    psrld             m4, 24 ; z3 < 255 ? 255 : 0
    psrld             m5, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m4 ; x3
    pminsw            m3, m5
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd             m1, m6
    pand              m0, m8
    pand              m1, m8
    por               m0, m2 ; a3 | (b3 << 12)
    por               m1, m3
    mova              m2, [t1+r10*2+400*0]
    mova              m3, [t1+r10*2+400*2]
    mova              m4, [t1+r10*2+400*4]
    mova [t3+r10*4+400*8+ 8], m2
    mova [t3+r10*4+400*0+ 8], m3
    mova [t3+r10*4+400*0+40], m4
    paddw             m2, m2 ; cc5
    paddd             m3, m3
    paddd             m4, m4
    mova [t1+r10*2+400*0], m2
    mova [t1+r10*2+400*2], m3
    mova [t1+r10*2+400*4], m4
    mova [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add              r10, 16
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea              r10, [wq-2]
.v1_loop:
    mova              m4, [t1+r10*2+400* 6]
    mova              m5, [t1+r10*2+400* 8]
    mova              m6, [t1+r10*2+400*10]
    paddw             m1, m4, [t2+r10*2+400* 6]
    paddd             m2, m5, [t2+r10*2+400* 8]
    paddd             m3, m6, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m4
    mova [t2+r10*2+400* 8], m5
    mova [t2+r10*2+400*10], m6
    vpbroadcastd      m8, [pw_455_24]
    punpcklwd         m0, m1, m7 ; b3
    punpckhwd         m1, m7
    pslld             m4, m2, 3
    pslld             m5, m3, 3
    paddd             m4, m2 ; a3 * 9
    pmaddwd           m2, m0, m0 ; b3 * b3
    paddd             m5, m3
    pmaddwd           m3, m1, m1
    psubd             m4, m2 ; p3
    psubd             m5, m3
    pmulld            m4, m14 ; p3 * s1
    pmulld            m5, m14
    pmaddwd           m0, m8 ; b3 * 455
    pmaddwd           m1, m8
    paddw             m4, m8
    paddw             m5, m8
    vbroadcastss      m8, [pf_256]
    psrld             m4, 20 ; z3 + 1
    psrld             m5, 20
    cvtdq2ps          m4, m4
    cvtdq2ps          m5, m5
    rcpps             m2, m4 ; 1 / (z3 + 1)
    rcpps             m3, m5
    pcmpgtd           m4, m8, m4
    pcmpgtd           m5, m8, m5
    mulps             m2, m8 ; 256 / (z3 + 1)
    mulps             m3, m8
    vpbroadcastd      m8, [pd_m4096]
    psrld             m4, 24 ; z3 < 255 ? 255 : 0
    psrld             m5, 24
    cvtps2dq          m2, m2
    cvtps2dq          m3, m3
    pminsw            m2, m4 ; x3
    vpbroadcastd      m4, [pd_34816]
    pminsw            m3, m5
    pmulld            m0, m2
    pmulld            m1, m3
    paddd             m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd             m1, m4
    pand              m0, m8
    pand              m8, m1
    por               m0, m2 ; a3 | (b3 << 12)
    por               m8, m3
    mova              m4, [t3+r10*4+400*8+ 8]
    mova              m5, [t3+r10*4+400*0+ 8]
    mova              m6, [t3+r10*4+400*0+40]
    paddw             m1, m4, [t2+r10*2+400*0]
    paddd             m2, m5, [t2+r10*2+400*2]
    paddd             m3, m6, [t2+r10*2+400*4]
    paddw             m1, [t1+r10*2+400*0]
    paddd             m2, [t1+r10*2+400*2]
    paddd             m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m4
    mova [t2+r10*2+400*2], m5
    mova [t2+r10*2+400*4], m6
    vpbroadcastd      m4, [pd_25]
    mova [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova [t3+r10*4+400*8+24], xm8
    vextracti128 [t3+r10*4+400*8+56], m8, 1
    vpbroadcastd      m8, [pw_164_24]
    punpcklwd         m0, m1, m7 ; b5
    vbroadcastss      m6, [pf_256]
    punpckhwd         m1, m7
    pmulld            m2, m4 ; a5 * 25
    pmulld            m3, m4
    pmaddwd           m4, m0, m0 ; b5 * b5
    pmaddwd           m5, m1, m1
    psubd             m2, m4 ; p5
    psubd             m3, m5
    pmulld            m2, m13 ; p5 * s0
    pmulld            m3, m13
    pmaddwd           m0, m8 ; b5 * 164
    pmaddwd           m1, m8
    paddw             m2, m8
    paddw             m3, m8
    vpbroadcastd      m8, [pd_34816]
    psrld             m2, 20 ; z5 + 1
    psrld             m3, 20
    cvtdq2ps          m2, m2
    cvtdq2ps          m3, m3
    rcpps             m4, m2 ; 1 / (z5 + 1)
    rcpps             m5, m3
    pcmpgtd           m2, m6, m2
    pcmpgtd           m3, m6, m3
    mulps             m4, m6 ; 256 / (z5 + 1)
    mulps             m5, m6
    vpbroadcastd      m6, [pd_m4096]
    psrld             m2, 24 ; z5 < 255 ? 255 : 0
    psrld             m3, 24
    cvtps2dq          m4, m4
    cvtps2dq          m5, m5
    pminsw            m4, m2 ; x5
    pminsw            m5, m3
    pmulld            m0, m4
    pmulld            m1, m5
    paddd             m0, m8 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd             m1, m8
    pand              m0, m6
    pand              m1, m6
    por               m0, m4 ; a5 | (b5 << 12)
    por               m1, m5
    mova [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add              r10, 16
    jl .v1_loop
    mov              r10, t2
    mov               t2, t1
    mov               t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov              r10, wq
.prep_n_loop:
    movu              m0, [t3+r10*4+400*0+4]
    paddd             m1, m0, [t3+r10*4+400*0+0]
    mova              m4, [t3+r10*4+400*4+0]
    paddd             m1, [t3+r10*4+400*0+8]
    mova              m5, [t3+r10*4+400*8+0]
    paddd             m4, [t3+r10*4+400*4+8]
    paddd             m5, [t3+r10*4+400*8+8]
    paddd             m2, m4, [t3+r10*4+400*4+4]
    paddd             m3, m5, [t3+r10*4+400*8+4]
    paddd             m0, m1
    pslld             m1, 2
    pslld             m2, 2
    paddd             m1, m0 ; ab5 565
    paddd             m3, m3 ; ab3[ 0] 222
    psubd             m2, m4 ; ab3[-1] 343
    mova [t3+r10*4+400*20], m3
    pandn             m0, m6, m1 ; a5 565
    mova [t3+r10*4+400*24], m2
    psrld             m1, 12 ; b5 565
    mova [t3+r10*4+400*12], m0
    paddd             m3, m3
    mova [t3+r10*4+400*16], m1
    psubd             m3, m5 ; ab3[ 0] 343
    mova [t3+r10*4+400*28], m3
    add              r10, 8
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov              r10, wq
.n0_loop:
    movu              m0, [t3+r10*4+4]
    paddd             m4, m0, [t3+r10*4+0]
    paddd             m4, [t3+r10*4+8]
    paddd             m0, m4
    pslld             m4, 2
    paddd             m4, m0
    pandn             m0, m6, m4
    psrld             m4, 12
    paddd             m2, m0, [t3+r10*4+400*12] ; a5
    mova [t3+r10*4+400*12], m0
    paddd             m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
    mova [t3+r10*4+400*16], m4
    mova              m3, [t3+r10*4+400*4+0]
    paddd             m3, [t3+r10*4+400*4+8]
    paddd             m5, m3, [t3+r10*4+400*4+4]
    paddd             m5, m5 ; ab3[ 1] 222
    mova              m4, [t3+r10*4+400*20]
    paddd             m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd             m5, m5
    psubd             m5, m3 ; ab3[ 1] 343
    mova [t3+r10*4+400*24], m5
    paddd             m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
    pandn             m3, m6, m1
    psrld             m1, 12
    pandn             m5, m6, m4
    psrld             m4, 12
    paddd             m3, m5 ; a3
    paddd             m1, m4 ; b3 + (1 << 8)
    pmovzxbd          m4, [dstq+r10]
    pmaddwd           m2, m4 ; a5 * src
    pmaddwd           m3, m4 ; a3 * src
    psubd             m0, m2 ; b5 - a5 * src + (1 << 8)
    psubd             m1, m3 ; b3 - a3 * src + (1 << 8)
    psrld             m0, 9
    pslld             m1, 7
    pblendw           m0, m1, 0xaa
    pmaddwd           m0, m15
    psubd             m0, m6
    psrad             m0, 13
    paddd             m0, m4
    vextracti128     xm1, m0, 1
    packssdw         xm0, xm1
    packuswb         xm0, xm0
    movq      [dstq+r10], xm0
    add              r10, 8
    jl .n0_loop
    add             dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov              r10, wq
.n1_loop:
    mova              m3, [t3+r10*4+400*8+0]
    paddd             m3, [t3+r10*4+400*8+8]
    paddd             m5, m3, [t3+r10*4+400*8+4]
    paddd             m5, m5 ; ab3[ 1] 222
    mova              m4, [t3+r10*4+400*20]
    paddd             m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd             m5, m5
    psubd             m5, m3 ; ab3[ 1] 343
    mova [t3+r10*4+400*28], m5
    paddd             m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
    pandn             m3, m6, m1
    psrld             m1, 12
    pandn             m5, m6, m4
    psrld             m4, 12
    paddd             m3, m5 ; -a3
    paddd             m1, m4 ; b3 + (1 << 8)
    pmovzxbd          m4, [dstq+r10]
    pmaddwd           m2, m4, [t3+r10*4+400*12] ; -a5 * src
    mova              m0, [t3+r10*4+400*16] ; b5 + (1 << 7)
    pmaddwd           m3, m4 ; -a3 * src
    psubd             m0, m2 ; a5 * src + b5 + (1 << 7)
    psubd             m1, m3 ; a3 * src + b3 + (1 << 8)
    psrld             m0, 8
    pslld             m1, 7
    pblendw           m0, m1, 0xaa
    pmaddwd           m0, m15
    psubd             m0, m6
    psrad             m0, 13
    paddd             m0, m4
    vextracti128     xm1, m0, 1
    packssdw         xm0, xm1
    packuswb         xm0, xm0
    movq      [dstq+r10], xm0
    add              r10, 8
    jl .n1_loop
    add             dstq, strideq
    ret

%endif ; ARCH_X86_64