; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
wiener_lshuf5: db  4,  5,  4,  5,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db  8,  9,  8,  9,  8,  9,  8,  9,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15

wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

pb_m10_m9:  times 2 db -10, -9
pb_m6_m5:   times 2 db  -6, -5
pb_m2_m1:   times 2 db  -2, -1
pb_2_3:     times 2 db   2,  3
pb_6_7:     times 2 db   6,  7
pw_1023:    times 2 dw 1023
pw_164_24:  dw 164, 24
pw_455_24:  dw 455, 24
pd_8:       dd 8
pd_25:      dd 25
pd_4096:    dd 4096
pd_34816:   dd 34816
pd_m262128: dd -262128
pf_256:     dd 256.0

%define pw_256 sgr_lshuf5

cextern pb_0to63

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers

INIT_YMM avx2
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd
    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           lpfq, wq
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    add           dstq, wq
    vbroadcasti128  m8, [wiener_shufC]
    lea             t1, [rsp+wq+16]
    vbroadcasti128  m9, [wiener_shufD]
    neg             wq
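    ; t3 = pixel_max >> 11 is 0 for 10 bpc (pixel_max 1023) and 1 for 12 bpc
    ; (pixel_max 4095); it indexes the per-bitdepth entries of the
    ; wiener_hshift/wiener_round/wiener_vshift tables so the shift amounts
    ; in the filter loops can stay constant, with the filter coefs
    ; pre-scaled below to compensate.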
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd   m10, [base+wiener_round+t3*4]
    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
    pmullw         m12, m0 ; upshift filter coefs to make the
    pmullw         m13, m0 ; horizontal downshift constant
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.extend_right:
    movd           xm1, r10d
    vpbroadcastd    m0, [pb_6_7]
    mova            m2, [pb_0to63]
    vpbroadcastb    m1, xm1
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    vpbroadcastd    m0, [pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    vpbroadcastd    m0, [pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    ret
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    shufpd          m3, m4, 0x05
    pshufb          m3, [wiener_lshuf7]
    jmp .h_main2
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
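; .hv filters one new row horizontally (stored via t0) and combines it with
; the six previously filtered rows in t1-t6 to produce the vertical 7-tap
; output for the centered row; the t0-t6 pointers are then rotated so no
; row data has to be copied.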
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-8]
    pshufb          m3, [wiener_lshuf7]
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128]
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10]
    mova      [t0+r10], m0
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 5
    psrad           m2, 5
    packusdw        m0, m2
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m1, [t4+r10]
    paddw           m1, [t2+r10]
    mova            m2, [t3+r10]
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
    ret

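; 5-tap variant of the 7-tap filter above: the outermost taps (x0/y0) are
; zero, so only x1-x3/y1-y3 are loaded and the ring buffer shrinks to five
; rows (t0-t4).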
cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
    vpbroadcastw   m11, [fltq+ 2] ; x1
    vbroadcasti128  m6, [wiener_shufB]
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufD]
    add             wd, wd
    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    vpbroadcastw   m13, [fltq+18] ; y1
    add           dstq, wq
    vpbroadcastd   m14, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq+16]
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd    m9, [base+wiener_round+t3*4]
    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
    mova           m15, [wiener_lshuf5]
    pmullw         m11, m0
    pmullw         m12, m0
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
.v1:
    call .v
    jmp .end
.extend_right:
    movd           xm2, r10d
    vpbroadcastd    m0, [pb_2_3]
    vpbroadcastd    m1, [pb_m6_m5]
    vpbroadcastb    m2, xm2
    psubb           m0, m2
    psubb           m1, m2
    mova            m2, [pb_0to63]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m4, [lpfq+r10] ; avoid accessing memory located
    mova            m3, [lpfq+r10] ; before the start of the buffer
    palignr         m3, m4, 12
    pshufb          m3, m15
    jmp .h_main
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-4]
.h_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-4]
    pshufb          m3, m15
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-4]
.hv_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    mova            m2, [t3+r10]
    paddw           m2, [t1+r10]
    paddd           m1, m3
    mova            m4, [t2+r10]
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
    mova            m4, [t4+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova      [t0+r10], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    add           dstq, strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m0, [t1+r10]
    paddw           m2, m0, [t3+r10]
    mova            m1, [t2+r10]
    mova            m4, [t4+r10]
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    ret

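; self-guided restoration, 5x5 box: per pixel, box sums of pixels (b) and
; of squared pixels (a) are built, a surrogate p = max(a*25 - b*b, 0) and
; z = (p*s0 + rounding) >> 20 are derived, x = 256/(z+1) is clipped to 255,
; and the output blends dst with the neighbor-weighted
; (x*b*164 + (1 << 11) + (1 << 15)) >> 12 estimate.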
cglobal sgr_filter_5x5_16bpc, 4, 14, 16, 400*24+16, dst, stride, left, lpf, \
                                                    w, h, edge, params
%define base r13-pb_m10_m9
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]
    movifnidn       hd, hm
    mov          edged, r7m
    vpbroadcastw    m7, [paramsq+8] ; w0
    add             wd, wd
    vpbroadcastd    m8, [base+pd_8]
    add           lpfq, wq
    vpbroadcastd    m9, [base+pd_25]
    add           dstq, wq
    mova          xm10, [base+sgr_lshuf5]
    lea             t3, [rsp+wq*2+400*12+16]
    vpbroadcastd   m11, [paramsq+0] ; s0
    lea             t4, [rsp+wq+400*20+16]
    vpbroadcastd   m12, [base+pw_164_24]
    lea             t1, [rsp+wq+20]
    vbroadcastss   m13, [base+pf_256]
    neg             wq
    vpbroadcastd   m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
    pxor            m6, m6
    vpbroadcastd   m15, [base+pw_1023]
    psllw           m7, 4
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call .top_fixup
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    mov             t0, t2
    dec             hd
    jz .height1
    or           edged, 16
    call .h
.main:
    add           lpfq, strideq
    call .hv
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    test            hd, hd
    jz .odd_height
    call .h
    add           lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .h_top
    add           lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea             t2, [t1+400*6]
    call .top_fixup
    dec             hd
    jz .no_top_height1
    or           edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 0]
    movu            m2, [r13+r10+16]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
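; scratch row layout: 16-bit box sums live at t+400*0, and the 32-bit sums
; of squares occupy two planes at t+400*2 and t+400*4 (in punpckl/hwd lane
; order); 400 words cover a padded 384-wide unit plus overread room.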
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m2, m5, m4, 2
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121
    paddw           m0, m4 ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test         edgeb, 16 ; y > 0
    jz .h_loop_end
    paddw           m0, [t1+r10+400*0]
    paddd           m1, [t1+r10+400*2]
    paddd           m2, [t1+r10+400*4]
.h_loop_end:
    paddd           m1, m5 ; sumsq
    paddd           m2, m4
    mova [t1+r10+400*0], m0
    mova [t1+r10+400*2], m1
    mova [t1+r10+400*4], m2
    add            r10, 32
    jl .h_loop
    ret
.top_fixup:
    lea            r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv_main
.hv_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10- 2]
.hv_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4 ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5 ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    test            hd, hd
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+400*0] ; hv sum
    paddd           m4, [t2+r10+400*2] ; hv sumsq
    paddd           m5, [t2+r10+400*4]
    mova [t0+r10+400*0], m0
    mova [t0+r10+400*2], m2
    mova [t0+r10+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6 ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4 ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9 ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2 ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2 ; p
    psubd           m5, m3
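    ; z+1 = (p*s + (1 << 19) + (1 << 20) + 164) >> 20; pw_164_24 doubles as
    ; the rounding term here and as the pmaddwd multiplier for b * 164.
    ; x = 256/(z+1) is approximated with a float rcpps; positive IEEE-754
    ; floats compare like integers, so pcmpgtd against 256.0 produces the
    ; z < 255 clamp mask directly.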
    pmulld          m4, m11 ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12 ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13 ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova     [t4+r10+4], m2
    psrld           m0, 12 ; b
    psrld           m1, 12
    mova [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10+400*0], m1
    paddw           m1, m0
    mova [t1+r10+400*2], m4
    paddd           m4, m2
    mova [t1+r10+400*4], m5
    paddd           m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea            r10, [wq-4]
.v_loop:
    mova            m0, [t1+r10+400*0]
    mova            m2, [t1+r10+400*2]
    mova            m3, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0 ; hv sum
    paddd           m4, m2 ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6 ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4 ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9 ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2 ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2 ; p
    psubd           m5, m3
    pmulld          m4, m11 ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12 ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13 ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova     [t4+r10+4], m2
    psrld           m0, 12 ; b
    psrld           m1, 12
    mova [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .v_loop
    ret
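; a/b rows are produced for every second image row; horizontally each one
; is smoothed here with 5,6,5 weights (6*c + 5*(l+r), built as 2*c+(l+c+r)
; plus 4*(l+c+r)), and even output rows then add the fresh and stored row
; (>> 9) while odd rows reuse the stored row alone (>> 8).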
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3 ; a 565
    paddd           m1, m4 ; b 565
    paddd           m2, m5
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    add            r10, 32
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3 ; a 565
    paddd           m1, m4 ; b 565
    paddd           m2, m5
    paddw           m3, m0, [t4+r10*1+400*2+ 0]
    paddd           m4, m1, [t3+r10*2+400*4+ 0]
    paddd           m5, m2, [t3+r10*2+400*4+32]
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6 ; src
    punpcklwd       m2, m3, m6 ; a
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2 ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m15
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+400*2+ 0]
    mova            m4, [t3+r10*2+400*4+ 0]
    mova            m5, [t3+r10*2+400*4+32]
    punpcklwd       m1, m0, m6 ; src
    punpcklwd       m2, m3, m6 ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2 ; b - a * src + (1 << 7)
    psubd           m4, m3
    psrad           m1, 8
    psrad           m4, 8
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m15
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

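; self-guided restoration, 3x3 box: same scheme as the 5x5 filter above,
; but with a 9-pixel box (a*9, b*455, strength s1) and rows processed as
; even/odd pairs (.hv0/.hv1).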
cglobal sgr_filter_3x3_16bpc, 4, 14, 15, 400*42+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]
    add             wd, wd
    movifnidn       hd, hm
    mov          edged, r7m
    vpbroadcastw    m7, [paramsq+10] ; w1
    add           lpfq, wq
    vpbroadcastd    m8, [base+pd_8]
    add           dstq, wq
    vpbroadcastd    m9, [paramsq+ 4] ; s1
    lea             t3, [rsp+wq*2+400*12+8]
    mova          xm10, [base+sgr_lshuf3]
    lea             t4, [rsp+wq+400*32+8]
    vpbroadcastd   m11, [base+pw_455_24]
    lea             t1, [rsp+wq+12]
    vbroadcastss   m12, [base+pf_256]
    neg             wq
    vpbroadcastd   m13, [base+pd_34816]
    pxor            m6, m6
    vpbroadcastd   m14, [base+pw_1023]
    psllw           m7, 4
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*6]
.top_fixup_loop:
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 2]
    movu            m2, [r13+r10+18]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10+ 0]
.h_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5 ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4 ; sumsq
    paddd           m3, m5
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    add            r10, 32
    jl .h_loop
    ret
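; rows are handled in pairs: .hv0 folds the new even row into the one- and
; two-row running sums, .hv1 completes the three-row box for the following
; odd row and swaps the t1/t2 row buffers afterwards.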
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10+ 0]
.hv0_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -34
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5 ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4 ; sumsq
    paddd           m3, m5
    paddw           m0, m1, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11 ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12 ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova [t4+r10*1+400*0+ 4], m2
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10+ 0]
.hv1_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -34
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5 ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1 ; h sumsq
    paddd           m3, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m2
    mova [t2+r10+400*4], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4 ; (a + 8) >> 4
    psrld           m5, 4
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11 ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12 ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11 ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12 ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova [t4+r10*1+400*0+ 4], m2
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11 ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12 ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
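; neighbor weighting for the 3x3 box uses 3,4,3 per row: 444 = 4*(l+c+r)
; and 343 = 3*l + 4*c + 3*r are cached per a/b row, and each output row
; sums 343[-1] + 444[0] + 343[+1] (total weight 32, hence the >> 9).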
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    mova           xm0, [t4+r10*1+400*0+0]
    paddw          xm0, [t4+r10*1+400*0+4]
    paddw          xm2, xm0, [t4+r10*1+400*0+2]
    mova            m1, [t3+r10*2+400*0+0]
    paddd           m1, [t3+r10*2+400*0+8]
    paddd           m3, m1, [t3+r10*2+400*0+4]
    psllw          xm2, 2 ; a[-1] 444
    pslld           m3, 2 ; b[-1] 444
    psubw          xm2, xm0 ; a[-1] 343
    psubd           m3, m1 ; b[-1] 343
    mova [t4+r10*1+400* 4], xm2
    mova [t3+r10*2+400* 8], m3
    mova           xm0, [t4+r10*1+400*2+0]
    paddw          xm0, [t4+r10*1+400*2+4]
    paddw          xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw          xm2, 2 ; a[ 0] 444
    pslld           m3, 2 ; b[ 0] 444
    mova [t4+r10*1+400* 6], xm2
    mova [t3+r10*2+400*12], m3
    psubw          xm2, xm0 ; a[ 0] 343
    psubd           m3, m1 ; b[ 0] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    add            r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    mova            m3, [t4+r10*1+400*0+0]
    paddw           m3, [t4+r10*1+400*0+4]
    paddw           m1, m3, [t4+r10*1+400*0+2]
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*4]
    paddw           m3, [t4+r10*1+400*6]
    mova [t4+r10*1+400*4], m2
    mova [t4+r10*1+400*6], m1
    mova            m4, [t3+r10*2+400*0+0]
    paddd           m4, [t3+r10*2+400*0+8]
    paddd           m1, m4, [t3+r10*2+400*0+4]
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
    paddd           m4, [t3+r10*2+400*12+ 0]
    mova [t3+r10*2+400* 8+ 0], m2
    mova [t3+r10*2+400*12+ 0], m1
    mova            m5, [t3+r10*2+400*0+32]
    paddd           m5, [t3+r10*2+400*0+40]
    paddd           m1, m5, [t3+r10*2+400*0+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400* 8+32]
    paddd           m5, [t3+r10*2+400*12+32]
    mova [t3+r10*2+400* 8+32], m2
    mova [t3+r10*2+400*12+32], m1
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2 ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+400*2+0]
    paddw           m3, [t4+r10*1+400*2+4]
    paddw           m1, m3, [t4+r10*1+400*2+2]
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*6]
    paddw           m3, [t4+r10*1+400*8]
    mova [t4+r10*1+400*6], m1
    mova [t4+r10*1+400*8], m2
    mova            m4, [t3+r10*2+400*4+0]
    paddd           m4, [t3+r10*2+400*4+8]
    paddd           m1, m4, [t3+r10*2+400*4+4]
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400*12+ 0]
    paddd           m4, [t3+r10*2+400*16+ 0]
    mova [t3+r10*2+400*12+ 0], m1
    mova [t3+r10*2+400*16+ 0], m2
    mova            m5, [t3+r10*2+400*4+32]
    paddd           m5, [t3+r10*2+400*4+40]
    paddd           m1, m5, [t3+r10*2+400*4+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400*12+32]
    paddd           m5, [t3+r10*2+400*16+32]
    mova [t3+r10*2+400*12+32], m1
    mova [t3+r10*2+400*16+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2 ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

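; combined 5x5 + 3x3 self-guided filter: both box sizes are computed per
; row group, and their corrections are blended with the w0/w1 weights
; packed into m15 (see the pmaddwd in .n0/.n1).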
cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]
    add             wd, wd
    movifnidn       hd, hm
    mov          edged, r7m
    add           lpfq, wq
    vpbroadcastd   m15, [paramsq+8] ; w0 w1
    add           dstq, wq
    vpbroadcastd   m13, [paramsq+0] ; s0
    lea             t3, [rsp+wq*2+400*24+8]
    vpbroadcastd   m14, [paramsq+4] ; s1
    lea             t4, [rsp+wq+400*52+8]
    vpbroadcastd    m9, [base+pd_8]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m10, [base+pd_34816]
    neg             wq
    vbroadcastss   m11, [base+pf_256]
    pxor            m7, m7
    vpbroadcastd   m12, [base+pw_455_24]
    psllw          m15, 2
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add             t1, 400*12
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+r10+400* 0]
    mova            m1, [t1+r10+400* 2]
    mova            m2, [t1+r10+400* 4]
    paddw           m0, m0
    mova            m3, [t1+r10+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+400*10]
    mova [t2+r10+400* 0], m0
    mova [t2+r10+400* 2], m1
    mova [t2+r10+400* 4], m2
    mova [t2+r10+400* 6], m3
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0 ; sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6 ; sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m8, m1 ; sum5
    paddd           m5, m2 ; sumsq5
    paddd           m6, m3
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10- 2]
.hv0_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -36
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0 ; h sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6 ; h sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    paddw           m8, m1 ; h sum5
    paddd           m5, m2 ; h sumsq5
    paddd           m6, m3
    mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
    mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*2+400*0+40], m6
    paddw           m8, [t1+r10+400* 0]
    paddd           m5, [t1+r10+400* 2]
    paddd           m6, [t1+r10+400* 4]
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    paddw           m0, m1, [t1+r10+400* 6]
    paddd           m4, m2, [t1+r10+400* 8]
    paddd           m5, m3, [t1+r10+400*10]
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7 ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2 ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20 ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11 ; 256 / (z3 + 1)
    mulps           m3, m11
    psrld           m4, 24 ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10- 2]
.hv1_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -36
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    palignr         m6, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m6, m3
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    palignr         m3, m5, m4, 6
    paddw           m2, m3 ; h sum3
    punpcklwd       m1, m3, m7
    pmaddwd         m1, m1
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    paddd           m0, m1 ; h sumsq3
    shufpd          m1, m4, m5, 0x05
    punpckhwd       m5, m4, m1
    paddw           m8, m4, m1
    pmaddwd         m5, m5
    punpcklwd       m4, m1
    pmaddwd         m4, m4
    paddd           m6, m3
    paddw           m1, m2, [t2+r10+400* 6]
    mova [t2+r10+400* 6], m2
    paddw           m8, m2 ; h sum5
    paddd           m2, m0, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 8], m0
    mova [t2+r10+400*10], m6
    paddd           m4, m0 ; h sumsq5
    paddd           m5, m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m6, m3, 3
    paddd           m2, m0 ; ((a3 + 8) >> 4) * 9
    paddd           m3, m6
    psrlw           m6, m1, 1
    pavgw           m6, m7 ; (b3 + 2) >> 2
    punpcklwd       m0, m6, m7
    pmaddwd         m0, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m6
    pmaxud          m2, m0
    psubd           m2, m0 ; p3
    pmaxud          m3, m6
    psubd           m3, m6
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmulld          m2, m14 ; p3 * s1
    pmulld          m3, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m2, m12
    paddw           m3, m12
    psrld           m2, 20 ; z + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m6, m2 ; 1 / (z + 1)
    rcpps           m7, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m6, m11 ; 256 / (z + 1)
    mulps           m7, m11
    psrld           m2, 24 ; z < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m6, m6
    cvtps2dq        m7, m7
    pminsw          m6, m2 ; x
    pminsw          m7, m3
    pmulld          m0, m6
    packssdw        m6, m7
    pmulld          m7, m1
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m7, m10
    psrld           m0, 12
    psrld           m7, 12
    paddw           m1, m8, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m8
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    mova [t4+r10*1+400*4+ 4], m6
    mova [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova [t3+r10*2+400*8+24], xm7
    vextracti128 [t3+r10*2+400*8+56], m7, 1
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    pxor            m7, m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4 ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7 ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7 ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4 ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13 ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6 ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20 ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2 ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11 ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24 ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2 ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova [t4+r10*1+400*0+ 4], m4
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400* 6]
    mova            m4, [t1+r10+400* 8]
    mova            m5, [t1+r10+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7 ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2 ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11 ; 256 / (z + 1)
    mulps           m3, m11
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    mova [t3+r10*2+400*8+ 8], m3
    mova [t3+r10*2+400*0+ 8], m4
    mova [t3+r10*2+400*0+40], m5
    paddw           m3, m3 ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+r10+400*0], m3
    mova [t1+r10+400*2], m4
    mova [t1+r10+400*4], m5
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m4, [t1+r10+400* 6]
    mova            m5, [t1+r10+400* 8]
    mova            m6, [t1+r10+400*10]
    paddw           m1, m4, [t2+r10+400* 6]
    paddd           m2, m5, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 6], m4
    mova [t2+r10+400* 8], m5
    mova [t2+r10+400*10], m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7 ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2 ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11 ; 256 / (z + 1)
    mulps           m3, m11
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m8, m1, 12
    mova [t4+r10*1+400*4+ 4], m2
    mova            m4, [t3+r10*2+400*8+ 8]
    mova            m5, [t3+r10*2+400*0+ 8]
    mova            m6, [t3+r10*2+400*0+40]
    paddw           m1, m4, [t2+r10+400*0]
    paddd           m2, m5, [t2+r10+400*2]
    paddd           m3, m6, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m4
    mova [t2+r10+400*2], m5
    mova [t2+r10+400*4], m6
    mova [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova [t3+r10*2+400*8+24], xm8
    vextracti128 [t3+r10*2+400*8+56], m8, 1
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4 ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7 ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7 ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4 ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13 ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6 ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20 ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2 ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11 ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24 ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2 ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova [t4+r10*1+400*0+ 4], m4
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea r10, [wq-4]
.v1_loop:
    mova m4, [t1+r10+400* 6]
    mova m5, [t1+r10+400* 8]
    mova m6, [t1+r10+400*10]
    paddw m1, m4, [t2+r10+400* 6]
    paddd m2, m5, [t2+r10+400* 8]
    paddd m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 6], m4
    mova [t2+r10+400* 8], m5
    mova [t2+r10+400*10], m6
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m7 ; (b3 + 2) >> 2
    punpcklwd m2, m3, m7
    pmaddwd m2, m2
    punpckhwd m3, m7
    pmaddwd m3, m3
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pmaxud m4, m2
    psubd m4, m2 ; p3
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m14 ; p3 * s1
    pmulld m5, m14
    pmaddwd m0, m12 ; b3 * 455
    pmaddwd m1, m12
    paddw m4, m12
    paddw m5, m12
    psrld m4, 20 ; z + 1
    psrld m5, 20
    cvtdq2ps m4, m4
    cvtdq2ps m5, m5
    rcpps m2, m4 ; 1 / (z + 1)
    rcpps m3, m5
    pcmpgtd m4, m11, m4
    pcmpgtd m5, m11, m5
    mulps m2, m11 ; 256 / (z + 1)
    mulps m3, m11
    psrld m4, 24 ; z < 255 ? 255 : 0
    psrld m5, 24
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    pminsw m2, m4 ; x
    pminsw m3, m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m8, m1, 12
    mova [t4+r10*1+400*4+4], m2
    mova m4, [t3+r10*2+400*8+ 8]
    mova m5, [t3+r10*2+400*0+ 8]
    mova m6, [t3+r10*2+400*0+40]
    paddw m1, m4, [t2+r10+400*0]
    paddd m2, m5, [t2+r10+400*2]
    paddd m3, m6, [t2+r10+400*4]
    paddw m1, [t1+r10+400*0]
    paddd m2, [t1+r10+400*2]
    paddd m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m4
    mova [t2+r10+400*2], m5
    mova [t2+r10+400*4], m6
    mova [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova [t3+r10*2+400*8+24], xm8
    vextracti128 [t3+r10*2+400*8+56], m8, 1
    vpbroadcastd m4, [base+pd_25]
    vpbroadcastd m6, [base+pw_164_24]
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a5 + 8) >> 4
    psrld m3, 4
    pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
    pmulld m3, m4
    psrlw m5, m1, 1
    pavgw m5, m7 ; (b5 + 2) >> 2
    punpcklwd m4, m5, m7
    pmaddwd m4, m4
    punpckhwd m5, m7
    pmaddwd m5, m5
    punpcklwd m0, m1, m7 ; b5
    punpckhwd m1, m7
    pmaxud m2, m4
    psubd m2, m4 ; p5
    pmaxud m3, m5
    psubd m3, m5
    pmulld m2, m13 ; p5 * s0
    pmulld m3, m13
    pmaddwd m0, m6 ; b5 * 164
    pmaddwd m1, m6
    paddw m2, m6
    paddw m3, m6
    psrld m2, 20 ; z5 + 1
    psrld m3, 20
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    rcpps m4, m2 ; 1 / (z5 + 1)
    rcpps m5, m3
    pcmpgtd m2, m11, m2
    pcmpgtd m3, m11, m3
    mulps m4, m11 ; 256 / (z5 + 1)
    mulps m5, m11
    psrld m2, 24 ; z5 < 255 ? 255 : 0
    psrld m3, 24
    cvtps2dq m4, m4
    cvtps2dq m5, m5
    pminsw m4, m2 ; x5
    pminsw m5, m3
    pmulld m0, m4
    pmulld m1, m5
    packssdw m4, m5
    paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*0+ 4], m4
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add r10, 32
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
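; .prep_n seeds the sliding neighbour sums consumed by .n0/.n1: the
; 5x5 a/b pairs get the (5,6,5) horizontal weighting (centre plus 5 *
; the three-tap sum), while the 3x3 pairs keep rolling (3,4,3) and
; (4,4,4) row sums for the previous and current row, matching the
; alternating 343/444 vertical pattern of the self-guided filter.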
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu xm0, [t4+r10*1+400*0+2]
    paddw xm2, xm0, [t4+r10*1+400*0+0]
    paddw xm2, [t4+r10*1+400*0+4]
    movu m1, [t3+r10*2+400*0+4]
    paddd m3, m1, [t3+r10*2+400*0+0]
    paddd m3, [t3+r10*2+400*0+8]
    paddw xm0, xm2
    paddd m1, m3
    psllw xm2, 2
    pslld m3, 2
    paddw xm0, xm2 ; a5 565
    paddd m1, m3 ; b5 565
    mova [t4+r10*1+400* 6], xm0
    mova [t3+r10*2+400*12], m1
    mova xm0, [t4+r10*1+400*2+0]
    paddw xm0, [t4+r10*1+400*2+4]
    paddw xm2, xm0, [t4+r10*1+400*2+2]
    mova m1, [t3+r10*2+400*4+0]
    paddd m1, [t3+r10*2+400*4+8]
    paddd m3, m1, [t3+r10*2+400*4+4]
    psllw xm2, 2 ; a3[-1] 444
    pslld m3, 2 ; b3[-1] 444
    psubw xm2, xm0 ; a3[-1] 343
    psubd m3, m1 ; b3[-1] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    mova xm0, [t4+r10*1+400*4+0]
    paddw xm0, [t4+r10*1+400*4+4]
    paddw xm2, xm0, [t4+r10*1+400*4+2]
    mova m1, [t3+r10*2+400*8+0]
    paddd m1, [t3+r10*2+400*8+8]
    paddd m3, m1, [t3+r10*2+400*8+4]
    psllw xm2, 2 ; a3[ 0] 444
    pslld m3, 2 ; b3[ 0] 444
    mova [t4+r10*1+400*10], xm2
    mova [t3+r10*2+400*20], m3
    psubw xm2, xm0 ; a3[ 0] 343
    psubd m3, m1 ; b3[ 0] 343
    mova [t4+r10*1+400*12], xm2
    mova [t3+r10*2+400*24], m3
    add r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
    vpbroadcastd m6, [base+pd_4096]
.n0_loop:
    movu xm2, [t4+r10*1+2]
    paddw xm0, xm2, [t4+r10*1+0]
    paddw xm0, [t4+r10*1+4]
    paddw xm2, xm0
    psllw xm0, 2
    paddw xm0, xm2 ; a5
    movu m1, [t3+r10*2+4]
    paddd m4, m1, [t3+r10*2+0]
    paddd m4, [t3+r10*2+8]
    paddd m1, m4
    pslld m4, 2
    paddd m4, m1 ; b5
    paddw xm2, xm0, [t4+r10*1+400* 6]
    mova [t4+r10*1+400* 6], xm0
    paddd m0, m4, [t3+r10*2+400*12]
    mova [t3+r10*2+400*12], m4
    mova xm3, [t4+r10*1+400*2+0]
    paddw xm3, [t4+r10*1+400*2+4]
    paddw xm5, xm3, [t4+r10*1+400*2+2]
    psllw xm5, 2 ; a3[ 1] 444
    psubw xm4, xm5, xm3 ; a3[ 1] 343
    paddw xm3, xm4, [t4+r10*1+400* 8]
    paddw xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400* 8], xm4
    mova [t4+r10*1+400*10], xm5
    mova m1, [t3+r10*2+400*4+0]
    paddd m1, [t3+r10*2+400*4+8]
    paddd m5, m1, [t3+r10*2+400*4+4]
    pslld m5, 2 ; b3[ 1] 444
    psubd m4, m5, m1 ; b3[ 1] 343
    paddd m1, m4, [t3+r10*2+400*16]
    paddd m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*16], m4
    mova [t3+r10*2+400*20], m5
    pmovzxwd m4, [dstq+r10]
    pmovzxwd m2, xm2 ; a5
    pmovzxwd m3, xm3 ; a3
    pmaddwd m2, m4 ; a5 * src
    pmaddwd m3, m4 ; a3 * src
    pslld m4, 13
    psubd m0, m2 ; b5 - a5 * src + (1 << 8)
    psubd m1, m3 ; b3 - a3 * src + (1 << 8)
    psrld m0, 9
    pslld m1, 7
    pblendw m0, m1, 0xaa
    pmaddwd m0, m15
    paddd m4, m6
    paddd m0, m4
    psrad m0, 7
    vextracti128 xm1, m0, 1
    packusdw xm0, xm1 ; clip
    psrlw xm0, 6
    mova [dstq+r10], xm0
    add r10, 16
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
    vpbroadcastd m6, [base+pd_4096]
.n1_loop:
    mova xm3, [t4+r10*1+400*4+0]
    paddw xm3, [t4+r10*1+400*4+4]
    paddw xm5, xm3, [t4+r10*1+400*4+2]
    psllw xm5, 2 ; a3[ 1] 444
    psubw xm4, xm5, xm3 ; a3[ 1] 343
    paddw xm3, xm4, [t4+r10*1+400*12]
    paddw xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400*10], xm5
    mova [t4+r10*1+400*12], xm4
    mova m1, [t3+r10*2+400*8+0]
    paddd m1, [t3+r10*2+400*8+8]
    paddd m5, m1, [t3+r10*2+400*8+4]
    pslld m5, 2 ; b3[ 1] 444
    psubd m4, m5, m1 ; b3[ 1] 343
    paddd m1, m4, [t3+r10*2+400*24]
    paddd m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*20], m5
    mova [t3+r10*2+400*24], m4
    pmovzxwd m4, [dstq+r10]
    pmovzxwd m2, [t4+r10*1+400* 6]
    pmovzxwd m3, xm3
    mova m0, [t3+r10*2+400*12]
    pmaddwd m2, m4 ; a5 * src
    pmaddwd m3, m4 ; a3 * src
    pslld m4, 13
    psubd m0, m2 ; b5 - a5 * src + (1 << 8)
    psubd m1, m3 ; b3 - a3 * src + (1 << 8)
    psrld m0, 8
    pslld m1, 7
    pblendw m0, m1, 0xaa
    pmaddwd m0, m15
    paddd m4, m6
    paddd m0, m4
    psrad m0, 7
    vextracti128 xm1, m0, 1
    packusdw xm0, xm1 ; clip
    psrlw xm0, 6
    mova [dstq+r10], xm0
    add r10, 16
    jl .n1_loop
    add dstq, strideq
    ret

%endif ; ARCH_X86_64