1; Copyright © 2021, VideoLAN and dav1d authors 2; Copyright © 2021, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 16 32 33wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 34wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 35wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 36wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 37wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 38r_ext_mask: times 72 db -1 39 times 8 db 0 40wiener_hshift: dw 4, 4, 1, 1 41wiener_vshift: dw 1024, 1024, 4096, 4096 42wiener_round: dd 1049600, 1048832 43 44pw_164_455: dw 164, 455 45pw_1023: times 2 dw 1023 46pw_61448: times 2 dw 61448 47pd_m262128: dd -262128 48pd_m34816: dd -34816 49pd_m25: dd -25 50pd_m9: dd -9 51pd_8: dd 8 52pd_2147483648: dd 2147483648 53 54cextern sgr_x_by_x 55 56SECTION .text 57 58DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers 59 60INIT_ZMM avx512icl 61cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \ 62 w, h, edge, flt 63%define base t4-wiener_hshift 64 mov fltq, r6mp 65 movifnidn wd, wm 66 movifnidn hd, hm 67 mov edged, r7m 68 mov t3d, r8m ; pixel_max 69 vbroadcasti128 m6, [wiener_shufA] 70 vpbroadcastd m12, [fltq+ 0] ; x0 x1 71 lea t4, [wiener_hshift] 72 vbroadcasti128 m7, [wiener_shufB] 73 add wd, wd 74 vpbroadcastd m13, [fltq+ 4] ; x2 x3 75 shr t3d, 11 76 vpbroadcastd m14, [fltq+16] ; y0 y1 77 add lpfq, wq 78 vpbroadcastd m15, [fltq+20] ; y2 y3 79 add dstq, wq 80 vbroadcasti128 m8, [wiener_shufC] 81 lea t1, [rsp+wq+16] 82 vbroadcasti128 m9, [wiener_shufD] 83 neg wq 84 vpbroadcastd m0, [base+wiener_hshift+t3*4] 85 mov r10d, 0xfe 86 vpbroadcastd m10, [base+wiener_round+t3*4] 87 kmovb k1, r10d 88 vpbroadcastd m11, [base+wiener_vshift+t3*4] 89 pmullw m12, m0 ; upshift filter coefs to make the 90 vpbroadcastd m16, [pd_m262128] 91 pmullw m13, m0 ; horizontal downshift constant 92 test edgeb, 4 ; LR_HAVE_TOP 93 jz .no_top 94 call .h_top 95 add lpfq, strideq 96 mov t6, t1 97 mov t5, t1 98 add t1, 384*2 99 call .h_top 100 lea r10, [lpfq+strideq*4] 101 mov lpfq, dstq 102 
mov t4, t1 103 add t1, 384*2 104 add r10, strideq 105 mov [rsp], r10 ; below 106 call .h 107 mov t3, t1 108 mov t2, t1 109 dec hd 110 jz .v1 111 add lpfq, strideq 112 add t1, 384*2 113 call .h 114 mov t2, t1 115 dec hd 116 jz .v2 117 add lpfq, strideq 118 add t1, 384*2 119 call .h 120 dec hd 121 jz .v3 122.main: 123 lea t0, [t1+384*2] 124.main_loop: 125 call .hv 126 dec hd 127 jnz .main_loop 128 test edgeb, 8 ; LR_HAVE_BOTTOM 129 jz .v3 130 mov lpfq, [rsp] 131 call .hv_bottom 132 add lpfq, strideq 133 call .hv_bottom 134.v1: 135 call .v 136 RET 137.no_top: 138 lea r10, [lpfq+strideq*4] 139 mov lpfq, dstq 140 lea r10, [r10+strideq*2] 141 mov [rsp], r10 142 call .h 143 mov t6, t1 144 mov t5, t1 145 mov t4, t1 146 mov t3, t1 147 mov t2, t1 148 dec hd 149 jz .v1 150 add lpfq, strideq 151 add t1, 384*2 152 call .h 153 mov t2, t1 154 dec hd 155 jz .v2 156 add lpfq, strideq 157 add t1, 384*2 158 call .h 159 dec hd 160 jz .v3 161 lea t0, [t1+384*2] 162 call .hv 163 dec hd 164 jz .v3 165 add t0, 384*8 166 call .hv 167 dec hd 168 jnz .main 169.v3: 170 call .v 171.v2: 172 call .v 173 jmp .v1 174.h: 175 mov r10, wq 176 test edgeb, 1 ; LR_HAVE_LEFT 177 jz .h_extend_left 178 movq xm3, [leftq] 179 vmovdqu64 m3{k1}, [lpfq+r10-8] 180 add leftq, 8 181 jmp .h_main 182.h_extend_left: 183 mova m4, [lpfq+r10+0] 184 vpbroadcastw xm3, xm4 185 vmovdqu64 m3{k1}, [lpfq+r10-8] 186 jmp .h_main2 187.h_top: 188 mov r10, wq 189 test edgeb, 1 ; LR_HAVE_LEFT 190 jz .h_extend_left 191.h_loop: 192 movu m3, [lpfq+r10-8] 193.h_main: 194 mova m4, [lpfq+r10+0] 195.h_main2: 196 movu m5, [lpfq+r10+8] 197 test edgeb, 2 ; LR_HAVE_RIGHT 198 jnz .h_have_right 199 cmp r10d, -68 200 jl .h_have_right 201 push r0 202 lea r0, [r_ext_mask+66] 203 vpbroadcastw m0, [lpfq-2] 204 vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? 
a : b 205 vpternlogd m4, m0, [r0+r10+ 8], 0xe4 206 vpternlogd m5, m0, [r0+r10+16], 0xe4 207 pop r0 208.h_have_right: 209 pshufb m2, m3, m6 210 pshufb m1, m4, m7 211 paddw m2, m1 212 pshufb m3, m8 213 mova m0, m16 214 vpdpwssd m0, m2, m12 215 pshufb m1, m4, m9 216 paddw m3, m1 217 pshufb m1, m4, m6 218 vpdpwssd m0, m3, m13 219 pshufb m2, m5, m7 220 paddw m2, m1 221 mova m1, m16 222 pshufb m4, m8 223 vpdpwssd m1, m2, m12 224 pshufb m5, m9 225 paddw m4, m5 226 vpdpwssd m1, m4, m13 227 psrad m0, 4 228 psrad m1, 4 229 packssdw m0, m1 230 psraw m0, 1 231 mova [t1+r10], m0 232 add r10, 64 233 jl .h_loop 234 ret 235ALIGN function_align 236.hv: 237 add lpfq, strideq 238 mov r10, wq 239 test edgeb, 1 ; LR_HAVE_LEFT 240 jz .hv_extend_left 241 movq xm3, [leftq] 242 vmovdqu64 m3{k1}, [lpfq+r10-8] 243 add leftq, 8 244 jmp .hv_main 245.hv_extend_left: 246 mova m4, [lpfq+r10+0] 247 vpbroadcastw xm3, xm4 248 vmovdqu64 m3{k1}, [lpfq+r10-8] 249 jmp .hv_main2 250.hv_bottom: 251 mov r10, wq 252 test edgeb, 1 ; LR_HAVE_LEFT 253 jz .hv_extend_left 254.hv_loop: 255 movu m3, [lpfq+r10-8] 256.hv_main: 257 mova m4, [lpfq+r10+0] 258.hv_main2: 259 movu m5, [lpfq+r10+8] 260 test edgeb, 2 ; LR_HAVE_RIGHT 261 jnz .hv_have_right 262 cmp r10d, -68 263 jl .hv_have_right 264 push r0 265 lea r0, [r_ext_mask+66] 266 vpbroadcastw m0, [lpfq-2] 267 vpternlogd m3, m0, [r0+r10+ 0], 0xe4 268 vpternlogd m4, m0, [r0+r10+ 8], 0xe4 269 vpternlogd m5, m0, [r0+r10+16], 0xe4 270 pop r0 271.hv_have_right: 272 pshufb m2, m3, m6 273 pshufb m1, m4, m7 274 paddw m2, m1 275 pshufb m3, m8 276 mova m0, m16 277 vpdpwssd m0, m2, m12 278 pshufb m1, m4, m9 279 paddw m3, m1 280 pshufb m1, m4, m6 281 vpdpwssd m0, m3, m13 282 pshufb m2, m5, m7 283 paddw m2, m1 284 pshufb m4, m8 285 mova m1, m16 286 vpdpwssd m1, m2, m12 287 pshufb m5, m9 288 paddw m4, m5 289 vpdpwssd m1, m4, m13 290 mova m2, [t4+r10] 291 paddw m2, [t2+r10] 292 mova m5, [t3+r10] 293 psrad m0, 4 294 psrad m1, 4 295 packssdw m0, m1 296 mova m4, [t5+r10] 297 paddw m4, [t1+r10] 298 psraw m0, 1 299 paddw m3, m0, [t6+r10] 300 mova [t0+r10], m0 301 punpcklwd m1, m2, m5 302 mova m0, m10 303 vpdpwssd m0, m1, m15 304 punpckhwd m2, m5 305 mova m1, m10 306 vpdpwssd m1, m2, m15 307 punpcklwd m2, m3, m4 308 vpdpwssd m0, m2, m14 309 punpckhwd m3, m4 310 vpdpwssd m1, m3, m14 311 psrad m0, 5 312 psrad m1, 5 313 packusdw m0, m1 314 pmulhuw m0, m11 315 mova [dstq+r10], m0 316 add r10, 64 317 jl .hv_loop 318 mov t6, t5 319 mov t5, t4 320 mov t4, t3 321 mov t3, t2 322 mov t2, t1 323 mov t1, t0 324 mov t0, t6 325 add dstq, strideq 326 ret 327.v: 328 mov r10, wq 329.v_loop: 330 mova m2, [t4+r10] 331 paddw m2, [t2+r10] 332 mova m3, [t3+r10] 333 punpcklwd m1, m2, m3 334 mova m0, m10 335 vpdpwssd m0, m1, m15 336 punpckhwd m2, m3 337 mova m1, m10 338 vpdpwssd m1, m2, m15 339 mova m4, [t1+r10] 340 paddw m3, m4, [t6+r10] 341 paddw m4, [t5+r10] 342 punpcklwd m2, m3, m4 343 vpdpwssd m0, m2, m14 344 punpckhwd m3, m4 345 vpdpwssd m1, m3, m14 346 psrad m0, 5 347 psrad m1, 5 348 packusdw m0, m1 349 pmulhuw m0, m11 350 mova [dstq+r10], m0 351 add r10, 64 352 jl .v_loop 353 mov t6, t5 354 mov t5, t4 355 mov t4, t3 356 mov t3, t2 357 mov t2, t1 358 add dstq, strideq 359 ret 360 361cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \ 362 w, h, edge, flt 363%define base r13-r_ext_mask-70 364 mov fltq, r6mp 365 movifnidn wd, wm 366 movifnidn hd, hm 367 mov edged, r7m 368 mov t3d, r8m ; pixel_max 369 vbroadcasti128 m5, [wiener_shufE] 370 vpbroadcastw m11, [fltq+ 2] ; x1 371 vbroadcasti128 m6, [wiener_shufB] 
372 lea r13, [r_ext_mask+70] 373 vbroadcasti128 m7, [wiener_shufD] 374 add wd, wd 375 vpbroadcastd m12, [fltq+ 4] ; x2 x3 376 shr t3d, 11 377 vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) 378 add lpfq, wq 379 vpbroadcastw m13, [fltq+18] ; y1 380 add dstq, wq 381 vpbroadcastd m14, [fltq+20] ; y2 y3 382 lea t1, [rsp+wq+16] 383 vpbroadcastd m0, [base+wiener_hshift+t3*4] 384 neg wq 385 vpbroadcastd m9, [base+wiener_round+t3*4] 386 mov r10d, 0xfffe 387 vpbroadcastd m10, [base+wiener_vshift+t3*4] 388 kmovw k1, r10d 389 pmullw m11, m0 390 pmullw m12, m0 391 test edgeb, 4 ; LR_HAVE_TOP 392 jz .no_top 393 call .h_top 394 add lpfq, strideq 395 mov t4, t1 396 add t1, 384*2 397 call .h_top 398 lea r10, [lpfq+strideq*4] 399 mov lpfq, dstq 400 mov t3, t1 401 add t1, 384*2 402 add r10, strideq 403 mov [rsp], r10 ; below 404 call .h 405 mov t2, t1 406 dec hd 407 jz .v1 408 add lpfq, strideq 409 add t1, 384*2 410 call .h 411 dec hd 412 jz .v2 413.main: 414 mov t0, t4 415.main_loop: 416 call .hv 417 dec hd 418 jnz .main_loop 419 test edgeb, 8 ; LR_HAVE_BOTTOM 420 jz .v2 421 mov lpfq, [rsp] 422 call .hv_bottom 423 add lpfq, strideq 424 call .hv_bottom 425.end: 426 RET 427.no_top: 428 lea r10, [lpfq+strideq*4] 429 mov lpfq, dstq 430 lea r10, [r10+strideq*2] 431 mov [rsp], r10 432 call .h 433 mov t4, t1 434 mov t3, t1 435 mov t2, t1 436 dec hd 437 jz .v1 438 add lpfq, strideq 439 add t1, 384*2 440 call .h 441 dec hd 442 jz .v2 443 lea t0, [t1+384*2] 444 call .hv 445 dec hd 446 jz .v2 447 add t0, 384*6 448 call .hv 449 dec hd 450 jnz .main 451.v2: 452 call .v 453 mov t4, t3 454 mov t3, t2 455 mov t2, t1 456 add dstq, strideq 457.v1: 458 call .v 459 jmp .end 460.h: 461 mov r10, wq 462 test edgeb, 1 ; LR_HAVE_LEFT 463 jz .h_extend_left 464 movd xm3, [leftq+4] 465 vmovdqu32 m3{k1}, [lpfq+r10-4] 466 add leftq, 8 467 jmp .h_main 468.h_extend_left: 469 vpbroadcastw xm3, [lpfq+r10] 470 vmovdqu32 m3{k1}, [lpfq+r10-4] 471 jmp .h_main 472.h_top: 473 mov r10, wq 474 test edgeb, 1 ; LR_HAVE_LEFT 475 jz .h_extend_left 476.h_loop: 477 movu m3, [lpfq+r10-4] 478.h_main: 479 movu m4, [lpfq+r10+4] 480 test edgeb, 2 ; LR_HAVE_RIGHT 481 jnz .h_have_right 482 cmp r10d, -66 483 jl .h_have_right 484 vpbroadcastw m0, [lpfq-2] 485 vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? 
a : b 486 vpternlogd m4, m0, [r13+r10+8], 0xe4 487.h_have_right: 488 pshufb m1, m3, m5 489 mova m0, m8 490 vpdpwssd m0, m1, m11 491 pshufb m2, m4, m5 492 mova m1, m8 493 vpdpwssd m1, m2, m11 494 pshufb m2, m3, m6 495 pshufb m3, m7 496 paddw m2, m3 497 pshufb m3, m4, m6 498 vpdpwssd m0, m2, m12 499 pshufb m4, m7 500 paddw m3, m4 501 vpdpwssd m1, m3, m12 502 psrad m0, 4 503 psrad m1, 4 504 packssdw m0, m1 505 psraw m0, 1 506 mova [t1+r10], m0 507 add r10, 64 508 jl .h_loop 509 ret 510ALIGN function_align 511.hv: 512 add lpfq, strideq 513 mov r10, wq 514 test edgeb, 1 ; LR_HAVE_LEFT 515 jz .hv_extend_left 516 movd xm3, [leftq+4] 517 vmovdqu32 m3{k1}, [lpfq+r10-4] 518 add leftq, 8 519 jmp .hv_main 520.hv_extend_left: 521 vpbroadcastw xm3, [lpfq+r10] 522 vmovdqu32 m3{k1}, [lpfq+r10-4] 523 jmp .hv_main 524.hv_bottom: 525 mov r10, wq 526 test edgeb, 1 ; LR_HAVE_LEFT 527 jz .hv_extend_left 528.hv_loop: 529 movu m3, [lpfq+r10-4] 530.hv_main: 531 movu m4, [lpfq+r10+4] 532 test edgeb, 2 ; LR_HAVE_RIGHT 533 jnz .hv_have_right 534 cmp r10d, -66 535 jl .hv_have_right 536 vpbroadcastw m0, [lpfq-2] 537 vpternlogd m3, m0, [r13+r10+0], 0xe4 538 vpternlogd m4, m0, [r13+r10+8], 0xe4 539.hv_have_right: 540 pshufb m1, m3, m5 541 mova m0, m8 542 vpdpwssd m0, m1, m11 543 pshufb m2, m4, m5 544 mova m1, m8 545 vpdpwssd m1, m2, m11 546 pshufb m2, m3, m6 547 pshufb m3, m7 548 paddw m2, m3 549 pshufb m3, m4, m6 550 vpdpwssd m0, m2, m12 551 pshufb m4, m7 552 paddw m4, m3 553 vpdpwssd m1, m4, m12 554 mova m2, [t3+r10] 555 paddw m2, [t1+r10] 556 mova m3, [t2+r10] 557 punpcklwd m4, m2, m3 558 punpckhwd m2, m3 559 mova m3, m9 560 vpdpwssd m3, m2, m14 561 mova m2, m9 562 vpdpwssd m2, m4, m14 563 mova m4, [t4+r10] 564 psrad m0, 4 565 psrad m1, 4 566 packssdw m0, m1 567 psraw m0, 1 568 mova [t0+r10], m0 569 punpcklwd m1, m0, m4 570 vpdpwssd m2, m1, m13 571 punpckhwd m0, m4 572 vpdpwssd m3, m0, m13 573 psrad m2, 5 574 psrad m3, 5 575 packusdw m2, m3 576 pmulhuw m2, m10 577 mova [dstq+r10], m2 578 add r10, 64 579 jl .hv_loop 580 mov t4, t3 581 mov t3, t2 582 mov t2, t1 583 mov t1, t0 584 mov t0, t4 585 add dstq, strideq 586 ret 587.v: 588 mov r10, wq 589.v_loop: 590 mova m0, [t1+r10] 591 paddw m2, m0, [t3+r10] 592 mova m1, [t2+r10] 593 mova m4, [t4+r10] 594 punpckhwd m3, m2, m1 595 pmaddwd m3, m14 596 punpcklwd m2, m1 597 pmaddwd m2, m14 598 punpckhwd m1, m0, m4 599 pmaddwd m1, m13 600 punpcklwd m0, m4 601 pmaddwd m0, m13 602 paddd m3, m9 603 paddd m2, m9 604 paddd m1, m3 605 paddd m0, m2 606 psrad m1, 5 607 psrad m0, 5 608 packusdw m0, m1 609 pmulhuw m0, m10 610 mova [dstq+r10], m0 611 add r10, 64 612 jl .v_loop 613 ret 614 615cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \ 616 w, h, edge, params 617%define base r13-r_ext_mask-72 618 movifnidn wd, wm 619 mov paramsq, r6mp 620 lea r13, [r_ext_mask+72] 621 mov edged, r7m 622 movifnidn hd, hm 623 pxor m6, m6 624 vpbroadcastw m7, [paramsq+8] ; w0 625 add wd, wd 626 vpbroadcastd m8, [base+pd_8] 627 add lpfq, wq 628 vpbroadcastd m9, [base+pd_m25] 629 add dstq, wq 630 vpsubd m10, m6, [paramsq+0] {1to16} ; -s0 631 lea t3, [rsp+wq*2+416*12+8] 632 vpbroadcastd m11, [base+pw_164_455] 633 lea t4, [rsp+wq+416*20+8] 634 vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3) 635 lea t1, [rsp+wq+12] 636 vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15)) 637 neg wq 638 vpbroadcastd m14, [base+pw_1023] 639 psllw m7, 4 640 mova m18, [sgr_x_by_x+64*0] 641 mov r10d, 0xfffffff8 642 mova m19, [sgr_x_by_x+64*1] 643 kmovd k1, r10d 644 mova m20, 
[sgr_x_by_x+64*2] 645 mov r10, 0x3333333333333333 646 mova m21, [sgr_x_by_x+64*3] 647 kmovq k2, r10 648 test edgeb, 4 ; LR_HAVE_TOP 649 jz .no_top 650 call .h_top 651 add lpfq, strideq 652 mov t2, t1 653 call .top_fixup 654 add t1, 416*6 655 call .h_top 656 lea r10, [lpfq+strideq*4] 657 mov lpfq, dstq 658 add r10, strideq 659 mov [rsp], r10 ; below 660 mov t0, t2 661 dec hd 662 jz .height1 663 or edged, 16 664 call .h 665.main: 666 add lpfq, strideq 667 call .hv 668 call .prep_n 669 sub hd, 2 670 jl .extend_bottom 671.main_loop: 672 add lpfq, strideq 673 test hd, hd 674 jz .odd_height 675 call .h 676 add lpfq, strideq 677 call .hv 678 call .n0 679 call .n1 680 sub hd, 2 681 jge .main_loop 682 test edgeb, 8 ; LR_HAVE_BOTTOM 683 jz .extend_bottom 684 mov lpfq, [rsp] 685 call .h_top 686 add lpfq, strideq 687 call .hv_bottom 688.end: 689 call .n0 690 call .n1 691.end2: 692 RET 693.height1: 694 call .hv 695 call .prep_n 696 jmp .odd_height_end 697.odd_height: 698 call .hv 699 call .n0 700 call .n1 701.odd_height_end: 702 call .v 703 call .n0 704 jmp .end2 705.extend_bottom: 706 call .v 707 jmp .end 708.no_top: 709 lea r10, [lpfq+strideq*4] 710 mov lpfq, dstq 711 lea r10, [r10+strideq*2] 712 mov [rsp], r10 713 call .h 714 lea t2, [t1+416*6] 715 call .top_fixup 716 dec hd 717 jz .no_top_height1 718 or edged, 16 719 mov t0, t1 720 mov t1, t2 721 jmp .main 722.no_top_height1: 723 call .v 724 call .prep_n 725 jmp .odd_height_end 726.h: ; horizontal boxsum 727 lea r10, [wq-4] 728 test edgeb, 1 ; LR_HAVE_LEFT 729 jz .h_extend_left 730 movq xm16, [leftq+2] 731 vmovdqu16 m16{k1}, [lpfq+wq-6] 732 add leftq, 8 733 jmp .h_main 734.h_extend_left: 735 vpbroadcastw xm16, [lpfq+wq] 736 vmovdqu16 m16{k1}, [lpfq+wq-6] 737 jmp .h_main 738.h_top: 739 lea r10, [wq-4] 740 test edgeb, 1 ; LR_HAVE_LEFT 741 jz .h_extend_left 742.h_loop: 743 movu m16, [lpfq+r10- 2] 744.h_main: 745 movu m17, [lpfq+r10+14] 746 test edgeb, 2 ; LR_HAVE_RIGHT 747 jnz .h_have_right 748 cmp r10d, -68 749 jl .h_have_right 750 vpbroadcastw m0, [lpfq-2] 751 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? 
a : b 752 vpternlogd m17, m0, [r13+r10+16], 0xe4 753.h_have_right: 754 palignr m2, m17, m16, 2 755 paddw m0, m16, m2 756 palignr m3, m17, m16, 6 757 paddw m0, m3 758 punpcklwd m1, m2, m3 759 pmaddwd m1, m1 760 punpckhwd m2, m3 761 pmaddwd m2, m2 762 shufpd m17, m16, m17, 0x55 763 paddw m0, m17 764 punpcklwd m3, m16, m17 765 vpdpwssd m1, m3, m3 766 punpckhwd m3, m16, m17 767 vpdpwssd m2, m3, m3 768 shufps m16, m17, q2121 769 paddw m0, m16 ; sum 770 test edgeb, 16 ; y > 0 771 jz .h_loop_end 772 paddw m0, [t1+r10+416*0] 773 paddd m1, [t1+r10+416*2] 774 paddd m2, [t1+r10+416*4] 775.h_loop_end: 776 punpcklwd m17, m16, m6 777 vpdpwssd m1, m17, m17 ; sumsq 778 punpckhwd m16, m6 779 vpdpwssd m2, m16, m16 780 mova [t1+r10+416*0], m0 781 mova [t1+r10+416*2], m1 782 mova [t1+r10+416*4], m2 783 add r10, 64 784 jl .h_loop 785 ret 786.top_fixup: 787 lea r10, [wq-4] 788.top_fixup_loop: ; the sums of the first row needs to be doubled 789 mova m0, [t1+r10+416*0] 790 mova m1, [t1+r10+416*2] 791 mova m2, [t1+r10+416*4] 792 paddw m0, m0 793 paddd m1, m1 794 paddd m2, m2 795 mova [t2+r10+416*0], m0 796 mova [t2+r10+416*2], m1 797 mova [t2+r10+416*4], m2 798 add r10, 64 799 jl .top_fixup_loop 800 ret 801ALIGN function_align 802.hv: ; horizontal boxsum + vertical boxsum + ab 803 lea r10, [wq-4] 804 test edgeb, 1 ; LR_HAVE_LEFT 805 jz .hv_extend_left 806 movq xm16, [leftq+2] 807 vmovdqu16 m16{k1}, [lpfq+wq-6] 808 add leftq, 8 809 jmp .hv_main 810.hv_extend_left: 811 vpbroadcastw xm16, [lpfq+wq] 812 vmovdqu16 m16{k1}, [lpfq+wq-6] 813 jmp .hv_main 814.hv_bottom: 815 lea r10, [wq-4] 816 test edgeb, 1 ; LR_HAVE_LEFT 817 jz .hv_extend_left 818.hv_loop: 819 movu m16, [lpfq+r10- 2] 820.hv_main: 821 movu m17, [lpfq+r10+14] 822 test edgeb, 2 ; LR_HAVE_RIGHT 823 jnz .hv_have_right 824 cmp r10d, -68 825 jl .hv_have_right 826 vpbroadcastw m0, [lpfq-2] 827 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 828 vpternlogd m17, m0, [r13+r10+16], 0xe4 829.hv_have_right: 830 palignr m3, m17, m16, 2 831 paddw m0, m16, m3 832 palignr m1, m17, m16, 6 833 paddw m0, m1 834 punpcklwd m2, m3, m1 835 pmaddwd m2, m2 836 punpckhwd m3, m1 837 pmaddwd m3, m3 838 shufpd m17, m16, m17, 0x55 839 paddw m0, m17 840 punpcklwd m1, m16, m17 841 vpdpwssd m2, m1, m1 842 punpckhwd m1, m16, m17 843 vpdpwssd m3, m1, m1 844 shufps m16, m17, q2121 845 paddw m0, m16 ; h sum 846 punpcklwd m17, m16, m6 847 vpdpwssd m2, m17, m17 ; h sumsq 848 punpckhwd m16, m6 849 vpdpwssd m3, m16, m16 850 paddw m1, m0, [t1+r10+416*0] 851 paddd m16, m2, [t1+r10+416*2] 852 paddd m17, m3, [t1+r10+416*4] 853 test hd, hd 854 jz .hv_last_row 855.hv_main2: 856 paddw m1, [t2+r10+416*0] ; hv sum 857 paddd m16, [t2+r10+416*2] ; hv sumsq 858 paddd m17, [t2+r10+416*4] 859 mova [t0+r10+416*0], m0 860 mova [t0+r10+416*2], m2 861 mova [t0+r10+416*4], m3 862 psrlw m3, m1, 1 863 paddd m16, m8 864 pavgw m3, m6 ; (b + 2) >> 2 865 paddd m17, m8 866 psrld m16, 4 ; (a + 8) >> 4 867 psrld m17, 4 868 pmulld m16, m9 ; -a * 25 869 pmulld m17, m9 870 punpcklwd m2, m3, m6 871 vpdpwssd m16, m2, m2 ; -p 872 punpckhwd m3, m6 873 vpdpwssd m17, m3, m3 874 punpcklwd m0, m1, m6 ; b 875 punpckhwd m1, m6 876 pmulld m16, m10 ; p * s 877 pmulld m17, m10 878 pmaddwd m0, m11 ; b * 164 879 pmaddwd m1, m11 880 vpalignr m17{k2}, m16, m16, 2 881 mova m16, m20 882 pmaxsw m17, m6 883 paddusw m17, m12 884 psraw m17, 4 ; min(z, 255) - 256 885 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] 886 vpmovb2m k3, m17 887 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] 888 vmovdqu8 m17{k3}, m16 ; x 889 pandn m16, m13, m17 890 psrld m17, 16 891 
pmulld m0, m16 892 pmulld m1, m17 893 packssdw m16, m17 894 psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 895 psubd m1, m13 896 mova [t4+r10+4], m16 897 psrld m16, m0, 12 ; b 898 psrld m17, m1, 12 899 mova [t3+r10*2+ 8], xm16 900 mova [t3+r10*2+ 24], xm17 901 vextracti128 [t3+r10*2+ 40], ym16, 1 902 vextracti128 [t3+r10*2+ 56], ym17, 1 903 vextracti32x4 [t3+r10*2+ 72], m16, 2 904 vextracti32x4 [t3+r10*2+ 88], m17, 2 905 vextracti32x4 [t3+r10*2+104], m16, 3 906 vextracti32x4 [t3+r10*2+120], m17, 3 907 add r10, 64 908 jl .hv_loop 909 mov t2, t1 910 mov t1, t0 911 mov t0, t2 912 ret 913.hv_last_row: ; esoteric edge case for odd heights 914 mova [t1+r10+416*0], m1 915 paddw m1, m0 916 mova [t1+r10+416*2], m16 917 paddd m16, m2 918 mova [t1+r10+416*4], m17 919 paddd m17, m3 920 jmp .hv_main2 921.v: ; vertical boxsum + ab 922 lea r10, [wq-4] 923.v_loop: 924 mova m2, [t1+r10+416*2] 925 mova m3, [t1+r10+416*4] 926 mova m0, [t1+r10+416*0] 927 paddd m16, m2, [t2+r10+416*2] 928 paddd m17, m3, [t2+r10+416*4] 929 paddw m1, m0, [t2+r10+416*0] 930 paddd m2, m2 931 paddd m3, m3 932 paddd m16, m2 ; hv sumsq 933 paddd m17, m3 934 paddd m16, m8 935 paddd m17, m8 936 psrld m16, 4 ; (a + 8) >> 4 937 psrld m17, 4 938 pmulld m16, m9 ; -a * 25 939 pmulld m17, m9 940 paddw m0, m0 941 paddw m1, m0 ; hv sum 942 psrlw m3, m1, 1 943 pavgw m3, m6 ; (b + 2) >> 2 944 punpcklwd m2, m3, m6 945 vpdpwssd m16, m2, m2 ; -p 946 punpckhwd m3, m6 947 vpdpwssd m17, m3, m3 948 punpcklwd m0, m1, m6 ; b 949 punpckhwd m1, m6 950 pmulld m16, m10 ; p * s 951 pmulld m17, m10 952 pmaddwd m0, m11 ; b * 164 953 pmaddwd m1, m11 954 vpalignr m17{k2}, m16, m16, 2 955 mova m16, m20 956 pmaxsw m17, m6 957 paddusw m17, m12 958 psraw m17, 4 ; min(z, 255) - 256 959 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] 960 vpmovb2m k3, m17 961 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] 962 vmovdqu8 m17{k3}, m16 ; x 963 pandn m16, m13, m17 964 psrld m17, 16 965 pmulld m0, m16 966 pmulld m1, m17 967 packssdw m16, m17 968 psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 969 psubd m1, m13 970 mova [t4+r10+4], m16 971 psrld m16, m0, 12 ; b 972 psrld m17, m1, 12 973 mova [t3+r10*2+ 8], xm16 974 mova [t3+r10*2+ 24], xm17 975 vextracti128 [t3+r10*2+ 40], ym16, 1 976 vextracti128 [t3+r10*2+ 56], ym17, 1 977 vextracti32x4 [t3+r10*2+ 72], m16, 2 978 vextracti32x4 [t3+r10*2+ 88], m17, 2 979 vextracti32x4 [t3+r10*2+104], m16, 3 980 vextracti32x4 [t3+r10*2+120], m17, 3 981 add r10, 64 982 jl .v_loop 983 ret 984.prep_n: ; initial neighbor setup 985 mov r10, wq 986.prep_n_loop: 987 movu m0, [t4+r10*1+ 2] 988 movu m1, [t3+r10*2+ 4] 989 movu m2, [t3+r10*2+68] 990 paddw m3, m0, [t4+r10*1+ 0] 991 paddd m16, m1, [t3+r10*2+ 0] 992 paddd m17, m2, [t3+r10*2+64] 993 paddw m3, [t4+r10*1+ 4] 994 paddd m16, [t3+r10*2+ 8] 995 paddd m17, [t3+r10*2+72] 996 paddw m0, m3 997 psllw m3, 2 998 paddd m1, m16 999 pslld m16, 2 1000 paddd m2, m17 1001 pslld m17, 2 1002 paddw m0, m3 ; a 565 1003 paddd m1, m16 ; b 565 1004 paddd m2, m17 1005 mova [t4+r10*1+416*2+ 0], m0 1006 mova [t3+r10*2+416*4+ 0], m1 1007 mova [t3+r10*2+416*4+64], m2 1008 add r10, 64 1009 jl .prep_n_loop 1010 ret 1011ALIGN function_align 1012.n0: ; neighbor + output (even rows) 1013 mov r10, wq 1014.n0_loop: 1015 movu m0, [t4+r10*1+ 2] 1016 movu m1, [t3+r10*2+ 4] 1017 movu m2, [t3+r10*2+68] 1018 paddw m3, m0, [t4+r10*1+ 0] 1019 paddd m16, m1, [t3+r10*2+ 0] 1020 paddd m17, m2, [t3+r10*2+64] 1021 paddw m3, [t4+r10*1+ 4] 1022 paddd m16, [t3+r10*2+ 8] 1023 paddd m17, [t3+r10*2+72] 1024 paddw m0, m3 1025 psllw m3, 2 1026 
paddd m1, m16 1027 pslld m16, 2 1028 paddd m2, m17 1029 pslld m17, 2 1030 paddw m0, m3 ; a 565 1031 paddd m1, m16 ; b 565 1032 paddd m2, m17 1033 paddw m3, m0, [t4+r10*1+416*2+ 0] 1034 paddd m16, m1, [t3+r10*2+416*4+ 0] 1035 paddd m17, m2, [t3+r10*2+416*4+64] 1036 mova [t4+r10*1+416*2+ 0], m0 1037 mova [t3+r10*2+416*4+ 0], m1 1038 mova [t3+r10*2+416*4+64], m2 1039 mova m0, [dstq+r10] 1040 punpcklwd m1, m0, m6 ; src 1041 punpcklwd m2, m3, m6 ; a 1042 pmaddwd m2, m1 ; a * src 1043 punpckhwd m1, m0, m6 1044 punpckhwd m3, m6 1045 pmaddwd m3, m1 1046 vshufi32x4 m1, m16, m17, q2020 1047 vshufi32x4 m16, m17, q3131 1048 psubd m1, m2 ; b - a * src + (1 << 8) 1049 psubd m16, m3 1050 psrad m1, 9 1051 psrad m16, 9 1052 packssdw m1, m16 1053 pmulhrsw m1, m7 1054 paddw m0, m1 1055 pmaxsw m0, m6 1056 pminsw m0, m14 1057 mova [dstq+r10], m0 1058 add r10, 64 1059 jl .n0_loop 1060 add dstq, strideq 1061 ret 1062ALIGN function_align 1063.n1: ; neighbor + output (odd rows) 1064 mov r10, wq 1065.n1_loop: 1066 mova m0, [dstq+r10] 1067 mova m3, [t4+r10*1+416*2+ 0] 1068 mova m16, [t3+r10*2+416*4+ 0] 1069 mova m17, [t3+r10*2+416*4+64] 1070 punpcklwd m1, m0, m6 ; src 1071 punpcklwd m2, m3, m6 ; a 1072 pmaddwd m2, m1 1073 punpckhwd m1, m0, m6 1074 punpckhwd m3, m6 1075 pmaddwd m3, m1 1076 vshufi32x4 m1, m16, m17, q2020 1077 vshufi32x4 m16, m17, q3131 1078 psubd m1, m2 ; b - a * src + (1 << 7) 1079 psubd m16, m3 1080 psrad m1, 8 1081 psrad m16, 8 1082 packssdw m1, m16 1083 pmulhrsw m1, m7 1084 paddw m0, m1 1085 pmaxsw m0, m6 1086 pminsw m0, m14 1087 mova [dstq+r10], m0 1088 add r10, 64 1089 jl .n1_loop 1090 add dstq, strideq 1091 ret 1092 1093cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \ 1094 w, h, edge, params 1095 movifnidn wd, wm 1096 mov paramsq, r6mp 1097 lea r13, [r_ext_mask+72] 1098 mov edged, r7m 1099 movifnidn hd, hm 1100 pxor m6, m6 1101 vpbroadcastw m7, [paramsq+10] ; w1 1102 add wd, wd 1103 vpbroadcastd m8, [base+pd_8] 1104 add lpfq, wq 1105 vpbroadcastd m9, [base+pd_m9] 1106 add dstq, wq 1107 vpsubd m10, m6, [paramsq+4] {1to16} ; -s1 1108 lea t3, [rsp+wq*2+416*12+8] 1109 vpbroadcastd m11, [base+pw_164_455] 1110 lea t4, [rsp+wq+416*32+8] 1111 vpbroadcastd m12, [base+pw_61448] 1112 lea t1, [rsp+wq+12] 1113 vpbroadcastd m13, [base+pd_m34816] 1114 neg wq 1115 vpbroadcastd m14, [base+pw_1023] 1116 psllw m7, 4 1117 mova m18, [sgr_x_by_x+64*0] 1118 mov r10d, 0xfffffffc 1119 mova m19, [sgr_x_by_x+64*1] 1120 kmovd k1, r10d 1121 mova m20, [sgr_x_by_x+64*2] 1122 mov r10, 0x3333333333333333 1123 mova m21, [sgr_x_by_x+64*3] 1124 kmovq k2, r10 1125 test edgeb, 4 ; LR_HAVE_TOP 1126 jz .no_top 1127 call .h_top 1128 add lpfq, strideq 1129 mov t2, t1 1130 add t1, 416*6 1131 call .h_top 1132 lea r10, [lpfq+strideq*4] 1133 mov lpfq, dstq 1134 add r10, strideq 1135 mov [rsp], r10 ; below 1136 call .hv0 1137.main: 1138 dec hd 1139 jz .height1 1140 add lpfq, strideq 1141 call .hv1 1142 call .prep_n 1143 sub hd, 2 1144 jl .extend_bottom 1145.main_loop: 1146 add lpfq, strideq 1147 call .hv0 1148 test hd, hd 1149 jz .odd_height 1150 add lpfq, strideq 1151 call .hv1 1152 call .n0 1153 call .n1 1154 sub hd, 2 1155 jge .main_loop 1156 test edgeb, 8 ; LR_HAVE_BOTTOM 1157 jz .extend_bottom 1158 mov lpfq, [rsp] 1159 call .hv0_bottom 1160 add lpfq, strideq 1161 call .hv1_bottom 1162.end: 1163 call .n0 1164 call .n1 1165.end2: 1166 RET 1167.height1: 1168 call .v1 1169 call .prep_n 1170 jmp .odd_height_end 1171.odd_height: 1172 call .v1 1173 call .n0 1174 call .n1 1175.odd_height_end: 1176 call .v0 1177 call 
.v1 1178 call .n0 1179 jmp .end2 1180.extend_bottom: 1181 call .v0 1182 call .v1 1183 jmp .end 1184.no_top: 1185 lea r10, [lpfq+strideq*4] 1186 mov lpfq, dstq 1187 lea r10, [r10+strideq*2] 1188 mov [rsp], r10 1189 call .h 1190 lea r10, [wq-4] 1191 lea t2, [t1+416*6] 1192.top_fixup_loop: 1193 mova m0, [t1+r10+416*0] 1194 mova m1, [t1+r10+416*2] 1195 mova m2, [t1+r10+416*4] 1196 mova [t2+r10+416*0], m0 1197 mova [t2+r10+416*2], m1 1198 mova [t2+r10+416*4], m2 1199 add r10, 64 1200 jl .top_fixup_loop 1201 call .v0 1202 jmp .main 1203.h: ; horizontal boxsum 1204 lea r10, [wq-4] 1205 test edgeb, 1 ; LR_HAVE_LEFT 1206 jz .h_extend_left 1207 movd xm16, [leftq+4] 1208 vmovdqu16 m16{k1}, [lpfq+wq-4] 1209 add leftq, 8 1210 jmp .h_main 1211.h_extend_left: 1212 vpbroadcastw xm16, [lpfq+wq] 1213 vmovdqu16 m16{k1}, [lpfq+wq-4] 1214 jmp .h_main 1215.h_top: 1216 lea r10, [wq-4] 1217 test edgeb, 1 ; LR_HAVE_LEFT 1218 jz .h_extend_left 1219.h_loop: 1220 movu m16, [lpfq+r10+ 0] 1221.h_main: 1222 movu m17, [lpfq+r10+16] 1223 test edgeb, 2 ; LR_HAVE_RIGHT 1224 jnz .h_have_right 1225 cmp r10d, -66 1226 jl .h_have_right 1227 vpbroadcastw m0, [lpfq-2] 1228 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 1229 vpternlogd m17, m0, [r13+r10+16], 0xe4 1230.h_have_right: 1231 palignr m0, m17, m16, 2 1232 paddw m1, m16, m0 1233 punpcklwd m2, m16, m0 1234 pmaddwd m2, m2 1235 punpckhwd m3, m16, m0 1236 pmaddwd m3, m3 1237 palignr m17, m16, 4 1238 paddw m1, m17 ; sum 1239 punpcklwd m16, m17, m6 1240 vpdpwssd m2, m16, m16 ; sumsq 1241 punpckhwd m17, m6 1242 vpdpwssd m3, m17, m17 1243 mova [t1+r10+416*0], m1 1244 mova [t1+r10+416*2], m2 1245 mova [t1+r10+416*4], m3 1246 add r10, 64 1247 jl .h_loop 1248 ret 1249ALIGN function_align 1250.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 1251 lea r10, [wq-4] 1252 test edgeb, 1 ; LR_HAVE_LEFT 1253 jz .hv0_extend_left 1254 movd xm16, [leftq+4] 1255 vmovdqu16 m16{k1}, [lpfq+wq-4] 1256 add leftq, 8 1257 jmp .hv0_main 1258.hv0_extend_left: 1259 vpbroadcastw xm16, [lpfq+wq] 1260 vmovdqu16 m16{k1}, [lpfq+wq-4] 1261 jmp .hv0_main 1262.hv0_bottom: 1263 lea r10, [wq-4] 1264 test edgeb, 1 ; LR_HAVE_LEFT 1265 jz .hv0_extend_left 1266.hv0_loop: 1267 movu m16, [lpfq+r10+ 0] 1268.hv0_main: 1269 movu m17, [lpfq+r10+16] 1270 test edgeb, 2 ; LR_HAVE_RIGHT 1271 jnz .hv0_have_right 1272 cmp r10d, -66 1273 jl .hv0_have_right 1274 vpbroadcastw m0, [lpfq-2] 1275 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 1276 vpternlogd m17, m0, [r13+r10+16], 0xe4 1277.hv0_have_right: 1278 palignr m0, m17, m16, 2 1279 paddw m1, m16, m0 1280 punpcklwd m2, m16, m0 1281 pmaddwd m2, m2 1282 punpckhwd m3, m16, m0 1283 pmaddwd m3, m3 1284 palignr m17, m16, 4 1285 paddw m1, m17 ; sum 1286 punpcklwd m16, m17, m6 1287 vpdpwssd m2, m16, m16 ; sumsq 1288 punpckhwd m17, m6 1289 vpdpwssd m3, m17, m17 1290 paddw m0, m1, [t1+r10+416*0] 1291 paddd m16, m2, [t1+r10+416*2] 1292 paddd m17, m3, [t1+r10+416*4] 1293 mova [t1+r10+416*0], m1 1294 mova [t1+r10+416*2], m2 1295 mova [t1+r10+416*4], m3 1296 paddw m1, m0, [t2+r10+416*0] 1297 paddd m2, m16, [t2+r10+416*2] 1298 paddd m3, m17, [t2+r10+416*4] 1299 mova [t2+r10+416*0], m0 1300 mova [t2+r10+416*2], m16 1301 mova [t2+r10+416*4], m17 1302 paddd m2, m8 1303 paddd m3, m8 1304 psrld m2, 4 ; (a + 8) >> 4 1305 psrld m3, 4 1306 pmulld m2, m9 ; -((a + 8) >> 4) * 9 1307 pmulld m3, m9 1308 psrlw m17, m1, 1 1309 pavgw m17, m6 ; (b + 2) >> 2 1310 punpcklwd m16, m17, m6 1311 vpdpwssd m2, m16, m16 ; -p 1312 punpckhwd m17, m6 1313 vpdpwssd m3, m17, m17 1314 punpcklwd m16, m6, m1 ; b 1315 punpckhwd m17, 
m6, m1 1316 pminsd m2, m6 1317 pminsd m3, m6 1318 pmulld m2, m10 ; p * s 1319 pmulld m3, m10 1320 pmaddwd m16, m11 ; b * 455 1321 pmaddwd m17, m11 1322 vpalignr m3{k2}, m2, m2, 2 1323 mova m2, m20 1324 paddusw m3, m12 1325 psraw m3, 4 ; min(z, 255) - 256 1326 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 1327 vpmovb2m k3, m3 1328 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 1329 vmovdqu8 m3{k3}, m2 ; x 1330 pandn m2, m13, m3 1331 psrld m3, 16 1332 pmulld m16, m2 1333 pmulld m17, m3 1334 packssdw m2, m3 1335 psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) 1336 psubd m17, m13 1337 mova [t4+r10*1+416*0+4], m2 1338 psrld m16, 12 1339 psrld m17, 12 1340 mova [t3+r10*2+416*0+ 8], xm16 1341 mova [t3+r10*2+416*0+ 24], xm17 1342 vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 1343 vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 1344 vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 1345 vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 1346 vextracti32x4 [t3+r10*2+416*0+104], m16, 3 1347 vextracti32x4 [t3+r10*2+416*0+120], m17, 3 1348 add r10, 64 1349 jl .hv0_loop 1350 ret 1351ALIGN function_align 1352.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 1353 lea r10, [wq-4] 1354 test edgeb, 1 ; LR_HAVE_LEFT 1355 jz .hv1_extend_left 1356 movd xm16, [leftq+4] 1357 vmovdqu16 m16{k1}, [lpfq+wq-4] 1358 add leftq, 8 1359 jmp .hv1_main 1360.hv1_extend_left: 1361 vpbroadcastw xm16, [lpfq+wq] 1362 vmovdqu16 m16{k1}, [lpfq+wq-4] 1363 jmp .hv1_main 1364.hv1_bottom: 1365 lea r10, [wq-4] 1366 test edgeb, 1 ; LR_HAVE_LEFT 1367 jz .hv1_extend_left 1368.hv1_loop: 1369 movu m16, [lpfq+r10+ 0] 1370.hv1_main: 1371 movu m17, [lpfq+r10+16] 1372 test edgeb, 2 ; LR_HAVE_RIGHT 1373 jnz .hv1_have_right 1374 cmp r10d, -66 1375 jl .hv1_have_right 1376 vpbroadcastw m0, [lpfq-2] 1377 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 1378 vpternlogd m17, m0, [r13+r10+16], 0xe4 1379.hv1_have_right: 1380 palignr m1, m17, m16, 2 1381 paddw m0, m16, m1 1382 punpcklwd m2, m16, m1 1383 pmaddwd m2, m2 1384 punpckhwd m3, m16, m1 1385 pmaddwd m3, m3 1386 palignr m17, m16, 4 1387 paddw m0, m17 ; h sum 1388 punpcklwd m1, m17, m6 1389 vpdpwssd m2, m1, m1 ; h sumsq 1390 punpckhwd m17, m6 1391 vpdpwssd m3, m17, m17 1392 paddw m1, m0, [t2+r10+416*0] 1393 paddd m16, m2, [t2+r10+416*2] 1394 paddd m17, m3, [t2+r10+416*4] 1395 mova [t2+r10+416*0], m0 1396 mova [t2+r10+416*2], m2 1397 mova [t2+r10+416*4], m3 1398 paddd m16, m8 1399 paddd m17, m8 1400 psrld m16, 4 ; (a + 8) >> 4 1401 psrld m17, 4 1402 pmulld m16, m9 ; -((a + 8) >> 4) * 9 1403 pmulld m17, m9 1404 psrlw m3, m1, 1 1405 pavgw m3, m6 ; (b + 2) >> 2 1406 punpcklwd m2, m3, m6 1407 vpdpwssd m16, m2, m2 ; -p 1408 punpckhwd m3, m6 1409 vpdpwssd m17, m3, m3 1410 punpcklwd m0, m6, m1 ; b 1411 punpckhwd m1, m6, m1 1412 pminsd m16, m6 1413 pminsd m17, m6 1414 pmulld m16, m10 ; p * s 1415 pmulld m17, m10 1416 pmaddwd m0, m11 ; b * 455 1417 pmaddwd m1, m11 1418 vpalignr m17{k2}, m16, m16, 2 1419 mova m16, m20 1420 paddusw m17, m12 1421 psraw m17, 4 ; min(z, 255) - 256 1422 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] 1423 vpmovb2m k3, m17 1424 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] 1425 vmovdqu8 m17{k3}, m16 ; x 1426 pandn m16, m13, m17 1427 psrld m17, 16 1428 pmulld m0, m16 1429 pmulld m1, m17 1430 packssdw m16, m17 1431 psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) 1432 psubd m1, m13 1433 mova [t4+r10*1+416*2+4], m16 1434 psrld m16, m0, 12 1435 psrld m17, m1, 12 1436 mova [t3+r10*2+416*4+ 8], xm16 1437 mova [t3+r10*2+416*4+ 24], xm17 1438 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 1439 vextracti128 
[t3+r10*2+416*4+ 56], ym17, 1 1440 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 1441 vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 1442 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 1443 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 1444 add r10, 64 1445 jl .hv1_loop 1446 mov r10, t2 1447 mov t2, t1 1448 mov t1, r10 1449 ret 1450.v0: ; vertical boxsums + ab (even rows) 1451 lea r10, [wq-4] 1452.v0_loop: 1453 mova m0, [t1+r10+416*0] 1454 mova m16, [t1+r10+416*2] 1455 mova m17, [t1+r10+416*4] 1456 paddw m0, m0 1457 paddd m16, m16 1458 paddd m17, m17 1459 paddw m1, m0, [t2+r10+416*0] 1460 paddd m2, m16, [t2+r10+416*2] 1461 paddd m3, m17, [t2+r10+416*4] 1462 mova [t2+r10+416*0], m0 1463 mova [t2+r10+416*2], m16 1464 mova [t2+r10+416*4], m17 1465 paddd m2, m8 1466 paddd m3, m8 1467 psrld m2, 4 ; (a + 8) >> 4 1468 psrld m3, 4 1469 pmulld m2, m9 ; -((a + 8) >> 4) * 9 1470 pmulld m3, m9 1471 psrlw m17, m1, 1 1472 pavgw m17, m6 ; (b + 2) >> 2 1473 punpcklwd m16, m17, m6 1474 vpdpwssd m2, m16, m16 ; -p 1475 punpckhwd m17, m6 1476 vpdpwssd m3, m17, m17 1477 punpcklwd m16, m6, m1 ; b 1478 punpckhwd m17, m6, m1 1479 pminsd m2, m6 1480 pminsd m3, m6 1481 pmulld m2, m10 ; p * s 1482 pmulld m3, m10 1483 pmaddwd m16, m11 ; b * 455 1484 pmaddwd m17, m11 1485 vpalignr m3{k2}, m2, m2, 2 1486 mova m2, m20 1487 paddusw m3, m12 1488 psraw m3, 4 ; min(z, 255) - 256 1489 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 1490 vpmovb2m k3, m3 1491 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 1492 vmovdqu8 m3{k3}, m2 ; x 1493 pandn m2, m13, m3 1494 psrld m3, 16 1495 pmulld m16, m2 1496 pmulld m17, m3 1497 packssdw m2, m3 1498 psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) 1499 psubd m17, m13 1500 mova [t4+r10*1+416*0+4], m2 1501 psrld m16, 12 1502 psrld m17, 12 1503 mova [t3+r10*2+416*0+ 8], xm16 1504 mova [t3+r10*2+416*0+ 24], xm17 1505 vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 1506 vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 1507 vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 1508 vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 1509 vextracti32x4 [t3+r10*2+416*0+104], m16, 3 1510 vextracti32x4 [t3+r10*2+416*0+120], m17, 3 1511 add r10, 64 1512 jl .v0_loop 1513 ret 1514.v1: ; vertical boxsums + ab (odd rows) 1515 lea r10, [wq-4] 1516.v1_loop: 1517 mova m0, [t1+r10+416*0] 1518 mova m16, [t1+r10+416*2] 1519 mova m17, [t1+r10+416*4] 1520 paddw m1, m0, [t2+r10+416*0] 1521 paddd m2, m16, [t2+r10+416*2] 1522 paddd m3, m17, [t2+r10+416*4] 1523 mova [t2+r10+416*0], m0 1524 mova [t2+r10+416*2], m16 1525 mova [t2+r10+416*4], m17 1526 paddd m2, m8 1527 paddd m3, m8 1528 psrld m2, 4 ; (a + 8) >> 4 1529 psrld m3, 4 1530 pmulld m2, m9 ; -((a + 8) >> 4) * 9 1531 pmulld m3, m9 1532 psrlw m17, m1, 1 1533 pavgw m17, m6 ; (b + 2) >> 2 1534 punpcklwd m16, m17, m6 1535 vpdpwssd m2, m16, m16 ; -p 1536 punpckhwd m17, m6 1537 vpdpwssd m3, m17, m17 1538 punpcklwd m16, m6, m1 ; b 1539 punpckhwd m17, m6, m1 1540 pminsd m2, m6 1541 pminsd m3, m6 1542 pmulld m2, m10 ; p * s 1543 pmulld m3, m10 1544 pmaddwd m16, m11 ; b * 455 1545 pmaddwd m17, m11 1546 vpalignr m3{k2}, m2, m2, 2 1547 mova m2, m20 1548 paddusw m3, m12 1549 psraw m3, 4 ; min(z, 255) - 256 1550 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 1551 vpmovb2m k3, m3 1552 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 1553 vmovdqu8 m3{k3}, m2 ; x 1554 pandn m2, m13, m3 1555 psrld m3, 16 1556 pmulld m16, m2 1557 pmulld m17, m3 1558 packssdw m2, m3 1559 psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) 1560 psubd m17, m13 1561 mova [t4+r10*1+416*2+4], m2 1562 psrld m16, 12 1563 psrld m17, 12 1564 mova 
[t3+r10*2+416*4+ 8], xm16 1565 mova [t3+r10*2+416*4+ 24], xm17 1566 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 1567 vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 1568 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 1569 vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 1570 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 1571 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 1572 add r10, 64 1573 jl .v1_loop 1574 mov r10, t2 1575 mov t2, t1 1576 mov t1, r10 1577 ret 1578.prep_n: ; initial neighbor setup 1579 mov r10, wq 1580.prep_n_loop: 1581 mova ym16, [t4+r10*1+416*0+0] 1582 paddw ym16, [t4+r10*1+416*0+4] 1583 paddw ym17, ym16, [t4+r10*1+416*0+2] 1584 mova m0, [t3+r10*2+416*0+0] 1585 paddd m0, [t3+r10*2+416*0+8] 1586 paddd m1, m0, [t3+r10*2+416*0+4] 1587 psllw ym17, 2 ; a[-1] 444 1588 pslld m1, 2 ; b[-1] 444 1589 psubw ym17, ym16 ; a[-1] 343 1590 psubd m1, m0 ; b[-1] 343 1591 vmovdqa32 [t4+r10*1+416* 4], ym17 1592 vmovdqa32 [t3+r10*2+416* 8], m1 1593 mova ym16, [t4+r10*1+416*2+0] 1594 paddw ym16, [t4+r10*1+416*2+4] 1595 paddw ym17, ym16, [t4+r10*1+416*2+2] 1596 mova m0, [t3+r10*2+416*4+0] 1597 paddd m0, [t3+r10*2+416*4+8] 1598 paddd m1, m0, [t3+r10*2+416*4+4] 1599 psllw ym17, 2 ; a[ 0] 444 1600 pslld m1, 2 ; b[ 0] 444 1601 vmovdqa32 [t4+r10*1+416* 6], ym17 1602 vmovdqa32 [t3+r10*2+416*12], m1 1603 psubw ym17, ym16 ; a[ 0] 343 1604 psubd m1, m0 ; b[ 0] 343 1605 vmovdqa32 [t4+r10*1+416* 8], ym17 1606 vmovdqa32 [t3+r10*2+416*16], m1 1607 add r10, 32 1608 jl .prep_n_loop 1609 ret 1610ALIGN function_align 1611.n0: ; neighbor + output (even rows) 1612 mov r10, wq 1613.n0_loop: 1614 mova m3, [t4+r10*1+416*0+0] 1615 paddw m3, [t4+r10*1+416*0+4] 1616 paddw m1, m3, [t4+r10*1+416*0+2] 1617 psllw m1, 2 ; a[ 1] 444 1618 psubw m2, m1, m3 ; a[ 1] 343 1619 paddw m3, m2, [t4+r10*1+416*4] 1620 paddw m3, [t4+r10*1+416*6] 1621 mova [t4+r10*1+416*4], m2 1622 mova [t4+r10*1+416*6], m1 1623 mova m16, [t3+r10*2+416*0+0] 1624 paddd m16, [t3+r10*2+416*0+8] 1625 paddd m1, m16, [t3+r10*2+416*0+4] 1626 pslld m1, 2 ; b[ 1] 444 1627 psubd m2, m1, m16 ; b[ 1] 343 1628 paddd m16, m2, [t3+r10*2+416* 8+ 0] 1629 paddd m16, [t3+r10*2+416*12+ 0] 1630 mova [t3+r10*2+416* 8+ 0], m2 1631 mova [t3+r10*2+416*12+ 0], m1 1632 mova m17, [t3+r10*2+416*0+64] 1633 paddd m17, [t3+r10*2+416*0+72] 1634 paddd m1, m17, [t3+r10*2+416*0+68] 1635 pslld m1, 2 1636 psubd m2, m1, m17 1637 paddd m17, m2, [t3+r10*2+416* 8+64] 1638 paddd m17, [t3+r10*2+416*12+64] 1639 mova [t3+r10*2+416* 8+64], m2 1640 mova [t3+r10*2+416*12+64], m1 1641 mova m0, [dstq+r10] 1642 punpcklwd m1, m0, m6 1643 punpcklwd m2, m3, m6 1644 pmaddwd m2, m1 ; a * src 1645 punpckhwd m1, m0, m6 1646 punpckhwd m3, m6 1647 pmaddwd m3, m1 1648 vshufi32x4 m1, m16, m17, q2020 1649 vshufi32x4 m16, m17, q3131 1650 psubd m1, m2 ; b - a * src + (1 << 8) 1651 psubd m16, m3 1652 psrad m1, 9 1653 psrad m16, 9 1654 packssdw m1, m16 1655 pmulhrsw m1, m7 1656 paddw m0, m1 1657 pmaxsw m0, m6 1658 pminsw m0, m14 1659 mova [dstq+r10], m0 1660 add r10, 64 1661 jl .n0_loop 1662 add dstq, strideq 1663 ret 1664ALIGN function_align 1665.n1: ; neighbor + output (odd rows) 1666 mov r10, wq 1667.n1_loop: 1668 mova m3, [t4+r10*1+416*2+0] 1669 paddw m3, [t4+r10*1+416*2+4] 1670 paddw m1, m3, [t4+r10*1+416*2+2] 1671 psllw m1, 2 ; a[ 1] 444 1672 psubw m2, m1, m3 ; a[ 1] 343 1673 paddw m3, m2, [t4+r10*1+416*6] 1674 paddw m3, [t4+r10*1+416*8] 1675 mova [t4+r10*1+416*6], m1 1676 mova [t4+r10*1+416*8], m2 1677 mova m16, [t3+r10*2+416*4+0] 1678 paddd m16, [t3+r10*2+416*4+8] 1679 paddd m1, m16, [t3+r10*2+416*4+4] 1680 pslld m1, 2 ; b[ 1] 444 
1681 psubd m2, m1, m16 ; b[ 1] 343 1682 paddd m16, m2, [t3+r10*2+416*12+ 0] 1683 paddd m16, [t3+r10*2+416*16+ 0] 1684 mova [t3+r10*2+416*12+ 0], m1 1685 mova [t3+r10*2+416*16+ 0], m2 1686 mova m17, [t3+r10*2+416*4+64] 1687 paddd m17, [t3+r10*2+416*4+72] 1688 paddd m1, m17, [t3+r10*2+416*4+68] 1689 pslld m1, 2 1690 psubd m2, m1, m17 1691 paddd m17, m2, [t3+r10*2+416*12+64] 1692 paddd m17, [t3+r10*2+416*16+64] 1693 mova [t3+r10*2+416*12+64], m1 1694 mova [t3+r10*2+416*16+64], m2 1695 mova m0, [dstq+r10] 1696 punpcklwd m1, m0, m6 1697 punpcklwd m2, m3, m6 1698 pmaddwd m2, m1 ; a * src 1699 punpckhwd m1, m0, m6 1700 punpckhwd m3, m6 1701 pmaddwd m3, m1 1702 vshufi32x4 m1, m16, m17, q2020 1703 vshufi32x4 m16, m17, q3131 1704 psubd m1, m2 ; b - a * src + (1 << 8) 1705 psubd m16, m3 1706 psrad m1, 9 1707 psrad m16, 9 1708 packssdw m1, m16 1709 pmulhrsw m1, m7 1710 paddw m0, m1 1711 pmaxsw m0, m6 1712 pminsw m0, m14 1713 mova [dstq+r10], m0 1714 add r10, 64 1715 jl .n1_loop 1716 add dstq, strideq 1717 ret 1718 1719cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \ 1720 w, h, edge, params 1721 movifnidn wd, wm 1722 mov paramsq, r6mp 1723 lea r13, [r_ext_mask+72] 1724 mov edged, r7m 1725 movifnidn hd, hm 1726 vpbroadcastd m7, [paramsq+8] ; w0 w1 1727 pxor m6, m6 1728 vpbroadcastd m8, [base+pd_8] 1729 add wd, wd 1730 vpbroadcastd m9, [base+pd_m9] 1731 add lpfq, wq 1732 vpbroadcastd m10, [base+pd_m25] 1733 add dstq, wq 1734 vpsubd m11, m6, [paramsq+0] {1to16} ; -s0 1735 lea t3, [rsp+wq*2+416*24+8] 1736 vpsubd m12, m6, [paramsq+4] {1to16} ; -s1 1737 lea t4, [rsp+wq+416*52+8] 1738 vpbroadcastd m13, [base+pw_164_455] 1739 lea t1, [rsp+wq+12] 1740 vpbroadcastd m14, [base+pw_61448] 1741 neg wq 1742 vpbroadcastd m15, [base+pd_m34816] 1743 psllw m7, 2 1744 vpbroadcastd m22, [base+pd_2147483648] 1745 mov r10d, 0xfffffff8 1746 mova m18, [sgr_x_by_x+64*0] 1747 kmovd k1, r10d 1748 mova m19, [sgr_x_by_x+64*1] 1749 mov r10, 0x3333333333333333 1750 mova m20, [sgr_x_by_x+64*2] 1751 kmovq k2, r10 1752 mova m21, [sgr_x_by_x+64*3] 1753 test edgeb, 4 ; LR_HAVE_TOP 1754 jz .no_top 1755 call .h_top 1756 add lpfq, strideq 1757 mov t2, t1 1758 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup 1759 add t1, 416*12 1760 call .h_top 1761 lea r10, [lpfq+strideq*4] 1762 mov lpfq, dstq 1763 add r10, strideq 1764 mov [rsp], r10 ; below 1765 call .hv0 1766.main: 1767 dec hd 1768 jz .height1 1769 add lpfq, strideq 1770 call .hv1 1771 call .prep_n 1772 sub hd, 2 1773 jl .extend_bottom 1774.main_loop: 1775 add lpfq, strideq 1776 call .hv0 1777 test hd, hd 1778 jz .odd_height 1779 add lpfq, strideq 1780 call .hv1 1781 call .n0 1782 call .n1 1783 sub hd, 2 1784 jge .main_loop 1785 test edgeb, 8 ; LR_HAVE_BOTTOM 1786 jz .extend_bottom 1787 mov lpfq, [rsp] 1788 call .hv0_bottom 1789 add lpfq, strideq 1790 call .hv1_bottom 1791.end: 1792 call .n0 1793 call .n1 1794.end2: 1795 RET 1796.height1: 1797 call .v1 1798 call .prep_n 1799 jmp .odd_height_end 1800.odd_height: 1801 call .v1 1802 call .n0 1803 call .n1 1804.odd_height_end: 1805 call .v0 1806 call .v1 1807 call .n0 1808 jmp .end2 1809.extend_bottom: 1810 call .v0 1811 call .v1 1812 jmp .end 1813.no_top: 1814 lea r10, [lpfq+strideq*4] 1815 mov lpfq, dstq 1816 lea r10, [r10+strideq*2] 1817 mov [rsp], r10 1818 call .h 1819 lea r10, [wq-4] 1820 lea t2, [t1+416*12] 1821.top_fixup_loop: 1822 mova m0, [t1+r10+416* 0] 1823 mova m1, [t1+r10+416* 2] 1824 mova m2, [t1+r10+416* 4] 1825 paddw m0, m0 1826 mova m3, [t1+r10+416* 6] 1827 paddd m1, m1 1828 
mova m4, [t1+r10+416* 8] 1829 paddd m2, m2 1830 mova m5, [t1+r10+416*10] 1831 mova [t2+r10+416* 0], m0 1832 mova [t2+r10+416* 2], m1 1833 mova [t2+r10+416* 4], m2 1834 mova [t2+r10+416* 6], m3 1835 mova [t2+r10+416* 8], m4 1836 mova [t2+r10+416*10], m5 1837 add r10, 64 1838 jl .top_fixup_loop 1839 call .v0 1840 jmp .main 1841.h: ; horizontal boxsum 1842 lea r10, [wq-4] 1843 test edgeb, 1 ; LR_HAVE_LEFT 1844 jz .h_extend_left 1845 movq xm16, [leftq+2] 1846 vmovdqu16 m16{k1}, [lpfq+wq-6] 1847 add leftq, 8 1848 jmp .h_main 1849.h_extend_left: 1850 vpbroadcastw xm16, [lpfq+wq] 1851 vmovdqu16 m16{k1}, [lpfq+wq-6] 1852 jmp .h_main 1853.h_top: 1854 lea r10, [wq-4] 1855 test edgeb, 1 ; LR_HAVE_LEFT 1856 jz .h_extend_left 1857.h_loop: 1858 movu m16, [lpfq+r10- 2] 1859.h_main: 1860 movu m17, [lpfq+r10+14] 1861 test edgeb, 2 ; LR_HAVE_RIGHT 1862 jnz .h_have_right 1863 cmp r10d, -68 1864 jl .h_have_right 1865 vpbroadcastw m0, [lpfq-2] 1866 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 1867 vpternlogd m17, m0, [r13+r10+16], 0xe4 1868.h_have_right: 1869 palignr m3, m17, m16, 2 1870 palignr m0, m17, m16, 4 1871 paddw m1, m3, m0 1872 punpcklwd m2, m3, m0 1873 pmaddwd m2, m2 1874 punpckhwd m3, m0 1875 pmaddwd m3, m3 1876 palignr m0, m17, m16, 6 1877 paddw m1, m0 ; sum3 1878 punpcklwd m4, m0, m6 1879 vpdpwssd m2, m4, m4 ; sumsq3 1880 punpckhwd m0, m6 1881 vpdpwssd m3, m0, m0 1882 shufpd m4, m16, m17, 0x55 1883 punpcklwd m17, m4, m16 1884 paddw m0, m16, m4 1885 punpckhwd m4, m16 1886 mova [t1+r10+416* 6], m1 1887 mova [t1+r10+416* 8], m2 1888 mova [t1+r10+416*10], m3 1889 paddw m1, m0 ; sum5 1890 vpdpwssd m2, m17, m17 ; sumsq5 1891 vpdpwssd m3, m4, m4 1892 mova [t1+r10+416* 0], m1 1893 mova [t1+r10+416* 2], m2 1894 mova [t1+r10+416* 4], m3 1895 add r10, 64 1896 jl .h_loop 1897 ret 1898ALIGN function_align 1899.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) 1900 lea r10, [wq-4] 1901 test edgeb, 1 ; LR_HAVE_LEFT 1902 jz .hv0_extend_left 1903 movq xm16, [leftq+2] 1904 vmovdqu16 m16{k1}, [lpfq+wq-6] 1905 add leftq, 8 1906 jmp .hv0_main 1907.hv0_extend_left: 1908 vpbroadcastw xm16, [lpfq+wq] 1909 vmovdqu16 m16{k1}, [lpfq+wq-6] 1910 jmp .hv0_main 1911.hv0_bottom: 1912 lea r10, [wq-4] 1913 test edgeb, 1 ; LR_HAVE_LEFT 1914 jz .hv0_extend_left 1915.hv0_loop: 1916 movu m16, [lpfq+r10- 2] 1917.hv0_main: 1918 movu m17, [lpfq+r10+14] 1919 test edgeb, 2 ; LR_HAVE_RIGHT 1920 jnz .hv0_have_right 1921 cmp r10d, -68 1922 jl .hv0_have_right 1923 vpbroadcastw m0, [lpfq-2] 1924 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 1925 vpternlogd m17, m0, [r13+r10+16], 0xe4 1926.hv0_have_right: 1927 palignr m3, m17, m16, 2 1928 palignr m0, m17, m16, 4 1929 paddw m1, m3, m0 1930 punpcklwd m2, m3, m0 1931 pmaddwd m2, m2 1932 punpckhwd m3, m0 1933 pmaddwd m3, m3 1934 palignr m0, m17, m16, 6 1935 paddw m1, m0 ; h sum3 1936 punpcklwd m4, m0, m6 1937 vpdpwssd m2, m4, m4 ; h sumsq3 1938 punpckhwd m0, m6 1939 vpdpwssd m3, m0, m0 1940 shufpd m17, m16, m17, 0x55 1941 paddw m4, m1, [t1+r10+416* 6] 1942 paddd m5, m2, [t1+r10+416* 8] 1943 mova [t1+r10+416* 6], m1 1944 mova [t1+r10+416* 8], m2 1945 paddw m1, m16 1946 paddw m1, m17 ; h sum5 1947 punpcklwd m0, m17, m16 1948 vpdpwssd m2, m0, m0 ; h sumsq5 1949 paddd m0, m3, [t1+r10+416*10] 1950 mova [t1+r10+416*10], m3 1951 punpckhwd m17, m16 1952 vpdpwssd m3, m17, m17 1953 mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row 1954 mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd 1955 mova [t3+r10*2+416*0+72], m3 1956 paddw m1, [t1+r10+416* 0] 1957 paddd m2, [t1+r10+416* 2] 1958 paddd 
m3, [t1+r10+416* 4] 1959 mova [t1+r10+416* 0], m1 1960 mova [t1+r10+416* 2], m2 1961 mova [t1+r10+416* 4], m3 1962 paddw m17, m4, [t2+r10+416* 6] 1963 paddd m2, m5, [t2+r10+416* 8] 1964 paddd m3, m0, [t2+r10+416*10] 1965 mova [t2+r10+416* 6], m4 1966 mova [t2+r10+416* 8], m5 1967 mova [t2+r10+416*10], m0 1968 paddd m2, m8 1969 paddd m3, m8 1970 psrld m2, 4 ; (a3 + 8) >> 4 1971 psrld m3, 4 1972 pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 1973 pmulld m3, m9 1974 psrlw m5, m17, 1 1975 pavgw m5, m6 ; (b3 + 2) >> 2 1976 punpcklwd m4, m5, m6 1977 vpdpwssd m2, m4, m4 ; -p3 1978 punpckhwd m5, m6 1979 vpdpwssd m3, m5, m5 1980 punpcklwd m16, m6, m17 ; b3 1981 punpckhwd m17, m6, m17 1982 pminsd m2, m6 1983 pminsd m3, m6 1984 pmulld m2, m12 ; p3 * s1 1985 pmulld m3, m12 1986 pmaddwd m16, m13 ; b3 * 455 1987 pmaddwd m17, m13 1988 vpalignr m3{k2}, m2, m2, 2 1989 mova m2, m20 1990 paddusw m3, m14 1991 psraw m3, 4 ; min(z3, 255) - 256 1992 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 1993 vpmovb2m k3, m3 1994 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 1995 vmovdqu8 m3{k3}, m2 ; x3 1996 pandn m2, m15, m3 1997 psrld m3, 16 1998 pmulld m16, m2 1999 pmulld m17, m3 2000 packssdw m2, m3 2001 mova [t4+r10*1+416*2+4], m2 2002 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2003 psubd m17, m15 2004 psrld m16, 12 2005 psrld m17, 12 2006 mova [t3+r10*2+416*4+ 8], xm16 2007 mova [t3+r10*2+416*4+ 24], xm17 2008 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 2009 vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 2010 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 2011 vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 2012 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 2013 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 2014 add r10, 64 2015 jl .hv0_loop 2016 ret 2017ALIGN function_align 2018.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 2019 lea r10, [wq-4] 2020 test edgeb, 1 ; LR_HAVE_LEFT 2021 jz .hv1_extend_left 2022 movq xm16, [leftq+2] 2023 vmovdqu16 m16{k1}, [lpfq+wq-6] 2024 add leftq, 8 2025 jmp .hv1_main 2026.hv1_extend_left: 2027 vpbroadcastw xm16, [lpfq+wq] 2028 vmovdqu16 m16{k1}, [lpfq+wq-6] 2029 jmp .hv1_main 2030.hv1_bottom: 2031 lea r10, [wq-4] 2032 test edgeb, 1 ; LR_HAVE_LEFT 2033 jz .hv1_extend_left 2034.hv1_loop: 2035 movu m16, [lpfq+r10- 2] 2036.hv1_main: 2037 movu m17, [lpfq+r10+14] 2038 test edgeb, 2 ; LR_HAVE_RIGHT 2039 jnz .hv1_have_right 2040 cmp r10d, -68 2041 jl .hv1_have_right 2042 vpbroadcastw m0, [lpfq-2] 2043 vpternlogd m16, m0, [r13+r10+ 0], 0xe4 2044 vpternlogd m17, m0, [r13+r10+16], 0xe4 2045.hv1_have_right: 2046 palignr m1, m17, m16, 2 2047 palignr m3, m17, m16, 4 2048 paddw m2, m1, m3 2049 punpcklwd m0, m1, m3 2050 pmaddwd m0, m0 2051 punpckhwd m1, m3 2052 pmaddwd m1, m1 2053 palignr m3, m17, m16, 6 2054 paddw m2, m3 ; h sum3 2055 punpcklwd m5, m3, m6 2056 vpdpwssd m0, m5, m5 ; h sumsq3 2057 punpckhwd m3, m6 2058 vpdpwssd m1, m3, m3 2059 shufpd m3, m16, m17, 0x55 2060 punpcklwd m5, m16, m3 2061 paddw m4, m16, m3 2062 punpckhwd m16, m3 2063 paddw m17, m2, [t2+r10+416* 6] 2064 mova [t2+r10+416* 6], m2 2065 paddw m4, m2 ; h sum5 2066 paddd m2, m0, [t2+r10+416* 8] 2067 paddd m3, m1, [t2+r10+416*10] 2068 mova [t2+r10+416* 8], m0 2069 mova [t2+r10+416*10], m1 2070 vpdpwssd m0, m5, m5 ; h sumsq5 2071 vpdpwssd m1, m16, m16 2072 paddd m2, m8 2073 paddd m3, m8 2074 psrld m2, 4 ; (a3 + 8) >> 4 2075 psrld m3, 4 2076 pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 2077 pmulld m3, m9 2078 psrlw m16, m17, 1 2079 pavgw m16, m6 ; (b3 + 2) >> 2 2080 punpcklwd m5, m16, m6 2081 vpdpwssd m2, m5, m5 ; -p3 2082 punpckhwd m16, m6 
2083 vpdpwssd m3, m16, m16 2084 punpcklwd m16, m6, m17 ; b3 2085 punpckhwd m17, m6, m17 2086 pminsd m2, m6 2087 pminsd m3, m6 2088 pmulld m2, m12 ; p3 * s1 2089 pmulld m3, m12 2090 pmaddwd m16, m13 ; b3 * 455 2091 pmaddwd m17, m13 2092 vpalignr m3{k2}, m2, m2, 2 2093 mova m2, m20 2094 paddusw m3, m14 2095 psraw m3, 4 ; min(z3, 255) - 256 2096 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 2097 vpmovb2m k3, m3 2098 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 2099 vmovdqu8 m3{k3}, m2 ; x3 2100 pandn m2, m15, m3 2101 psrld m3, 16 2102 pmulld m16, m2 2103 pmulld m17, m3 2104 packssdw m2, m3 2105 mova [t4+r10*1+416*4+4], m2 2106 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2107 psubd m17, m15 2108 psrld m16, 12 2109 psrld m17, 12 2110 paddw m5, m4, [t2+r10+416*0] 2111 paddd m2, m0, [t2+r10+416*2] 2112 paddd m3, m1, [t2+r10+416*4] 2113 paddw m5, [t1+r10+416*0] 2114 paddd m2, [t1+r10+416*2] 2115 paddd m3, [t1+r10+416*4] 2116 mova [t2+r10+416*0], m4 2117 mova [t2+r10+416*2], m0 2118 mova [t2+r10+416*4], m1 2119 mova [t3+r10*2+416*8+ 8], xm16 2120 mova [t3+r10*2+416*8+ 24], xm17 2121 vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 2122 vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 2123 vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 2124 vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 2125 vextracti32x4 [t3+r10*2+416*8+104], m16, 3 2126 vextracti32x4 [t3+r10*2+416*8+120], m17, 3 2127 paddd m2, m8 2128 paddd m3, m8 2129 psrld m2, 4 ; (a5 + 8) >> 4 2130 psrld m3, 4 2131 pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 2132 pmulld m3, m10 2133 psrlw m17, m5, 1 2134 pavgw m17, m6 ; (b5 + 2) >> 2 2135 punpcklwd m16, m17, m6 2136 vpdpwssd m2, m16, m16 ; -p5 2137 punpckhwd m17, m6 2138 vpdpwssd m3, m17, m17 2139 punpcklwd m16, m5, m6 ; b5 2140 punpckhwd m17, m5, m6 2141 pmulld m2, m11 ; p5 * s0 2142 pmulld m3, m11 2143 pmaddwd m16, m13 ; b5 * 164 2144 pmaddwd m17, m13 2145 vpalignr m3{k2}, m2, m2, 2 2146 mova m2, m20 2147 pmaxsw m3, m6 2148 paddusw m3, m14 2149 psraw m3, 4 ; min(z5, 255) - 256 2150 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 2151 vpmovb2m k3, m3 2152 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 2153 vmovdqu8 m3{k3}, m2 ; x5 2154 pandn m2, m15, m3 2155 psrld m3, 16 2156 pmulld m16, m2 2157 pmulld m17, m3 2158 packssdw m2, m3 2159 mova [t4+r10*1+416*0+4], m2 2160 psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 2161 psubd m17, m15 2162 psrld m16, 12 2163 psrld m17, 12 2164 mova [t3+r10*2+416*0+ 8], xm16 2165 mova [t3+r10*2+416*0+ 24], xm17 2166 vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 2167 vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 2168 vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 2169 vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 2170 vextracti32x4 [t3+r10*2+416*0+104], m16, 3 2171 vextracti32x4 [t3+r10*2+416*0+120], m17, 3 2172 add r10, 64 2173 jl .hv1_loop 2174 mov r10, t2 2175 mov t2, t1 2176 mov t1, r10 2177 ret 2178.v0: ; vertical boxsums + ab3 (even rows) 2179 lea r10, [wq-4] 2180.v0_loop: 2181 mova m16, [t1+r10+416* 6] 2182 mova m2, [t1+r10+416* 8] 2183 mova m3, [t1+r10+416*10] 2184 paddw m16, m16 2185 paddd m2, m2 2186 paddd m3, m3 2187 paddw m17, m16, [t2+r10+416* 6] 2188 paddd m4, m2, [t2+r10+416* 8] 2189 paddd m5, m3, [t2+r10+416*10] 2190 mova [t2+r10+416* 6], m16 2191 mova [t2+r10+416* 8], m2 2192 mova [t2+r10+416*10], m3 2193 paddd m4, m8 2194 paddd m5, m8 2195 psrld m4, 4 ; (a3 + 8) >> 4 2196 psrld m5, 4 2197 pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 2198 pmulld m5, m9 2199 psrlw m3, m17, 1 2200 pavgw m3, m6 ; (b3 + 2) >> 2 2201 punpcklwd m2, m3, m6 2202 vpdpwssd m4, m2, m2 ; -p3 2203 punpckhwd m3, 
m6 2204 vpdpwssd m5, m3, m3 2205 punpcklwd m16, m6, m17 ; b3 2206 punpckhwd m17, m6, m17 2207 pminsd m4, m6 2208 pminsd m5, m6 2209 pmulld m4, m12 ; p3 * s1 2210 pmulld m5, m12 2211 pmaddwd m16, m13 ; b3 * 455 2212 pmaddwd m17, m13 2213 vpalignr m5{k2}, m4, m4, 2 2214 mova m4, m20 2215 paddusw m5, m14 2216 psraw m5, 4 ; min(z3, 255) - 256 2217 vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] 2218 vpmovb2m k3, m5 2219 vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] 2220 vmovdqu8 m5{k3}, m4 ; x3 2221 pandn m4, m15, m5 2222 psrld m5, 16 2223 pmulld m16, m4 2224 pmulld m17, m5 2225 packssdw m4, m5 2226 mova [t4+r10*1+416*2+4], m4 2227 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2228 psubd m17, m15 2229 psrld m16, 12 2230 psrld m17, 12 2231 mova m3, [t1+r10+416*0] 2232 mova m4, [t1+r10+416*2] 2233 mova m5, [t1+r10+416*4] 2234 mova [t3+r10*2+416*8+ 8], m3 2235 mova [t3+r10*2+416*0+ 8], m4 2236 mova [t3+r10*2+416*0+72], m5 2237 paddw m3, m3 ; cc5 2238 paddd m4, m4 2239 paddd m5, m5 2240 mova [t1+r10+416*0], m3 2241 mova [t1+r10+416*2], m4 2242 mova [t1+r10+416*4], m5 2243 mova [t3+r10*2+416*4+ 8], xm16 2244 mova [t3+r10*2+416*4+ 24], xm17 2245 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 2246 vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 2247 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 2248 vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 2249 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 2250 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 2251 add r10, 64 2252 jl .v0_loop 2253 ret 2254.v1: ; vertical boxsums + ab (odd rows) 2255 lea r10, [wq-4] 2256.v1_loop: 2257 mova m16, [t1+r10+416* 6] 2258 mova m2, [t1+r10+416* 8] 2259 mova m3, [t1+r10+416*10] 2260 paddw m17, m16, [t2+r10+416* 6] 2261 paddd m4, m2, [t2+r10+416* 8] 2262 paddd m5, m3, [t2+r10+416*10] 2263 mova [t2+r10+416* 6], m16 2264 mova [t2+r10+416* 8], m2 2265 mova [t2+r10+416*10], m3 2266 paddd m4, m8 2267 paddd m5, m8 2268 psrld m4, 4 ; (a3 + 8) >> 4 2269 psrld m5, 4 2270 pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 2271 pmulld m5, m9 2272 psrlw m3, m17, 1 2273 pavgw m3, m6 ; (b3 + 2) >> 2 2274 punpcklwd m2, m3, m6 2275 vpdpwssd m4, m2, m2 ; -p3 2276 punpckhwd m3, m6 2277 vpdpwssd m5, m3, m3 2278 punpcklwd m16, m6, m17 ; b3 2279 punpckhwd m17, m6, m17 2280 pminsd m4, m6 2281 pminsd m5, m6 2282 pmulld m4, m12 ; p3 * s1 2283 pmulld m5, m12 2284 pmaddwd m16, m13 ; b3 * 455 2285 pmaddwd m17, m13 2286 vpalignr m5{k2}, m4, m4, 2 2287 mova m4, m20 2288 paddusw m5, m14 2289 psraw m5, 4 ; min(z3, 255) - 256 2290 vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] 2291 vpmovb2m k3, m5 2292 vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] 2293 vmovdqu8 m5{k3}, m4 ; x3 2294 pandn m4, m15, m5 2295 psrld m5, 16 2296 pmulld m16, m4 2297 pmulld m17, m5 2298 packssdw m4, m5 2299 mova [t4+r10*1+416*4+4], m4 2300 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2301 psubd m17, m15 2302 psrld m16, 12 2303 psrld m17, 12 2304 mova m0, [t3+r10*2+416*8+ 8] 2305 mova m4, [t3+r10*2+416*0+ 8] 2306 mova m5, [t3+r10*2+416*0+72] 2307 paddw m1, m0, [t2+r10+416*0] 2308 paddd m2, m4, [t2+r10+416*2] 2309 paddd m3, m5, [t2+r10+416*4] 2310 paddw m1, [t1+r10+416*0] 2311 paddd m2, [t1+r10+416*2] 2312 paddd m3, [t1+r10+416*4] 2313 mova [t2+r10+416*0], m0 2314 mova [t2+r10+416*2], m4 2315 mova [t2+r10+416*4], m5 2316 mova [t3+r10*2+416*8+ 8], xm16 2317 mova [t3+r10*2+416*8+ 24], xm17 2318 vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 2319 vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 2320 vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 2321 vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 2322 vextracti32x4 
.v1: ; vertical boxsums + ab (odd rows)
    lea           r10, [wq-4]
.v1_loop:
    mova          m16, [t1+r10+416* 6]
    mova          m2, [t1+r10+416* 8]
    mova          m3, [t1+r10+416*10]
    paddw         m17, m16, [t2+r10+416* 6]
    paddd         m4, m2, [t2+r10+416* 8]
    paddd         m5, m3, [t2+r10+416*10]
    mova [t2+r10+416* 6], m16
    mova [t2+r10+416* 8], m2
    mova [t2+r10+416*10], m3
    paddd         m4, m8
    paddd         m5, m8
    psrld         m4, 4             ; (a3 + 8) >> 4
    psrld         m5, 4
    pmulld        m4, m9            ; -((a3 + 8) >> 4) * 9
    pmulld        m5, m9
    psrlw         m3, m17, 1
    pavgw         m3, m6            ; (b3 + 2) >> 2
    punpcklwd     m2, m3, m6
    vpdpwssd      m4, m2, m2        ; -p3
    punpckhwd     m3, m6
    vpdpwssd      m5, m3, m3
    punpcklwd     m16, m6, m17      ; b3
    punpckhwd     m17, m6, m17
    pminsd        m4, m6
    pminsd        m5, m6
    pmulld        m4, m12           ; p3 * s1
    pmulld        m5, m12
    pmaddwd       m16, m13          ; b3 * 455
    pmaddwd       m17, m13
    vpalignr      m5{k2}, m4, m4, 2
    mova          m4, m20
    paddusw       m5, m14
    psraw         m5, 4             ; min(z3, 255) - 256
    vpermt2b      m4, m5, m21       ; sgr_x_by_x[128..255]
    vpmovb2m      k3, m5
    vpermi2b      m5, m18, m19      ; sgr_x_by_x[ 0..127]
    vmovdqu8      m5{k3}, m4        ; x3
    pandn         m4, m15, m5
    psrld         m5, 16
    pmulld        m16, m4
    pmulld        m17, m5
    packssdw      m4, m5
    mova [t4+r10*1+416*4+4], m4
    psubd         m16, m15          ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    psubd         m17, m15
    psrld         m16, 12
    psrld         m17, 12
    mova          m0, [t3+r10*2+416*8+ 8]
    mova          m4, [t3+r10*2+416*0+ 8]
    mova          m5, [t3+r10*2+416*0+72]
    paddw         m1, m0, [t2+r10+416*0]
    paddd         m2, m4, [t2+r10+416*2]
    paddd         m3, m5, [t2+r10+416*4]
    paddw         m1, [t1+r10+416*0]
    paddd         m2, [t1+r10+416*2]
    paddd         m3, [t1+r10+416*4]
    mova [t2+r10+416*0], m0
    mova [t2+r10+416*2], m4
    mova [t2+r10+416*4], m5
    mova [t3+r10*2+416*8+ 8], xm16
    mova [t3+r10*2+416*8+ 24], xm17
    vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
    vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*8+104], m16, 3
    vextracti32x4 [t3+r10*2+416*8+120], m17, 3
    paddd         m2, m8
    paddd         m3, m8
    psrld         m2, 4             ; (a5 + 8) >> 4
    psrld         m3, 4
    pmulld        m2, m10           ; -((a5 + 8) >> 4) * 25
    pmulld        m3, m10
    psrlw         m5, m1, 1
    pavgw         m5, m6            ; (b5 + 2) >> 2
    punpcklwd     m4, m5, m6
    vpdpwssd      m2, m4, m4        ; -p5
    punpckhwd     m5, m6
    vpdpwssd      m3, m5, m5
    punpcklwd     m16, m1, m6       ; b5
    punpckhwd     m17, m1, m6
    pmulld        m2, m11           ; p5 * s0
    pmulld        m3, m11
    pmaddwd       m16, m13          ; b5 * 164
    pmaddwd       m17, m13
    vpalignr      m3{k2}, m2, m2, 2
    mova          m2, m20
    pmaxsw        m3, m6
    paddusw       m3, m14
    psraw         m3, 4             ; min(z5, 255) - 256
    vpermt2b      m2, m3, m21       ; sgr_x_by_x[128..255]
    vpmovb2m      k3, m3
    vpermi2b      m3, m18, m19      ; sgr_x_by_x[ 0..127]
    vmovdqu8      m3{k3}, m2        ; x5
    pandn         m2, m15, m3
    psrld         m3, 16
    pmulld        m16, m2
    pmulld        m17, m3
    packssdw      m2, m3
    mova [t4+r10*1+416*0+4], m2
    psubd         m16, m15          ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    psubd         m17, m15
    psrld         m16, 12
    psrld         m17, 12
    mova [t3+r10*2+416*0+ 8], xm16
    mova [t3+r10*2+416*0+ 24], xm17
    vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
    vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*0+104], m16, 3
    vextracti32x4 [t3+r10*2+416*0+120], m17, 3
    add           r10, 64
    jl .v1_loop
    mov           r10, t2
    mov           t2, t1
    mov           t1, r10
    ret
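; .prep_n/.n0/.n1 below apply the neighbor weighting: the 5x5 pass combines
; a/b over three rows with 5/6/5 weights (the "565" buffers, refreshed on
; even rows only), while the 3x3 pass uses the 3/4/3 and 4/4/4 row patterns
; (the "343"/"444" buffers, refreshed every row). Each output pixel is then
; roughly a weighted blend of the source pixel with the two filtered values
; (weights w0/w1 held in m7), clipped to the pixel range; see the
; "b5 - a5 * src" / "b3 - a3 * src" comments in .n0/.n1 for the exact scaling.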
.prep_n: ; initial neighbor setup
    mov           r10, wq
.prep_n_loop:
    movu          ym0, [t4+r10*1+416*0+2]
    paddw         ym2, ym0, [t4+r10*1+416*0+0]
    paddw         ym2, [t4+r10*1+416*0+4]
    movu          m1, [t3+r10*2+416*0+4]
    paddd         m3, m1, [t3+r10*2+416*0+0]
    paddd         m3, [t3+r10*2+416*0+8]
    paddw         ym0, ym2
    paddd         m1, m3
    psllw         ym2, 2
    pslld         m3, 2
    paddw         ym0, ym2          ; a5 565
    paddd         m1, m3            ; b5 565
    mova [t4+r10*1+416* 6], ym0
    mova [t3+r10*2+416*12], m1
    mova          ym0, [t4+r10*1+416*2+0]
    paddw         ym0, [t4+r10*1+416*2+4]
    paddw         ym2, ym0, [t4+r10*1+416*2+2]
    mova          m1, [t3+r10*2+416*4+0]
    paddd         m1, [t3+r10*2+416*4+8]
    paddd         m3, m1, [t3+r10*2+416*4+4]
    psllw         ym2, 2            ; a3[-1] 444
    pslld         m3, 2             ; b3[-1] 444
    psubw         ym2, ym0          ; a3[-1] 343
    psubd         m3, m1            ; b3[-1] 343
    mova [t4+r10*1+416* 8], ym2
    mova [t3+r10*2+416*16], m3
    mova          ym0, [t4+r10*1+416*4+0]
    paddw         ym0, [t4+r10*1+416*4+4]
    paddw         ym2, ym0, [t4+r10*1+416*4+2]
    mova          m1, [t3+r10*2+416*8+0]
    paddd         m1, [t3+r10*2+416*8+8]
    paddd         m3, m1, [t3+r10*2+416*8+4]
    psllw         ym2, 2            ; a3[ 0] 444
    pslld         m3, 2             ; b3[ 0] 444
    mova [t4+r10*1+416*10], ym2
    mova [t3+r10*2+416*20], m3
    psubw         ym2, ym0          ; a3[ 0] 343
    psubd         m3, m1            ; b3[ 0] 343
    mova [t4+r10*1+416*12], ym2
    mova [t3+r10*2+416*24], m3
    add           r10, 32
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov           r10, wq
.n0_loop:
    movu          ym2, [t4+r10*1+2]
    paddw         ym0, ym2, [t4+r10*1+0]
    paddw         ym0, [t4+r10*1+4]
    paddw         ym2, ym0
    psllw         ym0, 2
    paddw         ym0, ym2          ; a5
    movu          m1, [t3+r10*2+4]
    paddd         m4, m1, [t3+r10*2+0]
    paddd         m4, [t3+r10*2+8]
    paddd         m1, m4
    pslld         m4, 2
    paddd         m4, m1            ; b5
    paddw         ym2, ym0, [t4+r10*1+416* 6]
    mova [t4+r10*1+416* 6], ym0
    paddd         m0, m4, [t3+r10*2+416*12]
    mova [t3+r10*2+416*12], m4
    mova          ym3, [t4+r10*1+416*2+0]
    paddw         ym3, [t4+r10*1+416*2+4]
    paddw         ym5, ym3, [t4+r10*1+416*2+2]
    psllw         ym5, 2            ; a3[ 1] 444
    psubw         ym4, ym5, ym3     ; a3[ 1] 343
    paddw         ym3, ym4, [t4+r10*1+416* 8]
    paddw         ym3, [t4+r10*1+416*10]
    mova [t4+r10*1+416* 8], ym4
    mova [t4+r10*1+416*10], ym5
    mova          m1, [t3+r10*2+416*4+0]
    paddd         m1, [t3+r10*2+416*4+8]
    paddd         m5, m1, [t3+r10*2+416*4+4]
    pslld         m5, 2             ; b3[ 1] 444
    psubd         m4, m5, m1        ; b3[ 1] 343
    paddd         m1, m4, [t3+r10*2+416*16]
    paddd         m1, [t3+r10*2+416*20]
    mova [t3+r10*2+416*16], m4
    mova [t3+r10*2+416*20], m5
    pmovzxwd      m4, [dstq+r10]
    pmovzxwd      m2, ym2           ; a5
    pmovzxwd      m3, ym3           ; a3
    pmaddwd       m2, m4            ; a5 * src
    pmaddwd       m3, m4            ; a3 * src
    vpshldd       m4, m22, 13
    psubd         m0, m2            ; b5 - a5 * src + (1 << 8)
    psubd         m1, m3            ; b3 - a3 * src + (1 << 8)
    psrld         m0, 9
    pslld         m1, 7
    vpblendmb     m0{k2}, m1, m0
    vpdpwssd      m4, m0, m7
    psrad         m4, 7
    pmaxsd        m4, m6
    vpmovusdw     ym16, m4          ; clip
    psrlw         ym16, 6
    mova [dstq+r10], ym16
    add           r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov           r10, wq
.n1_loop:
    mova          ym3, [t4+r10*1+416*4+0]
    paddw         ym3, [t4+r10*1+416*4+4]
    paddw         ym5, ym3, [t4+r10*1+416*4+2]
    psllw         ym5, 2            ; a3[ 1] 444
    psubw         ym4, ym5, ym3     ; a3[ 1] 343
    paddw         ym3, ym4, [t4+r10*1+416*12]
    paddw         ym3, [t4+r10*1+416*10]
    mova [t4+r10*1+416*10], ym5
    mova [t4+r10*1+416*12], ym4
    mova          m0, [t3+r10*2+416*8+0]
    paddd         m0, [t3+r10*2+416*8+8]
    paddd         m5, m0, [t3+r10*2+416*8+4]
    pslld         m5, 2             ; b3[ 1] 444
    psubd         m4, m5, m0        ; b3[ 1] 343
    paddd         m0, m4, [t3+r10*2+416*24]
    paddd         m0, [t3+r10*2+416*20]
    mova [t3+r10*2+416*20], m5
    mova [t3+r10*2+416*24], m4
    pmovzxwd      m4, [dstq+r10]
    pmovzxwd      m2, [t4+r10*1+416* 6]
    pmovzxwd      m3, ym3
    mova          m1, [t3+r10*2+416*12]
    pmaddwd       m2, m4            ; a5 * src
    pmaddwd       m3, m4            ; a3 * src
    vpshldd       m4, m22, 13
    psubd         m1, m2            ; b5 - a5 * src + (1 << 8)
    psubd         m0, m3            ; b3 - a3 * src + (1 << 8)
    pslld         m0, 7
    vpalignr      m0{k2}, m1, m1, 1
    vpdpwssd      m4, m0, m7
    psrad         m4, 7
    pmaxsd        m4, m6
    vpmovusdw     ym16, m4          ; clip
    psrlw         ym16, 6
    mova [dstq+r10], ym16
    add           r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

%endif ; ARCH_X86_64