1; Copyright © 2021, VideoLAN and dav1d authors 2; Copyright © 2021, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 30 31wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 32wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 33wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 34wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 35wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 36wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 37wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 38sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 39sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 40pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 41 42pb_m14_m13: times 8 db -14,-13 43pb_m10_m9: times 8 db -10, -9 44pb_m6_m5: times 8 db -6, -5 45pb_m2_m1: times 8 db -2, -1 46pb_2_3: times 8 db 2, 3 47pb_6_7: times 8 db 6, 7 48pw_256: times 8 dw 256 49pw_1023: times 8 dw 1023 50pd_8: times 4 dd 8 51pd_4096: times 4 dd 4096 52pd_34816: times 4 dd 34816 53pd_m262128: times 4 dd -262128 54pd_0xffff: times 4 dd 0xffff 55pd_0xf00800a4: times 4 dd 0xf00800a4 56pd_0xf00801c7: times 4 dd 0xf00801c7 57pd_0xfffffff0: times 4 dd 0xfffffff0 58 59wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 60wiener_round: dd 1049600, 1048832 61 62cextern sgr_x_by_x 63 64SECTION .text 65 66%macro movif64 2 ; dst, src 67 %if ARCH_X86_64 68 mov %1, %2 69 %endif 70%endmacro 71 72%macro movif32 2 ; dst, src 73 %if ARCH_X86_32 74 mov %1, %2 75 %endif 76%endmacro 77 78INIT_XMM ssse3 79%if ARCH_X86_32 80DECLARE_REG_TMP 5, 6 81 %if STACK_ALIGNMENT < 16 82 %assign extra_stack 13*16 83 %else 84 %assign extra_stack 12*16 85 %endif 86cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \ 87 dst, stride, left, lpf, w, flt 88 %if STACK_ALIGNMENT < 16 89 %define lpfm dword [esp+calloff+16*12+ 0] 90 %define wm dword [esp+calloff+16*12+ 4] 91 %define hd dword [esp+calloff+16*12+ 8] 92 %define edgeb byte [esp+calloff+16*12+12] 93 %define edged dword [esp+calloff+16*12+12] 94 %else 95 %define hd dword r5m 96 
%define edgeb byte r7m 97 %endif 98 %define PICmem dword [esp+calloff+4*0] 99 %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers 100 %define t1m dword [esp+calloff+4*2] 101 %define t2m dword [esp+calloff+4*3] 102 %define t3m dword [esp+calloff+4*4] 103 %define t4m dword [esp+calloff+4*5] 104 %define t5m dword [esp+calloff+4*6] 105 %define t6m dword [esp+calloff+4*7] 106 %define t2 t2m 107 %define t3 t3m 108 %define t4 t4m 109 %define t5 t5m 110 %define t6 t6m 111 %define m8 [esp+calloff+16*2] 112 %define m9 [esp+calloff+16*3] 113 %define m10 [esp+calloff+16*4] 114 %define m11 [esp+calloff+16*5] 115 %define m12 [esp+calloff+16*6] 116 %define m13 [esp+calloff+16*7] 117 %define m14 [esp+calloff+16*8] 118 %define m15 [esp+calloff+16*9] 119 %define r10 r4 120 %define base t0-wiener_shifts 121 %assign calloff 0 122 %if STACK_ALIGNMENT < 16 123 mov wd, [rstk+stack_offset+20] 124 mov wm, wd 125 mov r5, [rstk+stack_offset+24] 126 mov hd, r5 127 mov r5, [rstk+stack_offset+32] 128 mov edged, r5 ; edge 129 %endif 130%else 131DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers 132cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ 133 w, h, edge, flt 134 %define base 135%endif 136%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 137 movifnidn wd, wm 138%endif 139%if ARCH_X86_64 140 mov fltq, r6mp 141 movifnidn hd, hm 142 mov edged, r7m 143 mov t3d, r8m ; pixel_max 144 movq m13, [fltq] 145 movq m15, [fltq+16] 146%else 147 %if STACK_ALIGNMENT < 16 148 mov t0, [rstk+stack_offset+28] 149 mov t1, [rstk+stack_offset+36] ; pixel_max 150 movq m1, [t0] ; fx 151 movq m3, [t0+16] ; fy 152 LEA t0, wiener_shifts 153 %else 154 mov fltq, r6m 155 movq m1, [fltq] 156 movq m3, [fltq+16] 157 LEA t0, wiener_shifts 158 mov t1, r8m ; pixel_max 159 %endif 160 mov PICmem, t0 161%endif 162 mova m6, [base+wiener_shufA] 163 mova m7, [base+wiener_shufB] 164%if ARCH_X86_64 165 lea t4, [wiener_shifts] 166 add wd, wd 167 pshufd m12, m13, q0000 ; x0 x1 168 pshufd m13, m13, q1111 ; x2 x3 169 pshufd m14, m15, q0000 ; y0 y1 170 pshufd m15, m15, q1111 ; y2 y3 171 mova m8, [wiener_shufC] 172 mova m9, [wiener_shufD] 173 add lpfq, wq 174 lea t1, [rsp+wq+16] 175 add dstq, wq 176 neg wq 177 shr t3d, 11 178 %define base t4-wiener_shifts 179 movd m10, [base+wiener_round+t3*4] 180 movq m11, [base+wiener_shifts+t3*8] 181 pshufd m10, m10, q0000 182 pshufd m0, m11, q0000 183 pshufd m11, m11, q1111 184 pmullw m12, m0 ; upshift filter coefs to make the 185 pmullw m13, m0 ; horizontal downshift constant 186 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 187 %define lpfm [rsp] 188 %define base 189 %define wiener_lshuf7_mem [wiener_lshuf7] 190 %define pd_m262128_mem [pd_m262128] 191%else 192 add wd, wd 193 mova m4, [base+wiener_shufC] 194 mova m5, [base+wiener_shufD] 195 pshufd m0, m1, q0000 196 pshufd m1, m1, q1111 197 pshufd m2, m3, q0000 198 pshufd m3, m3, q1111 199 mova m8, m4 200 mova m9, m5 201 mova m14, m2 202 mova m15, m3 203 shr t1, 11 204 add lpfq, wq 205 mova m3, [base+pd_m262128] 206 movd m4, [base+wiener_round+t1*4] 207 movq m5, [base+wiener_shifts+t1*8] 208 lea t1, [esp+extra_stack+wq+16] 209 add dstq, wq 210 neg wq 211 pshufd m4, m4, q0000 212 pshufd m2, m5, q0000 213 pshufd m5, m5, q1111 214 mov wm, wq 215 pmullw m0, m2 216 pmullw m1, m2 217 mova m2, [base+wiener_lshuf7] 218 %define pd_m262128_mem [esp+calloff+16*10] 219 mova pd_m262128_mem, m3 220 mova m10, m4 221 mova m11, m5 222 mova m12, m0 223 mova m13, m1 224 %define wiener_lshuf7_mem [esp+calloff+16*11] 225 mova 
wiener_lshuf7_mem, m2 226%endif 227 test edgeb, 4 ; LR_HAVE_TOP 228 jz .no_top 229 call .h_top 230 add lpfq, strideq 231 mov t6, t1 232 mov t5, t1 233 add t1, 384*2 234 call .h_top 235 lea r10, [lpfq+strideq*4] 236 mov lpfq, dstq 237 mov t4, t1 238 add t1, 384*2 239 add r10, strideq 240 mov lpfm, r10 ; below 241 call .h 242 mov t3, t1 243 mov t2, t1 244 dec hd 245 jz .v1 246 add lpfq, strideq 247 add t1, 384*2 248 call .h 249 mov t2, t1 250 dec hd 251 jz .v2 252 add lpfq, strideq 253 add t1, 384*2 254 call .h 255 dec hd 256 jz .v3 257.main: 258 lea t0, [t1+384*2] 259.main_loop: 260 call .hv 261 dec hd 262 jnz .main_loop 263 test edgeb, 8 ; LR_HAVE_BOTTOM 264 jz .v3 265 mov lpfq, lpfm 266 call .hv_bottom 267 add lpfq, strideq 268 call .hv_bottom 269.v1: 270 call .v 271 RET 272.no_top: 273 lea r10, [lpfq+strideq*4] 274 mov lpfq, dstq 275 lea r10, [r10+strideq*2] 276 mov lpfm, r10 277 call .h 278 mov t6, t1 279 mov t5, t1 280 mov t4, t1 281 mov t3, t1 282 mov t2, t1 283 dec hd 284 jz .v1 285 add lpfq, strideq 286 add t1, 384*2 287 call .h 288 mov t2, t1 289 dec hd 290 jz .v2 291 add lpfq, strideq 292 add t1, 384*2 293 call .h 294 dec hd 295 jz .v3 296 lea t0, [t1+384*2] 297 call .hv 298 dec hd 299 jz .v3 300 add t0, 384*8 301 call .hv 302 dec hd 303 jnz .main 304.v3: 305 call .v 306 movif32 wq, wm 307.v2: 308 call .v 309 movif32 wq, wm 310 jmp .v1 311.extend_right: 312%assign stack_offset stack_offset+8 313%assign calloff 8 314 movif32 t0, PICmem 315 pxor m0, m0 316 movd m1, wd 317 mova m2, [base+pb_0to15] 318 pshufb m1, m0 319 mova m0, [base+pb_6_7] 320 psubb m0, m1 321 pminub m0, m2 322 pshufb m3, m0 323 mova m0, [base+pb_m2_m1] 324 psubb m0, m1 325 pminub m0, m2 326 pshufb m4, m0 327 mova m0, [base+pb_m10_m9] 328 psubb m0, m1 329 pminub m0, m2 330 pshufb m5, m0 331 movif32 t0, t0m 332 ret 333%assign stack_offset stack_offset-4 334%assign calloff 4 335.h: 336 movif64 wq, r4 337 movif32 wq, wm 338 test edgeb, 1 ; LR_HAVE_LEFT 339 jz .h_extend_left 340 movq m3, [leftq] 341 movhps m3, [lpfq+wq] 342 add leftq, 8 343 jmp .h_main 344.h_extend_left: 345 mova m3, [lpfq+wq] ; avoid accessing memory located 346 pshufb m3, wiener_lshuf7_mem ; before the start of the buffer 347 jmp .h_main 348.h_top: 349 movif64 wq, r4 350 test edgeb, 1 ; LR_HAVE_LEFT 351 jz .h_extend_left 352.h_loop: 353 movu m3, [lpfq+wq-8] 354.h_main: 355 mova m4, [lpfq+wq+0] 356 movu m5, [lpfq+wq+8] 357 test edgeb, 2 ; LR_HAVE_RIGHT 358 jnz .h_have_right 359 cmp wd, -20 360 jl .h_have_right 361 call .extend_right 362.h_have_right: 363 pshufb m0, m3, m6 364 pshufb m1, m4, m7 365 paddw m0, m1 366 pshufb m3, m8 367 pmaddwd m0, m12 368 pshufb m1, m4, m9 369 paddw m3, m1 370 pshufb m1, m4, m6 371 pmaddwd m3, m13 372 pshufb m2, m5, m7 373 paddw m1, m2 374 mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) 375 pshufb m4, m8 376 pmaddwd m1, m12 377 pshufb m5, m9 378 paddw m4, m5 379 pmaddwd m4, m13 380 paddd m0, m2 381 paddd m1, m2 382 paddd m0, m3 383 paddd m1, m4 384 psrad m0, 4 385 psrad m1, 4 386 packssdw m0, m1 387 psraw m0, 1 388 mova [t1+wq], m0 389 add wq, 16 390 jl .h_loop 391 movif32 wq, wm 392 ret 393ALIGN function_align 394.hv: 395 add lpfq, strideq 396 movif64 wq, r4 397 movif32 t0m, t0 398 movif32 t1m, t1 399 test edgeb, 1 ; LR_HAVE_LEFT 400 jz .hv_extend_left 401 movq m3, [leftq] 402 movhps m3, [lpfq+wq] 403 add leftq, 8 404 jmp .hv_main 405.hv_extend_left: 406 mova m3, [lpfq+wq] 407 pshufb m3, wiener_lshuf7_mem 408 jmp .hv_main 409.hv_bottom: 410 movif64 wq, r4 411 movif32 t0m, t0 412 movif32 t1m, t1 413 test edgeb, 1 ; 
LR_HAVE_LEFT 414 jz .hv_extend_left 415.hv_loop: 416 movu m3, [lpfq+wq-8] 417.hv_main: 418 mova m4, [lpfq+wq+0] 419 movu m5, [lpfq+wq+8] 420 test edgeb, 2 ; LR_HAVE_RIGHT 421 jnz .hv_have_right 422 cmp wd, -20 423 jl .hv_have_right 424 call .extend_right 425.hv_have_right: 426 movif32 t1, t4m 427 movif32 t0, t2m 428 pshufb m0, m3, m6 429 pshufb m1, m4, m7 430 paddw m0, m1 431 pshufb m3, m8 432 pmaddwd m0, m12 433 pshufb m1, m4, m9 434 paddw m3, m1 435 pshufb m1, m4, m6 436 pmaddwd m3, m13 437 pshufb m2, m5, m7 438 paddw m1, m2 439 mova m2, pd_m262128_mem 440 pshufb m4, m8 441 pmaddwd m1, m12 442 pshufb m5, m9 443 paddw m4, m5 444 pmaddwd m4, m13 445 paddd m0, m2 446 paddd m1, m2 447%if ARCH_X86_64 448 mova m2, [t4+wq] 449 paddw m2, [t2+wq] 450 mova m5, [t3+wq] 451%else 452 mova m2, [t1+wq] 453 paddw m2, [t0+wq] 454 mov t1, t3m 455 mov t0, t5m 456 mova m5, [t1+wq] 457 mov t1, t1m 458%endif 459 paddd m0, m3 460 paddd m1, m4 461 psrad m0, 4 462 psrad m1, 4 463 packssdw m0, m1 464%if ARCH_X86_64 465 mova m4, [t5+wq] 466 paddw m4, [t1+wq] 467 psraw m0, 1 468 paddw m3, m0, [t6+wq] 469%else 470 mova m4, [t0+wq] 471 paddw m4, [t1+wq] 472 mov t0, t0m 473 mov t1, t6m 474 psraw m0, 1 475 paddw m3, m0, [t1+wq] 476%endif 477 mova [t0+wq], m0 478 punpcklwd m0, m2, m5 479 pmaddwd m0, m15 480 punpckhwd m2, m5 481 pmaddwd m2, m15 482 punpcklwd m1, m3, m4 483 pmaddwd m1, m14 484 punpckhwd m3, m4 485 pmaddwd m3, m14 486 paddd m0, m10 487 paddd m2, m10 488 paddd m0, m1 489 paddd m2, m3 490 psrad m0, 6 491 psrad m2, 6 492 packssdw m0, m2 493 pmulhw m0, m11 494 pxor m1, m1 495 pmaxsw m0, m1 496 mova [dstq+wq], m0 497 add wq, 16 498 jl .hv_loop 499%if ARCH_X86_64 500 mov t6, t5 501 mov t5, t4 502 mov t4, t3 503 mov t3, t2 504 mov t2, t1 505 mov t1, t0 506 mov t0, t6 507%else 508 mov r4, t5m 509 mov t1, t4m 510 mov t6m, r4 511 mov t5m, t1 512 mov r4, t3m 513 mov t1, t2m 514 mov t4m, r4 515 mov t3m, t1 516 mov r4, t1m 517 mov t1, t0 518 mov t2m, r4 519 mov t0, t6m 520 mov wq, wm 521%endif 522 add dstq, strideq 523 ret 524.v: 525 movif64 wq, r4 526 movif32 t0m, t0 527 movif32 t1m, t1 528.v_loop: 529%if ARCH_X86_64 530 mova m1, [t4+wq] 531 paddw m1, [t2+wq] 532 mova m2, [t3+wq] 533 mova m4, [t1+wq] 534 paddw m3, m4, [t6+wq] 535 paddw m4, [t5+wq] 536%else 537 mov t0, t4m 538 mov t1, t2m 539 mova m1, [t0+wq] 540 paddw m1, [t1+wq] 541 mov t0, t3m 542 mov t1, t1m 543 mova m2, [t0+wq] 544 mova m4, [t1+wq] 545 mov t0, t6m 546 mov t1, t5m 547 paddw m3, m4, [t0+wq] 548 paddw m4, [t1+wq] 549%endif 550 punpcklwd m0, m1, m2 551 pmaddwd m0, m15 552 punpckhwd m1, m2 553 pmaddwd m1, m15 554 punpcklwd m2, m3, m4 555 pmaddwd m2, m14 556 punpckhwd m3, m4 557 pmaddwd m3, m14 558 paddd m0, m10 559 paddd m1, m10 560 paddd m0, m2 561 paddd m1, m3 562 psrad m0, 6 563 psrad m1, 6 564 packssdw m0, m1 565 pmulhw m0, m11 566 pxor m1, m1 567 pmaxsw m0, m1 568 mova [dstq+wq], m0 569 add wq, 16 570 jl .v_loop 571%if ARCH_X86_64 572 mov t6, t5 573 mov t5, t4 574 mov t4, t3 575 mov t3, t2 576 mov t2, t1 577%else 578 mov t0, t5m 579 mov t1, t4m 580 mov r4, t3m 581 mov t6m, t0 582 mov t5m, t1 583 mov t4m, r4 584 mov r4, t2m 585 mov t1, t1m 586 mov t0, t0m 587 mov t3m, r4 588 mov t2m, t1 589%endif 590 add dstq, strideq 591 ret 592 593%if ARCH_X86_32 594 %if STACK_ALIGNMENT < 16 595 %assign stack_size 12*16+384*8 596 %else 597 %assign stack_size 11*16+384*8 598 %endif 599cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \ 600 lpf, w, flt 601 %if STACK_ALIGNMENT < 16 602 %define lpfm dword [esp+calloff+4*6] 603 %define wm dword 
[esp+calloff+4*7] 604 %define hd dword [esp+calloff+16*10+0] 605 %define edgeb byte [esp+calloff+16*10+4] 606 %define edged dword [esp+calloff+16*10+4] 607 %else 608 %define hd dword r5m 609 %define edgeb byte r7m 610 %endif 611 %define PICmem dword [esp+calloff+4*0] 612 %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers 613 %define t1m dword [esp+calloff+4*2] 614 %define t2m dword [esp+calloff+4*3] 615 %define t3m dword [esp+calloff+4*4] 616 %define t4m dword [esp+calloff+4*5] 617 %define t2 t2m 618 %define t3 t3m 619 %define t4 t4m 620 %define m8 [esp+calloff+16*2] 621 %define m9 [esp+calloff+16*3] 622 %define m10 [esp+calloff+16*4] 623 %define m11 [esp+calloff+16*5] 624 %define m12 [esp+calloff+16*6] 625 %define m13 [esp+calloff+16*7] 626 %define m14 [esp+calloff+16*8] 627 %define m15 [esp+calloff+16*9] 628 %define base t0-wiener_shifts 629 %assign calloff 0 630 %if STACK_ALIGNMENT < 16 631 mov wd, [rstk+stack_offset+20] 632 mov wm, wd 633 mov r5, [rstk+stack_offset+24] 634 mov hd, r5 635 mov r5, [rstk+stack_offset+32] 636 mov edged, r5 ; edge 637 %endif 638%else 639cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \ 640 w, h, edge, flt 641 %define base 642%endif 643%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 644 movifnidn wd, wm 645%endif 646%if ARCH_X86_64 647 mov fltq, r6mp 648 movifnidn hd, hm 649 mov edged, r7m 650 mov t3d, r8m ; pixel_max 651 movq m12, [fltq] 652 movq m14, [fltq+16] 653%else 654 %if STACK_ALIGNMENT < 16 655 mov t0, [rstk+stack_offset+28] 656 mov t1, [rstk+stack_offset+36] ; pixel_max 657 movq m1, [t0] ; fx 658 movq m3, [t0+16] ; fy 659 LEA t0, wiener_shifts 660 %else 661 mov fltq, r6m 662 movq m1, [fltq] 663 movq m3, [fltq+16] 664 LEA t0, wiener_shifts 665 mov t1, r8m ; pixel_max 666 %endif 667 mov PICmem, t0 668%endif 669 mova m5, [base+wiener_shufE] 670 mova m6, [base+wiener_shufB] 671 mova m7, [base+wiener_shufD] 672%if ARCH_X86_64 673 lea t4, [wiener_shifts] 674 add wd, wd 675 punpcklwd m11, m12, m12 676 pshufd m11, m11, q1111 ; x1 677 pshufd m12, m12, q1111 ; x2 x3 678 punpcklwd m13, m14, m14 679 pshufd m13, m13, q1111 ; y1 680 pshufd m14, m14, q1111 ; y2 y3 681 shr t3d, 11 682 mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) 683 add lpfq, wq 684 lea t1, [rsp+wq+16] 685 add dstq, wq 686 neg wq 687 %define base t4-wiener_shifts 688 movd m9, [base+wiener_round+t3*4] 689 movq m10, [base+wiener_shifts+t3*8] 690 pshufd m9, m9, q0000 691 pshufd m0, m10, q0000 692 pshufd m10, m10, q1111 693 mova m15, [wiener_lshuf5] 694 pmullw m11, m0 695 pmullw m12, m0 696 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 697 %define lpfm [rsp] 698 %define base 699%else 700 add wd, wd 701 punpcklwd m0, m1, m1 702 pshufd m0, m0, q1111 ; x1 703 pshufd m1, m1, q1111 ; x2 x3 704 punpcklwd m2, m3, m3 705 pshufd m2, m2, q1111 ; y1 706 pshufd m3, m3, q1111 ; y2 y3 707 mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) 708 mova m13, m2 709 mova m14, m3 710 mova m8, m4 711 shr t1, 11 712 add lpfq, wq 713 movd m2, [base+wiener_round+t1*4] 714 movq m3, [base+wiener_shifts+t1*8] 715 %if STACK_ALIGNMENT < 16 716 lea t1, [esp+16*11+wq+16] 717 %else 718 lea t1, [esp+16*10+wq+16] 719 %endif 720 add dstq, wq 721 neg wq 722 pshufd m2, m2, q0000 723 pshufd m4, m3, q0000 724 pshufd m3, m3, q1111 725 mov wm, wq 726 pmullw m0, m4 727 pmullw m1, m4 728 mova m4, [base+wiener_lshuf5] 729 mova m9, m2 730 mova m10, m3 731 mova m11, m0 732 mova m12, m1 733 mova m15, m4 734%endif 735 test edgeb, 4 ; LR_HAVE_TOP 736 jz .no_top 737 call .h_top 738 add lpfq, strideq 739 
mov t4, t1 740 add t1, 384*2 741 call .h_top 742 lea r10, [lpfq+strideq*4] 743 mov lpfq, dstq 744 mov t3, t1 745 add t1, 384*2 746 add r10, strideq 747 mov lpfm, r10 ; below 748 call .h 749 mov t2, t1 750 dec hd 751 jz .v1 752 add lpfq, strideq 753 add t1, 384*2 754 call .h 755 dec hd 756 jz .v2 757.main: 758 mov t0, t4 759.main_loop: 760 call .hv 761 dec hd 762 jnz .main_loop 763 test edgeb, 8 ; LR_HAVE_BOTTOM 764 jz .v2 765 mov lpfq, lpfm 766 call .hv_bottom 767 add lpfq, strideq 768 call .hv_bottom 769.end: 770 RET 771.no_top: 772 lea r10, [lpfq+strideq*4] 773 mov lpfq, dstq 774 lea r10, [r10+strideq*2] 775 mov lpfm, r10 776 call .h 777 mov t4, t1 778 mov t3, t1 779 mov t2, t1 780 dec hd 781 jz .v1 782 add lpfq, strideq 783 add t1, 384*2 784 call .h 785 dec hd 786 jz .v2 787 lea t0, [t1+384*2] 788 call .hv 789 dec hd 790 jz .v2 791 add t0, 384*6 792 call .hv 793 dec hd 794 jnz .main 795.v2: 796 call .v 797%if ARCH_X86_64 798 mov t4, t3 799 mov t3, t2 800 mov t2, t1 801%else 802 mov t0, t3m 803 mov r4, t2m 804 mov t1, t1m 805 mov t4m, t0 806 mov t3m, r4 807 mov t2m, t1 808 mov wq, wm 809%endif 810 add dstq, strideq 811.v1: 812 call .v 813 jmp .end 814.extend_right: 815%assign stack_offset stack_offset+8 816%assign calloff 8 817 movif32 t0, PICmem 818 pxor m1, m1 819 movd m2, wd 820 mova m0, [base+pb_2_3] 821 pshufb m2, m1 822 mova m1, [base+pb_m6_m5] 823 psubb m0, m2 824 psubb m1, m2 825 mova m2, [base+pb_0to15] 826 pminub m0, m2 827 pminub m1, m2 828 pshufb m3, m0 829 pshufb m4, m1 830 ret 831%assign stack_offset stack_offset-4 832%assign calloff 4 833.h: 834 movif64 wq, r4 835 movif32 wq, wm 836 test edgeb, 1 ; LR_HAVE_LEFT 837 jz .h_extend_left 838 mova m4, [lpfq+wq] 839 movd m3, [leftq+4] 840 pslldq m4, 4 841 por m3, m4 842 add leftq, 8 843 jmp .h_main 844.h_extend_left: 845 mova m3, [lpfq+wq] ; avoid accessing memory located 846 pshufb m3, m15 ; before the start of the buffer 847 jmp .h_main 848.h_top: 849 movif64 wq, r4 850 movif32 wq, wm 851 test edgeb, 1 ; LR_HAVE_LEFT 852 jz .h_extend_left 853.h_loop: 854 movu m3, [lpfq+wq-4] 855.h_main: 856 movu m4, [lpfq+wq+4] 857 test edgeb, 2 ; LR_HAVE_RIGHT 858 jnz .h_have_right 859 cmp wd, -18 860 jl .h_have_right 861 call .extend_right 862.h_have_right: 863 pshufb m0, m3, m5 864 pmaddwd m0, m11 865 pshufb m1, m4, m5 866 pmaddwd m1, m11 867 pshufb m2, m3, m6 868 pshufb m3, m7 869 paddw m2, m3 870 pshufb m3, m4, m6 871 pmaddwd m2, m12 872 pshufb m4, m7 873 paddw m3, m4 874 pmaddwd m3, m12 875 paddd m0, m8 876 paddd m1, m8 877 paddd m0, m2 878 paddd m1, m3 879 psrad m0, 4 880 psrad m1, 4 881 packssdw m0, m1 882 psraw m0, 1 883 mova [t1+wq], m0 884 add wq, 16 885 jl .h_loop 886 movif32 wq, wm 887 ret 888ALIGN function_align 889.hv: 890 add lpfq, strideq 891 movif64 wq, r4 892 movif32 t0m, t0 893 movif32 t1m, t1 894 test edgeb, 1 ; LR_HAVE_LEFT 895 jz .hv_extend_left 896 mova m4, [lpfq+wq] 897 movd m3, [leftq+4] 898 pslldq m4, 4 899 por m3, m4 900 add leftq, 8 901 jmp .hv_main 902.hv_extend_left: 903 mova m3, [lpfq+wq] 904 pshufb m3, m15 905 jmp .hv_main 906.hv_bottom: 907 movif64 wq, r4 908 movif32 t0m, t0 909 movif32 t1m, t1 910 test edgeb, 1 ; LR_HAVE_LEFT 911 jz .hv_extend_left 912.hv_loop: 913 movu m3, [lpfq+wq-4] 914.hv_main: 915 movu m4, [lpfq+wq+4] 916 test edgeb, 2 ; LR_HAVE_RIGHT 917 jnz .hv_have_right 918 cmp wd, -18 919 jl .hv_have_right 920 call .extend_right 921.hv_have_right: 922 movif32 t1, t1m 923 movif32 t0, t3m 924 pshufb m0, m3, m5 925 pmaddwd m0, m11 926 pshufb m1, m4, m5 927 pmaddwd m1, m11 928 pshufb m2, m3, m6 929 
pshufb m3, m7 930 paddw m2, m3 931 pshufb m3, m4, m6 932 pmaddwd m2, m12 933 pshufb m4, m7 934 paddw m3, m4 935 pmaddwd m3, m12 936 paddd m0, m8 937 paddd m1, m8 938 paddd m0, m2 939%if ARCH_X86_64 940 mova m2, [t3+wq] 941 paddw m2, [t1+wq] 942 paddd m1, m3 943 mova m4, [t2+wq] 944%else 945 mova m2, [t0+wq] 946 mov t0, t2m 947 paddw m2, [t1+wq] 948 mov t1, t4m 949 paddd m1, m3 950 mova m4, [t0+wq] 951 mov t0, t0m 952%endif 953 punpckhwd m3, m2, m4 954 pmaddwd m3, m14 955 punpcklwd m2, m4 956%if ARCH_X86_64 957 mova m4, [t4+wq] 958%else 959 mova m4, [t1+wq] 960%endif 961 psrad m0, 4 962 psrad m1, 4 963 packssdw m0, m1 964 pmaddwd m2, m14 965 psraw m0, 1 966 mova [t0+wq], m0 967 punpckhwd m1, m0, m4 968 pmaddwd m1, m13 969 punpcklwd m0, m4 970 pmaddwd m0, m13 971 paddd m3, m9 972 paddd m2, m9 973 paddd m1, m3 974 paddd m0, m2 975 psrad m1, 6 976 psrad m0, 6 977 packssdw m0, m1 978 pmulhw m0, m10 979 pxor m1, m1 980 pmaxsw m0, m1 981 mova [dstq+wq], m0 982 add wq, 16 983 jl .hv_loop 984%if ARCH_X86_64 985 mov t4, t3 986 mov t3, t2 987 mov t2, t1 988 mov t1, t0 989 mov t0, t4 990%else 991 mov r4, t3m 992 mov t1, t2m 993 mov t4m, r4 994 mov t3m, t1 995 mov r4, t1m 996 mov t1, t0 997 mov t2m, r4 998 mov t0, t4m 999 mov wq, wm 1000%endif 1001 add dstq, strideq 1002 ret 1003.v: 1004 movif64 wq, r4 1005 movif32 t1m, t1 1006.v_loop: 1007%if ARCH_X86_64 1008 mova m0, [t1+wq] 1009 paddw m2, m0, [t3+wq] 1010 mova m1, [t2+wq] 1011 mova m4, [t4+wq] 1012%else 1013 mov t0, t3m 1014 mova m0, [t1+wq] 1015 mov t1, t2m 1016 paddw m2, m0, [t0+wq] 1017 mov t0, t4m 1018 mova m1, [t1+wq] 1019 mova m4, [t0+wq] 1020%endif 1021 punpckhwd m3, m2, m1 1022 pmaddwd m3, m14 1023 punpcklwd m2, m1 1024 pmaddwd m2, m14 1025 punpckhwd m1, m0, m4 1026 pmaddwd m1, m13 1027 punpcklwd m0, m4 1028 pmaddwd m0, m13 1029 paddd m3, m9 1030 paddd m2, m9 1031 paddd m1, m3 1032 paddd m0, m2 1033 psrad m1, 6 1034 psrad m0, 6 1035 packssdw m0, m1 1036 pmulhw m0, m10 1037 pxor m1, m1 1038 pmaxsw m0, m1 1039 mova [dstq+wq], m0 1040 add wq, 16 1041%if ARCH_X86_64 1042 jl .v_loop 1043%else 1044 jge .v_end 1045 mov t1, t1m 1046 jmp .v_loop 1047.v_end: 1048%endif 1049 ret 1050 1051%macro GATHERDD 3 ; dst, src, tmp 1052 movd %3d, %2 1053 %if ARCH_X86_64 1054 movd %1, [r13+%3] 1055 pextrw %3d, %2, 2 1056 pinsrw %1, [r13+%3+2], 3 1057 pextrw %3d, %2, 4 1058 pinsrw %1, [r13+%3+2], 5 1059 pextrw %3d, %2, 6 1060 pinsrw %1, [r13+%3+2], 7 1061 %else 1062 movd %1, [base+sgr_x_by_x-0xf03+%3] 1063 pextrw %3, %2, 2 1064 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 1065 pextrw %3, %2, 4 1066 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 1067 pextrw %3, %2, 6 1068 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 1069 %endif 1070%endmacro 1071 1072%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore 1073 %if ARCH_X86_64 1074 %define tmp r14 1075 %else 1076 %define tmp %4 1077 %endif 1078 GATHERDD %1, %2, tmp 1079 GATHERDD %2, %3, tmp 1080 movif32 %4, %5 1081 psrld %1, 24 1082 psrld %2, 24 1083 packssdw %1, %2 1084%endmacro 1085 1086%macro MAXSD 3-4 0 ; dst, src, restore_tmp 1087 pcmpgtd %3, %1, %2 1088 pand %1, %3 1089 pandn %3, %2 1090 por %1, %3 1091 %if %4 == 1 1092 pxor %3, %3 1093 %endif 1094%endmacro 1095 1096%macro MULLD 3 ; dst, src, tmp 1097 pmulhuw %3, %1, %2 1098 pmullw %1, %2 1099 pslld %3, 16 1100 paddd %1, %3 1101%endmacro 1102 1103%if ARCH_X86_32 1104DECLARE_REG_TMP 0, 1, 2, 3, 5 1105 %if STACK_ALIGNMENT < 16 1106 %assign extra_stack 5*16 1107 %else 1108 %assign extra_stack 3*16 1109 %endif 1110cglobal sgr_filter_5x5_16bpc, 1, 7, 8, 
-400*24-16-extra_stack, \ 1111 dst, stride, left, lpf, w 1112 %if STACK_ALIGNMENT < 16 1113 %define dstm dword [esp+calloff+16*0+4*6] 1114 %define stridemp dword [esp+calloff+16*0+4*7] 1115 %define leftm dword [esp+calloff+16*3+4*0] 1116 %define lpfm dword [esp+calloff+16*3+4*1] 1117 %define w0m dword [esp+calloff+16*3+4*2] 1118 %define hd dword [esp+calloff+16*3+4*3] 1119 %define edgeb byte [esp+calloff+16*3+4*4] 1120 %define edged dword [esp+calloff+16*3+4*4] 1121 %define leftmp leftm 1122 %else 1123 %define w0m wm 1124 %define hd dword r5m 1125 %define edgeb byte r7m 1126 %define edged dword r7m 1127 %endif 1128 %define hvsrcm dword [esp+calloff+4*0] 1129 %define w1m dword [esp+calloff+4*1] 1130 %define t0m dword [esp+calloff+4*2] 1131 %define t2m dword [esp+calloff+4*3] 1132 %define t3m dword [esp+calloff+4*4] 1133 %define t4m dword [esp+calloff+4*5] 1134 %define m8 [base+pd_8] 1135 %define m9 [base+pd_0xfffffff0] 1136 %define m10 [esp+calloff+16*2] 1137 %define m11 [base+pd_0xf00800a4] 1138 %define m12 [base+sgr_lshuf5] 1139 %define m13 [base+pd_34816] 1140 %define m14 [base+pw_1023] 1141 %define r10 r4 1142 %define base r6-$$ 1143 %assign calloff 0 1144 %if STACK_ALIGNMENT < 16 1145 mov strideq, [rstk+stack_offset+ 8] 1146 mov leftq, [rstk+stack_offset+12] 1147 mov lpfq, [rstk+stack_offset+16] 1148 mov wd, [rstk+stack_offset+20] 1149 mov dstm, dstq 1150 mov stridemp, strideq 1151 mov leftm, leftq 1152 mov r1, [rstk+stack_offset+24] 1153 mov r2, [rstk+stack_offset+32] 1154 mov lpfm, lpfq 1155 mov hd, r1 1156 mov edged, r2 1157 %endif 1158%else 1159cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \ 1160 w, h, edge, params 1161%endif 1162%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1163 movifnidn wd, wm 1164%endif 1165%if ARCH_X86_64 1166 mov paramsq, r6mp 1167 lea r13, [sgr_x_by_x-0xf03] 1168 movifnidn hd, hm 1169 add wd, wd 1170 mov edged, r7m 1171 movu m10, [paramsq] 1172 mova m12, [sgr_lshuf5] 1173 add lpfq, wq 1174 mova m8, [pd_8] 1175 lea t1, [rsp+wq+20] 1176 mova m9, [pd_0xfffffff0] 1177 add dstq, wq 1178 lea t3, [rsp+wq*2+400*12+16] 1179 mova m11, [pd_0xf00800a4] 1180 lea t4, [rsp+wq+400*20+16] 1181 pshufhw m7, m10, q0000 1182 pshufb m10, [pw_256] ; s0 1183 punpckhqdq m7, m7 ; w0 1184 neg wq 1185 mova m13, [pd_34816] ; (1 << 11) + (1 << 15) 1186 pxor m6, m6 1187 mova m14, [pw_1023] 1188 psllw m7, 4 1189 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1190 %define lpfm [rsp] 1191%else 1192 mov r1, [rstk+stack_offset+28] ; params 1193 LEA r6, $$ 1194 add wd, wd 1195 movu m1, [r1] 1196 add lpfm, wq 1197 lea t1, [rsp+extra_stack+wq+20] 1198 add dstq, wq 1199 lea t3, [rsp+extra_stack+wq*2+400*12+16] 1200 mov dstm, dstq 1201 lea t4, [rsp+extra_stack+wq+400*20+16] 1202 mov t3m, t3 1203 pshufhw m7, m1, q0000 1204 mov t4m, t4 1205 pshufb m1, [base+pw_256] ; s0 1206 punpckhqdq m7, m7 ; w0 1207 psllw m7, 4 1208 neg wq 1209 mova m10, m1 1210 pxor m6, m6 1211 mov w1m, wd 1212 sub wd, 4 1213 mov lpfq, lpfm 1214 mov w0m, wd 1215 %define strideq r5 1216%endif 1217 test edgeb, 4 ; LR_HAVE_TOP 1218 jz .no_top 1219 call .h_top 1220 add lpfq, stridemp 1221 movif32 t2m, t1 1222 mov t2, t1 1223 call .top_fixup 1224 add t1, 400*6 1225 call .h_top 1226 movif32 strideq, stridemp 1227 lea r10, [lpfq+strideq*4] 1228 mov lpfq, dstq 1229 add r10, strideq 1230 mov lpfm, r10 ; below 1231 movif32 t0m, t2 1232 mov t0, t2 1233 dec hd 1234 jz .height1 1235 or edged, 16 1236 call .h 1237.main: 1238 add lpfq, stridemp 1239 movif32 t4, t4m 1240 call .hv 1241 call .prep_n 1242 sub 
hd, 2 1243 jl .extend_bottom 1244.main_loop: 1245 movif32 lpfq, hvsrcm 1246 add lpfq, stridemp 1247%if ARCH_X86_64 1248 test hb, hb 1249%else 1250 mov r4, hd 1251 test r4, r4 1252%endif 1253 jz .odd_height 1254 call .h 1255 add lpfq, stridemp 1256 call .hv 1257 movif32 dstq, dstm 1258 call .n0 1259 call .n1 1260 sub hd, 2 1261 movif32 t0, t0m 1262 jge .main_loop 1263 test edgeb, 8 ; LR_HAVE_BOTTOM 1264 jz .extend_bottom 1265 mov lpfq, lpfm 1266 call .h_top 1267 add lpfq, stridemp 1268 call .hv_bottom 1269.end: 1270 movif32 dstq, dstm 1271 call .n0 1272 call .n1 1273.end2: 1274 RET 1275.height1: 1276 movif32 t4, t4m 1277 call .hv 1278 call .prep_n 1279 jmp .odd_height_end 1280.odd_height: 1281 call .hv 1282 movif32 dstq, dstm 1283 call .n0 1284 call .n1 1285.odd_height_end: 1286 call .v 1287 movif32 dstq, dstm 1288 call .n0 1289 jmp .end2 1290.extend_bottom: 1291 call .v 1292 jmp .end 1293.no_top: 1294 movif32 strideq, stridemp 1295 lea r10, [lpfq+strideq*4] 1296 mov lpfq, dstq 1297 lea r10, [r10+strideq*2] 1298 mov lpfm, r10 1299 call .h 1300 lea t2, [t1+400*6] 1301 movif32 t2m, t2 1302 call .top_fixup 1303 dec hd 1304 jz .no_top_height1 1305 or edged, 16 1306 mov t0, t1 1307 mov t1, t2 1308 movif32 t0m, t0 1309 jmp .main 1310.no_top_height1: 1311 movif32 t3, t3m 1312 movif32 t4, t4m 1313 call .v 1314 call .prep_n 1315 jmp .odd_height_end 1316.extend_right: 1317 movd m0, wd 1318 movd m1, [lpfq-2] 1319 mova m2, [base+pw_256] 1320 mova m3, [base+pb_m14_m13] 1321 pshufb m0, m6 1322 pshufb m1, m2 1323 psubb m2, m0 1324 psubb m3, m0 1325 mova m0, [base+pb_0to15] 1326 pcmpgtb m2, m0 1327 pcmpgtb m3, m0 1328 pand m4, m2 1329 pand m5, m3 1330 pandn m2, m1 1331 pandn m3, m1 1332 por m4, m2 1333 por m5, m3 1334 ret 1335%assign stack_offset stack_offset+4 1336%assign calloff 4 1337.h: ; horizontal boxsum 1338%if ARCH_X86_64 1339 lea wq, [r4-4] 1340%else 1341 %define leftq r4 1342%endif 1343 test edgeb, 1 ; LR_HAVE_LEFT 1344 jz .h_extend_left 1345 movif32 leftq, leftm 1346 movddup m5, [leftq] 1347 movif32 wq, w0m 1348 mova m4, [lpfq+wq+4] 1349 add leftmp, 8 1350 palignr m4, m5, 10 1351 jmp .h_main 1352.h_extend_left: 1353 movif32 wq, w0m 1354 mova m4, [lpfq+wq+4] 1355 pshufb m4, m12 1356 jmp .h_main 1357.h_top: 1358%if ARCH_X86_64 1359 lea wq, [r4-4] 1360%endif 1361 test edgeb, 1 ; LR_HAVE_LEFT 1362 jz .h_extend_left 1363 movif32 wq, w0m 1364.h_loop: 1365 movu m4, [lpfq+wq- 2] 1366.h_main: 1367 movu m5, [lpfq+wq+14] 1368 test edgeb, 2 ; LR_HAVE_RIGHT 1369 jnz .h_have_right 1370 cmp wd, -20 1371 jl .h_have_right 1372 call .extend_right 1373.h_have_right: 1374 palignr m2, m5, m4, 2 1375 paddw m0, m4, m2 1376 palignr m3, m5, m4, 6 1377 paddw m0, m3 1378 punpcklwd m1, m2, m3 1379 pmaddwd m1, m1 1380 punpckhwd m2, m3 1381 pmaddwd m2, m2 1382 palignr m5, m4, 8 1383 paddw m0, m5 1384 punpcklwd m3, m4, m5 1385 pmaddwd m3, m3 1386 paddd m1, m3 1387 punpckhwd m3, m4, m5 1388 pmaddwd m3, m3 1389 shufps m4, m5, q2121 1390 paddw m0, m4 ; sum 1391 punpcklwd m5, m4, m6 1392 pmaddwd m5, m5 1393 punpckhwd m4, m6 1394 pmaddwd m4, m4 1395 paddd m2, m3 1396 test edgeb, 16 ; y > 0 1397 jz .h_loop_end 1398 paddw m0, [t1+wq+400*0] 1399 paddd m1, [t1+wq+400*2] 1400 paddd m2, [t1+wq+400*4] 1401.h_loop_end: 1402 paddd m1, m5 ; sumsq 1403 paddd m2, m4 1404 mova [t1+wq+400*0], m0 1405 mova [t1+wq+400*2], m1 1406 mova [t1+wq+400*4], m2 1407 add wq, 16 1408 jl .h_loop 1409 ret 1410.top_fixup: 1411%if ARCH_X86_64 1412 lea wq, [r4-4] 1413%else 1414 mov wd, w0m 1415%endif 1416.top_fixup_loop: ; the sums of the first row needs to be 
doubled 1417 mova m0, [t1+wq+400*0] 1418 mova m1, [t1+wq+400*2] 1419 mova m2, [t1+wq+400*4] 1420 paddw m0, m0 1421 paddd m1, m1 1422 paddd m2, m2 1423 mova [t2+wq+400*0], m0 1424 mova [t2+wq+400*2], m1 1425 mova [t2+wq+400*4], m2 1426 add wq, 16 1427 jl .top_fixup_loop 1428 ret 1429ALIGN function_align 1430.hv: ; horizontal boxsum + vertical boxsum + ab 1431%if ARCH_X86_64 1432 lea wq, [r4-4] 1433%else 1434 mov hvsrcm, lpfq 1435%endif 1436 test edgeb, 1 ; LR_HAVE_LEFT 1437 jz .hv_extend_left 1438 movif32 leftq, leftm 1439 movddup m5, [leftq] 1440 movif32 wq, w0m 1441 mova m4, [lpfq+wq+4] 1442 add leftmp, 8 1443 palignr m4, m5, 10 1444 jmp .hv_main 1445.hv_extend_left: 1446 movif32 wq, w0m 1447 mova m4, [lpfq+wq+4] 1448 pshufb m4, m12 1449 jmp .hv_main 1450.hv_bottom: 1451%if ARCH_X86_64 1452 lea wq, [r4-4] 1453%else 1454 mov hvsrcm, lpfq 1455%endif 1456 test edgeb, 1 ; LR_HAVE_LEFT 1457 jz .hv_extend_left 1458 movif32 wq, w0m 1459%if ARCH_X86_32 1460 jmp .hv_loop_start 1461%endif 1462.hv_loop: 1463 movif32 lpfq, hvsrcm 1464.hv_loop_start: 1465 movu m4, [lpfq+wq- 2] 1466.hv_main: 1467 movu m5, [lpfq+wq+14] 1468 test edgeb, 2 ; LR_HAVE_RIGHT 1469 jnz .hv_have_right 1470 cmp wd, -20 1471 jl .hv_have_right 1472 call .extend_right 1473.hv_have_right: 1474 movif32 t3, hd 1475 palignr m3, m5, m4, 2 1476 paddw m0, m4, m3 1477 palignr m1, m5, m4, 6 1478 paddw m0, m1 1479 punpcklwd m2, m3, m1 1480 pmaddwd m2, m2 1481 punpckhwd m3, m1 1482 pmaddwd m3, m3 1483 palignr m5, m4, 8 1484 paddw m0, m5 1485 punpcklwd m1, m4, m5 1486 pmaddwd m1, m1 1487 paddd m2, m1 1488 punpckhwd m1, m4, m5 1489 pmaddwd m1, m1 1490 shufps m4, m5, q2121 1491 paddw m0, m4 ; h sum 1492 punpcklwd m5, m4, m6 1493 pmaddwd m5, m5 1494 punpckhwd m4, m6 1495 pmaddwd m4, m4 1496 paddd m3, m1 1497 paddd m2, m5 ; h sumsq 1498 paddd m3, m4 1499 paddw m1, m0, [t1+wq+400*0] 1500 paddd m4, m2, [t1+wq+400*2] 1501 paddd m5, m3, [t1+wq+400*4] 1502%if ARCH_X86_64 1503 test hd, hd 1504%else 1505 test t3, t3 1506%endif 1507 jz .hv_last_row 1508.hv_main2: 1509 paddw m1, [t2+wq+400*0] ; hv sum 1510 paddd m4, [t2+wq+400*2] ; hv sumsq 1511 paddd m5, [t2+wq+400*4] 1512 mova [t0+wq+400*0], m0 1513 mova [t0+wq+400*2], m2 1514 mova [t0+wq+400*4], m3 1515 psrlw m3, m1, 1 1516 paddd m4, m8 1517 pavgw m3, m6 ; (b + 2) >> 2 1518 paddd m5, m8 1519 pand m4, m9 ; ((a + 8) >> 4) << 4 1520 pand m5, m9 1521 psrld m2, m4, 4 1522 psrld m0, m5, 4 1523 paddd m2, m4 1524 psrld m4, 1 1525 paddd m0, m5 1526 psrld m5, 1 1527 paddd m4, m2 ; a * 25 1528 paddd m5, m0 1529 punpcklwd m2, m3, m6 1530 punpckhwd m3, m6 1531 pmaddwd m2, m2 ; b * b 1532 pmaddwd m3, m3 1533 punpcklwd m0, m1, m6 ; b 1534 punpckhwd m1, m6 1535 MAXSD m4, m2, m6 1536 MAXSD m5, m3, m6, 1 1537 psubd m4, m2 ; p 1538 psubd m5, m3 1539 MULLD m4, m10, m2 ; p * s 1540 MULLD m5, m10, m2 1541 pmaddwd m0, m11 ; b * 164 1542 pmaddwd m1, m11 1543 paddusw m4, m11 1544 paddusw m5, m11 1545 psrld m4, 20 ; min(z, 255) 1546 movif32 t3, t3m 1547 psrld m5, 20 1548 GATHER_X_BY_X m3, m4, m5, t2, t2m 1549 punpcklwd m4, m3, m3 1550 punpckhwd m5, m3, m3 1551 MULLD m0, m4, m2 1552 MULLD m1, m5, m2 1553 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 1554 paddd m1, m13 1555 mova [t4+wq+4], m3 1556 psrld m0, 12 ; b 1557 psrld m1, 12 1558 mova [t3+wq*2+ 8], m0 1559 mova [t3+wq*2+24], m1 1560 add wq, 16 1561 jl .hv_loop 1562 mov t2, t1 1563 mov t1, t0 1564 mov t0, t2 1565 movif32 t2m, t2 1566 movif32 t0m, t0 1567 ret 1568.hv_last_row: ; esoteric edge case for odd heights 1569 mova [t1+wq+400*0], m1 1570 paddw m1, m0 1571 mova 
[t1+wq+400*2], m4 1572 paddd m4, m2 1573 mova [t1+wq+400*4], m5 1574 paddd m5, m3 1575 jmp .hv_main2 1576.v: ; vertical boxsum + ab 1577%if ARCH_X86_64 1578 lea wq, [r4-4] 1579%else 1580 mov wd, w0m 1581%endif 1582.v_loop: 1583 mova m0, [t1+wq+400*0] 1584 mova m2, [t1+wq+400*2] 1585 mova m3, [t1+wq+400*4] 1586 paddw m1, m0, [t2+wq+400*0] 1587 paddd m4, m2, [t2+wq+400*2] 1588 paddd m5, m3, [t2+wq+400*4] 1589 paddw m0, m0 1590 paddd m2, m2 1591 paddd m3, m3 1592 paddw m1, m0 ; hv sum 1593 paddd m4, m2 ; hv sumsq 1594 paddd m5, m3 1595 psrlw m3, m1, 1 1596 paddd m4, m8 1597 pavgw m3, m6 ; (b + 2) >> 2 1598 paddd m5, m8 1599 pand m4, m9 ; ((a + 8) >> 4) << 4 1600 pand m5, m9 1601 psrld m2, m4, 4 1602 psrld m0, m5, 4 1603 paddd m2, m4 1604 psrld m4, 1 1605 paddd m0, m5 1606 psrld m5, 1 1607 paddd m4, m2 ; a * 25 1608 paddd m5, m0 1609 punpcklwd m2, m3, m6 1610 punpckhwd m3, m6 1611 pmaddwd m2, m2 ; b * b 1612 pmaddwd m3, m3 1613 punpcklwd m0, m1, m6 ; b 1614 punpckhwd m1, m6 1615 MAXSD m4, m2, m6 1616 MAXSD m5, m3, m6, 1 1617 psubd m4, m2 ; p 1618 psubd m5, m3 1619 MULLD m4, m10, m2 ; p * s 1620 MULLD m5, m10, m2 1621 pmaddwd m0, m11 ; b * 164 1622 pmaddwd m1, m11 1623 paddusw m4, m11 1624 paddusw m5, m11 1625 psrld m4, 20 ; min(z, 255) 1626 psrld m5, 20 1627 GATHER_X_BY_X m3, m4, m5, t2, t2m 1628 punpcklwd m4, m3, m3 1629 punpckhwd m5, m3, m3 1630 MULLD m0, m4, m2 1631 MULLD m1, m5, m2 1632 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 1633 paddd m1, m13 1634 mova [t4+wq+4], m3 1635 psrld m0, 12 ; b 1636 psrld m1, 12 1637 mova [t3+wq*2+ 8], m0 1638 mova [t3+wq*2+24], m1 1639 add wq, 16 1640 jl .v_loop 1641 ret 1642.prep_n: ; initial neighbor setup 1643 movif64 wq, r4 1644 movif32 wd, w1m 1645.prep_n_loop: 1646 movu m0, [t4+wq*1+ 2] 1647 movu m3, [t4+wq*1+ 4] 1648 movu m1, [t3+wq*2+ 4] 1649 movu m4, [t3+wq*2+ 8] 1650 movu m2, [t3+wq*2+20] 1651 movu m5, [t3+wq*2+24] 1652 paddw m3, m0 1653 paddd m4, m1 1654 paddd m5, m2 1655 paddw m3, [t4+wq*1+ 0] 1656 paddd m4, [t3+wq*2+ 0] 1657 paddd m5, [t3+wq*2+16] 1658 paddw m0, m3 1659 psllw m3, 2 1660 paddd m1, m4 1661 pslld m4, 2 1662 paddd m2, m5 1663 pslld m5, 2 1664 paddw m0, m3 ; a 565 1665 paddd m1, m4 ; b 565 1666 paddd m2, m5 1667 mova [t4+wq*1+400*2+ 0], m0 1668 mova [t3+wq*2+400*4+ 0], m1 1669 mova [t3+wq*2+400*4+16], m2 1670 add wq, 16 1671 jl .prep_n_loop 1672 ret 1673ALIGN function_align 1674.n0: ; neighbor + output (even rows) 1675 movif64 wq, r4 1676 movif32 wd, w1m 1677.n0_loop: 1678 movu m0, [t4+wq*1+ 2] 1679 movu m3, [t4+wq*1+ 4] 1680 movu m1, [t3+wq*2+ 4] 1681 movu m4, [t3+wq*2+ 8] 1682 movu m2, [t3+wq*2+20] 1683 movu m5, [t3+wq*2+24] 1684 paddw m3, m0 1685 paddd m4, m1 1686 paddd m5, m2 1687 paddw m3, [t4+wq*1+ 0] 1688 paddd m4, [t3+wq*2+ 0] 1689 paddd m5, [t3+wq*2+16] 1690 paddw m0, m3 1691 psllw m3, 2 1692 paddd m1, m4 1693 pslld m4, 2 1694 paddd m2, m5 1695 pslld m5, 2 1696 paddw m0, m3 ; a 565 1697 paddd m1, m4 ; b 565 1698 paddd m2, m5 1699 paddw m3, m0, [t4+wq*1+400*2+ 0] 1700 paddd m4, m1, [t3+wq*2+400*4+ 0] 1701 paddd m5, m2, [t3+wq*2+400*4+16] 1702 mova [t4+wq*1+400*2+ 0], m0 1703 mova [t3+wq*2+400*4+ 0], m1 1704 mova [t3+wq*2+400*4+16], m2 1705 mova m0, [dstq+wq] 1706 punpcklwd m1, m0, m6 ; src 1707 punpcklwd m2, m3, m6 ; a 1708 pmaddwd m2, m1 ; a * src 1709 punpckhwd m1, m0, m6 1710 punpckhwd m3, m6 1711 pmaddwd m3, m1 1712 psubd m4, m2 ; b - a * src + (1 << 8) 1713 psubd m5, m3 1714 psrad m4, 9 1715 psrad m5, 9 1716 packssdw m4, m5 1717 pmulhrsw m4, m7 1718 paddw m0, m4 1719 pmaxsw m0, m6 1720 pminsw m0, m14 1721 mova 
[dstq+wq], m0 1722 add wq, 16 1723 jl .n0_loop 1724 add dstq, stridemp 1725 ret 1726ALIGN function_align 1727.n1: ; neighbor + output (odd rows) 1728 movif64 wq, r4 1729 movif32 wd, w1m 1730.n1_loop: 1731 mova m0, [dstq+wq] 1732 mova m3, [t4+wq*1+400*2+ 0] 1733 mova m4, [t3+wq*2+400*4+ 0] 1734 mova m5, [t3+wq*2+400*4+16] 1735 punpcklwd m1, m0, m6 ; src 1736 punpcklwd m2, m3, m6 ; a 1737 pmaddwd m2, m1 1738 punpckhwd m1, m0, m6 1739 punpckhwd m3, m6 1740 pmaddwd m3, m1 1741 psubd m4, m2 ; b - a * src + (1 << 7) 1742 psubd m5, m3 1743 psrad m4, 8 1744 psrad m5, 8 1745 packssdw m4, m5 1746 pmulhrsw m4, m7 1747 paddw m0, m4 1748 pmaxsw m0, m6 1749 pminsw m0, m14 1750 mova [dstq+wq], m0 1751 add wq, 16 1752 jl .n1_loop 1753 add dstq, stridemp 1754 movif32 dstm, dstq 1755 ret 1756 1757%if ARCH_X86_32 1758 %if STACK_ALIGNMENT < 16 1759 %assign extra_stack 4*16 1760 %else 1761 %assign extra_stack 2*16 1762 %endif 1763cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ 1764 dst, stride, left, lpf, w 1765 %if STACK_ALIGNMENT < 16 1766 %define dstm dword [esp+calloff+16*2+4*0] 1767 %define stridemp dword [esp+calloff+16*2+4*1] 1768 %define leftm dword [esp+calloff+16*2+4*2] 1769 %define lpfm dword [esp+calloff+16*2+4*3] 1770 %define w0m dword [esp+calloff+16*2+4*4] 1771 %define hd dword [esp+calloff+16*2+4*5] 1772 %define edgeb byte [esp+calloff+16*2+4*6] 1773 %define edged dword [esp+calloff+16*2+4*6] 1774 %define leftmp leftm 1775 %else 1776 %define w0m wm 1777 %define hd dword r5m 1778 %define edgeb byte r7m 1779 %define edged dword r7m 1780 %endif 1781 %define hvsrcm dword [esp+calloff+4*0] 1782 %define w1m dword [esp+calloff+4*1] 1783 %define t3m dword [esp+calloff+4*2] 1784 %define t4m dword [esp+calloff+4*3] 1785 %define m8 [base+pd_8] 1786 %define m9 [esp+calloff+16*1] 1787 %define m10 [base+pd_0xf00801c7] 1788 %define m11 [base+pd_34816] 1789 %define m12 [base+sgr_lshuf3] 1790 %define m13 [base+pw_1023] 1791 %define m14 m6 1792 %define base r6-$$ 1793 %assign calloff 0 1794 %if STACK_ALIGNMENT < 16 1795 mov strideq, [rstk+stack_offset+ 8] 1796 mov leftq, [rstk+stack_offset+12] 1797 mov lpfq, [rstk+stack_offset+16] 1798 mov wd, [rstk+stack_offset+20] 1799 mov dstm, dstq 1800 mov stridemp, strideq 1801 mov leftm, leftq 1802 mov r1, [rstk+stack_offset+24] 1803 mov r2, [rstk+stack_offset+32] 1804 mov lpfm, lpfq 1805 mov hd, r1 1806 mov edged, r2 1807 %endif 1808%else 1809cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \ 1810 w, h, edge, params 1811%endif 1812%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1813 movifnidn wd, wm 1814%endif 1815%if ARCH_X86_64 1816 mov paramsq, r6mp 1817 lea r13, [sgr_x_by_x-0xf03] 1818 movifnidn hd, hm 1819 add wd, wd 1820 mov edged, r7m 1821 movq m9, [paramsq+4] 1822 add lpfq, wq 1823 lea t1, [rsp+wq+12] 1824 mova m8, [pd_8] 1825 add dstq, wq 1826 lea t3, [rsp+wq*2+400*12+8] 1827 mova m10, [pd_0xf00801c7] 1828 lea t4, [rsp+wq+400*32+8] 1829 mova m11, [pd_34816] 1830 pshuflw m7, m9, q3333 1831 pshufb m9, [pw_256] ; s1 1832 punpcklqdq m7, m7 ; w1 1833 neg wq 1834 pxor m6, m6 1835 mova m13, [pw_1023] 1836 psllw m7, 4 1837 mova m12, [sgr_lshuf3] 1838 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1839 %define lpfm [rsp] 1840%else 1841 mov r1, [rstk+stack_offset+28] ; params 1842 LEA r6, $$ 1843 add wd, wd 1844 movq m1, [r1+4] 1845 add lpfm, wq 1846 lea t1, [rsp+extra_stack+wq+20] 1847 add dstq, wq 1848 lea t3, [rsp+extra_stack+wq*2+400*12+16] 1849 mov dstm, dstq 1850 lea t4, [rsp+extra_stack+wq+400*32+16] 1851 mov t3m, t3 
1852 pshuflw m7, m1, q3333 1853 mov t4m, t4 1854 pshufb m1, [base+pw_256] ; s1 1855 punpcklqdq m7, m7 ; w1 1856 psllw m7, 4 1857 neg wq 1858 mova m9, m1 1859 pxor m6, m6 1860 mov w1m, wd 1861 sub wd, 4 1862 mov lpfq, lpfm 1863 mov w0m, wd 1864 %define strideq r5 1865%endif 1866 test edgeb, 4 ; LR_HAVE_TOP 1867 jz .no_top 1868 call .h_top 1869 add lpfq, stridemp 1870 mov t2, t1 1871 add t1, 400*6 1872 call .h_top 1873 movif32 strideq, stridemp 1874 lea r10, [lpfq+strideq*4] 1875 mov lpfq, dstq 1876 add r10, strideq 1877 mov lpfm, r10 ; below 1878 movif32 t4, t4m 1879 call .hv0 1880.main: 1881 dec hd 1882 jz .height1 1883 movif32 lpfq, hvsrcm 1884 add lpfq, stridemp 1885 call .hv1 1886 call .prep_n 1887 sub hd, 2 1888 jl .extend_bottom 1889.main_loop: 1890 movif32 lpfq, hvsrcm 1891 add lpfq, stridemp 1892 call .hv0 1893%if ARCH_X86_64 1894 test hb, hb 1895%else 1896 mov r4, hd 1897 test r4, r4 1898%endif 1899 jz .odd_height 1900 movif32 lpfq, hvsrcm 1901 add lpfq, stridemp 1902 call .hv1 1903 call .n0 1904 call .n1 1905 sub hd, 2 1906 jge .main_loop 1907 test edgeb, 8 ; LR_HAVE_BOTTOM 1908 jz .extend_bottom 1909 mov lpfq, lpfm 1910 call .hv0_bottom 1911 movif32 lpfq, hvsrcm 1912 add lpfq, stridemp 1913 call .hv1_bottom 1914.end: 1915 call .n0 1916 call .n1 1917.end2: 1918 RET 1919.height1: 1920 call .v1 1921 call .prep_n 1922 jmp .odd_height_end 1923.odd_height: 1924 call .v1 1925 call .n0 1926 call .n1 1927.odd_height_end: 1928 call .v0 1929 call .v1 1930 call .n0 1931 jmp .end2 1932.extend_bottom: 1933 call .v0 1934 call .v1 1935 jmp .end 1936.no_top: 1937 movif32 strideq, stridemp 1938 lea r10, [lpfq+strideq*4] 1939 mov lpfq, dstq 1940 lea r10, [r10+strideq*2] 1941 mov lpfm, r10 1942 call .h 1943%if ARCH_X86_64 1944 lea wq, [r4-4] 1945%else 1946 mov wq, w0m 1947 mov hvsrcm, lpfq 1948%endif 1949 lea t2, [t1+400*6] 1950.top_fixup_loop: 1951 mova m0, [t1+wq+400*0] 1952 mova m1, [t1+wq+400*2] 1953 mova m2, [t1+wq+400*4] 1954 mova [t2+wq+400*0], m0 1955 mova [t2+wq+400*2], m1 1956 mova [t2+wq+400*4], m2 1957 add wq, 16 1958 jl .top_fixup_loop 1959 movif32 t3, t3m 1960 movif32 t4, t4m 1961 call .v0 1962 jmp .main 1963.extend_right: 1964 movd m1, wd 1965 movd m5, [lpfq-2] 1966 mova m2, [base+pw_256] 1967 mova m3, [base+pb_0to15] 1968 pshufb m1, m6 1969 pshufb m5, m2 1970 psubb m2, m1 1971 pcmpgtb m2, m3 1972 pand m4, m2 1973 pandn m2, m5 1974 por m4, m2 1975 ret 1976%assign stack_offset stack_offset+4 1977%assign calloff 4 1978.h: ; horizontal boxsum 1979%if ARCH_X86_64 1980 lea wq, [r4-4] 1981%else 1982 %define leftq r4 1983%endif 1984 test edgeb, 1 ; LR_HAVE_LEFT 1985 jz .h_extend_left 1986 movif32 leftq, leftm 1987 movddup m5, [leftq] 1988 movif32 wq, w0m 1989 mova m4, [lpfq+wq+4] 1990 add leftmp, 8 1991 palignr m4, m5, 12 1992 jmp .h_main 1993.h_extend_left: 1994 movif32 wq, w0m 1995 mova m4, [lpfq+wq+4] 1996 pshufb m4, m12 1997 jmp .h_main 1998.h_top: 1999%if ARCH_X86_64 2000 lea wq, [r4-4] 2001%endif 2002 test edgeb, 1 ; LR_HAVE_LEFT 2003 jz .h_extend_left 2004 movif32 wq, w0m 2005.h_loop: 2006 movu m4, [lpfq+wq+ 0] 2007.h_main: 2008 movu m5, [lpfq+wq+16] 2009 test edgeb, 2 ; LR_HAVE_RIGHT 2010 jnz .h_have_right 2011 cmp wd, -18 2012 jl .h_have_right 2013 call .extend_right 2014.h_have_right: 2015 palignr m0, m5, m4, 2 2016 paddw m1, m4, m0 2017 punpcklwd m2, m4, m0 2018 pmaddwd m2, m2 2019 punpckhwd m3, m4, m0 2020 pmaddwd m3, m3 2021 palignr m5, m4, 4 2022 paddw m1, m5 ; sum 2023 punpcklwd m4, m5, m6 2024 pmaddwd m4, m4 2025 punpckhwd m5, m6 2026 pmaddwd m5, m5 2027 paddd m2, m4 ; sumsq 
2028 paddd m3, m5 2029 mova [t1+wq+400*0], m1 2030 mova [t1+wq+400*2], m2 2031 mova [t1+wq+400*4], m3 2032 add wq, 16 2033 jl .h_loop 2034 ret 2035ALIGN function_align 2036.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 2037%if ARCH_X86_64 2038 lea wq, [r4-4] 2039%else 2040 mov hvsrcm, lpfq 2041%endif 2042 test edgeb, 1 ; LR_HAVE_LEFT 2043 jz .hv0_extend_left 2044 movif32 leftq, leftm 2045 movddup m5, [leftq] 2046 movif32 wq, w0m 2047 mova m4, [lpfq+wq+4] 2048 add leftmp, 8 2049 palignr m4, m5, 12 2050 jmp .hv0_main 2051.hv0_extend_left: 2052 movif32 wq, w0m 2053 mova m4, [lpfq+wq+4] 2054 pshufb m4, m12 2055 jmp .hv0_main 2056.hv0_bottom: 2057%if ARCH_X86_64 2058 lea wq, [r4-4] 2059%else 2060 mov hvsrcm, lpfq 2061%endif 2062 test edgeb, 1 ; LR_HAVE_LEFT 2063 jz .hv0_extend_left 2064 movif32 wq, w0m 2065%if ARCH_X86_32 2066 jmp .hv0_loop_start 2067%endif 2068.hv0_loop: 2069 movif32 lpfq, hvsrcm 2070.hv0_loop_start: 2071 movu m4, [lpfq+wq+ 0] 2072.hv0_main: 2073 movu m5, [lpfq+wq+16] 2074 test edgeb, 2 ; LR_HAVE_RIGHT 2075 jnz .hv0_have_right 2076 cmp wd, -18 2077 jl .hv0_have_right 2078 call .extend_right 2079.hv0_have_right: 2080 palignr m0, m5, m4, 2 2081 paddw m1, m4, m0 2082 punpcklwd m2, m4, m0 2083 pmaddwd m2, m2 2084 punpckhwd m3, m4, m0 2085 pmaddwd m3, m3 2086 palignr m5, m4, 4 2087 paddw m1, m5 ; sum 2088 punpcklwd m4, m5, m6 2089 pmaddwd m4, m4 2090 punpckhwd m5, m6 2091 pmaddwd m5, m5 2092 paddd m2, m4 ; sumsq 2093 paddd m3, m5 2094 paddw m0, m1, [t1+wq+400*0] 2095 paddd m4, m2, [t1+wq+400*2] 2096 paddd m5, m3, [t1+wq+400*4] 2097 mova [t1+wq+400*0], m1 2098 mova [t1+wq+400*2], m2 2099 mova [t1+wq+400*4], m3 2100 paddw m1, m0, [t2+wq+400*0] 2101 paddd m2, m4, [t2+wq+400*2] 2102 paddd m3, m5, [t2+wq+400*4] 2103 mova [t2+wq+400*0], m0 2104 mova [t2+wq+400*2], m4 2105 mova [t2+wq+400*4], m5 2106 paddd m2, m8 2107 paddd m3, m8 2108 psrld m2, 4 ; (a + 8) >> 4 2109 psrld m3, 4 2110 pslld m4, m2, 3 2111 pslld m5, m3, 3 2112 paddd m4, m2 ; ((a + 8) >> 4) * 9 2113 paddd m5, m3 2114 psrlw m3, m1, 1 2115 pavgw m3, m6 ; (b + 2) >> 2 2116 punpcklwd m2, m3, m6 2117 pmaddwd m2, m2 2118 punpckhwd m3, m6 2119 pmaddwd m3, m3 2120 punpcklwd m0, m1, m6 ; b 2121 punpckhwd m1, m6 2122 MAXSD m4, m2, m14 2123 MAXSD m5, m3, m14 2124 psubd m4, m2 ; p 2125 psubd m5, m3 2126 MULLD m4, m9, m14 ; p * s 2127 MULLD m5, m9, m14 2128 pmaddwd m0, m10 ; b * 455 2129 pmaddwd m1, m10 2130 paddusw m4, m10 2131 paddusw m5, m10 2132 psrld m4, 20 ; min(z, 255) 2133 movif32 t3, t3m 2134 psrld m5, 20 2135 GATHER_X_BY_X m3, m4, m5, r0, dstm 2136 punpcklwd m4, m3, m3 2137 punpckhwd m5, m3, m3 2138 MULLD m0, m4, m14 2139 MULLD m1, m5, m14 2140%if ARCH_X86_32 2141 pxor m6, m6 2142%endif 2143 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2144 paddd m1, m11 2145 mova [t4+wq+4], m3 2146 psrld m0, 12 2147 psrld m1, 12 2148 mova [t3+wq*2+ 8], m0 2149 mova [t3+wq*2+24], m1 2150 add wq, 16 2151 jl .hv0_loop 2152 ret 2153ALIGN function_align 2154.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 2155%if ARCH_X86_64 2156 lea wq, [r4-4] 2157%else 2158 mov hvsrcm, lpfq 2159%endif 2160 test edgeb, 1 ; LR_HAVE_LEFT 2161 jz .hv1_extend_left 2162 movif32 leftq, leftm 2163 movddup m5, [leftq] 2164 movif32 wq, w0m 2165 mova m4, [lpfq+wq+4] 2166 add leftmp, 8 2167 palignr m4, m5, 12 2168 jmp .hv1_main 2169.hv1_extend_left: 2170 movif32 wq, w0m 2171 mova m4, [lpfq+wq+4] 2172 pshufb m4, m12 2173 jmp .hv1_main 2174.hv1_bottom: 2175%if ARCH_X86_64 2176 lea wq, [r4-4] 2177%else 2178 mov hvsrcm, lpfq 2179%endif 2180 test 
edgeb, 1 ; LR_HAVE_LEFT 2181 jz .hv1_extend_left 2182 movif32 wq, w0m 2183%if ARCH_X86_32 2184 jmp .hv1_loop_start 2185%endif 2186.hv1_loop: 2187 movif32 lpfq, hvsrcm 2188.hv1_loop_start: 2189 movu m4, [lpfq+wq+ 0] 2190.hv1_main: 2191 movu m5, [lpfq+wq+16] 2192 test edgeb, 2 ; LR_HAVE_RIGHT 2193 jnz .hv1_have_right 2194 cmp wd, -18 2195 jl .hv1_have_right 2196 call .extend_right 2197.hv1_have_right: 2198 palignr m1, m5, m4, 2 2199 paddw m0, m4, m1 2200 punpcklwd m2, m4, m1 2201 pmaddwd m2, m2 2202 punpckhwd m3, m4, m1 2203 pmaddwd m3, m3 2204 palignr m5, m4, 4 2205 paddw m0, m5 ; h sum 2206 punpcklwd m1, m5, m6 2207 pmaddwd m1, m1 2208 punpckhwd m5, m6 2209 pmaddwd m5, m5 2210 paddd m2, m1 ; h sumsq 2211 paddd m3, m5 2212 paddw m1, m0, [t2+wq+400*0] 2213 paddd m4, m2, [t2+wq+400*2] 2214 paddd m5, m3, [t2+wq+400*4] 2215 mova [t2+wq+400*0], m0 2216 mova [t2+wq+400*2], m2 2217 mova [t2+wq+400*4], m3 2218 paddd m4, m8 2219 paddd m5, m8 2220 psrld m4, 4 ; (a + 8) >> 4 2221 psrld m5, 4 2222 pslld m2, m4, 3 2223 pslld m3, m5, 3 2224 paddd m4, m2 ; ((a + 8) >> 4) * 9 2225 paddd m5, m3 2226 psrlw m3, m1, 1 2227 pavgw m3, m6 ; (b + 2) >> 2 2228 punpcklwd m2, m3, m6 2229 pmaddwd m2, m2 2230 punpckhwd m3, m6 2231 pmaddwd m3, m3 2232 punpcklwd m0, m1, m6 ; b 2233 punpckhwd m1, m6 2234 MAXSD m4, m2, m14 2235 MAXSD m5, m3, m14 2236 psubd m4, m2 ; p 2237 psubd m5, m3 2238 MULLD m4, m9, m14 ; p * s 2239 MULLD m5, m9, m14 2240 pmaddwd m0, m10 ; b * 455 2241 pmaddwd m1, m10 2242 paddusw m4, m10 2243 paddusw m5, m10 2244 psrld m4, 20 ; min(z, 255) 2245 movif32 t3, t3m 2246 psrld m5, 20 2247 GATHER_X_BY_X m3, m4, m5, r0, dstm 2248 punpcklwd m4, m3, m3 2249 punpckhwd m5, m3, m3 2250 MULLD m0, m4, m14 2251 MULLD m1, m5, m14 2252%if ARCH_X86_32 2253 pxor m6, m6 2254%endif 2255 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2256 paddd m1, m11 2257 mova [t4+wq*1+400*2 +4], m3 2258 psrld m0, 12 2259 psrld m1, 12 2260 mova [t3+wq*2+400*4+ 8], m0 2261 mova [t3+wq*2+400*4+24], m1 2262 add wq, 16 2263 jl .hv1_loop 2264 mov r10, t2 2265 mov t2, t1 2266 mov t1, r10 2267 ret 2268.v0: ; vertical boxsums + ab (even rows) 2269%if ARCH_X86_64 2270 lea wq, [r4-4] 2271%else 2272 mov wd, w0m 2273%endif 2274.v0_loop: 2275 mova m0, [t1+wq+400*0] 2276 mova m4, [t1+wq+400*2] 2277 mova m5, [t1+wq+400*4] 2278 paddw m0, m0 2279 paddd m4, m4 2280 paddd m5, m5 2281 paddw m1, m0, [t2+wq+400*0] 2282 paddd m2, m4, [t2+wq+400*2] 2283 paddd m3, m5, [t2+wq+400*4] 2284 mova [t2+wq+400*0], m0 2285 mova [t2+wq+400*2], m4 2286 mova [t2+wq+400*4], m5 2287 paddd m2, m8 2288 paddd m3, m8 2289 psrld m2, 4 ; (a + 8) >> 4 2290 psrld m3, 4 2291 pslld m4, m2, 3 2292 pslld m5, m3, 3 2293 paddd m4, m2 ; ((a + 8) >> 4) * 9 2294 paddd m5, m3 2295 psrlw m3, m1, 1 2296 pavgw m3, m6 ; (b + 2) >> 2 2297 punpcklwd m2, m3, m6 2298 pmaddwd m2, m2 2299 punpckhwd m3, m6 2300 pmaddwd m3, m3 2301 punpcklwd m0, m1, m6 ; b 2302 punpckhwd m1, m6 2303 MAXSD m4, m2, m14 2304 MAXSD m5, m3, m14 2305 psubd m4, m2 ; p 2306 psubd m5, m3 2307 MULLD m4, m9, m14 ; p * s 2308 MULLD m5, m9, m14 2309 pmaddwd m0, m10 ; b * 455 2310 pmaddwd m1, m10 2311 paddusw m4, m10 2312 paddusw m5, m10 2313 psrld m4, 20 ; min(z, 255) 2314 psrld m5, 20 2315 GATHER_X_BY_X m3, m4, m5, r0, dstm 2316 punpcklwd m4, m3, m3 2317 punpckhwd m5, m3, m3 2318 MULLD m0, m4, m14 2319 MULLD m1, m5, m14 2320%if ARCH_X86_32 2321 pxor m6, m6 2322%endif 2323 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2324 paddd m1, m11 2325 mova [t4+wq*1+400*0+ 4], m3 2326 psrld m0, 12 2327 psrld m1, 12 2328 mova 
[t3+wq*2+400*0+ 8], m0 2329 mova [t3+wq*2+400*0+24], m1 2330 add wq, 16 2331 jl .v0_loop 2332 ret 2333.v1: ; vertical boxsums + ab (odd rows) 2334%if ARCH_X86_64 2335 lea wq, [r4-4] 2336%else 2337 mov wd, w0m 2338%endif 2339.v1_loop: 2340 mova m0, [t1+wq+400*0] 2341 mova m4, [t1+wq+400*2] 2342 mova m5, [t1+wq+400*4] 2343 paddw m1, m0, [t2+wq+400*0] 2344 paddd m2, m4, [t2+wq+400*2] 2345 paddd m3, m5, [t2+wq+400*4] 2346 mova [t2+wq+400*0], m0 2347 mova [t2+wq+400*2], m4 2348 mova [t2+wq+400*4], m5 2349 paddd m2, m8 2350 paddd m3, m8 2351 psrld m2, 4 ; (a + 8) >> 4 2352 psrld m3, 4 2353 pslld m4, m2, 3 2354 pslld m5, m3, 3 2355 paddd m4, m2 ; ((a + 8) >> 4) * 9 2356 paddd m5, m3 2357 psrlw m3, m1, 1 2358 pavgw m3, m6 ; (b + 2) >> 2 2359 punpcklwd m2, m3, m6 2360 pmaddwd m2, m2 2361 punpckhwd m3, m6 2362 pmaddwd m3, m3 2363 punpcklwd m0, m1, m6 ; b 2364 punpckhwd m1, m6 2365 MAXSD m4, m2, m14 2366 MAXSD m5, m3, m14 2367 psubd m4, m2 ; p 2368 psubd m5, m3 2369 MULLD m4, m9, m14 ; p * s 2370 MULLD m5, m9, m14 2371 pmaddwd m0, m10 ; b * 455 2372 pmaddwd m1, m10 2373 paddusw m4, m10 2374 paddusw m5, m10 2375 psrld m4, 20 ; min(z, 255) 2376 psrld m5, 20 2377 GATHER_X_BY_X m3, m4, m5, r0, dstm 2378 punpcklwd m4, m3, m3 2379 punpckhwd m5, m3, m3 2380 MULLD m0, m4, m14 2381 MULLD m1, m5, m14 2382%if ARCH_X86_32 2383 pxor m6, m6 2384%endif 2385 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2386 paddd m1, m11 2387 mova [t4+wq*1+400*2+ 4], m3 2388 psrld m0, 12 2389 psrld m1, 12 2390 mova [t3+wq*2+400*4+ 8], m0 2391 mova [t3+wq*2+400*4+24], m1 2392 add wq, 16 2393 jl .v1_loop 2394 mov r10, t2 2395 mov t2, t1 2396 mov t1, r10 2397 ret 2398.prep_n: ; initial neighbor setup 2399 movif64 wq, r4 2400 movif32 wd, w1m 2401.prep_n_loop: 2402 movu m0, [t4+wq*1+400*0+ 4] 2403 movu m1, [t3+wq*2+400*0+ 8] 2404 movu m2, [t3+wq*2+400*0+24] 2405 movu m3, [t4+wq*1+400*0+ 2] 2406 movu m4, [t3+wq*2+400*0+ 4] 2407 movu m5, [t3+wq*2+400*0+20] 2408 paddw m0, [t4+wq*1+400*0+ 0] 2409 paddd m1, [t3+wq*2+400*0+ 0] 2410 paddd m2, [t3+wq*2+400*0+16] 2411 paddw m3, m0 2412 paddd m4, m1 2413 paddd m5, m2 2414 psllw m3, 2 ; a[-1] 444 2415 pslld m4, 2 ; b[-1] 444 2416 pslld m5, 2 2417 psubw m3, m0 ; a[-1] 343 2418 psubd m4, m1 ; b[-1] 343 2419 psubd m5, m2 2420 mova [t4+wq*1+400*4], m3 2421 mova [t3+wq*2+400*8+ 0], m4 2422 mova [t3+wq*2+400*8+16], m5 2423 movu m0, [t4+wq*1+400*2+ 4] 2424 movu m1, [t3+wq*2+400*4+ 8] 2425 movu m2, [t3+wq*2+400*4+24] 2426 movu m3, [t4+wq*1+400*2+ 2] 2427 movu m4, [t3+wq*2+400*4+ 4] 2428 movu m5, [t3+wq*2+400*4+20] 2429 paddw m0, [t4+wq*1+400*2+ 0] 2430 paddd m1, [t3+wq*2+400*4+ 0] 2431 paddd m2, [t3+wq*2+400*4+16] 2432 paddw m3, m0 2433 paddd m4, m1 2434 paddd m5, m2 2435 psllw m3, 2 ; a[ 0] 444 2436 pslld m4, 2 ; b[ 0] 444 2437 pslld m5, 2 2438 mova [t4+wq*1+400* 6], m3 2439 mova [t3+wq*2+400*12+ 0], m4 2440 mova [t3+wq*2+400*12+16], m5 2441 psubw m3, m0 ; a[ 0] 343 2442 psubd m4, m1 ; b[ 0] 343 2443 psubd m5, m2 2444 mova [t4+wq*1+400* 8], m3 2445 mova [t3+wq*2+400*16+ 0], m4 2446 mova [t3+wq*2+400*16+16], m5 2447 add wq, 16 2448 jl .prep_n_loop 2449 ret 2450ALIGN function_align 2451.n0: ; neighbor + output (even rows) 2452 movif64 wq, r4 2453 movif32 wd, w1m 2454.n0_loop: 2455 movu m3, [t4+wq*1+400*0+4] 2456 movu m1, [t4+wq*1+400*0+2] 2457 paddw m3, [t4+wq*1+400*0+0] 2458 paddw m1, m3 2459 psllw m1, 2 ; a[ 1] 444 2460 psubw m2, m1, m3 ; a[ 1] 343 2461 paddw m3, m2, [t4+wq*1+400*4] 2462 paddw m3, [t4+wq*1+400*6] 2463 mova [t4+wq*1+400*4], m2 2464 mova [t4+wq*1+400*6], m1 2465 movu m4, 
[t3+wq*2+400*0+8] 2466 movu m1, [t3+wq*2+400*0+4] 2467 paddd m4, [t3+wq*2+400*0+0] 2468 paddd m1, m4 2469 pslld m1, 2 ; b[ 1] 444 2470 psubd m2, m1, m4 ; b[ 1] 343 2471 paddd m4, m2, [t3+wq*2+400* 8+ 0] 2472 paddd m4, [t3+wq*2+400*12+ 0] 2473 mova [t3+wq*2+400* 8+ 0], m2 2474 mova [t3+wq*2+400*12+ 0], m1 2475 movu m5, [t3+wq*2+400*0+24] 2476 movu m1, [t3+wq*2+400*0+20] 2477 paddd m5, [t3+wq*2+400*0+16] 2478 paddd m1, m5 2479 pslld m1, 2 2480 psubd m2, m1, m5 2481 paddd m5, m2, [t3+wq*2+400* 8+16] 2482 paddd m5, [t3+wq*2+400*12+16] 2483 mova [t3+wq*2+400* 8+16], m2 2484 mova [t3+wq*2+400*12+16], m1 2485 mova m0, [dstq+wq] 2486 punpcklwd m1, m0, m6 2487 punpcklwd m2, m3, m6 2488 pmaddwd m2, m1 ; a * src 2489 punpckhwd m1, m0, m6 2490 punpckhwd m3, m6 2491 pmaddwd m3, m1 2492 psubd m4, m2 ; b - a * src + (1 << 8) 2493 psubd m5, m3 2494 psrad m4, 9 2495 psrad m5, 9 2496 packssdw m4, m5 2497 pmulhrsw m4, m7 2498 paddw m0, m4 2499 pmaxsw m0, m6 2500 pminsw m0, m13 2501 mova [dstq+wq], m0 2502 add wq, 16 2503 jl .n0_loop 2504 add dstq, stridemp 2505 ret 2506ALIGN function_align 2507.n1: ; neighbor + output (odd rows) 2508 movif64 wq, r4 2509 movif32 wd, w1m 2510.n1_loop: 2511 movu m3, [t4+wq*1+400*2+4] 2512 movu m1, [t4+wq*1+400*2+2] 2513 paddw m3, [t4+wq*1+400*2+0] 2514 paddw m1, m3 2515 psllw m1, 2 ; a[ 1] 444 2516 psubw m2, m1, m3 ; a[ 1] 343 2517 paddw m3, m2, [t4+wq*1+400*6] 2518 paddw m3, [t4+wq*1+400*8] 2519 mova [t4+wq*1+400*6], m1 2520 mova [t4+wq*1+400*8], m2 2521 movu m4, [t3+wq*2+400*4+8] 2522 movu m1, [t3+wq*2+400*4+4] 2523 paddd m4, [t3+wq*2+400*4+0] 2524 paddd m1, m4 2525 pslld m1, 2 ; b[ 1] 444 2526 psubd m2, m1, m4 ; b[ 1] 343 2527 paddd m4, m2, [t3+wq*2+400*12+ 0] 2528 paddd m4, [t3+wq*2+400*16+ 0] 2529 mova [t3+wq*2+400*12+ 0], m1 2530 mova [t3+wq*2+400*16+ 0], m2 2531 movu m5, [t3+wq*2+400*4+24] 2532 movu m1, [t3+wq*2+400*4+20] 2533 paddd m5, [t3+wq*2+400*4+16] 2534 paddd m1, m5 2535 pslld m1, 2 2536 psubd m2, m1, m5 2537 paddd m5, m2, [t3+wq*2+400*12+16] 2538 paddd m5, [t3+wq*2+400*16+16] 2539 mova [t3+wq*2+400*12+16], m1 2540 mova [t3+wq*2+400*16+16], m2 2541 mova m0, [dstq+wq] 2542 punpcklwd m1, m0, m6 2543 punpcklwd m2, m3, m6 2544 pmaddwd m2, m1 ; a * src 2545 punpckhwd m1, m0, m6 2546 punpckhwd m3, m6 2547 pmaddwd m3, m1 2548 psubd m4, m2 ; b - a * src + (1 << 8) 2549 psubd m5, m3 2550 psrad m4, 9 2551 psrad m5, 9 2552 packssdw m4, m5 2553 pmulhrsw m4, m7 2554 paddw m0, m4 2555 pmaxsw m0, m6 2556 pminsw m0, m13 2557 mova [dstq+wq], m0 2558 add wq, 16 2559 jl .n1_loop 2560 add dstq, stridemp 2561 movif32 dstm, dstq 2562 ret 2563 2564%if ARCH_X86_32 2565 %if STACK_ALIGNMENT < 16 2566 %assign extra_stack 10*16 2567 %else 2568 %assign extra_stack 8*16 2569 %endif 2570cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ 2571 dst, stride, left, lpf, w 2572 %if STACK_ALIGNMENT < 16 2573 %define dstm dword [esp+calloff+16*8+4*0] 2574 %define stridemp dword [esp+calloff+16*8+4*1] 2575 %define leftm dword [esp+calloff+16*8+4*2] 2576 %define lpfm dword [esp+calloff+16*8+4*3] 2577 %define w0m dword [esp+calloff+16*8+4*4] 2578 %define hd dword [esp+calloff+16*8+4*5] 2579 %define edgeb byte [esp+calloff+16*8+4*6] 2580 %define edged dword [esp+calloff+16*8+4*6] 2581 %define leftmp leftm 2582 %else 2583 %define w0m wm 2584 %define hd dword r5m 2585 %define edgeb byte r7m 2586 %define edged dword r7m 2587 %endif 2588 %define hvsrcm dword [esp+calloff+4*0] 2589 %define w1m dword [esp+calloff+4*1] 2590 %define t3m dword [esp+calloff+4*2] 2591 %define t4m dword [esp+calloff+4*3] 
 %xdefine m8 m6
 %define m9  [base+pd_8]
 %define m10 [base+pd_34816]
 %define m11 [base+pd_0xf00801c7]
 %define m12 [base+pd_0xf00800a4]
 %define m13 [esp+calloff+16*4]
 %define m14 [esp+calloff+16*5]
 %define m15 [esp+calloff+16*6]
 %define m6  [esp+calloff+16*7]
 %define base r6-$$
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov           strideq, [rstk+stack_offset+ 8]
    mov           leftq, [rstk+stack_offset+12]
    mov           lpfq, [rstk+stack_offset+16]
    mov           wd, [rstk+stack_offset+20]
    mov           dstm, dstq
    mov           stridemp, strideq
    mov           leftm, leftq
    mov           r1, [rstk+stack_offset+24]
    mov           r2, [rstk+stack_offset+32]
    mov           lpfm, lpfq
    mov           hd, r1
    mov           edged, r2
 %endif
%else
cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
                                                     w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn     wd, wm
%endif
%if ARCH_X86_64
    mov           paramsq, r6mp
    lea           r13, [sgr_x_by_x-0xf03]
    movifnidn     hd, hm
    add           wd, wd
    mov           edged, r7m
    mova          m14, [paramsq]
    add           lpfq, wq
    mova          m9, [pd_8]
    lea           t1, [rsp+wq+44]
    mova          m10, [pd_34816]
    add           dstq, wq
    mova          m11, [pd_0xf00801c7]
    lea           t3, [rsp+wq*2+400*24+40]
    mova          m12, [pd_0xf00800a4]
    lea           t4, [rsp+wq+400*52+40]
    neg           wq
    pshufd        m15, m14, q2222 ; w0 w1
    punpcklwd     m14, m14
    pshufd        m13, m14, q0000 ; s0
    pshufd        m14, m14, q2222 ; s1
    pxor          m6, m6
    psllw         m15, 2
    DEFINE_ARGS   dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov           r1, [rstk+stack_offset+28] ; params
    LEA           r6, $$
    add           wd, wd
    mova          m2, [r1]
    add           lpfm, wq
    lea           t1, [rsp+extra_stack+wq+52]
    add           dstq, wq
    lea           t3, [rsp+extra_stack+wq*2+400*24+48]
    mov           dstm, dstq
    lea           t4, [rsp+extra_stack+wq+400*52+48]
    mov           t3m, t3
    mov           t4m, t4
    neg           wq
    pshuflw       m0, m2, q0000
    pshuflw       m1, m2, q2222
    pshufhw       m2, m2, q1010
    punpcklqdq    m0, m0 ; s0
    punpcklqdq    m1, m1 ; s1
    punpckhqdq    m2, m2 ; w0 w1
    mov           w1m, wd
    pxor          m3, m3
    psllw         m2, 2
    mova          m13, m0
    mova          m14, m1
    sub           wd, 4
    mova          m15, m2
    mova          m6, m3
    mov           lpfq, lpfm
    mov           w0m, wd
 %define strideq r5
%endif
    test          edgeb, 4 ; LR_HAVE_TOP
    jz            .no_top
    call          .h_top
    add           lpfq, stridemp
    mov           t2, t1
%if ARCH_X86_64
    call          mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
%else
    mov           wq, w0m
    call          mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
%endif
    add           t1, 400*12
    call          .h_top
    movif32       strideq, stridemp
    lea           r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add           r10, strideq
    mov           lpfm, r10 ; below
    movif32       t4, t4m
    call          .hv0
.main:
    dec           hd
    jz            .height1
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call          .hv1
    call          .prep_n
    sub           hd, 2
    jl            .extend_bottom
.main_loop:
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call          .hv0
%if ARCH_X86_64
    test          hd, hd
%else
    mov           r4, hd
    test          r4, r4
%endif
    jz            .odd_height
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call          .hv1
    call          .n0
    call          .n1
    sub           hd, 2
    jge           .main_loop
    test          edgeb, 8 ; LR_HAVE_BOTTOM
    jz            .extend_bottom
    mov           lpfq, lpfm
    call          .hv0_bottom
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call          .hv1_bottom
.end:
    call          .n0
    call          .n1
.end2:
    RET
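; Tail and edge-case paths for the vertical passes: .height1 covers a
; single remaining row, .odd_height finishes an odd row count,
; .extend_bottom is taken when no LR_HAVE_BOTTOM edge rows are available,
; and .no_top primes the box sums when LR_HAVE_TOP is absent.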
.height1:
    call          .v1
    call          .prep_n
    jmp           .odd_height_end
.odd_height:
    call          .v1
    call          .n0
    call          .n1
.odd_height_end:
    call          .v0
    call          .v1
    call          .n0
    jmp           .end2
.extend_bottom:
    call          .v0
    call          .v1
    jmp           .end
.no_top:
    movif32       strideq, stridemp
    lea           r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea           r10, [r10+strideq*2]
    mov           lpfm, r10
    call          .h
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
    mov           wq, w0m
    mov           hvsrcm, lpfq
%endif
    lea           t2, [t1+400*12]
.top_fixup_loop:
    mova          m0, [t1+wq+400* 0]
    mova          m1, [t1+wq+400* 2]
    mova          m2, [t1+wq+400* 4]
    paddw         m0, m0
    mova          m3, [t1+wq+400* 6]
    paddd         m1, m1
    mova          m4, [t1+wq+400* 8]
    paddd         m2, m2
    mova          m5, [t1+wq+400*10]
    mova          [t2+wq+400* 0], m0
    mova          [t2+wq+400* 2], m1
    mova          [t2+wq+400* 4], m2
    mova          [t2+wq+400* 6], m3
    mova          [t2+wq+400* 8], m4
    mova          [t2+wq+400*10], m5
    add           wq, 16
    jl            .top_fixup_loop
    movif32       t3, t3m
    movif32       t4, t4m
    call          .v0
    jmp           .main
.h: ; horizontal boxsum
%assign stack_offset stack_offset+4
%assign calloff 4
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
 %define leftq r4
%endif
    test          edgeb, 1 ; LR_HAVE_LEFT
    jz            .h_extend_left
    movif32       leftq, leftm
    movddup       m5, [leftq]
    movif32       wq, w0m
    mova          m4, [lpfq+wq+4]
    add           leftmp, 8
    palignr       m4, m5, 10
    jmp           .h_main
.h_extend_left:
    movif32       wq, w0m
    mova          m4, [lpfq+wq+4]
    pshufb        m4, [base+sgr_lshuf5]
    jmp           .h_main
.h_top:
%if ARCH_X86_64
    lea           wq, [r4-4]
%endif
    test          edgeb, 1 ; LR_HAVE_LEFT
    jz            .h_extend_left
    movif32       wq, w0m
.h_loop:
    movu          m4, [lpfq+wq- 2]
.h_main:
    movu          m5, [lpfq+wq+14]
    test          edgeb, 2 ; LR_HAVE_RIGHT
    jnz           .h_have_right
    cmp           wd, -20
    jl            .h_have_right
%if ARCH_X86_32
    pxor          m8, m8
%endif
    call          mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.h_have_right:
    palignr       m3, m5, m4, 2
    palignr       m0, m5, m4, 4
    paddw         m1, m3, m0
    punpcklwd     m2, m3, m0
    pmaddwd       m2, m2
    punpckhwd     m3, m0
    pmaddwd       m3, m3
    palignr       m0, m5, m4, 6
    paddw         m1, m0 ; sum3
    punpcklwd     m7, m0, m6
    pmaddwd       m7, m7
    punpckhwd     m0, m6
    pmaddwd       m0, m0
    paddd         m2, m7 ; sumsq3
    palignr       m5, m4, 8
    punpcklwd     m7, m5, m4
    paddw         m8, m4, m5
    pmaddwd       m7, m7
    punpckhwd     m5, m4
    pmaddwd       m5, m5
    paddd         m3, m0
    mova          [t1+wq+400* 6], m1
    mova          [t1+wq+400* 8], m2
    mova          [t1+wq+400*10], m3
    paddw         m8, m1 ; sum5
    paddd         m7, m2 ; sumsq5
    paddd         m5, m3
    mova          [t1+wq+400* 0], m8
    mova          [t1+wq+400* 2], m7
    mova          [t1+wq+400* 4], m5
    add           wq, 16
    jl            .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
    mov           hvsrcm, lpfq
%endif
    test          edgeb, 1 ; LR_HAVE_LEFT
    jz            .hv0_extend_left
    movif32       leftq, leftm
    movddup       m5, [leftq]
    movif32       wq, w0m
    mova          m4, [lpfq+wq+4]
    add           leftmp, 8
    palignr       m4, m5, 10
    jmp           .hv0_main
.hv0_extend_left:
    movif32       wq, w0m
    mova          m4, [lpfq+wq+4]
    pshufb        m4, [base+sgr_lshuf5]
    jmp           .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
    mov           hvsrcm, lpfq
%endif
    test          edgeb, 1 ; LR_HAVE_LEFT
    jz            .hv0_extend_left
    movif32       wq, w0m
%if ARCH_X86_32
    jmp           .hv0_loop_start
%endif
.hv0_loop:
    movif32       lpfq, hvsrcm
.hv0_loop_start:
    movu          m4, [lpfq+wq- 2]
.hv0_main:
    movu          m5, [lpfq+wq+14]
    test          edgeb, 2 ; LR_HAVE_RIGHT
    jnz           .hv0_have_right
    cmp           wd, -20
    jl            .hv0_have_right
%if ARCH_X86_32
    pxor          m8, m8
%endif
    call          mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.hv0_have_right:
    palignr       m3, m5, m4, 2
    palignr       m0, m5, m4, 4
    movif32       t3, t3m
    paddw         m1, m3, m0
    punpcklwd     m2, m3, m0
    pmaddwd       m2, m2
    punpckhwd     m3, m0
    pmaddwd       m3, m3
    palignr       m0, m5, m4, 6
    paddw         m1, m0 ; h sum3
    punpcklwd     m7, m0, m6
    pmaddwd       m7, m7
    punpckhwd     m0, m6
    pmaddwd       m0, m0
    paddd         m2, m7 ; h sumsq3
    palignr       m5, m4, 8
    punpcklwd     m7, m5, m4
    paddw         m8, m4, m5
    pmaddwd       m7, m7
    punpckhwd     m5, m4
    pmaddwd       m5, m5
    paddd         m3, m0
    paddw         m8, m1 ; h sum5
    paddd         m7, m2 ; h sumsq5
    paddd         m5, m3
    mova          [t3+wq*2+400*8+ 8], m8
    mova          [t3+wq*2+400*0+ 8], m7
    mova          [t3+wq*2+400*0+24], m5
    paddw         m8, [t1+wq+400* 0]
    paddd         m7, [t1+wq+400* 2]
    paddd         m5, [t1+wq+400* 4]
    mova          [t1+wq+400* 0], m8
    mova          [t1+wq+400* 2], m7
    mova          [t1+wq+400* 4], m5
    paddw         m0, m1, [t1+wq+400* 6]
    paddd         m4, m2, [t1+wq+400* 8]
    paddd         m5, m3, [t1+wq+400*10]
    mova          [t1+wq+400* 6], m1
    mova          [t1+wq+400* 8], m2
    mova          [t1+wq+400*10], m3
    paddw         m1, m0, [t2+wq+400* 6]
    paddd         m2, m4, [t2+wq+400* 8]
    paddd         m3, m5, [t2+wq+400*10]
    mova          [t2+wq+400* 6], m0
    mova          [t2+wq+400* 8], m4
    mova          [t2+wq+400*10], m5
    paddd         m2, m9
    paddd         m3, m9
    psrld         m2, 4 ; (a3 + 8) >> 4
    psrld         m3, 4
%if ARCH_X86_32
    pxor          m7, m7
%else
    SWAP          m7, m6
%endif
    pslld         m4, m2, 3
    pslld         m5, m3, 3
    paddd         m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd         m5, m3
    psrlw         m3, m1, 1
    pavgw         m3, m7 ; (b3 + 2) >> 2
    punpcklwd     m2, m3, m7
    pmaddwd       m2, m2
    punpckhwd     m3, m7
    pmaddwd       m3, m3
    punpcklwd     m0, m1, m7 ; b3
    punpckhwd     m1, m7
%if ARCH_X86_64
    SWAP          m7, m6
%endif
    MAXSD         m4, m2, m7
    MAXSD         m5, m3, m7
    psubd         m4, m2 ; p3
    psubd         m5, m3
    MULLD         m4, m14, m7 ; p3 * s1
    MULLD         m5, m14, m7
    pmaddwd       m0, m11 ; b3 * 455
    pmaddwd       m1, m11
    paddusw       m4, m11
    paddusw       m5, m11
    psrld         m4, 20 ; min(z3, 255)
    psrld         m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd     m4, m3, m3
    punpckhwd     m5, m3, m3
    MULLD         m0, m4, m7
    MULLD         m1, m5, m7
    paddd         m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd         m1, m10
    mova          [t4+wq*1+400*2+ 4], m3
    psrld         m0, 12
    psrld         m1, 12
    mova          [t3+wq*2+400*4+ 8], m0
    mova          [t3+wq*2+400*4+24], m1
    add           wq, 16
    jl            .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
    mov           hvsrcm, lpfq
%endif
    test          edgeb, 1 ; LR_HAVE_LEFT
    jz            .hv1_extend_left
    movif32       leftq, leftm
    movddup       m5, [leftq]
    movif32       wq, w0m
    mova          m4, [lpfq+wq+4]
    add           leftmp, 8
    palignr       m4, m5, 10
    jmp           .hv1_main
.hv1_extend_left:
    movif32       wq, w0m
    mova          m4, [lpfq+wq+4]
    pshufb        m4, [base+sgr_lshuf5]
    jmp           .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
    mov           hvsrcm, lpfq
%endif
    test          edgeb, 1 ; LR_HAVE_LEFT
    jz            .hv1_extend_left
    movif32       wq, w0m
%if ARCH_X86_32
    jmp           .hv1_loop_start
%endif
.hv1_loop:
    movif32       lpfq, hvsrcm
.hv1_loop_start:
    movu          m4, [lpfq+wq- 2]
.hv1_main:
    movu          m5, [lpfq+wq+14]
    test          edgeb, 2 ; LR_HAVE_RIGHT
    jnz           .hv1_have_right
    cmp           wd, -20
    jl            .hv1_have_right
%if ARCH_X86_32
    pxor          m8, m8
%endif
    call          mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.hv1_have_right:
    palignr       m7, m5, m4, 2
    palignr       m3, m5, m4, 4
    paddw         m2, m7, m3
    punpcklwd     m0, m7, m3
    pmaddwd       m0, m0
    punpckhwd     m7, m3
    pmaddwd       m7, m7
    palignr       m3, m5, m4, 6
    paddw         m2, m3 ; h sum3
    punpcklwd     m1, m3, m6
    pmaddwd       m1, m1
    punpckhwd     m3, m6
    pmaddwd       m3, m3
    paddd         m0, m1 ; h sumsq3
    palignr       m5, m4, 8
    punpckhwd     m1, m4, m5
    paddw         m8, m4, m5
    pmaddwd       m1, m1
    punpcklwd     m4, m5
    pmaddwd       m4, m4
    paddd         m7, m3
    paddw         m5, m2, [t2+wq+400* 6]
    mova          [t2+wq+400* 6], m2
    paddw         m8, m2 ; h sum5
    paddd         m2, m0, [t2+wq+400* 8]
    paddd         m3, m7, [t2+wq+400*10]
    mova          [t2+wq+400* 8], m0
    mova          [t2+wq+400*10], m7
    paddd         m4, m0 ; h sumsq5
    paddd         m1, m7
    paddd         m2, m9
    paddd         m3, m9
    psrld         m2, 4 ; (a3 + 8) >> 4
    psrld         m3, 4
    pslld         m0, m2, 3
    pslld         m7, m3, 3
    paddd         m2, m0 ; ((a3 + 8) >> 4) * 9
    paddd         m3, m7
    psrlw         m7, m5, 1
    pavgw         m7, m6 ; (b3 + 2) >> 2
    punpcklwd     m0, m7, m6
    pmaddwd       m0, m0
    punpckhwd     m7, m6
    pmaddwd       m7, m7
%if ARCH_X86_32
    mova          [esp+20], m8
%else
    SWAP          m8, m6
%endif
    MAXSD         m2, m0, m8
    MAXSD         m3, m7, m8
    pxor          m8, m8
    psubd         m2, m0 ; p3
    psubd         m3, m7
    punpcklwd     m0, m5, m8 ; b3
    punpckhwd     m5, m8
    MULLD         m2, m14, m8 ; p3 * s1
    MULLD         m3, m14, m8
    pmaddwd       m0, m11 ; b3 * 455
    pmaddwd       m5, m11
    paddusw       m2, m11
    paddusw       m3, m11
    psrld         m2, 20 ; min(z3, 255)
    movif32       t3, t3m
    psrld         m3, 20
    GATHER_X_BY_X m8, m2, m3, r0, dstm
    punpcklwd     m2, m8, m8
    punpckhwd     m3, m8, m8
    MULLD         m0, m2, m7
    MULLD         m5, m3, m7
    paddd         m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd         m5, m10
    psrld         m0, 12
    psrld         m5, 12
    mova          [t4+wq*1+400*4+4], m8
    mova          [t3+wq*2+400*8+ 8], m0
    mova          [t3+wq*2+400*8+24], m5
%if ARCH_X86_32
    mova          m8, [esp+20]
%else
    SWAP          m6, m8
    pxor          m6, m6
%endif
    paddw         m5, m8, [t2+wq+400*0]
    paddd         m2, m4, [t2+wq+400*2]
    paddd         m3, m1, [t2+wq+400*4]
    paddw         m5, [t1+wq+400*0]
    paddd         m2, [t1+wq+400*2]
    paddd         m3, [t1+wq+400*4]
    mova          [t2+wq+400*0], m8
    paddd         m2, m9
    paddd         m3, m9
    psrld         m2, 4 ; (a5 + 8) >> 4
    psrld         m3, 4
    mova          [t2+wq+400*2], m4
    pslld         m8, m2, 4
    mova          [t2+wq+400*4], m1
    pslld         m4, m3, 4
    paddd         m8, m2
    pslld         m2, 3
    paddd         m4, m3
    pslld         m3, 3
    paddd         m2, m8 ; ((a5 + 8) >> 4) * 25
    paddd         m3, m4
%if ARCH_X86_32
    pxor          m7, m7
%else
    SWAP          m7, m6
%endif
    psrlw         m1, m5, 1
    pavgw         m1, m7 ; (b5 + 2) >> 2
    punpcklwd     m4, m1, m7
    pmaddwd       m4, m4
    punpckhwd     m1, m7
    pmaddwd       m1, m1
    punpcklwd     m0, m5, m7 ; b5
    punpckhwd     m5, m7
%if ARCH_X86_64
    SWAP          m7, m6
%endif
    MAXSD         m2, m4, m7
    psubd         m2, m4 ; p5
    MAXSD         m3, m1, m7
    psubd         m3, m1
    MULLD         m2, m13, m7 ; p5 * s0
    MULLD         m3, m13, m7
    pmaddwd       m0, m12 ; b5 * 164
    pmaddwd       m5, m12
    paddusw       m2, m12
    paddusw       m3, m12
    psrld         m2, 20 ; min(z5, 255)
    psrld         m3, 20
    GATHER_X_BY_X m1, m2, m3, r0, dstm
    punpcklwd     m2, m1, m1
    punpckhwd     m3, m1, m1
    MULLD         m0, m2, m7
    MULLD         m5, m3, m7
    paddd         m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd         m5, m10
    mova          [t4+wq*1+400*0+ 4], m1
    psrld         m0, 12
    psrld         m5, 12
    mova          [t3+wq*2+400*0+ 8], m0
    mova          [t3+wq*2+400*0+24], m5
    add           wq, 16
    jl            .hv1_loop
    mov           r10, t2
    mov           t2, t1
    mov           t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
    mov           wd, w0m
%endif
.v0_loop:
    mova          m0, [t1+wq+400* 6]
    mova          m4, [t1+wq+400* 8]
    mova          m5, [t1+wq+400*10]
    paddw         m0, m0
    paddd         m4, m4
    paddd         m5, m5
    paddw         m1, m0, [t2+wq+400* 6]
    paddd         m2, m4, [t2+wq+400* 8]
    paddd         m3, m5, [t2+wq+400*10]
    mova          [t2+wq+400* 6], m0
    mova          [t2+wq+400* 8], m4
    mova          [t2+wq+400*10], m5
    paddd         m2, m9
    paddd         m3, m9
    psrld         m2, 4 ; (a3 + 8) >> 4
    psrld         m3, 4
%if ARCH_X86_32
    pxor          m7, m7
%else
    SWAP          m7, m6
%endif
    pslld         m4, m2, 3
    pslld         m5, m3, 3
    paddd         m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd         m5, m3
    psrlw         m3, m1, 1
    pavgw         m3, m7 ; (b3 + 2) >> 2
    punpcklwd     m2, m3, m7
    pmaddwd       m2, m2
    punpckhwd     m3, m7
    pmaddwd       m3, m3
    punpcklwd     m0, m1, m7 ; b3
    punpckhwd     m1, m7
%if ARCH_X86_64
    SWAP          m7, m6
%endif
    MAXSD         m4, m2, m7
    MAXSD         m5, m3, m7
    psubd         m4, m2 ; p3
    psubd         m5, m3
    MULLD         m4, m14, m7 ; p3 * s1
    MULLD         m5, m14, m7
    pmaddwd       m0, m11 ; b3 * 455
    pmaddwd       m1, m11
    paddusw       m4, m11
    paddusw       m5, m11
    psrld         m4, 20 ; min(z3, 255)
    psrld         m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd     m4, m3, m3
    punpckhwd     m5, m3, m3
    MULLD         m0, m4, m7
    MULLD         m1, m5, m7
    paddd         m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd         m1, m10
    mova          [t4+wq*1+400*2+4], m3
    psrld         m0, 12
    psrld         m1, 12
    mova          m3, [t1+wq+400*0]
    mova          m4, [t1+wq+400*2]
    mova          m5, [t1+wq+400*4]
    mova          [t3+wq*2+400*8+ 8], m3
    mova          [t3+wq*2+400*0+ 8], m4
    mova          [t3+wq*2+400*0+24], m5
    paddw         m3, m3 ; cc5
    paddd         m4, m4
    paddd         m5, m5
    mova          [t1+wq+400*0], m3
    mova          [t1+wq+400*2], m4
    mova          [t1+wq+400*4], m5
    mova          [t3+wq*2+400*4+ 8], m0
    mova          [t3+wq*2+400*4+24], m1
    add           wq, 16
    jl            .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea           wq, [r4-4]
%else
    mov           wd, w0m
%endif
.v1_loop:
    mova          m4, [t1+wq+400* 6]
    mova          m5, [t1+wq+400* 8]
    mova          m7, [t1+wq+400*10]
    paddw         m1, m4, [t2+wq+400* 6]
    paddd         m2, m5, [t2+wq+400* 8]
    paddd         m3, m7, [t2+wq+400*10]
    mova          [t2+wq+400* 6], m4
    mova          [t2+wq+400* 8], m5
    mova          [t2+wq+400*10], m7
    paddd         m2, m9
    paddd         m3, m9
    psrld         m2, 4 ; (a3 + 8) >> 4
    psrld         m3, 4
%if ARCH_X86_32
    pxor          m7, m7
%else
    SWAP          m7, m6
%endif
    pslld         m4, m2, 3
    pslld         m5, m3, 3
    paddd         m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd         m5, m3
    psrlw         m3, m1, 1
    pavgw         m3, m7 ; (b3 + 2) >> 2
    punpcklwd     m2, m3, m7
    pmaddwd       m2, m2
    punpckhwd     m3, m7
    pmaddwd       m3, m3
    punpcklwd     m0, m1, m7 ; b3
    punpckhwd     m1, m7
%if ARCH_X86_64
    SWAP          m7, m6
%endif
    MAXSD         m4, m2, m7
    MAXSD         m5, m3, m7
    psubd         m4, m2 ; p3
    psubd         m5, m3
    MULLD         m4, m14, m7 ; p3 * s1
    MULLD         m5, m14, m7
    pmaddwd       m0, m11 ; b3 * 455
    pmaddwd       m1, m11
    paddusw       m4, m11
    paddusw       m5, m11
    psrld         m4, 20 ; min(z3, 255)
    psrld         m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd     m4, m3, m3
    punpckhwd     m5, m3, m3
    MULLD         m0, m4, m7
    MULLD         m1, m5, m7
    paddd         m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd         m1, m10
    mova          [t4+wq*1+400*4+4], m3
    psrld         m0, 12
    psrld         m8, m1, 12
    mova          m4, [t3+wq*2+400*8+ 8]
    mova          m5, [t3+wq*2+400*0+ 8]
    mova          m7, [t3+wq*2+400*0+24]
    paddw         m1, m4, [t2+wq+400*0]
    paddd         m2, m5, [t2+wq+400*2]
    paddd         m3, m7, [t2+wq+400*4]
    paddw         m1, [t1+wq+400*0]
    paddd         m2, [t1+wq+400*2]
    paddd         m3, [t1+wq+400*4]
    mova          [t2+wq+400*0], m4
    mova          [t2+wq+400*2], m5
    mova          [t2+wq+400*4], m7
    paddd         m2, m9
    paddd         m3, m9
    psrld         m2, 4 ; (a5 + 8) >> 4
    psrld         m3, 4
    mova          [t3+wq*2+400*8+ 8], m0
    pslld         m4, m2, 4
    mova          [t3+wq*2+400*8+24], m8
    pslld         m5, m3, 4
    paddd         m4, m2
    pslld         m2, 3
    paddd         m5, m3
    pslld         m3, 3
    paddd         m2, m4
    paddd         m3, m5
%if ARCH_X86_32
    pxor          m7, m7
%else
    SWAP          m7, m6
%endif
    psrlw         m5, m1, 1
    pavgw         m5, m7 ; (b5 + 2) >> 2
    punpcklwd     m4, m5, m7
    pmaddwd       m4, m4
    punpckhwd     m5, m7
    pmaddwd       m5, m5
    punpcklwd     m0, m1, m7 ; b5
    punpckhwd     m1, m7
%if ARCH_X86_64
    SWAP          m7, m6
%endif
    MAXSD         m2, m4, m7
    psubd         m2, m4 ; p5
    MAXSD         m3, m5, m7
    psubd         m3, m5
    MULLD         m2, m13, m7 ; p5 * s0
    MULLD         m3, m13, m7
    pmaddwd       m0, m12 ; b5 * 164
    pmaddwd       m1, m12
    paddusw       m2, m12
    paddusw       m3, m12
    psrld         m2, 20 ; min(z5, 255)
    psrld         m3, 20
    GATHER_X_BY_X m4, m2, m3, r0, dstm
    punpcklwd     m2, m4, m4
    punpckhwd     m3, m4, m4
    MULLD         m0, m2, m7
    MULLD         m1, m3, m7
    paddd         m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd         m1, m10
    mova          [t4+wq*1+400*0+ 4], m4
    psrld         m0, 12
    psrld         m1, 12
    mova          [t3+wq*2+400*0+ 8], m0
    mova          [t3+wq*2+400*0+24], m1
    add           wq, 16
    jl            .v1_loop
    mov           r10, t2
    mov           t2, t1
    mov           t1, r10
    ret
.prep_n: ; initial neighbor setup
    movif64       wq, r4
    movif32       wd, w1m
.prep_n_loop:
    movu          m0, [t4+wq*1+400*0+ 2]
    movu          m1, [t3+wq*2+400*0+ 4]
    movu          m2, [t3+wq*2+400*0+20]
    movu          m7, [t4+wq*1+400*0+ 4]
    movu          m8, [t3+wq*2+400*0+ 8]
    paddw         m3, m0, [t4+wq*1+400*0+ 0]
    paddd         m4, m1, [t3+wq*2+400*0+ 0]
    paddd         m5, m2, [t3+wq*2+400*0+16]
    paddw         m3, m7
    paddd         m4, m8
    movu          m7, [t3+wq*2+400*0+24]
    paddw         m0, m3
    paddd         m1, m4
    psllw         m3, 2
    pslld         m4, 2
    paddd         m5, m7
    paddd         m2, m5
    pslld         m5, 2
    paddw         m0, m3 ; a5 565
    paddd         m1, m4 ; b5 565
    paddd         m2, m5
    mova          [t4+wq*1+400* 6+ 0], m0
    mova          [t3+wq*2+400*12+ 0], m1
    mova          [t3+wq*2+400*12+16], m2
    movu          m0, [t4+wq*1+400*2+ 4]
    movu          m1, [t3+wq*2+400*4+ 8]
    movu          m2, [t3+wq*2+400*4+24]
    movu          m3, [t4+wq*1+400*2+ 2]
    movu          m4, [t3+wq*2+400*4+ 4]
    movu          m5, [t3+wq*2+400*4+20]
    paddw         m0, [t4+wq*1+400*2+ 0]
    paddd         m1, [t3+wq*2+400*4+ 0]
    paddd         m2, [t3+wq*2+400*4+16]
    paddw         m3, m0
    paddd         m4, m1
    paddd         m5, m2
    psllw         m3, 2 ; a3[-1] 444
    pslld         m4, 2 ; b3[-1] 444
    pslld         m5, 2
    psubw         m3, m0 ; a3[-1] 343
    psubd         m4, m1 ; b3[-1] 343
    psubd         m5, m2
    mova          [t4+wq*1+400* 8+ 0], m3
    mova          [t3+wq*2+400*16+ 0], m4
    mova          [t3+wq*2+400*16+16], m5
    movu          m0, [t4+wq*1+400*4+ 4]
    movu          m1, [t3+wq*2+400*8+ 8]
    movu          m2, [t3+wq*2+400*8+24]
    movu          m3, [t4+wq*1+400*4+ 2]
    movu          m4, [t3+wq*2+400*8+ 4]
    movu          m5, [t3+wq*2+400*8+20]
    paddw         m0, [t4+wq*1+400*4+ 0]
    paddd         m1, [t3+wq*2+400*8+ 0]
    paddd         m2, [t3+wq*2+400*8+16]
    paddw         m3, m0
    paddd         m4, m1
    paddd         m5, m2
    psllw         m3, 2 ; a3[ 0] 444
    pslld         m4, 2 ; b3[ 0] 444
    pslld         m5, 2
    mova          [t4+wq*1+400*10+ 0], m3
    mova          [t3+wq*2+400*20+ 0], m4
    mova          [t3+wq*2+400*20+16], m5
    psubw         m3, m0 ; a3[ 0] 343
    psubd         m4, m1 ; b3[ 0] 343
    psubd         m5, m2
    mova          [t4+wq*1+400*12+ 0], m3
    mova          [t3+wq*2+400*24+ 0], m4
    mova          [t3+wq*2+400*24+16], m5
    add           wq, 16
    jl            .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64       wq, r4
    movif32       wd, w1m
.n0_loop:
    movu          m0, [t4+wq*1+ 4]
    movu          m2, [t4+wq*1+ 2]
    paddw         m0, [t4+wq*1+ 0]
    paddw         m0, m2
    paddw         m2, m0
    psllw         m0, 2
    paddw         m0, m2 ; a5
    movu          m4, [t3+wq*2+ 8]
    movu          m5, [t3+wq*2+24]
    movu          m1, [t3+wq*2+ 4]
    movu          m3, [t3+wq*2+20]
    paddd         m4, [t3+wq*2+ 0]
    paddd         m5, [t3+wq*2+16]
    paddd         m4, m1
    paddd         m5, m3
    paddd         m1, m4
    paddd         m3, m5
    pslld         m4, 2
    pslld         m5, 2
    paddd         m4, m1 ; b5
    paddd         m5, m3
    movu          m2, [t4+wq*1+400* 6]
    paddw         m2, m0
    mova          [t4+wq*1+400* 6], m0
    paddd         m0, m4, [t3+wq*2+400*12+ 0]
    paddd         m1, m5, [t3+wq*2+400*12+16]
    mova          [t3+wq*2+400*12+ 0], m4
    mova          [t3+wq*2+400*12+16], m5
    mova          [rsp+16+ARCH_X86_32*4], m1
    movu          m3, [t4+wq*1+400*2+4]
    movu          m5, [t4+wq*1+400*2+2]
    paddw         m3, [t4+wq*1+400*2+0]
    paddw         m5, m3
    psllw         m5, 2 ; a3[ 1] 444
    psubw         m4, m5, m3 ; a3[ 1] 343
    movu          m3, [t4+wq*1+400* 8]
    paddw         m3, [t4+wq*1+400*10]
    paddw         m3, m4
    mova          [t4+wq*1+400* 8], m4
    mova          [t4+wq*1+400*10], m5
    movu          m1, [t3+wq*2+400*4+ 8]
    movu          m5, [t3+wq*2+400*4+ 4]
    movu          m7, [t3+wq*2+400*4+24]
    movu          m8, [t3+wq*2+400*4+20]
    paddd         m1, [t3+wq*2+400*4+ 0]
    paddd         m7, [t3+wq*2+400*4+16]
    paddd         m5, m1
    paddd         m8, m7
    pslld         m5, 2 ; b3[ 1] 444
    pslld         m8, 2
    psubd         m4, m5, m1 ; b3[ 1] 343
%if ARCH_X86_32
    mova          [esp+52], m8
    psubd         m8, m7
%else
    psubd         m6, m8, m7
    SWAP          m8, m6
%endif
    paddd         m1, m4, [t3+wq*2+400*16+ 0]
    paddd         m7, m8, [t3+wq*2+400*16+16]
    paddd         m1, [t3+wq*2+400*20+ 0]
    paddd         m7, [t3+wq*2+400*20+16]
    mova          [t3+wq*2+400*16+ 0], m4
    mova          [t3+wq*2+400*16+16], m8
    mova          [t3+wq*2+400*20+ 0], m5
%if ARCH_X86_32
    mova          m8, [esp+52]
%else
    SWAP          m8, m6
    pxor          m6, m6
%endif
    mova          [t3+wq*2+400*20+16], m8
    mova          [rsp+32+ARCH_X86_32*4], m7
    movu          m5, [dstq+wq]
    punpcklwd     m4, m5, m6
    punpcklwd     m7, m2, m6
    pmaddwd       m7, m4 ; a5 * src
    punpcklwd     m8, m3, m6
    pmaddwd       m8, m4 ; a3 * src
    punpckhwd     m5, m6
    punpckhwd     m2, m6
    pmaddwd       m2, m5
    punpckhwd     m3, m6
    pmaddwd       m3, m5
    pslld         m4, 13
    pslld         m5, 13
    psubd         m0, m7 ; b5 - a5 * src + (1 << 8)
    psubd         m1, m8 ; b3 - a3 * src + (1 << 8)
    mova          m7, [base+pd_0xffff]
    psrld         m0, 9
    pslld         m1, 7
    pand          m0, m7
    pandn         m8, m7, m1
    por           m0, m8
    mova          m1, [rsp+16+ARCH_X86_32*4]
    mova          m8, [rsp+32+ARCH_X86_32*4]
    psubd         m1, m2
    psubd         m8, m3
    mova          m2, [base+pd_4096]
    psrld         m1, 9
    pslld         m8, 7
    pand          m1, m7
    pandn         m7, m8
    por           m1, m7
    pmaddwd       m0, m15
    pmaddwd       m1, m15
%if ARCH_X86_32
    pxor          m7, m7
%else
    SWAP          m7, m6
%endif
    paddd         m4, m2
    paddd         m5, m2
    paddd         m0, m4
    paddd         m1, m5
    psrad         m0, 8
    psrad         m1, 8
    packssdw      m0, m1 ; clip
    pmaxsw        m0, m7
    psrlw         m0, 5
    mova          [dstq+wq], m0
    add           wq, 16
    jl            .n0_loop
    add           dstq, stridemp
    ret
%if ARCH_X86_64
    SWAP          m6, m7
%endif
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64       wq, r4
    movif32       wd, w1m
.n1_loop:
    movu          m3, [t4+wq*1+400*4+4]
    movu          m5, [t4+wq*1+400*4+2]
    paddw         m3, [t4+wq*1+400*4+0]
    paddw         m5, m3
    psllw         m5, 2 ; a3[ 1] 444
    psubw         m4, m5, m3 ; a3[ 1] 343
    paddw         m3, m4, [t4+wq*1+400*12]
    paddw         m3, [t4+wq*1+400*10]
    mova          [t4+wq*1+400*10], m5
    mova          [t4+wq*1+400*12], m4
    movu          m1, [t3+wq*2+400*8+ 8]
    movu          m5, [t3+wq*2+400*8+ 4]
    movu          m7, [t3+wq*2+400*8+24]
    movu          m8, [t3+wq*2+400*8+20]
    paddd         m1, [t3+wq*2+400*8+ 0]
    paddd         m7, [t3+wq*2+400*8+16]
    paddd         m5, m1
    paddd         m8, m7
    pslld         m5, 2 ; b3[ 1] 444
    pslld         m8, 2
    psubd         m4, m5, m1 ; b3[ 1] 343
    psubd         m0, m8, m7
    paddd         m1, m4, [t3+wq*2+400*24+ 0]
    paddd         m7, m0, [t3+wq*2+400*24+16]
    paddd         m1, [t3+wq*2+400*20+ 0]
    paddd         m7, [t3+wq*2+400*20+16]
    mova          [t3+wq*2+400*20+ 0], m5
    mova          [t3+wq*2+400*20+16], m8
    mova          [t3+wq*2+400*24+ 0], m4
    mova          [t3+wq*2+400*24+16], m0
    mova          m5, [dstq+wq]
    mova          m2, [t4+wq*1+400* 6]
    punpcklwd     m4, m5, m6
    punpcklwd     m8, m2, m6
    pmaddwd       m8, m4 ; a5 * src
    punpcklwd     m0, m3, m6
    pmaddwd       m0, m4 ; a3 * src
    punpckhwd     m5, m6
    punpckhwd     m2, m6
    pmaddwd       m2, m5
    punpckhwd     m3, m6
    pmaddwd       m3, m5
    psubd         m1, m0 ; b3 - a3 * src + (1 << 8)
    pslld         m4, 13
    pslld         m5, 13
    mova          m0, [t3+wq*2+400*12+ 0]
    psubd         m0, m8 ; b5 - a5 * src + (1 << 8)
    mova          m8, [t3+wq*2+400*12+16]
    psubd         m8, m2
    psubd         m7, m3
    mova          m2, [base+pd_0xffff]
    pslld         m1, 7
    psrld         m0, 8
    psrld         m8, 8
    pslld         m7, 7
    pand          m0, m2
    pandn         m3, m2, m1
    por           m0, m3
    pand          m8, m2
    pandn         m2, m7
    por           m2, m8
    mova          m1, [base+pd_4096]
    pmaddwd       m0, m15
    pmaddwd       m2, m15
%if ARCH_X86_64
    SWAP          m7, m6
%endif
    pxor          m7, m7
    paddd         m4, m1
    paddd         m5, m1
    paddd         m0, m4
    paddd         m2, m5
    psrad         m0, 8
    psrad         m2, 8
    packssdw      m0, m2 ; clip
    pmaxsw        m0, m7
    psrlw         m0, 5
    mova          [dstq+wq], m0
    add           wq, 16
    jl            .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret