; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2019, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; repeat each argument as 8 identical bytes
%macro DUP8 1-*
 %rep %0
    times 8 db %1
 %rotate 1
 %endrep
%endmacro

; scaled reciprocals 840/n (840 = lcm(1..8)) used to normalize the direction
; costs by the number of contributing pixels; the ssse3 variant duplicates
; each value in both words of a dword for the MULLD emulation below
div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
                 dd 420, 210, 140, 105, 105, 105, 105, 105
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
                 dw 168, 168, 140, 140, 120, 120, 105, 105
                 dw 420, 420, 210, 210, 140, 140, 105, 105
                 dw 105, 105, 105, 105, 105, 105, 105, 105
const shufw_6543210x, \
                 db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
shufb_lohi:      db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_8:            times 8 dw 8
pw_128:          times 8 dw 128
pw_256:          times 8 dw 256
pw_2048:         times 8 dw 2048
pw_0x7FFF:       times 8 dw 0x7FFF
pw_0x8000:       times 8 dw 0x8000
tap_table:       ; masks for 8-bit shift emulation: 0xFF << n per byte; clearing
                 ; the low n bits first keeps psrlw from shifting a high byte's
                 ; bits into its low-byte neighbour
                 DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
                 ; weights
                 DUP8 4, 2, 3, 3, 2, 1
                 ; taps indices: (y, x) offsets packed as y*16 + x
                 ; (the px buffer pitch is 32 bytes = 16 words)
                 db -1 * 16 + 1, -2 * 16 + 2
                 db  0 * 16 + 1, -1 * 16 + 2
                 db  0 * 16 + 1,  0 * 16 + 2
                 db  0 * 16 + 1,  1 * 16 + 2
                 db  1 * 16 + 1,  2 * 16 + 2
                 db  1 * 16 + 0,  2 * 16 + 1
                 db  1 * 16 + 0,  2 * 16 + 0
                 db  1 * 16 + 0,  2 * 16 - 1
                 ; the last 6 are repeats of the first 6 so we don't need to & 7
                 db -1 * 16 + 1, -2 * 16 + 2
                 db  0 * 16 + 1, -1 * 16 + 2
                 db  0 * 16 + 1,  0 * 16 + 2
                 db  0 * 16 + 1,  1 * 16 + 2
                 db  1 * 16 + 1,  2 * 16 + 2
                 db  1 * 16 + 0,  2 * 16 + 1

SECTION .text

%macro movif32 2
 %if ARCH_X86_32
    mov          %1, %2
 %endif
%endmacro

%macro PMOVZXBW 2-3 0 ; %3 = half
 %if cpuflag(sse4) && %3 == 0
    pmovzxbw     %1, %2
 %else
  %if %3 == 1
    movd         %1, %2
  %else
    movq         %1, %2
  %endif
    punpcklbw    %1, m7
 %endif
%endmacro

%macro PSHUFB_0 2 ; broadcast byte 0 of %1 when pshufb is unavailable
 %if cpuflag(ssse3)
    pshufb       %1, %2
 %else
    punpcklbw    %1, %1
    pshuflw      %1, %1, q0000
    punpcklqdq   %1, %1
 %endif
%endmacro

%macro MOVDDUP 2
 %if cpuflag(ssse3)
    movddup      %1, %2
 %else
    movq         %1, %2
    punpcklqdq   %1, %1
 %endif
%endmacro

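; ACCUMULATE_TAP below is the CDEF constrain()-and-accumulate step. As a
; rough sketch of the intended arithmetic (following the C reference, not
; the literal instruction order):
;
;   constrain(diff, s) = sign(diff) * min(|diff|, max(0, s - (|diff| >> shift)))
;   sum += tap_weight * constrain(p0 - px, strength)
;        + tap_weight * constrain(p1 - px, strength)
;
; The emulation operates on packed 8-bit diffs: the tap_table mask makes
; psrlw act as a per-byte shift, psubusb provides the max(0, ...) clamp for
; free, and pminub applies the |diff| bound before the tap multiply.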
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
    ; load p0/p1
    movsx        offq, byte [dirq+kq+%1+14*8]  ; off1
 %if %6 == 4
    movq         m5, [stkq+offq*2+32*0]        ; p0
    movhps       m5, [stkq+offq*2+32*1]
 %else
    movu         m5, [stkq+offq*2+32*0]        ; p0
 %endif
    neg          offq                          ; -off1
 %if %6 == 4
    movq         m6, [stkq+offq*2+32*0]        ; p1
    movhps       m6, [stkq+offq*2+32*1]
 %else
    movu         m6, [stkq+offq*2+32*0]        ; p1
 %endif
 %if %7
  %if cpuflag(sse4)
    ; out-of-bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw       m7, m5
    pminuw       m8, m5
    pmaxsw       m7, m6
    pminuw       m8, m6
  %else
    pcmpeqw      m3, m14, m5
    pminsw       m8, m5                        ; min after p0
    pandn        m3, m5
    pmaxsw       m7, m3                        ; max after p0
    pcmpeqw      m3, m14, m6
    pminsw       m8, m6                        ; min after p1
    pandn        m3, m6
    pmaxsw       m7, m3                        ; max after p1
  %endif
 %endif

    ; accumulate sum[m0] over p0/p1
    psubw        m5, m4                        ; diff_p0(p0 - px)
    psubw        m6, m4                        ; diff_p1(p1 - px)
    packsswb     m5, m6                        ; convert pixel diff to 8-bit
 %if cpuflag(ssse3)
    pshufb       m5, m13                       ; group diffs p0 and p1 into pairs
    pabsb        m6, m5
    psignb       m3, %5, m5
 %else
    ; emulate pabsb/psignb: with s = (x < 0) ? -1 : 0, (x + s) ^ s = |x|
    ; (unlike psignb, zero diffs aren't zeroed, but constrain() is 0 there)
    movlhps      m6, m5
    punpckhbw    m6, m5
    pxor         m5, m5
    pcmpgtb      m5, m6
    paddb        m6, m5
    pxor         m6, m5
    paddb        m3, %5, m5
    pxor         m3, m5
 %endif
    pand         m9, %3, m6                    ; emulate 8-bit shift
    psrlw        m9, %2
    psubusb      m5, %4, m9
    pminub       m5, m6                        ; constrain(diff_p)
 %if cpuflag(ssse3)
    pmaddubsw    m5, m3                        ; constrain(diff_p) * taps
 %else
    ; emulate pmaddubsw: multiply the high and low byte of each word pair
    ; separately, then add the two products
    psrlw        m9, m5, 8
    psraw        m6, m3, 8
    psllw        m5, 8
    psllw        m3, 8
    pmullw       m9, m6
    pmulhw       m5, m3
    paddw        m5, m9
 %endif
    paddw        m0, m5
%endmacro

%macro LOAD_BODY 3 ; dst, src, block_width
 %if %3 == 4
    PMOVZXBW     m0, [%2+strideq*0]
    PMOVZXBW     m1, [%2+strideq*1]
    PMOVZXBW     m2, [%2+strideq*2]
    PMOVZXBW     m3, [%2+stride3q]
    mova         [%1+32*0], m0
    mova         [%1+32*1], m1
    mova         [%1+32*2], m2
    mova         [%1+32*3], m3
 %else
    movu         m0, [%2+strideq*0]
    movu         m1, [%2+strideq*1]
    movu         m2, [%2+strideq*2]
    movu         m3, [%2+stride3q]
    punpcklbw    m4, m0, m7
    punpckhbw    m0, m7
    mova         [%1+32*0+ 0], m4
    mova         [%1+32*0+16], m0
    punpcklbw    m4, m1, m7
    punpckhbw    m1, m7
    mova         [%1+32*1+ 0], m4
    mova         [%1+32*1+16], m1
    punpcklbw    m4, m2, m7
    punpckhbw    m2, m7
    mova         [%1+32*2+ 0], m4
    mova         [%1+32*2+16], m2
    punpcklbw    m4, m3, m7
    punpckhbw    m3, m7
    mova         [%1+32*3+ 0], m4
    mova         [%1+32*3+16], m3
 %endif
%endmacro

%macro CDEF_FILTER_END 2 ; w, minmax
    pxor         m6, m6
    pcmpgtw      m6, m0
    paddw        m0, m6                        ; sum -= (sum < 0)
 %if cpuflag(ssse3)
    pmulhrsw     m0, m15                       ; (sum + 8) >> 4
 %else
    paddw        m0, m15
    psraw        m0, 4
 %endif
    paddw        m4, m0                        ; px + rounded sum
 %if %2
    pminsw       m4, m7
    pmaxsw       m4, m8
 %endif
    packuswb     m4, m4
 %if %1 == 4
    movd         [dstq+strideq*0], m4
    psrlq        m4, 32
    movd         [dstq+strideq*1], m4
    add          stkq, 32*2
    lea          dstq, [dstq+strideq*2]
 %else
    movq         [dstq], m4
    add          stkq, 32
    add          dstq, strideq
 %endif
%endmacro

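; CDEF_FILTER expands to cdef_filter_%1x%2_8bpc(dst, stride, left, top,
; bottom, pri_strength, sec_strength, dir, damping, edges). The block and
; its 2-pixel border are first staged as 16-bit pixels in a stack buffer
; (px) with a fixed pitch of 32 bytes per row: row i starts at px+32*i, the
; two left-border columns sit at px+32*i-4, rows -2/-1 hold the top border
; and rows %2/%2+1 the bottom border. Unavailable neighbours are filled
; with the OUT_OF_BOUNDS sentinel so the filter loops need no edge tests.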
%macro CDEF_FILTER 2 ; w, h
 %if ARCH_X86_64
cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3*16+(%2+4)*32, \
                                dst, stride, left, top, bot, pri, dst4, edge, \
                                stride3
 %define px rsp+3*16+2*32
 %define base 0
 %else
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, -7*16-(%2+4)*32, \
                                dst, stride, left, edge, stride3
 %define topq r2
 %define botq r2
 %define dst4q r2
    LEA          r5, tap_table
 %define px esp+7*16+2*32
 %define base r5-tap_table
 %endif
    mov          edged, r9m
 %if cpuflag(sse4)
 %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
 %else
 %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
 %endif
    mova         m6, OUT_OF_BOUNDS_MEM
    pxor         m7, m7

    ; prepare pixel buffers - body/right
 %if %2 == 8
    lea          dst4q, [dstq+strideq*4]
 %endif
    lea          stride3q, [strideq*3]
    test         edgeb, 2                      ; have_right
    jz .no_right
    LOAD_BODY    px, dstq, %1
 %if %2 == 8
    LOAD_BODY    px+4*32, dst4q, %1
 %endif
    jmp .body_done
.no_right:
    PMOVZXBW     m0, [dstq+strideq*0], %1 == 4
    PMOVZXBW     m1, [dstq+strideq*1], %1 == 4
    PMOVZXBW     m2, [dstq+strideq*2], %1 == 4
    PMOVZXBW     m3, [dstq+stride3q ], %1 == 4
    mova         [px+32*0], m0
    mova         [px+32*1], m1
    mova         [px+32*2], m2
    mova         [px+32*3], m3
    movd         [px+32*0+%1*2], m6
    movd         [px+32*1+%1*2], m6
    movd         [px+32*2+%1*2], m6
    movd         [px+32*3+%1*2], m6
 %if %2 == 8
    PMOVZXBW     m0, [dst4q+strideq*0], %1 == 4
    PMOVZXBW     m1, [dst4q+strideq*1], %1 == 4
    PMOVZXBW     m2, [dst4q+strideq*2], %1 == 4
    PMOVZXBW     m3, [dst4q+stride3q ], %1 == 4
    mova         [px+32*4], m0
    mova         [px+32*5], m1
    mova         [px+32*6], m2
    mova         [px+32*7], m3
    movd         [px+32*4+%1*2], m6
    movd         [px+32*5+%1*2], m6
    movd         [px+32*6+%1*2], m6
    movd         [px+32*7+%1*2], m6
 %endif
.body_done:

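; edges bitmask used throughout: bit 0 = have_left, bit 1 = have_right,
; bit 2 = have_top, bit 3 = have_bottom. Each missing side is padded with
; the OUT_OF_BOUNDS sentinel instead of real pixels.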
    ; top
    movifnidn    topq, r3mp
    test         edgeb, 4                      ; have_top
    jz .no_top
    test         edgeb, 1                      ; have_left
    jz .top_no_left
    test         edgeb, 2                      ; have_right
    jz .top_no_right
 %if %1 == 4
    PMOVZXBW     m0, [topq+strideq*0-2]
    PMOVZXBW     m1, [topq+strideq*1-2]
 %else
    movu         m0, [topq+strideq*0-4]
    movu         m1, [topq+strideq*1-4]
    punpckhbw    m2, m0, m7
    punpcklbw    m0, m7
    punpckhbw    m3, m1, m7
    punpcklbw    m1, m7
    movu         [px-32*2+8], m2
    movu         [px-32*1+8], m3
 %endif
    movu         [px-32*2-%1], m0
    movu         [px-32*1-%1], m1
    jmp .top_done
.top_no_right:
 %if %1 == 4
    PMOVZXBW     m0, [topq+strideq*0-%1]
    PMOVZXBW     m1, [topq+strideq*1-%1]
    movu         [px-32*2-8], m0
    movu         [px-32*1-8], m1
 %else
    movu         m0, [topq+strideq*0-%1]
    movu         m1, [topq+strideq*1-%1]
    punpckhbw    m2, m0, m7
    punpcklbw    m0, m7
    punpckhbw    m3, m1, m7
    punpcklbw    m1, m7
    mova         [px-32*2-16], m0
    mova         [px-32*2+ 0], m2
    mova         [px-32*1-16], m1
    mova         [px-32*1+ 0], m3
 %endif
    movd         [px-32*2+%1*2], m6
    movd         [px-32*1+%1*2], m6
    jmp .top_done
.top_no_left:
    test         edgeb, 2                      ; have_right
    jz .top_no_left_right
 %if %1 == 4
    PMOVZXBW     m0, [topq+strideq*0]
    PMOVZXBW     m1, [topq+strideq*1]
 %else
    movu         m0, [topq+strideq*0]
    movu         m1, [topq+strideq*1]
    punpckhbw    m2, m0, m7
    punpcklbw    m0, m7
    punpckhbw    m3, m1, m7
    punpcklbw    m1, m7
    movd         [px-32*2+16], m2
    movd         [px-32*1+16], m3
 %endif
    movd         [px-32*2- 4], m6
    movd         [px-32*1- 4], m6
    mova         [px-32*2+ 0], m0
    mova         [px-32*1+ 0], m1
    jmp .top_done
.top_no_left_right:
    PMOVZXBW     m0, [topq+strideq*0], %1 == 4
    PMOVZXBW     m1, [topq+strideq*1], %1 == 4
    movd         [px-32*2-4], m6
    movd         [px-32*1-4], m6
    mova         [px-32*2+0], m0
    mova         [px-32*1+0], m1
    movd         [px-32*2+%1*2], m6
    movd         [px-32*1+%1*2], m6
    jmp .top_done
.no_top:
    movu         [px-32*2- 4], m6
    movu         [px-32*1- 4], m6
 %if %1 == 8
    movq         [px-32*2+12], m6
    movq         [px-32*1+12], m6
 %endif
.top_done:

    ; left
    test         edgeb, 1                      ; have_left
    jz .no_left
    movifnidn    leftq, leftmp
 %if %2 == 4
    movq         m0, [leftq]
    punpcklbw    m0, m7
 %else
    movu         m0, [leftq]
    punpckhbw    m1, m0, m7
    punpcklbw    m0, m7
    movhlps      m3, m1
    movd         [px+32*4-4], m1
    movd         [px+32*6-4], m3
    psrlq        m1, 32
    psrlq        m3, 32
    movd         [px+32*5-4], m1
    movd         [px+32*7-4], m3
 %endif
    movhlps      m2, m0
    movd         [px+32*0-4], m0
    movd         [px+32*2-4], m2
    psrlq        m0, 32
    psrlq        m2, 32
    movd         [px+32*1-4], m0
    movd         [px+32*3-4], m2
    jmp .left_done
.no_left:
    movd         [px+32*0-4], m6
    movd         [px+32*1-4], m6
    movd         [px+32*2-4], m6
    movd         [px+32*3-4], m6
 %if %2 == 8
    movd         [px+32*4-4], m6
    movd         [px+32*5-4], m6
    movd         [px+32*6-4], m6
    movd         [px+32*7-4], m6
 %endif
.left_done:

    ; bottom
    movifnidn    botq, r4mp
    test         edgeb, 8                      ; have_bottom
    jz .no_bottom
    test         edgeb, 1                      ; have_left
    jz .bottom_no_left
    test         edgeb, 2                      ; have_right
    jz .bottom_no_right
 %if %1 == 4
    PMOVZXBW     m0, [botq+strideq*0-(%1/2)]
    PMOVZXBW     m1, [botq+strideq*1-(%1/2)]
 %else
    movu         m0, [botq+strideq*0-4]
    movu         m1, [botq+strideq*1-4]
    punpckhbw    m2, m0, m7
    punpcklbw    m0, m7
    punpckhbw    m3, m1, m7
    punpcklbw    m1, m7
    movu         [px+32*(%2+0)+8], m2
    movu         [px+32*(%2+1)+8], m3
 %endif
    movu         [px+32*(%2+0)-%1], m0
    movu         [px+32*(%2+1)-%1], m1
    jmp .bottom_done
.bottom_no_right:
 %if %1 == 4
    PMOVZXBW     m0, [botq+strideq*0-4]
    PMOVZXBW     m1, [botq+strideq*1-4]
    movu         [px+32*(%2+0)-8], m0
    movu         [px+32*(%2+1)-8], m1
 %else
    movu         m0, [botq+strideq*0-8]
    movu         m1, [botq+strideq*1-8]
    punpckhbw    m2, m0, m7
    punpcklbw    m0, m7
    punpckhbw    m3, m1, m7
    punpcklbw    m1, m7
    mova         [px+32*(%2+0)-16], m0
    mova         [px+32*(%2+0)+ 0], m2
    mova         [px+32*(%2+1)-16], m1
    mova         [px+32*(%2+1)+ 0], m3
    movd         [px+32*(%2-1)+16], m6         ; restore the OOB sentinel
                                               ; clobbered by the first mova
 %endif
    movd         [px+32*(%2+0)+%1*2], m6
    movd         [px+32*(%2+1)+%1*2], m6
    jmp .bottom_done
.bottom_no_left:
    test         edgeb, 2                      ; have_right
    jz .bottom_no_left_right
 %if %1 == 4
    PMOVZXBW     m0, [botq+strideq*0]
    PMOVZXBW     m1, [botq+strideq*1]
 %else
    movu         m0, [botq+strideq*0]
    movu         m1, [botq+strideq*1]
    punpckhbw    m2, m0, m7
    punpcklbw    m0, m7
    punpckhbw    m3, m1, m7
    punpcklbw    m1, m7
    mova         [px+32*(%2+0)+16], m2
    mova         [px+32*(%2+1)+16], m3
 %endif
    mova         [px+32*(%2+0)+ 0], m0
    mova         [px+32*(%2+1)+ 0], m1
    movd         [px+32*(%2+0)- 4], m6
    movd         [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.bottom_no_left_right:
    PMOVZXBW     m0, [botq+strideq*0], %1 == 4
    PMOVZXBW     m1, [botq+strideq*1], %1 == 4
    mova         [px+32*(%2+0)+ 0], m0
    mova         [px+32*(%2+1)+ 0], m1
    movd         [px+32*(%2+0)+%1*2], m6
    movd         [px+32*(%2+1)+%1*2], m6
    movd         [px+32*(%2+0)- 4], m6
    movd         [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.no_bottom:
    movu         [px+32*(%2+0)- 4], m6
    movu         [px+32*(%2+1)- 4], m6
 %if %1 == 8
    movq         [px+32*(%2+0)+12], m6
    movq         [px+32*(%2+1)+12], m6
 %endif
.bottom_done:

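; The strength/shift setup below mirrors the C reference:
;   pri_shift = max(0, damping - ulog2(pri_strength))
;   sec_shift = damping - ulog2(sec_strength)
; bsr computes ulog2(pri_strength); sec_strength is a power of two, so
; tzcnt serves as its ulog2. pri_strength & 1 selects between the (4,2)
; and (3,3) primary tap weight pairs in tap_table.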
    ; actual filter
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
    mova         m13, [shufb_lohi]
 %if cpuflag(ssse3)
    mova         m15, [pw_2048]
 %else
    mova         m15, [pw_8]
 %endif
    mova         m14, m6
 %else
    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
 %xdefine m8 m1
 %xdefine m9 m2
 %xdefine m10 m0
 %xdefine m13 [base+shufb_lohi]
 %xdefine m14 OUT_OF_BOUNDS_MEM
 %if cpuflag(ssse3)
 %xdefine m15 [base+pw_2048]
 %else
 %xdefine m15 [base+pw_8]
 %endif
 %endif
    movifnidn    prid, r5m
    movifnidn    secd, r6m
    mov          dampingd, r8m
    movif32      [esp+0x3C], r1d
    test         prid, prid
    jz .sec_only
    movd         m1, r5m
    bsr          pridmpd, prid
    test         secd, secd
    jz .pri_only
    movd         m10, r6m
    tzcnt        secd, secd
    and          prid, 1
    sub          pridmpd, dampingd
    sub          secd, dampingd
    xor          dampingd, dampingd
    add          prid, prid
    neg          pridmpd
    cmovs        pridmpd, dampingd
    neg          secd
    PSHUFB_0     m1, m7
    PSHUFB_0     m10, m7
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec
    lea          tapq, [tap_table]
    MOVDDUP      m11, [tapq+pridmpq*8]         ; pri_shift_mask
    MOVDDUP      m12, [tapq+secq*8]            ; sec_shift_mask
    mov          [rsp+0x00], pridmpq           ; pri_shift
    mov          [rsp+0x10], secq              ; sec_shift
    DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off
 %else
    MOVDDUP      m2, [tapq+pridmpq*8]
    MOVDDUP      m3, [tapq+secq*8]
    mov          [esp+0x04], dampingd          ; zero upper 32 bits of psrlw
    mov          [esp+0x34], dampingd          ; source operand in ACCUMULATE_TAP
    mov          [esp+0x00], pridmpd
    mov          [esp+0x30], secd
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
 %define offq dstq
 %define kd strided
 %define kq strideq
    mova         [esp+0x10], m2
    mova         [esp+0x40], m3
    mova         [esp+0x20], m1
    mova         [esp+0x50], m10
 %endif
    mov          dird, r7m
    lea          stkq, [px]
    lea          priq, [tapq+8*8+priq*8]       ; pri_taps
    mov          hd, %1*%2/8
    lea          dirq, [tapq+dirq*2]
.v_loop:
    movif32      [esp+0x38], dstd
    mov          kd, 1
 %if %1 == 4
    movq         m4, [stkq+32*0]
    movhps       m4, [stkq+32*1]
 %else
    mova         m4, [stkq+32*0]               ; px
 %endif
    pxor         m0, m0                        ; sum
    mova         m7, m4                        ; max
    mova         m8, m4                        ; min
.k_loop:
    MOVDDUP      m2, [priq+kq*8]
 %if ARCH_X86_64
    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
    MOVDDUP      m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
 %else
    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
    MOVDDUP      m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
    MOVDDUP      m2, [tapq+12*8+kq*8]          ; reload: m9 aliases m2 on x86-32
    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
 %endif
    dec          kd
    jge .k_loop
    movif32      dstq, [esp+0x38]
    movif32      strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 1
    dec          hd
    jg .v_loop
    RET

.pri_only:
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap
    lea          tapq, [tap_table]
 %else
    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
 %endif
    and          prid, 1
    xor          zerod, zerod
    sub          dampingd, pridmpd
    cmovs        dampingd, zerod
    add          prid, prid
    PSHUFB_0     m1, m7
    MOVDDUP      m7, [tapq+dampingq*8]
    mov          [rsp+0x00], dampingq
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off
 %else
    mov          [rsp+0x04], zerod
    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
 %endif
    mov          dird, r7m
    lea          stkq, [px]
    lea          priq, [tapq+8*8+priq*8]
    mov          hd, %1*%2/8
    lea          dirq, [tapq+dirq*2]
.pri_v_loop:
    movif32      [esp+0x38], dstd
    mov          kd, 1
 %if %1 == 4
    movq         m4, [stkq+32*0]
    movhps       m4, [stkq+32*1]
 %else
    mova         m4, [stkq+32*0]
 %endif
    pxor         m0, m0
.pri_k_loop:
    MOVDDUP      m2, [priq+kq*8]
    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
    dec          kd
    jge .pri_k_loop
    movif32      dstq, [esp+0x38]
    movif32      strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec          hd
    jg .pri_v_loop
    RET

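; Secondary-only path: pri_strength == 0, so only the secondary taps along
; dir±2 (weights 2 and 1) are accumulated. Like the primary-only path it
; skips min/max tracking (CDEF_FILTER_END minmax=0); the full clamp is only
; applied on the combined pri+sec path.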
.sec_only:
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec
 %else
    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
 %endif
    movd         m1, r6m
    tzcnt        secd, secd
    mov          dird, r7m
    xor          zerod, zerod
    sub          dampingd, secd
    cmovs        dampingd, zerod
    PSHUFB_0     m1, m7
 %if ARCH_X86_64
    lea          tapq, [tap_table]
 %else
    mov          [rsp+0x04], zerod
 %endif
    mov          [rsp+0x00], dampingq
    MOVDDUP      m7, [tapq+dampingq*8]
    lea          dirq, [tapq+dirq*2]
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k
 %else
    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
 %endif
    lea          stkq, [px]
    mov          hd, %1*%2/8
.sec_v_loop:
    mov          kd, 1
 %if %1 == 4
    movq         m4, [stkq+32*0]
    movhps       m4, [stkq+32*1]
 %else
    mova         m4, [stkq+32*0]
 %endif
    pxor         m0, m0
.sec_k_loop:
    MOVDDUP      m2, [tapq+12*8+kq*8]
    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
 %if ARCH_X86_32
    MOVDDUP      m2, [tapq+12*8+kq*8]          ; reload: m9 aliases m2 on x86-32
 %endif
    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
    dec          kd
    jge .sec_k_loop
    movif32      strideq, [esp+0x3C]
    CDEF_FILTER_END %1, 0
    dec          hd
    jg .sec_v_loop
    RET
%endmacro

; 32-bit multiply keeping the low dword, emulated on pre-sse4: %2 must hold
; each 16-bit constant duplicated in both words of its dword (as in
; div_table_ssse3), so that a*b = lo(a)*b + ((hi(a)*b) << 16)
%macro MULLD 2
 %if cpuflag(sse4)
    pmulld       %1, %2
 %else
  %if ARCH_X86_32
 %define m15 m1
  %endif
    pmulhuw      m15, %1, %2
    pmullw       %1, %2
    pslld        m15, 16
    paddd        %1, m15
 %endif
%endmacro

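; cdef_dir_8bpc() finds the dominant edge direction of an 8x8 block. In
; rough terms (per the C reference): pixels are centred around zero
; (p - 128), partial sums are taken along 8 candidate directions, and each
; direction's cost is the sum of squared partial sums normalized by the
; number of contributors (the 840/n tables above). The direction with the
; highest cost wins; the variance output is derived from the gap between
; the best cost and the cost of the orthogonal direction.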
%macro CDEF_DIR 0
 %if ARCH_X86_64
cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
    lea          r6, [strideq*3]
    movq         m1, [srcq+strideq*0]
    movhps       m1, [srcq+strideq*1]
    movq         m3, [srcq+strideq*2]
    movhps       m3, [srcq+r6       ]
    lea          srcq, [srcq+strideq*4]
    movq         m5, [srcq+strideq*0]
    movhps       m5, [srcq+strideq*1]
    movq         m7, [srcq+strideq*2]
    movhps       m7, [srcq+r6       ]

    pxor         m8, m8
    psadbw       m9, m1, m8
    psadbw       m2, m3, m8
    psadbw       m4, m5, m8
    psadbw       m6, m7, m8
    packssdw     m9, m2
    packssdw     m4, m6
    packssdw     m9, m4

    punpcklbw    m0, m1, m8
    punpckhbw    m1, m8
    punpcklbw    m2, m3, m8
    punpckhbw    m3, m8
    punpcklbw    m4, m5, m8
    punpckhbw    m5, m8
    punpcklbw    m6, m7, m8
    punpckhbw    m7, m8
cglobal_label .main
    mova         m8, [pw_128]
    psubw        m0, m8
    psubw        m1, m8
    psubw        m2, m8
    psubw        m3, m8
    psubw        m4, m8
    psubw        m5, m8
    psubw        m6, m8
    psubw        m7, m8
    psllw        m8, 3
    psubw        m9, m8                        ; partial_sum_hv[0]

    paddw        m8, m0, m1
    paddw        m10, m2, m3
    paddw        m8, m4
    paddw        m10, m5
    paddw        m8, m6
    paddw        m10, m7
    paddw        m8, m10                       ; partial_sum_hv[1]

    pmaddwd      m8, m8
    pmaddwd      m9, m9
    phaddd       m9, m8
    SWAP         m8, m9
    MULLD        m8, [div_table%+SUFFIX+48]

    pslldq       m9, m1, 2
    psrldq       m10, m1, 14
    pslldq       m11, m2, 4
    psrldq       m12, m2, 12
    pslldq       m13, m3, 6
    psrldq       m14, m3, 10
    paddw        m9, m0
    paddw        m10, m12
    paddw        m11, m13
    paddw        m10, m14                      ; partial_sum_diag[0] top/right half
    paddw        m9, m11                       ; partial_sum_diag[0] top/left half
    pslldq       m11, m4, 8
    psrldq       m12, m4, 8
    pslldq       m13, m5, 10
    psrldq       m14, m5, 6
    paddw        m9, m11
    paddw        m10, m12
    paddw        m9, m13
    paddw        m10, m14
    pslldq       m11, m6, 12
    psrldq       m12, m6, 4
    pslldq       m13, m7, 14
    psrldq       m14, m7, 2
    paddw        m9, m11
    paddw        m10, m12
    paddw        m9, m13                       ; partial_sum_diag[0][0-7]
    paddw        m10, m14                      ; partial_sum_diag[0][8-14,zero]
    pshufb       m10, [shufw_6543210x]
    punpckhwd    m11, m9, m10
    punpcklwd    m9, m10
    pmaddwd      m11, m11
    pmaddwd      m9, m9
    MULLD        m11, [div_table%+SUFFIX+16]
    MULLD        m9, [div_table%+SUFFIX+0]
    paddd        m9, m11                       ; cost[0a-d]

    pslldq       m10, m0, 14
    psrldq       m11, m0, 2
    pslldq       m12, m1, 12
    psrldq       m13, m1, 4
    pslldq       m14, m2, 10
    psrldq       m15, m2, 6
    paddw        m10, m12
    paddw        m11, m13
    paddw        m10, m14
    paddw        m11, m15
    pslldq       m12, m3, 8
    psrldq       m13, m3, 8
    pslldq       m14, m4, 6
    psrldq       m15, m4, 10
    paddw        m10, m12
    paddw        m11, m13
    paddw        m10, m14
    paddw        m11, m15
    pslldq       m12, m5, 4
    psrldq       m13, m5, 12
    pslldq       m14, m6, 2
    psrldq       m15, m6, 14
    paddw        m10, m12
    paddw        m11, m13
    paddw        m10, m14
    paddw        m11, m15                      ; partial_sum_diag[1][8-14,zero]
    paddw        m10, m7                       ; partial_sum_diag[1][0-7]
    pshufb       m11, [shufw_6543210x]
    punpckhwd    m12, m10, m11
    punpcklwd    m10, m11
    pmaddwd      m12, m12
    pmaddwd      m10, m10
    MULLD        m12, [div_table%+SUFFIX+16]
    MULLD        m10, [div_table%+SUFFIX+0]
    paddd        m10, m12                      ; cost[4a-d]
    phaddd       m9, m10                       ; cost[0a/b,4a/b]

    paddw        m10, m0, m1
    paddw        m11, m2, m3
    paddw        m12, m4, m5
    paddw        m13, m6, m7
    phaddw       m0, m4
    phaddw       m1, m5
    phaddw       m2, m6
    phaddw       m3, m7

    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
    pslldq       m4, m11, 2
    psrldq       m5, m11, 14
    pslldq       m6, m12, 4
    psrldq       m7, m12, 12
    pslldq       m14, m13, 6
    psrldq       m15, m13, 10
    paddw        m4, m10
    paddw        m5, m7
    paddw        m4, m6
    paddw        m5, m15                       ; partial_sum_alt[3] right
    paddw        m4, m14                       ; partial_sum_alt[3] left
    pshuflw      m6, m5, q3012
    punpckhwd    m5, m4
    punpcklwd    m4, m6
    pmaddwd      m5, m5
    pmaddwd      m4, m4
    MULLD        m5, [div_table%+SUFFIX+48]
    MULLD        m4, [div_table%+SUFFIX+32]
    paddd        m4, m5                        ; cost[7a-d]

    pslldq       m5, m10, 6
    psrldq       m6, m10, 10
    pslldq       m7, m11, 4
    psrldq       m10, m11, 12
    pslldq       m11, m12, 2
    psrldq       m12, 14
    paddw        m5, m7
    paddw        m6, m10
    paddw        m5, m11
    paddw        m6, m12
    paddw        m5, m13
    pshuflw      m7, m6, q3012
    punpckhwd    m6, m5
    punpcklwd    m5, m7
    pmaddwd      m6, m6
    pmaddwd      m5, m5
    MULLD        m6, [div_table%+SUFFIX+48]
    MULLD        m5, [div_table%+SUFFIX+32]
    paddd        m5, m6                        ; cost[5a-d]

    pslldq       m6, m1, 2
    psrldq       m7, m1, 14
    pslldq       m10, m2, 4
    psrldq       m11, m2, 12
    pslldq       m12, m3, 6
    psrldq       m13, m3, 10
    paddw        m6, m0
    paddw        m7, m11
    paddw        m6, m10
    paddw        m7, m13                       ; partial_sum_alt[0] right
    paddw        m6, m12                       ; partial_sum_alt[0] left
    pshuflw      m10, m7, q3012
    punpckhwd    m7, m6
    punpcklwd    m6, m10
    pmaddwd      m7, m7
    pmaddwd      m6, m6
    MULLD        m7, [div_table%+SUFFIX+48]
    MULLD        m6, [div_table%+SUFFIX+32]
    paddd        m6, m7                        ; cost[1a-d]

    pshufd       m0, m0, q1032
    pshufd       m1, m1, q1032
    pshufd       m2, m2, q1032
    pshufd       m3, m3, q1032

    pslldq       m10, m0, 6
    psrldq       m11, m0, 10
    pslldq       m12, m1, 4
    psrldq       m13, m1, 12
    pslldq       m14, m2, 2
    psrldq       m2, 14
    paddw        m10, m12
    paddw        m11, m13
    paddw        m10, m14
    paddw        m11, m2
    paddw        m10, m3
    pshuflw      m12, m11, q3012
    punpckhwd    m11, m10
    punpcklwd    m10, m12
    pmaddwd      m11, m11
    pmaddwd      m10, m10
    MULLD        m11, [div_table%+SUFFIX+48]
    MULLD        m10, [div_table%+SUFFIX+32]
    paddd        m10, m11                      ; cost[3a-d]

    phaddd       m9, m8                        ; cost[0,4,2,6]
    phaddd       m6, m10
    phaddd       m5, m4
    phaddd       m6, m5                        ; cost[1,3,5,7]
    pshufd       m4, m9, q3120

    ; now find the best cost
 %if cpuflag(sse4)
    pmaxsd       m9, m6
    pshufd       m0, m9, q1032
    pmaxsd       m0, m9
    pshufd       m1, m0, q2301
    pmaxsd       m0, m1                        ; best cost
 %else
    pcmpgtd      m0, m9, m6
    pand         m9, m0
    pandn        m0, m6
    por          m9, m0
    pshufd       m1, m9, q1032
    pcmpgtd      m0, m9, m1
    pand         m9, m0
    pandn        m0, m1
    por          m9, m0
    pshufd       m1, m9, q2301
    pcmpgtd      m0, m9, m1
    pand         m9, m0
    pandn        m0, m1
    por          m0, m9
 %endif

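; Extraction trick below: cost[0,4,2,6] and cost[1,3,5,7] are reordered
; into cost[0..3]/cost[4..7]; once each lane is compared against the
; broadcast best cost, a single packssdw+pmovmskb+tzcnt yields the winning
; direction, while the same stacked "emulated ymm" holds
; best_cost - cost[dir ^ 4], from which the variance is
; (cost[dir] - cost[dir ^ 4]) >> 10.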
    ; get direction and variance
    punpckhdq    m1, m4, m6
    punpckldq    m4, m6
    psubd        m2, m0, m1
    psubd        m3, m0, m4
 %if WIN64
    WIN64_RESTORE_XMM
 %define tmp rsp+stack_offset+8
 %else
 %define tmp rsp-40
 %endif
    mova         [tmp+0x00], m2                ; emulate ymm in stack
    mova         [tmp+0x10], m3
    pcmpeqd      m1, m0                        ; compute best cost mask
    pcmpeqd      m4, m0
    packssdw     m4, m1
    pmovmskb     eax, m4                       ; get byte-idx from mask
    tzcnt        eax, eax
    mov          r1d, [tmp+rax*2]              ; get idx^4 complement from emulated ymm
    shr          eax, 1                        ; get direction by converting byte-idx to word-idx
    shr          r1d, 10
    mov          [varq], r1d
 %else
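; x86-32 variant: only 8 xmm registers are available, so the centred row
; sums are spilled to the stack (esp+0x00..0x50) and reloaded as needed;
; the flow otherwise mirrors the 64-bit version above.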
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
 %define base r2-shufw_6543210x
    LEA          r2, shufw_6543210x
    pxor         m0, m0
    lea          stride3q, [strideq*3]
    movq         m5, [srcq+strideq*0]
    movhps       m5, [srcq+strideq*1]
    movq         m7, [srcq+strideq*2]
    movhps       m7, [srcq+stride3q]
    mova         m1, [base+pw_128]
    psadbw       m2, m5, m0
    psadbw       m3, m7, m0
    packssdw     m2, m3
    punpcklbw    m4, m5, m0
    punpckhbw    m5, m0
    punpcklbw    m6, m7, m0
    punpckhbw    m7, m0
    psubw        m4, m1
    psubw        m5, m1
    psubw        m6, m1
    psubw        m7, m1

    mova         [esp+0x00], m4
    mova         [esp+0x10], m5
    mova         [esp+0x20], m6
    mova         [esp+0x50], m7

    lea          srcq, [srcq+strideq*4]
    movq         m5, [srcq+strideq*0]
    movhps       m5, [srcq+strideq*1]
    movq         m7, [srcq+strideq*2]
    movhps       m7, [srcq+stride3q]
    psadbw       m3, m5, m0
    psadbw       m0, m7
    packssdw     m3, m0
    pxor         m0, m0
    punpcklbw    m4, m5, m0
    punpckhbw    m5, m0
    punpcklbw    m6, m7, m0
    punpckhbw    m7, m0
cglobal_label .main
    psubw        m4, m1
    psubw        m5, m1
    psubw        m6, m1
    psubw        m7, m1
    packssdw     m2, m3
    psllw        m1, 3
    psubw        m2, m1                        ; partial_sum_hv[0]
    pmaddwd      m2, m2

    mova         m3, [esp+0x50]
    mova         m0, [esp+0x00]
    paddw        m0, [esp+0x10]
    paddw        m1, m3, [esp+0x20]
    paddw        m0, m4
    paddw        m1, m5
    paddw        m0, m6
    paddw        m1, m7
    paddw        m0, m1                        ; partial_sum_hv[1]
    pmaddwd      m0, m0

    phaddd       m2, m0
    MULLD        m2, [base+div_table%+SUFFIX+48]
    mova         [esp+0x30], m2

    mova         m1, [esp+0x10]
    pslldq       m0, m1, 2
    psrldq       m1, 14
    paddw        m0, [esp+0x00]
    pslldq       m2, m3, 6
    psrldq       m3, 10
    paddw        m0, m2
    paddw        m1, m3
    mova         m3, [esp+0x20]
    pslldq       m2, m3, 4
    psrldq       m3, 12
    paddw        m0, m2                        ; partial_sum_diag[0] top/left half
    paddw        m1, m3                        ; partial_sum_diag[0] top/right half
    pslldq       m2, m4, 8
    psrldq       m3, m4, 8
    paddw        m0, m2
    paddw        m1, m3
    pslldq       m2, m5, 10
    psrldq       m3, m5, 6
    paddw        m0, m2
    paddw        m1, m3
    pslldq       m2, m6, 12
    psrldq       m3, m6, 4
    paddw        m0, m2
    paddw        m1, m3
    pslldq       m2, m7, 14
    psrldq       m3, m7, 2
    paddw        m0, m2                        ; partial_sum_diag[0][0-7]
    paddw        m1, m3                        ; partial_sum_diag[0][8-14,zero]
    mova         m3, [esp+0x50]
    pshufb       m1, [base+shufw_6543210x]
    punpckhwd    m2, m0, m1
    punpcklwd    m0, m1
    pmaddwd      m2, m2
    pmaddwd      m0, m0
    MULLD        m2, [base+div_table%+SUFFIX+16]
    MULLD        m0, [base+div_table%+SUFFIX+ 0]
    paddd        m0, m2                        ; cost[0a-d]
    mova         [esp+0x40], m0

    mova         m1, [esp+0x00]
    pslldq       m0, m1, 14
    psrldq       m1, 2
    paddw        m0, m7
    pslldq       m2, m3, 8
    psrldq       m3, 8
    paddw        m0, m2
    paddw        m1, m3
    mova         m3, [esp+0x20]
    pslldq       m2, m3, 10
    psrldq       m3, 6
    paddw        m0, m2
    paddw        m1, m3
    mova         m3, [esp+0x10]
    pslldq       m2, m3, 12
    psrldq       m3, 4
    paddw        m0, m2
    paddw        m1, m3
    pslldq       m2, m4, 6
    psrldq       m3, m4, 10
    paddw        m0, m2
    paddw        m1, m3
    pslldq       m2, m5, 4
    psrldq       m3, m5, 12
    paddw        m0, m2
    paddw        m1, m3
    pslldq       m2, m6, 2
    psrldq       m3, m6, 14
    paddw        m0, m2                        ; partial_sum_diag[1][0-7]
    paddw        m1, m3                        ; partial_sum_diag[1][8-14,zero]
    mova         m3, [esp+0x50]
    pshufb       m1, [base+shufw_6543210x]
    punpckhwd    m2, m0, m1
    punpcklwd    m0, m1
    pmaddwd      m2, m2
    pmaddwd      m0, m0
    MULLD        m2, [base+div_table%+SUFFIX+16]
    MULLD        m0, [base+div_table%+SUFFIX+ 0]
    paddd        m0, m2                        ; cost[4a-d]
    phaddd       m1, [esp+0x40], m0            ; cost[0a/b,4a/b]
    phaddd       m1, [esp+0x30]                ; cost[0,4,2,6]
    mova         [esp+0x30], m1

    phaddw       m0, [esp+0x00], m4
    phaddw       m1, [esp+0x10], m5
    paddw        m4, m5
    mova         m2, [esp+0x20]
    paddw        m5, m2, m3
    phaddw       m2, m6
    paddw        m6, m7
    phaddw       m3, m7
    mova         m7, [esp+0x00]
    paddw        m7, [esp+0x10]
    mova         [esp+0x00], m0
    mova         [esp+0x10], m1
    mova         [esp+0x20], m2

    pslldq       m1, m4, 4
    pslldq       m2, m6, 6
    pslldq       m0, m5, 2
    paddw        m1, m2
    paddw        m0, m7
    psrldq       m2, m5, 14
    paddw        m0, m1                        ; partial_sum_alt[3] left
    psrldq       m1, m4, 12
    paddw        m1, m2
    psrldq       m2, m6, 10
    paddw        m1, m2                        ; partial_sum_alt[3] right
    pshuflw      m1, m1, q3012
    punpckhwd    m2, m0, m1
    punpcklwd    m0, m1
    pmaddwd      m2, m2
    pmaddwd      m0, m0
    MULLD        m2, [base+div_table%+SUFFIX+48]
    MULLD        m0, [base+div_table%+SUFFIX+32]
    paddd        m0, m2                        ; cost[7a-d]
    mova         [esp+0x40], m0

    pslldq       m0, m7, 6
    psrldq       m7, 10
    pslldq       m1, m5, 4
    psrldq       m5, 12
    pslldq       m2, m4, 2
    psrldq       m4, 14
    paddw        m0, m6
    paddw        m7, m5
    paddw        m0, m1
    paddw        m7, m4
    paddw        m0, m2
    pshuflw      m2, m7, q3012
    punpckhwd    m7, m0
    punpcklwd    m0, m2
    pmaddwd      m7, m7
    pmaddwd      m0, m0
    MULLD        m7, [base+div_table%+SUFFIX+48]
    MULLD        m0, [base+div_table%+SUFFIX+32]
    paddd        m0, m7                        ; cost[5a-d]
    mova         [esp+0x50], m0

    mova         m7, [esp+0x10]
    mova         m2, [esp+0x20]
    pslldq       m0, m7, 2
    psrldq       m7, 14
    pslldq       m4, m2, 4
    psrldq       m2, 12
    pslldq       m5, m3, 6
    psrldq       m6, m3, 10
    paddw        m0, [esp+0x00]
    paddw        m7, m2
    paddw        m4, m5
    paddw        m7, m6                        ; partial_sum_alt[0] right
    paddw        m0, m4                        ; partial_sum_alt[0] left
    pshuflw      m2, m7, q3012
    punpckhwd    m7, m0
    punpcklwd    m0, m2
    pmaddwd      m7, m7
    pmaddwd      m0, m0
    MULLD        m7, [base+div_table%+SUFFIX+48]
    MULLD        m0, [base+div_table%+SUFFIX+32]
    paddd        m0, m7                        ; cost[1a-d]
    SWAP         m0, m4

    pshufd       m0, [esp+0x00], q1032
    pshufd       m1, [esp+0x10], q1032
    pshufd       m2, [esp+0x20], q1032
    pshufd       m3, m3, q1032
    mova         [esp+0x00], m4

    pslldq       m4, m0, 6
    psrldq       m0, 10
    pslldq       m5, m1, 4
    psrldq       m1, 12
    pslldq       m6, m2, 2
    psrldq       m2, 14
    paddw        m4, m3
    paddw        m0, m1
    paddw        m5, m6
    paddw        m0, m2
    paddw        m4, m5
    pshuflw      m2, m0, q3012
    punpckhwd    m0, m4
    punpcklwd    m4, m2
    pmaddwd      m0, m0
    pmaddwd      m4, m4
    MULLD        m0, [base+div_table%+SUFFIX+48]
    MULLD        m4, [base+div_table%+SUFFIX+32]
    paddd        m4, m0                        ; cost[3a-d]

    mova         m1, [esp+0x00]
    mova         m2, [esp+0x50]
    mova         m0, [esp+0x30]                ; cost[0,4,2,6]
    phaddd       m1, m4
    phaddd       m2, [esp+0x40]
    phaddd       m1, m2                        ; cost[1,3,5,7]
    pshufd       m2, m0, q3120

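; What follows mirrors the 64-bit tail: reduce to the single best cost,
; then apply the same emulated-ymm extraction, using esp scratch space in
; place of the rsp-relative temporary area used on 64-bit.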
    ; now find the best cost
 %if cpuflag(sse4)
    pmaxsd       m0, m1
    pshufd       m3, m0, q1032
    pmaxsd       m3, m0
    pshufd       m0, m3, q2301
    pmaxsd       m0, m3
 %else
    pcmpgtd      m3, m0, m1
    pand         m0, m3
    pandn        m3, m1
    por          m0, m3
    pshufd       m4, m0, q1032
    pcmpgtd      m3, m0, m4
    pand         m0, m3
    pandn        m3, m4
    por          m0, m3
    pshufd       m4, m0, q2301
    pcmpgtd      m3, m0, m4
    pand         m0, m3
    pandn        m3, m4
    por          m0, m3
 %endif

    ; get direction and variance
    mov          vard, varm
    punpckhdq    m3, m2, m1
    punpckldq    m2, m1
    psubd        m1, m0, m3
    psubd        m4, m0, m2
    mova         [esp+0x00], m1                ; emulate ymm in stack
    mova         [esp+0x10], m4
    pcmpeqd      m3, m0                        ; compute best cost mask
    pcmpeqd      m2, m0
    packssdw     m2, m3
    pmovmskb     eax, m2                       ; get byte-idx from mask
    tzcnt        eax, eax
    mov          r1d, [esp+eax*2]              ; get idx^4 complement from emulated ymm
    shr          eax, 1                        ; get direction by converting byte-idx to word-idx
    shr          r1d, 10
    mov          [vard], r1d
 %endif

    RET
%endmacro

INIT_XMM sse4
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM ssse3
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM sse2
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
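; Note: cdef_dir is not instantiated for sse2, since CDEF_DIR uses ssse3
; instructions (pshufb, phaddw, phaddd) throughout; only the filter itself
; gets an sse2 version.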