; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA

%macro DIR_TABLE 1 ; stride
    db  1 * %1 + 0,  2 * %1 + 0
    db  1 * %1 + 0,  2 * %1 - 2
    db -1 * %1 + 2, -2 * %1 + 4
    db  0 * %1 + 2, -1 * %1 + 4
    db  0 * %1 + 2,  0 * %1 + 4
    db  0 * %1 + 2,  1 * %1 + 4
    db  1 * %1 + 2,  2 * %1 + 4
    db  1 * %1 + 0,  2 * %1 + 2
    db  1 * %1 + 0,  2 * %1 + 0
    db  1 * %1 + 0,  2 * %1 - 2
    db -1 * %1 + 2, -2 * %1 + 4
    db  0 * %1 + 2, -1 * %1 + 4
%endmacro

dir_table4: DIR_TABLE 16
dir_table8: DIR_TABLE 32
pri_taps:   dw 4, 4, 3, 3, 2, 2, 3, 3

dir_shift:  times 2 dw 0x4000
            times 2 dw 0x1000

pw_2048:    times 2 dw 2048
pw_m16384:  times 2 dw -16384

cextern cdef_dir_8bpc_avx2.main

SECTION .text

%macro CDEF_FILTER 2 ; w, h
    DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
    movifnidn     prid, r5m
    movifnidn     secd, r6m
    mov           dird, r7m
    vpbroadcastd    m8, [base+pw_2048]
    lea           dirq, [base+dir_table%1+dirq*2]
    test          prid, prid
    jz .sec_only
%if WIN64
    vpbroadcastw    m6, prim
    movaps [rsp+16*0], xmm9
    movaps [rsp+16*1], xmm10
%else
    movd           xm6, prid
    vpbroadcastw    m6, xm6
%endif
    lzcnt      pridmpd, prid
    rorx          tmpd, prid, 2
    cmp     dword r10m, 0xfff ; if (bpc == 12)
    cmove         prid, tmpd  ;     pri >>= 2
    mov           tmpd, r8m   ; damping
    and           prid, 4
    sub           tmpd, 31
    vpbroadcastd    m9, [base+pri_taps+priq+8*0]
    vpbroadcastd   m10, [base+pri_taps+priq+8*1]
    test          secd, secd
    jz .pri_only
%if WIN64
    movaps         r8m, xmm13
    vpbroadcastw   m13, secm
    movaps         r4m, xmm11
    movaps         r6m, xmm12
%else
    movd           xm0, secd
    vpbroadcastw   m13, xm0
%endif
    lzcnt         secd, secd
    xor           prid, prid
    add        pridmpd, tmpd
    cmovs      pridmpd, prid
    add           secd, tmpd
    lea           tmpq, [px]
    mov    [pri_shift], pridmpq
    mov    [sec_shift], secq
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
    movaps       xmm11, r4m
    movaps       xmm12, r6m
    movaps       xmm13, r8m
%endif
    jmp .pri_end
.pri_only:
    add        pridmpd, tmpd
    cmovs      pridmpd, secd
    lea           tmpq, [px]
    mov    [pri_shift], pridmpq
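; pri_shift = max(0, damping - msb(pri)), computed above as
; lzcnt(pri) + (damping - 31) with cmovs clamping negative results to
; zero. Each helper call below filters 16 pixels (four 4-pixel rows or
; two 8-pixel rows), so %1*%2/16 calls cover the whole block.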
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.pri_end:
%if WIN64
    movaps        xmm9, [rsp+16*0]
    movaps       xmm10, [rsp+16*1]
%endif
.end:
    RET
.sec_only:
    mov           tmpd, r8m ; damping
%if WIN64
    vpbroadcastw    m6, secm
%else
    movd           xm6, secd
    vpbroadcastw    m6, xm6
%endif
    tzcnt         secd, secd
    sub           tmpd, secd
    mov    [sec_shift], tmpq
    lea           tmpq, [px]
%rep %1*%2/16
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
    jmp .end
%if %1 == %2
ALIGN function_align
.pri:
    movsx         offq, byte [dirq+4]    ; off_k0
%if %1 == 4
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]      ; 0 2 1 3
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
%else
    mova           xm1, [tmpq+32*0]
    vinserti128     m1, [tmpq+32*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+5]    ; off_k1
    psubw           m2, m1 ; diff_k0p0
    psubw           m3, m1 ; diff_k0p1
    pabsw           m4, m2 ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3 ; adiff_k0p1
    pminsw          m0, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m0, m2 ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    psubw           m4, m1 ; diff_k1p0
    psubw           m5, m1 ; diff_k1p1
    psignw          m2, m3 ; constrain(diff_k0p1)
    pabsw           m3, m4 ; adiff_k1p0
    paddw           m0, m2 ; constrain(diff_k0)
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5 ; constrain(diff_k1p1)
    paddw           m7, m4 ; constrain(diff_k1)
    pmullw          m0, m9  ; pri_tap_k0
    pmullw          m7, m10 ; pri_tap_k1
    paddw           m0, m7  ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    add           tmpq, 32*2
    paddw           m0, m1
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
ALIGN function_align
.sec:
    movsx         offq, byte [dirq+8]    ; off1_k0
%if %1 == 4
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
%else
    mova           xm1, [tmpq+32*0]
    vinserti128     m1, [tmpq+32*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+0]    ; off2_k0
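; The psubusw/pminsw/psignw pattern below implements AV1's CDEF
; constraint function, min(|diff|, max(0, strength - (|diff| >> shift))),
; with the sign of diff reapplied; the unsigned saturating subtract
; covers the max(0, ...) for free. The +gprsize on the shift operands
; accounts for the return address pushed by the call into this helper.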
    psubw           m2, m1 ; diff_k0s0
    psubw           m3, m1 ; diff_k0s1
    pabsw           m4, m2 ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3 ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2 ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+9]    ; off1_k1
    psubw           m4, m1 ; diff_k0s2
    psubw           m5, m1 ; diff_k0s3
    psignw          m2, m3 ; constrain(diff_k0s1)
    pabsw           m3, m4 ; adiff_k0s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
%else
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+1]    ; off2_k1
    paddw           m0, m7
    psignw          m4, m5 ; constrain(diff_k0s3)
    paddw           m0, m4 ; constrain(diff_k0)
    psubw           m2, m1 ; diff_k1s0
    psubw           m3, m1 ; diff_k1s1
    paddw           m0, m0 ; sec_tap_k0
    pabsw           m4, m2 ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3 ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2 ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    paddw           m0, m7
    psubw           m4, m1 ; diff_k1s2
    psubw           m5, m1 ; diff_k1s3
    psignw          m2, m3 ; constrain(diff_k1s1)
    pabsw           m3, m4 ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
    psignw          m4, m5 ; constrain(diff_k1s3)
    paddw           m0, m4 ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    add           tmpq, 32*2
    paddw           m0, m1
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
ALIGN function_align
.pri_sec:
    movsx         offq, byte [dirq+8]    ; off2_k0
%if %1 == 4
    mova            m1, [tmpq+32*0]
    punpcklqdq      m1, [tmpq+32*1]
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
%else
    mova           xm1, [tmpq+32*0]
    vinserti128     m1, [tmpq+32*1], 1
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+0]    ; off3_k0
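; The combined pri+sec path additionally tracks the running signed
; maximum (m11) and unsigned minimum (m12) of all sampled taps so the
; final sum can be clamped to the range of the source pixels. Using
; pminuw makes the 0xc000 (-16384) padding sentinel act as a huge
; unsigned value that never wins the minimum, while pmaxsw treats it
; as very negative, so it never wins the maximum either.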
    pmaxsw         m11, m2, m3
    pminuw         m12, m2, m3
    psubw           m2, m1 ; diff_k0s0
    psubw           m3, m1 ; diff_k0s1
    pabsw           m4, m2 ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m13, m5
    pabsw           m5, m3 ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2 ; constrain(diff_k0s0)
    psubusw         m2, m13, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+9]    ; off2_k1
    psignw          m2, m3 ; constrain(diff_k0s1)
    pmaxsw         m11, m4
    pminuw         m12, m4
    pmaxsw         m11, m5
    pminuw         m12, m5
    psubw           m4, m1 ; diff_k0s2
    psubw           m5, m1 ; diff_k0s3
    paddw           m0, m2
    pabsw           m3, m4 ; adiff_k0s2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m13, m2
    pabsw           m2, m5 ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k0s2)
    psubusw         m4, m13, m3
    pminsw          m4, m2
%if %1 == 4
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
%else
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+1]    ; off3_k1
    paddw           m0, m7
    psignw          m4, m5 ; constrain(diff_k0s3)
    pmaxsw         m11, m2
    pminuw         m12, m2
    pmaxsw         m11, m3
    pminuw         m12, m3
    paddw           m0, m4 ; constrain(diff_k0)
    psubw           m2, m1 ; diff_k1s0
    psubw           m3, m1 ; diff_k1s1
    paddw           m0, m0 ; sec_tap_k0
    pabsw           m4, m2 ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m13, m5
    pabsw           m5, m3 ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2 ; constrain(diff_k1s0)
    psubusw         m2, m13, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+4]    ; off1_k0
    paddw           m0, m7
    psignw          m2, m3 ; constrain(diff_k1s1)
    pmaxsw         m11, m4
    pminuw         m12, m4
    pmaxsw         m11, m5
    pminuw         m12, m5
    psubw           m4, m1 ; diff_k1s2
    psubw           m5, m1 ; diff_k1s3
    pabsw           m3, m4 ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m13, m2
    pabsw           m2, m5 ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1s2)
    psubusw         m4, m13, m3
    pminsw          m4, m2
    paddw           m0, m7
%if %1 == 4
    movu            m2, [tmpq+offq+32*0]
    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
    neg           offq
    movu            m3, [tmpq+offq+32*0]
    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
%else
    movu           xm2, [tmpq+offq+32*0]
    vinserti128     m2, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm3, [tmpq+offq+32*0]
    vinserti128     m3, [tmpq+offq+32*1], 1
%endif
    movsx         offq, byte [dirq+5]    ; off1_k1
    psignw          m4, m5 ; constrain(diff_k1s3)
    pmaxsw         m11, m2
    pminuw         m12, m2
    pmaxsw         m11, m3
    pminuw         m12, m3
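; Primary taps start here: diff_k* is the tap pixel minus the center
; pixel, and the constrained diffs are weighted by the tap pairs in
; m9/m10 ({4,2} or {3,3}, selected by the low bit of the primary
; strength).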
    psubw           m2, m1 ; diff_k0p0
    psubw           m3, m1 ; diff_k0p1
    paddw           m0, m4
    pabsw           m4, m2 ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3 ; adiff_k0p1
    pminsw          m7, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m7, m2 ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movu            m4, [tmpq+offq+32*0]
    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
    neg           offq
    movu            m5, [tmpq+offq+32*0]
    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
%else
    movu           xm4, [tmpq+offq+32*0]
    vinserti128     m4, [tmpq+offq+32*1], 1
    neg           offq
    movu           xm5, [tmpq+offq+32*0]
    vinserti128     m5, [tmpq+offq+32*1], 1
%endif
    psignw          m2, m3 ; constrain(diff_k0p1)
    paddw           m7, m2 ; constrain(diff_k0)
    pmaxsw         m11, m4
    pminuw         m12, m4
    pmaxsw         m11, m5
    pminuw         m12, m5
    psubw           m4, m1 ; diff_k1p0
    psubw           m5, m1 ; diff_k1p1
    pabsw           m3, m4 ; adiff_k1p0
    pmullw          m7, m9 ; pri_tap_k0
    paddw           m0, m7
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5 ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4 ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5 ; constrain(diff_k1p1)
    paddw           m7, m4  ; constrain(diff_k1)
    pmullw          m7, m10 ; pri_tap_k1
    paddw           m0, m7  ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    add           tmpq, 32*2
    pmaxsw         m11, m1
    pminuw         m12, m1
    paddw           m0, m1
    pminsw          m0, m11
    pmaxsw          m0, m12
%if %1 == 4
    vextracti128   xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r9       ], xm1
    lea           dstq, [dstq+strideq*4]
%else
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea           dstq, [dstq+strideq*2]
%endif
    ret
%endif
%endmacro

INIT_YMM avx2
cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
                                                pri, sec, edge
%if WIN64
    %define        px  rsp+16*6
    %define      offq  r8
    %define pri_shift  rsp+16*2
    %define sec_shift  rsp+16*3
%else
    %define        px  rsp+16*4
    %define      offq  r4
    %define pri_shift  rsp+16*0
    %define sec_shift  rsp+16*1
%endif
    %define      base  r8-dir_table4
    mov          edged, r9m
    lea             r8, [dir_table4]
    movu           xm0, [dstq+strideq*0]
    movu           xm1, [dstq+strideq*1]
    lea             r9, [strideq*3]
    movu           xm2, [dstq+strideq*2]
    movu           xm3, [dstq+r9       ]
    vpbroadcastd    m7, [base+pw_m16384]
    mova    [px+16*0+0], xm0
    mova    [px+16*1+0], xm1
    mova    [px+16*2+0], xm2
    mova    [px+16*3+0], xm3
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movu           xm0, [topq+strideq*0]
    movu           xm1, [topq+strideq*1]
    mova    [px-16*2+0], xm0
    mova    [px-16*1+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd           xm0, [topq+strideq*0-4]
    movd           xm1, [topq+strideq*1-4]
    movd    [px-16*2-4], xm0
    movd    [px-16*1-4], xm1
    jmp .top_done
.no_top:
    mova    [px-16*2+0], m7
.top_no_left:
    movd    [px-16*2-4], xm7
    movd    [px-16*1-4], xm7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movu           xm0, [botq+strideq*0]
    movu           xm1, [botq+strideq*1]
    mova    [px+16*4+0], xm0
    mova    [px+16*5+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd           xm0, [botq+strideq*0-4]
    movd           xm1, [botq+strideq*1-4]
    movd    [px+16*4-4], xm0
    movd    [px+16*5-4], xm1
    jmp .bottom_done
.no_bottom:
    mova    [px+16*4+0], m7
.bottom_no_left:
    movd    [px+16*4-4], xm7
    movd    [px+16*5-4], xm7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
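; Two left-edge pixels per row are copied to px-4 so that the negative
; horizontal offsets in dir_table (up to two pixels) stay inside the
; padded buffer.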
    movd           xm0, [leftq+4*0]
    movd           xm1, [leftq+4*1]
    movd           xm2, [leftq+4*2]
    movd           xm3, [leftq+4*3]
    movd    [px+16*0-4], xm0
    movd    [px+16*1-4], xm1
    movd    [px+16*2-4], xm2
    movd    [px+16*3-4], xm3
    jmp .left_done
.no_left:
    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
    CDEF_FILTER 4, 4

cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
                                                pri, sec, edge
    mov          edged, r9m
    movu           xm0, [dstq+strideq*0]
    movu           xm1, [dstq+strideq*1]
    lea             r9, [strideq*3]
    movu           xm2, [dstq+strideq*2]
    movu           xm3, [dstq+r9       ]
    lea             r6, [dstq+strideq*4]
    movu           xm4, [r6  +strideq*0]
    movu           xm5, [r6  +strideq*1]
    movu           xm6, [r6  +strideq*2]
    movu           xm7, [r6  +r9       ]
    lea             r8, [dir_table4]
    mova    [px+16*0+0], xm0
    mova    [px+16*1+0], xm1
    mova    [px+16*2+0], xm2
    mova    [px+16*3+0], xm3
    mova    [px+16*4+0], xm4
    mova    [px+16*5+0], xm5
    mova    [px+16*6+0], xm6
    mova    [px+16*7+0], xm7
    vpbroadcastd    m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movu           xm0, [topq+strideq*0]
    movu           xm1, [topq+strideq*1]
    mova    [px-16*2+0], xm0
    mova    [px-16*1+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd           xm0, [topq+strideq*0-4]
    movd           xm1, [topq+strideq*1-4]
    movd    [px-16*2-4], xm0
    movd    [px-16*1-4], xm1
    jmp .top_done
.no_top:
    mova    [px-16*2+0], m7
.top_no_left:
    movd    [px-16*2-4], xm7
    movd    [px-16*1-4], xm7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movu           xm0, [botq+strideq*0]
    movu           xm1, [botq+strideq*1]
    mova    [px+16*8+0], xm0
    mova    [px+16*9+0], xm1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd           xm0, [botq+strideq*0-4]
    movd           xm1, [botq+strideq*1-4]
    movd    [px+16*8-4], xm0
    movd    [px+16*9-4], xm1
    jmp .bottom_done
.no_bottom:
    mova    [px+16*8+0], m7
.bottom_no_left:
    movd    [px+16*8-4], xm7
    movd    [px+16*9-4], xm7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movd           xm0, [leftq+4*0]
    movd           xm1, [leftq+4*1]
    movd           xm2, [leftq+4*2]
    movd           xm3, [leftq+4*3]
    movd    [px+16*0-4], xm0
    movd    [px+16*1-4], xm1
    movd    [px+16*2-4], xm2
    movd    [px+16*3-4], xm3
    movd           xm0, [leftq+4*4]
    movd           xm1, [leftq+4*5]
    movd           xm2, [leftq+4*6]
    movd           xm3, [leftq+4*7]
    movd    [px+16*4-4], xm0
    movd    [px+16*5-4], xm1
    movd    [px+16*6-4], xm2
    movd    [px+16*7-4], xm3
    jmp .left_done
.no_left:
    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER 4, 8

cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
                                               pri, sec, edge
%if WIN64
    %define        px  rsp+32*4
%else
    %define        px  rsp+32*3
%endif
    %define      base  r8-dir_table8
    mov          edged, r9m
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             r6, [dstq+strideq*2]
    movu            m2, [r6  +strideq*0]
    movu            m3, [r6  +strideq*1]
    lea             r6, [r6  +strideq*2]
    movu            m4, [r6  +strideq*0]
    movu            m5, [r6  +strideq*1]
    lea             r6, [r6  +strideq*2]
    movu            m6, [r6  +strideq*0]
    movu            m7, [r6  +strideq*1]
    lea             r8, [dir_table8]
    mova    [px+32*0+0], m0
    mova    [px+32*1+0], m1
    mova    [px+32*2+0], m2
    mova    [px+32*3+0], m3
    mova    [px+32*4+0], m4
    mova    [px+32*5+0], m5
    mova    [px+32*6+0], m6
    mova    [px+32*7+0], m7
    vpbroadcastd    m7, [base+pw_m16384]
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova    [px-32*2+0], m0
    mova    [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd           xm0, [topq+strideq*0-4]
    movd           xm1, [topq+strideq*1-4]
    movd    [px-32*2-4], xm0
    movd    [px-32*1-4], xm1
    jmp .top_done
.no_top:
    mova    [px-32*2+0], m7
    mova    [px-32*1+0], m7
.top_no_left:
    movd    [px-32*2-4], xm7
    movd    [px-32*1-4], xm7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova    [px+32*8+0], m0
    mova    [px+32*9+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd           xm0, [botq+strideq*0-4]
    movd           xm1, [botq+strideq*1-4]
    movd    [px+32*8-4], xm0
    movd    [px+32*9-4], xm1
    jmp .bottom_done
.no_bottom:
    mova    [px+32*8+0], m7
    mova    [px+32*9+0], m7
.bottom_no_left:
    movd    [px+32*8-4], xm7
    movd    [px+32*9-4], xm7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movd           xm0, [leftq+4*0]
    movd           xm1, [leftq+4*1]
    movd           xm2, [leftq+4*2]
    movd           xm3, [leftq+4*3]
    movd    [px+32*0-4], xm0
    movd    [px+32*1-4], xm1
    movd    [px+32*2-4], xm2
    movd    [px+32*3-4], xm3
    movd           xm0, [leftq+4*4]
    movd           xm1, [leftq+4*5]
    movd           xm2, [leftq+4*6]
    movd           xm3, [leftq+4*7]
    movd    [px+32*4-4], xm0
    movd    [px+32*5-4], xm1
    movd    [px+32*6-4], xm2
    movd    [px+32*7-4], xm3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER 8, 8

cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
    lea             r6, [dir_shift]
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    vpbroadcastd    m4, [r6+bdmaxq*4]
    lea             r6, [strideq*3]
    mova           xm0, [srcq+strideq*0]
    mova           xm1, [srcq+strideq*1]
    mova           xm2, [srcq+strideq*2]
    mova           xm3, [srcq+r6       ]
    lea           srcq, [srcq+strideq*4]
    vinserti128     m0, [srcq+r6       ], 1
    vinserti128     m1, [srcq+strideq*2], 1
    vinserti128     m2, [srcq+strideq*1], 1
    vinserti128     m3, [srcq+strideq*0], 1
    REPX {pmulhuw x, m4}, m0, m1, m2, m3
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main

%endif ; ARCH_X86_64