1; Copyright © 2021, VideoLAN and dav1d authors 2; Copyright © 2021, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28%include "x86/filmgrain_common.asm" 29 30SECTION_RODATA 16 31pd_16: times 4 dd 16 32pw_1: times 8 dw 1 33pw_16384: times 8 dw 16384 34pw_8192: times 8 dw 8192 35pw_23_22: dw 23, 22 36 times 3 dw 0, 32 37pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 38pw_27_17_17_27: dw 27, 17, 17, 27 39 times 2 dw 0, 32 40rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 41pw_seed_xor: times 2 dw 0xb524 42 times 2 dw 0x49d8 43pb_1: times 4 db 1 44hmul_bits: dw 32768, 16384, 8192, 4096 45round: dw 2048, 1024, 512 46mul_bits: dw 256, 128, 64, 32, 16 47round_vals: dw 32, 64, 128, 256, 512, 1024 48max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 49min: dw 0, 16*4, 16*16 50; these two should be next to each other 51pw_4: times 2 dw 4 52pw_16: times 2 dw 16 53 54%macro JMP_TABLE 1-* 55 %xdefine %1_table %%table 56 %xdefine %%base %1_table 57 %xdefine %%prefix mangle(private_prefix %+ _%1) 58 %%table: 59 %rep %0 - 1 60 dd %%prefix %+ .ar%2 - %%base 61 %rotate 1 62 %endrep 63%endmacro 64 65JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 66JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 67JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 68JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 69 70SECTION .text 71 72%if ARCH_X86_32 73%undef base 74%define PIC_ptr(a) base+a 75%else 76%define PIC_ptr(a) a 77%endif 78 79%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) 80 81%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg 82%assign %%idx 0 83%define %%tmp %2 84%if %0 == 8 85%define %%tmp %8 86%endif 87%rep (%6/2) 88%if %%idx == 0 89 movd %5 %+ d, %2 90 pshuflw %%tmp, %2, q3232 91%else 92 movd %5 %+ d, %%tmp 93%if %6 == 8 94%if %%idx == 2 95 punpckhqdq %%tmp, %%tmp 96%elif %%idx == 4 97 psrlq %%tmp, 32 98%endif 99%endif 100%endif 101 movzx %4 %+ d, %5 %+ w 102 shr %5 %+ d, 16 103 104%if %%idx == 0 105 movd %1, [%3+%4*%7] 
106%else 107 pinsrw %1, [%3+%4*%7], %%idx + 0 108%endif 109 pinsrw %1, [%3+%5*%7], %%idx + 1 110%assign %%idx %%idx+2 111%endrep 112%endmacro 113 114%macro SPLATD 2 ; dst, src 115%ifnidn %1, %2 116 movd %1, %2 117%endif 118 pshufd %1, %1, q0000 119%endmacro 120 121%macro SPLATW 2 ; dst, src 122%ifnidn %1, %2 123 movd %1, %2 124%endif 125 pshuflw %1, %1, q0000 126 punpcklqdq %1, %1 127%endmacro 128 129 130INIT_XMM ssse3 131%if ARCH_X86_64 132cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax 133 lea r4, [pb_mask] 134%define base r4-pb_mask 135%else 136cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax 137 LEA r4, $$ 138%define base r4-$$ 139%endif 140 movq m1, [base+rnd_next_upperbit_mask] 141 movq m4, [base+mul_bits] 142 movq m7, [base+hmul_bits] 143 mov r3d, [fg_dataq+FGData.grain_scale_shift] 144 lea r5d, [bdmaxq+1] 145 shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc 146 sub r3, r5 147 SPLATW m6, [base+round+r3*2-2] 148 mova m5, [base+pb_mask] 149 SPLATW m0, [fg_dataq+FGData.seed] 150 mov r3, -73*82*2 151 sub bufq, r3 152%if ARCH_X86_64 153 lea r6, [gaussian_sequence] 154%endif 155.loop: 156 pand m2, m0, m1 157 psrlw m3, m2, 10 158 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 159 pmullw m2, m4 ; bits 0x0f00 are set 160 pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 161 psllq m2, m3, 30 162 por m2, m3 163 psllq m3, m2, 15 164 por m2, m3 ; aggregate each bit into next seed's high bit 165 pmulhuw m3, m0, m7 166 por m2, m3 ; 4 next output seeds 167 pshuflw m0, m2, q3333 168 psrlw m2, 5 169%if ARCH_X86_64 170 vpgatherdw m3, m2, r6, r5, r7, 4, 2 171%else 172 vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 173%endif 174 paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 175 ; shifts by 0, which pmulhrsw does not support 176 pmulhrsw m3, m6 177 movq [bufq+r3], m3 178 add r3, 4*2 179 jl .loop 180 181 ; auto-regression code 182 movsxd r3, [fg_dataq+FGData.ar_coeff_lag] 183 movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] 184 lea 
r3, [r3+base+generate_grain_y_16bpc_ssse3_table] 185 jmp r3 186 187.ar1: 188%if WIN64 189 DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 190 lea bufq, [r0-2*(82*73-(82*3+79))] 191 PUSH r8 192%else 193%if ARCH_X86_64 194 DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 195%else ; x86-32 196 DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 197 PUSH r6 198%define shiftd r1d 199%endif 200 sub bufq, 2*(82*73-(82*3+79)) 201%endif 202 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 203 movd m4, [fg_dataq+FGData.ar_coeffs_y] 204 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 205%if WIN64 206 DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 207%elif ARCH_X86_64 208 DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 209%else ; x86-32 210%undef shiftd 211 DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 212%define hd dword r0m 213%define maxd dword minm 214%endif 215%if cpuflag(sse4) 216 pmovsxbw m4, m4 217%else 218 pxor m3, m3 219 pcmpgtb m3, m4 220 punpcklbw m4, m3 221%endif 222 pinsrw m4, [base+pw_1], 3 223 pshufd m5, m4, q1111 224 pshufd m4, m4, q0000 225 SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd 226 mov hd, 70 227 sar maxd, 1 228 mov mind, maxd 229 xor mind, -1 230.y_loop_ar1: 231 mov xq, -76 232 movsx val3d, word [bufq+xq*2-2] 233.x_loop_ar1: 234 movu m0, [bufq+xq*2-82*2-2] ; top/left 235 psrldq m2, m0, 2 ; top 236 psrldq m1, m0, 4 ; top/right 237 punpcklwd m0, m2 238 punpcklwd m1, m3 239 pmaddwd m0, m4 240 pmaddwd m1, m5 241 paddd m0, m1 242.x_loop_ar1_inner: 243 movd val0d, m0 244 psrldq m0, 4 245 imul val3d, cf3d 246 add val3d, val0d 247 sar val3d, shiftb 248 movsx val0d, word [bufq+xq*2] 249 add val3d, val0d 250 cmp val3d, maxd 251 cmovg val3d, maxd 252 cmp val3d, mind 253 cmovl val3d, mind 254 mov word [bufq+xq*2], val3w 255 ; keep val3d in-place as left for next x iteration 256 inc xq 257 jz .x_loop_ar1_end 258 test xq, 3 259 jnz .x_loop_ar1_inner 260 jmp .x_loop_ar1 261 262.x_loop_ar1_end: 263 add bufq, 82*2 264 dec hd 
265 jg .y_loop_ar1 266%if WIN64 267 POP r8 268%elif ARCH_X86_32 269 POP r6 270%undef maxd 271%undef hd 272%endif 273.ar0: 274 RET 275 276.ar2: 277%if ARCH_X86_32 278 ALLOC_STACK -16*8 279%endif 280 DEFINE_ARGS buf, fg_data, bdmax, shift 281 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 282 movd m0, [base+round_vals-12+shiftq*2] 283 pshuflw m0, m0, q0000 284 movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 285 pxor m2, m2 286 punpcklwd m0, m2 287 pcmpgtb m2, m6 288 punpckhbw m3, m6, m2 289 punpcklbw m6, m2 290 pshufd m2, m6, q3333 291 pshufd m1, m6, q2222 292 pshufd m7, m6, q1111 293 pshufd m6, m6, q0000 294 pshufd m4, m3, q1111 295 pshufd m3, m3, q0000 296%if ARCH_X86_64 297 SWAP 0, 12 298 SWAP 1, 8 299 SWAP 2, 9 300 SWAP 3, 10 301 SWAP 4, 11 302%else 303%define m12 [rsp+0*16] 304%define m8 [rsp+1*16] 305%define m9 [rsp+2*16] 306%define m10 [rsp+3*16] 307%define m11 [rsp+4*16] 308 mova m12, m0 309 mova m8, m1 310 mova m9, m2 311 mova m10, m3 312 mova m11, m4 313 mov bdmaxd, bdmaxm 314%endif 315 sar bdmaxd, 1 316 SPLATW m0, bdmaxd ; max_grain 317 pcmpeqw m1, m1 318%if !cpuflag(sse4) 319 pcmpeqw m2, m2 320 psrldq m2, 14 321 pslldq m2, 2 322 pxor m2, m1 323%endif 324 pxor m1, m0 ; min_grain 325%if ARCH_X86_64 326 SWAP 0, 13 327 SWAP 1, 14 328 SWAP 2, 15 329%else 330%define m13 [rsp+5*16] 331%define m14 [rsp+6*16] 332 mova m13, m0 333 mova m14, m1 334%if !cpuflag(sse4) 335%define m15 [rsp+7*16] 336 mova m15, m2 337%endif 338%endif 339 sub bufq, 2*(82*73-(82*3+79)) 340 DEFINE_ARGS buf, fg_data, h, x 341 mov hd, 70 342.y_loop_ar2: 343 mov xq, -76 344 345.x_loop_ar2: 346 movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] 347 movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 348 psrldq m2, m0, 2 349 psrldq m3, m0, 4 350 psrldq m4, m0, 6 351 psrldq m5, m0, 8 352 punpcklwd m0, m2 353 punpcklwd m3, m4 354 punpcklwd m5, m1 355 psrldq m2, m1, 2 356 psrldq m4, m1, 4 357 punpcklwd m2, m4 358 psrldq m4, m1, 6 359 psrldq m1, 8 360 punpcklwd m4, m1 361 pmaddwd m0, m6 362 pmaddwd m3, m7 363 
pmaddwd m5, m8 364 pmaddwd m2, m9 365 pmaddwd m4, m10 366 paddd m0, m3 367 paddd m5, m2 368 paddd m0, m4 369 paddd m0, m5 ; accumulated top 2 rows 370 paddd m0, m12 371 372 movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] 373 pshufd m4, m1, q3321 374 pxor m2, m2 375 pcmpgtw m2, m4 376 punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] 377.x_loop_ar2_inner: 378 pmaddwd m2, m1, m11 379 paddd m2, m0 380 psrldq m0, 4 ; shift top to next pixel 381 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 382 paddd m2, m4 383 packssdw m2, m2 384 pminsw m2, m13 385 pmaxsw m2, m14 386 psrldq m4, 4 387 pslldq m2, 2 388 psrldq m1, 2 389%if cpuflag(sse4) 390 pblendw m1, m2, 00000010b 391%else 392 pand m1, m15 393 pandn m3, m15, m2 394 por m1, m3 395%endif 396 ; overwrite previous pixel, this should be ok 397 movd [bufq+xq*2-2], m1 398 inc xq 399 jz .x_loop_ar2_end 400 test xq, 3 401 jnz .x_loop_ar2_inner 402 jmp .x_loop_ar2 403 404.x_loop_ar2_end: 405 add bufq, 82*2 406 dec hd 407 jg .y_loop_ar2 408%if ARCH_X86_32 409%undef m8 410%undef m9 411%undef m10 412%undef m11 413%undef m12 414%undef m13 415%undef m14 416%undef m15 417%endif 418 RET 419 420.ar3: 421 DEFINE_ARGS buf, fg_data, bdmax, shift 422%if WIN64 423 mov r6, rsp 424 and rsp, ~15 425 sub rsp, 64 426 %define tmp rsp 427%elif ARCH_X86_64 428 %define tmp rsp+stack_offset-72 429%else 430 ALLOC_STACK -16*12 431 %define tmp rsp 432 mov bdmaxd, bdmaxm 433%endif 434 sar bdmaxd, 1 435 SPLATW m7, bdmaxd ; max_grain 436 pcmpeqw m6, m6 437%if !cpuflag(sse4) 438 pcmpeqw m4, m4 439 psrldq m4, 14 440 pslldq m4, 4 441 pxor m4, m6 442%endif 443 pxor m6, m7 ; min_grain 444 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 445 446%if ARCH_X86_64 447 SWAP 6, 14 448 SWAP 7, 15 449%else 450%define m14 [rsp+10*16] 451%define m15 [esp+11*16] 452 mova m14, m6 453 mova m15, m7 454%endif 455 456 ; build cf0-1 until 18-19 in m5-12 and r0/1 457 pxor m1, m1 458 movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 459 pcmpgtb m1, m0 460 punpckhbw m2, m0, m1 461 punpcklbw m0, m1 462 
463%if cpuflag(sse4) 464 pshufd m4, m2, q3333 465%else 466 pshufd m5, m2, q3333 467 mova [tmp+48], m5 468%endif 469 pshufd m3, m2, q2222 470 pshufd m1, m2, q0000 471 pshufd m2, m2, q1111 472 pshufd m7, m0, q2222 473 pshufd m6, m0, q1111 474 pshufd m5, m0, q0000 475 pshufd m0, m0, q3333 476 477%if ARCH_X86_64 478 SWAP 0, 8 479 SWAP 1, 9 480 SWAP 2, 10 481 SWAP 3, 11 482 SWAP 4, 12 483%else 484%define m8 [rsp+4*16] 485%define m9 [esp+5*16] 486%define m10 [rsp+6*16] 487%define m11 [esp+7*16] 488%define m12 [rsp+8*16] 489 mova m8, m0 490 mova m9, m1 491 mova m10, m2 492 mova m11, m3 493 mova m12, m4 494%endif 495 496 ; build cf20,round in r2 497 ; build cf21-23,round*2 in m13 498 pxor m1, m1 499 movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 500 pcmpgtb m1, m0 501 punpcklbw m0, m1 502 pshufd m1, m0, q0000 503 pshufd m2, m0, q1111 504 mova [tmp+ 0], m1 505 mova [tmp+16], m2 506 psrldq m3, m0, 10 507 pinsrw m3, [base+round_vals+shiftq*2-10], 3 508 509%if ARCH_X86_64 510 SWAP 3, 13 511%else 512%define m13 [esp+9*16] 513 mova m13, m3 514%endif 515 516 pinsrw m0, [base+round_vals+shiftq*2-12], 5 517 pshufd m3, m0, q2222 518 mova [tmp+32], m3 519 520 DEFINE_ARGS buf, fg_data, h, x 521 sub bufq, 2*(82*73-(82*3+79)) 522 mov hd, 70 523.y_loop_ar3: 524 mov xq, -76 525 526.x_loop_ar3: 527 movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] 528 movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] 529 palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] 530 palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] 531 punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] 532 punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] 533 shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] 534 535 pmaddwd m0, m5 536 pmaddwd m2, m6 537 pmaddwd m3, m7 538 paddd m0, m2 539 paddd m0, m3 540 ; m0 = top line first 6 multiplied by cf, m1 = top line last entry 541 542 movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] 543 movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] 544 punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] 
545 palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] 546 palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] 547 punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] 548 punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 549 shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 550 551 pmaddwd m1, m8 552 pmaddwd m4, m9 553 pmaddwd m3, m10 554 pmaddwd m2, m11 555 paddd m1, m4 556 paddd m3, m2 557 paddd m0, m1 558 paddd m0, m3 559 ; m0 = top 2 lines multiplied by cf 560 561 movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] 562 movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] 563 palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] 564 palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] 565 punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] 566 punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] 567 shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] 568 punpcklwd m2, [base+pw_1] 569 570%if cpuflag(sse4) 571 pmaddwd m1, m12 572%else 573 pmaddwd m1, [tmp+48] 574%endif 575 pmaddwd m3, [tmp+ 0] 576 pmaddwd m4, [tmp+16] 577 pmaddwd m2, [tmp+32] 578 paddd m1, m3 579 paddd m4, m2 580 paddd m0, m1 581 paddd m0, m4 582 ; m0 = top 3 lines multiplied by cf plus rounding for downshift 583 584 movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] 585.x_loop_ar3_inner: 586 pmaddwd m2, m1, m13 587 pshufd m3, m2, q1111 588 paddd m2, m3 ; left+cur 589 paddd m2, m0 ; add top 590 psrldq m0, 4 591 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 592 packssdw m2, m2 593 pminsw m2, m15 594 pmaxsw m2, m14 595 pslldq m2, 4 596 psrldq m1, 2 597%if cpuflag(sse4) 598 pblendw m1, m2, 00000100b 599%else 600 pand m1, m12 601 pandn m3, m12, m2 602 por m1, m3 603%endif 604 ; overwrite a couple of pixels, should be ok 605 movq [bufq+xq*2-4], m1 606 inc xq 607 jz .x_loop_ar3_end 608 test xq, 3 609 jnz .x_loop_ar3_inner 610 jmp .x_loop_ar3 611 612.x_loop_ar3_end: 613 add bufq, 82*2 614 dec hd 615 jg .y_loop_ar3 616%if WIN64 617 mov rsp, r6 618%elif ARCH_X86_32 619%undef m8 620%undef m9 621%undef m10 622%undef m11 623%undef m12 624%undef m13 625%undef 
m14 626%undef m15 627%endif 628 RET 629 630%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y 631INIT_XMM ssse3 632%if ARCH_X86_64 633cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg 634%define base r8-pb_mask 635 lea r8, [pb_mask] 636 movifnidn bdmaxd, bdmaxm 637 lea r6d, [bdmaxq+1] 638%else 639cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h 640%define base r2-$$ 641 LEA r2, $$ 642 mov fg_dataq, r2m 643 mov r6d, r4m 644 inc r6d 645%endif 646 movq m1, [base+rnd_next_upperbit_mask] 647 movq m4, [base+mul_bits] 648 movq m7, [base+hmul_bits] 649 mov r5d, [fg_dataq+FGData.grain_scale_shift] 650 shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc 651 sub r5, r6 652 SPLATW m6, [base+round+r5*2-2] 653 mova m5, [base+pb_mask] 654 SPLATW m0, [fg_dataq+FGData.seed] 655%if ARCH_X86_64 656 SPLATW m2, [base+pw_seed_xor+uvq*4] 657%else 658 mov r5d, r3m 659 SPLATW m2, [base+pw_seed_xor+r5*4] 660%endif 661 pxor m0, m2 662%if ARCH_X86_64 663 lea r6, [gaussian_sequence] 664%endif 665%if %2 666 mov hd, 73-35*%3 667 add bufq, 44*2 668.loop_y: 669 mov xq, -44 670%else 671 mov xq, -82*73 672 add bufq, 82*73*2 673%endif 674.loop_x: 675 pand m2, m0, m1 676 psrlw m3, m2, 10 677 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 678 pmullw m2, m4 ; bits 0x0f00 are set 679 pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 680 psllq m2, m3, 30 681 por m2, m3 682 psllq m3, m2, 15 683 por m2, m3 ; aggregate each bit into next seed's high bit 684 pmulhuw m3, m0, m7 685 por m2, m3 ; 4 next output seeds 686 pshuflw m0, m2, q3333 687 psrlw m2, 5 688%if ARCH_X86_64 689 vpgatherdw m3, m2, r6, r9, r10, 4, 2 690%else 691 vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 692%endif 693 paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 694 ; shifts by 0, which pmulhrsw does not support 695 pmulhrsw m3, m6 696 movq [bufq+xq*2], m3 697 add xq, 4 698 jl .loop_x 699%if %2 700 add bufq, 82*2 701 dec hd 702 jg .loop_y 703%endif 704 
705 ; auto-regression code 706 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 707 movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] 708 lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] 709 jmp r5 710 711.ar0: 712%if ARCH_X86_64 713 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 714%else 715 DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 716 ALLOC_STACK -16*2 717 mov bufyq, r1m 718 mov uvd, r3m 719%endif 720 imul uvd, 28 721 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 722 movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 723 SPLATW m3, [base+hmul_bits+shiftq*2-10] 724%if ARCH_X86_64 725 sar bdmaxd, 1 726 SPLATW m1, bdmaxd ; max_gain 727%else 728 SPLATW m1, r4m 729 psraw m1, 1 730%endif 731 pcmpeqw m7, m7 732 pxor m7, m1 ; min_grain 733%if ARCH_X86_64 734 SWAP 1, 14 735 DEFINE_ARGS buf, bufy, h, x 736%else 737%define m14 [rsp+0*16] 738 mova m14, m1 739 DEFINE_ARGS buf, bufy, pic_reg, h, x 740%endif 741 pxor m5, m5 742 pcmpgtb m5, m4 743 punpcklbw m4, m5 744%if %2 745 SPLATW m6, [base+hmul_bits+2+%3*2] 746%endif 747 SPLATW m4, m4 748 pxor m5, m5 749%if %2 750%if !cpuflag(sse4) 751 pcmpeqw m2, m2 752 pslldq m2, 12 753%if ARCH_X86_64 754 SWAP 2, 12 755%else 756%define m12 [rsp+1*16] 757 mova m12, m2 758%endif 759%endif 760%endif 761%if %2 762 sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) 763%else 764 sub bufq, 2*(82*70-3) 765%endif 766 add bufyq, 2*(3+82*3) 767 mov hd, 70-35*%3 768.y_loop_ar0: 769 ; first 32 pixels 770 xor xd, xd 771.x_loop_ar0: 772 movu m0, [bufyq+xq*(2<<%2)] 773%if %2 774%if %3 775 movu m2, [bufyq+xq*4+82*2] 776 paddw m0, m2 777%endif 778 movu m1, [bufyq+xq*4 +16] 779%if %3 780 movu m2, [bufyq+xq*4+82*2+16] 781 paddw m1, m2 782%endif 783 phaddw m0, m1 784 pmulhrsw m0, m6 785%endif 786 punpckhwd m1, m0, m5 787 punpcklwd m0, m5 788 REPX {pmaddwd x, m4}, m0, m1 789 REPX {psrad x, 5}, m0, m1 790 packssdw m0, m1 791 pmulhrsw m0, m3 792 movu m1, [bufq+xq*2] 793 paddw m0, m1 794 pminsw m0, m14 795 pmaxsw m0, m7 796 cmp xd, 72-40*%2 797 je .end 798 
movu [bufq+xq*2], m0 799 add xd, 8 800 jmp .x_loop_ar0 801 802 ; last 6/4 pixels 803.end: 804%if %2 805%if cpuflag(sse4) 806 pblendw m0, m1, 11000000b 807%else 808 pand m1, m12 809 pandn m2, m12, m0 810 por m0, m1, m2 811%endif 812 movu [bufq+xq*2], m0 813%else 814 movq [bufq+xq*2], m0 815%endif 816 817 add bufq, 82*2 818 add bufyq, 82*(2<<%3) 819 dec hd 820 jg .y_loop_ar0 821%if ARCH_X86_32 822%undef m12 823%undef m14 824%endif 825 RET 826 827.ar1: 828%if ARCH_X86_64 829 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x 830%else 831 RESET_STACK_STATE 832 DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 833 mov bufyq, r1m 834 mov uvd, r3m 835%endif 836 imul uvd, 28 837 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 838 movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 839%if WIN64 840 DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 841%if %2 842 lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] 843%else 844 lea bufq, [r0-2*(82*69+3)] 845%endif 846%else 847%if ARCH_X86_64 848 DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 849%else 850 DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 851%define hd dword r1m 852%define mind dword r3m 853%define maxd dword r4m 854%endif 855%if %2 856 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 857%else 858 sub bufq, 2*(82*69+3) 859%endif 860%endif 861%if ARCH_X86_64 862 mov shiftd, [r2+FGData.ar_coeff_shift] 863%else 864 mov shiftd, [r3+FGData.ar_coeff_shift] 865%endif 866 pxor m5, m5 867 pcmpgtb m5, m4 868 punpcklbw m4, m5 ; cf0-4 in words 869 pshuflw m4, m4, q2100 870 psrldq m4, 2 ; cf0-3,4 in words 871 pshufd m5, m4, q1111 872 pshufd m4, m4, q0000 873 movd m3, [base+round_vals+shiftq*2-12] ; rnd 874 pxor m6, m6 875 punpcklwd m3, m6 876%if %2 877 SPLATW m6, [base+hmul_bits+2+%3*2] 878%endif 879 SPLATD m3, m3 880 add bufyq, 2*(79+82*3) 881 mov hd, 70-35*%3 882 sar maxd, 1 883%if ARCH_X86_64 884 mov mind, maxd 885 xor mind, -1 886%else 887 DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 
888 mov r2, maxd 889 xor r2, -1 890 mov mind, r2 891%endif 892.y_loop_ar1: 893 mov xq, -(76>>%2) 894 movsx val3d, word [bufq+xq*2-2] 895.x_loop_ar1: 896 movu m0, [bufq+xq*2-82*2-2] ; top/left 897%if %2 898 movu m7, [bufyq+xq*4] 899%if %3 900 movu m1, [bufyq+xq*4+82*2] 901 phaddw m7, m1 902%else 903 phaddw m7, m7 904%endif 905%else 906 movq m7, [bufyq+xq*2] 907%endif 908 psrldq m2, m0, 2 ; top 909 psrldq m1, m0, 4 ; top/right 910 punpcklwd m0, m2 911%if %2 912%if %3 913 pshufd m2, m7, q3232 914 paddw m7, m2 915%endif 916 pmulhrsw m7, m6 917%endif 918 punpcklwd m1, m7 919 pmaddwd m0, m4 920 pmaddwd m1, m5 921 paddd m0, m1 922 paddd m0, m3 923.x_loop_ar1_inner: 924 movd val0d, m0 925 psrldq m0, 4 926 imul val3d, cf3d 927 add val3d, val0d 928 sar val3d, shiftb 929 movsx val0d, word [bufq+xq*2] 930 add val3d, val0d 931 cmp val3d, maxd 932 cmovg val3d, maxd 933 cmp val3d, mind 934 cmovl val3d, mind 935 mov word [bufq+xq*2], val3w 936 ; keep val3d in-place as left for next x iteration 937 inc xq 938 jz .x_loop_ar1_end 939 test xq, 3 940 jnz .x_loop_ar1_inner 941 jmp .x_loop_ar1 942 943.x_loop_ar1_end: 944 add bufq, 82*2 945 add bufyq, 82*2<<%3 946 dec hd 947 jg .y_loop_ar1 948%if ARCH_X86_32 949%undef maxd 950%undef mind 951%undef hd 952%endif 953 RET 954 955.ar2: 956%if ARCH_X86_64 957 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 958%else 959 DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 960 ALLOC_STACK -16*8 961 mov bufyq, r1m 962 mov uvd, r3m 963%endif 964 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 965 imul uvd, 28 966%if ARCH_X86_64 967 sar bdmaxd, 1 968 SPLATW m5, bdmaxd ; max_grain 969%else 970 SPLATW m5, r4m 971 psraw m5, 1 972%endif 973 pcmpeqw m6, m6 974%if !cpuflag(sse4) 975 pcmpeqw m7, m7 976 psrldq m7, 14 977 pslldq m7, 2 978 pxor m7, m6 979%endif 980 pxor m6, m5 ; min_grain 981%if %2 && cpuflag(sse4) 982 SPLATW m7, [base+hmul_bits+2+%3*2] 983%endif 984 985%if ARCH_X86_64 986 SWAP 5, 13 987 SWAP 6, 14 988 SWAP 7, 15 989%else 990%define m13 
[rsp+5*16] 991%define m14 [rsp+6*16] 992%define m15 [rsp+7*16] 993 mova m13, m5 994 mova m14, m6 995 mova m15, m7 996%endif 997 998 ; coef values 999 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] 1000 pxor m1, m1 1001 pcmpgtb m1, m0 1002 punpckhbw m2, m0, m1 1003 punpcklbw m0, m1 1004 pinsrw m2, [base+round_vals-12+shiftq*2], 5 1005 1006 pshufd m6, m0, q0000 1007 pshufd m7, m0, q1111 1008 pshufd m1, m0, q3333 1009 pshufd m0, m0, q2222 1010 pshufd m3, m2, q1111 1011 pshufd m4, m2, q2222 1012 pshufd m2, m2, q0000 1013 1014%if ARCH_X86_64 1015 SWAP 0, 8 1016 SWAP 1, 9 1017 SWAP 2, 10 1018 SWAP 3, 11 1019 SWAP 4, 12 1020%else 1021%define m8 [rsp+0*16] 1022%define m9 [rsp+1*16] 1023%define m10 [rsp+2*16] 1024%define m11 [rsp+3*16] 1025%define m12 [rsp+4*16] 1026 mova m8, m0 1027 mova m9, m1 1028 mova m10, m2 1029 mova m11, m3 1030 mova m12, m4 1031%endif 1032 1033%if ARCH_X86_64 1034 DEFINE_ARGS buf, bufy, fg_data, h, x 1035%else 1036 DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x 1037%endif 1038%if %2 1039 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 1040%else 1041 sub bufq, 2*(82*69+3) 1042%endif 1043 add bufyq, 2*(79+82*3) 1044 mov hd, 70-35*%3 1045.y_loop_ar2: 1046 mov xq, -(76>>%2) 1047 1048.x_loop_ar2: 1049 movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] 1050 movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 1051 psrldq m4, m0, 2 ; y=-2,x=[-1,+5] 1052 psrldq m1, m0, 4 ; y=-2,x=[-0,+5] 1053 psrldq m3, m0, 6 ; y=-2,x=[+1,+5] 1054 psrldq m2, m0, 8 ; y=-2,x=[+2,+5] 1055 punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 1056 punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 1057 punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] 1058 pmaddwd m0, m6 1059 pmaddwd m1, m7 1060 pmaddwd m2, m8 1061 paddd m0, m1 1062 paddd m0, m2 1063 psrldq m3, m5, 2 ; y=-1,x=[-1,+5] 1064 psrldq m1, m5, 4 ; y=-1,x=[-0,+5] 1065 psrldq m4, m5, 6 ; y=-1,x=[+1,+5] 1066 psrldq m2, m5, 8 ; y=-1,x=[+2,+5] 1067 punpcklwd m3, m1 1068 punpcklwd m4, m2 1069 pmaddwd m3, m9 1070 pmaddwd m4, m10 1071 paddd m3, 
m4 1072 paddd m0, m3 1073 1074 ; luma component & rounding 1075%if %2 1076 movu m1, [bufyq+xq*4] 1077%if %3 1078 movu m2, [bufyq+xq*4+82*2] 1079 phaddw m1, m2 1080 pshufd m2, m1, q3232 1081 paddw m1, m2 1082%else 1083 phaddw m1, m1 1084%endif 1085%if cpuflag(sse4) 1086 pmulhrsw m1, m15 1087%elif %3 1088 pmulhrsw m1, [base+pw_8192] 1089%else 1090 pmulhrsw m1, [base+pw_16384] 1091%endif 1092%else 1093 movq m1, [bufyq+xq*2] 1094%endif 1095 punpcklwd m1, [base+pw_1] 1096 pmaddwd m1, m12 1097 paddd m0, m1 1098 1099 movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] 1100 pshufd m2, m1, q3321 1101 pxor m3, m3 1102 pcmpgtw m3, m2 1103 punpcklwd m2, m3 ; y=0,x=[0,3] in dword 1104.x_loop_ar2_inner: 1105 pmaddwd m3, m1, m11 1106 paddd m3, m0 1107 psrldq m0, 4 ; shift top to next pixel 1108 psrad m3, [fg_dataq+FGData.ar_coeff_shift] 1109 ; we do not need to packssdw since we only care about one value 1110 paddd m3, m2 1111 packssdw m3, m3 1112 pminsw m3, m13 1113 pmaxsw m3, m14 1114 psrldq m1, 2 1115 pslldq m3, 2 1116 psrldq m2, 4 1117%if cpuflag(sse4) 1118 pblendw m1, m3, 00000010b 1119%else 1120 pand m1, m15 1121 pandn m4, m15, m3 1122 por m1, m4 1123%endif 1124 ; overwrite previous pixel, should be ok 1125 movd [bufq+xq*2-2], m1 1126 inc xq 1127 jz .x_loop_ar2_end 1128 test xq, 3 1129 jnz .x_loop_ar2_inner 1130 jmp .x_loop_ar2 1131 1132.x_loop_ar2_end: 1133 add bufq, 82*2 1134 add bufyq, 82*2<<%3 1135 dec hd 1136 jg .y_loop_ar2 1137%if ARCH_X86_32 1138%undef m13 1139%undef m14 1140%undef m15 1141%endif 1142 RET 1143 1144.ar3: 1145%if ARCH_X86_64 1146 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 1147%if WIN64 1148 mov r6, rsp 1149 and rsp, ~15 1150 sub rsp, 96 1151 %define tmp rsp 1152%else 1153 %define tmp rsp+stack_offset-120 1154%endif 1155%else 1156 DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 1157 ALLOC_STACK -16*14 1158 mov bufyq, r1m 1159 mov uvd, r3m 1160 %define tmp rsp 1161%endif 1162 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 1163 imul uvd, 28 1164 SPLATW m4, 
[base+round_vals-12+shiftq*2] 1165 pxor m5, m5 1166 pcmpgtw m5, m4 1167 punpcklwd m4, m5 1168%if ARCH_X86_64 1169 sar bdmaxd, 1 1170 SPLATW m6, bdmaxd ; max_grain 1171%else 1172 SPLATW m6, r4m 1173 psraw m6, 1 1174%endif 1175 pcmpeqw m7, m7 1176%if !cpuflag(sse4) 1177 pcmpeqw m3, m3 1178 psrldq m3, 14 1179 pslldq m3, 4 1180 pxor m3, m7 1181%endif 1182 pxor m7, m6 ; min_grain 1183%if %2 && cpuflag(sse4) 1184 SPLATW m3, [base+hmul_bits+2+%3*2] 1185%endif 1186 1187%if ARCH_X86_64 1188 SWAP 3, 11 1189 SWAP 4, 12 1190 SWAP 6, 14 1191 SWAP 7, 15 1192%else 1193%define m11 [rsp+ 9*16] 1194%define m12 [rsp+10*16] 1195%define m14 [rsp+12*16] 1196%define m15 [rsp+13*16] 1197 mova m11, m3 1198 mova m12, m4 1199 mova m14, m6 1200 mova m15, m7 1201%endif 1202 1203 ; cf from y=-3,x=-3 until y=-3,x=-2 1204 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] 1205 pxor m1, m1 1206 pcmpgtb m1, m0 1207 punpckhbw m2, m0, m1 1208 punpcklbw m0, m1 1209 pshufd m1, m0, q0000 1210 pshufd m3, m0, q1111 1211 pshufd m4, m0, q2222 1212 pshufd m0, m0, q3333 1213 pshufd m5, m2, q0000 1214 pshufd m6, m2, q1111 1215 mova [tmp+16*0], m1 1216 mova [tmp+16*1], m3 1217 mova [tmp+16*2], m4 1218 mova [tmp+16*3], m0 1219 mova [tmp+16*4], m5 1220 mova [tmp+16*5], m6 1221 pshufd m6, m2, q2222 1222 pshufd m7, m2, q3333 1223 1224 ; cf from y=-1,x=-1 to y=0,x=-1 + luma component 1225 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] 1226 pxor m1, m1 1227 pcmpgtb m1, m0 1228 punpckhbw m2, m0, m1 ; luma 1229 punpcklbw m0, m1 1230 pshufd m3, m0, q3232 1231 psrldq m5, m0, 10 1232 ; y=0,x=[-3 to -1] + "1.0" for current pixel 1233 pinsrw m5, [base+round_vals-10+shiftq*2], 3 1234 ; y=-1,x=[-1 to +2] 1235 pshufd m1, m0, q0000 1236 pshufd m0, m0, q1111 1237 ; y=-1,x=+3 + luma 1238 punpcklwd m3, m2 1239 pshufd m3, m3, q0000 1240 1241%if ARCH_X86_64 1242 SWAP 1, 8 1243 SWAP 0, 9 1244 SWAP 3, 10 1245 SWAP 5, 13 1246 DEFINE_ARGS buf, bufy, fg_data, h, x 1247%else 1248%define m8 [rsp+ 6*16] 1249%define m9 [rsp+ 7*16] 1250%define 
m10 [rsp+ 8*16] 1251%define m13 [rsp+11*16] 1252 mova m8, m1 1253 mova m9, m0 1254 mova m10, m3 1255 mova m13, m5 1256 DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x 1257%endif 1258%if %2 1259 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 1260%else 1261 sub bufq, 2*(82*69+3) 1262%endif 1263 add bufyq, 2*(79+82*3) 1264 mov hd, 70-35*%3 1265.y_loop_ar3: 1266 mov xq, -(76>>%2) 1267 1268.x_loop_ar3: 1269 ; first line 1270 movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] 1271 movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] 1272 palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] 1273 palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] 1274 punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] 1275 punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] 1276 shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] 1277 1278 pmaddwd m0, [tmp+0*16] 1279 pmaddwd m2, [tmp+1*16] 1280 pmaddwd m3, [tmp+2*16] 1281 paddd m0, m2 1282 paddd m0, m3 ; first 6 x of top y 1283 1284 ; second line [m0/1 are busy] 1285 movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] 1286 movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] 1287 punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] 1288 palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] 1289 palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] 1290 punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] 1291 punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 1292 shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 1293 pmaddwd m1, [tmp+3*16] 1294 pmaddwd m4, [tmp+4*16] 1295 pmaddwd m3, [tmp+5*16] 1296 pmaddwd m5, m6 1297 paddd m1, m4 1298 paddd m3, m5 1299 paddd m0, m1 1300 paddd m0, m3 ; top 2 lines 1301 1302 ; third line [m0 is busy] & luma + round 1303 movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] 1304 movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] 1305%if %2 1306 movu m5, [bufyq+xq*4] 1307%if %3 1308 movu m4, [bufyq+xq*4+82*2] 1309 phaddw m5, m4 1310%else 1311 phaddw m5, m5 1312%endif 1313%else 1314 movq m5, [bufyq+xq*2] 1315%endif 1316 palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] 1317 
palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] 1318%if %3 1319 pshufd m4, m5, q3232 1320 paddw m5, m4 1321%endif 1322%if %2 1323%if cpuflag(sse4) 1324 pmulhrsw m5, m11 1325%elif %3 1326 pmulhrsw m5, [base+pw_8192] 1327%else 1328 pmulhrsw m5, [base+pw_16384] 1329%endif 1330%endif 1331 punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] 1332 punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] 1333 shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] 1334 punpcklwd m2, m5 1335 pmaddwd m1, m7 1336 pmaddwd m3, m8 1337 pmaddwd m4, m9 1338 pmaddwd m2, m10 1339 paddd m1, m3 1340 paddd m4, m2 1341 paddd m0, m12 ; += round 1342 paddd m1, m4 1343 paddd m0, m1 1344 1345 movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] 1346.x_loop_ar3_inner: 1347 pmaddwd m2, m1, m13 1348 pshufd m3, m2, q1111 1349 paddd m2, m3 ; left+cur 1350 paddd m2, m0 ; add top 1351 psrldq m0, 4 1352 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 1353 packssdw m2, m2 1354 pminsw m2, m14 1355 pmaxsw m2, m15 1356 pslldq m2, 4 1357 psrldq m1, 2 1358%if cpuflag(sse4) 1359 pblendw m1, m2, 00000100b 1360%else 1361 pand m1, m11 1362 pandn m3, m11, m2 1363 por m1, m3 1364%endif 1365 ; overwrite previous pixels, should be ok 1366 movq [bufq+xq*2-4], m1 1367 inc xq 1368 jz .x_loop_ar3_end 1369 test xq, 3 1370 jnz .x_loop_ar3_inner 1371 jmp .x_loop_ar3 1372 1373.x_loop_ar3_end: 1374 add bufq, 82*2 1375 add bufyq, 82*2<<%3 1376 dec hd 1377 jg .y_loop_ar3 1378%if WIN64 1379 mov rsp, r6 1380%elif ARCH_X86_32 1381%undef m8 1382%undef m9 1383%undef m10 1384%undef m11 1385%undef m12 1386%undef m13 1387%undef m14 1388%undef m15 1389%endif 1390 RET 1391%endmacro 1392 1393generate_grain_uv_fn 420, 1, 1 1394generate_grain_uv_fn 422, 1, 0 1395generate_grain_uv_fn 444, 0, 0 1396 1397%macro SCRATCH 3 1398%if ARCH_X86_32 1399 mova [rsp+%3*mmsize], m%1 1400%define m%2 [rsp+%3*mmsize] 1401%else 1402 SWAP %1, %2 1403%endif 1404%endmacro 1405 1406INIT_XMM ssse3 1407%if ARCH_X86_32 1408%if STACK_ALIGNMENT < mmsize 1409cglobal fgy_32x32xn_16bpc, 0, 
7, 8, 0-(8 * mmsize + 12 * gprsize), \
                dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    ; (x86-32, stack less aligned than mmsize: 8 xmm spill slots live at
    ; [rsp+0..8*mmsize), the relocated arguments above them)
    mov          r0, r0m
    mov          r1, r2m
    mov          r2, r4m
    mov          r3, r6m
    mov          r4, r7m
    mov          r5, r8m

%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]

    mov         r0m, r0
    mov         r2m, r1
    mov         r4m, r2
    mov         r6m, r3
    mov         r7m, r4
    mov         r8m, r5
%else
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
                dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    mov        srcq, srcm
    mov    scalingq, r5m
    mov    fg_dataq, r3m
%if STACK_ALIGNMENT < mmsize
    mov          r6, r9m

%define r9m [rsp+8*mmsize+ 4*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]

    mov         r9m, r6
%endif
    ; PIC base pointer for x86-32 (base = r5-$$); kept in r5m across loops
    LEA          r5, $$
%define base r5-$$
    mov         r5m, picptrq
%else
cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea          r8, [pb_mask]
%define base r8-pb_mask
%endif
    ; m3 = 1 << (15 - scaling_shift) for pmulhrsw-based noise rounding
    mov         r6d, [fg_dataq+FGData.scaling_shift]
    SPLATW       m3, [base+mul_bits+r6*2-14]
    mov         r6d, [fg_dataq+FGData.clip_to_restricted_range]
%if ARCH_X86_32
    DECLARE_REG_TMP 0, 3
%else
    DECLARE_REG_TMP 9, 10
%endif
    ; select min/max clip values from the 10/12bpc, full/restricted tables
    mov         t0d, r9m                ; bdmax
    sar         t0d, 11                 ; is_12bpc
    inc         t0d
    mov         t1d, r6d
    imul        t1d, t0d                ; index into min[] table
    dec         t0d
    SPLATW       m5, [base+min+t1*2]
    lea         t0d, [t0d*3]
    lea         t0d, [r6d*2+t0d]        ; index into max[] table
    SPLATW       m4, [base+max+t0*2]
    SPLATW       m2, r9m                ; bdmax, used as a pixel mask below

    pcmpeqw      m1, m1
    psraw        m7, m2, 1              ; max_grain = bdmax >> 1
    pxor         m1, m7                 ; min_grain = ~max_grain
    SPLATD       m6, [base+pd_16]       ; rounding constant for overlap blend

    ; pin the constants: on x86-64 they move to m9-m15, on x86-32 they are
    ; spilled to the stack slots reserved above
    SCRATCH       1, 9, 0
    SCRATCH       2, 10, 1
    SCRATCH       3, 11, 2
    SCRATCH       4, 12, 3
    SCRATCH       5, 13, 4
    SCRATCH       6, 14, 5
    SCRATCH       7, 15, 6

    mova         m6, [base+pw_27_17_17_27]   ; for horizontal filter

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
    DECLARE_REG_TMP 0
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
    DECLARE_REG_TMP 7
%endif

    ; r8m keeps the overlap state bits for the whole call:
    ; bit 0 = overlap_flag, bit 1 = row has vertical overlap, bit 2 = odd column
    mov        sbyd, r8m
    movzx       t0d, byte [fg_dataq+FGData.overlap_flag]
    test        t0d, t0d
    jz .no_vertical_overlap
    test       sbyd, sbyd
    jnz .vertical_overlap
.no_vertical_overlap:
    mov   dword r8m, t0d

    ; seed the grain PRNG from sby and FGData.seed (row-independent path)
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul       seed, (173 << 24) | 37
%else
    imul       seed, sbyd, (173 << 24) | 37
%endif
    add        seed, (105 << 24) | 178
    rol        seed, 8
    movzx      seed, seew
    xor        seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov         r3m, seed
    mov          wq, r4m
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    ; iterate right-to-left via negative wq; dst is kept as dst-src so that
    ; a single srcq advance walks both buffers
    lea    src_bakq, [srcq+wq*2]
    mov        r9mp, src_bakq
    neg          wq
    sub       dstmp, srcq
%if ARCH_X86_32
    mov         r4m, wq
%endif

.loop_x:
%if ARCH_X86_32
    mov        seed, r3m
%endif
    ; advance the 16-bit LFSR-style seed (parity of masked bits -> new top bit)
    mov         r6d, seed
    or         seed, 0xEFF4
    shr         r6d, 1
    test       seeb, seeh
    lea        seed, [r6+0x8000]
    cmovp      seed, r6d                ; updated seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    mov       offyd, seed
    mov       offxd, seed
%endif
    ; derive the grain_lut offset for this block from the seed
    ror       offyd, 8
    shr       offxd, 12
    and       offyd, 0xf
    imul      offyd, 164
    lea       offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak
%endif

; process one 16-pixel-wide column with no overlap blending
.loop_x_odd:
    movzx        hd, word r7m
    mov  grain_lutq, grain_lutmp
.loop_y:
    ; src
    pand         m0, m10, [srcq+ 0]
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw   m2, m0, scalingq-1, r0, r5, 8, 1, m4
    vpgatherdw   m3, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw   m2, m0, scalingq-1, r11, r13, 8, 1, m4
    vpgatherdw   m3, m1, scalingq-1, r11, r13, 8, 1, m4
%endif
    REPX {psrlw x, 8}, m2, m3

    ; grain = grain_lut[offy+y][offx+x]
    movu         m4, [grain_lutq+offxyq*2]
    movu         m5, [grain_lutq+offxyq*2+16]

    ; noise = round2(scaling[src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw     m4, m2
    pmulhrsw     m5, m3

    ; dst = clip_pixel(src, noise)
    paddw        m0, m4
    paddw        m1, m5
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp               ; src += stride
    add  grain_lutq, 82*2
    dec          hd
    jg .loop_y

%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    ; toggle the odd-column bit; 32x32 blocks are processed as two 16-wide
    ; halves, only the second half advances the seed/offsets
    btc   dword r8m, 2
    jc .next_blk
    add      offxyd, 16
    test  dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add        r12d, 16                 ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.next_blk:
    test  dword r8m, 1
    jz .loop_x

    ; r8m = sbym
    test  dword r8m, 2
    jnz .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
    ; keep the previous column's offxy on the stack for the left-edge blend
    add      offxyd, 16
    mov [rsp+8*mmsize+0*gprsize], offxyd
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
    mov        seed, r3m
%endif

    ; advance the seed (same update as .loop_x)
    mov         r6d, seed
    or         seed, 0xEFF4
    shr         r6d, 1
    test       seeb, seeh
    lea        seed, [r6+0x8000]
    cmovp      seed, r6d                ; updated seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy

    lea left_offxyd, [offyd+16]         ; previous column's offy*stride+offx

    mov       offyd, seed
    mov       offxd, seed
%endif
    ror       offyd, 8
    shr       offxd, 12
    and       offyd, 0xf
    imul      offyd, 164
    lea       offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy
%endif

    mov          hd, dword r7m
    mov  grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; blend the left edge (27/17 weights in m6) with the previous column
    movu         m5, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov          r5, [rsp+8*mmsize+0*gprsize]
    movd         m4, [grain_lutq+r5*2]
%else
    movd         m4, [grain_lutq+left_offxyq*2]
%endif
    punpcklwd    m4, m5
    pmaddwd      m4, m6
    paddd        m4, m14
    psrad        m4, 5
    packssdw     m4, m4
    pminsw       m4, m15
    pmaxsw       m4, m9
    shufps       m4, m5, q3210          ; splice blended pair into grain row

    ; src
    pand         m0, m10, [srcq+ 0]
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw   m2, m0, scalingq-1, r0, r5, 8, 1, m5
    vpgatherdw   m3, m1, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw   m2, m0, scalingq-1, r13, r14, 8, 1, m5
    vpgatherdw   m3, m1, scalingq-1, r13, r14, 8, 1, m5
%endif
    REPX {psrlw x, 8}, m2, m3

    ; noise = round2(scaling[src] * grain, scaling_shift)
    movu         m5, [grain_lutq+offxyq*2+16]
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw     m4, m2
    pmulhrsw     m5, m3

    ; dst = clip_pixel(src, noise)
    paddw        m0, m4
    paddw        m1, m5
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp
    add  grain_lutq, 82*2
    dec          hd
    jg .loop_y_h_overlap

%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    or    dword r8m, 4
    add      offxyd, 16

    ; r8m = sbym
    test  dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add        r12d, 16                 ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET

; rows below the first superblock row: blend the top edge with the row above,
; which requires tracking a second (top) seed/offset pair
.vertical_overlap:
    or          t0d, 2
    mov         r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
%endif

    ; compute (cur_seed << 16) | top_seed in a single register by running
    ; the per-row seed derivation on packed 16-bit halves
    movzx      sbyd, sbyb
%if ARCH_X86_32
    imul         r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
%else
    imul       seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul        t0d, sbyd, 173 * 0x00010001
    imul       sbyd, 37 * 0x01000100
    add         t0d, (105 << 16) | 188
    add        sbyd, (178 << 24) | (141 << 8)
    and         t0d, 0x00ff00ff
    and        sbyd, 0xff00ff00
    xor        seed, t0d
%if ARCH_X86_32
    xor        sbyd, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov         r3m, seed
    mov          wq, r4m
%else
    xor        seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    lea    src_bakq, [srcq+wq*2]
    mov        r9mp, src_bakq
    neg          wq
    sub       dstmp, srcq
%if ARCH_X86_32
    mov         r4m, wq
%endif

.loop_x_v_overlap:
%if ARCH_X86_32
    mov          r5, r5m
    SPLATD       m7, [base+pw_27_17_17_27]
    mov        seed, r3m
%else
    SPLATD       m7, [pw_27_17_17_27]
%endif

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; advance both packed seeds at once, preserving each half's parity bit
    mov         r6d, seed
    or         seed, 0xeff4eff4
    test       seeb, seeh
    setp        t0b                     ; parity of top_seed
    shr        seed, 16
    shl         t0d, 16
    test       seeb, seeh
    setp        t0b                     ; parity of cur_seed
    or          r6d, 0x00010001
    xor         t0d, r6d
    mov        seed, t0d
    ror        seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    mov       offyd, seed
    mov       offxd, seed
%endif
    ; both halves carry an offset; masks keep cur and top fields separate
    ror       offyd, 8
    ror       offxd, 12
    and       offyd, 0xf000f
    and       offxd, 0xf000f
    imul      offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea       offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy
%endif

    movzx top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr      offxyd, 16

.loop_x_odd_v_overlap:
%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)]
    mov          hd, dword r7m
    mov  grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; vertically blend with the row above (weights in m7)
    movu         m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov          r5, [rsp+8*mmsize+1*gprsize]
    movu         m2, [grain_lutq+r5*2]
%else
    movu         m2, [grain_lutq+top_offxyq*2]
%endif
    punpckhwd    m4, m2, m3
    punpcklwd    m2, m3
    REPX {pmaddwd x, m7}, m4, m2
    REPX {paddd  x, m14}, m4, m2
    REPX {psrad  x, 5}, m4, m2
    packssdw     m2, m4
    pminsw       m2, m15
    pmaxsw       m2, m9
    movu         m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu         m3, [grain_lutq+r5*2+16]
%else
    movu         m3, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd    m5, m3, m4
    punpcklwd    m3, m4
    REPX {pmaddwd x, m7}, m5, m3
    REPX {paddd  x, m14}, m5, m3
    REPX {psrad  x, 5}, m5, m3
    packssdw     m3, m5
    pminsw       m3, m15
    pmaxsw       m3, m9

    ; src
    pand         m0, m10, [srcq+ 0]     ; m0-1: src as word
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw   m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw   m4, m0, scalingq-1, r11, r13, 8, 1, m5
%endif
    psrlw        m4, 8
    pmullw       m4, m11
    pmulhrsw     m4, m2
%if ARCH_X86_32
    vpgatherdw   m5, m1, scalingq-1, r0, r5, 8, 1, m2
%else
    vpgatherdw   m5, m1, scalingq-1, r11, r13, 8, 1, m2
%endif
    psrlw        m5, 8
    pmullw       m5, m11
    pmulhrsw     m5, m3

    ; dst = clip_pixel(src, noise)
    paddw        m0, m4
    paddw        m1, m5
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp
    add  grain_lutq, 82*2
    dec          hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)+4]
    xor          hd, 0x10000            ; hd's high bit flags which overlap row
    test         hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    ; toggle the odd-column bit (see .loop_x_odd): even halves reuse the
    ; current offsets shifted by 16, odd halves derive fresh ones
    btc   dword r8m, 2
    jc .next_blk_v
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add top_offxyd, 16
%endif
    add      offxyd, 16
    jmp .loop_x_odd_v_overlap

.next_blk_v:
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

; column with both horizontal and vertical overlap: needs left, top and
; topleft offsets in addition to the current one
.loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    ; stash left/topleft offsets on the stack (no spare registers)
    mov          r0, [rsp+8*mmsize+1*gprsize]
    add          r3, 16
    add          r0, 16
    mov [rsp+8*mmsize+0*gprsize], r3    ; left_offxy
    mov [rsp+8*mmsize+2*gprsize], r0    ; topleft_offxy

    mov        seed, r3m
    xor          r0, r0
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; advance the packed (cur_seed << 16) | top_seed pair, as in
    ; .loop_x_v_overlap
    mov         r6d, seed
    or         seed, 0xeff4eff4
    test       seeb, seeh
    setp        t0b                     ; parity of top_seed
    shr        seed, 16
    shl         t0d, 16
    test       seeb, seeh
    setp        t0b                     ; parity of cur_seed
    or          r6d, 0x00010001
    xor         t0d, r6d
    mov        seed, t0d
    ror        seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov       offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    lea topleft_offxyq, [top_offxyq+16]
    lea left_offxyq, [offyq+16]
    mov       offyd, seed
    mov       offxd, seed
%endif
    ror       offyd, 8
    ror       offxd, 12
    and       offyd, 0xf000f
    and       offxd, 0xf000f
    imul      offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea       offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
%endif

    movzx top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr      offxyd, 16

%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)]

    movzx        hd, word r7m
    mov  grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu         m2, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov          r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
    mov          r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
    movu         m4, [grain_lutq+r0*2]
    movd         m5, [grain_lutq+r5*2]
    mov          r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    movd         m3, [grain_lutq+r5*2]
%else
    movu         m4, [grain_lutq+top_offxyq*2]
    movd         m5, [grain_lutq+left_offxyq*2]
    movd         m3, [grain_lutq+topleft_offxyq*2]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd    m5, m2
    punpcklwd    m3, m4
    REPX {pmaddwd x, m6}, m5, m3
    REPX {paddd  x, m14}, m5, m3
    REPX {psrad  x, 5}, m5, m3
    packssdw     m5, m3
    pminsw       m5, m15
    pmaxsw       m5, m9
    shufps       m3, m5, m2, q3210      ; cur row with blended left edge
    shufps       m5, m4, q3232          ; top row with blended left edge
    ; followed by v interpolation (top | cur -> cur)
    movu         m0, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu         m1, [grain_lutq+r0*2+16]
%else
    movu         m1, [grain_lutq+top_offxyq*2+16]
%endif
    punpcklwd    m2, m5, m3
    punpckhwd    m5, m3
    punpcklwd    m3, m1, m0
    punpckhwd    m1, m0
    REPX {pmaddwd x, m7}, m2, m5, m3, m1
    REPX {paddd  x, m14}, m2, m5, m3, m1
    REPX {psrad  x, 5}, m2, m5, m3, m1
    packssdw     m2, m5
    packssdw     m3, m1
    REPX {pminsw x, m15}, m2, m3
    REPX {pmaxsw x, m9}, m2, m3

    ; src
    pand         m0, m10, [srcq+ 0]
    pand         m1, m10, [srcq+16]     ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw   m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw   m4, m0, scalingq-1, r14, r10, 8, 1, m5
%endif
    psrlw        m4, 8
    pmullw       m4, m11
    pmulhrsw     m2, m4
%if ARCH_X86_32
    vpgatherdw   m5, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw   m5, m1, scalingq-1, r14, r10, 8, 1, m4
%endif
    psrlw        m5, 8
    pmullw       m5, m11
    pmulhrsw     m3, m5

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    movifnidn  dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add        srcq, r2mp
    add  grain_lutq, 82*2
    dec          hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov          r5, r5m
%endif
    SPLATD       m7, [PIC_ptr(pw_27_17_17_27)+4]
    xor          hd, 0x10000
    test         hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    or    dword r8m, 4
%if ARCH_X86_32
    add        r4mp, 16
%else
    add          wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov          r5, r5m
    add      offxyd, 16
    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
    mov        srcq, r9mp
    add        srcq, r4mp
    add        srcq, r4mp
%else
    add      offxyd, 16
    add top_offxyd, 16
    mov    src_bakq, r9mp
    lea        srcq, [src_bakq+wq*2]
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET
%if ARCH_X86_32
    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
%endif

; chroma variant, instantiated per chroma subsampling mode; the macro body
; continues beyond this chunk
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8,
0-(8 * mmsize + 16 * gprsize), \ 2208 tmp, src, scaling, h, fg_data, picptr, unused 2209 mov r0, r0m 2210 mov r1, r1m 2211 mov r2, r2m 2212 mov r4, r3m 2213 mov r3, r4m 2214 mov r5, r5m 2215%define r0m [rsp+8*mmsize+ 3*gprsize] 2216%define r1m [rsp+8*mmsize+ 4*gprsize] 2217%define r2m [rsp+8*mmsize+ 5*gprsize] 2218%define r3m [rsp+8*mmsize+ 6*gprsize] 2219%define r4m [rsp+8*mmsize+ 7*gprsize] 2220%define r5m [rsp+8*mmsize+ 8*gprsize] 2221 mov r0m, r0 2222 mov r2m, r2 2223 mov r4m, r3 2224 mov r5m, r5 2225 2226 mov r0, r6m 2227 mov r2, r7m 2228 mov r3, r8m 2229 mov r5, r9m 2230%define r6m [rsp+8*mmsize+ 9*gprsize] 2231%define r7m [rsp+8*mmsize+10*gprsize] 2232%define r8m [rsp+8*mmsize+11*gprsize] 2233%define r9m [rsp+8*mmsize+12*gprsize] 2234 mov r6m, r0 2235 mov r7m, r2 2236 mov r8m, r3 2237 mov r9m, r5 2238 2239 mov r2, r10m 2240 mov r3, r11m 2241 mov r5, r12m 2242 mov r0, r13m 2243%define r10m [rsp+8*mmsize+13*gprsize] 2244%define r11m [rsp+8*mmsize+14*gprsize] 2245%define r12m [rsp+8*mmsize+15*gprsize] 2246 mov r10m, r2 2247 mov r11m, r3 2248 mov r12m, r5 2249 2250 SPLATW m2, r13m 2251%else 2252cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ 2253 tmp, src, scaling, h, fg_data, picptr, unused 2254 mov srcq, srcm 2255 mov fg_dataq, r3m 2256%endif 2257 LEA r5, $$ 2258%define base r5-$$ 2259 2260 DECLARE_REG_TMP 0, 2, 3 2261%else 2262cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2263 grain_lut, h, sby, luma, lstride, uv_pl, is_id 2264%define base r8-pb_mask 2265 lea r8, [pb_mask] 2266 2267 DECLARE_REG_TMP 9, 10, 11 2268%endif 2269 mov r6d, [fg_dataq+FGData.scaling_shift] 2270 SPLATW m3, [base+mul_bits+r6*2-14] 2271 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2272%if STACK_ALIGNMENT >= mmsize 2273 mov t0d, r13m ; bdmax 2274%endif 2275 sar t0d, 11 ; is_12bpc 2276 inc t0d 2277 mov t1d, r6d 2278 imul t1d, t0d 2279 dec t0d 2280 SPLATW m5, [base+min+t1*2] 2281 lea t1d, [t0d*3] 2282 mov t2d, r12m 2283 inc 
t2d 2284 imul r6d, t2d 2285 add t1d, r6d 2286 SPLATW m4, [base+max+t1*2] 2287%if STACK_ALIGNMENT >= mmsize 2288 SPLATW m2, r13m 2289%endif 2290 2291 SCRATCH 2, 10, 2 2292 SCRATCH 3, 11, 3 2293 SCRATCH 4, 12, 4 2294 SCRATCH 5, 13, 5 2295 2296%define mzero m7 2297 2298%if %3 2299 SPLATD m2, [base+pw_23_22] 2300%endif 2301 2302%if ARCH_X86_32 2303 mov scalingq, r5m 2304 mov r5m, r5 2305%else 2306 mov r13mp, strideq 2307%endif 2308 2309 pcmpeqw m0, m0 2310 psraw m1, m10, 1 2311 pxor m0, m1 2312 2313 SCRATCH 0, 8, 0 2314 SCRATCH 1, 9, 1 2315 2316 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2317 jne .csfl 2318 2319%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v 2320%if ARCH_X86_32 2321 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2322 2323 DECLARE_REG_TMP 0 2324%else 2325 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2326 2327 DECLARE_REG_TMP 9 2328%endif 2329 2330%if %1 2331 mov r6d, r11m 2332 SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] 2333 SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2334 punpcklwd m6, m1, m0 2335 SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] 2336 SPLATD m7, [base+pw_4+t0*4] 2337 pmullw m5, m7 2338%else 2339 SPLATD m6, [base+pd_16] 2340%if %2 2341 mova m5, [base+pw_23_22] 2342%else 2343 mova m5, [base+pw_27_17_17_27] 2344%endif 2345%endif 2346 2347 SCRATCH 6, 14, 6 2348 SCRATCH 5, 15, 7 2349 2350%if ARCH_X86_32 2351 DECLARE_REG_TMP 0 2352%else 2353 DECLARE_REG_TMP 7 2354%endif 2355 2356 mov sbyd, r8m 2357 mov t0d, [fg_dataq+FGData.overlap_flag] 2358 test t0d, t0d 2359 jz %%no_vertical_overlap 2360 test sbyd, sbyd 2361 jnz %%vertical_overlap 2362 2363%%no_vertical_overlap: 2364 mov r8m, t0d 2365%if ARCH_X86_32 2366 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap 2367 imul seed, (173 << 24) | 37 2368%else 2369 imul seed, sbyd, (173 << 24) | 37 2370%endif 2371 add seed, (105 << 24) | 178 2372 rol seed, 8 2373 movzx seed, seew 2374 xor seed, [fg_dataq+FGData.seed] 2375%if 
ARCH_X86_32 2376 mov r3m, seed 2377 2378 DEFINE_ARGS dst, src, scaling, see, w, picptr, luma 2379 2380 mov dstq, r0mp 2381 mov lumaq, r9mp 2382 mov wq, r4m 2383 lea r3, [srcq+wq*2] 2384 mov r1mp, r3 2385 lea r3, [dstq+wq*2] 2386 mov r11mp, r3 2387 lea r3, [lumaq+wq*(2<<%2)] 2388 mov r12mp, r3 2389%if %3 2390 shl r10mp, 1 2391%endif 2392%else 2393 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2394 unused2, unused3, see, unused4, unused5, unused6, luma, lstride 2395 2396 mov lstrideq, r10mp 2397%if %3 2398 add lstrideq, lstrideq 2399%endif 2400 mov lumaq, r9mp 2401 lea r10, [srcq+wq*2] 2402 lea r11, [dstq+wq*2] 2403 lea r12, [lumaq+wq*(2<<%2)] 2404 mov r10mp, r10 2405 mov r11mp, r11 2406 mov r12mp, r12 2407%endif 2408 neg wq 2409%if ARCH_X86_32 2410 mov r4mp, wq 2411%endif 2412 2413%%loop_x: 2414%if ARCH_X86_32 2415 mov seed, r3m 2416%endif 2417 2418 mov r6d, seed 2419 or seed, 0xEFF4 2420 shr r6d, 1 2421 test seeb, seeh 2422 lea seed, [r6+0x8000] 2423 cmovp seed, r6d ; updated seed 2424 2425%if ARCH_X86_32 2426 mov r3m, seed 2427 2428 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2429 2430 mov offxd, offyd 2431%else 2432 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2433 offx, offy, see, unused1, unused2, unused3, luma, lstride 2434 2435 mov offxd, seed 2436 mov offyd, seed 2437%endif 2438 ror offyd, 8 2439 shr offxd, 12 2440 and offyd, 0xf 2441 imul offyd, 164>>%3 2442 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2443 2444%if ARCH_X86_32 2445 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2446%else 2447 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2448 h, offxy, see, unused1, unused2, unused3, luma, lstride 2449%endif 2450 2451%if %2 == 0 2452%%loop_x_odd: 2453%endif 2454 mov hd, r7m 2455 mov grain_lutq, grain_lutmp 2456%%loop_y: 2457 ; src 2458 mova m0, [srcq] 2459 mova m1, [srcq+16] ; m0-1: src as word 2460 2461 ; luma_src 2462 pxor mzero, mzero 2463%if 
ARCH_X86_32 2464 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut 2465 2466 mov lumaq, r9m 2467%endif 2468 mova m4, [lumaq+ 0] 2469 mova m6, [lumaq+(16<<%2)] 2470%if %2 2471 phaddw m4, [lumaq+16] 2472 phaddw m6, [lumaq+48] 2473%endif 2474%if ARCH_X86_32 2475 add lumaq, r10mp 2476 mov r9m, lumaq 2477%endif 2478%if %2 2479 pavgw m4, mzero 2480 pavgw m6, mzero 2481%endif 2482 2483%if %1 2484 punpckhwd m3, m4, m0 2485 punpcklwd m4, m0 2486 punpckhwd m5, m6, m1 2487 punpcklwd m6, m1 ; { luma, chroma } 2488 REPX {pmaddwd x, m14}, m3, m4, m5, m6 2489 REPX {psrad x, 6}, m3, m4, m5, m6 2490 packssdw m4, m3 2491 packssdw m6, m5 2492 REPX {paddw x, m15}, m4, m6 2493 REPX {pmaxsw x, mzero}, m4, m6 2494 REPX {pminsw x, m10}, m4, m6 ; clip_pixel() 2495%else 2496 REPX {pand x, m10}, m4, m6 2497%endif 2498 2499 ; scaling[luma_src] 2500%if ARCH_X86_32 2501 vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 2502 vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 2503%else 2504 vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 2505 vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 2506%endif 2507 REPX {psrlw x, 8}, m3, m5 2508 2509 ; grain = grain_lut[offy+y][offx+x] 2510 movu m4, [grain_lutq+offxyq*2] 2511 movu m6, [grain_lutq+offxyq*2+16] 2512 2513 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2514 REPX {pmullw x, m11}, m3, m5 2515 pmulhrsw m4, m3 2516 pmulhrsw m6, m5 2517 2518 ; dst = clip_pixel(src, noise) 2519 paddw m0, m4 2520 paddw m1, m6 2521 pmaxsw m0, m13 2522 pmaxsw m1, m13 2523 pminsw m0, m12 2524 pminsw m1, m12 2525 movifnidn dstq, dstmp 2526 mova [dstq+ 0], m0 2527 mova [dstq+16], m1 2528 2529%if ARCH_X86_32 2530 add srcq, r2mp 2531 add dstq, r2mp 2532 mov dstmp, dstq 2533%else 2534 add srcq, r13mp 2535 add dstq, r13mp 2536 add lumaq, lstrideq 2537%endif 2538 add grain_lutq, 82*2 2539 dec hd 2540 jg %%loop_y 2541 2542%if ARCH_X86_32 2543 DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma 2544 2545 mov wq, r4mp 2546%endif 2547 add wq, 16 2548 jge %%end 2549%if 
ARCH_X86_32 2550 mov srcq, r1mp 2551%else 2552 mov srcq, r10mp 2553%endif 2554 mov dstq, r11mp 2555 mov lumaq, r12mp 2556 lea srcq, [srcq+wq*2] 2557 lea dstq, [dstq+wq*2] 2558 lea lumaq, [lumaq+wq*(2<<%2)] 2559%if ARCH_X86_32 2560 mov r0m, dstq 2561 mov r9m, lumaq 2562 mov r4m, wq 2563%endif 2564%if %2 == 0 2565 btc dword r8m, 2 2566 jc %%next_blk 2567 add offxyd, 16 2568 test dword r8m, 2 2569 jz %%loop_x_odd 2570%if ARCH_X86_32 2571 add dword [rsp+8*mmsize+1*gprsize], 16 2572%else 2573 add r11d, 16 2574%endif 2575 jmp %%loop_x_odd_v_overlap 2576%%next_blk: 2577%endif 2578 test dword r8m, 1 2579 je %%loop_x 2580 2581 ; r8m = sbym 2582 test dword r8m, 2 2583 jnz %%loop_x_hv_overlap 2584 2585 ; horizontal overlap (without vertical overlap) 2586%%loop_x_h_overlap: 2587%if ARCH_X86_32 2588 add offxyd, 16 2589 mov [rsp+8*mmsize+0*gprsize], offxyd 2590 2591 DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut 2592 2593 mov seed, r3m 2594%endif 2595 mov r6d, seed 2596 or seed, 0xEFF4 2597 shr r6d, 1 2598 test seeb, seeh 2599 lea seed, [r6+0x8000] 2600 cmovp seed, r6d ; updated seed 2601 2602%if ARCH_X86_32 2603 mov r3m, seed 2604 2605 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2606 2607 mov offxd, offyd 2608%else 2609 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2610 offx, offy, see, left_offxy, unused1, unused2, luma, lstride 2611 2612 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2613 mov offxd, seed 2614 mov offyd, seed 2615%endif 2616 ror offyd, 8 2617 shr offxd, 12 2618 and offyd, 0xf 2619 imul offyd, 164>>%3 2620 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2621 2622%if ARCH_X86_32 2623 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2624%else 2625 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2626 h, offxy, see, left_offxy, unused1, unused2, luma, lstride 2627%endif 2628 2629 mov hd, r7m 2630 mov grain_lutq, grain_lutmp 2631%%loop_y_h_overlap: 2632 mova 
m0, [srcq] 2633 mova m1, [srcq+16] 2634 2635 ; luma_src 2636 pxor mzero, mzero 2637%if ARCH_X86_32 2638 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut 2639 mov lumaq, r9m 2640%endif 2641 mova m4, [lumaq+ 0] 2642 mova m6, [lumaq+(16<<%2)] 2643%if %2 2644 phaddw m4, [lumaq+16] 2645 phaddw m6, [lumaq+48] 2646%endif 2647%if ARCH_X86_32 2648 add lumaq, r10mp 2649 mov r9m, lumaq 2650%endif 2651%if %2 2652 pavgw m4, mzero 2653 pavgw m6, mzero 2654%endif 2655 2656%if %1 2657 punpckhwd m3, m4, m0 2658 punpcklwd m4, m0 2659 punpckhwd m5, m6, m1 2660 punpcklwd m6, m1 ; { luma, chroma } 2661 REPX {pmaddwd x, m14}, m3, m4, m5, m6 2662 REPX {psrad x, 6}, m3, m4, m5, m6 2663 packssdw m4, m3 2664 packssdw m6, m5 2665 REPX {paddw x, m15}, m4, m6 2666 REPX {pmaxsw x, mzero}, m4, m6 2667 REPX {pminsw x, m10}, m4, m6 ; clip_pixel() 2668%else 2669 REPX {pand x, m10}, m4, m6 2670%endif 2671 2672 ; grain = grain_lut[offy+y][offx+x] 2673 movu m7, [grain_lutq+offxyq*2] 2674%if ARCH_X86_32 2675 mov r5, [rsp+8*mmsize+0*gprsize] 2676 movd m5, [grain_lutq+r5*2] 2677%else 2678 movd m5, [grain_lutq+left_offxyq*2+ 0] 2679%endif 2680 punpcklwd m5, m7 ; {left0, cur0} 2681%if %1 2682%if ARCH_X86_32 2683 mov r5, r5m 2684%endif 2685%if %2 2686 pmaddwd m5, [PIC_ptr(pw_23_22)] 2687%else 2688 pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] 2689%endif 2690 paddd m5, [PIC_ptr(pd_16)] 2691%else 2692 pmaddwd m5, m15 2693 paddd m5, m14 2694%endif 2695 psrad m5, 5 2696 packssdw m5, m5 2697 pmaxsw m5, m8 2698 pminsw m5, m9 2699 shufps m5, m7, q3210 2700 movu m3, [grain_lutq+offxyq*2+16] 2701 2702 ; scaling[luma_src] 2703%if ARCH_X86_32 2704 vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 2705 vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 2706%else 2707 vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 2708 vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 2709%endif 2710 REPX {psrlw x, 8}, m7, m4 2711 2712 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2713 REPX {pmullw x, m11}, m7, m4 2714 pmulhrsw m5, m7 2715 
    ; NOTE(review): this is the tail of the macro-local %%FGUV_32x32xN_LOOP
    ; (grain application for chroma planes, 16bpc). From the FGUV_FN
    ; instantiations at the bottom of the file (420,1,1 / 422,1,0 / 444,0,0),
    ; %2 appears to be horizontal and %3 vertical chroma subsampling, and %1
    ; selects the chroma-scaling-from-luma (csfl) variant — confirm against
    ; the macro head above this chunk.
    pmulhrsw        m3, m4

    ; dst = clip_pixel(src, noise)
    paddw           m0, m5
    paddw           m1, m3
    pmaxsw          m0, m13
    pmaxsw          m1, m13
    pminsw          m0, m12
    pminsw          m1, m12
    movifnidn       dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

    ; advance src/dst (and luma on x86_64) by one row; on x86_32 the strides
    ; live in stack args (r2mp) and dst must be spilled back to dstmp
%if ARCH_X86_32
    add             srcq, r2mp
    add             dstq, r2mp
    mov             dstmp, dstq
%else
    add             srcq, r13mp
    add             dstq, r13mp
    add             lumaq, lstrideq
%endif
    add      grain_lutq, 82*2           ; grain_lut stride is 82 words
    dec              hd
    jg %%loop_y_h_overlap

    ; end of column: step 16 pixels right, restore per-column base pointers
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
    mov              wq, r4mp
%endif
    add              wq, 16
    jge %%end                           ; wq counts up from -width to 0
%if ARCH_X86_32
    mov            srcq, r1mp
%else
    mov            srcq, r10mp
%endif
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
    mov            r0mp, dstq
    mov            r9mp, lumaq
    mov             r4m, wq
%endif

%if %2
    ; r8m = sbym
    test      dword r8m, 2
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap
%else
    ; unsubsampled horizontally: alternate 16-pixel halves of a 32x32 block
    or        dword r8m, 4
    add          offxyd, 16

    ; r8m = sbym
    test      dword r8m, 2
    jz %%loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r11d, 16             ; top_offxy += 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

%%end:
    RET

    ; entry point for superblock rows that blend with the row above;
    ; first re-derives the per-row PRNG seed pair from FGData.seed and sby
%%vertical_overlap:
    or              t0d, 2
    mov             r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, unused1, unused2, unused3, lstride
%endif

    movzx          sbyd, sbyb
    ; compute (cur_seed << 16) | top_seed from the base seed and sby,
    ; doing both 16-bit lanes of the dword in parallel
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001

    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul            t0d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             t0d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             t0d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, t0d
%if ARCH_X86_32
    xor            sbyd, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma

    ; x86_32: spill seed and end-of-row pointers to stack slots
    mov             r3m, seed
    mov            dstq, r0mp
    mov           lumaq, r9mp
    mov              wq, r4m
    lea              r3, [srcq+wq*2]
    mov            r1mp, r3
    lea              r3, [dstq+wq*2]
    mov           r11mp, r3
    lea              r3, [lumaq+wq*(2<<%2)]
    mov           r12mp, r3
%if %3
    shl           r10mp, 1
%endif
%else
    xor            seed, sbyd           ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, unused3, unused4, unused5, luma, lstride

    mov        lstrideq, r10mp
%if %3
    add        lstrideq, lstrideq       ; vertically subsampled: luma advances 2 rows
%endif
    mov           lumaq, r9mp
    lea             r10, [srcq+wq*2]
    lea             r11, [dstq+wq*2]
    lea             r12, [lumaq+wq*(2<<%2)]
    mov           r10mp, r10
    mov           r11mp, r11
    mov           r12mp, r12
%endif
    neg              wq                 ; iterate wq from -width up to 0
%if ARCH_X86_32
    mov             r4m, wq
%endif

%%loop_x_v_overlap:
    ; update both 16-bit LFSR seeds (cur and top) one step, using the
    ; parity of the masked seed as the new top bit of each lane
%if ARCH_X86_32
    mov            seed, r3m
    xor             t0d, t0d
%else
    ; we assume from the block above that bits 8-15 of r7d are zeroed
%endif
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                 ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                 ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1              ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, unused1, top_offxy, unused2, luma, lstride

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; extract grain-table offsets for cur and top rows from the seed pair
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
%endif
    movzx   top_offxyd, offxyw          ; low word = top offset
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16             ; high word = cur offset

%if %2 == 0
%%loop_x_odd_v_overlap:
%endif
%if %3 == 0
%if ARCH_X86_32
    mov              r5, r5m            ; reload PIC base
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]  ; v-overlap blend weights
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; blend cur and top grain rows: interleave, pmaddwd with the 27/17
    ; (or 23/22) weights, then round2(x, 5)
    movu             m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r0, [rsp+mmsize*8+gprsize*1]   ; top_offxy
    movu             m5, [grain_lutq+r0*2]
%else
    movu             m5, [grain_lutq+top_offxyq*2]
%endif
    punpckhwd        m7, m5, m3
    punpcklwd        m5, m3             ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5
%if %1
%if ARCH_X86_32
    mov              r5, r5m
%endif
    REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5
%endif
    REPX {psrad x, 5}, m7, m5
    packssdw         m3, m5, m7
    pmaxsw           m3, m8             ; clamp blended grain to valid range
    pminsw           m3, m9

    ; grain = grain_lut[offy+y][offx+x]   (second 8 pixels)
    movu             m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m5, [grain_lutq+r0*2+16]
%else
    movu             m5, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd        m7, m5, m4
    punpcklwd        m5, m4             ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5
%endif
    REPX {psrad x, 5}, m7, m5
    packssdw         m4, m5, m7
    pmaxsw           m4, m8
    pminsw           m4, m9

    ; src
    mova             m0, [srcq]
    mova             m1, [srcq+16]

    ; luma_src — when horizontally subsampled (%2), average pairs via
    ; phaddw + pavgw to downscale luma to chroma resolution
    pxor          mzero, mzero
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut

    mov           lumaq, r9mp
%endif
    mova             m5, [lumaq+ 0]
    mova             m6, [lumaq+(16<<%2)]
%if %2
    phaddw           m5, [lumaq+16]
    phaddw           m6, [lumaq+48]
%endif
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
%if %2
    pavgw            m5, mzero
    pavgw            m6, mzero
%endif

%if %1
    ; !csfl: scaling index = clip(luma*w_luma + chroma*w_chroma + offset)
    punpckhwd        m7, m5, m0
    punpcklwd        m5, m0
    REPX {pmaddwd x, m14}, m7, m5
    REPX {psrad  x, 6},    m7, m5
    packssdw         m5, m7
    punpckhwd        m7, m6, m1
    punpcklwd        m6, m1             ; { luma, chroma }
    REPX {pmaddwd x, m14}, m7, m6
    REPX {psrad  x, 6},    m7, m6
    packssdw         m6, m7
    pxor          mzero, mzero
    REPX {paddw  x, m15},   m5, m6
    REPX {pmaxsw x, mzero}, m5, m6
    REPX {pminsw x, m10},   m5, m6      ; clip_pixel()
%else
    REPX {pand x, m10}, m5, m6          ; csfl: just mask to bitdepth max
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m5, scalingq-1, r0, r5, 8, 1
    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
%else
    vpgatherdw       m7, m5, scalingq-1, r10, r12, 8, 1
    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
%endif
    REPX {psrlw x, 8}, m7, m5

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m5
    pmulhrsw         m3, m7
    pmulhrsw         m4, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

    dec              hw
    jle %%end_y_v_overlap
%if ARCH_X86_32
    add            srcq, r2mp
    add            dstq, r2mp
    mov           dstmp, dstq
%else
    add            srcq, r13mp
    add            dstq, r13mp
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*2
%if %3
    jmp %%loop_y
%else
    ; not vertically subsampled: use bit 16 of hd as a 2-row toggle to
    ; switch from the first to the second pair of v-overlap weights
    btc              hd, 16
    jc %%loop_y
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
    jmp %%loop_y_v_overlap
%endif

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
%else
    mov            srcq, r10mp
%endif
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
    mov            r0mp, dstq
    mov            r9mp, lumaq
    mov             r4m, wq
%endif

%if %2
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
    btc       dword r8m, 2
    jc %%loop_x_hv_overlap
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r11d, 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

    ; blocks overlapping both the block above and to the left:
    ; adds left/topleft columns to the v-overlap blend
%%loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut

    mov             t0d, [rsp+mmsize*8+gprsize*1]   ; top_offxy
    add          offxyd, 16
    add             t0d, 16
    mov [rsp+mmsize*8+gprsize*0], offxyd            ; left_offxyd
    mov [rsp+mmsize*8+gprsize*2], t0d               ; topleft_offxyd

    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut

    mov            seed, r3m
    xor             t0d, t0d
%else
    ; we assume from the block above that bits 8-15 of r7d are zeroed
%endif
    ; same dual-LFSR seed update as %%loop_x_v_overlap
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                 ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                 ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1              ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride

    lea topleft_offxyq, [top_offxyq+16]
    lea    left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
%endif
    movzx   top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

%if %3 == 0
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; horizontal blend first: mix {left,topleft} columns into {cur,top},
    ; then vertical blend of the h-filtered rows as in v-overlap
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+0*gprsize]   ; left_offxy
    mov              r0, [rsp+8*mmsize+1*gprsize]   ; top_offxy
    movd             m5, [grain_lutq+r5*2]
%else
    movd             m5, [grain_lutq+left_offxyq*2]
%endif
    movu             m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+2*gprsize]   ; topleft_offxy
    movu             m4, [grain_lutq+r0*2]
%if %2
    pinsrw           m5, [grain_lutq+r5*2], 2
%else
    movd             m3, [grain_lutq+r5*2]
%endif
%else
    movu             m4, [grain_lutq+top_offxyq*2]
%if %2
    pinsrw           m5, [grain_lutq+topleft_offxyq*2], 2  ; { left, _, top/left }
%else
    movd             m3, [grain_lutq+topleft_offxyq*2]
%endif
%endif
%if %2 == 0
    punpckldq        m5, m3
%endif
    punpckldq        m3, m7, m4         ; { cur0/1,top0/1,cur2/3,top2/3 }
    punpcklwd        m5, m3             ; { left/cur0,_/cur1,topleft/top0,_/top1 }
%if %1
%if ARCH_X86_32
    mov              r5, r5m
%endif
%if %2
    movddup          m0, [PIC_ptr(pw_23_22)]        ; h-overlap weights (subsampled)
%else
    movddup          m0, [PIC_ptr(pw_27_17_17_27)]
%endif
%else
    pshufd           m0, m15, q1010
%endif
    pmaddwd          m5, m0
%if %1
    paddd            m5, [PIC_ptr(pd_16)]
%else
    paddd            m5, m14
%endif
    psrad            m5, 5
    packssdw         m5, m5
    pmaxsw           m5, m8
    pminsw           m5, m9
    ; splice the h-filtered first pixels back into the cur/top rows
    shufps           m5, m3, q3210      ; cur0/1,top0/1,cur2/3,top2/3
    shufps           m3, m5, m7, q3220  ; cur0-7 post-h_filter
    shufps           m5, m4, q3231      ; top0-7 post-h_filter

    punpckhwd        m7, m5, m3
    punpcklwd        m5, m3             ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
%else
    REPX {paddd x, m14}, m5, m7
%endif
    REPX {psrad x, 5}, m5, m7
    packssdw         m3, m5, m7
    pmaxsw           m3, m8
    pminsw           m3, m9

    ; right half — v-overlap only (no h-overlap past the first pixels)
    movu             m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m0, [grain_lutq+r0*2+16]
%else
    movu             m0, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd        m1, m0, m4
    punpcklwd        m0, m4             ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m1, m0
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
%else
    REPX {paddd x, m14}, m1, m0
%endif
    REPX {psrad x, 5}, m1, m0
    packssdw         m4, m0, m1
    pmaxsw           m4, m8
    pminsw           m4, m9

    ; src
    mova             m0, [srcq]
    mova             m1, [srcq+16]

    ; luma_src
    pxor          mzero, mzero
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut

    mov           lumaq, r9mp
%endif
    mova             m6, [lumaq+ 0]
    mova             m5, [lumaq+(16<<%2)]
%if %2
    phaddw           m6, [lumaq+16]
    phaddw           m5, [lumaq+48]
%endif
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
%if %2
    pavgw            m6, mzero
    pavgw            m5, mzero
%endif

%if %1
    punpckhwd        m7, m6, m0
    punpcklwd        m6, m0
    REPX {pmaddwd x, m14}, m7, m6
    REPX {psrad  x, 6},    m7, m6
    packssdw         m6, m7
    punpckhwd        m7, m5, m1
    punpcklwd        m5, m1             ; { luma, chroma }
    REPX {pmaddwd x, m14}, m7, m5
    REPX {psrad  x, 6},    m7, m5
    packssdw         m5, m7
    pxor          mzero, mzero
    REPX {paddw  x, m15},   m6, m5
    REPX {pmaxsw x, mzero}, m6, m5
    REPX {pminsw x, m10},   m6, m5      ; clip_pixel()
%else
    REPX {pand x, m10}, m6, m5
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m6, scalingq-1, r0, r5, 8, 1
    vpgatherdw       m6, m5, scalingq-1, r0, r5, 8, 1
%else
%if %3 == 0
    ; register shortage :)
    push            r12
%endif
    vpgatherdw       m7, m6, scalingq-1, r2, r12, 8, 1
    vpgatherdw       m6, m5, scalingq-1, r2, r12, 8, 1
%if %3 == 0
    pop             r12
%endif
%endif
    REPX {psrlw x, 8}, m7, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m6
    pmulhrsw         m3, m7
    pmulhrsw         m4, m6

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

%if ARCH_X86_32
    add            srcq, r2mp
    add            dstq, r2mp
    mov           dstmp, dstq
%else
    add            srcq, r13mp
    add            dstq, r13mp
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*2
    dec              hw
%if %3
    jg %%loop_y_h_overlap               ; after row 0, only h-overlap remains
%else
    jle %%end_y_hv_overlap
    btc              hd, 16             ; 2-row weight toggle, as in v-overlap
    jc %%loop_y_h_overlap
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
    jmp %%loop_y_hv_overlap
%%end_y_hv_overlap:
%endif
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
%else
    mov            srcq, r10mp
%endif
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
    mov           dstmp, dstq
    mov            r9mp, lumaq
    mov             r4m, wq
%endif
%if %2
    jmp %%loop_x_hv_overlap
%else
    or        dword r8m, 4
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r11d, 16             ; top_offxy += 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro

    ; instantiate the loop twice: %1=1 for the non-csfl path (entry falls
    ; through), %1=0 for the chroma-scaling-from-luma path at .csfl
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
%endmacro

; FGUV_FN layout, ss_hor, ss_ver  (subsampling flags feed %2/%3 above)
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0