; Copyright © 2019-2021, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28%include "x86/filmgrain_common.asm" 29 30SECTION_RODATA 31 32pw_1024: times 8 dw 1024 33pb_27_17_17_27: db 27, 17, 17, 27 34 times 6 db 0, 32 35pb_23_22_h: db 23, 22 36 times 7 db 0, 32 37pb_27_17: times 8 db 27, 17 38pb_17_27: times 8 db 17, 27 39pb_23_22: times 8 db 23, 22 40pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 41rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 42byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 43pw_seed_xor: times 2 dw 0xb524 44 times 2 dw 0x49d8 45pb_1: times 4 db 1 46hmul_bits: dw 32768, 16384, 8192, 4096 47round: dw 2048, 1024, 512 48mul_bits: dw 256, 128, 64, 32, 16 49round_vals: dw 32, 64, 128, 256, 512 50max: dw 255, 240, 235 51min: dw 0, 16 52pw_1: dw 1 53 54%macro JMP_TABLE 2-* 55 %xdefine %1_8bpc_%2_table %%table 56 %xdefine %%base %1_8bpc_%2_table 57 %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) 58 %%table: 59 %rep %0 - 2 60 dd %%prefix %+ .ar%3 - %%base 61 %rotate 1 62 %endrep 63%endmacro 64 65JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 66JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 67JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 68JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 69 70SECTION .text 71 72%if ARCH_X86_32 73%define PIC_ptr(a) base+a 74%else 75%define PIC_ptr(a) a 76%endif 77 78%macro SCRATCH 3 79%if ARCH_X86_32 80 mova [rsp+%3*mmsize], m%1 81%define m%2 [rsp+%3*mmsize] 82%else 83 SWAP %1, %2 84%endif 85%endmacro 86 87INIT_XMM ssse3 88cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data 89 LEA r4, $$ 90%define base r4-$$ 91 movq m1, [base+rnd_next_upperbit_mask] 92 movq m4, [base+mul_bits] 93 movq m7, [base+hmul_bits] 94 mov r2d, [fg_dataq+FGData.grain_scale_shift] 95 movd m2, [base+round+r2*2] 96 movd m0, [fg_dataq+FGData.seed] 97 mova m5, [base+pb_mask] 98 pshuflw m2, m2, q0000 99 pshuflw m0, m0, q0000 100 mov r2, -73*82 101 sub bufq, r2 102 lea r3, 
[base+gaussian_sequence] 103.loop: 104 pand m6, m0, m1 105 psrlw m3, m6, 10 106 por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 107 pmullw m6, m4 ; bits 0x0f00 are set 108 pshufb m3, m5, m6 ; set 15th bit for next 4 seeds 109 psllq m6, m3, 30 110 por m3, m6 111 psllq m6, m3, 15 112 por m3, m6 ; aggregate each bit into next seed's high bit 113 pmulhuw m6, m0, m7 114 por m3, m6 ; 4 next output seeds 115 pshuflw m0, m3, q3333 116 psrlw m3, 5 117%if ARCH_X86_64 118 movq r6, m3 119 mov r8, r6 120 movzx r5d, r6w 121 shr r6d, 16 122 shr r8, 32 123 movzx r7, r8w 124 shr r8, 16 125 126 movd m6, [r3+r5*2] 127 pinsrw m6, [r3+r6*2], 1 128 pinsrw m6, [r3+r7*2], 2 129 pinsrw m6, [r3+r8*2], 3 130%else 131 movd r6, m3 132 pshuflw m3, m3, q3232 133 movzx r5, r6w 134 shr r6, 16 135 136 movd m6, [r3+r5*2] 137 pinsrw m6, [r3+r6*2], 1 138 139 movd r6, m3 140 movzx r5, r6w 141 shr r6, 16 142 143 pinsrw m6, [r3+r5*2], 2 144 pinsrw m6, [r3+r6*2], 3 145%endif 146 pmulhrsw m6, m2 147 packsswb m6, m6 148 movd [bufq+r2], m6 149 add r2, 4 150 jl .loop 151 152 ; auto-regression code 153 movsxd r2, [fg_dataq+FGData.ar_coeff_lag] 154 movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] 155 lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table] 156 jmp r2 157 158.ar1: 159%if ARCH_X86_32 160 DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max 161%elif WIN64 162 DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 163 mov bufq, r0 164%else 165 DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 166%endif 167 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 168 movd m4, [fg_dataq+FGData.ar_coeffs_y] 169 mov ecx, [fg_dataq+FGData.ar_coeff_shift] 170%if ARCH_X86_32 171 mov r1m, cf3d 172 DEFINE_ARGS buf, shift, val3, min, max, x, val0 173%define hd r0mp 174%define cf3d r1mp 175%elif WIN64 176 DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 177%else 178 DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 179%endif 180 pxor m6, m6 181 pcmpgtb m7, m6, m4 182 punpcklbw 
m4, m7 183 pinsrw m4, [base+pw_1], 3 184 pshufd m5, m4, q1111 185 pshufd m4, m4, q0000 186 movd m3, [base+round_vals+shiftq*2-12] ; rnd 187 pshuflw m3, m3, q0000 188 sub bufq, 82*73-(82*3+79) 189 mov hd, 70 190 mov mind, -128 191 mov maxd, 127 192.y_loop_ar1: 193 mov xq, -76 194 movsx val3d, byte [bufq+xq-1] 195.x_loop_ar1: 196 movq m0, [bufq+xq-82-1] ; top/left 197 pcmpgtb m7, m6, m0 198 punpcklbw m0, m7 199 psrldq m2, m0, 2 ; top 200 psrldq m1, m0, 4 ; top/right 201 punpcklwd m0, m2 202 punpcklwd m1, m3 203 pmaddwd m0, m4 204 pmaddwd m1, m5 205 paddd m0, m1 206.x_loop_ar1_inner: 207 movd val0d, m0 208 psrldq m0, 4 209 imul val3d, cf3d 210 add val3d, val0d 211 sar val3d, shiftb 212 movsx val0d, byte [bufq+xq] 213 add val3d, val0d 214 cmp val3d, maxd 215 cmovns val3d, maxd 216 cmp val3d, mind 217 cmovs val3d, mind 218 mov byte [bufq+xq], val3b 219 ; keep val3d in-place as left for next x iteration 220 inc xq 221 jz .x_loop_ar1_end 222 test xq, 3 223 jnz .x_loop_ar1_inner 224 jmp .x_loop_ar1 225 226.x_loop_ar1_end: 227 add bufq, 82 228 dec hd 229 jg .y_loop_ar1 230.ar0: 231 RET 232 233.ar2: 234%if ARCH_X86_32 235 ALLOC_STACK -16*8 236%endif 237 DEFINE_ARGS buf, fg_data, shift 238 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 239 movd m6, [base+round_vals-12+shiftq*2] 240 movd m7, [base+byte_blend+1] 241 SCRATCH 7, 15, 7 242 movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 243 movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 244 pxor m7, m7 245 pshuflw m6, m6, q0000 246 punpcklwd m6, m7 247 pcmpgtb m4, m7, m0 248 pcmpgtb m5, m7, m1 249 punpcklbw m0, m4 250 punpcklbw m1, m5 251 DEFINE_ARGS buf, fg_data, h, x 252 pshufd m4, m1, q0000 253 pshufd m5, m1, q1111 254 pshufd m3, m0, q3333 255 pshufd m2, m0, q2222 256 pshufd m1, m0, q1111 257 pshufd m0, m0, q0000 258 SCRATCH 0, 8, 0 259 SCRATCH 1, 9, 1 260 SCRATCH 2, 10, 2 261 SCRATCH 3, 11, 3 262 SCRATCH 4, 12, 4 263 SCRATCH 5, 13, 5 264 SCRATCH 6, 14, 6 265 sub bufq, 82*73-(82*3+79) 266 mov hd, 70 267.y_loop_ar2: 268 mov 
xq, -76 269 270.x_loop_ar2: 271 movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 272 movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 273 pcmpgtb m2, m7, m0 274 punpckhbw m1, m0, m2 275 punpcklbw m0, m2 276 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] 277 psrldq m3, m1, 2 ; y=-1,x=[-1,+5] 278 psrldq m4, m1, 4 ; y=-1,x=[+0,+5] 279 punpcklwd m2, m0, m5 280 punpcklwd m3, m4 281 pmaddwd m2, m8 282 pmaddwd m3, m11 283 paddd m2, m3 284 285 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] 286 psrldq m5, m0, 6 ; y=-2,x=[+1,+5] 287 psrldq m6, m0, 8 ; y=-2,x=[+2,+5] 288 punpcklwd m4, m5 289 punpcklwd m6, m1 290 psrldq m5, m1, 6 ; y=-1,x=[+1,+5] 291 psrldq m1, m1, 8 ; y=-1,x=[+2,+5] 292 punpcklwd m5, m1 293 pmaddwd m4, m9 294 pmaddwd m6, m10 295 pmaddwd m5, m12 296 paddd m4, m6 297 paddd m2, m5 298 paddd m2, m4 299 paddd m2, m14 300 301 movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] 302.x_loop_ar2_inner: 303 pcmpgtb m4, m7, m0 304 punpcklbw m1, m0, m4 305 pmaddwd m3, m1, m13 306 paddd m3, m2 307 psrldq m1, 4 ; y=0,x=0 308 psrldq m2, 4 ; shift top to next pixel 309 psrad m3, [fg_dataq+FGData.ar_coeff_shift] 310 ; don't packssdw since we only care about one value 311 paddw m3, m1 312 packsswb m3, m3 313 pslldq m3, 2 314 pand m3, m15 315 pandn m1, m15, m0 316 por m0, m1, m3 317 psrldq m0, 1 318 ; overwrite 2 pixels, but that's ok 319 movd [bufq+xq-1], m0 320 inc xq 321 jz .x_loop_ar2_end 322 test xq, 3 323 jnz .x_loop_ar2_inner 324 jmp .x_loop_ar2 325 326.x_loop_ar2_end: 327 add bufq, 82 328 dec hd 329 jg .y_loop_ar2 330 RET 331 332.ar3: 333 DEFINE_ARGS buf, fg_data, shift 334%if ARCH_X86_32 335 ALLOC_STACK -16*14 336%elif WIN64 337 SUB rsp, 16*6 338%assign stack_size_padded (stack_size_padded+16*6) 339%assign stack_size (stack_size+16*6) 340%else 341 ALLOC_STACK -16*6 342%endif 343 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 344 movd m6, [base+round_vals-12+shiftq*2] 345 movd m7, [base+byte_blend] 346 movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 347 movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 348 pxor m3, m3 
349 pcmpgtb m4, m3, m0 350 pcmpgtb m3, m2 351 pshuflw m6, m6, q0000 352 SCRATCH 6, 14, 12 353 SCRATCH 7, 15, 13 354 punpckhbw m1, m0, m4 355 punpcklbw m0, m4 356 punpcklbw m2, m3 357 pshufd m3, m0, q1111 358 pshufd m4, m0, q2222 359 pshufd m5, m0, q3333 360 pshufd m0, m0, q0000 361 mova [rsp+ 0*16], m0 362 mova [rsp+ 1*16], m3 363 mova [rsp+ 2*16], m4 364 mova [rsp+ 3*16], m5 365 pshufd m6, m1, q1111 366 pshufd m7, m1, q2222 367 pshufd m5, m1, q3333 368 pshufd m1, m1, q0000 369 pshufd m3, m2, q1111 370 psrldq m0, m2, 10 371 pinsrw m2, [base+pw_1], 5 372 pshufd m4, m2, q2222 373 pshufd m2, m2, q0000 374 pinsrw m0, [base+round_vals+shiftq*2-10], 3 375 mova [rsp+ 4*16], m1 376 mova [rsp+ 5*16], m6 377 SCRATCH 7, 8, 6 378 SCRATCH 5, 9, 7 379 SCRATCH 2, 10, 8 380 SCRATCH 3, 11, 9 381 SCRATCH 4, 12, 10 382 SCRATCH 0, 13, 11 383 DEFINE_ARGS buf, fg_data, h, x 384 sub bufq, 82*73-(82*3+79) 385 mov hd, 70 386.y_loop_ar3: 387 mov xq, -76 388 389.x_loop_ar3: 390 movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 391 pxor m3, m3 392 pcmpgtb m3, m0 393 punpckhbw m2, m0, m3 394 punpcklbw m0, m3 395 396 psrldq m5, m0, 2 397 psrldq m6, m0, 4 398 psrldq m7, m0, 6 399 punpcklwd m4, m0, m5 400 punpcklwd m6, m7 401 pmaddwd m4, [rsp+ 0*16] 402 pmaddwd m6, [rsp+ 1*16] 403 paddd m4, m6 404 405 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 406 pxor m5, m5 407 pcmpgtb m5, m1 408 punpckhbw m3, m1, m5 409 punpcklbw m1, m5 410 palignr m6, m2, m0, 10 411 palignr m7, m2, m0, 12 412 psrldq m0, 8 413 punpcklwd m0, m6 414 punpcklwd m7, m1 415 pmaddwd m0, [rsp+ 2*16] 416 pmaddwd m7, [rsp+ 3*16] 417 paddd m0, m7 418 paddd m0, m4 419 420 psrldq m4, m1, 2 421 psrldq m5, m1, 4 422 psrldq m6, m1, 6 423 psrldq m7, m1, 8 424 punpcklwd m4, m5 425 punpcklwd m6, m7 426 pmaddwd m4, [rsp+ 4*16] 427 pmaddwd m6, [rsp+ 5*16] 428 paddd m4, m6 429 paddd m0, m4 430 431 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 432 pxor m7, m7 433 pcmpgtb m7, m2 434 punpckhbw m5, m2, m7 435 punpcklbw m2, m7 436 palignr m7, m3, m1, 10 437 
palignr m3, m1, 12 438 psrldq m1, m2, 2 439 punpcklwd m7, m3 440 punpcklwd m3, m2, m1 441 pmaddwd m7, m8 442 pmaddwd m3, m9 443 paddd m7, m3 444 paddd m0, m7 445 446 psrldq m6, m2, 4 447 psrldq m1, m2, 6 448 psrldq m3, m2, 8 449 palignr m4, m5, m2, 10 450 palignr m5, m5, m2, 12 451 452 punpcklwd m6, m1 453 punpcklwd m3, m4 454 punpcklwd m5, m14 455 pmaddwd m6, m10 456 pmaddwd m3, m11 457 pmaddwd m5, m12 458 paddd m0, m6 459 paddd m3, m5 460 paddd m0, m3 461 462 movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] 463.x_loop_ar3_inner: 464 pxor m5, m5 465 pcmpgtb m5, m1 466 punpcklbw m2, m1, m5 467 pmaddwd m2, m13 468 pshufd m3, m2, q1111 469 paddd m2, m3 ; left+cur 470 paddd m2, m0 ; add top 471 psrldq m0, 4 472 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 473 ; don't packssdw since we only care about one value 474 packsswb m2, m2 475 pslldq m2, 3 476 pand m2, m15 477 pandn m3, m15, m1 478 por m1, m2, m3 479 movd [bufq+xq-3], m1 480 psrldq m1, 1 481 inc xq 482 jz .x_loop_ar3_end 483 test xq, 3 484 jnz .x_loop_ar3_inner 485 jmp .x_loop_ar3 486 487.x_loop_ar3_end: 488 add bufq, 82 489 dec hd 490 jg .y_loop_ar3 491 RET 492 493%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y 494INIT_XMM ssse3 495cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv 496 movifnidn r2, r2mp 497 movifnidn r3, r3mp 498 LEA r4, $$ 499%define base r4-$$ 500 movq m1, [base+rnd_next_upperbit_mask] 501 movq m4, [base+mul_bits] 502 movq m7, [base+hmul_bits] 503 mov r5d, [fg_dataq+FGData.grain_scale_shift] 504 movd m6, [base+round+r5*2] 505 mova m5, [base+pb_mask] 506 movd m0, [fg_dataq+FGData.seed] 507 movd m2, [base+pw_seed_xor+uvq*4] 508 pxor m0, m2 509 pshuflw m6, m6, q0000 510 pshuflw m0, m0, q0000 511 lea r6, [base+gaussian_sequence] 512%if %2 513%if ARCH_X86_64 514 mov r7d, 73-35*%3 515%else 516 mov r3mp, 73-35*%3 517%endif 518 add bufq, 44 519.loop_y: 520 mov r5, -44 521.loop_x: 522%else 523 mov r5, -82*73 524 sub bufq, r5 525.loop: 526%endif 527 pand m2, m0, m1 528 psrlw 
m3, m2, 10 529 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 530 pmullw m2, m4 ; bits 0x0f00 are set 531 pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 532 psllq m2, m3, 30 533 por m3, m2 534 psllq m2, m3, 15 535 por m3, m2 ; aggregate each bit into next seed's high bit 536 pmulhuw m2, m0, m7 537 por m2, m3 ; 4 next output seeds 538 pshuflw m0, m2, q3333 539 psrlw m2, 5 540%if ARCH_X86_64 541 movd r9d, m2 542 pshuflw m2, m2, q3232 543 movzx r8, r9w 544 shr r9, 16 545 546 movd m3, [r6+r8*2] 547 pinsrw m3, [r6+r9*2], 1 548 549 movd r9d, m2 550 movzx r8, r9w 551 shr r9, 16 552 553 pinsrw m3, [r6+r8*2], 2 554 pinsrw m3, [r6+r9*2], 3 555%else 556 movd r2, m2 557 pshuflw m2, m2, q3232 558 movzx r1, r2w 559 shr r2, 16 560 561 movd m3, [r6+r1*2] 562 pinsrw m3, [r6+r2*2], 1 563 564 movd r2, m2 565 movzx r1, r2w 566 shr r2, 16 567 568 pinsrw m3, [r6+r1*2], 2 569 pinsrw m3, [r6+r2*2], 3 570%endif 571 pmulhrsw m3, m6 572 packsswb m3, m3 573 movd [bufq+r5], m3 574 add r5, 4 575%if %2 576 jl .loop_x 577 add bufq, 82 578%if ARCH_X86_64 579 dec r7d 580%else 581 dec r3mp 582%endif 583 jg .loop_y 584%else 585 jl .loop 586%endif 587 588%if ARCH_X86_32 589 mov r2, r2mp 590%endif 591 592 ; auto-regression code 593 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 594 movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] 595 lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] 596 jmp r5 597 598.ar0: 599 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 600 movifnidn bufyq, bufymp 601%if ARCH_X86_32 602 ALLOC_STACK -2*16 603%endif 604 imul uvd, 28 605 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 606 movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] 607 movd m4, [base+hmul_bits+shiftq*2] 608 DEFINE_ARGS buf, bufy, h, x 609 pxor m0, m0 610 pcmpgtb m0, m5 611 punpcklbw m5, m0 612 movd m7, [base+pb_1] 613%if %2 614 movd m6, [base+hmul_bits+2+%3*2] 615%endif 616 pshuflw m5, m5, q0000 617 pshuflw m4, m4, q0000 618 pshufd m7, m7, q0000 619%if %2 620 pshuflw m6, m6, q0000 621%endif 622 
punpcklqdq m5, m5 623 punpcklqdq m4, m4 624%if %2 625 punpcklqdq m6, m6 626%endif 627 pcmpeqw m1, m1 628 pslldq m1, 12>>%2 629 SCRATCH 1, 8, 0 630 SCRATCH 4, 9, 1 631%if %2 632 sub bufq, 82*(73-35*%3)+82-(82*3+41) 633%else 634 sub bufq, 82*70-3 635%endif 636 add bufyq, 3+82*3 637 mov hd, 70-35*%3 638.y_loop_ar0: 639 xor xd, xd 640.x_loop_ar0: 641 ; first 32 pixels 642%if %2 643 movu m1, [bufyq+xq*2] 644%if %3 645 movu m2, [bufyq+xq*2+82] 646%endif 647 movu m3, [bufyq+xq*2+16] 648%if %3 649 movu m4, [bufyq+xq*2+82+16] 650%endif 651 pmaddubsw m0, m7, m1 652%if %3 653 pmaddubsw m1, m7, m2 654%endif 655 pmaddubsw m2, m7, m3 656%if %3 657 pmaddubsw m3, m7, m4 658 paddw m0, m1 659 paddw m2, m3 660%endif 661 pmulhrsw m0, m6 662 pmulhrsw m2, m6 663%else 664 movu m0, [bufyq+xq] 665 pxor m6, m6 666 pcmpgtb m6, m0 667 punpckhbw m2, m0, m6 668 punpcklbw m0, m6 669%endif 670 pmullw m0, m5 671 pmullw m2, m5 672 pmulhrsw m0, m9 673 pmulhrsw m2, m9 674 movu m1, [bufq+xq] 675 pxor m4, m4 676 pcmpgtb m4, m1 677 punpckhbw m3, m1, m4 678%if %2 679 punpcklbw m1, m4 680 paddw m2, m3 681 paddw m0, m1 682%else 683 punpcklbw m6, m1, m4 684 paddw m2, m3 685 paddw m0, m6 686%endif 687 packsswb m0, m2 688%if %2 689 movu [bufq+xq], m0 690 add xd, 16 691 cmp xd, 32 692 jl .x_loop_ar0 693 694 ; last 6/12 pixels 695 movu m1, [bufyq+xq*(1+%2)] 696%if %3 697 movu m2, [bufyq+xq*2+82] 698%endif 699 pmaddubsw m0, m7, m1 700%if %3 701 pmaddubsw m1, m7, m2 702 paddw m0, m1 703%endif 704 pmulhrsw m0, m6 705 pmullw m0, m5 706 pmulhrsw m0, m9 707 movq m1, [bufq+xq] 708 pxor m4, m4 709 pcmpgtb m4, m1 710 punpcklbw m2, m1, m4 711 paddw m0, m2 712 packsswb m0, m0 713 pandn m2, m8, m0 714 pand m1, m8 715 por m2, m1 716 movq [bufq+xq], m2 717%else 718 add xd, 16 719 cmp xd, 80 720 je .y_loop_final_ar0 721 movu [bufq+xq-16], m0 722 jmp .x_loop_ar0 723.y_loop_final_ar0: 724 pandn m2, m8, m0 725 pand m1, m8 726 por m2, m1 727 movu [bufq+xq-16], m2 728%endif 729 730 add bufq, 82 731 add bufyq, 82<<%3 732 dec hd 733 
jg .y_loop_ar0 734 RET 735 736.ar1: 737%if ARCH_X86_32 738 RESET_STACK_STATE 739%endif 740 DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x 741 imul uvd, 28 742 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 743 movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] 744 pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 745%if ARCH_X86_32 746 mov r3mp, cf3d 747 DEFINE_ARGS buf, shift, fg_data, val3, min, max, x 748%elif WIN64 749 DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x 750 mov bufq, r0 751%else 752 DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x 753%endif 754 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 755 movd m3, [base+round_vals+shiftq*2-12] ; rnd 756%if %2 757 movd m7, [base+pb_1] 758 movd m6, [base+hmul_bits+2+%3*2] 759%endif 760 psrldq m4, 1 761%if ARCH_X86_32 762 DEFINE_ARGS buf, shift, val0, val3, min, max, x 763%elif WIN64 764 DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 765%else 766 DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 767%endif 768 pxor m5, m5 769 punpcklwd m3, m5 770%if %2 771 punpcklwd m6, m6 772%endif 773 pcmpgtb m5, m4 774 punpcklbw m4, m5 775 pshufd m5, m4, q1111 776 pshufd m4, m4, q0000 777 pshufd m3, m3, q0000 778%if %2 779 pshufd m7, m7, q0000 780 pshufd m6, m6, q0000 781 sub bufq, 82*(73-35*%3)+44-(82*3+41) 782%else 783 sub bufq, 82*69+3 784%endif 785%if ARCH_X86_32 786 add r1mp, 79+82*3 787 mov r0mp, 70-35*%3 788%else 789 add bufyq, 79+82*3 790 mov hd, 70-35*%3 791%endif 792 mov mind, -128 793 mov maxd, 127 794.y_loop_ar1: 795 mov xq, -(76>>%2) 796 movsx val3d, byte [bufq+xq-1] 797.x_loop_ar1: 798%if %2 799%if ARCH_X86_32 800 mov r2, r1mp 801 movq m0, [r2+xq*2] 802%if %3 803 movq m1, [r2+xq*2+82] 804%endif 805%else 806 movq m0, [bufyq+xq*2] 807%if %3 808 movq m1, [bufyq+xq*2+82] 809%endif 810%endif 811 pmaddubsw m2, m7, m0 812%if %3 813 pmaddubsw m0, m7, m1 814 paddw m2, m0 815%endif 816 pmulhrsw m2, m6 817%else 818%if ARCH_X86_32 819 mov r2, r1mp 820 movd 
m2, [r2+xq] 821%else 822 movd m2, [bufyq+xq] 823%endif 824 pxor m0, m0 825 pcmpgtb m0, m2 826 punpcklbw m2, m0 827%endif 828 829 movq m0, [bufq+xq-82-1] ; top/left 830 pxor m1, m1 831 pcmpgtb m1, m0 832 punpcklbw m0, m1 833 psrldq m1, m0, 4 ; top/right 834 punpcklwd m1, m2 835 psrldq m2, m0, 2 ; top 836 punpcklwd m0, m2 837 pmaddwd m0, m4 838 pmaddwd m1, m5 839 paddd m0, m1 840 paddd m0, m3 841.x_loop_ar1_inner: 842 movd val0d, m0 843 psrldq m0, 4 844%if ARCH_X86_32 845 imul val3d, r3mp 846%else 847 imul val3d, cf3d 848%endif 849 add val3d, val0d 850 sar val3d, shiftb 851 movsx val0d, byte [bufq+xq] 852 add val3d, val0d 853 cmp val3d, maxd 854 cmovns val3d, maxd 855 cmp val3d, mind 856 cmovs val3d, mind 857 mov byte [bufq+xq], val3b 858 ; keep val3d in-place as left for next x iteration 859 inc xq 860 jz .x_loop_ar1_end 861 test xq, 3 862 jnz .x_loop_ar1_inner 863 jmp .x_loop_ar1 864 865.x_loop_ar1_end: 866 add bufq, 82 867%if ARCH_X86_32 868 add r1mp, 82<<%3 869 dec r0mp 870%else 871 add bufyq, 82<<%3 872 dec hd 873%endif 874 jg .y_loop_ar1 875 RET 876 877.ar2: 878%if ARCH_X86_32 879 ALLOC_STACK -8*16 880%endif 881 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 882 movifnidn bufyq, bufymp 883 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 884 imul uvd, 28 885 movd m7, [base+round_vals-12+shiftq*2] 886 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 887 pxor m2, m2 888 pcmpgtb m2, m0 889 punpckhbw m1, m0, m2 890 punpcklbw m0, m2 891 pinsrw m1, [base+pw_1], 5 892 punpcklwd m7, m7 893 pshufd m7, m7, q0000 894 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 895 pshufd m4, m1, q0000 896 pshufd m5, m1, q1111 897 pshufd m6, m1, q2222 898 pshufd m3, m0, q3333 899 pshufd m2, m0, q2222 900 pshufd m1, m0, q1111 901 pshufd m0, m0, q0000 902 SCRATCH 0, 8, 0 903 SCRATCH 1, 9, 1 904 SCRATCH 2, 10, 2 905 SCRATCH 3, 11, 3 906 SCRATCH 4, 12, 4 907 SCRATCH 5, 13, 5 908 SCRATCH 6, 14, 6 909 SCRATCH 7, 15, 7 910%if %2 911 movd m7, [base+hmul_bits+2+%3*2] 912 movd m6, [base+pb_1] 
913 punpcklwd m7, m7 914 pshufd m6, m6, q0000 915 pshufd m7, m7, q0000 916 sub bufq, 82*(73-35*%3)+44-(82*3+41) 917%else 918 sub bufq, 82*69+3 919%endif 920 add bufyq, 79+82*3 921 mov hd, 70-35*%3 922.y_loop_ar2: 923 mov xq, -(76>>%2) 924 925.x_loop_ar2: 926 pxor m2, m2 927 movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 928 movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 929 pcmpgtb m2, m0 930 punpckhbw m1, m0, m2 931 punpcklbw m0, m2 932 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] 933 psrldq m3, m1, 2 ; y=-1,x=[-1,+5] 934 psrldq m4, m1, 4 ; y=-1,x=[+0,+5] 935 punpcklwd m2, m0, m5 936 punpcklwd m3, m4 937 pmaddwd m2, m8 938 pmaddwd m3, m11 939 paddd m2, m3 940 941 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] 942 psrldq m5, m0, 6 ; y=-2,x=[+1,+5] 943 psrldq m0, 8 ; y=-2,x=[+2,+5] 944 punpcklwd m4, m5 945 punpcklwd m0, m1 946 psrldq m3, m1, 6 ; y=-1,x=[+1,+5] 947 psrldq m1, m1, 8 ; y=-1,x=[+2,+5] 948 punpcklwd m3, m1 949 pmaddwd m4, m9 950 pmaddwd m0, m10 951 pmaddwd m3, m12 952 paddd m4, m0 953 paddd m2, m3 954 paddd m2, m4 955 956%if %2 957 movq m1, [bufyq+xq*2] 958%if %3 959 movq m3, [bufyq+xq*2+82] 960%endif 961 pmaddubsw m0, m6, m1 962%if %3 963 pmaddubsw m1, m6, m3 964 paddw m0, m1 965%endif 966 pmulhrsw m0, m7 967%else 968 movd m0, [bufyq+xq] 969 pxor m1, m1 970 pcmpgtb m1, m0 971 punpcklbw m0, m1 972%endif 973 punpcklwd m0, m15 974 pmaddwd m0, m14 975 paddd m2, m0 976 977 movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] 978 pxor m4, m4 979 movd m5, [base+byte_blend+1] 980 punpcklbw m5, m5 981.x_loop_ar2_inner: 982 pcmpgtb m1, m4, m0 983 punpcklbw m0, m1 984 pmaddwd m3, m0, m13 985 paddd m3, m2 986 psrldq m2, 4 ; shift top to next pixel 987 psrad m3, [fg_dataq+FGData.ar_coeff_shift] 988 pslldq m3, 4 989 pand m3, m5 990 paddw m0, m3 991 packsswb m0, m0 992 movd [bufq+xq-2], m0 993 psrldq m0, 1 994 inc xq 995 jz .x_loop_ar2_end 996 test xq, 3 997 jnz .x_loop_ar2_inner 998 jmp .x_loop_ar2 999 1000.x_loop_ar2_end: 1001 add bufq, 82 1002 add bufyq, 82<<%3 1003 dec hd 1004 jg .y_loop_ar2 1005 RET 1006 
1007.ar3: 1008%if ARCH_X86_32 1009 RESET_STACK_STATE 1010%endif 1011 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 1012 movifnidn bufyq, bufymp 1013%if ARCH_X86_32 1014 ALLOC_STACK -15*16 1015%else 1016 SUB rsp, 16*7 1017%assign stack_size_padded (stack_size_padded+16*7) 1018%assign stack_size (stack_size+16*7) 1019%endif 1020 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 1021 imul uvd, 28 1022 1023 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 1024 pxor m3, m3 1025 pcmpgtb m3, m0 1026 punpckhbw m1, m0, m3 1027 punpcklbw m0, m3 1028 pshufd m2, m0, q1111 1029 pshufd m3, m0, q2222 1030 pshufd m4, m0, q3333 1031 pshufd m0, m0, q0000 1032 pshufd m5, m1, q1111 1033 pshufd m6, m1, q2222 1034 pshufd m7, m1, q3333 1035 pshufd m1, m1, q0000 1036 mova [rsp+ 0*16], m0 1037 mova [rsp+ 1*16], m2 1038 mova [rsp+ 2*16], m3 1039 mova [rsp+ 3*16], m4 1040 mova [rsp+ 4*16], m1 1041 mova [rsp+ 5*16], m5 1042 mova [rsp+ 6*16], m6 1043 SCRATCH 7, 8, 7 1044 1045 movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] 1046 pxor m4, m4 1047 pcmpgtb m4, m2 1048 punpckhbw m5, m2, m4 1049 punpcklbw m2, m4 1050 pshufd m4, m2, q3232 1051 punpcklwd m3, m4, m5 1052 pshuflw m5, m4, q3321 1053 pshufd m4, m3, q0000 1054 pshufd m3, m2, q1111 1055 pshufd m2, m2, q0000 1056 pinsrw m5, [base+round_vals+shiftq*2-10], 3 1057 SCRATCH 2, 9, 8 1058 SCRATCH 3, 10, 9 1059 SCRATCH 4, 11, 10 1060 SCRATCH 5, 12, 11 1061 1062 movd m2, [base+round_vals-12+shiftq*2] 1063%if %2 1064 movd m1, [base+pb_1] 1065 movd m3, [base+hmul_bits+2+%3*2] 1066%endif 1067 pxor m0, m0 1068 punpcklwd m2, m0 1069%if %2 1070 punpcklwd m3, m3 1071%endif 1072 pshufd m2, m2, q0000 1073%if %2 1074 pshufd m1, m1, q0000 1075 pshufd m3, m3, q0000 1076 SCRATCH 1, 13, 12 1077%endif 1078 SCRATCH 2, 14, 13 1079%if %2 1080 SCRATCH 3, 15, 14 1081%endif 1082 1083 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 1084%if %2 1085 sub bufq, 82*(73-35*%3)+44-(82*3+41) 1086%else 1087 sub bufq, 82*69+3 1088%endif 1089 add bufyq, 79+82*3 
1090 mov hd, 70-35*%3 1091.y_loop_ar3: 1092 mov xq, -(76>>%2) 1093 1094.x_loop_ar3: 1095 movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 1096 pxor m4, m4 1097 pcmpgtb m4, m0 1098 punpckhbw m3, m0, m4 1099 punpcklbw m0, m4 1100 1101 psrldq m5, m0, 2 1102 psrldq m6, m0, 4 1103 psrldq m7, m0, 6 1104 punpcklwd m4, m0, m5 1105 punpcklwd m6, m7 1106 pmaddwd m4, [rsp+ 0*16] 1107 pmaddwd m6, [rsp+ 1*16] 1108 paddd m4, m6 1109 1110 palignr m2, m3, m0, 10 1111 palignr m3, m0, 12 1112 psrldq m0, 8 1113 1114 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 1115 pxor m6, m6 1116 pcmpgtb m6, m1 1117 punpckhbw m5, m1, m6 1118 punpcklbw m1, m6 1119 1120 punpcklwd m0, m2 1121 punpcklwd m3, m1 1122 pmaddwd m0, [rsp+ 2*16] 1123 pmaddwd m3, [rsp+ 3*16] 1124 paddd m0, m3 1125 paddd m0, m4 1126 1127 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 1128 pxor m7, m7 1129 pcmpgtb m7, m2 1130 punpckhbw m6, m2, m7 1131 punpcklbw m2, m7 1132 1133 palignr m3, m5, m1, 10 1134 palignr m5, m1, 12 1135 psrldq m4, m2, 2 1136 1137 punpcklwd m3, m5 1138 punpcklwd m5, m2, m4 1139 pmaddwd m3, [rsp+ 6*16] 1140 pmaddwd m5, m8 1141 paddd m3, m5 1142 paddd m0, m3 1143 1144 psrldq m3, m1, 2 1145 psrldq m4, m1, 4 1146 psrldq m5, m1, 6 1147 psrldq m1, 8 1148 1149 punpcklwd m3, m4 1150 punpcklwd m5, m1 1151 pmaddwd m3, [rsp+ 4*16] 1152 pmaddwd m5, [rsp+ 5*16] 1153 paddd m3, m5 1154 paddd m0, m3 1155 1156%if %2 1157 movq m1, [bufyq+xq*2] 1158%if %3 1159 movq m3, [bufyq+xq*2+82] 1160%endif 1161 pmaddubsw m7, m13, m1 1162%if %3 1163 pmaddubsw m5, m13, m3 1164 paddw m7, m5 1165%endif 1166 pmulhrsw m7, m15 1167%else 1168 movd m7, [bufyq+xq] 1169 pxor m1, m1 1170 pcmpgtb m1, m7 1171 punpcklbw m7, m1 1172%endif 1173 1174 psrldq m1, m2, 4 1175 psrldq m3, m2, 6 1176 palignr m4, m6, m2, 10 1177 palignr m6, m2, 12 1178 psrldq m2, 8 1179 1180 punpcklwd m1, m3 1181 punpcklwd m2, m4 1182 punpcklwd m6, m7 1183 pmaddwd m1, m9 1184 pmaddwd m2, m10 1185 pmaddwd m6, m11 1186 paddd m1, m2 1187 paddd m0, m6 1188 paddd m0, m1 1189 paddd m0, m14 
1190 1191 movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] 1192 pxor m4, m4 1193 movd m5, [base+byte_blend] 1194.x_loop_ar3_inner: 1195 pcmpgtb m2, m4, m1 1196 punpcklbw m3, m1, m2 1197 pmaddwd m2, m3, m12 1198 pshufd m3, m2, q1111 1199 paddd m2, m3 ; left+cur 1200 paddd m2, m0 ; add top 1201 psrldq m0, 4 1202 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 1203 ; don't packssdw, we only care about one value 1204 packsswb m2, m2 1205 pandn m3, m5, m1 1206 pslld m2, 24 1207 pand m2, m5 1208 por m1, m2, m3 1209 movd [bufq+xq-3], m1 1210 psrldq m1, 1 1211 inc xq 1212 jz .x_loop_ar3_end 1213 test xq, 3 1214 jnz .x_loop_ar3_inner 1215 jmp .x_loop_ar3 1216 1217.x_loop_ar3_end: 1218 add bufq, 82 1219 add bufyq, 82<<%3 1220 dec hd 1221 jg .y_loop_ar3 1222 RET 1223%endmacro 1224 1225generate_grain_uv_fn 420, 1, 1 1226generate_grain_uv_fn 422, 1, 0 1227generate_grain_uv_fn 444, 0, 0 1228 1229%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg 1230%assign %%idx 0 1231%define %%tmp %2 1232%if %0 == 6 1233%define %%tmp %6 1234%endif 1235%rep 4 1236%if %%idx == 0 1237 movd %5 %+ d, %2 1238 pshuflw %%tmp, %2, q3232 1239%else 1240 movd %5 %+ d, %%tmp 1241%if %%idx == 2 1242 punpckhqdq %%tmp, %%tmp 1243%elif %%idx == 4 1244 psrlq %%tmp, 32 1245%endif 1246%endif 1247 movzx %4 %+ d, %5 %+ w 1248 shr %5 %+ d, 16 1249 1250%if %%idx == 0 1251 movd %1, [%3+%4] 1252%else 1253 pinsrw %1, [%3+%4], %%idx + 0 1254%endif 1255 pinsrw %1, [%3+%5], %%idx + 1 1256%assign %%idx %%idx+2 1257%endrep 1258%endmacro 1259 1260INIT_XMM ssse3 1261; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) 1262%if ARCH_X86_32 1263%if STACK_ALIGNMENT < mmsize 1264cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ 1265 dst, src, scaling, unused1, fg_data, picptr, unused2 1266 ; copy stack arguments to new position post-alignment, so that we 1267 ; don't have to keep the old stack location in a separate register 1268 mov r0, r0m 1269 mov r1, r2m 1270 mov r2, r4m 1271 mov r3, r6m 
1272 mov r4, r7m 1273 mov r5, r8m 1274 1275 mov [rsp+5*mmsize+ 4*gprsize], r0 1276 mov [rsp+5*mmsize+ 6*gprsize], r1 1277 mov [rsp+5*mmsize+ 8*gprsize], r2 1278 mov [rsp+5*mmsize+10*gprsize], r3 1279 mov [rsp+5*mmsize+11*gprsize], r4 1280 mov [rsp+5*mmsize+12*gprsize], r5 1281%else 1282cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ 1283 dst, src, scaling, unused1, fg_data, picptr, unused2 1284%endif 1285 mov srcq, srcm 1286 mov fg_dataq, r3m 1287 mov scalingq, r5m 1288%if STACK_ALIGNMENT < mmsize 1289%define r0m [rsp+5*mmsize+ 4*gprsize] 1290%define r1m [rsp+5*mmsize+ 5*gprsize] 1291%define r2m [rsp+5*mmsize+ 6*gprsize] 1292%define r3m [rsp+5*mmsize+ 7*gprsize] 1293%define r4m [rsp+5*mmsize+ 8*gprsize] 1294%define r5m [rsp+5*mmsize+ 9*gprsize] 1295%define r6m [rsp+5*mmsize+10*gprsize] 1296%define r7m [rsp+5*mmsize+11*gprsize] 1297%define r8m [rsp+5*mmsize+12*gprsize] 1298%endif 1299 LEA r5, pb_mask 1300%define base r5-pb_mask 1301 mov r5m, picptrq 1302%else 1303cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut 1304 lea r7, [pb_mask] 1305%define base r7-pb_mask 1306%endif 1307 mov r6d, [fg_dataq+FGData.scaling_shift] 1308 movd m3, [base+mul_bits+r6*2-14] 1309 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 1310 movd m4, [base+max+r6*4] 1311 movd m5, [base+min+r6*2] 1312 punpcklwd m3, m3 1313 punpcklwd m4, m4 1314 punpcklwd m5, m5 1315 pshufd m3, m3, q0000 1316 pshufd m4, m4, q0000 1317 pshufd m5, m5, q0000 1318 SCRATCH 3, 11, 0 1319 SCRATCH 4, 12, 1 1320 SCRATCH 5, 13, 2 1321 1322%if ARCH_X86_32 1323 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1324%else 1325 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 1326%endif 1327 1328 mov sbyd, r8m 1329 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 1330 test overlapd, overlapd 1331 jz .no_vertical_overlap 1332 mova m6, [base+pw_1024] 1333 mova m7, [base+pb_27_17_17_27] 1334 SCRATCH 6, 
14, 3 1335 SCRATCH 7, 15, 4 1336 test sbyd, sbyd 1337 jnz .vertical_overlap 1338 ; fall-through 1339 1340.no_vertical_overlap: 1341 mov r8m, overlapd 1342%if ARCH_X86_32 1343 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused 1344 imul seed, (173 << 24) | 37 1345%else 1346 imul seed, sbyd, (173 << 24) | 37 1347%endif 1348 add seed, (105 << 24) | 178 1349 rol seed, 8 1350 movzx seed, seew 1351 xor seed, [fg_dataq+FGData.seed] 1352 1353%if ARCH_X86_32 1354 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1355 1356 mov r3m, seed 1357 mov wq, r4m 1358%else 1359 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1360 unused1, unused2, see, unused3 1361%endif 1362 1363 lea src_bakq, [srcq+wq] 1364 neg wq 1365 sub dstmp, srcq 1366%if ARCH_X86_32 1367 mov r1m, src_bakq 1368 mov r4m, wq 1369 DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1370%endif 1371 1372.loop_x: 1373%if ARCH_X86_32 1374 mov seed, r3m 1375%endif 1376 mov r6d, seed 1377 or seed, 0xEFF4 1378 shr r6d, 1 1379 test seeb, seeh 1380 lea seed, [r6+0x8000] 1381 cmovp seed, r6d ; updated seed 1382%if ARCH_X86_32 1383 mov r3m, seed 1384 1385 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1386 1387 mov offxd, offyd 1388%else 1389 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1390 offx, offy, see, unused 1391 1392 mov offyd, seed 1393 mov offxd, seed 1394%endif 1395 ror offyd, 8 1396 shr offxd, 12 1397 and offyd, 0xf 1398 imul offyd, 164 1399 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1400 1401%if ARCH_X86_32 1402 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1403 ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1404 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1405%else 1406 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1407 h, offxy, see, unused 1408%endif 1409 1410.loop_x_odd: 1411 mov hd, r7m 1412 mov grain_lutq, grain_lutmp 1413.loop_y: 1414 ; src 1415 mova m0, [srcq] 1416 pxor m2, m2 1417 
punpckhbw m1, m0, m2 1418 punpcklbw m0, m2 ; m0-1: src as word 1419 1420 ; scaling[src] 1421%if ARCH_X86_32 1422 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1423 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1424%else 1425 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1426 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1427%endif 1428 REPX {psrlw x, 8}, m4, m5 1429 1430 ; grain = grain_lut[offy+y][offx+x] 1431 movu m3, [grain_lutq+offxyq] 1432 pcmpgtb m7, m2, m3 1433 punpcklbw m2, m3, m7 1434 punpckhbw m3, m7 1435 1436 ; noise = round2(scaling[src] * grain, scaling_shift) 1437 pmullw m2, m4 1438 pmullw m3, m5 1439 pmulhrsw m2, m11 1440 pmulhrsw m3, m11 1441 1442 ; dst = clip_pixel(src, noise) 1443 paddw m0, m2 1444 paddw m1, m3 1445 pmaxsw m0, m13 1446 pmaxsw m1, m13 1447 pminsw m0, m12 1448 pminsw m1, m12 1449 packuswb m0, m1 1450 movifnidn dstq, dstmp 1451 mova [dstq+srcq], m0 1452 1453 add srcq, r2mp 1454 add grain_lutq, 82 1455 dec hd 1456 jg .loop_y 1457 1458%if ARCH_X86_32 1459 add r4mp, 16 1460%else 1461 add wq, 16 1462%endif 1463 jge .end 1464%if ARCH_X86_32 1465 mov srcq, r1mp 1466 add srcq, r4mp 1467%else 1468 lea srcq, [src_bakq+wq] 1469%endif 1470 btc dword r8m, 2 1471 jc .next_blk 1472 1473 add offxyd, 16 1474 test dword r8m, 2 ; r8m & 2 = have_top_overlap 1475 jz .loop_x_odd 1476 1477%if ARCH_X86_32 1478 add dword [rsp+5*mmsize+1*gprsize], 16 1479%else 1480 add r11d, 16 ; top_offxyd 1481%endif 1482 jnz .loop_x_odd_v_overlap 1483 1484.next_blk: 1485 test dword r8m, 1 1486 jz .loop_x 1487 1488 test dword r8m, 2 1489 jnz .loop_x_hv_overlap 1490 1491 ; horizontal overlap (without vertical overlap) 1492.loop_x_h_overlap: 1493%if ARCH_X86_32 1494 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1495 ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1496 DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 1497 1498 add offxyd, 16 ; left_offxyd 1499 mov [rsp+5*mmsize+0*gprsize], offxyd 1500 1501 DEFINE_ARGS dst, src, scaling, see, unused1, unused2, 
unused3 1502 1503 mov seed, r3m 1504%else 1505 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1506 offx, offy, see, left_offxy 1507 1508 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 1509%endif 1510 1511 mov r6d, seed 1512 or seed, 0xEFF4 1513 shr r6d, 1 1514 test seeb, seeh 1515 lea seed, [r6+0x8000] 1516 cmovp seed, r6d ; updated seed 1517 1518%if ARCH_X86_32 1519 mov r3m, seed 1520 1521 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1522 1523 mov offxd, offyd 1524%else 1525 mov offyd, seed 1526 mov offxd, seed 1527%endif 1528 ror offyd, 8 1529 shr offxd, 12 1530 and offyd, 0xf 1531 imul offyd, 164 1532 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1533 1534%if ARCH_X86_32 1535 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1536%else 1537 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1538 h, offxy, see, left_offxy 1539%endif 1540 1541 mov hd, r7m 1542 mov grain_lutq, grain_lutmp 1543.loop_y_h_overlap: 1544 ; src 1545 mova m0, [srcq] 1546 pxor m2, m2 1547 punpckhbw m1, m0, m2 1548 punpcklbw m0, m2 ; m0-1: src as word 1549 1550 ; scaling[src] 1551%if ARCH_X86_32 1552 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1553 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1554%else 1555 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1556 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1557%endif 1558 REPX {psrlw x, 8}, m4, m5 1559 1560 ; grain = grain_lut[offy+y][offx+x] 1561 movu m3, [grain_lutq+offxyq] 1562%if ARCH_X86_32 1563 mov r5, [rsp+5*mmsize+0*gprsize] 1564 movd m7, [grain_lutq+r5] 1565%else 1566 movd m7, [grain_lutq+left_offxyq] 1567%endif 1568 punpcklbw m7, m3 1569 pmaddubsw m6, m15, m7 1570 pmulhrsw m6, m14 1571 packsswb m6, m6 1572 shufps m6, m3, q3210 1573 pcmpgtb m2, m6 1574 punpcklbw m7, m6, m2 1575 punpckhbw m6, m2 1576 1577 ; noise = round2(scaling[src] * grain, scaling_shift) 1578 pmullw m7, m4 1579 pmullw m6, m5 1580 pmulhrsw m7, m11 1581 pmulhrsw m6, m11 1582 1583 ; dst = clip_pixel(src, 
noise) 1584 paddw m0, m7 1585 paddw m1, m6 1586 pmaxsw m0, m13 1587 pmaxsw m1, m13 1588 pminsw m0, m12 1589 pminsw m1, m12 1590 packuswb m0, m1 1591 movifnidn dstq, dstmp 1592 mova [dstq+srcq], m0 1593 1594 add srcq, r2mp 1595 add grain_lutq, 82 1596 dec hd 1597 jg .loop_y_h_overlap 1598 1599%if ARCH_X86_32 1600 add r4mp, 16 1601%else 1602 add wq, 16 1603%endif 1604 jge .end 1605%if ARCH_X86_32 1606 mov srcq, r1m 1607 add srcq, r4m 1608%else 1609 lea srcq, [src_bakq+wq] 1610%endif 1611 xor dword r8m, 4 1612 add offxyd, 16 1613 1614 ; since this half-block had left-overlap, the next does not 1615 test dword r8m, 2 ; have_top_overlap 1616 jz .loop_x_odd 1617%if ARCH_X86_32 1618 add dword [rsp+5*mmsize+1*gprsize], 16 1619%else 1620 add r11d, 16 ; top_offxyd 1621%endif 1622 jmp .loop_x_odd_v_overlap 1623 1624.end: 1625 RET 1626 1627.vertical_overlap: 1628%if ARCH_X86_32 1629 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1630%else 1631 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 1632%endif 1633 1634 or overlapd, 2 ; top_overlap: overlap & 2 1635 mov r8m, overlapd 1636 movzx sbyd, sbyb 1637%if ARCH_X86_32 1638 imul r4, [fg_dataq+FGData.seed], 0x00010001 1639 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 1640%else 1641 imul seed, [fg_dataq+FGData.seed], 0x00010001 1642%endif 1643 imul tmpd, sbyd, 173 * 0x00010001 1644 imul sbyd, 37 * 0x01000100 1645 add tmpd, (105 << 16) | 188 1646 add sbyd, (178 << 24) | (141 << 8) 1647 and tmpd, 0x00ff00ff 1648 and sbyd, 0xff00ff00 1649 xor seed, tmpd 1650%if ARCH_X86_32 1651 xor sbyd, seed ; (cur_seed << 16) | top_seed 1652 1653 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1654 1655 mov r3m, seed 1656 mov wq, r4m 1657%else 1658 xor seed, sbyd ; (cur_seed << 16) | top_seed 1659 1660 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1661 tmp, unused2, see, unused3 1662%endif 1663 1664 lea src_bakq, [srcq+wq] 1665 neg wq 1666 sub dstmp, srcq 1667%if 
ARCH_X86_32 1668 mov r1m, src_bakq 1669 mov r4m, wq 1670 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 1671%endif 1672 1673.loop_x_v_overlap: 1674%if ARCH_X86_32 1675 mov seed, r3m 1676%endif 1677 ; we assume from the block above that bits 8-15 of tmpd are zero'ed, 1678 ; because of the 'and tmpd, 0x00ff00ff' above 1679 mov r6d, seed 1680 or seed, 0xeff4eff4 1681 test seeb, seeh 1682 setp tmpb ; parity of top_seed 1683 shr seed, 16 1684 shl tmpd, 16 1685 test seeb, seeh 1686 setp tmpb ; parity of cur_seed 1687 or r6d, 0x00010001 1688 xor tmpd, r6d 1689 mov seed, tmpd 1690 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1691 1692%if ARCH_X86_32 1693 mov r3m, seed 1694 1695 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1696 1697 mov offxd, offyd 1698%else 1699 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1700 offx, offy, see, unused, top_offxy 1701 1702 mov offyd, seed 1703 mov offxd, seed 1704%endif 1705 1706 ror offyd, 8 1707 ror offxd, 12 1708 and offyd, 0xf000f 1709 and offxd, 0xf000f 1710 imul offyd, 164 1711 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1712 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1713 1714%if ARCH_X86_32 1715 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 1716%else 1717 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1718 h, offxy, see, unused, top_offxy 1719%endif 1720 1721 movzx top_offxyd, offxyw 1722%if ARCH_X86_32 1723 mov [rsp+5*mmsize+1*gprsize], top_offxyd 1724 1725 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1726%endif 1727 shr offxyd, 16 1728 1729.loop_x_odd_v_overlap: 1730%if ARCH_X86_32 1731 mov r5, r5m 1732 lea r5, [base+pb_27_17] 1733 mov [rsp+5*mmsize+12], r5 1734%else 1735 mova m8, [pb_27_17] 1736%endif 1737 mov hd, r7m 1738 mov grain_lutq, grain_lutmp 1739.loop_y_v_overlap: 1740 ; src 1741 mova m0, [srcq] 1742 pxor m2, m2 1743 punpckhbw m1, m0, m2 1744 punpcklbw m0, m2 ; m0-1: src as word 1745 1746 ; scaling[src] 
1747%if ARCH_X86_32 1748 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1749 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1750%else 1751 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1752 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1753%endif 1754 REPX {psrlw x, 8}, m4, m5 1755 1756 ; grain = grain_lut[offy+y][offx+x] 1757 movu m3, [grain_lutq+offxyq] 1758%if ARCH_X86_32 1759 mov r5, [rsp+5*mmsize+1*gprsize] 1760 movu m7, [grain_lutq+r5] 1761%else 1762 movu m7, [grain_lutq+top_offxyq] 1763%endif 1764 punpckhbw m6, m7, m3 1765 punpcklbw m7, m3 1766%if ARCH_X86_32 1767 mov r5, [rsp+5*mmsize+12] 1768 pmaddubsw m3, [r5], m6 1769 pmaddubsw m6, [r5], m7 1770%else 1771 pmaddubsw m3, m8, m6 1772 pmaddubsw m6, m8, m7 1773%endif 1774 pmulhrsw m3, m14 1775 pmulhrsw m6, m14 1776 packsswb m6, m3 1777 pcmpgtb m7, m2, m6 1778 punpcklbw m2, m6, m7 1779 punpckhbw m6, m7 1780 1781 ; noise = round2(scaling[src] * grain, scaling_shift) 1782 pmullw m2, m4 1783 pmullw m6, m5 1784 pmulhrsw m2, m11 1785 pmulhrsw m6, m11 1786 1787 ; dst = clip_pixel(src, noise) 1788 paddw m0, m2 1789 paddw m1, m6 1790 pmaxsw m0, m13 1791 pmaxsw m1, m13 1792 pminsw m0, m12 1793 pminsw m1, m12 1794 packuswb m0, m1 1795 movifnidn dstq, dstmp 1796 mova [dstq+srcq], m0 1797 1798%if ARCH_X86_32 1799 add dword [rsp+5*mmsize+12], mmsize 1800%else 1801 mova m8, [pb_17_27] 1802%endif 1803 add srcq, r2mp 1804 add grain_lutq, 82 1805 dec hw 1806 jz .end_y_v_overlap 1807 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1808 ; remaining (up to) 30 lines 1809 btc hd, 16 1810 jnc .loop_y_v_overlap 1811 jmp .loop_y 1812 1813.end_y_v_overlap: 1814%if ARCH_X86_32 1815 add r4mp, 16 1816%else 1817 add wq, 16 1818%endif 1819 jge .end_hv 1820%if ARCH_X86_32 1821 mov srcq, r1mp 1822 add srcq, r4mp 1823%else 1824 lea srcq, [src_bakq+wq] 1825%endif 1826 btc dword r8m, 2 1827 jc .loop_x_hv_overlap 1828 add offxyd, 16 1829%if ARCH_X86_32 1830 add dword [rsp+5*mmsize+1*gprsize], 16 1831%else 1832 add top_offxyd, 16 
1833%endif 1834 jmp .loop_x_odd_v_overlap 1835 1836.loop_x_hv_overlap: 1837%if ARCH_X86_32 1838 mov r5, r5m 1839 lea r5, [base+pb_27_17] 1840 mov [rsp+5*mmsize+12], r5 1841 1842 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak 1843 1844 mov r5, [rsp+5*mmsize+1*gprsize] 1845 mov r4, offxyd 1846 add r5, 16 1847 add r4, 16 1848 mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy 1849 mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy 1850 1851 DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak 1852 1853 xor tmpd, tmpd 1854 mov seed, r3m 1855%else 1856 mova m8, [pb_27_17] 1857 1858 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1859 tmp, unused2, see, unused3 1860 1861 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 1862%endif 1863 mov r6d, seed 1864 or seed, 0xeff4eff4 1865 test seeb, seeh 1866 setp tmpb ; parity of top_seed 1867 shr seed, 16 1868 shl tmpd, 16 1869 test seeb, seeh 1870 setp tmpb ; parity of cur_seed 1871 or r6d, 0x00010001 1872 xor tmpd, r6d 1873 mov seed, tmpd 1874 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1875 1876%if ARCH_X86_32 1877 mov r3m, seed 1878 1879 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1880 1881 mov offxd, offyd 1882%else 1883 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1884 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1885 1886 lea topleft_offxyq, [top_offxyq+16] 1887 lea left_offxyq, [offyq+16] 1888 mov offyd, seed 1889 mov offxd, seed 1890%endif 1891 ror offyd, 8 1892 ror offxd, 12 1893 and offyd, 0xf000f 1894 and offxd, 0xf000f 1895 imul offyd, 164 1896 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1897 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1898 1899%if ARCH_X86_32 1900 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1901 1902 movzx r5, offxyw ; top_offxy 1903 mov [rsp+5*mmsize+1*gprsize], r5 1904%else 1905 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1906 h, offxy, see, left_offxy, 
top_offxy, topleft_offxy 1907 1908 movzx top_offxyd, offxyw 1909%endif 1910 shr offxyd, 16 1911 1912 mov hd, r7m 1913 mov grain_lutq, grain_lutmp 1914.loop_y_hv_overlap: 1915 ; grain = grain_lut[offy+y][offx+x] 1916 movu m3, [grain_lutq+offxyq] 1917%if ARCH_X86_32 1918 mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy 1919 mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy 1920 movu m6, [grain_lutq+r5] 1921 mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy 1922 movd m4, [grain_lutq+r0] 1923 movd m7, [grain_lutq+r5] 1924%else 1925 movu m6, [grain_lutq+top_offxyq] 1926 movd m4, [grain_lutq+left_offxyq] 1927 movd m7, [grain_lutq+topleft_offxyq] 1928%endif 1929 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1930 punpcklbw m4, m3 1931 punpcklbw m7, m6 1932 pmaddubsw m2, m15, m4 1933 pmaddubsw m4, m15, m7 1934 pmulhrsw m2, m14 1935 pmulhrsw m4, m14 1936 packsswb m2, m2 1937 packsswb m4, m4 1938 shufps m2, m3, q3210 1939 shufps m4, m6, q3210 1940 ; followed by v interpolation (top | cur -> cur) 1941 punpcklbw m3, m4, m2 1942 punpckhbw m4, m2 1943%if ARCH_X86_32 1944 mov r5, [rsp+5*mmsize+12] 1945 pmaddubsw m7, [r5], m4 1946 pmaddubsw m4, [r5], m3 1947%else 1948 pmaddubsw m7, m8, m4 1949 pmaddubsw m4, m8, m3 1950%endif 1951 pmulhrsw m7, m14 1952 pmulhrsw m4, m14 1953 packsswb m4, m7 1954 pxor m2, m2 1955 pcmpgtb m7, m2, m4 1956 punpcklbw m3, m4, m7 1957 punpckhbw m4, m7 1958 1959 ; src 1960 mova m0, [srcq] 1961 punpckhbw m1, m0, m2 1962 punpcklbw m0, m2 ; m0-1: src as word 1963 1964 ; scaling[src] 1965%if ARCH_X86_32 1966 vpgatherdw m5, m0, scalingq-1, r0, r5, m7 1967 vpgatherdw m6, m1, scalingq-1, r0, r5, m7 1968%else 1969 vpgatherdw m5, m0, scalingq-1, r13, r14, m7 1970 vpgatherdw m6, m1, scalingq-1, r13, r14, m7 1971%endif 1972 REPX {psrlw x, 8}, m5, m6 1973 1974 ; noise = round2(scaling[src] * grain, scaling_shift) 1975 pmullw m3, m5 1976 pmullw m4, m6 1977 pmulhrsw m3, m11 1978 pmulhrsw m4, m11 1979 1980 ; dst = clip_pixel(src, noise) 1981 paddw m0, m3 
1982 paddw m1, m4 1983 pmaxsw m0, m13 1984 pmaxsw m1, m13 1985 pminsw m0, m12 1986 pminsw m1, m12 1987 packuswb m0, m1 1988 movifnidn dstq, dstmp 1989 mova [dstq+srcq], m0 1990 1991%if ARCH_X86_32 1992 add dword [rsp+5*mmsize+12], mmsize 1993%else 1994 mova m8, [pb_17_27] 1995%endif 1996 add srcq, r2mp 1997 add grain_lutq, 82 1998 dec hw 1999 jz .end_y_hv_overlap 2000 ; 2 lines get vertical overlap, then fall back to non-overlap code for 2001 ; remaining (up to) 30 lines 2002 btc hd, 16 2003 jnc .loop_y_hv_overlap 2004 jmp .loop_y_h_overlap 2005 2006.end_y_hv_overlap: 2007%if ARCH_X86_32 2008 add r4mp, 16 2009%else 2010 add wq, 16 2011%endif 2012 jge .end_hv 2013%if ARCH_X86_32 2014 mov srcq, r1m 2015 add srcq, r4m 2016%else 2017 lea srcq, [src_bakq+wq] 2018%endif 2019 xor dword r8m, 4 2020 add offxyd, 16 2021%if ARCH_X86_32 2022 add dword [rsp+5*mmsize+1*gprsize], 16 2023%else 2024 add top_offxyd, 16 2025%endif 2026 jmp .loop_x_odd_v_overlap 2027 2028.end_hv: 2029 RET 2030 2031%macro FGUV_FN 3 ; name, ss_hor, ss_ver 2032INIT_XMM ssse3 2033%if ARCH_X86_32 2034; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, 2035; sby, luma, lstride, uv_pl, is_id) 2036%if STACK_ALIGNMENT < mmsize 2037DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 2038cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ 2039 tmp, src, scaling, h, fg_data, picptr, unused 2040 mov r0, r0m 2041 mov r1, r2m 2042 mov r2, r4m 2043 mov r3, r6m 2044 mov r4, r7m 2045 mov [rsp+7*mmsize+3*gprsize], r0 2046 mov [rsp+7*mmsize+5*gprsize], r1 2047 mov [rsp+7*mmsize+7*gprsize], r2 2048 mov [rsp+7*mmsize+9*gprsize], r3 2049 mov [rsp+7*mmsize+10*gprsize], r4 2050 2051 mov r0, r8m 2052 mov r1, r9m 2053 mov r2, r10m 2054 mov r4, r11m 2055 mov r3, r12m 2056 mov [rsp+7*mmsize+11*gprsize], r0 2057 mov [rsp+7*mmsize+12*gprsize], r1 2058 mov [rsp+7*mmsize+13*gprsize], r2 2059 mov [rsp+7*mmsize+14*gprsize], r4 2060%else 2061cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + 
(4) * gprsize, \ 2062 tmp, src, scaling, h, fg_data, picptr, unused 2063%endif 2064 mov srcq, srcm 2065 mov fg_dataq, r3m 2066 mov scalingq, r5m 2067%if STACK_ALIGNMENT < mmsize 2068%define r0m [rsp+7*mmsize+ 3*gprsize] 2069%define r1m [rsp+7*mmsize+ 4*gprsize] 2070%define r2m [rsp+7*mmsize+ 5*gprsize] 2071%define r3m [rsp+7*mmsize+ 6*gprsize] 2072%define r4m [rsp+7*mmsize+ 7*gprsize] 2073%define r5m [rsp+7*mmsize+ 8*gprsize] 2074%define r6m [rsp+7*mmsize+ 9*gprsize] 2075%define r7m [rsp+7*mmsize+10*gprsize] 2076%define r8m [rsp+7*mmsize+11*gprsize] 2077%define r9m [rsp+7*mmsize+12*gprsize] 2078%define r10m [rsp+7*mmsize+13*gprsize] 2079%define r11m [rsp+7*mmsize+14*gprsize] 2080%define r12m [rsp+7*mmsize+15*gprsize] 2081%endif 2082 LEA r5, pb_mask 2083%define base r5-pb_mask 2084 mov r5m, r5 2085%else 2086cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2087 grain_lut, tmp, sby, luma, lstride, uv_pl, is_id 2088 lea r8, [pb_mask] 2089%define base r8-pb_mask 2090%endif 2091 mov r6d, [fg_dataq+FGData.scaling_shift] 2092 movd m3, [base+mul_bits+r6*2-14] 2093 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2094 lea tmpd, [r6d*2] 2095%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize 2096 test r3, r3 2097%else 2098 cmp dword r12m, 0 ; is_idm 2099%endif 2100 movd m5, [base+min+r6*2] 2101 cmovne r6d, tmpd 2102 movd m4, [base+max+r6*2] 2103 punpcklwd m3, m3 2104 punpcklwd m5, m5 2105 punpcklwd m4, m4 2106 pshufd m3, m3, q0000 2107 pshufd m5, m5, q0000 2108 pshufd m4, m4, q0000 2109 SCRATCH 3, 11, 0 2110 SCRATCH 4, 12, 1 2111 SCRATCH 5, 13, 2 2112 2113 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2114 jne .csfl 2115 2116%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 2117%if ARCH_X86_32 2118 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2119%else 2120 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2121%endif 2122 2123%if %1 2124 mov r6d, dword r11m 2125 movd m0, 
[fg_dataq+FGData.uv_mult+r6*4] 2126 movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2127 punpcklbw m6, m1, m0 2128 movd m7, [fg_dataq+FGData.uv_offset+r6*4] 2129 punpcklwd m6, m6 2130 punpcklwd m7, m7 2131 pshufd m6, m6, q0000 2132 pshufd m7, m7, q0000 2133 SCRATCH 6, 14, 3 2134 SCRATCH 7, 15, 4 2135%endif 2136 2137 mov sbyd, r8m 2138 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 2139 test overlapd, overlapd 2140 jz %%no_vertical_overlap 2141%if ARCH_X86_32 2142%if %2 2143 mova m1, [base+pb_23_22_h] 2144%else 2145 mova m1, [base+pb_27_17_17_27] 2146%endif 2147 mova m0, [base+pw_1024] 2148%else 2149%if %2 2150 mova m1, [pb_23_22_h] 2151%else 2152 mova m1, [pb_27_17_17_27] 2153%endif 2154 mova m0, [pw_1024] 2155%endif 2156 SCRATCH 0, 8, 5 2157 SCRATCH 1, 9, 6 2158 test sbyd, sbyd 2159 jnz %%vertical_overlap 2160 ; fall-through 2161 2162%%no_vertical_overlap: 2163 mov r8m, overlapd 2164%if ARCH_X86_32 2165 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap 2166 imul seed, (173 << 24) | 37 2167%else 2168 imul seed, sbyd, (173 << 24) | 37 2169%endif 2170 add seed, (105 << 24) | 178 2171 rol seed, 8 2172 movzx seed, seew 2173 xor seed, [fg_dataq+FGData.seed] 2174 2175%if ARCH_X86_32 2176 mov r3m, seed 2177 2178 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2179%define luma_bakq lumaq 2180 2181 mov wq, r4m 2182%if %3 2183 shl r10mp, 1 2184%endif 2185%else 2186 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2187 unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak 2188 2189 mov lstrideq, r10mp 2190%endif 2191 2192 mov lumaq, r9mp 2193 lea src_bakq, [srcq+wq] 2194 lea luma_bakq, [lumaq+wq*(1+%2)] 2195 neg wq 2196 sub r0mp, srcq 2197%if ARCH_X86_32 2198 mov r1m, src_bakq 2199 mov r11m, luma_bakq 2200 mov r4m, wq 2201 2202 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2203%else 2204 mov r11mp, src_bakq 2205 mov r12mp, strideq 2206%endif 2207 2208%%loop_x: 2209%if ARCH_X86_32 2210 mov seed, 
r3m 2211%endif 2212 mov r6d, seed 2213 or seed, 0xEFF4 2214 shr r6d, 1 2215 test seeb, seeh 2216 lea seed, [r6+0x8000] 2217 cmovp seed, r6d ; updated seed 2218%if ARCH_X86_32 2219 mov r3m, seed 2220 2221 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2222 2223 mov offxd, offyd 2224%else 2225 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2226 offx, offy, see, overlap, unused1, unused2, lstride 2227 2228 mov offyd, seed 2229 mov offxd, seed 2230%endif 2231 ror offyd, 8 2232 shr offxd, 12 2233 and offyd, 0xf 2234 imul offyd, 164>>%3 2235 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx 2236 2237%if ARCH_X86_32 2238 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2239%else 2240 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2241 h, offxy, see, overlap, unused1, unused2, lstride, luma_bak 2242%endif 2243 2244%%loop_x_odd: 2245 mov hd, r7m 2246 mov grain_lutq, grain_lutmp 2247%%loop_y: 2248 ; src 2249%if ARCH_X86_32 2250 mov lumaq, r9mp 2251%endif 2252%if %2 2253 mova m4, [lumaq+ 0] 2254 mova m6, [lumaq+16] 2255 mova m0, [srcq] 2256%if ARCH_X86_32 2257 add lumaq, r10mp 2258 mov r9mp, lumaq 2259 mov r5, r5m 2260 movd m7, [base+pb_1] 2261%else 2262 movd m7, [pb_1] 2263%endif 2264 pshufd m7, m7, q0000 2265 pxor m2, m2 2266 pmaddubsw m4, m7 2267 pmaddubsw m6, m7 2268 pavgw m4, m2 2269 pavgw m6, m2 2270%else 2271 mova m4, [lumaq] 2272 mova m0, [srcq] 2273%if ARCH_X86_32 2274 add lumaq, r10mp 2275 mov r9mp, lumaq 2276%endif 2277 pxor m2, m2 2278%endif 2279 2280%if %1 2281%if %2 2282 packuswb m4, m6 ; luma 2283%endif 2284 punpckhbw m6, m4, m0 2285 punpcklbw m4, m0 ; { luma, chroma } 2286 pmaddubsw m6, m14 2287 pmaddubsw m4, m14 2288 psraw m6, 6 2289 psraw m4, 6 2290 paddw m6, m15 2291 paddw m4, m15 2292 packuswb m4, m6 ; pack+unpack = clip 2293 punpckhbw m6, m4, m2 2294 punpcklbw m4, m2 2295%elif %2 == 0 2296 punpckhbw m6, m4, m2 2297 punpcklbw m4, m2 2298%endif 2299 2300 ; scaling[luma_src] 2301%if 
ARCH_X86_32 2302 vpgatherdw m7, m4, scalingq-1, r0, r5 2303 vpgatherdw m5, m6, scalingq-1, r0, r5 2304%else 2305 vpgatherdw m7, m4, scalingq-1, r12, r2 2306 vpgatherdw m5, m6, scalingq-1, r12, r2 2307%endif 2308 REPX {psrlw x, 8}, m7, m5 2309 2310 ; unpack chroma_source 2311 punpckhbw m1, m0, m2 2312 punpcklbw m0, m2 ; m0-1: src as word 2313 2314 ; grain = grain_lut[offy+y][offx+x] 2315 movu m3, [grain_lutq+offxyq+ 0] 2316 pcmpgtb m6, m2, m3 2317 punpcklbw m2, m3, m6 2318 punpckhbw m3, m6 2319 2320 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2321 pmullw m2, m7 2322 pmullw m3, m5 2323 pmulhrsw m2, m11 2324 pmulhrsw m3, m11 2325 2326%if ARCH_X86_32 2327 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2328%endif 2329 2330 ; dst = clip_pixel(src, noise) 2331 paddw m0, m2 2332 paddw m1, m3 2333 pmaxsw m0, m13 2334 pmaxsw m1, m13 2335 pminsw m0, m12 2336 pminsw m1, m12 2337 packuswb m0, m1 2338 movifnidn dstq, dstmp 2339 mova [dstq+srcq], m0 2340 2341%if ARCH_X86_32 2342 add srcq, r2mp 2343 ; we already incremented lumaq above 2344%else 2345 add srcq, r12mp 2346%if %3 2347 lea lumaq, [lumaq+lstrideq*2] 2348%else 2349 add lumaq, lstrideq 2350%endif 2351%endif 2352 add grain_lutq, 82 2353 dec hw 2354 jg %%loop_y 2355 2356%if ARCH_X86_32 2357 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2358 2359 mov wq, r4m 2360%endif 2361 add wq, 16 2362 jge %%end 2363%if ARCH_X86_32 2364 mov srcq, r1mp 2365 mov lumaq, r11mp 2366%else 2367 mov srcq, r11mp 2368%endif 2369 lea lumaq, [luma_bakq+wq*(1+%2)] 2370 add srcq, wq 2371%if ARCH_X86_32 2372 mov r4m, wq 2373 mov r9m, lumaq 2374%endif 2375%if %2 == 0 2376 ; adjust top_offxy 2377%if ARCH_X86_32 2378 add dword [rsp+7*mmsize+1*gprsize], 16 2379%else 2380 add r11d, 16 2381%endif 2382 add offxyd, 16 2383 btc dword r8m, 2 2384 jc %%loop_x_even 2385 test dword r8m, 2 2386 jz %%loop_x_odd 2387 jmp %%loop_x_odd_v_overlap 2388%%loop_x_even: 2389%endif 2390 test dword r8m, 1 2391 jz %%loop_x 2392 2393 ; 
r8m = sbym 2394 test dword r8m, 2 2395 jne %%loop_x_hv_overlap 2396 2397 ; horizontal overlap (without vertical overlap) 2398%%loop_x_h_overlap: 2399%if ARCH_X86_32 2400%if %2 2401 lea r6, [offxyd+16] 2402 mov [rsp+7*mmsize+0*gprsize], r6 2403%else 2404 mov [rsp+7*mmsize+0*gprsize], offxyd 2405%endif 2406 2407 DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut 2408 2409 mov seed, r3m 2410%else 2411 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2412 offx, offy, see, left_offxy, unused1, unused2, lstride 2413 2414%if %2 2415 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2416%else 2417 mov left_offxyd, offyd 2418%endif 2419%endif 2420 mov r6d, seed 2421 or seed, 0xEFF4 2422 shr r6d, 1 2423 test seeb, seeh 2424 lea seed, [r6+0x8000] 2425 cmovp seed, r6d ; updated seed 2426 2427%if ARCH_X86_32 2428 mov r3m, seed 2429 2430 DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx 2431 2432 mov offxd, offyd 2433%else 2434 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2435 offx, offy, see, left_offxy, unused1, unused2, lstride 2436 2437 mov offyd, seed 2438 mov offxd, seed 2439%endif 2440 ror offyd, 8 2441 shr offxd, 12 2442 and offyd, 0xf 2443 imul offyd, 164>>%3 2444 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2445 2446%if ARCH_X86_32 2447 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2448%else 2449 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2450 h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak 2451%endif 2452 2453 mov hd, r7m 2454 mov grain_lutq, grain_lutmp 2455%%loop_y_h_overlap: 2456 ; src 2457%if ARCH_X86_32 2458 mov lumaq, r9mp 2459%endif 2460%if %2 2461 mova m4, [lumaq+ 0] 2462 mova m6, [lumaq+16] 2463 mova m0, [srcq] 2464%if ARCH_X86_32 2465 add lumaq, r10mp 2466 mov r9mp, lumaq 2467 mov r5, r5m 2468 movd m7, [base+pb_1] 2469%else 2470 movd m7, [pb_1] 2471%endif 2472 pshufd m7, m7, q0000 2473 pxor m2, m2 2474 pmaddubsw m4, m7 2475 
pmaddubsw m6, m7 2476 pavgw m4, m2 2477 pavgw m6, m2 2478%else 2479 mova m4, [lumaq] 2480 mova m0, [srcq] 2481%if ARCH_X86_32 2482 add lumaq, r10mp 2483 mov r9mp, lumaq 2484%endif 2485 pxor m2, m2 2486%endif 2487 2488%if %1 2489%if %2 2490 packuswb m4, m6 ; luma 2491%endif 2492 punpckhbw m6, m4, m0 2493 punpcklbw m4, m0 ; { luma, chroma } 2494 pmaddubsw m6, m14 2495 pmaddubsw m4, m14 2496 psraw m6, 6 2497 psraw m4, 6 2498 paddw m6, m15 2499 paddw m4, m15 2500 packuswb m4, m6 ; pack+unpack = clip 2501 punpckhbw m6, m4, m2 2502 punpcklbw m4, m2 2503%elif %2 == 0 2504 punpckhbw m6, m4, m2 2505 punpcklbw m4, m2 2506%endif 2507 2508 ; scaling[luma_src] 2509%if ARCH_X86_32 2510 vpgatherdw m7, m4, scalingq-1, r0, r5 2511 vpgatherdw m5, m6, scalingq-1, r0, r5 2512%else 2513 vpgatherdw m7, m4, scalingq-1, r12, r2 2514 vpgatherdw m5, m6, scalingq-1, r12, r2 2515%endif 2516 REPX {psrlw x, 8}, m7, m5 2517 2518 ; unpack chroma_source 2519 punpckhbw m1, m0, m2 2520 punpcklbw m0, m2 ; m0-1: src as word 2521 2522 ; grain = grain_lut[offy+y][offx+x] 2523 movu m4, [grain_lutq+offxyq+ 0] 2524%if ARCH_X86_32 2525 mov r0, [rsp+7*mmsize+0*gprsize] 2526 movd m2, [grain_lutq+r0+ 0] 2527%else 2528 movd m2, [grain_lutq+left_offxyq+ 0] 2529%endif 2530 punpcklbw m2, m4 2531 pmaddubsw m3, m9, m2 2532 pmulhrsw m3, m8 2533 packsswb m3, m3 2534 shufps m3, m4, q3210 2535 pxor m4, m4 2536 pcmpgtb m4, m3 2537 punpcklbw m2, m3, m4 2538 punpckhbw m3, m4 2539 2540 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2541 pmullw m2, m7 2542 pmullw m3, m5 2543 pmulhrsw m2, m11 2544 pmulhrsw m3, m11 2545 2546%if ARCH_X86_32 2547 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2548%endif 2549 2550 ; dst = clip_pixel(src, noise) 2551 paddw m0, m2 2552 paddw m1, m3 2553 pmaxsw m0, m13 2554 pmaxsw m1, m13 2555 pminsw m0, m12 2556 pminsw m1, m12 2557 packuswb m0, m1 2558 movifnidn dstq, dstmp 2559 mova [dstq+srcq], m0 2560 2561%if ARCH_X86_32 2562 add srcq, r2mp 2563 ; lumaq has already been 
incremented above 2564%else 2565 add srcq, r12mp 2566%if %3 2567 lea lumaq, [lumaq+lstrideq*2] 2568%else 2569 add lumaq, lstrideq 2570%endif 2571%endif 2572 add grain_lutq, 82 2573 dec hw 2574 jg %%loop_y_h_overlap 2575 2576%if ARCH_X86_32 2577 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2578 2579 mov wq, r4m 2580%endif 2581 add wq, 16 2582 jge %%end 2583%if ARCH_X86_32 2584 mov srcq, r1mp 2585 mov lumaq, r11mp 2586%else 2587 mov srcq, r11mp 2588%endif 2589 lea lumaq, [luma_bakq+wq*(1+%2)] 2590 add srcq, wq 2591%if ARCH_X86_32 2592 mov r4m, wq 2593 mov r9m, lumaq 2594%endif 2595%if %2 == 0 2596 xor dword r8m, 4 2597 ; adjust top_offxyd 2598%if ARCH_X86_32 2599 add dword [rsp+7*mmsize+1*gprsize], 16 2600%else 2601 add r11d, 16 2602%endif 2603 add offxyd, 16 2604%endif 2605 2606 ; r8m = sbym 2607 test dword r8m, 2 2608%if %2 2609 jne %%loop_x_hv_overlap 2610 jmp %%loop_x_h_overlap 2611%else 2612 jne %%loop_x_odd_v_overlap 2613 jmp %%loop_x_odd 2614%endif 2615 2616%%end: 2617 RET 2618 2619%%vertical_overlap: 2620%if ARCH_X86_32 2621 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2622%else 2623 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 2624%endif 2625 2626 or overlapd, 2 ; top_overlap: overlap & 2 2627 mov r8m, overlapd 2628 movzx sbyd, sbyb 2629%if ARCH_X86_32 2630 imul r4, [fg_dataq+FGData.seed], 0x00010001 2631 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 2632%else 2633 imul seed, [fg_dataq+FGData.seed], 0x00010001 2634%endif 2635 imul tmpd, sbyd, 173 * 0x00010001 2636 imul sbyd, 37 * 0x01000100 2637 add tmpd, (105 << 16) | 188 2638 add sbyd, (178 << 24) | (141 << 8) 2639 and tmpd, 0x00ff00ff 2640 and sbyd, 0xff00ff00 2641 xor seed, tmpd 2642%if ARCH_X86_32 2643 xor sbyd, seed ; (cur_seed << 16) | top_seed 2644 2645 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2646 2647 mov r3m, seed 2648 mov wq, r4m 2649%if %3 2650 shl r10mp, 1 2651%endif 2652%else 2653 xor seed, sbyd ; 
(cur_seed << 16) | top_seed 2654 2655 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2656 tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak 2657 2658 mov lstrideq, r10mp 2659%endif 2660 2661 mov lumaq, r9mp 2662 lea src_bakq, [srcq+wq] 2663 lea luma_bakq, [lumaq+wq*(1+%2)] 2664 neg wq 2665 sub r0mp, srcq 2666%if ARCH_X86_32 2667 mov r1m, src_bakq 2668 mov r11m, luma_bakq 2669 mov r4m, wq 2670 2671 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2672%else 2673 mov r11mp, src_bakq 2674 mov r12mp, strideq 2675%endif 2676 2677%%loop_x_v_overlap: 2678%if ARCH_X86_32 2679 mov seed, r3m 2680 xor tmpd, tmpd 2681%endif 2682 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 2683 mov r6d, seed 2684 or seed, 0xeff4eff4 2685 test seeb, seeh 2686 setp tmpb ; parity of top_seed 2687 shr seed, 16 2688 shl tmpd, 16 2689 test seeb, seeh 2690 setp tmpb ; parity of cur_seed 2691 or r6d, 0x00010001 2692 xor tmpd, r6d 2693 mov seed, tmpd 2694 ror seed, 1 ; updated (cur_seed << 16) | top_seed 2695 2696%if ARCH_X86_32 2697 mov r3m, seed 2698 2699 DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx 2700 2701 mov offxd, offyd 2702%else 2703 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2704 offx, offy, see, overlap, top_offxy, unused, lstride 2705 2706 mov offxd, seed 2707 mov offyd, seed 2708%endif 2709 ror offyd, 8 2710 ror offxd, 12 2711 and offyd, 0xf000f 2712 and offxd, 0xf000f 2713 imul offyd, 164>>%3 2714 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2715 lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 2716 2717%if ARCH_X86_32 2718 DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy 2719%else 2720 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2721 h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak 2722%endif 2723 2724 movzx top_offxyd, offxyw 2725 shr offxyd, 16 2726%if ARCH_X86_32 2727 mov [rsp+7*mmsize+1*gprsize], top_offxyd 2728 2729 
; NOTE(review): this is the tail of the %%FGUV_32x32xN_LOOP macro — the macro
; header and the %%loop_x / %%loop_y / h-overlap paths start above this excerpt.
; Macro parameters (inferred from the FGUV_FN 420/422/444 instantiations at the
; bottom and from how they gate the code below — confirm against the macro head):
;   %1 = apply luma->chroma remap before scaling-LUT lookup (0 for the .csfl entry)
;   %2 = horizontal chroma subsampling (gates 2-pixel luma averaging, wq*(1+%2))
;   %3 = vertical chroma subsampling (gates lstrideq*2 stepping, 164>>%3)
    ; the %endif below closes an ARCH_X86_32 conditional opened above this excerpt
    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif

; Process one 16-pixel-wide column with vertical (top) overlap blending only.
%%loop_x_odd_v_overlap:
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov              r5, r5m                ; reload PIC base pointer
%endif
%if %3
    mova             m1, [PIC_ptr(pb_23_22)] ; 1-row v-overlap weights (subsampled)
%else
    mova             m1, [PIC_ptr(pb_27_17)] ; first-row weights of 2-row v-overlap
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
    mov           lumaq, r9mp
%endif
%if %2
    ; horizontally subsampled: average each pair of luma bytes
    ; (pmaddubsw with 1,1 sums pairs to words; pavgw vs 0 gives (a+b+1)>>1)
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; luma -> chroma remap; m14/m15 presumably hold the uv multiplier/offset,
    ; set up before this excerpt — TODO confirm against the macro prologue
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5
    vpgatherdw       m5, m6, scalingq-1, r0, r5
%else
    vpgatherdw       m7, m4, scalingq-1, r12, r2
    vpgatherdw       m5, m6, scalingq-1, r12, r2
%endif
    REPX {psrlw x, 8}, m7, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r0, [rsp+7*mmsize+1*gprsize] ; spilled top_offxy
    movu             m4, [grain_lutq+r0]
%else
    movu             m4, [grain_lutq+top_offxyq]
%endif
    ; blend top and current grain rows with the weights in m1, then
    ; sign-extend the blended bytes to words (pcmpgtb builds the sign mask)
    punpckhbw        m6, m4, m3
    punpcklbw        m4, m3
    pmaddubsw        m2, m1, m6
    pmaddubsw        m3, m1, m4
    pmulhrsw         m2, m8
    pmulhrsw         m3, m8
    packsswb         m3, m2
    pxor             m6, m6
    pcmpgtb          m6, m3
    punpcklbw        m2, m3, m6
    punpckhbw        m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; unpack chroma_source
    pxor             m4, m4
    punpckhbw        m6, m0, m4
    punpcklbw        m0, m4                 ; m0-1: src as word

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m6, m3
    pmaxsw           m0, m13
    pmaxsw           m6, m13
    pminsw           m0, m12
    pminsw           m6, m12
    packuswb         m0, m6
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    dec              hw
    je %%end_y_v_overlap
%if ARCH_X86_32
    add            srcq, r2mp
    ; lumaq has already been incremented above
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*2]  ; skip the vertically-subsampled row
%else
    add           lumaq, lstrideq
%endif
%endif
    add      grain_lutq, 82                 ; grain LUT row stride (see 164>>%3 above)
%if %3 == 0
    ; non-subsampled: the vertical overlap spans two rows — 27/17 weights on the
    ; first, 17/27 on the second; bit 16 of hd tracks which of the two we are on
    btc              hd, 16
%if ARCH_X86_32
    mov              r5, r5m
%endif
    mova             m1, [PIC_ptr(pb_17_27)]
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y                            ; remaining rows have no v overlap

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif

%if %2
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
%if ARCH_X86_32
    add dword [rsp+7*mmsize+1*gprsize], 16  ; spilled top_offxy += 16
%else
    add     top_offxyd, 16
%endif
    add          offxyd, 16
    ; NOTE(review): bit 2 of r8m appears to be an odd/even-column flag reusing
    ; the seed offsets — confirm against the macro head
    btc       dword r8m, 2
    jnc %%loop_x_odd_v_overlap
%endif

; Column with both horizontal (left) and vertical (top) overlap.
%%loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

    ; x86-32 has too few registers: left/top/topleft offsets live on the stack
    mov              r6, [rsp+7*mmsize+1*gprsize]
%if %2
    lea              r0, [r3d+16]
    add              r6, 16
    mov [rsp+7*mmsize+0*gprsize], r0       ; left_offxy
%else
    mov [rsp+7*mmsize+0*gprsize], r3       ; left_offxy
%endif
    mov [rsp+7*mmsize+2*gprsize], r6       ; topleft_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

    mov            seed, r3m
    xor            tmpd, tmpd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

%if %2
    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offxyq+16]
%else
    mov  topleft_offxyq, top_offxyq
    mov     left_offxyq, offxyq
%endif

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; advance the packed (cur_seed << 16) | top_seed LFSR pair in one pass
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                    ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                 ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov           offxd, seed
    mov           offyd, seed
%endif
    ; derive two 4-bit x/y grain offsets (packed cur|top) from the seed pair
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3           ; 164 = 2 grain rows of 82 bytes
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif

    movzx    top_offxyd, offxyw            ; split packed pair: low word = top
    shr          offxyd, 16                ; high word = current
%if ARCH_X86_32
    mov [rsp+7*mmsize+1*gprsize], top_offxyd
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov              r5, r5m
%endif
%if %3
    mova             m3, [PIC_ptr(pb_23_22)] ; v-overlap weights (subsampled)
%else
    mova             m3, [PIC_ptr(pb_27_17)] ; first-row v-overlap weights
%endif
%%loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov              r0, [rsp+7*mmsize+2*gprsize]   ; topleft_offxy
    mov              r5, [rsp+7*mmsize+1*gprsize]   ; top_offxy
    movd             m1, [grain_lutq+r0]
    mov              r0, [rsp+7*mmsize+0*gprsize]   ; left_offxy
%else
    movd             m1, [grain_lutq+topleft_offxyq]
%endif
    movu             m2, [grain_lutq+offxyq]
%if ARCH_X86_32
    movu             m6, [grain_lutq+r5]
    movd             m4, [grain_lutq+r0]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m1, m6
    punpcklbw        m4, m2
    pmaddubsw        m0, m9, m1
    pmaddubsw        m1, m9, m4
    REPX {pmulhrsw x, m8}, m0, m1
    packsswb         m0, m1
    shufps           m4, m0, m2, q3232     ; splice blended left column into cur row
    shufps           m0, m6, q3210         ; splice blended topleft column into top row
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw        m2, m0, m4
    punpckhbw        m0, m4
    pmaddubsw        m4, m3, m0
    pmaddubsw        m1, m3, m2
    pmulhrsw         m4, m8
    pmulhrsw         m1, m8
    packsswb         m1, m4                ; m1 = fully blended grain (signed bytes)

    ; src
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov           lumaq, r9mp
%endif
%if %2
    ; horizontally subsampled: average luma byte pairs (same as v-overlap path)
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; luma -> chroma remap (see note on m14/m15 at the v-overlap copy above)
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5
    vpgatherdw       m5, m6, scalingq-1, r0, r5
%else
%if %3
    vpgatherdw       m7, m4, scalingq-1, r2, r12
    vpgatherdw       m5, m6, scalingq-1, r2, r12
%else
    vpgatherdw       m7, m4, scalingq-1, r2, r13
    vpgatherdw       m5, m6, scalingq-1, r2, r13
%endif
%endif
    REPX {psrlw x, 8}, m7, m5

    ; unpack grain (sign-extend blended bytes to words)
    pxor             m4, m4
    pcmpgtb          m4, m1
    punpcklbw        m2, m1, m4
    punpckhbw        m1, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m1, m5
    pmulhrsw         m2, m11
    pmulhrsw         m1, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; unpack chroma source
    pxor             m4, m4
    punpckhbw        m5, m0, m4
    punpcklbw        m0, m4                 ; m0-1: src as word

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m5, m1
    pmaxsw           m0, m13
    pmaxsw           m5, m13
    pminsw           m0, m12
    pminsw           m5, m12
    packuswb         m0, m5
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

%if ARCH_X86_32
    add            srcq, r2mp
    ; lumaq has been adjusted above already
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*(1+%2)]
%else
    add           lumaq, r10mp
%endif
%endif
    add      grain_lutq, 82
    dec              hw
%if %3
    ; subsampled: v overlap only affects the first row — remaining rows
    ; need h overlap only
    jg %%loop_y_h_overlap
%else
    jle %%end_y_hv_overlap
%if ARCH_X86_32
    mov              r5, r5m
%endif
    ; second of the two v-overlap rows uses the 17/27 weights
    mova             m3, [PIC_ptr(pb_17_27)]
    btc              hd, 16
    jnc %%loop_y_hv_overlap
%if ARCH_X86_64
    mov        lstrideq, r10mp
%endif
    jmp %%loop_y_h_overlap
%%end_y_hv_overlap:
%if ARCH_X86_64
    mov        lstrideq, r10mp
%endif
%endif

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif
%if %2
    jmp %%loop_x_hv_overlap
%else
%if ARCH_X86_32
    add dword [rsp+7*mmsize+1*gprsize], 16  ; spilled top_offxy += 16
%else
    add     top_offxyd, 16
%endif
    add          offxyd, 16
    xor       dword r8m, 4                  ; toggle the odd/even-column flag
    jmp %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro

    ; NOTE(review): tail of the enclosing FGUV_FN macro (head above this
    ; excerpt). Instantiate the loop twice: default path with the luma->chroma
    ; remap, and a .csfl entry point without it (%1 = 0).
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro

; Instantiations per chroma layout: (name, h-subsampling %2, v-subsampling %3)
FGUV_FN 420, 1, 1

; spill slots for incoming args when the stack cannot be assumed aligned
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 422, 1, 0

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 444, 0, 0