1; Copyright © 2019-2022, VideoLAN and dav1d authors 2; Copyright © 2019-2022, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28%include "x86/filmgrain_common.asm" 29 30%if ARCH_X86_64 31 32SECTION_RODATA 32 33pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 34gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 35gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 36gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 37gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 38gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 39; note: the order of (some of) the following constants matter 40pb_27_17: times 2 db 27, 17 41byte_blend: db 0, 0, 0, -1 42pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 43pb_17_27: times 2 db 17, 27 44pb_1: times 4 db 1 45pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32 46next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 47pw_seed_xor: times 2 dw 0xb524 48 times 2 dw 0x49d8 49fg_min: times 4 db 0 50 times 4 db 16 51fg_max: times 4 db 255 52 times 4 db 240 53 times 4 db 235 54pd_m65536: dd -65536 55pw_8: times 2 dw 8 56pw_1024: times 2 dw 1024 57hmul_bits: dw 32768, 16384, 8192, 4096 58round: dw 2048, 1024, 512 59mul_bits: dw 256, 128, 64, 32, 16 60round_vals: dw 32, 64, 128, 256, 512 61pw_1: dw 1 62 63%macro JMP_TABLE 2-* 64 %1_8bpc_%2_table: 65 %xdefine %%base %1_8bpc_%2_table 66 %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) 67 %rep %0 - 2 68 dd %%prefix %+ .ar%3 - %%base 69 %rotate 1 70 %endrep 71%endmacro 72 73JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 74JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 75JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 76JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 77 78SECTION .text 79 80INIT_YMM avx2 81cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data 82%define base r4-generate_grain_y_8bpc_avx2_table 83 lea r4, [generate_grain_y_8bpc_avx2_table] 84 vpbroadcastw xm0, [fg_dataq+FGData.seed] 85 mov r6d, [fg_dataq+FGData.grain_scale_shift] 86 movq xm1, 
[base+next_upperbit_mask] 87 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 88 movq xm4, [base+mul_bits] 89 movq xm5, [base+hmul_bits] 90 mov r7, -73*82 91 mova xm6, [base+pb_mask] 92 sub bufq, r7 93 vpbroadcastw xm7, [base+round+r6*2] 94 lea r6, [gaussian_sequence] 95 movsxd r5, [r4+r5*4] 96.loop: 97 pand xm2, xm0, xm1 98 psrlw xm3, xm2, 10 99 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 100 pmullw xm2, xm4 ; bits 0x0f00 are set 101 pmulhuw xm0, xm5 102 pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds 103 psllq xm2, xm3, 30 104 por xm2, xm3 105 psllq xm3, xm2, 15 106 por xm2, xm0 ; aggregate each bit into next seed's high bit 107 por xm3, xm2 ; 4 next output seeds 108 pshuflw xm0, xm3, q3333 109 psrlw xm3, 5 110 pand xm2, xm0, xm1 111 movq r2, xm3 112 psrlw xm3, xm2, 10 113 por xm2, xm3 114 pmullw xm2, xm4 115 pmulhuw xm0, xm5 116 movzx r3d, r2w 117 pshufb xm3, xm6, xm2 118 psllq xm2, xm3, 30 119 por xm2, xm3 120 psllq xm3, xm2, 15 121 por xm0, xm2 122 movd xm2, [r6+r3*2] 123 rorx r3, r2, 32 124 por xm3, xm0 125 shr r2d, 16 126 pinsrw xm2, [r6+r2*2], 1 127 pshuflw xm0, xm3, q3333 128 movzx r2d, r3w 129 psrlw xm3, 5 130 pinsrw xm2, [r6+r2*2], 2 131 shr r3d, 16 132 movq r2, xm3 133 pinsrw xm2, [r6+r3*2], 3 134 movzx r3d, r2w 135 pinsrw xm2, [r6+r3*2], 4 136 rorx r3, r2, 32 137 shr r2d, 16 138 pinsrw xm2, [r6+r2*2], 5 139 movzx r2d, r3w 140 pinsrw xm2, [r6+r2*2], 6 141 shr r3d, 16 142 pinsrw xm2, [r6+r3*2], 7 143 pmulhrsw xm2, xm7 144 packsswb xm2, xm2 145 movq [bufq+r7], xm2 146 add r7, 8 147 jl .loop 148 149 ; auto-regression code 150 add r5, r4 151 jmp r5 152 153.ar1: 154 DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 155 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 156 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 157 movd xm5, [fg_dataq+FGData.ar_coeffs_y] 158 mova xm2, [base+gen_shufC] 159 DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 160 pinsrb xm5, [base+pb_1], 3 161 vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd 162 
    pmovsxbw        xm5, xm5
    pshufd          xm4, xm5, q0000
    pshufd          xm5, xm5, q1111
    sub            bufq, 82*73-(82*3+79)     ; skip 3 top border rows + left border
    mov              hd, 70
    mov            mind, -128
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -76
    movsx         val3d, byte [bufq+xq-1]    ; seed "left" value from the border column
.x_loop_ar1:
    ; vector part: 4 pixels' worth of top-row taps + rounding, consumed one
    ; dword at a time by the scalar inner loop below
    pmovsxbw        xm1, [bufq+xq-82-3]
    pshufb          xm0, xm1, xm2
    punpckhwd       xm1, xm3
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
.x_loop_ar1_inner:
    ; scalar recurrence: val = clip(grain + ((top_taps + left*cf3) >> shift))
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    movsx         val0d, byte [bufq+xq]
    sarx          val3d, val3d, shiftd
    add           val3d, val0d
    cmp           val3d, maxd
    cmovns        val3d, maxd                ; clamp to 127
    cmp           val3d, mind
    cmovs         val3d, mind                ; clamp to -128
    mov       [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xb, 3
    jnz .x_loop_ar1_inner                    ; 4 scalar steps per vector batch
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar1
.ar0:
    RET

.ar2:
%if WIN64
    ; grow the saved-XMM set from the 8 declared in cglobal to 16
    ; (xmm8-xmm15 are callee-saved in the Win64 ABI)
    %assign stack_size_padded 168
    SUB             rsp, stack_size_padded
    WIN64_PUSH_XMM   16, 8
%endif
    DEFINE_ARGS buf, fg_data, h, x
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]  ; cf0-7
    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]  ; cf8-11
    vpbroadcastd   xm10, [base+round_vals-14+r6*2]
    movd           xm11, [base+byte_blend+1]
    pmovsxbw        xm9, xm9
    pshufd          xm4, xm7, q0000
    mova           xm12, [base+gen_shufA]
    pshufd          xm5, xm7, q3333
    mova           xm13, [base+gen_shufB]
    pshufd          xm6, xm7, q1111
    mova           xm14, [base+gen_shufC]
    pshufd          xm7, xm7, q2222
    mova           xm15, [base+gen_shufD]
    pshufd          xm8, xm9, q0000
    psrld          xm10, 16
    pshufd          xm9, xm9, q1111
    sub            bufq, 82*73-(82*3+79)     ; skip 3 top border rows + left border
    mov              hd, 70
.y_loop_ar2:
    mov              xq, -76
.x_loop_ar2:
    ; AR(2): gather the two rows above and pair neighboring taps via the
    ; gen_shufA-D masks so each pmaddwd combines two coefficients
    pmovsxbw        xm0, [bufq+xq-82*2-2]    ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]    ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, xm12
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, xm13
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, xm14
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, xm15
    pmaddwd         xm1, xm8
    paddd           xm2, xm10                ; add rounding constant
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0                 ; xm2 = per-pixel sum of all top-row taps
    movq            xm0, [bufq+xq-2]         ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; scalar-ish recurrence over 4 pixels: current row's two left taps (xm9)
    ; feed back into the next pixel via the blend/shift at the bottom
    pmovsxbw        xm1, xm0
    pmaddwd         xm3, xm9, xm1
    psrldq          xm1, 4                   ; y=0,x=0
    paddd           xm3, xm2
    psrldq          xm2, 4                   ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw           xm3, xm1
    packsswb        xm3, xm3
    pextrb    [bufq+xq], xm3, 0
    pslldq          xm3, 2
    vpblendvb       xm0, xm3, xm11           ; splice new pixel back in as "left"
    psrldq          xm0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar2
    RET

INIT_YMM avx2
.ar3:
%if WIN64
    ; 14 stack slots on Win64 (4 of them consumed by the xmm8-xmm11 spill),
    ; 12 on SysV where no XMM save is needed
    ALLOC_STACK      16*14
    %assign stack_size stack_size - 16*4
    WIN64_PUSH_XMM   12, 8
%else
    ALLOC_STACK      16*12
%endif
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    movd           xm11, [base+byte_blend]
    pmovsxbw         m1, [fg_dataq+FGData.ar_coeffs_y+ 0]  ; cf0-15
    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]  ; cf16-23
    ; spill the broadcast coefficient pairs to the stack; too many live
    ; values to keep all 24 coefficients in registers
    pshufd           m0, m1, q0000
    mova  [rsp+16* 0], m0
    pshufd           m0, m1, q1111
    mova  [rsp+16* 2], m0
    pshufd           m0, m1, q2222
    mova  [rsp+16* 4], m0
    pshufd           m1, m1, q3333
    mova  [rsp+16* 6], m1
    pshufd          xm0, xm2, q0000
    mova  [rsp+16* 8], xm0
    pshufd          xm0, xm2, q1111
    mova  [rsp+16* 9], xm0
    psrldq          xm7, xm2, 10
    mova             m8, [base+gen_shufA]
    pinsrw          xm2, [base+pw_1], 5
    mova             m9, [base+gen_shufC]
    pshufd          xm2, xm2, q2222
    movu            m10, [base+gen_shufE]
    vpbroadcastw    xm6, [base+round_vals-12+r6*2]
    pinsrw          xm7, [base+round_vals+r6*2-10], 3
    mova  [rsp+16*10], xm2
    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 82*73-(82*3+79)     ; skip 3 top border rows + left border
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76
.x_loop_ar3:
    ; AR(3): three rows above, 7 taps each, combined pairwise through the
    ; shuffle masks loaded above plus the stacked coefficient pairs
    movu            xm5, [bufq+xq-82*3-3]    ; y=-3,x=[-3,+12]
    vinserti128      m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
    movu            xm4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    punpcklbw        m3, m5, m5
    punpckhwd        m5, m4
    psraw            m3, 8                   ; sign-extend bytes to words
    punpcklbw        m5, m5
    psraw            m5, 8
    punpcklbw       xm4, xm4
    psraw           xm4, 8
    pshufb           m0, m3, m8
    pmaddwd          m0, [rsp+16*0]
    pshufb           m1, m3, m9
    pmaddwd          m1, [rsp+16*2]
    shufps           m2, m3, m5, q1032
    paddd            m0, m1
    pshufb           m1, m2, m8
    vperm2i128       m3, m4, 0x21
    pmaddwd          m1, [rsp+16*4]
    shufps          xm2, xm3, q1021
    vpblendd         m2, m3, 0xf0
    pshufb           m2, m10
    paddd            m0, m1
    pmaddwd          m2, [rsp+16*6]
    pshufb          xm1, xm4, xm9
    pmaddwd         xm1, [rsp+16*8]
    shufps          xm4, xm5, q1132
    paddd            m0, m2
    pshufb          xm2, xm4, xm8
    pshufd          xm4, xm4, q2121
    pmaddwd         xm2, [rsp+16*9]
    punpcklwd       xm4, xm6
    pmaddwd         xm4, [rsp+16*10]
    vextracti128    xm3, m0, 1
    paddd           xm0, xm1
    movq            xm1, [bufq+xq-3]         ; y=0,x=[-3,+4]
    paddd           xm2, xm4
    paddd           xm0, xm2
    paddd           xm0, xm3                 ; xm0 = top-taps sum for 4 pixels
.x_loop_ar3_inner:
    ; current-row taps (xm7) feed back pixel-by-pixel, like .ar2 above
    pmovsxbw        xm2, xm1
    pmaddwd         xm2, xm13
    pshufd          xm3, xm2, q1111
    paddd           xm2, xm0                 ; add top
    paddd           xm2, xm3                 ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb        xm2, xm2
    pextrb    [bufq+xq], xm2, 0
    pslldq          xm2, 3
    vpblendvb       xm1, xm2, xm11
    psrldq          xm1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar3
    RET

; Chroma grain generation, parameterized over subsampling:
;   %1 = name suffix (420/422/444), %2 = ss_x, %3 = ss_y
%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
%define base r4-generate_grain_uv_%1_8bpc_avx2_table
    lea              r4, [generate_grain_uv_%1_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mova            xm6, [base+pb_mask]
    vpbroadcastw    xm7, [base+round+r6*2]
vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] 395 pxor xm0, xm2 396 lea r6, [gaussian_sequence] 397%if %2 398 mov r7d, 73-35*%3 399 add bufq, 44 400.loop_y: 401 mov r5, -44 402%else 403 mov r5, -73*82 404 sub bufq, r5 405%endif 406.loop: 407 pand xm2, xm0, xm1 408 psrlw xm3, xm2, 10 409 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 410 pmullw xm2, xm4 ; bits 0x0f00 are set 411 pmulhuw xm0, xm5 412 pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds 413 psllq xm2, xm3, 30 414 por xm2, xm3 415 psllq xm3, xm2, 15 416 por xm2, xm0 ; aggregate each bit into next seed's high bit 417 por xm2, xm3 ; 4 next output seeds 418 pshuflw xm0, xm2, q3333 419 psrlw xm2, 5 420 movq r8, xm2 421 movzx r9d, r8w 422 movd xm2, [r6+r9*2] 423 rorx r9, r8, 32 424 shr r8d, 16 425 pinsrw xm2, [r6+r8*2], 1 426 movzx r8d, r9w 427 pinsrw xm2, [r6+r8*2], 2 428 shr r9d, 16 429 pinsrw xm2, [r6+r9*2], 3 430 pmulhrsw xm2, xm7 431 packsswb xm2, xm2 432 movd [bufq+r5], xm2 433 add r5, 4 434 jl .loop 435%if %2 436 add bufq, 82 437 dec r7d 438 jg .loop_y 439%endif 440 441 ; auto-regression code 442 movsxd r6, [fg_dataq+FGData.ar_coeff_lag] 443 movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4] 444 add r6, r4 445 jmp r6 446 447INIT_YMM avx2 448.ar0: 449 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 450 imul uvd, 28 451 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 452 movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq] 453 movd xm3, [base+hmul_bits+shiftq*2] 454 DEFINE_ARGS buf, bufy, h 455 pmovsxbw xm2, xm2 456%if %2 457 vpbroadcastd m7, [base+pb_1] 458 vpbroadcastw m6, [base+hmul_bits+2+%3*2] 459%endif 460 vpbroadcastw m2, xm2 461 vpbroadcastw m3, xm3 462 pxor m12, m12 463%if %2 464 sub bufq, 82*(73-35*%3)+82-(82*3+41) 465%else 466 sub bufq, 82*70-3 467%endif 468 add bufyq, 3+82*3 469 mov hd, 70-35*%3 470.y_loop_ar0: 471%if %2 472 ; first 32 pixels 473 movu xm4, [bufyq] 474 vinserti128 m4, [bufyq+32], 1 475%if %3 476 movu xm0, [bufyq+82] 477 vinserti128 m0, [bufyq+82+32], 1 478%endif 479 movu 
xm5, [bufyq+16] 480 vinserti128 m5, [bufyq+48], 1 481%if %3 482 movu xm1, [bufyq+82+16] 483 vinserti128 m1, [bufyq+82+48], 1 484%endif 485 pmaddubsw m4, m7, m4 486%if %3 487 pmaddubsw m0, m7, m0 488%endif 489 pmaddubsw m5, m7, m5 490%if %3 491 pmaddubsw m1, m7, m1 492 paddw m4, m0 493 paddw m5, m1 494%endif 495 pmulhrsw m4, m6 496 pmulhrsw m5, m6 497%else 498 xor r3d, r3d 499 ; first 32x2 pixels 500.x_loop_ar0: 501 movu m4, [bufyq+r3] 502 pcmpgtb m0, m12, m4 503 punpckhbw m5, m4, m0 504 punpcklbw m4, m0 505%endif 506 pmullw m4, m2 507 pmullw m5, m2 508 pmulhrsw m4, m3 509 pmulhrsw m5, m3 510%if %2 511 movu m1, [bufq] 512%else 513 movu m1, [bufq+r3] 514%endif 515 pcmpgtb m8, m12, m1 516 punpcklbw m0, m1, m8 517 punpckhbw m1, m8 518 paddw m0, m4 519 paddw m1, m5 520 packsswb m0, m1 521%if %2 522 movu [bufq], m0 523%else 524 movu [bufq+r3], m0 525 add r3d, 32 526 cmp r3d, 64 527 jl .x_loop_ar0 528%endif 529 530 ; last 6/12 pixels 531 movu xm4, [bufyq+32*2] 532%if %2 533%if %3 534 movu xm5, [bufyq+32*2+82] 535%endif 536 pmaddubsw xm4, xm7, xm4 537%if %3 538 pmaddubsw xm5, xm7, xm5 539 paddw xm4, xm5 540%endif 541 movq xm0, [bufq+32] 542 pmulhrsw xm4, xm6 543 pmullw xm4, xm2 544 pmulhrsw xm4, xm3 545 pcmpgtb xm5, xm12, xm0 546 punpcklbw xm5, xm0, xm5 547 paddw xm4, xm5 548 packsswb xm4, xm4 549 pblendw xm0, xm4, xm0, 1000b 550 movq [bufq+32], xm0 551%else 552 movu xm0, [bufq+64] 553 pcmpgtb xm1, xm12, xm4 554 punpckhbw xm5, xm4, xm1 555 punpcklbw xm4, xm1 556 pmullw xm5, xm2 557 pmullw xm4, xm2 558 vpblendd xm1, xm3, xm12, 0x0c 559 pmulhrsw xm5, xm1 560 pmulhrsw xm4, xm3 561 pcmpgtb xm1, xm12, xm0 562 punpckhbw xm8, xm0, xm1 563 punpcklbw xm0, xm1 564 paddw xm5, xm8 565 paddw xm0, xm4 566 packsswb xm0, xm5 567 movu [bufq+64], xm0 568%endif 569 add bufq, 82 570 add bufyq, 82<<%3 571 dec hd 572 jg .y_loop_ar0 573 RET 574 575INIT_XMM avx2 576.ar1: 577 DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift 578 imul uvd, 28 579 mov shiftd, 
[fg_dataq+FGData.ar_coeff_shift] 580 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 581 movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 582 pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 583 DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift 584 pmovsxbw xm4, xm4 585 pshufd xm5, xm4, q1111 586 pshufd xm4, xm4, q0000 587 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd 588%if %2 589 vpbroadcastd xm7, [base+pb_1] 590 vpbroadcastw xm6, [base+hmul_bits+2+%3*2] 591%endif 592 vpbroadcastd xm3, xm3 593%if %2 594 sub bufq, 82*(73-35*%3)+44-(82*3+41) 595%else 596 sub bufq, 82*70-(82-3) 597%endif 598 add bufyq, 79+82*3 599 mov hd, 70-35*%3 600 mov mind, -128 601 mov maxd, 127 602.y_loop_ar1: 603 mov xq, -(76>>%2) 604 movsx val3d, byte [bufq+xq-1] 605.x_loop_ar1: 606 pmovsxbw xm0, [bufq+xq-82-1] ; top/left 607%if %2 608 movq xm8, [bufyq+xq*2] 609%if %3 610 movq xm9, [bufyq+xq*2+82] 611%endif 612%endif 613 psrldq xm2, xm0, 2 ; top 614 psrldq xm1, xm0, 4 ; top/right 615%if %2 616 pmaddubsw xm8, xm7, xm8 617%if %3 618 pmaddubsw xm9, xm7, xm9 619 paddw xm8, xm9 620%endif 621 pmulhrsw xm8, xm6 622%else 623 pmovsxbw xm8, [bufyq+xq] 624%endif 625 punpcklwd xm0, xm2 626 punpcklwd xm1, xm8 627 pmaddwd xm0, xm4 628 pmaddwd xm1, xm5 629 paddd xm0, xm1 630 paddd xm0, xm3 631.x_loop_ar1_inner: 632 movd val0d, xm0 633 psrldq xm0, 4 634 imul val3d, cf3d 635 add val3d, val0d 636 sarx val3d, val3d, shiftd 637 movsx val0d, byte [bufq+xq] 638 add val3d, val0d 639 cmp val3d, maxd 640 cmovns val3d, maxd 641 cmp val3d, mind 642 cmovs val3d, mind 643 mov byte [bufq+xq], val3b 644 ; keep val3d in-place as left for next x iteration 645 inc xq 646 jz .x_loop_ar1_end 647 test xq, 3 648 jnz .x_loop_ar1_inner 649 jmp .x_loop_ar1 650 651.x_loop_ar1_end: 652 add bufq, 82 653 add bufyq, 82<<%3 654 dec hd 655 jg .y_loop_ar1 656 RET 657 658.ar2: 659 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 660 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 661 imul uvd, 28 662 vpbroadcastw xm13, 
[base+round_vals-12+shiftq*2] 663 pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 664 pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 665 pinsrw xm0, [base+pw_1], 5 666%if %2 667 vpbroadcastw xm12, [base+hmul_bits+2+%3*2] 668 vpbroadcastd xm11, [base+pb_1] 669%endif 670 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 671 pshufd xm4, xm7, q0000 672 pshufd xm5, xm7, q3333 673 pshufd xm6, xm7, q1111 674 pshufd xm7, xm7, q2222 675 pshufd xm8, xm0, q0000 676 pshufd xm9, xm0, q1111 677 pshufd xm10, xm0, q2222 678%if %2 679 sub bufq, 82*(73-35*%3)+44-(82*3+41) 680%else 681 sub bufq, 82*70-(82-3) 682%endif 683 add bufyq, 79+82*3 684 mov hd, 70-35*%3 685.y_loop_ar2: 686 mov xq, -(76>>%2) 687 688.x_loop_ar2: 689 pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 690 pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 691 pshufb xm2, xm0, [base+gen_shufA] 692 pmaddwd xm2, xm4 693 pshufb xm3, xm1, [base+gen_shufB] 694 pmaddwd xm3, xm5 695 paddd xm2, xm3 696 pshufb xm3, xm0, [base+gen_shufC] 697 pmaddwd xm3, xm6 698 punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5] 699 punpcklwd xm0, xm1 700 pmaddwd xm0, xm7 701 pshufb xm1, [gen_shufD] 702 pmaddwd xm1, xm8 703 paddd xm2, xm3 704 paddd xm0, xm1 705 paddd xm2, xm0 706 707%if %2 708 movq xm0, [bufyq+xq*2] 709%if %3 710 movq xm3, [bufyq+xq*2+82] 711%endif 712 pmaddubsw xm0, xm11, xm0 713%if %3 714 pmaddubsw xm3, xm11, xm3 715 paddw xm0, xm3 716%endif 717 pmulhrsw xm0, xm12 718%else 719 pmovsxbw xm0, [bufyq+xq] 720%endif 721 punpcklwd xm0, xm13 722 pmaddwd xm0, xm10 723 paddd xm2, xm0 724 725 movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] 726.x_loop_ar2_inner: 727 pmovsxbw xm0, xm0 728 pmaddwd xm3, xm0, xm9 729 psrldq xm0, 2 730 paddd xm3, xm2 731 psrldq xm2, 4 ; shift top to next pixel 732 psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 733 pslldq xm3, 2 734 paddw xm3, xm0 735 pblendw xm0, xm3, 00000010b 736 packsswb xm0, xm0 737 pextrb [bufq+xq], xm0, 1 738 inc xq 739 jz .x_loop_ar2_end 740 test xb, 3 741 jnz .x_loop_ar2_inner 742 jmp 
.x_loop_ar2 743 744.x_loop_ar2_end: 745 add bufq, 82 746 add bufyq, 82<<%3 747 dec hd 748 jg .y_loop_ar2 749 RET 750 751INIT_YMM avx2 752.ar3: 753 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 754 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 755 imul uvd, 28 756 pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 757 pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 758 vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] 759 movd xm13, [base+round_vals-10+shiftq*2] 760 vpbroadcastd xm14, [base+round_vals-14+shiftq*2] 761 pshufd m6, m0, q0000 762 pshufd m7, m0, q1111 763 pshufd m8, m0, q2222 764 pshufd m9, m0, q3333 765 pshufd xm10, xm1, q0000 766 pshufd xm11, xm1, q1111 767 pshufhw xm12, xm1, q0000 768 psraw xm2, 8 769 palignr xm13, xm1, 10 770 punpckhwd xm12, xm2 ; interleave luma cf 771 psrld xm14, 16 772 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 773%if %2 774 vpbroadcastw xm15, [base+hmul_bits+2+%3*2] 775 sub bufq, 82*(73-35*%3)+44-(82*3+41) 776%else 777 sub bufq, 82*70-(82-3) 778%endif 779 add bufyq, 79+82*3 780 mov hd, 70-35*%3 781.y_loop_ar3: 782 mov xq, -(76>>%2) 783.x_loop_ar3: 784 vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12 785 palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12] 786 vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 787 vpblendd m3, m1, 0x0f 788 pxor m0, m0 789 pcmpgtb m2, m0, m3 790 pcmpgtb m0, m4 791 punpcklbw m1, m3, m2 792 punpckhbw m3, m2 793 punpcklbw m2, m4, m0 794 punpckhbw xm4, xm0 795 pshufb m0, m1, [base+gen_shufA] 796 pmaddwd m0, m6 797 pshufb m5, m1, [base+gen_shufC] 798 pmaddwd m5, m7 799 shufps m1, m3, q1032 800 paddd m0, m5 801 pshufb m5, m1, [base+gen_shufA] 802 pmaddwd m5, m8 803 shufps xm1, xm3, q2121 804 vpblendd m1, m2, 0xf0 805 pshufb m1, [base+gen_shufE] 806 pmaddwd m1, m9 807 paddd m0, m5 808 pshufb xm3, xm2, [base+gen_shufC] 809 paddd m0, m1 810 pmaddwd xm3, xm10 811 palignr xm1, xm4, xm2, 2 812 punpckhwd xm1, xm2, xm1 813 pmaddwd xm1, xm11 814 palignr 
xm4, xm2, 12 815 paddd xm3, xm1 816%if %2 817 vpbroadcastd xm5, [base+pb_1] 818 movq xm1, [bufyq+xq*2] 819 pmaddubsw xm1, xm5, xm1 820%if %3 821 movq xm2, [bufyq+xq*2+82] 822 pmaddubsw xm5, xm2 823 paddw xm1, xm5 824%endif 825 pmulhrsw xm1, xm15 826%else 827 pmovsxbw xm1, [bufyq+xq] 828%endif 829 punpcklwd xm4, xm1 830 pmaddwd xm4, xm12 831 movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] 832 vextracti128 xm2, m0, 1 833 paddd xm0, xm14 834 paddd xm3, xm4 835 paddd xm0, xm3 836 paddd xm0, xm2 837.x_loop_ar3_inner: 838 pmovsxbw xm1, xm1 839 pmaddwd xm2, xm13, xm1 840 pshuflw xm3, xm2, q1032 841 paddd xm2, xm0 ; add top 842 paddd xm2, xm3 ; left+cur 843 psrldq xm0, 4 844 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 845 psrldq xm1, 2 846 ; don't packssdw, we only care about one value 847 punpckldq xm2, xm2 848 pblendw xm1, xm2, 0100b 849 packsswb xm1, xm1 850 pextrb [bufq+xq], xm1, 2 851 inc xq 852 jz .x_loop_ar3_end 853 test xb, 3 854 jnz .x_loop_ar3_inner 855 jmp .x_loop_ar3 856.x_loop_ar3_end: 857 add bufq, 82 858 add bufyq, 82<<%3 859 dec hd 860 jg .y_loop_ar3 861 RET 862%endmacro 863 864INIT_YMM avx2 865cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ 866 grain_lut, h, sby, see, overlap 867%define base r9-pd_m65536 868 lea r9, [pd_m65536] 869 mov r6d, [fg_dataq+FGData.scaling_shift] 870 mov r7d, [fg_dataq+FGData.clip_to_restricted_range] 871 mov sbyd, sbym 872 mov overlapd, [fg_dataq+FGData.overlap_flag] 873 vpbroadcastd m8, [base+pd_m65536] 874 vpbroadcastw m9, [base+mul_bits+r6*2-14] 875 vpbroadcastd m10, [base+fg_min+r7*4] 876 vpbroadcastd m11, [base+fg_max+r7*8] 877 vpbroadcastd m12, [base+pw_1024] 878 movq xm13, [base+pb_27_17_17_27] 879 test sbyd, sbyd 880 setnz r7b 881 pxor m7, m7 882 test r7b, overlapb 883 jnz .vertical_overlap 884 885 imul seed, sbyd, (173 << 24) | 37 886 add seed, (105 << 24) | 178 887 rorx seed, seed, 24 888 movzx seed, seew 889 xor seed, [fg_dataq+FGData.seed] 890 891 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, 
grain_lut, \ 892 offx, offy, see, overlap 893 894 lea src_bakq, [srcq+wq] 895 neg wq 896 sub dstq, srcq 897 898.loop_x: 899 rorx r6, seeq, 1 900 or seed, 0xEFF4 901 test seeb, seeh 902 lea seed, [r6+0x8000] 903 cmovp seed, r6d ; updated seed 904 905 rorx offyd, seed, 8 906 rorx offxq, seeq, 12 907 and offyd, 0xf 908 imul offyd, 164 909 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx 910 911 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 912 h, offxy, see, overlap 913 914 mov hd, hm 915 mov grain_lutq, grain_lutmp 916.loop_y: 917 ; src 918 mova m2, [srcq] 919 punpcklbw m0, m2, m7 920 punpckhbw m1, m2, m7 921 922 ; scaling[src] 923 pandn m4, m8, m0 924 mova m6, m8 925 vpgatherdd m2, [scalingq+m4-0], m8 926 psrld m3, m0, 16 927 mova m8, m6 928 vpgatherdd m4, [scalingq+m3-2], m6 929 pandn m5, m8, m1 930 mova m6, m8 931 vpgatherdd m3, [scalingq+m5-0], m8 932 pblendw m2, m4, 0xaa 933 psrld m4, m1, 16 934 mova m8, m6 935 vpgatherdd m5, [scalingq+m4-2], m6 936 pblendw m3, m5, 0xaa 937 938 ; grain = grain_lut[offy+y][offx+x] 939 movu m5, [grain_lutq+offxyq] 940 punpcklbw m4, m5, m7 941 punpckhbw m5, m7 942 943 ; noise = round2(scaling[src] * grain, scaling_shift) 944 pmaddubsw m2, m4 945 pmaddubsw m3, m5 946 pmulhrsw m2, m9 947 pmulhrsw m3, m9 948 949 ; dst = clip_pixel(src, noise) 950 paddw m0, m2 951 paddw m1, m3 952 packuswb m0, m1 953 pmaxub m0, m10 954 pminub m0, m11 955 mova [dstq+srcq], m0 956 957 add srcq, strideq 958 add grain_lutq, 82 959 dec hd 960 jg .loop_y 961 962 add wq, 32 963 jge .end 964 lea srcq, [src_bakq+wq] 965 test overlapd, overlapd 966 jz .loop_x 967 968 ; r8m = sbym 969 cmp dword r8m, 0 970 jne .loop_x_hv_overlap 971 972 ; horizontal overlap (without vertical overlap) 973.loop_x_h_overlap: 974 rorx r6, seeq, 1 975 or seed, 0xEFF4 976 test seeb, seeh 977 lea seed, [r6+0x8000] 978 cmovp seed, r6d ; updated seed 979 980 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 981 offx, offy, see, left_offxy 982 983 lea 
left_offxyd, [offyq+32] ; previous column's offy*stride+offx 984 rorx offyd, seed, 8 985 rorx offxq, seeq, 12 986 and offyd, 0xf 987 imul offyd, 164 988 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx 989 990 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 991 h, offxy, see, left_offxy 992 993 mov grain_lutq, grain_lutmp 994 mov hd, hm 995.loop_y_h_overlap: 996 ; src 997 mova m2, [srcq] 998 punpcklbw m0, m2, m7 999 punpckhbw m1, m2, m7 1000 1001 ; scaling[src] 1002 pandn m4, m8, m0 1003 mova m6, m8 1004 vpgatherdd m2, [scalingq+m4-0], m8 1005 psrld m3, m0, 16 1006 mova m8, m6 1007 vpgatherdd m4, [scalingq+m3-2], m6 1008 pandn m5, m8, m1 1009 mova m6, m8 1010 vpgatherdd m3, [scalingq+m5-0], m8 1011 pblendw m2, m4, 0xaa 1012 psrld m4, m1, 16 1013 mova m8, m6 1014 vpgatherdd m5, [scalingq+m4-2], m6 1015 pblendw m3, m5, 0xaa 1016 1017 ; grain = grain_lut[offy+y][offx+x] 1018 movu m5, [grain_lutq+offxyq] 1019 movd xm4, [grain_lutq+left_offxyq] 1020 punpcklbw xm4, xm5 1021 pmaddubsw xm4, xm13, xm4 1022 pmulhrsw xm4, xm12 1023 packsswb xm4, xm4 1024 vpblendd m4, m5, 0xfe 1025 punpckhbw m5, m7 1026 punpcklbw m4, m7 1027 1028 ; noise = round2(scaling[src] * grain, scaling_shift) 1029 pmaddubsw m2, m4 1030 pmaddubsw m3, m5 1031 pmulhrsw m2, m9 1032 pmulhrsw m3, m9 1033 1034 ; dst = clip_pixel(src, noise) 1035 paddw m0, m2 1036 paddw m1, m3 1037 packuswb m0, m1 1038 pmaxub m0, m10 1039 pminub m0, m11 1040 mova [dstq+srcq], m0 1041 1042 add srcq, strideq 1043 add grain_lutq, 82 1044 dec hd 1045 jg .loop_y_h_overlap 1046 1047 add wq, 32 1048 jge .end 1049 lea srcq, [src_bakq+wq] 1050 1051 ; r8m = sbym 1052 cmp dword r8m, 0 1053 jne .loop_x_hv_overlap 1054 jmp .loop_x_h_overlap 1055 1056.vertical_overlap: 1057 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1058 unused, sby, see, overlap 1059 1060 movzx sbyd, sbyb 1061 imul seed, [fg_dataq+FGData.seed], 0x00010001 1062 imul r7d, sbyd, 173 * 0x00010001 1063 imul sbyd, 37 * 0x01000100 1064 add 
r7d, (105 << 16) | 188 1065 add sbyd, (178 << 24) | (141 << 8) 1066 and r7d, 0x00ff00ff 1067 and sbyd, 0xff00ff00 1068 xor seed, r7d 1069 xor seed, sbyd ; (cur_seed << 16) | top_seed 1070 1071 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1072 offx, offy, see, overlap 1073 1074 lea src_bakq, [srcq+wq] 1075 neg wq 1076 sub dstq, srcq 1077 1078.loop_x_v_overlap: 1079 vpbroadcastd m14, [pb_27_17] 1080 1081 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1082 mov r6d, seed 1083 or seed, 0xeff4eff4 1084 test seeb, seeh 1085 setp r7b ; parity of top_seed 1086 shr seed, 16 1087 shl r7d, 16 1088 test seeb, seeh 1089 setp r7b ; parity of cur_seed 1090 or r6d, 0x00010001 1091 xor r7d, r6d 1092 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1093 1094 rorx offyd, seed, 8 1095 rorx offxd, seed, 12 1096 and offyd, 0xf000f 1097 and offxd, 0xf000f 1098 imul offyd, 164 1099 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1100 lea offyd, [offyq+offxq*2+0x10001*747+32*82] 1101 1102 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1103 h, offxy, see, overlap, top_offxy 1104 1105 mov grain_lutq, grain_lutmp 1106 mov hd, hm 1107 movzx top_offxyd, offxyw 1108 shr offxyd, 16 1109.loop_y_v_overlap: 1110 ; src 1111 mova m2, [srcq] 1112 punpcklbw m0, m2, m7 1113 punpckhbw m1, m2, m7 1114 1115 ; scaling[src] 1116 pandn m4, m8, m0 1117 mova m6, m8 1118 vpgatherdd m2, [scalingq+m4-0], m8 1119 psrld m3, m0, 16 1120 mova m8, m6 1121 vpgatherdd m4, [scalingq+m3-2], m6 1122 pandn m5, m8, m1 1123 mova m6, m8 1124 vpgatherdd m3, [scalingq+m5-0], m8 1125 pblendw m2, m4, 0xaa 1126 psrld m4, m1, 16 1127 mova m8, m6 1128 vpgatherdd m5, [scalingq+m4-2], m6 1129 pblendw m3, m5, 0xaa 1130 1131 ; grain = grain_lut[offy+y][offx+x] 1132 movu m6, [grain_lutq+offxyq] 1133 movu m4, [grain_lutq+top_offxyq] 1134 punpcklbw m5, m4, m6 1135 punpckhbw m4, m6 1136 pmaddubsw m5, m14, m5 1137 pmaddubsw m4, m14, m4 1138 pmulhrsw m5, m12 1139 pmulhrsw m4, 
m12 1140 packsswb m5, m4 1141 punpcklbw m4, m5, m7 1142 punpckhbw m5, m7 1143 1144 ; noise = round2(scaling[src] * grain, scaling_shift) 1145 pmaddubsw m2, m4 1146 pmaddubsw m3, m5 1147 pmulhrsw m2, m9 1148 pmulhrsw m3, m9 1149 1150 ; dst = clip_pixel(src, noise) 1151 paddw m0, m2 1152 paddw m1, m3 1153 packuswb m0, m1 1154 pmaxub m0, m10 1155 pminub m0, m11 1156 mova [dstq+srcq], m0 1157 1158 add srcq, strideq 1159 add grain_lutq, 82 1160 dec hb 1161 jz .end_y_v_overlap 1162 vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line 1163 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1164 ; remaining (up to) 30 lines 1165 add hd, 0x80000000 1166 jnc .loop_y_v_overlap 1167 jmp .loop_y 1168.end_y_v_overlap: 1169 add wq, 32 1170 jge .end 1171 lea srcq, [src_bakq+wq] 1172 1173 ; since fg_dataq.overlap is guaranteed to be set, we never jump 1174 ; back to .loop_x_v_overlap, and instead always fall-through to 1175 ; h+v overlap 1176.loop_x_hv_overlap: 1177 vpbroadcastd m14, [pb_27_17] 1178 1179 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1180 mov r6d, seed 1181 or seed, 0xeff4eff4 1182 test seeb, seeh 1183 setp r7b ; parity of top_seed 1184 shr seed, 16 1185 shl r7d, 16 1186 test seeb, seeh 1187 setp r7b ; parity of cur_seed 1188 or r6d, 0x00010001 1189 xor r7d, r6d 1190 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1191 1192 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1193 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1194 1195 lea topleft_offxyd, [top_offxyq+32] 1196 lea left_offxyd, [offyq+32] 1197 rorx offyd, seed, 8 1198 rorx offxd, seed, 12 1199 and offyd, 0xf000f 1200 and offxd, 0xf000f 1201 imul offyd, 164 1202 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1203 lea offyd, [offyq+offxq*2+0x10001*747+32*82] 1204 1205 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1206 h, offxy, see, left_offxy, top_offxy, topleft_offxy 1207 1208 mov 
grain_lutq, grain_lutmp 1209 mov hd, hm 1210 movzx top_offxyd, offxyw 1211 shr offxyd, 16 1212.loop_y_hv_overlap: 1213 ; src 1214 mova m2, [srcq] 1215 punpcklbw m0, m2, m7 1216 punpckhbw m1, m2, m7 1217 1218 ; scaling[src] 1219 pandn m4, m8, m0 1220 mova m6, m8 1221 vpgatherdd m2, [scalingq+m4-0], m8 1222 psrld m3, m0, 16 1223 mova m8, m6 1224 vpgatherdd m4, [scalingq+m3-2], m6 1225 pandn m5, m8, m1 1226 mova m6, m8 1227 vpgatherdd m3, [scalingq+m5-0], m8 1228 pblendw m2, m4, 0xaa 1229 psrld m4, m1, 16 1230 mova m8, m6 1231 vpgatherdd m5, [scalingq+m4-2], m6 1232 pblendw m3, m5, 0xaa 1233 1234 ; grain = grain_lut[offy+y][offx+x] 1235 movu m6, [grain_lutq+offxyq] 1236 movd xm7, [grain_lutq+left_offxyq] 1237 movu m4, [grain_lutq+top_offxyq] 1238 movd xm5, [grain_lutq+topleft_offxyq] 1239 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1240 punpcklbw xm7, xm6 1241 punpcklbw xm5, xm4 1242 pmaddubsw xm7, xm13, xm7 1243 pmaddubsw xm5, xm13, xm5 1244 pmulhrsw xm7, xm12 1245 pmulhrsw xm5, xm12 1246 packsswb xm7, xm7 1247 packsswb xm5, xm5 1248 vpblendd m7, m6, 0xfe 1249 vpblendd m5, m4, 0xfe 1250 ; followed by v interpolation (top | cur -> cur) 1251 punpckhbw m4, m6 1252 punpcklbw m5, m7 1253 pmaddubsw m4, m14, m4 1254 pmaddubsw m5, m14, m5 1255 pmulhrsw m4, m12 1256 pmulhrsw m5, m12 1257 pxor m7, m7 1258 packsswb m5, m4 1259 punpcklbw m4, m5, m7 1260 punpckhbw m5, m7 1261 1262 ; noise = round2(scaling[src] * grain, scaling_shift) 1263 pmaddubsw m2, m4 1264 pmaddubsw m3, m5 1265 pmulhrsw m2, m9 1266 pmulhrsw m3, m9 1267 1268 ; dst = clip_pixel(src, noise) 1269 paddw m0, m2 1270 paddw m1, m3 1271 packuswb m0, m1 1272 pmaxub m0, m10 1273 pminub m0, m11 1274 mova [dstq+srcq], m0 1275 1276 add srcq, strideq 1277 add grain_lutq, 82 1278 dec hb 1279 jz .end_y_hv_overlap 1280 vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line 1281 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1282 ; remaining (up to) 30 lines 
    ; tail of the luma h+v-overlap row loop: bit 31 of hd counts the two
    ; v-overlap rows; once it carries we fall back to h-overlap-only rows
    add             hd, 0x80000000
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add             wq, 32
    lea             srcq, [src_bakq+wq]
    jl .loop_x_hv_overlap
.end:
    RET

;---------------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Emits fguv_32x32xn_i<name>_8bpc: apply film grain to a chroma plane in
; 32-pixel-wide (luma-space) column strips. %2/%3 select horizontal/vertical
; chroma subsampling. Stack arguments read below: r8m = sby, r9mp = luma
; plane pointer, r10mp = luma stride (in addition to the named cglobal args).
; The body is instantiated twice through %%FGUV_32x32xN_LOOP: once for
; independent chroma scaling (not-csfl) and once for
; chroma-scaling-from-luma (.csfl).
;---------------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                          grain_lut, h, sby, luma, overlap, uv_pl, is_id
%define base r11-pd_m65536
    lea             r11, [pd_m65536]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm
    mov             sbyd, sbym
    mov             overlapd, [fg_dataq+FGData.overlap_flag]
    vpbroadcastd    m8, [base+pd_m65536]        ; 0xffff0000 per dword: vpgatherdd mask / low-word selector
    vpbroadcastw    m9, [base+mul_bits+r6*2-14] ; pmulhrsw factor implementing scaling_shift
    vpbroadcastd    m10, [base+fg_min+r7*4]     ; clip minimum: 0 (full) or 16 (restricted)
    shlx            r7d, r7d, r9d               ; restricted index doubled when is_id is set
    vpbroadcastd    m11, [base+fg_max+r7*4]     ; clip maximum: 255 / 240 / 235
    vpbroadcastd    m12, [base+pw_1024]         ; rounding factor for overlap blending
    pxor            m7, m7                      ; m7 = 0 throughout (unpack/avg helper)
    test            sbyd, sbyd
    setnz           r7b                         ; r7b = (sby != 0): vertical overlap only below row 0
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

; not-csfl=%1 selects independent chroma scaling (uv_mult/uv_offset applied);
; ss_hor=%2 / ss_ver=%3 select 420/422/444 addressing and overlap weights
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, sby, see, overlap, uv_pl
%if %1
    ; independent chroma scaling: per-plane luma/chroma multipliers and offset
    mov             r6d, uv_plm
    vpbroadcastd    m0, [base+pw_8]
    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    pshufb          m14, m0 ; uv_luma_mult, uv_mult
%elif %2
    vpbroadcastq    m15, [base+pb_23_22]        ; horizontal overlap weights (subsampled)
%else
    vpbroadcastq   xm15, [base+pb_27_17_17_27]  ; horizontal overlap weights (444)
%endif
%if %3
    vpbroadcastw    m13, [base+pb_23_22]        ; vertical overlap weights (420)
%elif %2
    pshufd          m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
%endif
    test            r7b, overlapb
    jnz %%vertical_overlap

    ; no vertical overlap: derive the row seed from sby and the frame seed
    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, unused5, lstride

    mov             lumaq, r9mp
    lea             r12, [srcq+wq]              ; row-end pointers, re-loaded per column
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]      ; luma advances 2x per column when ss_hor
    mov           r11mp, r12
    mov           r12mp, r13
    mov        lstrideq, r10mp
    neg             wq

%%loop_x:
    ; pseudo-random seed update (parity-feedback shift), as in the AV1 grain PRNG
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    ; grain_lut offsets from seed bits; stride of the grain LUT is 82 bytes
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y:
    ; src (when ss_hor: average luma pairs horizontally via pmaddubsw+pavgw,
    ; processing two chroma rows per iteration)
%if %2
    mova           xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova           xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova           xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0                  ; luma
%endif
    ; scaling input = clip((luma*uv_luma_mult + chroma*uv_mult)>>6 + uv_offset)
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                      ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                      ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src]: vpgatherdd destroys its mask, so m8 is saved in m6
    ; and restored around each gather; even words use offset -0, odd words
    ; (psrld 16) use offset -2 so each dword lands on its scaling byte
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu           xm5, [grain_lutq+offxyq+ 0]
    vinserti128     m5, [grain_lutq+offxyq+82], 1
%else
    movu            m5, [grain_lutq+offxyq]
%endif
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma_source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82<<%2
    sub             hb, 1+%2
    jg %%loop_y

    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq
    test            overlapd, overlapd
    jz %%loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    lea             left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y_h_overlap:
    ; src (same luma downsampling / csfl clip as %%loop_y)
%if %2
    mova           xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova           xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova           xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0                  ; luma
%endif
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                      ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                      ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src] (see gather notes in %%loop_y)
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; blend the first pixels of this column's grain with the previous
    ; column's rightmost grain, then substitute the blended bytes back in
%if %2
    movu           xm5, [grain_lutq+offxyq+ 0]
    vinserti128     m5, [grain_lutq+offxyq+82], 1
    movd           xm4, [grain_lutq+left_offxyq+ 0]
    vinserti128     m4, [grain_lutq+left_offxyq+82], 1
    punpcklbw       m4, m5
%if %1
    vpbroadcastq    m0, [pb_23_22]
    pmaddubsw       m4, m0, m4
%else
    pmaddubsw       m4, m15, m4
%endif
    pmulhrsw        m4, m12
    packsswb        m4, m4
    vpblendd        m4, m5, 0xee
%else
    movu            m5, [grain_lutq+offxyq]
    movd           xm4, [grain_lutq+left_offxyq]
    punpcklbw      xm4, xm5
%if %1
    movq           xm0, [pb_27_17_17_27]
    pmaddubsw      xm4, xm0, xm4
%else
    pmaddubsw      xm4, xm15, xm4
%endif
    pmulhrsw       xm4, xm12
    packsswb       xm4, xm4
    vpblendd        m4, m5, 0xfe
%endif
    punpckhbw       m5, m7
    punpcklbw       m4, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma_source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82*(1+%2)
    sub             hb, 1+%2
    jg %%loop_y_h_overlap

    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, overlap, unused1, unused2, lstride

    ; compute the current and top rows' seeds together as packed 16-bit halves
    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd                  ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused1, unused2, see, overlap, unused3, unused4, lstride

    mov             lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]
    mov           r11mp, r12
    mov           r12mp, r13
    mov        lstrideq, r10mp
    neg             wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                         ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1                ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f              ; both rows' offsets in one register
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
%if %2 == 0
    vpbroadcastd    m13, [pb_27_17]             ; first-line weights; swapped per row below
%endif
%%loop_y_v_overlap:
    ; src (same luma downsampling / csfl clip as %%loop_y)
%if %2
    mova           xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova           xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova           xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0                  ; luma
%endif
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                      ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                      ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src] (see gather notes in %%loop_y)
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
    ; blend this row's grain with the row above using the m13 weights
%if %2
    movu           xm0, [grain_lutq+offxyq]
    vinserti128     m0, [grain_lutq+offxyq+82], 1
    movu           xm4, [grain_lutq+top_offxyq]
    vinserti128     m4, [grain_lutq+top_offxyq+82], 1
%else
    movu            m0, [grain_lutq+offxyq]
    movu            m4, [grain_lutq+top_offxyq]
%endif
    punpcklbw       m5, m4, m0
    punpckhbw       m4, m0
    pmaddubsw       m5, m13, m5
    pmaddubsw       m4, m13, m4
    pmulhrsw        m5, m12
    pmulhrsw        m4, m12
    packsswb        m5, m4
%else
    movq           xm4, [grain_lutq+offxyq]
    vinserti128     m4, [grain_lutq+offxyq+8], 1
    movq           xm5, [grain_lutq+top_offxyq]
    vinserti128     m5, [grain_lutq+top_offxyq+8], 1
    punpcklbw       m5, m4
    pmaddubsw       m5, m13, m5
    pmulhrsw        m5, m12
    vextracti128   xm4, m5, 1
    packsswb       xm5, xm4
    ; only interpolate first line, insert second line unmodified
    vinserti128     m5, [grain_lutq+offxyq+82], 1
%endif
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma_source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

    sub             hb, 1+%2
    jle %%end_y_v_overlap
%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82<<%2
%if %2 == 0
    ; unsubsampled: second row uses swapped weights, then bit 31 of hd
    ; carries and we continue with non-overlap rows
    vpbroadcastd    m13, [pb_17_27]
    add             hd, 0x80000000
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y

%%end_y_v_overlap:
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                         ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1                ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    lea             topleft_offxyd, [top_offxyq+(32>>%2)]
    lea             left_offxyd, [offyq+(32>>%2)]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
%if %2 == 0
    vpbroadcastd    m13, [pb_27_17]
%endif
%%loop_y_hv_overlap:
    ; src (same luma downsampling / csfl clip as %%loop_y)
%if %2
    mova           xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova           xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova           xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0                  ; luma
%endif
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                      ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                      ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src] (see gather notes in %%loop_y)
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu           xm4, [grain_lutq+offxyq]
    vinserti128     m4, [grain_lutq+offxyq+82], 1
    movd           xm0, [grain_lutq+left_offxyq]
    vinserti128     m0, [grain_lutq+left_offxyq+82], 1
    movd           xm6, [grain_lutq+topleft_offxyq]
%if %3
    movq           xm5, [grain_lutq+top_offxyq]
    vinserti128     m5, [grain_lutq+top_offxyq+8], 1
%else
    vinserti128     m6, [grain_lutq+topleft_offxyq+82], 1
    movu           xm5, [grain_lutq+top_offxyq]
    vinserti128     m5, [grain_lutq+top_offxyq+82], 1
%endif

    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m0, m4
%if %3
    punpcklbw      xm6, xm5
%else
    punpcklbw       m6, m5
%endif
    punpcklqdq      m0, m6
%if %1
    vpbroadcastq    m6, [pb_23_22]
    pmaddubsw       m0, m6, m0
%else
    pmaddubsw       m0, m15, m0
%endif
    pmulhrsw        m0, m12
    packsswb        m0, m0
    vpblendd        m4, m0, 0x11
%if %3
    pshuflw        xm0, xm0, q1032
    vpblendd        m5, m0, 0x01
%else
    pshuflw         m0, m0, q1032
    vpblendd        m5, m0, 0x11
%endif
%else
    movu            m4, [grain_lutq+offxyq]
    movd           xm0, [grain_lutq+left_offxyq]
    movu            m5, [grain_lutq+top_offxyq]
    movd           xm6, [grain_lutq+topleft_offxyq]
    punpcklbw      xm0, xm4
    punpcklbw      xm6, xm5
    punpcklqdq     xm0, xm6
%if %1
    vpbroadcastq   xm6, [pb_27_17_17_27]
    pmaddubsw      xm0, xm6, xm0
%else
    pmaddubsw      xm0, xm15, xm0
%endif
    pmulhrsw       xm0, xm12
    packsswb       xm0, xm0
    vpblendd        m4, m0, 0x01
    pshuflw        xm0, xm0, q1032
    vpblendd        m5, m0, 0x01
%endif

    ; followed by v interpolation (top | cur -> cur)
%if %3
    vpermq          m0, m4, q3120
    punpcklbw       m5, m0
    pmaddubsw       m5, m13, m5
    pmulhrsw        m5, m12
    vextracti128   xm0, m5, 1
    packsswb       xm5, xm0
    vpblendd        m5, m4, 0xf0
%else
    punpckhbw       m0, m5, m4
    punpcklbw       m5, m4
    pmaddubsw       m4, m13, m0
    pmaddubsw       m5, m13, m5
    pmulhrsw        m4, m12
    pmulhrsw        m5, m12
    packsswb        m5, m4
%endif
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82<<%2
    sub             hb, 1+%2
%if %2
    ; subsampled: the first two chroma rows were done in one iteration,
    ; so remaining rows only need horizontal overlap
    jg %%loop_y_h_overlap
%else
    je %%end_y_hv_overlap
    vpbroadcastd    m13, [pb_17_27]             ; swap weights for second v-overlap line
    add             hd, 0x80000000
    jnc %%loop_y_hv_overlap
    jmp %%loop_y_h_overlap
%endif

%%end_y_hv_overlap:
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq
    jmp %%loop_x_hv_overlap
%endmacro

    ; instantiate both scaling variants; .csfl = chroma_scaling_from_luma
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

GEN_GRAIN_UV_FN 420, 1, 1
FGUV_FN         420, 1, 1
GEN_GRAIN_UV_FN 422, 1, 0
FGUV_FN         422, 1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN         444, 0, 0

%endif ; ARCH_X86_64