; Copyright © 2021-2022, VideoLAN and dav1d authors
; Copyright © 2021-2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 16

; Read-only constants: shuffle masks, LFSR bit masks, overlap blend weights,
; per-bitdepth grain/pixel clamp values and rounding/multiplier tables.
pb_mask:            db   0, 128, 128,  0, 128,  0,  0, 128, 128,  0,  0, 128,  0, 128, 128,  0
gen_shufA:          db   0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
gen_shufB:          db   4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_27_17_17_27:     dw  27, 17, 17, 27     ; overlap blend weights
pw_23_22:           dw  23, 22, 0, 32
pw_seed_xor:        times 2 dw 0xb524
                    times 2 dw 0x49d8
gen_ar0_shift:      times 4 db 128
                    times 4 db 64
                    times 4 db 32
                    times 4 db 16
pd_16:              dd 16
pd_m65536:          dd -65536
pb_1:               times 4 db 1
grain_max:          times 2 dw  511        ; 10bpc
                    times 2 dw 2047        ; 12bpc
grain_min:          times 2 dw -512
                    times 2 dw -2048
fg_max:             times 2 dw 1023        ; 10bpc, full range
                    times 2 dw 4095        ; 12bpc, full range
                    times 2 dw 960         ; 10bpc, restricted range (luma)
                    times 2 dw 3840
                    times 2 dw 940
                    times 2 dw 3760
fg_min:             times 2 dw 0
                    times 2 dw 64          ; 10bpc, restricted range
                    times 2 dw 256         ; 12bpc, restricted range
uv_offset_mul:      dd 256
                    dd 1024
hmul_bits:          dw 32768, 16384, 8192, 4096
round:              dw 2048, 1024, 512
mul_bits:           dw 256, 128, 64, 32, 16, 8
round_vals:         dw 32, 64, 128, 256, 512, 1024
pb_8_9_0_1:         db 8, 9, 0, 1

; Builds a table of dword offsets (relative to the table itself) to the
; .ar0/.ar1/... labels of the named function, used to dispatch on
; FGData.ar_coeff_lag.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3

SECTION .text

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

;-----------------------------------------------------------------------------
; void generate_grain_y_16bpc(entry *buf, const Dav1dFilmGrainData *fg_data,
;                             int bdmax)
; Fills an 82x73 buffer of 16-bit grain values: first a pseudo-random pass
; driven by a 4-lane LFSR indexing into gaussian_sequence, then an
; auto-regression pass selected via the jump table on FGData.ar_coeff_lag.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
%define base r4-generate_grain_y_16bpc_avx2_table
    lea             r4, [generate_grain_y_16bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    mov             r3, -73*82*2           ; negative byte count of the full buffer
    movsxd          r5, [fg_dataq+FGData.ar_coeff_lag]
    lea             r7d, [bdmaxq+1]
    movq            xm4, [base+mul_bits]
    shr             r7d, 11                ; 0 for 10bpc, 2 for 12bpc
    movq            xm5, [base+hmul_bits]
    sub             r6, r7
    mova            xm6, [base+pb_mask]
    sub             bufq, r3               ; point bufq at the end; index with negative r3
    vpbroadcastw    xm7, [base+round+r6*2-2]
    lea             r6, [gaussian_sequence]
    movsxd          r5, [r4+r5*4]          ; jump-table entry for the AR pass
.loop:
    ; advance the 4-lane LFSR by 2 steps and gather 8 gaussian values,
    ; interleaving the scalar table lookups with the vector seed update
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3               ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4               ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2          ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0               ; aggregate each bit into next seed's high bit
    por             xm3, xm2               ; 4 next output seeds
    pshuflw         xm0, xm3, q3333
    psrlw           xm3, 5
    pand            xm2, xm0, xm1
    movq            r7, xm3
    psrlw           xm3, xm2, 10
    por             xm2, xm3
    pmullw          xm2, xm4
    pmulhuw         xm0, xm5
    movzx           r8d, r7w
    pshufb          xm3, xm6, xm2
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm0, xm2
    movd            xm2, [r6+r8*2]
    rorx            r8, r7, 32
    por             xm3, xm0
    shr             r7d, 16
    pinsrw          xm2, [r6+r7*2], 1
    pshuflw         xm0, xm3, q3333
    movzx           r7d, r8w
    psrlw           xm3, 5
    pinsrw          xm2, [r6+r7*2], 2
    shr             r8d, 16
    movq            r7, xm3
    pinsrw          xm2, [r6+r8*2], 3
    movzx           r8d, r7w
    pinsrw          xm2, [r6+r8*2], 4
    rorx            r8, r7, 32
    shr             r7d, 16
    pinsrw          xm2, [r6+r7*2], 5
    movzx           r7d, r8w
    pinsrw          xm2, [r6+r7*2], 6
    shr             r8d, 16
    pinsrw          xm2, [r6+r8*2], 7
    paddw           xm2, xm2               ; otherwise bpc=12 w/ grain_scale_shift=0
    pmulhrsw        xm2, xm7               ; shifts by 0, which pmulhrsw does not support
    mova            [bufq+r3], xm2
    add             r3, 8*2
    jl .loop

    ; auto-regression code
    add             r5, r4
    jmp             r5

.ar1:
    ; lag-1 AR: each output depends on top-left/top/top-right (vectorized)
    ; plus the previous output to the left (scalar recurrence).
    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx           cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
    pinsrb          xm4, [base+pb_1], 3
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12] ; rnd
    sub             bufq, 2*(82*73-(82*3+79)) ; skip the 3-row/3-col border
    mov             hd, 70
    sar             maxd, 1
    mov             mind, maxd
    xor             mind, -1               ; min = ~max
.y_loop_ar1:
    mov             xq, -76
    movsx           val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
    psrldq          xm2, xm0, 2            ; top
    psrldq          xm1, xm0, 4            ; top/right
    punpcklwd       xm0, xm2
    punpcklwd       xm1, xm3
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
.x_loop_ar1_inner:
    movd            val0d, xm0
    psrldq          xm0, 4
    imul            val3d, cf3d
    add             val3d, val0d
    sarx            val3d, val3d, shiftd
    movsx           val0d, word [bufq+xq*2]
    add             val3d, val0d
    cmp             val3d, maxd
    cmovg           val3d, maxd
    cmp             val3d, mind
    cmovl           val3d, mind
    mov             word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz .x_loop_ar1_end
    test            xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add             bufq, 82*2
    dec             hd
    jg .y_loop_ar1
.ar0:
    ; lag-0: no auto-regression, the LFSR pass above is the final output
    RET

.ar2:
    ; lag-2 AR: 12 coefficients over the two rows above, plus a scalar
    ; recurrence over the two previous outputs on the current row.
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movq            xm0, [fg_dataq+FGData.ar_coeffs_y+5]     ; cf5-11
    vinserti128     m0, [fg_dataq+FGData.ar_coeffs_y+0], 1   ; cf0-4
    vpbroadcastw    xm10, [base+round_vals-12+shiftq*2]
    pxor            m1, m1
    punpcklwd       xm10, xm1
    pcmpgtb         m1, m0
    punpcklbw       m0, m1                 ; cf5-11,0-4
    vpermq          m1, m0, q3333          ; cf4
    vbroadcasti128  m11, [base+gen_shufA]
    pshufd          m6, m0, q0000          ; cf[5,6], cf[0-1]
    vbroadcasti128  m12, [base+gen_shufB]
    pshufd          m7, m0, q1111          ; cf[7,8], cf[2-3]
    punpckhwd       xm1, xm0
    pshufhw         xm9, xm0, q2121
    pshufd          xm8, xm1, q0000        ; cf[4,9]
    sar             bdmaxd, 1
    punpckhqdq      xm9, xm9               ; cf[10,11]
    movd            xm4, bdmaxd            ; max_grain
    pcmpeqd         xm5, xm5
    sub             bufq, 2*(82*73-(82*3+79))
    pxor            xm5, xm4               ; min_grain
    DEFINE_ARGS buf, fg_data, h, x
    mov             hd, 70
.y_loop_ar2:
    mov             xq, -76
.x_loop_ar2:
    vbroadcasti128  m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
    vinserti128     m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
    pshufb          m0, m1, m11            ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    pmaddwd         m0, m6
    punpckhwd       xm2, xm1               ; y=-2/-1 interleaved, x=[+2,+5]
    pshufb          m1, m12                ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd         m1, m7
    pmaddwd         xm2, xm8
    paddd           m0, m1
    vextracti128    xm1, m0, 1
    paddd           xm0, xm10
    paddd           xm2, xm0
    movu            xm0, [bufq+xq*2-4]     ; y=0,x=[-2,+5]
    paddd           xm2, xm1
    pmovsxwd        xm1, [bufq+xq*2]       ; in dwords, y=0,x=[0,3]
.x_loop_ar2_inner:
    pmaddwd         xm3, xm9, xm0
    psrldq          xm0, 2
    paddd           xm3, xm2
    psrldq          xm2, 4                 ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; skip packssdw because we only care about one value
    paddd           xm3, xm1
    pminsd          xm3, xm4
    psrldq          xm1, 4
    pmaxsd          xm3, xm5
    pextrw          [bufq+xq*2], xm3, 0
    punpcklwd       xm3, xm3
    pblendw         xm0, xm3, 0010b        ; feed the new output back in as "left"
    inc             xq
    jz .x_loop_ar2_end
    test            xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add             bufq, 82*2
    dec             hd
    jg .y_loop_ar2
    RET

.ar3:
    ; lag-3 AR: 24 coefficients over the three rows above, plus a scalar
    ; recurrence over the three previous outputs on the current row.
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    sar             bdmaxd, 1
    movq            xm7, [fg_dataq+FGData.ar_coeffs_y+ 0]       ; cf0-6
    movd            xm0, [fg_dataq+FGData.ar_coeffs_y+14]       ; cf14-16
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7    ; cf0-6,13
    pinsrb          xm0, [base+pb_1], 3                         ; cf14-16,pb_1
    movd            xm1, [fg_dataq+FGData.ar_coeffs_y+21]       ; cf21-23
    vinserti128     m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1     ; cf7-13
    vinserti128     m0, [fg_dataq+FGData.ar_coeffs_y+17], 1     ; cf17-20
    vpbroadcastw    xm11, [base+round_vals+shiftq*2-12]
    movd            xm12, bdmaxd           ; max_grain
    punpcklbw       m7, m7                 ; sign-extension
    punpcklbw       m0, m0                 ; sign-extension
    punpcklbw       xm1, xm1
    REPX {psraw x, 8}, m7, m0, xm1
    pshufd          m4, m7, q0000          ; cf[0,1] | cf[7,8]
    pshufd          m5, m7, q1111          ; cf[2,3] | cf[9,10]
    pshufd          m6, m7, q2222          ; cf[4,5] | cf[11,12]
    pshufd          xm7, xm7, q3333        ; cf[6,13]
    pshufd          m8, m0, q0000          ; cf[14,15] | cf[17,18]
    pshufd          m9, m0, q1111          ; cf[16],pw_1 | cf[19,20]
    paddw           xm0, xm11, xm11
    pcmpeqd         xm13, xm13
    pblendw         xm10, xm1, xm0, 00001000b
    pxor            xm13, xm12             ; min_grain
    DEFINE_ARGS buf, fg_data, h, x
    sub             bufq, 2*(82*73-(82*3+79))
    mov             hd, 70
.y_loop_ar3:
    mov             xq, -76
.x_loop_ar3:
    movu            xm0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
    vinserti128     m0, [bufq+xq*2-82*4-6+ 0], 1    ; y=-3/-2,x=[-3,+4]
    movq            xm1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+8]
    vinserti128     m1, [bufq+xq*2-82*4-6+16], 1    ; y=-3/-2,x=[+5,+12]
    palignr         m3, m1, m0, 2          ; y=-3/-2,x=[-2,+5]
    palignr         m1, m0, 12             ; y=-3/-2,x=[+3,+6]
    punpckhwd       m2, m0, m3             ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd       m0, m3                 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps          m3, m0, m2, q1032      ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    pmaddwd         m0, m4
    pmaddwd         m2, m6
    pmaddwd         m3, m5
    paddd           m0, m2
    movu            xm2, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
    vinserti128     m2, [bufq+xq*2-82*2-6+ 6], 1    ; y=-1,x=[+1,+8]
    paddd           m0, m3
    psrldq          m3, m2, 2
    punpcklwd       m3, m2, m3             ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    pmaddwd         m3, m8                 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
    paddd           m0, m3
    psrldq          m3, m2, 4
    psrldq          m2, 6
    vpblendd        m2, m11, 0x0f          ; rounding constant
    punpcklwd       m3, m2                 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    pmaddwd         m3, m9                 ; x=[+2/+3,+3/+4,+4/+5,+5,+6]
    vextracti128    xm2, m1, 1
    punpcklwd       xm1, xm2
    pmaddwd         xm1, xm7               ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
    paddd           m0, m3
    vextracti128    xm2, m0, 1
    paddd           xm0, xm1
    movu            xm1, [bufq+xq*2-6]     ; y=0,x=[-3,+4]
    paddd           xm0, xm2
.x_loop_ar3_inner:
    pmaddwd         xm2, xm1, xm10
    pshuflw         xm3, xm2, q1032
    paddd           xm2, xm0               ; add top
    paddd           xm2, xm3               ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; skip packssdw because we only care about one value
    pminsd          xm2, xm12
    pmaxsd          xm2, xm13
    pextrw          [bufq+xq*2], xm2, 0
    pslldq          xm2, 4
    psrldq          xm1, 2
    pblendw         xm1, xm2, 0100b        ; feed the new output back in as "left"
    inc             xq
    jz .x_loop_ar3_end
    test            xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add             bufq, 82*2
    dec             hd
    jg .y_loop_ar3
    RET

;-----------------------------------------------------------------------------
; void generate_grain_uv_{420,422,444}_16bpc(entry *buf, const entry *bufy,
;           const Dav1dFilmGrainData *fg_data, intptr_t uv, int bdmax)
; Same two-pass structure as the luma function; the AR passes additionally
; mix in (subsampled) luma grain from bufy. %2/%3 are the horizontal/vertical
; subsampling flags.
;-----------------------------------------------------------------------------
%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
%define base r8-generate_grain_uv_%1_16bpc_avx2_table
    lea             r8, [generate_grain_uv_%1_16bpc_avx2_table]
    movifnidn       bdmaxd, bdmaxm
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    lea             r6d, [bdmaxq+1]
    movq            xm4, [base+mul_bits]
    shr             r6d, 11                ; 0 for 10bpc, 2 for 12bpc
    movq            xm5, [base+hmul_bits]
    sub             r5, r6
    mova            xm6, [base+pb_mask]
    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
    vpbroadcastw    xm7, [base+round+r5*2-2]
    pxor            xm0, xm2               ; per-plane seed perturbation
    lea             r6, [gaussian_sequence]
%if %2
    mov             r7d, 73-35*%3
    add             bufq, 44*2
.loop_y:
    mov             r5, -44*2
%else
    mov             r5, -82*73*2
    sub             bufq, r5
%endif
.loop_x:
    ; single LFSR step per iteration (4 gaussian values), cf. luma .loop
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3               ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4               ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2          ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0               ; aggregate each bit into next seed's high bit
    por             xm2, xm3               ; 4 next output seeds
    pshuflw         xm0, xm2, q3333
    psrlw           xm2, 5
    movq            r10, xm2
    movzx           r9d, r10w
    movd            xm2, [r6+r9*2]
    rorx            r9, r10, 32
    shr             r10d, 16
    pinsrw          xm2, [r6+r10*2], 1
    movzx           r10d, r9w
    pinsrw          xm2, [r6+r10*2], 2
    shr             r9d, 16
    pinsrw          xm2, [r6+r9*2], 3
    paddw           xm2, xm2               ; otherwise bpc=12 w/ grain_scale_shift=0
    pmulhrsw        xm2, xm7               ; shifts by 0, which pmulhrsw does not support
    movq            [bufq+r5], xm2
    add             r5, 8
    jl .loop_x
%if %2
    add             bufq, 82*2
    dec             r7d
    jg .loop_y
%endif

    ; auto-regression code
    movsxd          r6, [fg_dataq+FGData.ar_coeff_lag]
    movsxd          r6, [r8+r6*4]
    add             r6, r8
    jmp             r6

INIT_YMM avx2
.ar0:
    ; lag-0: chroma grain = clipped(grain + scaled luma grain)
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    imul            uvd, 28
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    vpbroadcastb    m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    sar             bdmaxd, 1
    vpbroadcastd    m4, [base+gen_ar0_shift-24+shiftq*4]
    movd            xm6, bdmaxd
    pcmpeqw         m7, m7
    pmaddubsw       m4, m0                 ; ar_coeff << (14 - shift)
    vpbroadcastw    m6, xm6                ; max_grain
    pxor            m7, m6                 ; min_grain
    DEFINE_ARGS buf, bufy, h, x
%if %2
    vpbroadcastw    m5, [base+hmul_bits+2+%3*2]
    sub             bufq, 2*(82*(73-35*%3)+82-(82*3+41))
%else
    sub             bufq, 2*(82*70-3)
%endif
    add             bufyq, 2*(3+82*3)
    mov             hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels
    movu            xm0, [bufyq+16*0]
    vinserti128     m0, [bufyq+16*2], 1
    movu            xm1, [bufyq+16*1]
    vinserti128     m1, [bufyq+16*3], 1
%if %3
    movu            xm2, [bufyq+82*2+16*0]
    vinserti128     m2, [bufyq+82*2+16*2], 1
    movu            xm3, [bufyq+82*2+16*1]
    vinserti128     m3, [bufyq+82*2+16*3], 1
    paddw           m0, m2
    paddw           m1, m3
%endif
    phaddw          m0, m1
    movu            xm1, [bufyq+16*4]
    vinserti128     m1, [bufyq+16*6], 1
    movu            xm2, [bufyq+16*5]
    vinserti128     m2, [bufyq+16*7], 1
%if %3
    movu            xm3, [bufyq+82*2+16*4]
    vinserti128     m3, [bufyq+82*2+16*6], 1
    paddw           m1, m3
    movu            xm3, [bufyq+82*2+16*5]
    vinserti128     m3, [bufyq+82*2+16*7], 1
    paddw           m2, m3
%endif
    phaddw          m1, m2
    pmulhrsw        m0, m5
    pmulhrsw        m1, m5
%else
    xor             xd, xd
.x_loop_ar0:
    movu            m0, [bufyq+xq*2]
    movu            m1, [bufyq+xq*2+32]
%endif
    paddw           m0, m0
    paddw           m1, m1
    pmulhrsw        m0, m4
    pmulhrsw        m1, m4
%if %2
    paddw           m0, [bufq+ 0]
    paddw           m1, [bufq+32]
%else
    paddw           m0, [bufq+xq*2+ 0]
    paddw           m1, [bufq+xq*2+32]
%endif
    pminsw          m0, m6
    pminsw          m1, m6
    pmaxsw          m0, m7
    pmaxsw          m1, m7
%if %2
    movu            [bufq+ 0], m0
    movu            [bufq+32], m1

    ; last 6 pixels
    movu            xm0, [bufyq+32*4]
    movu            xm1, [bufyq+32*4+16]
%if %3
    paddw           xm0, [bufyq+32*4+82*2]
    paddw           xm1, [bufyq+32*4+82*2+16]
%endif
    phaddw          xm0, xm1
    movu            xm1, [bufq+32*2]
    pmulhrsw        xm0, xm5
    paddw           xm0, xm0
    pmulhrsw        xm0, xm4
    paddw           xm0, xm1
    pminsw          xm0, xm6
    pmaxsw          xm0, xm7
    vpblendd        xm0, xm1, 0x08         ; keep the pixel past the row end intact
    movu            [bufq+32*2], xm0
%else
    movu            [bufq+xq*2+ 0], m0
    movu            [bufq+xq*2+32], m1
    add             xd, 32
    cmp             xd, 64
    jl .x_loop_ar0

    ; last 12 pixels
    movu            m0, [bufyq+64*2]
    movu            m1, [bufq+64*2]
    paddw           m0, m0
    pmulhrsw        m0, m4
    paddw           m0, m1
    pminsw          m0, m6
    pmaxsw          m0, m7
    vpblendd        m0, m1, 0xc0           ; keep the pixels past the row end intact
    movu            [bufq+64*2], m0
%endif
    add             bufq, 82*2
    add             bufyq, 82*2<<%3
    dec             hd
    jg .y_loop_ar0
    RET

INIT_XMM avx2
.ar1:
    ; lag-1 AR with luma contribution mixed into the "right" term
    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
    imul            uvd, 28
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx           cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    pmovsxwd        xm3, [base+round_vals+shiftq*2-12] ; rnd
    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
    vpbroadcastd    xm3, xm3
%if %2
    sub             bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub             bufq, 2*(82*69+3)
%endif
    add             bufyq, 2*(79+82*3)
    mov             hd, 70-35*%3
    sar             maxd, 1
    mov             mind, maxd
    xor             mind, -1               ; min = ~max
.y_loop_ar1:
    mov             xq, -(76>>%2)
    movsx           val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
%if %2
    movu            xm2, [bufyq+xq*4]
%else
    movq            xm2, [bufyq+xq*2]
%endif
%if %2
%if %3
    phaddw          xm2, [bufyq+xq*4+82*2]
    punpckhqdq      xm1, xm2, xm2
    paddw           xm2, xm1
%else
    phaddw          xm2, xm2
%endif
    pmulhrsw        xm2, xm6
%endif
    psrldq          xm1, xm0, 4            ; top/right
    punpcklwd       xm1, xm2
    psrldq          xm2, xm0, 2            ; top
    punpcklwd       xm0, xm2
    pmaddwd         xm1, xm5
    pmaddwd         xm0, xm4
    paddd           xm1, xm3
    paddd           xm0, xm1
.x_loop_ar1_inner:
    movd            val0d, xm0
    psrldq          xm0, 4
    imul            val3d, cf3d
    add             val3d, val0d
    sarx            val3d, val3d, shiftd
    movsx           val0d, word [bufq+xq*2]
    add             val3d, val0d
    cmp             val3d, maxd
    cmovg           val3d, maxd
    cmp             val3d, mind
    cmovl           val3d, mind
    mov             word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz .x_loop_ar1_end
    test            xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add             bufq, 82*2
    add             bufyq, 82*2<<%3
    dec             hd
    jg .y_loop_ar1
    RET

INIT_YMM avx2
.ar2:
%if WIN64
    %assign stack_size_padded 136
    SUB             rsp, stack_size_padded
    WIN64_PUSH_XMM  13 + %2, 8
%endif
    ; lag-2 AR with luma contribution, cf. the luma .ar2 path
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vbroadcasti128  m10, [base+gen_shufA]
    sar             bdmaxd, 1
    vbroadcasti128  m11, [base+gen_shufB]
    movd            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
    pinsrb          xm7, [base+pb_1], 5
    pinsrw          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
    pmovsxbw        m7, xm7
    movd            xm8, bdmaxd            ; max_grain
    pshufd          m4, m7, q0000
    vpbroadcastw    xm12, [base+round_vals-12+shiftq*2]
    pshufd          m5, m7, q1111
    pcmpeqd         xm9, xm9
    pshufd          m6, m7, q2222
    pxor            xm9, xm8               ; min_grain
    pshufd          xm7, xm7, q3333
    DEFINE_ARGS buf, bufy, fg_data, h, x
%if %2
    vpbroadcastw    xm13, [base+hmul_bits+2+%3*2]
    sub             bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub             bufq, 2*(82*69+3)
%endif
    add             bufyq, 2*(79+82*3)
    mov             hd, 70-35*%3
.y_loop_ar2:
    mov             xq, -(76>>%2)
.x_loop_ar2:
    vbroadcasti128  m3, [bufq+xq*2-82*2-4]        ; y=-1,x=[-2,+5]
    vinserti128     m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
    pshufb          m0, m2, m10            ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    pmaddwd         m0, m4
    pshufb          m1, m2, m11            ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd         m1, m5
    punpckhwd       m2, m3                 ; y=-2/-1 interleaved, x=[+2,+5]
%if %2
    movu            xm3, [bufyq+xq*4]
%if %3
    paddw           xm3, [bufyq+xq*4+82*2]
%endif
    phaddw          xm3, xm3
    pmulhrsw        xm3, xm13
%else
    movq            xm3, [bufyq+xq*2]
%endif
    punpcklwd       xm3, xm12              ; luma, round interleaved
    vpblendd        m2, m3, 0x0f
    pmaddwd         m2, m6
    paddd           m1, m0
    movu            xm0, [bufq+xq*2-4]     ; y=0,x=[-2,+5]
    paddd           m2, m1
    vextracti128    xm1, m2, 1
    paddd           xm2, xm1
    pshufd          xm1, xm0, q3321
    pmovsxwd        xm1, xm1               ; y=0,x=[0,3] in dword
.x_loop_ar2_inner:
    pmaddwd         xm3, xm7, xm0
    paddd           xm3, xm2
    psrldq          xm2, 4                 ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; we do not need to packssdw since we only care about one value
    paddd           xm3, xm1
    psrldq          xm1, 4
    pminsd          xm3, xm8
    pmaxsd          xm3, xm9
    pextrw          [bufq+xq*2], xm3, 0
    psrldq          xm0, 2
    pslldq          xm3, 2
    pblendw         xm0, xm3, 00000010b    ; feed the new output back in as "left"
    inc             xq
    jz .x_loop_ar2_end
    test            xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add             bufq, 82*2
    add             bufyq, 82*2<<%3
    dec             hd
    jg .y_loop_ar2
    RET

.ar3:
%if WIN64
    %assign stack_offset 32
    %assign stack_size_padded 152
    SUB             rsp, stack_size_padded
    WIN64_PUSH_XMM  14 + %2, 8
%endif
    ; lag-3 AR with luma contribution, cf. the luma .ar3 path
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vpbroadcastw    xm11, [base+round_vals-12+shiftq*2]
    sar             bdmaxd, 1
    movq            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
    pmovsxbw        m7, xm7
%if %2
    vpbroadcastw    xm14, [base+hmul_bits+2+%3*2]
%endif
    pshufd          m4, m7, q0000
    pshufd          m5, m7, q1111
    pshufd          m6, m7, q2222
    pshufd          m7, m7, q3333
    movd            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
    pinsrb          xm0, [base+pb_1], 3
    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
    pmovsxbw        m0, xm0
    movd            xm12, bdmaxd           ; max_grain
    pshufd          m8, m0, q0000
    pshufd          m9, m0, q1111
    pcmpeqd         xm13, xm13
    punpckhqdq      xm10, xm0, xm0
    pxor            xm13, xm12             ; min_grain
    pinsrw          xm10, [base+round_vals-10+shiftq*2], 3
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub             bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub             bufq, 2*(82*69+3)
%endif
    add             bufyq, 2*(79+82*3)
    mov             hd, 70-35*%3
.y_loop_ar3:
    mov             xq, -(76>>%2)
.x_loop_ar3:
    movu            xm2, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
    vinserti128     m2, [bufq+xq*2-82*4-6+ 0], 1    ; y=-3/-2,x=[-3,+4]
    movq            xm1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+8]
    vinserti128     m1, [bufq+xq*2-82*4-6+16], 1    ; y=-3/-2,x=[+5,+12]
    palignr         m3, m1, m2, 2          ; y=-3/-2,x=[-2,+5]
    palignr         m1, m2, 12             ; y=-3/-2,x=[+3,+6]
    punpcklwd       m0, m2, m3             ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    punpckhwd       m2, m3                 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    shufps          m3, m0, m2, q1032      ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    pmaddwd         m0, m4
    pmaddwd         m2, m6
    pmaddwd         m3, m5
    paddd           m0, m2
    paddd           m0, m3
    movu            xm2, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
    vinserti128     m2, [bufq+xq*2-82*2-6+ 6], 1    ; y=-1,x=[+1,+8]
%if %2
    movu            xm3, [bufyq+xq*4]
%if %3
    paddw           xm3, [bufyq+xq*4+82*2]
%endif
    phaddw          xm3, xm3
    pmulhrsw        xm3, xm14
%else
    movq            xm3, [bufyq+xq*2]
%endif
    punpcklwd       m1, m3
    pmaddwd         m1, m7
    paddd           m0, m1
    psrldq          m1, m2, 4
    psrldq          m3, m2, 6
    vpblendd        m3, m11, 0x0f          ; rounding constant
    punpcklwd       m1, m3                 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    pmaddwd         m1, m9                 ; x=[+2/+3,+3/+4,+4/+5,+5,+6]
    psrldq          m3, m2, 2
    punpcklwd       m2, m3                 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    pmaddwd         m2, m8                 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
    paddd           m0, m1
    movu            xm1, [bufq+xq*2-6]     ; y=0,x=[-3,+4]
    paddd           m0, m2
    vextracti128    xm2, m0, 1
    paddd           xm0, xm2
.x_loop_ar3_inner:
    pmaddwd         xm2, xm1, xm10
    pshuflw         xm3, xm2, q1032
    paddd           xm2, xm0               ; add top
    paddd           xm2, xm3               ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    psrldq          xm1, 2
    ; no need to packssdw since we only care about one value
    pminsd          xm2, xm12
    pmaxsd          xm2, xm13
    pextrw          [bufq+xq*2], xm2, 0
    pslldq          xm2, 4
    pblendw         xm1, xm2, 00000100b    ; feed the new output back in as "left"
    inc             xq
    jz .x_loop_ar3_end
    test            xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add             bufq, 82*2
    add             bufyq, 82*2<<%3
    dec             hd
    jg .y_loop_ar3
    RET
%endmacro

;-----------------------------------------------------------------------------
; void fgy_32x32xn_16bpc(pixel *dst, const pixel *src, ptrdiff_t stride,
;           const Dav1dFilmGrainData *fg_data, size_t w, const uint8_t *scaling,
;           const entry *grain_lut, int h, int sby, ...)
; Applies luma film grain to a row of 32-pixel-wide blocks:
; dst = clip(src + round2(scaling[src] * grain, scaling_shift)), with
; optional 2-pixel horizontal and 2-line vertical overlap blending between
; neighbouring grain blocks (weights in pw_27_17_17_27).
;-----------------------------------------------------------------------------
cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, unused, sby, see
%define base r11-grain_min
    lea             r11, [grain_min]
    mov             r6d, r9m               ; bdmax
    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov             sbyd, sbym
    vpbroadcastd    m8, r9m
    shr             r6d, 11                ; is_12bpc
    vpbroadcastd    m9, [base+grain_min+r6*4]
    shlx            r10d, r9d, r6d
    vpbroadcastd    m10, [base+grain_max+r6*4]
    lea             r9d, [r6+r9*4]
    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
    vpbroadcastd    m12, [base+fg_min+r10*4]
    vpbroadcastd    m13, [base+fg_max+r9*4]
    test            sbyd, sbyd
    setnz           r7b
    vpbroadcastd    m14, [base+pd_16]
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz .vertical_overlap

    ; derive the per-superblock-row seed from sby and the frame seed
    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    lea             src_bakq, [srcq+wq*2]
    neg             wq
    sub             dstq, srcq

.loop_x:
    ; advance the seed and extract the (offx,offy) grain_lut origin
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d              ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y:
    ; scaling[src]
    ; (m9 is clobber-swapped around each vpgatherdd since the mask
    ;  operand is destroyed by the instruction)
    mova            m0, [srcq+ 0]
    mova            m1, [srcq+32]
    pand            m4, m8, m0
    psrld           m3, m0, 16
    mova            m6, m9
    vpgatherdd      m2, [scalingq+m4-0], m9
    pand            m3, m8
    mova            m9, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pand            m5, m8, m1
    mova            m6, m9
    vpgatherdd      m3, [scalingq+m5-0], m9
    pblendw         m4, m2, 0x55
    psrld           m2, m1, 16
    mova            m9, m6
    pand            m2, m8
    vpgatherdd      m5, [scalingq+m2-2], m6
    pblendw         m5, m3, 0x55

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m4, m11
    pmaddubsw       m5, m11
    paddw           m4, m4
    paddw           m5, m5
    pmulhrsw        m4, [grain_lutq+offxyq*2]
    pmulhrsw        m5, [grain_lutq+offxyq*2+32]

    ; dst = clip_pixel(src, noise)
    paddw           m0, m4
    paddw           m1, m5
    pmaxsw          m0, m12
    pmaxsw          m1, m12
    pminsw          m0, m13
    pminsw          m1, m13
    mova            [dstq+srcq+ 0], m0
    mova            [dstq+srcq+32], m1

    add             srcq, strideq
    add             grain_lutq, 82*2
    dec             hd
    jg .loop_y
    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq*2]
    cmp             byte [fg_dataq+FGData.overlap_flag], 0
    je .loop_x
    movq            xm7, [pw_27_17_17_27]
    cmp             dword r8m, 0           ; sby
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d              ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy

    lea             left_offxyd, [offyq+32] ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y_h_overlap:
    ; scaling[src]
    mova            m0, [srcq+ 0]
    mova            m1, [srcq+32]
    pand            m4, m8, m0
    psrld           m3, m0, 16
    mova            m6, m9
    vpgatherdd      m2, [scalingq+m4-0], m9
    pand            m3, m8
    mova            m9, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pand            m5, m8, m1
    mova            m6, m9
    vpgatherdd      m3, [scalingq+m5-0], m9
    pblendw         m4, m2, 0x55
    psrld           m2, m1, 16
    mova            m9, m6
    pand            m2, m8
    vpgatherdd      m5, [scalingq+m2-2], m6
    pblendw         m5, m3, 0x55

    ; grain = grain_lut[offy+y][offx+x]
    ; blend the first pixel with the previous column's last grain pixel
    movu            m3, [grain_lutq+offxyq*2]
    movd            xm6, [grain_lutq+left_offxyq*2]
    punpcklwd       xm6, xm3
    pmaddwd         xm6, xm7
    paddd           xm6, xm14
    psrad           xm6, 5
    packssdw        xm6, xm6
    pmaxsw          xm6, xm9
    pminsw          xm6, xm10
    vpblendd        m3, m6, 0x01

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m4, m11
    pmaddubsw       m5, m11
    paddw           m4, m4
    paddw           m5, m5
    pmulhrsw        m4, m3
    pmulhrsw        m5, [grain_lutq+offxyq*2+32]

    ; dst = clip_pixel(src, noise)
    paddw           m0, m4
    paddw           m1, m5
    pmaxsw          m0, m12
    pmaxsw          m1, m12
    pminsw          m0, m13
    pminsw          m1, m13
    mova            [dstq+srcq+ 0], m0
    mova            [dstq+srcq+32], m1

    add             srcq, strideq
    add             grain_lutq, 82*2
    dec             hd
    jg .loop_y_h_overlap
    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq*2]
    cmp             dword r8m, 0           ; sby
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see, src_bak

    ; compute both the current row's seed and the row above's seed in one
    ; packed dword: (cur_seed << 16) | top_seed
    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd             ; (cur_seed << 16) | top_seed

    lea             src_bakq, [srcq+wq*2]
    neg             wq
    sub             dstq, srcq

.loop_x_v_overlap:
    vpbroadcastd    m15, [pw_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1           ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
.loop_y_v_overlap:
    ; scaling[src]
    mova            m0, [srcq+ 0]
    mova            m1, [srcq+32]
    pand            m4, m8, m0
    psrld           m3, m0, 16
    mova            m6, m9
    vpgatherdd      m2, [scalingq+m4-0], m9
    pand            m3, m8
    mova            m9, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pand            m5, m8, m1
    mova            m6, m9
    vpgatherdd      m3, [scalingq+m5-0], m9
    pblendw         m2, m4, 0xaa
    psrld           m4, m1, 16
    mova            m9, m6
    pand            m4, m8
    vpgatherdd      m5, [scalingq+m4-2], m6
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; blend current grain with the grain from the block above
    movu            m6, [grain_lutq+offxyq*2]
    movu            m5, [grain_lutq+top_offxyq*2]
    punpcklwd       m4, m5, m6
    punpckhwd       m5, m6
    pmaddwd         m4, m15
    pmaddwd         m5, m15
    movu            m7, [grain_lutq+offxyq*2+32]
    movu            m6, [grain_lutq+top_offxyq*2+32]
    paddd           m4, m14
    paddd           m5, m14
    psrad           m4, 5
    psrad           m5, 5
    packssdw        m4, m5
    punpcklwd       m5, m6, m7
    punpckhwd       m6, m7
    pmaddwd         m5, m15
    pmaddwd         m6, m15
    paddd           m5, m14
    paddd           m6, m14
    psrad           m5, 5
    psrad           m6, 5
    packssdw        m5, m6
    pmaxsw          m4, m9
    pmaxsw          m5, m9
    pminsw          m4, m10
    pminsw          m5, m10

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m11
    pmaddubsw       m3, m11
    paddw           m2, m2
    paddw           m3, m3
    pmulhrsw        m4, m2
    pmulhrsw        m5, m3

    ; dst = clip_pixel(src, noise)
    paddw           m0, m4
    paddw           m1, m5
    pmaxsw          m0, m12
    pmaxsw          m1, m12
    pminsw          m0, m13
    pminsw          m1, m13
    mova            [dstq+srcq+ 0], m0
    mova            [dstq+srcq+32], m1

    add             srcq, strideq
    add             grain_lutq, 82*2
    dec             hb
    jz .end_y_v_overlap
    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add             hd, 0x80000000
    jnc .loop_y_v_overlap
    jmp .loop_y
.end_y_v_overlap:
    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq*2]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

.loop_x_hv_overlap:
    vpbroadcastd    m15, [pw_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1           ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    lea             topleft_offxyd, [top_offxyq+32]
    lea             left_offxyd, [offyq+32]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
.loop_y_hv_overlap:
    ; scaling[src]
    mova            m0, [srcq+ 0]
    mova            m1, [srcq+32]
    pand            m4, m8, m0
    psrld           m3, m0, 16
    mova            m6, m9
    vpgatherdd      m2, [scalingq+m4-0], m9
    pand            m3, m8
    mova            m9, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pand            m5, m8, m1
    mova            m6, m9
    vpgatherdd      m3, [scalingq+m5-0], m9
    pblendw         m2, m4, 0xaa
    psrld           m4, m1, 16
    mova            m9, m6
    pand            m4, m8
    vpgatherdd      m5, [scalingq+m4-2], m6
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu            m7, [grain_lutq+offxyq*2]
    movd            xm6, [grain_lutq+left_offxyq*2]
    movu            m5, [grain_lutq+top_offxyq*2]
    movd            xm4, [grain_lutq+topleft_offxyq*2]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd       xm6, xm7
    punpcklwd       xm4, xm5
    punpcklqdq      xm6, xm4
    movddup         xm4, [pw_27_17_17_27]
    pmaddwd         xm6, xm4
    paddd           xm6, xm14
    psrad           xm6, 5
    packssdw        xm6, xm6
    pmaxsw          xm6, xm9
    pminsw          xm6, xm10
    pshuflw         xm4, xm6, q1032
    vpblendd        m6, m7, 0xfe
    vpblendd        m4, m5, 0xfe
    ; followed by v interpolation (top | cur -> cur)
    punpckhwd       m5, m7
    pmaddwd         m5, m15
    punpcklwd       m4, m6
    pmaddwd         m4, m15
    movu            m7, [grain_lutq+offxyq*2+32]
    movu            m6, [grain_lutq+top_offxyq*2+32]
    paddd           m5, m14
    paddd           m4, m14
    psrad           m5, 5
    psrad           m4, 5
    packssdw        m4, m5
    punpcklwd       m5, m6, m7
    punpckhwd       m6, m7
    pmaddwd         m5, m15
    pmaddwd         m6, m15
    paddd           m5, m14
    paddd           m6, m14
    psrad           m5, 5
    psrad           m6, 5
    packssdw        m5, m6
    pmaxsw          m4, m9
    pmaxsw          m5, m9
    pminsw          m4, m10
    pminsw          m5, m10

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m11
    pmaddubsw       m3, m11
    paddw           m2, m2
    paddw           m3, m3
    pmulhrsw        m4, m2
    pmulhrsw        m5, m3

    ; dst = clip_pixel(src, noise)
    paddw           m0, m4
    paddw           m1, m5
    pmaxsw          m0, m12
    pmaxsw          m1, m12
    pminsw          m0, m13
    pminsw          m1, m13
    mova            [dstq+srcq+ 0], m0
    mova            [dstq+srcq+32], m1

    add             srcq, strideq
    add             grain_lutq, 82*2
    dec             hb
    jz .end_y_hv_overlap
    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add             hd, 0x80000000
    jnc .loop_y_hv_overlap
    movq            xm7, [pw_27_17_17_27]
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add             wq, 32
    lea             srcq, [src_bakq+wq*2]
    jl .loop_x_hv_overlap
.end:
    RET

%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r12-grain_min
    lea             r12, [grain_min]
    mov             r9d, r13m              ; bdmax
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov             r11d, is_idm
    mov             sbyd, sbym
    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    shr             r9d, 11                ; is_12bpc
    vpbroadcastd    m8, [base+grain_min+r9*4]
    shlx            r10d, r6d, r9d
    vpbroadcastd    m9, [base+grain_max+r9*4]
    vpbroadcastw    m10, r13m
    shlx            r6d, r6d, r11d
    vpbroadcastd    m12, [base+fg_min+r10*4]
    lea             r6d, [r9+r6*2]
    vpbroadcastd    m13, [base+fg_max+r6*4]
    test            sbyd, sbyd
1342 setnz r7b 1343 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 1344 jne .csfl 1345 1346%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 1347 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1348 unused, sby, see, overlap 1349 1350%if %1 1351 mov r6d, r11m 1352 vpbroadcastd m0, [base+pb_8_9_0_1] 1353 vpbroadcastd m1, [base+uv_offset_mul+r9*4] 1354 vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] 1355 vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] 1356 pshufb m14, m0 ; { uv_luma_mult, uv_mult } 1357 pmaddwd m15, m1 1358%else 1359%if %2 1360 vpbroadcastq m15, [base+pw_23_22] 1361%else 1362 vpbroadcastq m15, [base+pw_27_17_17_27] 1363%endif 1364 vpbroadcastd m14, [base+pd_16] 1365%endif 1366 test r7b, [fg_dataq+FGData.overlap_flag] 1367 jnz %%vertical_overlap 1368 1369 imul seed, sbyd, (173 << 24) | 37 1370 add seed, (105 << 24) | 178 1371 rorx seed, seed, 24 1372 movzx seed, seew 1373 xor seed, [fg_dataq+FGData.seed] 1374 1375 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1376 unused2, unused3, see, unused4, unused5, unused6, luma, lstride 1377 1378 mov lumaq, r9mp 1379 mov lstrideq, r10mp 1380 lea r10, [srcq+wq*2] 1381 lea r11, [dstq+wq*2] 1382 lea r12, [lumaq+wq*(2<<%2)] 1383 mov r9mp, r10 1384 mov r11mp, r11 1385 mov r12mp, r12 1386 neg wq 1387 1388%%loop_x: 1389 rorx r6, seeq, 1 1390 or seed, 0xEFF4 1391 test seeb, seeh 1392 lea seed, [r6+0x8000] 1393 cmovp seed, r6d ; updated seed 1394 1395 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1396 offx, offy, see, unused1, unused2, unused3, luma, lstride 1397 1398 rorx offyd, seed, 8 1399 rorx offxq, seeq, 12 1400 and offyd, 0xf 1401 imul offyd, 164>>%3 1402 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx 1403 1404 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1405 h, offxy, see, unused1, unused2, unused3, luma, lstride 1406 1407 mov grain_lutq, grain_lutmp 1408 mov hd, hm 1409%%loop_y: 1410 ; 
luma_src 1411%if %2 1412 mova xm2, [lumaq+lstrideq*0+ 0] 1413 vinserti128 m2, [lumaq+lstrideq*0+32], 1 1414 mova xm4, [lumaq+lstrideq*0+16] 1415 vinserti128 m4, [lumaq+lstrideq*0+48], 1 1416 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 1417 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 1418 mova xm5, [lumaq+lstrideq*(1<<%3)+16] 1419 vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 1420 phaddw m2, m4 1421 phaddw m3, m5 1422 pxor m4, m4 1423 pavgw m2, m4 1424 pavgw m3, m4 1425%elif %1 1426 mova m2, [lumaq+ 0] 1427 mova m3, [lumaq+32] 1428%endif 1429%if %1 1430 mova m0, [srcq] 1431%if %2 1432 mova m1, [srcq+strideq] 1433%else 1434 mova m1, [srcq+32] 1435%endif 1436 punpckhwd m4, m2, m0 1437 punpcklwd m2, m0 1438 punpckhwd m5, m3, m1 1439 punpcklwd m3, m1 ; { luma, chroma } 1440 REPX {pmaddwd x, m14}, m4, m2, m5, m3 1441 REPX {paddd x, m15}, m4, m2, m5, m3 1442 REPX {psrad x, 6 }, m4, m2, m5, m3 1443 packusdw m2, m4 1444 packusdw m3, m5 1445 pminuw m2, m10 1446 pminuw m3, m10 ; clip_pixel() 1447%elif %2 1448 pand m2, m10 1449 pand m3, m10 1450%else 1451 pand m2, m10, [lumaq+ 0] 1452 pand m3, m10, [lumaq+32] 1453%endif 1454 1455 ; scaling[luma_src] 1456 vpbroadcastd m7, [pd_m65536] 1457 pandn m4, m7, m2 1458 mova m6, m7 1459 vpgatherdd m5, [scalingq+m4-0], m7 1460 psrld m2, 16 1461 mova m7, m6 1462 vpgatherdd m4, [scalingq+m2-2], m6 1463 pblendw m4, m5, 0x55 1464 pandn m5, m7, m3 1465 mova m6, m7 1466 vpgatherdd m2, [scalingq+m5-0], m7 1467 psrld m3, 16 1468 vpgatherdd m5, [scalingq+m3-2], m6 1469 pblendw m5, m2, 0x55 1470 1471 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1472 pmaddubsw m4, m11 1473 pmaddubsw m5, m11 1474 paddw m4, m4 1475 paddw m5, m5 1476 pmulhrsw m4, [grain_lutq+offxyq*2] 1477%if %2 1478 pmulhrsw m5, [grain_lutq+offxyq*2+82*2] 1479%else 1480 pmulhrsw m5, [grain_lutq+offxyq*2+32] 1481%endif 1482 1483 ; dst = clip_pixel(src, noise) 1484%if %1 1485 paddw m0, m4 1486 paddw m1, m5 1487%else 1488 paddw m0, m4, [srcq] 1489%if %2 1490 paddw m1, m5, 
[srcq+strideq] 1491%else 1492 paddw m1, m5, [srcq+32] 1493%endif 1494%endif 1495 pmaxsw m0, m12 1496 pmaxsw m1, m12 1497 pminsw m0, m13 1498 pminsw m1, m13 1499 mova [dstq], m0 1500%if %2 1501 mova [dstq+strideq], m1 1502 lea srcq, [srcq+strideq*2] 1503 lea dstq, [dstq+strideq*2] 1504 lea lumaq, [lumaq+lstrideq*(2<<%3)] 1505%else 1506 mova [dstq+32], m1 1507 add srcq, strideq 1508 add dstq, strideq 1509 add lumaq, lstrideq 1510%endif 1511 add grain_lutq, 82*(2<<%2) 1512%if %2 1513 sub hb, 2 1514%else 1515 dec hb 1516%endif 1517 jg %%loop_y 1518 add wq, 32>>%2 1519 jge .end 1520 mov srcq, r9mp 1521 mov dstq, r11mp 1522 mov lumaq, r12mp 1523 lea srcq, [srcq+wq*2] 1524 lea dstq, [dstq+wq*2] 1525 lea lumaq, [lumaq+wq*(2<<%2)] 1526 cmp byte [fg_dataq+FGData.overlap_flag], 0 1527 je %%loop_x 1528 cmp dword r8m, 0 ; sby 1529 jne %%loop_x_hv_overlap 1530 1531 ; horizontal overlap (without vertical overlap) 1532%%loop_x_h_overlap: 1533 rorx r6, seeq, 1 1534 or seed, 0xEFF4 1535 test seeb, seeh 1536 lea seed, [r6+0x8000] 1537 cmovp seed, r6d ; updated seed 1538 1539 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1540 offx, offy, see, left_offxy, unused1, unused2, luma, lstride 1541 1542 lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx 1543 rorx offyd, seed, 8 1544 rorx offxq, seeq, 12 1545 and offyd, 0xf 1546 imul offyd, 164>>%3 1547 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 1548 1549 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1550 h, offxy, see, left_offxy, unused1, unused2, luma, lstride 1551 1552 mov grain_lutq, grain_lutmp 1553 mov hd, hm 1554%%loop_y_h_overlap: 1555 ; luma_src 1556%if %2 1557 mova xm2, [lumaq+lstrideq*0+ 0] 1558 vinserti128 m2, [lumaq+lstrideq*0+32], 1 1559 mova xm4, [lumaq+lstrideq*0+16] 1560 vinserti128 m4, [lumaq+lstrideq*0+48], 1 1561 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 1562 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 1563 mova xm5, 
[lumaq+lstrideq*(1<<%3)+16] 1564 vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 1565 phaddw m2, m4 1566 phaddw m3, m5 1567 pxor m4, m4 1568 pavgw m2, m4 1569 pavgw m3, m4 1570%elif %1 1571 mova m2, [lumaq] 1572 mova m3, [lumaq+32] 1573%endif 1574%if %1 1575 mova m0, [srcq] 1576%if %2 1577 mova m1, [srcq+strideq] 1578%else 1579 mova m1, [srcq+32] 1580%endif 1581 punpckhwd m4, m2, m0 1582 punpcklwd m2, m0 1583 punpckhwd m5, m3, m1 1584 punpcklwd m3, m1 ; { luma, chroma } 1585 REPX {pmaddwd x, m14}, m4, m2, m5, m3 1586 REPX {paddd x, m15}, m4, m2, m5, m3 1587 REPX {psrad x, 6 }, m4, m2, m5, m3 1588 packusdw m2, m4 1589 packusdw m3, m5 1590 pminuw m2, m10 ; clip_pixel() 1591 pminuw m3, m10 1592%elif %2 1593 pand m2, m10 1594 pand m3, m10 1595%else 1596 pand m2, m10, [lumaq+ 0] 1597 pand m3, m10, [lumaq+32] 1598%endif 1599 1600 ; scaling[luma_src] 1601 vpbroadcastd m7, [pd_m65536] 1602 pandn m4, m7, m2 1603 mova m6, m7 1604 vpgatherdd m5, [scalingq+m4-0], m7 1605 psrld m2, 16 1606 mova m7, m6 1607 vpgatherdd m4, [scalingq+m2-2], m6 1608 pblendw m4, m5, 0x55 1609 pandn m5, m7, m3 1610 mova m6, m7 1611 vpgatherdd m2, [scalingq+m5-0], m7 1612 psrld m3, 16 1613 vpgatherdd m5, [scalingq+m3-2], m6 1614 pblendw m5, m2, 0x55 1615 1616 ; grain = grain_lut[offy+y][offx+x] 1617 movu m2, [grain_lutq+offxyq*2] 1618%if %2 1619 movu m3, [grain_lutq+offxyq*2+82*2] 1620%else 1621 movu m3, [grain_lutq+offxyq*2+32] 1622%endif 1623 movd xm6, [grain_lutq+left_offxyq*2] 1624%if %2 1625 pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} 1626 punpckldq xm7, xm2, xm3 ; {cur0, cur1} 1627 punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} 1628%else 1629 punpcklwd xm6, xm2 1630%endif 1631%if %1 1632%if %2 1633 vpbroadcastq xm7, [pw_23_22] 1634%else 1635 movq xm7, [pw_27_17_17_27] 1636%endif 1637 pmaddwd xm6, xm7 1638 vpbroadcastd xm7, [pd_16] 1639 paddd xm6, xm7 1640%else 1641 pmaddwd xm6, xm15 1642 paddd xm6, xm14 1643%endif 1644 psrad xm6, 5 1645 packssdw xm6, xm6 1646 pmaxsw xm6, xm8 
1647 pminsw xm6, xm9 1648 vpblendd m2, m6, 0x01 1649%if %2 1650 pshuflw xm6, xm6, q1032 1651 vpblendd m3, m6, 0x01 1652%endif 1653 1654 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1655 pmaddubsw m4, m11 1656 pmaddubsw m5, m11 1657 paddw m4, m4 1658 paddw m5, m5 1659 pmulhrsw m2, m4 1660 pmulhrsw m3, m5 1661 1662 ; dst = clip_pixel(src, noise) 1663%if %1 1664 paddw m0, m2 1665 paddw m1, m3 1666%else 1667 paddw m0, m2, [srcq] 1668%if %2 1669 paddw m1, m3, [srcq+strideq] 1670%else 1671 paddw m1, m3, [srcq+32] 1672%endif 1673%endif 1674 pmaxsw m0, m12 1675 pmaxsw m1, m12 1676 pminsw m0, m13 1677 pminsw m1, m13 1678 mova [dstq], m0 1679%if %2 1680 mova [dstq+strideq], m1 1681 lea srcq, [srcq+strideq*2] 1682 lea dstq, [dstq+strideq*2] 1683 lea lumaq, [lumaq+lstrideq*(2<<%3)] 1684%else 1685 mova [dstq+32], m1 1686 add srcq, strideq 1687 add dstq, strideq 1688 add lumaq, r10mp 1689%endif 1690 add grain_lutq, 82*(2<<%2) 1691%if %2 1692 sub hb, 2 1693%else 1694 dec hb 1695%endif 1696 jg %%loop_y_h_overlap 1697 add wq, 32>>%2 1698 jge .end 1699 mov srcq, r9mp 1700 mov dstq, r11mp 1701 mov lumaq, r12mp 1702 lea srcq, [srcq+wq*2] 1703 lea dstq, [dstq+wq*2] 1704 lea lumaq, [lumaq+wq*(2<<%2)] 1705 cmp dword r8m, 0 ; sby 1706 jne %%loop_x_hv_overlap 1707 jmp %%loop_x_h_overlap 1708 1709%%vertical_overlap: 1710 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ 1711 sby, see, unused1, unused2, unused3, lstride 1712 1713 movzx sbyd, sbyb 1714 imul seed, [fg_dataq+FGData.seed], 0x00010001 1715 imul r7d, sbyd, 173 * 0x00010001 1716 imul sbyd, 37 * 0x01000100 1717 add r7d, (105 << 16) | 188 1718 add sbyd, (178 << 24) | (141 << 8) 1719 and r7d, 0x00ff00ff 1720 and sbyd, 0xff00ff00 1721 xor seed, r7d 1722 xor seed, sbyd ; (cur_seed << 16) | top_seed 1723 1724 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1725 offx, offy, see, unused1, top_offxy, unused2, luma, lstride 1726 1727 mov lumaq, r9mp 1728 mov lstrideq, r10mp 1729 lea r10, 
[srcq+wq*2] 1730 lea r11, [dstq+wq*2] 1731 lea r12, [lumaq+wq*(2<<%2)] 1732 mov r9mp, r10 1733 mov r11mp, r11 1734 mov r12mp, r12 1735 neg wq 1736 1737%%loop_x_v_overlap: 1738 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1739 mov r6d, seed 1740 or seed, 0xeff4eff4 1741 test seeb, seeh 1742 setp r7b ; parity of top_seed 1743 shr seed, 16 1744 shl r7d, 16 1745 test seeb, seeh 1746 setp r7b ; parity of cur_seed 1747 or r6d, 0x00010001 1748 xor r7d, r6d 1749 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1750 1751 rorx offyd, seed, 8 1752 rorx offxd, seed, 12 1753 and offyd, 0xf000f 1754 and offxd, 0xf000f 1755 imul offyd, 164>>%3 1756 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1757 lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 1758 1759 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1760 h, offxy, see, unused1, top_offxy, unused2, luma, lstride 1761 1762 mov grain_lutq, grain_lutmp 1763 mov hd, hm 1764 movzx top_offxyd, offxyw 1765 shr offxyd, 16 1766%if %2 == 0 1767 lea r10, [pw_27_17_17_27] 1768%endif 1769%%loop_y_v_overlap: 1770 ; luma_src 1771%if %2 1772 mova xm2, [lumaq+lstrideq*0+ 0] 1773 vinserti128 m2, [lumaq+lstrideq*0+32], 1 1774 mova xm4, [lumaq+lstrideq*0+16] 1775 vinserti128 m4, [lumaq+lstrideq*0+48], 1 1776 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 1777 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 1778 mova xm5, [lumaq+lstrideq*(1<<%3)+16] 1779 vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 1780 phaddw m2, m4 1781 phaddw m3, m5 1782 pxor m4, m4 1783 pavgw m2, m4 1784 pavgw m3, m4 1785%elif %1 1786 mova m2, [lumaq] 1787 mova m3, [lumaq+32] 1788%endif 1789%if %1 1790 mova m0, [srcq] 1791%if %2 1792 mova m1, [srcq+strideq] 1793%else 1794 mova m1, [srcq+32] 1795%endif 1796 punpckhwd m4, m2, m0 1797 punpcklwd m2, m0 1798 punpckhwd m5, m3, m1 1799 punpcklwd m3, m1 ; { luma, chroma } 1800 REPX {pmaddwd x, m14}, m4, m2, m5, m3 1801 REPX {paddd x, m15}, m4, m2, m5, m3 1802 
REPX {psrad x, 6 }, m4, m2, m5, m3 1803 packusdw m2, m4 1804 packusdw m3, m5 1805 pminuw m2, m10 ; clip_pixel() 1806 pminuw m3, m10 1807%elif %2 1808 pand m2, m10 1809 pand m3, m10 1810%else 1811 pand m2, m10, [lumaq+ 0] 1812 pand m3, m10, [lumaq+32] 1813%endif 1814 1815 ; scaling[luma_src] 1816 vpbroadcastd m7, [pd_m65536] 1817 pandn m4, m7, m2 1818 mova m6, m7 1819 vpgatherdd m5, [scalingq+m4-0], m7 1820 psrld m2, 16 1821 mova m7, m6 1822 vpgatherdd m4, [scalingq+m2-2], m6 1823 pblendw m4, m5, 0x55 1824 pandn m5, m7, m3 1825 mova m6, m7 1826 vpgatherdd m2, [scalingq+m5-0], m7 1827 psrld m3, 16 1828 vpgatherdd m5, [scalingq+m3-2], m6 1829 pblendw m5, m2, 0x55 1830 1831 ; grain = grain_lut[offy+y][offx+x] 1832 movu m6, [grain_lutq+offxyq*2] 1833 movu m3, [grain_lutq+top_offxyq*2] 1834 punpcklwd m2, m3, m6 1835 punpckhwd m3, m6 ; { top, cur } 1836%if %3 1837 vpbroadcastd m0, [pw_23_22] 1838%elif %2 1839 vpbroadcastd m0, [pw_27_17_17_27] 1840%else 1841 vpbroadcastd m0, [r10] 1842%endif 1843 REPX {pmaddwd x, m0}, m2, m3 1844%if %1 1845 vpbroadcastd m1, [pd_16] 1846 REPX {paddd x, m1}, m2, m3 1847%else 1848 REPX {paddd x, m14}, m2, m3 1849%endif 1850 REPX {psrad x, 5}, m2, m3 1851 packssdw m2, m3 1852%if %2 1853 movu m3, [grain_lutq+offxyq*2+82*2] 1854%else 1855 movu m3, [grain_lutq+offxyq*2+32] 1856%endif 1857%if %3 1858 pmaxsw m2, m8 1859 pminsw m2, m9 1860%else 1861%if %2 1862 movu m7, [grain_lutq+top_offxyq*2+82*2] 1863 punpckhwd m6, m3, m7 ; { cur, top } 1864 punpcklwd m3, m7 1865%else 1866 movu m7, [grain_lutq+top_offxyq*2+32] 1867 punpckhwd m6, m7, m3 1868 punpcklwd m3, m7, m3 ; { top, cur } 1869%endif 1870 pmaddwd m6, m0 1871 pmaddwd m3, m0 1872%if %1 1873 paddd m6, m1 1874 paddd m3, m1 1875%else 1876 paddd m6, m14 1877 paddd m3, m14 1878%endif 1879 psrad m6, 5 1880 psrad m3, 5 1881 packssdw m3, m6 1882 pmaxsw m2, m8 1883 pmaxsw m3, m8 1884 pminsw m2, m9 1885 pminsw m3, m9 1886%endif 1887 1888 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1889 
pmaddubsw m4, m11 1890 pmaddubsw m5, m11 1891 paddw m4, m4 1892 paddw m5, m5 1893 pmulhrsw m2, m4 1894 pmulhrsw m3, m5 1895 1896 ; dst = clip_pixel(src, noise) 1897 paddw m0, m2, [srcq] 1898%if %2 1899 paddw m1, m3, [srcq+strideq] 1900%else 1901 paddw m1, m3, [srcq+32] 1902%endif 1903 pmaxsw m0, m12 1904 pmaxsw m1, m12 1905 pminsw m0, m13 1906 pminsw m1, m13 1907 mova [dstq], m0 1908%if %2 1909 mova [dstq+strideq], m1 1910 sub hb, 2 1911%else 1912 mova [dstq+32], m1 1913 dec hb 1914%endif 1915 jle %%end_y_v_overlap 1916%if %2 1917 lea srcq, [srcq+strideq*2] 1918 lea dstq, [dstq+strideq*2] 1919 lea lumaq, [lumaq+lstrideq*(2<<%3)] 1920%else 1921 add srcq, strideq 1922 add dstq, strideq 1923 add lumaq, lstrideq 1924%endif 1925 add grain_lutq, 82*(2<<%2) 1926%if %2 1927 jmp %%loop_y 1928%else 1929 add hd, 0x80000000 1930 jc %%loop_y 1931 add r10, 4 1932 jmp %%loop_y_v_overlap 1933%endif 1934%%end_y_v_overlap: 1935 add wq, 32>>%2 1936 jge .end 1937 mov srcq, r9mp 1938 mov dstq, r11mp 1939 mov lumaq, r12mp 1940 lea srcq, [srcq+wq*2] 1941 lea dstq, [dstq+wq*2] 1942 lea lumaq, [lumaq+wq*(2<<%2)] 1943 1944 ; since fg_dataq.overlap is guaranteed to be set, we never jump 1945 ; back to .loop_x_v_overlap, and instead always fall-through to 1946 ; h+v overlap 1947%%loop_x_hv_overlap: 1948 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1949 mov r6d, seed 1950 or seed, 0xeff4eff4 1951 test seeb, seeh 1952 setp r7b ; parity of top_seed 1953 shr seed, 16 1954 shl r7d, 16 1955 test seeb, seeh 1956 setp r7b ; parity of cur_seed 1957 or r6d, 0x00010001 1958 xor r7d, r6d 1959 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1960 1961 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1962 offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride 1963 1964%if %2 == 0 1965 lea r14, [pw_27_17_17_27] 1966%endif 1967 lea topleft_offxyq, [top_offxyq+(32>>%2)] 1968 lea left_offxyq, [offyq+(32>>%2)] 1969 rorx offyd, seed, 8 1970 rorx offxd, 
seed, 12 1971 and offyd, 0xf000f 1972 and offxd, 0xf000f 1973 imul offyd, 164>>%3 1974 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1975 lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 1976 1977 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1978 h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride 1979 1980 mov grain_lutq, grain_lutmp 1981 mov hd, hm 1982 movzx top_offxyd, offxyw 1983 shr offxyd, 16 1984%%loop_y_hv_overlap: 1985 ; luma_src 1986%if %2 1987 mova xm2, [lumaq+lstrideq*0+ 0] 1988 vinserti128 m2, [lumaq+lstrideq*0+32], 1 1989 mova xm4, [lumaq+lstrideq*0+16] 1990 vinserti128 m4, [lumaq+lstrideq*0+48], 1 1991 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 1992 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 1993 mova xm5, [lumaq+lstrideq*(1<<%3)+16] 1994 vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 1995 phaddw m2, m4 1996 phaddw m3, m5 1997 pxor m4, m4 1998 pavgw m2, m4 1999 pavgw m3, m4 2000%elif %1 2001 mova m2, [lumaq] 2002 mova m3, [lumaq+32] 2003%endif 2004%if %1 2005 mova m0, [srcq] 2006%if %2 2007 mova m1, [srcq+strideq] 2008%else 2009 mova m1, [srcq+32] 2010%endif 2011 punpckhwd m4, m2, m0 2012 punpcklwd m2, m0 2013 punpckhwd m5, m3, m1 2014 punpcklwd m3, m1 ; { luma, chroma } 2015 REPX {pmaddwd x, m14}, m4, m2, m5, m3 2016 REPX {paddd x, m15}, m4, m2, m5, m3 2017 REPX {psrad x, 6 }, m4, m2, m5, m3 2018 packusdw m2, m4 2019 packusdw m3, m5 2020 pminuw m2, m10 ; clip_pixel() 2021 pminuw m3, m10 2022%elif %2 2023 pand m2, m10 2024 pand m3, m10 2025%else 2026 pand m2, m10, [lumaq+ 0] 2027 pand m3, m10, [lumaq+32] 2028%endif 2029 2030 ; scaling[luma_src] 2031 vpbroadcastd m7, [pd_m65536] 2032 pandn m4, m7, m2 2033 mova m6, m7 2034 vpgatherdd m5, [scalingq+m4-0], m7 2035 psrld m2, 16 2036 mova m7, m6 2037 vpgatherdd m4, [scalingq+m2-2], m6 2038 pblendw m4, m5, 0x55 2039 pandn m5, m7, m3 2040 mova m6, m7 2041 vpgatherdd m2, [scalingq+m5-0], m7 2042 psrld m3, 16 2043 vpgatherdd m5, 
[scalingq+m3-2], m6 2044 pblendw m5, m2, 0x55 2045 2046 ; grain = grain_lut[offy+y][offx+x] 2047 movu m0, [grain_lutq+offxyq*2] 2048 movd xm2, [grain_lutq+left_offxyq*2] 2049 movu m6, [grain_lutq+top_offxyq*2] 2050%if %2 2051 pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 2052 movu m3, [grain_lutq+offxyq*2+82*2] 2053 punpckldq xm1, xm0, xm3 ; { cur0, cur1 } 2054%if %3 2055 vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } 2056 vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } 2057%else 2058 vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 2059 vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] 2060 vpblendd m2, m7, 0x20 2061 movd xm7, [grain_lutq+top_offxyq*2+82*2] 2062 punpckldq xm7, xm6 2063 vinserti128 m1, xm7, 1 2064 movu m7, [grain_lutq+top_offxyq*2+82*2] 2065%endif 2066 punpcklwd m2, m1 ; { cur, left } 2067%if %1 2068 vpbroadcastq m1, [pw_23_22] 2069 pmaddwd m2, m1 2070 vpbroadcastd m1, [pd_16] 2071 paddd m2, m1 2072 psrad m2, 5 2073 packssdw m2, m2 2074 vpermq m2, m2, q3120 2075%else 2076 pmaddwd m2, m15 2077 paddd m2, m14 2078 psrad m2, 5 2079 vextracti128 xm1, m2, 1 2080 packssdw xm2, xm1 2081%endif 2082%else 2083 pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 2084 movu m3, [grain_lutq+offxyq*2+32] 2085 movu m7, [grain_lutq+top_offxyq*2+32] 2086 punpckldq xm1, xm0, xm6 2087 punpcklwd xm2, xm1 ; { cur, left } 2088%if %1 2089 movddup xm1, [pw_27_17_17_27] 2090 pmaddwd xm2, xm1 2091 vpbroadcastd m1, [pd_16] 2092 paddd xm2, xm1 2093%else 2094 pmaddwd xm2, xm15 2095 paddd xm2, xm14 2096%endif 2097 psrad xm2, 5 2098 packssdw xm2, xm2 2099%endif 2100 pmaxsw xm2, xm8 2101 pminsw xm2, xm9 2102 vpblendd m0, m2, 0x01 2103%if %2 2104 pshufd xm2, xm2, q0321 2105 vpblendd m3, m2, 0x01 2106%if %3 == 0 2107 pshufd xm2, xm2, q0321 2108 vpblendd m7, m2, 0x01 2109%endif 2110%endif 2111 pshuflw xm2, xm2, q1032 2112 vpblendd m2, m6, 0xfe 2113 punpckhwd m6, m0 ; { top, cur } 2114 punpcklwd m2, m0 2115%if %3 2116 vpbroadcastd m0, 
[pw_23_22] 2117%elif %2 2118 vpbroadcastd m0, [pw_27_17_17_27] 2119%else 2120 vpbroadcastd m0, [r14] 2121%endif 2122 pmaddwd m6, m0 2123 pmaddwd m2, m0 2124%if %1 2125 paddd m6, m1 2126 paddd m2, m1 2127%else 2128 paddd m6, m14 2129 paddd m2, m14 2130%endif 2131 psrad m6, 5 2132 psrad m2, 5 2133 packssdw m2, m6 2134 2135%if %3 2136 pmaxsw m2, m8 2137 pminsw m2, m9 2138%else 2139%if %2 2140 punpckhwd m6, m3, m7 2141 punpcklwd m3, m7 ; { cur, top } 2142%else 2143 punpckhwd m6, m7, m3 2144 punpcklwd m3, m7, m3 ; { top, cur } 2145%endif 2146 REPX {pmaddwd x, m0}, m6, m3 2147%if %1 2148 REPX {paddd x, m1}, m6, m3 2149%else 2150 REPX {paddd x, m14}, m6, m3 2151%endif 2152 REPX {psrad x, 5}, m6, m3 2153 packssdw m3, m6 2154 pmaxsw m2, m8 2155 pmaxsw m3, m8 2156 pminsw m2, m9 2157 pminsw m3, m9 2158%endif 2159 2160 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2161 pmaddubsw m4, m11 2162 pmaddubsw m5, m11 2163 paddw m4, m4 2164 paddw m5, m5 2165 pmulhrsw m2, m4 2166 pmulhrsw m3, m5 2167 2168 ; dst = clip_pixel(src, noise) 2169 paddw m0, m2, [srcq] 2170%if %2 2171 paddw m1, m3, [srcq+strideq] 2172%else 2173 paddw m1, m3, [srcq+32] 2174%endif 2175 pmaxsw m0, m12 2176 pmaxsw m1, m12 2177 pminsw m0, m13 2178 pminsw m1, m13 2179 mova [dstq], m0 2180%if %2 2181 mova [dstq+strideq], m1 2182 lea srcq, [srcq+strideq*2] 2183 lea dstq, [dstq+strideq*2] 2184 lea lumaq, [lumaq+lstrideq*(2<<%3)] 2185%else 2186 mova [dstq+32], m1 2187 add srcq, strideq 2188 add dstq, strideq 2189 add lumaq, r10mp 2190%endif 2191 add grain_lutq, 82*(2<<%2) 2192%if %2 2193 sub hb, 2 2194 jg %%loop_y_h_overlap 2195%else 2196 dec hb 2197 jle %%end_y_hv_overlap 2198 add hd, 0x80000000 2199 jc %%loop_y_h_overlap 2200 add r14, 4 2201 jmp %%loop_y_hv_overlap 2202%endif 2203%%end_y_hv_overlap: 2204 add wq, 32>>%2 2205 jge .end 2206 mov srcq, r9mp 2207 mov dstq, r11mp 2208 mov lumaq, r12mp 2209 lea srcq, [srcq+wq*2] 2210 lea dstq, [dstq+wq*2] 2211 lea lumaq, [lumaq+wq*(2<<%2)] 2212 jmp 
%%loop_x_hv_overlap 2213%endmacro 2214 2215 %%FGUV_32x32xN_LOOP 1, %2, %3 2216.csfl: 2217 %%FGUV_32x32xN_LOOP 0, %2, %3 2218.end: 2219 RET 2220%endmacro 2221 2222GEN_GRAIN_UV_FN 420, 1, 1 2223FGUV_FN 420, 1, 1 2224GEN_GRAIN_UV_FN 422, 1, 0 2225FGUV_FN 422, 1, 0 2226GEN_GRAIN_UV_FN 444, 0, 0 2227FGUV_FN 444, 0, 0 2228 2229%endif ; ARCH_X86_64 2230