; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

; Syntax: NASM (Intel operand order) driven by the x86inc.asm macro layer
; (cglobal / DEFINE_ARGS / INIT_ZMM / RET). Everything below is only assembled
; for 64-bit targets.
%if ARCH_X86_64

SECTION_RODATA 16
; Constant tables. All of them are addressed relative to fg_min through the
; per-function "base" define, and most are indexed by is_12bpc (bdmax >> 11).
;
; scale_mask: pshufb control. Per dword it zeroes bytes 1 and 3 and copies byte
;             0 of the source dword into byte 2. It is always applied under the
;             0xeeee... byte mask, so byte 0 of the destination is preserved.
scale_mask:      db -1, -1,  0, -1, -1, -1,  4, -1, -1, -1,  8, -1, -1, -1, 12, -1
; scale_shift: word pairs. Read at [scaling_shift*4-32] (per-word left shift
;              applied to the gathered scaling values) and at [is_12bpc*8+4]
;              (arithmetic right shift applied to the blended overlap grain).
scale_shift:     dw  7,  7,  6,  6,  5,  5,  4,  4
; Overlap blend weights; one qword per bit depth, selected by is_12bpc*8.
; The first qword of each table is 4x the second one.
pw_27_17_17_27:  dw 108, 68, 68, 108, 27, 17, 17, 27
pw_23_22:        dw  92, 88,  0, 128, 23, 22,  0, 32
; fg_min: lower clamp; dword index = clip_to_restricted_range << is_12bpc.
fg_min:          times 2 dw 0
                 times 2 dw 64
                 times 2 dw 256
; fg_max: upper clamp; dword index = is_12bpc + clip*4 in fgy and
;         is_12bpc + (clip << is_id)*2 in fguv.
fg_max:          times 2 dw 1023
                 times 2 dw 4095
                 times 2 dw 960
                 times 2 dw 3840
                 times 2 dw 940
                 times 2 dw 3760
; scale_rnd: accumulator seed for the overlap dot products, index = is_12bpc.
scale_rnd:       dd 64
                 dd 16
; uv_offset_mul: multiplied (pmaddwd) with FGData.uv_offset, index = is_12bpc.
uv_offset_mul:   dd 256
                 dd 1024
; pb_8_9_0_1: pshufb control that reorders the words loaded from FGData.uv_mult.
pb_8_9_0_1:      db 8, 9, 0, 1

; Defined in another object file; used below as word-permute indices.
; NOTE(review): presumably the byte sequence 0..63 - confirm at the definition.
cextern pb_0to63

SECTION .text

;------------------------------------------------------------------------------
; fgy_32x32xn_16bpc
;
; Adds scaled grain read from grain_lut to the pixels at src, clamps the result
; to [fg_min entry, fg_max entry] and stores it at dst. Going by the name this
; is presumably the luma variant of the film grain filter.
;
; Register arguments (x86inc order): dst, src, stride, fg_data, w, scaling.
; Stack arguments read by the code : grain_lut (grain_lutmp), h (hm),
;                                    sby (sbym), r9m = bdmax.
;
; Work is done in columns of 32 pixels (w advances by 32) and two rows per
; .add_noise call (h decreases by 2). Four column loops exist, selected by
; FGData.overlap_flag and by whether sby is non-zero:
;   .loop_x            no overlap
;   .loop_x_h_overlap  blend with the previous column only
;   .v_overlap         blend with the block above only (first column)
;   .hv_overlap        blend with both
;
; Vector/mask registers that stay constant after the prologue:
;   m6  = bdmax (dword broadcast)         m7  = scale_mask
;   m8  = lower clamp                     m9  = upper clamp
;   m10 = per-word shift for scaling[]    m11 = right shift for blended grain
;   m12 = overlap weight pair, dword 0    m13 = overlap weight pair, dword 1
;   m19 = rounding constant (scale_rnd)   xm20 = pw_27_17_17_27 qword
;   k1  = 0xffff    k2 = 0xf    k3 = 0xeeeeeeeeeeeeeeee
;
; dst is turned into (dst - src) so that [dstq+srcq] addresses the output while
; only srcq has to be advanced. src_bak holds src + w*2 and w runs from -w up
; to 0.
;------------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, offx, sby, see, offy, src_bak
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, r9m    ; bdmax
    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov            sbyd, sbym
    vpbroadcastd     m6, r9m
    shr             r6d, 11     ; is_12bpc
    vbroadcasti32x4  m7, [base+scale_mask]
    shlx           r10d, r9d, r6d          ; fg_min index = clip << is_12bpc
    vpbroadcastd    m10, [base+scale_shift+r7*4-32]
    lea             r9d, [r6+r9*4]         ; fg_max index = is_12bpc + clip*4
    vpbroadcastd     m8, [base+fg_min+r10*4]
    kxnorw           k1, k1, k1 ; 0xffff
    vpbroadcastd     m9, [base+fg_max+r9*4]
    mov             r12, 0xeeeeeeeeeeeeeeee
    vpbroadcastd    m19, [base+scale_rnd+r6*4]
    kshiftrb         k2, k1, 4  ; 0xf
    vpbroadcastq   xm20, [base+pw_27_17_17_27+r6*8]
    kmovq            k3, r12
    vpbroadcastd    m11, [base+scale_shift+r6*8+4]
    test           sbyd, sbyd
    setnz           r7b                    ; r7b = (sby != 0)
    vpbroadcastd    m12, [base+pw_27_17_17_27+r6*8+0]
    vpbroadcastd    m13, [base+pw_27_17_17_27+r6*8+4]
    ; vertical overlap needs both sby != 0 and the overlap flag
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz .v_overlap

    ; Initial 16-bit seed, derived from sby and xor'ed with FGData.seed.
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]
    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

.loop_x:
    ; Seed step: seed = (seed >> 1) | (bit << 15), where bit is the xor of seed
    ; bits 0, 1, 3 and 12 (the only bits left variable by the 0xeff4 mask when
    ; the low and high byte are and'ed by the test instruction).
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    ; offxy takes over the argument slot previously named offy, h the slot of
    ; offx.
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y:
    ; m4/m5 = grain for two consecutive rows (rows are 82 words apart)
    movu             m4, [grain_lutq+offxyq*2+82*0]
    movu             m5, [grain_lutq+offxyq*2+82*2]
    call .add_noise
    sub              hb, 2
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je .loop_x
    test           sbyd, sbyd
    jnz .hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, left_offxy

    ; offyq still holds the previous column's offxy here. The +73 bias is
    ; compensated by the -82*1 / +82*1 byte displacements used when reading
    ; through left_offxy below.
    lea     left_offxyd, [offyq+73]        ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    movu             m4, [grain_lutq+offxyq*2+82*0]
    movu             m5, [grain_lutq+offxyq*2+82*2]
    ; xm17 = { 2 left words of row 0, 2 left words of row 1 }
    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
    pinsrd         xm17, [grain_lutq+left_offxyq*2+82*1], 1
    punpckldq      xm16, xm4, xm5
    punpcklwd      xm17, xm16              ; interleave (left, cur) word pairs
    mova           xm16, xm19
    vpdpwssd       xm16, xm20, xm17        ; rnd + left*w0 + cur*w1
    psrad          xm16, 1
    packssdw       xm16, xm16
    vpsravw        xm16, xm11
    ; replace the first two grain words of each row with the blended values
    vmovdqu8     m4{k2}, m16
    vpalignr     m5{k2}, m16, m16, 4
    call .add_noise
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    test           sbyd, sbyd
    jnz .hv_overlap
    jmp .loop_x_h_overlap

.v_overlap:
    ; Two seeds are computed side by side in one 32-bit register, see the
    ; "(cur_seed << 16) | top_seed" layout noted below.
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, _, top_offxy

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, _, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    ; split the packed pair into two separate offsets
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    movu            m16, [grain_lutq+offxyq*2+82*0]
    movu             m0, [grain_lutq+top_offxyq*2+82*0]
    movu            m17, [grain_lutq+offxyq*2+82*2]
    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    ; interleave (top, cur) words; .add_noise_v expects row 0 in m0/m4 and
    ; row 1 in m1/m5 (low/high halves respectively)
    punpckhwd        m4, m0, m16
    punpcklwd        m0, m16
    punpckhwd        m5, m1, m17
    punpcklwd        m1, m17
    call .add_noise_v
    sub              hb, 2
    ; only the first two rows are blended vertically; the remaining rows of
    ; this column go through the plain row loop
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to .hv_overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy

    ; derive the neighbour offsets from the previous column's offsets before
    ; they are overwritten (same +73 bias as in .loop_x_h_overlap)
    lea  topleft_offxyd, [top_offxyq+73]
    lea     left_offxyd, [offyq+73]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    movu             m5, [grain_lutq+offxyq*2+82*0]
    movu             m0, [grain_lutq+top_offxyq*2+82*0]
    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
    pinsrd         xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
    movu             m2, [grain_lutq+offxyq*2+82*2]
    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    movd           xm18, [grain_lutq+left_offxyq*2+82*1]
    pinsrd         xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
    ; horizontal blend of the first two words of cur and top, row 0 (xm16)
    ; and row 1 (xm17)
    punpckldq      xm16, xm5, xm0
    punpcklwd      xm17, xm16
    mova           xm16, xm19
    vpdpwssd       xm16, xm20, xm17
    punpckldq      xm17, xm2, xm1
    punpcklwd      xm18, xm17
    mova           xm17, xm19
    vpdpwssd       xm17, xm20, xm18
    ; (top, cur) interleave for the vertical blend
    punpckhwd        m4, m0, m5
    punpcklwd        m0, m5
    punpckhwd        m5, m1, m2
    punpcklwd        m1, m2
    psrad          xm16, 1
    psrad          xm17, 1
    packssdw       xm16, xm17
    vpsravw        xm16, xm11
    ; xm16 words = { cur0, cur1, top0, top1 } per row; q1302 reorders them to
    ; the (top, cur) interleave used by m0/m1
    vpshuflw     m0{k2}, m16, q1302
    punpckhqdq     xm16, xm16
    vpshuflw     m1{k2}, m16, q1302
    call .add_noise_v
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    lea            srcq, [src_bakq+wq*2]   ; lea leaves the flags of add intact
    jl .hv_overlap
.end:
    RET

;------------------------------------------------------------------------------
; .add_noise_v / .add_noise (internal, reached with call)
;
; .add_noise_v in : m0/m4 = row 0 (top, cur) word pairs, low/high halves
;                   m1/m5 = row 1 (top, cur) word pairs, low/high halves
;                   Blends them with the m12 (row 0) and m13 (row 1) weights
;                   into m4/m5 and falls through into .add_noise.
; .add_noise   in : m4, m5 = grain for two rows at srcq and srcq+stride
;              out: two rows stored at [dstq+srcq]; srcq advanced by two rows,
;                   grain_lutq advanced by 82*4 bytes
;              clobbers m0-m5, m16, m17, k4
;------------------------------------------------------------------------------
ALIGN function_align
.add_noise_v:
    mova             m2, m19
    vpdpwssd         m2, m12, m4
    mova             m3, m19
    vpdpwssd         m3, m13, m5
    mova             m4, m19
    vpdpwssd         m4, m12, m0
    mova             m5, m19
    vpdpwssd         m5, m13, m1
    REPX   {psrad x, 1}, m2, m3, m4, m5
    packssdw         m4, m2
    packssdw         m5, m3
    vpsravw          m4, m11
    vpsravw          m5, m11
.add_noise:
    mova             m0, [srcq+strideq*0]
    mova             m1, [srcq+strideq*1]
    ; Gathers consume their mask, hence the repeated reloads of k4.
    ; Even pixels: index = dword & m6. Odd pixels: index = dword >> 16, and
    ; the gather is skipped for lanes whose index exceeds bdmax.
    kmovw            k4, k1
    pand            m16, m6, m0
    psrld            m3, m0, 16
    vpgatherdd   m2{k4}, [scalingq+m16]
    vpcmpud          k4, m3, m6, 2 ; px <= bdmax
    vpgatherdd  m16{k4}, [scalingq+m3]
    kmovw            k4, k1
    pand            m17, m6, m1
    vpgatherdd   m3{k4}, [scalingq+m17]
    ; merge: keep byte 0 (even pixel), byte 2 = odd pixel, bytes 1/3 = 0
    vpshufb      m2{k3}, m16, m7
    psrld           m16, m1, 16
    vpcmpud          k4, m16, m6, 2
    vpgatherdd  m17{k4}, [scalingq+m16]
    vpshufb      m3{k3}, m17, m7
    vpsllvw          m2, m10
    vpsllvw          m3, m10
    pmulhrsw         m4, m2                ; scaled grain, row 0
    pmulhrsw         m5, m3                ; scaled grain, row 1
    add      grain_lutq, 82*4
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m8
    pmaxsw           m1, m8
    pminsw           m0, m9
    pminsw           m1, m9
    mova    [dstq+srcq], m0
    add            srcq, strideq
    mova    [dstq+srcq], m1
    add            srcq, strideq
    ret

;------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Emits fguv_32x32xn_i<name>_16bpc. ss_hor / ss_ver are 0 or 1 and select the
; horizontally / vertically subsampled code paths (see the three instantiations
; at the end of the file).
;
; Register arguments (x86inc order): dst, src, stride, fg_data, w, scaling.
; Stack arguments read by the code : grain_lut (grain_lutmp), h (hm),
;   sby (sbym / r8m), luma (r9mp), lstride (r10mp), uv_pl (r11m),
;   is_id (is_idm), r13m = bdmax.
; After the prologue the r9m/r10m/r11m stack slots are reused to hold
; src + w*2, dst + w*2 and luma + w*(2 << ss_hor).
;
; Constant vector/mask registers after the prologue:
;   m4  = bdmax in every word (luma index clamp / mask)
;   m5  = bdmax (dword broadcast)         m6  = scale_mask
;   m7  = per-word shift for scaling[]    m8  = lower clamp, m9 = upper clamp
;   m10 = horizontal overlap weights      m11 = vertical overlap weights
;   m12 = pw_even, m13 = pw_odd           (ss_hor only)
;   m14 = scaled uv_offset, m15 = { uv_luma_mult, uv_mult } (not-csfl only)
;   m20 = rounding constant               m21 = right shift for blended grain
;   k1  = 0xffff, k2 = 0xeeeeeeeeeeeeeeee
;   k3  = 0x0101 (ss_hor) or 0x01,  k4 = 0x0f (no ss_hor only)
;
; The body is generated twice by the nested loop macro: once for the regular
; path and once, after .csfl, for FGData.chroma_scaling_from_luma != 0.
; .add_noise_main is defined only in the first expansion and is jumped to by
; the second one.
;------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r12-fg_min
    lea             r12, [fg_min]
    mov             r9d, r13m   ; bdmax
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    mov            r11d, is_idm
    kxnorw           k1, k1, k1 ; 0xffff
    vpbroadcastd     m5, r13m
    mov             r13, 0xeeeeeeeeeeeeeeee
    vbroadcasti32x4  m6, [base+scale_mask]
    shr             r9d, 11     ; is_12bpc
    vpbroadcastd     m7, [base+scale_shift+r7*4-32]
    shlx           r10d, r6d, r9d          ; fg_min index = clip << is_12bpc
    mov            sbyd, sbym
    shlx            r6d, r6d, r11d         ; clip << is_id
    vpbroadcastd     m8, [base+fg_min+r10*4]
    lea             r6d, [r9+r6*2]         ; fg_max index
    vpbroadcastd     m9, [base+fg_max+r6*4]
    kmovq            k2, r13
    vpbroadcastd    m20, [base+scale_rnd+r9*4]
    packssdw         m4, m5, m5
    vpbroadcastd    m21, [base+scale_shift+r9*8+4]
%if %2
    mova            m12, [pb_0to63] ; pw_even
    mov            r13d, 0x0101
    vpbroadcastq    m10, [base+pw_23_22+r9*8]
    kmovw            k3, r13d
%if %3
    pshufd          m11, m10, q0000
%else
    ; low half of m11 takes weight dword 0, high half weight dword 1
    vpbroadcastd   ym16, [base+pw_27_17_17_27+r9*8+0]
    vpbroadcastd    m11, [base+pw_27_17_17_27+r9*8+4]
    vmovdqu16   m11{k1}, m16
%endif
    psrlw           m13, m12, 8 ; pw_odd
%else
    vpbroadcastq    m10, [base+pw_27_17_17_27+r9*8]
    kshiftrb         k3, k1, 7  ; 0x01
    kshiftrb         k4, k1, 4  ; 0x0f
    pshufd          m11, m10, q0000
%endif
    mov        lstrideq, r10mp
    test           sbyd, sbyd
    setnz           r7b                    ; r7b = (sby != 0)
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

; Column/row loops plus the add_noise helpers, expanded once per csfl mode.
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                _, sby, see, lstride

%if %1
    mov             r6d, r11m              ; plane index into uv_offset/uv_mult
    vpbroadcastd     m0, [base+uv_offset_mul+r9*4]
    vpbroadcastd     m1, [base+pb_8_9_0_1]
    vpbroadcastd    m14, [fg_dataq+FGData.uv_offset+r6*4]
    vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
    pmaddwd         m14, m0
    pshufb          m15, m1 ; { uv_luma_mult, uv_mult }
%endif
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz %%v_overlap

    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma

    ; Save the end-of-row pointers in the stack slots of already consumed
    ; arguments; they are reloaded at the start of every column.
    mov           lumaq, r9mp
    lea             r12, [srcq+wq*2]
    lea             r13, [dstq+wq*2]
    lea             r14, [lumaq+wq*(2<<%2)]
    mov            r9mp, r12
    mov           r10mp, r13
    mov           r11mp, r14
    neg              wq

%%loop_x:
    ; same seed step as in fgy: new bit 15 = xor of seed bits 0, 1, 3 and 12
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y:
%if %2
    ; 16 grain words per row: two rows are packed into each zmm
    movu           ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu           ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+offxyq*2+82*2]
%endif
    call %%add_noise
    sub              hb, 2<<%2
    jg %%loop_y
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r10mp
    mov           lumaq, r11mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je %%loop_x
    cmp       dword r8m, 0 ; sby
    jne %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy

    lea     left_offxyd, [offyq+(32>>%2)]  ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y_h_overlap:
%if %2
    movu           ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu           ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    ; left-column words of the four rows, placed in 128-bit lanes 0 and 2 so
    ; that they line up with the row packing of m18/m19
    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd           xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    punpckldq       m16, m17
    punpckldq       m17, m18, m19
    punpcklwd       m16, m17               ; (left, cur) word pairs
    mova            m17, m20
    vpdpwssd        m17, m16, m10
    psrad           m17, 1
    packssdw        m17, m17
    vpsravw         m17, m21
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+offxyq*2+82*2]
    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
    pinsrd         xm16, [grain_lutq+left_offxyq*2+82*2], 1
    punpckldq      xm17, xm18, xm19
    punpcklwd      xm16, xm17              ; (left, cur) word pairs
    mova           xm17, xm20
    vpdpwssd       xm17, xm16, xm10
    psrad          xm17, 1
    packssdw       xm17, xm17
    vpsravw        xm17, xm21
%endif
    ; overwrite the leading grain dword of each row with the blended values
    vmovdqa32   m18{k3}, m17
    vpshufd     m19{k3}, m17, q0321
    call %%add_noise
    sub              hb, 2<<%2
    jg %%loop_y_h_overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r10mp
    mov           lumaq, r11mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    cmp       dword r8m, 0 ; sby
    jne %%hv_overlap
    jmp %%loop_x_h_overlap

%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                _, sby, see, lstride

    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, _, top_offxy

    mov           lumaq, r9mp
    lea             r12, [srcq+wq*2]
    lea             r13, [dstq+wq*2]
    lea             r14, [lumaq+wq*(2<<%2)]
    mov            r9mp, r12
    mov           r10mp, r13
    mov           r11mp, r14
    neg              wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, _, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

%if %3
    ; only the first grain row is blended with the row from the block above
    movu           ym16, [grain_lutq+offxyq*2+82*0]
    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
    movu           ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd      ym17, ym1, ym16
    punpckhwd       ym1, ym16
%elif %2
    movu           ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu           ym17, [grain_lutq+top_offxyq*2+82*0]
    vinserti32x8    m17, [grain_lutq+top_offxyq*2+82*2], 1
    movu           ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd       m16, m17, m18
    punpckhwd       m17, m18
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    movu             m2, [grain_lutq+offxyq*2+82*2]
    movu            m16, [grain_lutq+top_offxyq*2+82*2]
    ; row 0 is interleaved as (top, cur), row 1 as (cur, top), so that the
    ; single weight pair in m11 serves both rows
    punpckhwd        m1, m19, m18
    punpcklwd       m19, m18
    punpckhwd       m18, m2, m16
    punpcklwd        m2, m16
%endif
    call %%add_noise_v
    sub              hb, 2<<%2
    jg %%loop_y
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r10mp
    mov           lumaq, r11mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to %%v_overlap, and instead always fall-through to %%hv_overlap
%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    ; neighbour offsets come from the previous column's offsets, which are
    ; about to be overwritten
    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
    lea     left_offxyq, [offyq+(32>>%2)]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd           xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    movu           ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu           ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpckldq       m16, m17
    punpckldq       m17, m18, m19
    punpcklwd       m16, m17               ; (left, cur) word pairs
    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    movd           xm17, [grain_lutq+topleft_offxyq*2+82*0]
    mova             m0, m20
    vpdpwssd         m0, m16, m10          ; horizontal blend of the cur rows
%if %3
    punpcklwd      xm17, xm1               ; (topleft, top) word pairs
    mova           xm16, xm20
    vpdpwssd       xm16, xm17, xm10        ; horizontal blend of the top row
    psrad          xm16, 1
%else
    vinserti32x8     m1, [grain_lutq+top_offxyq*2+82*2], 1
    vinserti32x4    m17, [grain_lutq+topleft_offxyq*2+82*2], 2
    punpcklwd       m17, m1                ; (topleft, top) word pairs
    mova            m16, m20
    vpdpwssd        m16, m17, m10          ; horizontal blend of the top rows
    psrad           m16, 1
%endif
    psrad            m0, 1
    packssdw         m0, m16
    vpsravw          m0, m21
    ; patch the blended leading dwords into cur (m18/m19) and top (m1) before
    ; interleaving (top, cur) for the vertical blend
    vmovdqa32   m18{k3}, m0
    vpshufd     m19{k3}, m0, q0321
%if %3
    vpunpckhdq  ym1{k3}, ym0, ym0
    punpcklwd      ym17, ym1, ym18
    punpckhwd       ym1, ym18
%else
    vpunpckhdq   m1{k3}, m0, m0
    punpcklwd       m16, m1, m18
    punpckhwd       m17, m1, m18
%endif
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    movd           xm17, [grain_lutq+left_offxyq*2+82*0]
    pinsrd         xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
    punpckldq      xm16, xm18, xm19
    punpcklwd      xm17, xm16
    movu             m2, [grain_lutq+offxyq*2+82*2]
    movu             m0, [grain_lutq+top_offxyq*2+82*2]
    movd           xm16, [grain_lutq+left_offxyq*2+82*2]
    pinsrd         xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
    punpckldq       xm1, xm2, xm0
    punpcklwd       xm1, xm16, xm1
    mova           xm16, xm20
    vpdpwssd       xm16, xm17, xm10
    mova           xm17, xm20
    vpdpwssd       xm17, xm1, xm10
    punpckhwd        m1, m19, m18
    punpcklwd       m19, m18
    punpckhwd       m18, m2, m0
    punpcklwd        m2, m0
    psrad          xm16, 1
    psrad          xm17, 1
    packssdw       xm16, xm17
    vpsravw        xm16, xm21
    ; xm16 words = { cur0, cur1, top0, top1 } per row. Row 0 (m19) is laid
    ; out as (top, cur) -> q1302; row 1 (m2) as (cur, top) -> q3120.
    vpshuflw    m19{k4}, m16, q1302
    punpckhqdq     xm16, xm16
    vpshuflw     m2{k4}, m16, q3120
%endif
    call %%add_noise_v
    sub              hb, 2<<%2
    jg %%loop_y_h_overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r9mp
    mov            dstq, r10mp
    mov           lumaq, r11mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    jmp %%hv_overlap

; add_noise_v: vertical blend of the (top, cur) pairs prepared by the callers
; above using the m11 weights; leaves the grain in m18 (and m19 where the
; second register is blended too) and falls through into add_noise.
;
; add_noise: in  m18, m19 = grain; reads luma and src, writes dst, and
;            advances grain_lutq, lumaq, srcq and dstq to the next row group.
;            Clobbers m0-m3, m16-m19 and k5.
ALIGN function_align
%%add_noise_v:
%if %3
    mova           ym16, ym20
    vpdpwssd       ym16, ym17, ym11
    mova           ym17, ym20
    vpdpwssd       ym17, ym1, ym11
    psrad          ym16, 1
    psrad          ym17, 1
    packssdw       ym16, ym17
    ; k1 = 0xffff covers the low 16 words only, i.e. the first row held in m18
    vpsravw     m18{k1}, m16, m21
%elif %2
    mova            m18, m20
    vpdpwssd        m18, m16, m11
    mova            m16, m20
    vpdpwssd        m16, m17, m11
    psrad           m18, 1
    psrad           m16, 1
    packssdw        m18, m16
    vpsravw         m18, m21
%else
    mova            m16, m20
    vpdpwssd        m16, m1, m11
    mova            m17, m20
    vpdpwssd        m17, m18, m11
    mova            m18, m20
    vpdpwssd        m18, m19, m11
    mova            m19, m20
    vpdpwssd        m19, m2, m11
    REPX   {psrad x, 1}, m16, m17, m18, m19
    packssdw        m18, m16
    packssdw        m19, m17
    vpsravw         m18, m21
    vpsravw         m19, m21
%endif
%%add_noise:
%if %2
    ; Horizontally subsampled: split each pair of luma rows into even and odd
    ; words (m12 / m13 permute indices) and take their rounded average.
    mova             m2, [lumaq+lstrideq*(0<<%3)]
    mova             m0, [lumaq+lstrideq*(1<<%3)]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    mova             m3, [lumaq+lstrideq*(0<<%3)]
    mova             m1, [lumaq+lstrideq*(1<<%3)]
    mova            m16, m12
    vpermi2w        m16, m2, m0
    vpermt2w         m2, m13, m0
    mova            m17, m12
    vpermi2w        m17, m3, m1
    vpermt2w         m3, m13, m1
    pavgw            m2, m16
    pavgw            m3, m17
%elif %1
    mova             m2, [lumaq+lstrideq*0]
    mova             m3, [lumaq+lstrideq*1]
%endif
%if %2
    mova           ym16, [srcq+strideq*0]
    vinserti32x8    m16, [srcq+strideq*1], 1
    lea            srcq, [srcq+strideq*2]
%else
    mova            m16, [srcq+strideq*0]
%endif
%if %1
    ; index = (luma * uv_luma_mult + src * uv_mult + m14) >> 6, saturated to
    ; unsigned 16 bit and limited to bdmax further down
    punpckhwd       m17, m2, m16
    mova             m0, m14
    vpdpwssd         m0, m17, m15
    punpcklwd       m17, m2, m16
    mova             m2, m14
    vpdpwssd         m2, m17, m15
%endif
%if %2
    mova           ym17, [srcq+strideq*0]
    vinserti32x8    m17, [srcq+strideq*1], 1
%else
    mova            m17, [srcq+strideq*1]
%endif
%if %1
    psrad            m0, 6
    psrad            m2, 6
    packusdw         m2, m0
    punpckhwd        m0, m3, m17
    mova             m1, m14
    vpdpwssd         m1, m15, m0
    punpcklwd        m0, m3, m17
    mova             m3, m14
    vpdpwssd         m3, m15, m0
    psrad            m1, 6
    psrad            m3, 6
    packusdw         m3, m1
    pminuw           m2, m4
    pminuw           m3, m4

; Shared tail: defined once (not-csfl expansion), entered by jmp from the
; csfl expansion. Expects the scaling indices as words in m2/m3, the source
; pixels in m16/m17 and the grain in m18/m19.
.add_noise_main:
    ; scaling[luma_src]
    kmovw            k5, k1                ; gathers consume their mask
    pand             m1, m5, m2
    vpgatherdd   m0{k5}, [scalingq+m1]
    kmovw            k5, k1
    psrld            m2, 16
    vpgatherdd   m1{k5}, [scalingq+m2]
    vpshufb      m0{k2}, m1, m6            ; merge even/odd results into words
    kmovw            k5, k1
    psrld            m1, m3, 16
    vpgatherdd   m2{k5}, [scalingq+m1]
    kmovw            k5, k1
    pand             m3, m5
    vpgatherdd   m1{k5}, [scalingq+m3]
    vpshufb      m1{k2}, m2, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    vpsllvw          m0, m7
    vpsllvw          m1, m7
    pmulhrsw        m18, m0
    pmulhrsw        m19, m1
    add      grain_lutq, 82*(4<<%2)
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    lea            srcq, [srcq+strideq*2]
    paddw           m16, m18
    paddw           m17, m19
    pmaxsw          m16, m8
    pmaxsw          m17, m8
    pminsw          m16, m9
    pminsw          m17, m9
%if %2
    mova          [dstq+strideq*0], ym16
    vextracti32x8 [dstq+strideq*1], m16, 1
    lea            dstq, [dstq+strideq*2]
    mova          [dstq+strideq*0], ym17
    vextracti32x8 [dstq+strideq*1], m17, 1
%else
    mova [dstq+strideq*0], m16
    mova [dstq+strideq*1], m17
%endif
    lea            dstq, [dstq+strideq*2]
    ret
%else
    ; csfl: the scaling index is the luma value itself, masked with m4
%if %2
    pand             m2, m4
    pand             m3, m4
%else
    pand             m2, m4, [lumaq+lstrideq*0]
    pand             m3, m4, [lumaq+lstrideq*1]
%endif
    jmp .add_noise_main
%endif
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

; name, ss_hor, ss_ver
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif