; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

; Syntax: NASM with the x86inc.asm macro layer (cglobal / DEFINE_ARGS / mN
; register aliases). Everything below is only assembled for x86-64.
%if ARCH_X86_64

SECTION_RODATA 64

; vpermi2b/vpermt2b index tables (loaded into m11/m12 by FGUV_FN): they pick
; the even resp. odd bytes out of a pair of 64-byte luma registers so that
; horizontally adjacent luma samples can be averaged with pavgb.
pb_even: db   0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
         db  32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
         db  64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
         db  96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
pb_odd:  db   1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
         db  33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
         db  65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
         db  97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
; pshufb control (kept in m4) applied to the looked-up scaling bytes before
; they are multiplied with the grain.
interleave_hl:   db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7
; Horizontal-overlap weight pairs, broadcast as a qword into m10.
pb_27_17_17_27:  db 27, 17, 17, 27, 0, 32, 0, 32
pb_23_22_0_32:   db 23, 22, 0, 32, 0, 32, 0, 32
; NOTE: the order and size of the next five entries is load-bearing.
; The 4:2:2 vertical-overlap path loads 32 bytes starting at pb_27_17 with
; vbroadcasti32x8 and then keeps dword 0 of every 128-bit lane (pshufd q0000),
; which relies on pb_17_27 sitting exactly 16 bytes after pb_27_17.
pb_27_17:        times 2 db 27, 17
pb_23_22:        times 2 db 23, 22
pw_8:            times 2 dw 8
pw_1024:         times 2 dw 1024
pb_17_27:        times 2 db 17, 27
; Output clamp bounds, one dword per entry. fg_min is indexed with
; clip_to_restricted_range*4; fg_max with clip*8 (fgy) or (clip<<is_id)*4 (fguv).
fg_max:          times 4 db 255
                 times 4 db 240
                 times 4 db 235
fg_min:          times 4 db 0
                 times 4 db 16
; pmulhrsw multipliers indexed by scaling_shift; the "-32" bias at the load
; sites means the first entry is the one selected for scaling_shift == 8.
noise_rnd:       times 2 dw 128
                 times 2 dw 64
                 times 2 dw 32
                 times 2 dw 16

SECTION .text

;------------------------------------------------------------------------------
; fgy_32x32xn_8bpc(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
;
; Adds film grain to 8-bit samples in columns of 32 pixels, two rows per
; .add_noise* call (row 0 in the low 256 bits, row 1 in the high 256 bits).
; grain_lut, h and sby are (re)read from their argument slots; "see" and
; "overlap" are additional register names, not arguments.
;
; Register roles that stay fixed after the prologue:
;   m0-m3  256-byte scaling table (scalingq+0..255)
;   m4     interleave_hl shuffle      m5   zero
;   m6     noise_rnd[scaling_shift]   m7   min clamp    m8   max clamp
;   m9     pw_1024                    m10  h-overlap weights
;   m12    v-overlap weights: {27,17} pairs in the low 256 bits (k1 bits 0-3
;          select qwords 0-3 of the vmovdqa64), {17,27} pairs in the high half
;   k1     h_overlap mask 0x0000000f0000000f
; r11 only serves as the rodata base until the constants are loaded; it is
; reused (top_offxy) in the overlap paths.
; "base" stays defined after this function and is reused by FGUV_FN below.
;------------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             sbyd, sbym
    mov             overlapd, [fg_dataq+FGData.overlap_flag]
    mov             r12, 0x0000000f0000000f ; h_overlap mask
    mova            m0, [scalingq+64*0]
    mova            m1, [scalingq+64*1]
    mova            m2, [scalingq+64*2]
    mova            m3, [scalingq+64*3]
    kmovq           k1, r12
    vbroadcasti32x4 m4, [base+interleave_hl]
    vpbroadcastd    ym16, [base+pb_27_17]
    vpbroadcastd    m12, [base+pb_17_27]
    vpbroadcastd    m6, [base+noise_rnd+r6*4-32]
    test            sbyd, sbyd
    setnz           r6b
    vpbroadcastd    m7, [base+fg_min+r7*4]
    vpbroadcastd    m8, [base+fg_max+r7*8]
    pxor            m5, m5
    vpbroadcastd    m9, [base+pw_1024]
    vpbroadcastq    m10, [base+pb_27_17_17_27]
    vmovdqa64       m12{k1}, m16
    ; vertical overlap is only taken when sby != 0 and the overlap flag is set
    test            r6b, overlapb
    jnz .v_overlap

    ; derive the 16-bit row seed from sby and fg_data->seed
    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    ; src_bak = end of the row, w counts up from -w to 0, and dst becomes
    ; (dst - src) so that [dstq+srcq] addresses the output
    lea             src_bakq, [srcq+wq]
    neg             wq
    sub             dstq, srcq
.loop_x:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y:
    ; two consecutive grain rows (82 bytes apart) -> low/high half of m21
    movu            ym21, [grain_lutq+offxyq-82]
    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
    call .add_noise
    sub             hb, 2
    jg .loop_y
    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq]
    test            overlapd, overlapd
    jz .loop_x
    test            sbyd, sbyd
    jnz .hv_overlap

.loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy

    rorx            offyd, seed, 8
    mov             left_offxyd, offxd      ; previous column's offy*stride
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y_h_overlap:
    movu            ym20, [grain_lutq+offxyq-82]
    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
    ; 4 bytes per row from the previous column's grain block (32 bytes further
    ; right than this column's offset within it)
    movd            xm19, [grain_lutq+left_offxyq-50]
    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    punpcklbw       m19, m20
    pmaddubsw       m19, m10, m19
    pmulhrsw        m19, m9
    punpckhbw       m21, m20, m5
    ; only the bytes selected by k1 are replaced with the blended values
    packsswb        m20{k1}, m19, m19
    punpcklbw       m20, m5, m20
    call .add_noise_h
    sub             hb, 2
    jg .loop_y_h_overlap
    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq]
    test            sbyd, sbyd
    jnz .hv_overlap
    jmp .loop_x_h_overlap

.v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
                h, sby, see, overlap

    ; compute the seeds of the current and the previous (top) row in parallel,
    ; packed into one 32-bit register
    movzx           r6d, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, r6d, 173 * 0x00010001
    imul            r6d, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             r6d, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             r6d, 0xff00ff00
    xor             seed, r7d
    xor             seed, r6d               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea             src_bakq, [srcq+wq]
    neg             wq
    sub             dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap, top_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
    movu            ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movu            ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    punpckhbw       m20, m21, m19
    punpcklbw       m21, m19
    call .add_noise_v
    sub             hb, 2
    ; only the first row pair is blended vertically; the rest of the column
    ; goes through the plain loop
    jg .loop_y
    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to h+v overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov             topleft_offxyd, top_offxyd
    rorx            offyd, seed, 8
    mov             left_offxyd, offxd
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
    movu            ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movd            xm16, [grain_lutq+left_offxyq-50]
    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
    movu            ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    movd            xm17, [grain_lutq+topleft_offxyq-50]
    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m19
    pmaddubsw       m16, m10, m16
    punpcklbw       m17, m21
    pmaddubsw       m17, m10, m17
    punpckhbw       m20, m21, m19
    pmulhrsw        m16, m9
    pmulhrsw        m17, m9
    packsswb        m19{k1}, m16, m16
    packsswb        m21{k1}, m17, m17
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw       m21, m19
    call .add_noise_v
    sub             hb, 2
    jg .loop_y_h_overlap
    add             wq, 32
    lea             srcq, [src_bakq+wq]     ; lea leaves the flags of "add" intact
    jl .hv_overlap
.end:
    RET
ALIGN function_align
; Local subroutines with three fall-through entry points:
;   .add_noise_v  in: m21/m20 = interleaved {top, cur} grain bytes (low/high)
;                 -> blends them with the m12 weights, then falls through
;   .add_noise    in: m21 = grain bytes -> widened into m20 (low) / m21 (high)
;                     with the grain byte in the upper half of each word
;   .add_noise_h  in: m20/m21 already widened
; All of them process two source rows, store two output rows, and advance
; srcq by 2*stride and grain_lutq by 2*82.
.add_noise_v:
    pmaddubsw       m20, m12, m20
    pmaddubsw       m21, m12, m21
    pmulhrsw        m20, m9
    pmulhrsw        m21, m9
    packsswb        m21, m20
.add_noise:
    punpcklbw       m20, m5, m21
    punpckhbw       m21, m5
.add_noise_h:
    mova            ym18, [srcq+strideq*0]
    vinserti32x8    m18, [srcq+strideq*1], 1
    mova            m19, m0
    punpcklbw       m16, m18, m5
    vpermt2b        m19, m18, m1            ; scaling[  0..127]
    vpmovb2m        k2, m18                 ; k2 = top bit of each source byte
    punpckhbw       m17, m18, m5
    vpermi2b        m18, m2, m3             ; scaling[128..255]
    vmovdqu8        m19{k2}, m18            ; scaling[src]
    pshufb          m19, m4
    pmaddubsw       m18, m19, m20
    pmaddubsw       m19, m21
    add             grain_lutq, 82*2
    pmulhrsw        m18, m6                 ; noise
    pmulhrsw        m19, m6
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17
    pmaxub          m16, m7
    pminub          m16, m8
    mova            [dstq+srcq], ym16
    add             srcq, strideq
    vextracti32x8   [dstq+srcq], m16, 1
    add             srcq, strideq
    ret

;------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Emits fguv_32x32xn_i<name>_8bpc. ss_hor / ss_ver select the horizontally /
; vertically subsampled code paths (instantiated below as 420: 1,1;
; 422: 1,0; 444: 0,0).
;
; With ss_hor set, one add_noise call handles four 16-byte chroma rows, h is
; decremented by 4 per call, and columns advance by 16; otherwise two 32-byte
; rows, h decrements by 2, and columns advance by 32.
;
; The loop body is expanded twice through the inner macro: first with
; not-csfl = 1, then at .csfl with not-csfl = 0 (taken when
; FGData.chroma_scaling_from_luma is non-zero). The second expansion jumps
; into .add_noise_main, which only exists in the first one.
;
; Uses "base" (r11-fg_min) as defined in fgy_32x32xn_8bpc above. r11 is the
; rodata base only until the constants are loaded; afterwards r11/r12 hold
; src+w / dst+w (spilled to the r11m/r12m slots) and r13 holds the luma row end.
;------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
                                             scaling, grain_lut, h, sby, luma, \
                                             overlap, uv_pl, is_id, _, stride3
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm
    mov             sbyd, sbym
    mov             overlapd, [fg_dataq+FGData.overlap_flag]
%if %2
    mov             r12, 0x000f000f000f000f ; h_overlap mask
    vpbroadcastq    m10, [base+pb_23_22_0_32]
    lea             stride3q, [strideq*3]
%else
    mov             r12, 0x0000000f0000000f
    vpbroadcastq    m10, [base+pb_27_17_17_27]
%endif
    mova            m0, [scalingq+64*0]
    mova            m1, [scalingq+64*1]
    mova            m2, [scalingq+64*2]
    mova            m3, [scalingq+64*3]
    kmovq           k1, r12
    vbroadcasti32x4 m4, [base+interleave_hl]
    vpbroadcastd    m6, [base+noise_rnd+r6*4-32]
    vpbroadcastd    m7, [base+fg_min+r7*4]
    ; fg_max index = clip << is_id (0 -> 255, 1 -> 240, 2 -> 235)
    shlx            r7d, r7d, r9d
    vpbroadcastd    m8, [base+fg_max+r7*4]
    test            sbyd, sbyd
    setnz           r7b
    vpbroadcastd    m9, [base+pw_1024]
    mova            m11, [base+pb_even]
    mova            m12, [base+pb_odd]
    pxor            m5, m5
    mov             r5, r10mp               ; lstride
    cmp             byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                h, sby, see, overlap, uv_pl, _, _, stride3
%if %1
    mov             r6d, uv_plm
    vpbroadcastd    m16, [base+pw_8]
    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    pshufb          m14, m16                ; uv_luma_mult, uv_mult
%endif
    test            r7b, overlapb
    jnz %%v_overlap

    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, _, _, _, stride3

    mov             lumaq, r9mp
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    lea             r13, [lumaq+wq*(1+%2)]
    ; spill the row-end pointers so they can be reloaded for every column
    mov             r11mp, r11
    mov             r12mp, r12
    neg             wq

%%loop_x:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, _, _, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y:
%if %2
    movu            xm21, [grain_lutq+offxyq+82*0]
    vinserti128     ym21, [grain_lutq+offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
%else
    movu            ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
%endif
    call %%add_noise
    sub             hb, 2<<%2
    jg %%loop_y
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq
    test            overlapd, overlapd
    jz %%loop_x
    cmp             dword r8m, 0            ; sby
    jne %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, _, _, _, stride3

    lea             left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, _, _, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y_h_overlap:
%if %2
    movu            xm20, [grain_lutq+offxyq     +82*0]
    movd            xm19, [grain_lutq+left_offxyq+82*0]
    vinserti32x4    ym20, [grain_lutq+offxyq     +82*1], 1
    vinserti32x4    ym19, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m20, [grain_lutq+offxyq     +82*2], 2
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m20, [grain_lutq+offxyq     +82*3], 3
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*3], 3
%else
    movu            ym20, [grain_lutq+offxyq     + 0]
    movd            xm19, [grain_lutq+left_offxyq+ 0]
    vinserti32x8    m20, [grain_lutq+offxyq     +82], 1
    vinserti32x4    m19, [grain_lutq+left_offxyq+82], 2
%endif
    punpcklbw       m19, m20
    pmaddubsw       m19, m10, m19
    punpckhbw       m21, m20, m5
    pmulhrsw        m19, m9
    vpacksswb       m20{k1}, m19, m19
    punpcklbw       m20, m5, m20
    call %%add_noise_h
    sub             hb, 2<<%2
    jg %%loop_y_h_overlap
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq
    cmp             dword r8m, 0            ; sby
    jne %%hv_overlap
    jmp %%loop_x_h_overlap

%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                _, sby, see, overlap, _, _, _, stride3

    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd              ; (cur_seed << 16) | top_seed

    ; vertical blend weights (m13) and, for the subsampled layouts, the write
    ; mask (k3) limiting the blend to the first row(s) of m21
%if %3
    vpbroadcastd    m13, [base+pb_23_22]
    kxnorw          k3, k3, k3              ; v_overlap mask
%elif %2
    vbroadcasti32x8 m13, [base+pb_27_17]
    kxnord          k3, k3, k3
    pshufd          m13, m13, q0000         ; 8x27_17, 8x17_27
%else
    vpbroadcastd    ym16, [base+pb_27_17]
    vpbroadcastd    m13, [base+pb_17_27]
    vmovdqa64       m13{k1}, m16
%endif

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, top_offxy, _, _, stride3

    mov             lumaq, r9mp
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    lea             r13, [lumaq+wq*(1<<%2)]
    mov             r11mp, r11
    mov             r12mp, r12
    neg             wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0x000f000f
    and             offxd, 0x000f000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, top_offxy, _, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16

%if %3
    movu            xm18, [grain_lutq+offxyq+82*0]
    movu            xm20, [grain_lutq+top_offxyq+82*0]
    ; only interpolate first line, insert remaining line unmodified
    vbroadcasti128  ym21, [grain_lutq+offxyq+82*1]
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw       xm19, xm20, xm18
    punpckhbw       xm20, xm18
%elif %2
    movu            xm18, [grain_lutq+offxyq+82*0]
    vinserti128     ym18, [grain_lutq+offxyq+82*1], 1
    movu            xm20, [grain_lutq+top_offxyq+82*0]
    vinserti32x4    ym20, [grain_lutq+top_offxyq+82*1], 1
    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw       ym19, ym20, ym18
    punpckhbw       ym20, ym18
%else
    movu            ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movu            ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
%endif
    call %%add_noise_v
    sub             hb, 2<<%2
    jg %%loop_y
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq

%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    lea             topleft_offxyd, [top_offxyq+(32>>%2)]
    lea             left_offxyd, [offyq+(32>>%2)]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0x000f000f
    and             offxd, 0x000f000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16

%if %2
    movu            xm21, [grain_lutq+offxyq+82*0]
    movd            xm16, [grain_lutq+left_offxyq+82*0]
    vinserti128     ym21, [grain_lutq+offxyq+82*1], 1
    vinserti128     ym16, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
    movd            xm18, [grain_lutq+topleft_offxyq+82*0]
    movu            xm20, [grain_lutq+top_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m21
%if %3
    punpcklbw       xm18, xm20
%else
    vinserti128     ym18, [grain_lutq+topleft_offxyq+82*1], 1
    vinserti128     ym20, [grain_lutq+top_offxyq+82*1], 1
    punpcklbw       ym18, ym20
%endif
    punpcklqdq      m16, m18
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vmovdqu8        m21{k1}, m16
%if %3
    vpalignr        xm20{k1}, xm16, xm16, 4
    punpcklbw       xm19, xm20, xm21
    punpckhbw       xm20, xm21
%else
    vpalignr        ym20{k1}, ym16, ym16, 4
    punpcklbw       ym19, ym20, ym21
    punpckhbw       ym20, ym21
%endif
%else
    movu            ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movd            xm16, [grain_lutq+left_offxyq+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
    movu            ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
    movd            xm18, [grain_lutq+topleft_offxyq+82*0]
    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
    punpcklbw       m16, m21
    punpcklbw       m18, m20
    punpcklqdq      m16, m18
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vpalignr        m20{k1}, m16, m16, 4
    vmovdqu8        m21{k1}, m16
%endif
    call %%add_noise_v
    sub             hb, 2<<%2
    jg %%loop_y_h_overlap
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r13+wq*(1<<%2)]
    add             srcq, wq
    add             dstq, wq
    jmp %%hv_overlap
ALIGN function_align
; Local subroutines, three fall-through entry points (same scheme as in fgy):
;   add_noise_v  blends top/cur grain with the m13 weights into m21
;   add_noise    widens the grain bytes of m21 into m20 (low) / m21 (high)
;   add_noise_h  expects m20/m21 already widened
%%add_noise_v:
%if %3
    pmaddubsw       xm19, xm13, xm19
    pmaddubsw       xm20, xm13, xm20
    pmulhrsw        xm19, xm9
    pmulhrsw        xm20, xm9
    vpacksswb       m21{k3}, m19, m20
%elif %2
    pmaddubsw       ym19, ym13, ym19
    pmaddubsw       ym20, ym13, ym20
    pmulhrsw        ym19, ym9
    pmulhrsw        ym20, ym9
    vpacksswb       m21{k3}, m19, m20
%else
    punpcklbw       m19, m20, m21
    punpckhbw       m20, m21
    pmaddubsw       m19, m13, m19
    pmaddubsw       m20, m13, m20
    pmulhrsw        m19, m9
    pmulhrsw        m20, m9
    packsswb        m21, m19, m20
%endif
%%add_noise:
    punpcklbw       m20, m5, m21
    punpckhbw       m21, m5
%%add_noise_h:
    mova            ym18, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
%if %2
    ; horizontally subsampled: fetch two more luma rows, then average each
    ; even/odd byte pair (pb_even/pb_odd permutes + pavgb) into m18
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    mova            ym16, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
    mova            xm17, [srcq+strideq*0]
    mova            m19, m11
    vpermi2b        m19, m18, m16
    vinserti128     ym17, [srcq+strideq*1], 1
    vpermt2b        m18, m12, m16
    vinserti32x4    m17, [srcq+strideq*2], 2
    pavgb           m18, m19
    vinserti32x4    m17, [srcq+stride3q ], 3
%else
    mova            ym17, [srcq+strideq*0]
    vinserti32x8    m17, [srcq+strideq*1], 1
%endif
%if %1
    ; not-csfl: scaling index = clip_u8(((luma, chroma) . m14 >> 6) + m15)
    punpckhbw       m19, m18, m17
    punpcklbw       m18, m17                ; { luma, chroma }
    pmaddubsw       m19, m14
    pmaddubsw       m18, m14
    psraw           m19, 6
    psraw           m18, 6
    paddw           m19, m15
    paddw           m18, m15
    packuswb        m18, m19
    ; shared tail: in: m18 = scaling index bytes, m17 = chroma source bytes,
    ; m20/m21 = widened grain. Emitted once per function (by this expansion)
    ; and entered by jmp from the csfl expansion.
.add_noise_main:
    mova            m19, m0
    vpermt2b        m19, m18, m1            ; scaling[  0..127]
    vpmovb2m        k2, m18
    vpermi2b        m18, m2, m3             ; scaling[128..255]
    vmovdqu8        m19{k2}, m18            ; scaling[src]
    pshufb          m19, m4
    pmaddubsw       m18, m19, m20
    pmaddubsw       m19, m21
    add             grain_lutq, 82*2<<%2
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    lea             srcq, [srcq+strideq*(2<<%2)]
    pmulhrsw        m18, m6                 ; noise
    pmulhrsw        m19, m6
    punpcklbw       m16, m17, m5            ; chroma
    punpckhbw       m17, m5
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17
    pmaxub          m16, m7
    pminub          m16, m8
%if %2
    mova            [dstq+strideq*0], xm16
    vextracti128    [dstq+strideq*1], ym16, 1
    vextracti32x4   [dstq+strideq*2], m16, 2
    vextracti32x4   [dstq+stride3q ], m16, 3
%else
    mova            [dstq+strideq*0], ym16
    vextracti32x8   [dstq+strideq*1], m16, 1
%endif
    lea             dstq, [dstq+strideq*(2<<%2)]
    ret
%else
    ; csfl: m18 (luma) is used directly as the scaling index
    jmp .add_noise_main
%endif
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64