; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 30 31filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 32pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 33z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 34z_base_inc_z2: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 35z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 36z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1, 8, 9, 8, 9, 10, 11, 12, 13 37 db 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 38z2_top_shufA: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 39z2_top_shufB: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 40z2_left_shufA: db 14, 15, 12, 13, 10, 11, 8, 9, 12, 13, 10, 11, 8, 9, 6, 7 41z2_left_shufB: db 14, 15, 10, 11, 6, 7, 2, 3, 12, 13, 8, 9, 4, 5, 0, 1 42z_filt_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 43z_filt_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 44 db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 45z_filt_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 46z_filt_wh4: db 7, 7, 19, 7, 47z_filt_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 48ALIGN 8 49pb_2_3: times 4 db 2, 3 50z2_dy_offset: dw 96*64, 96*64, 95*64, 95*64 51z_filt_k: times 4 dw 8 52 times 4 dw 6 53 times 4 dw 4 54 times 4 dw 5 55pw_m3584: times 4 dw -3584 56pw_m3072: times 4 dw -3072 57pw_m2560: times 4 dw -2560 58pw_m2048: times 4 dw -2048 59pw_m1536: times 4 dw -1536 60pw_m1024: times 4 dw -1024 61pw_m512: times 4 dw -512 62pw_1: times 4 dw 1 63pw_2: times 4 dw 2 64pw_3: times 4 dw 3 65pw_62: times 4 dw 62 66pw_256: times 4 dw 256 67pw_512: times 4 dw 512 68pw_2048: times 4 dw 2048 69 70%define pw_4 (z_filt_k+8*2) 71%define pw_8 (z_filt_k+8*0) 72%define pw_m1to4 z2_upsample_l 73 74%macro JMP_TABLE 3-* 75 %xdefine %1_%2_table (%%table - 2*4) 76 %xdefine %%base mangle(private_prefix %+ _%1_%2) 77 
%%table: 78 %rep %0 - 2 79 dd %%base %+ .%3 - (%%table - 2*4) 80 %rotate 1 81 %endrep 82%endmacro 83 84%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) 85%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) 86%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) 87 88JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 89JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ 90 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ 91 s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 92JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 93JMP_TABLE ipred_z1_16bpc, ssse3, w4, w8, w16, w32, w64 94JMP_TABLE ipred_z2_16bpc, ssse3, w4, w8, w16, w32, w64 95JMP_TABLE ipred_z3_16bpc, ssse3, h4, h8, h16, h32, h64 96JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ 97 s4-8*4, s8-8*4, s16-8*4, s32-8*4 98JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 99JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 100JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 101 102cextern smooth_weights_1d_16bpc 103cextern smooth_weights_2d_16bpc 104cextern dr_intra_derivative 105cextern filter_intra_taps 106 107SECTION .text 108 109INIT_XMM ssse3 110cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h 111 LEA r5, ipred_dc_left_16bpc_ssse3_table 112 movd m4, wm 113 tzcnt wd, wm 114 add tlq, 2 115 movifnidn hd, hm 116 pxor m3, m3 117 pavgw m4, m3 118 movd m5, wd 119 movu m0, [tlq] 120 movsxd r6, [r5+wq*4] 121 add r6, r5 122 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table 123 movsxd wq, [r5+wq*4] 124 add wq, r5 125 jmp r6 126 127cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 128 LEA r5, ipred_dc_left_16bpc_ssse3_table 129 mov hd, hm 130 movd m4, hm 131 tzcnt r6d, hd 132 sub tlq, hq 133 tzcnt wd, wm 134 pxor m3, m3 135 sub tlq, hq 136 pavgw m4, m3 137 movd m5, r6d 138 movu m0, [tlq] 139 movsxd r6, 
[r5+r6*4] 140 add r6, r5 141 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table 142 movsxd wq, [r5+wq*4] 143 add wq, r5 144 jmp r6 145.h64: 146 movu m2, [tlq+112] 147 movu m1, [tlq+ 96] 148 paddw m0, m2 149 movu m2, [tlq+ 80] 150 paddw m1, m2 151 movu m2, [tlq+ 64] 152 paddw m0, m2 153 paddw m0, m1 154.h32: 155 movu m1, [tlq+ 48] 156 movu m2, [tlq+ 32] 157 paddw m1, m2 158 paddw m0, m1 159.h16: 160 movu m1, [tlq+ 16] 161 paddw m0, m1 162.h8: 163 movhlps m1, m0 164 paddw m0, m1 165.h4: 166 punpcklwd m0, m3 167 paddd m4, m0 168 punpckhqdq m0, m0 169 paddd m0, m4 170 pshuflw m4, m0, q1032 171 paddd m0, m4 172 psrld m0, m5 173 lea stride3q, [strideq*3] 174 pshuflw m0, m0, q0000 175 punpcklqdq m0, m0 176 jmp wq 177 178cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 179 movifnidn hd, hm 180 tzcnt r6d, hd 181 lea r5d, [wq+hq] 182 movd m4, r5d 183 tzcnt r5d, r5d 184 movd m5, r5d 185 LEA r5, ipred_dc_16bpc_ssse3_table 186 tzcnt wd, wd 187 movsxd r6, [r5+r6*4] 188 movsxd wq, [r5+wq*4+5*4] 189 pxor m3, m3 190 psrlw m4, 1 191 add r6, r5 192 add wq, r5 193 lea stride3q, [strideq*3] 194 jmp r6 195.h4: 196 movq m0, [tlq-8] 197 jmp wq 198.w4: 199 movq m1, [tlq+2] 200 paddw m1, m0 201 punpckhwd m0, m3 202 punpcklwd m1, m3 203 paddd m0, m1 204 paddd m4, m0 205 punpckhqdq m0, m0 206 paddd m0, m4 207 pshuflw m1, m0, q1032 208 paddd m0, m1 209 cmp hd, 4 210 jg .w4_mul 211 psrlw m0, 3 212 jmp .w4_end 213.w4_mul: 214 mov r2d, 0xAAAB 215 mov r3d, 0x6667 216 cmp hd, 16 217 cmove r2d, r3d 218 psrld m0, 2 219 movd m1, r2d 220 pmulhuw m0, m1 221 psrlw m0, 1 222.w4_end: 223 pshuflw m0, m0, q0000 224.s4: 225 movq [dstq+strideq*0], m0 226 movq [dstq+strideq*1], m0 227 movq [dstq+strideq*2], m0 228 movq [dstq+stride3q ], m0 229 lea dstq, [dstq+strideq*4] 230 sub hd, 4 231 jg .s4 232 RET 233.h8: 234 mova m0, [tlq-16] 235 jmp wq 236.w8: 237 movu m1, [tlq+2] 238 paddw m0, m1 239 punpcklwd m1, m0, m3 240 punpckhwd m0, m3 241 paddd m0, m1 242 paddd m4, m0 243 
punpckhqdq m0, m0 244 paddd m0, m4 245 pshuflw m1, m0, q1032 246 paddd m0, m1 247 psrld m0, m5 248 cmp hd, 8 249 je .w8_end 250 mov r2d, 0xAAAB 251 mov r3d, 0x6667 252 cmp hd, 32 253 cmove r2d, r3d 254 movd m1, r2d 255 pmulhuw m0, m1 256 psrlw m0, 1 257.w8_end: 258 pshuflw m0, m0, q0000 259 punpcklqdq m0, m0 260.s8: 261 mova [dstq+strideq*0], m0 262 mova [dstq+strideq*1], m0 263 mova [dstq+strideq*2], m0 264 mova [dstq+stride3q ], m0 265 lea dstq, [dstq+strideq*4] 266 sub hd, 4 267 jg .s8 268 RET 269.h16: 270 mova m0, [tlq-32] 271 paddw m0, [tlq-16] 272 jmp wq 273.w16: 274 movu m1, [tlq+ 2] 275 movu m2, [tlq+18] 276 paddw m1, m2 277 paddw m0, m1 278 punpckhwd m1, m0, m3 279 punpcklwd m0, m3 280 paddd m0, m1 281 paddd m4, m0 282 punpckhqdq m0, m0 283 paddd m0, m4 284 pshuflw m1, m0, q1032 285 paddd m0, m1 286 psrld m0, m5 287 cmp hd, 16 288 je .w16_end 289 mov r2d, 0xAAAB 290 mov r3d, 0x6667 291 test hd, 8|32 292 cmovz r2d, r3d 293 movd m1, r2d 294 pmulhuw m0, m1 295 psrlw m0, 1 296.w16_end: 297 pshuflw m0, m0, q0000 298 punpcklqdq m0, m0 299.s16c: 300 mova m1, m0 301.s16: 302 mova [dstq+strideq*0+16*0], m0 303 mova [dstq+strideq*0+16*1], m1 304 mova [dstq+strideq*1+16*0], m0 305 mova [dstq+strideq*1+16*1], m1 306 mova [dstq+strideq*2+16*0], m0 307 mova [dstq+strideq*2+16*1], m1 308 mova [dstq+stride3q +16*0], m0 309 mova [dstq+stride3q +16*1], m1 310 lea dstq, [dstq+strideq*4] 311 sub hd, 4 312 jg .s16 313 RET 314.h32: 315 mova m0, [tlq-64] 316 paddw m0, [tlq-48] 317 paddw m0, [tlq-32] 318 paddw m0, [tlq-16] 319 jmp wq 320.w32: 321 movu m1, [tlq+ 2] 322 movu m2, [tlq+18] 323 paddw m1, m2 324 movu m2, [tlq+34] 325 paddw m0, m2 326 movu m2, [tlq+50] 327 paddw m1, m2 328 paddw m0, m1 329 punpcklwd m1, m0, m3 330 punpckhwd m0, m3 331 paddd m0, m1 332 paddd m4, m0 333 punpckhqdq m0, m0 334 paddd m0, m4 335 pshuflw m1, m0, q1032 336 paddd m0, m1 337 psrld m0, m5 338 cmp hd, 32 339 je .w32_end 340 mov r2d, 0xAAAB 341 mov r3d, 0x6667 342 cmp hd, 8 343 cmove r2d, r3d 344 
movd m1, r2d 345 pmulhuw m0, m1 346 psrlw m0, 1 347.w32_end: 348 pshuflw m0, m0, q0000 349 punpcklqdq m0, m0 350.s32c: 351 mova m1, m0 352 mova m2, m0 353 mova m3, m0 354.s32: 355 mova [dstq+strideq*0+16*0], m0 356 mova [dstq+strideq*0+16*1], m1 357 mova [dstq+strideq*0+16*2], m2 358 mova [dstq+strideq*0+16*3], m3 359 mova [dstq+strideq*1+16*0], m0 360 mova [dstq+strideq*1+16*1], m1 361 mova [dstq+strideq*1+16*2], m2 362 mova [dstq+strideq*1+16*3], m3 363 lea dstq, [dstq+strideq*2] 364 sub hd, 2 365 jg .s32 366 RET 367.h64: 368 mova m0, [tlq-128] 369 mova m1, [tlq-112] 370 paddw m0, [tlq- 96] 371 paddw m1, [tlq- 80] 372 paddw m0, [tlq- 64] 373 paddw m1, [tlq- 48] 374 paddw m0, [tlq- 32] 375 paddw m1, [tlq- 16] 376 paddw m0, m1 377 jmp wq 378.w64: 379 movu m1, [tlq+ 2] 380 movu m2, [tlq+ 18] 381 paddw m1, m2 382 movu m2, [tlq+ 34] 383 paddw m0, m2 384 movu m2, [tlq+ 50] 385 paddw m1, m2 386 movu m2, [tlq+ 66] 387 paddw m0, m2 388 movu m2, [tlq+ 82] 389 paddw m1, m2 390 movu m2, [tlq+ 98] 391 paddw m0, m2 392 movu m2, [tlq+114] 393 paddw m1, m2 394 paddw m0, m1 395 punpcklwd m1, m0, m3 396 punpckhwd m0, m3 397 paddd m0, m1 398 paddd m4, m0 399 punpckhqdq m0, m0 400 paddd m0, m4 401 pshuflw m1, m0, q1032 402 paddd m0, m1 403 psrld m0, m5 404 cmp hd, 64 405 je .w64_end 406 mov r2d, 0xAAAB 407 mov r3d, 0x6667 408 cmp hd, 16 409 cmove r2d, r3d 410 movd m1, r2d 411 pmulhuw m0, m1 412 psrlw m0, 1 413.w64_end: 414 pshuflw m0, m0, q0000 415 punpcklqdq m0, m0 416.s64: 417 mova [dstq+16*0], m0 418 mova [dstq+16*1], m0 419 mova [dstq+16*2], m0 420 mova [dstq+16*3], m0 421 mova [dstq+16*4], m0 422 mova [dstq+16*5], m0 423 mova [dstq+16*6], m0 424 mova [dstq+16*7], m0 425 add dstq, strideq 426 dec hd 427 jg .s64 428 RET 429 430cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 431 mov r6d, r8m 432 LEA r5, ipred_dc_128_16bpc_ssse3_table 433 tzcnt wd, wm 434 shr r6d, 11 435 movifnidn hd, hm 436 movsxd wq, [r5+wq*4] 437 movddup m0, 
[r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] 438 add wq, r5 439 lea stride3q, [strideq*3] 440 jmp wq 441 442cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 443 LEA r5, ipred_dc_splat_16bpc_ssse3_table 444 movifnidn hd, hm 445 movu m0, [tlq+ 2] 446 movu m1, [tlq+ 18] 447 movu m2, [tlq+ 34] 448 movu m3, [tlq+ 50] 449 cmp wd, 64 450 je .w64 451 tzcnt wd, wd 452 movsxd wq, [r5+wq*4] 453 add wq, r5 454 lea stride3q, [strideq*3] 455 jmp wq 456.w64: 457 WIN64_SPILL_XMM 8 458 movu m4, [tlq+ 66] 459 movu m5, [tlq+ 82] 460 movu m6, [tlq+ 98] 461 movu m7, [tlq+114] 462.w64_loop: 463 mova [dstq+16*0], m0 464 mova [dstq+16*1], m1 465 mova [dstq+16*2], m2 466 mova [dstq+16*3], m3 467 mova [dstq+16*4], m4 468 mova [dstq+16*5], m5 469 mova [dstq+16*6], m6 470 mova [dstq+16*7], m7 471 add dstq, strideq 472 dec hd 473 jg .w64_loop 474 RET 475 476cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 477%define base r5-ipred_h_16bpc_ssse3_table 478 tzcnt wd, wm 479 LEA r5, ipred_h_16bpc_ssse3_table 480 movifnidn hd, hm 481 movsxd wq, [r5+wq*4] 482 movddup m2, [base+pw_256] 483 movddup m3, [base+pb_2_3] 484 add wq, r5 485 lea stride3q, [strideq*3] 486 jmp wq 487.w4: 488 sub tlq, 8 489 movq m3, [tlq] 490 pshuflw m0, m3, q3333 491 pshuflw m1, m3, q2222 492 pshuflw m2, m3, q1111 493 pshuflw m3, m3, q0000 494 movq [dstq+strideq*0], m0 495 movq [dstq+strideq*1], m1 496 movq [dstq+strideq*2], m2 497 movq [dstq+stride3q ], m3 498 lea dstq, [dstq+strideq*4] 499 sub hd, 4 500 jg .w4 501 RET 502.w8: 503 sub tlq, 8 504 movq m3, [tlq] 505 punpcklwd m3, m3 506 pshufd m0, m3, q3333 507 pshufd m1, m3, q2222 508 pshufd m2, m3, q1111 509 pshufd m3, m3, q0000 510 mova [dstq+strideq*0], m0 511 mova [dstq+strideq*1], m1 512 mova [dstq+strideq*2], m2 513 mova [dstq+stride3q ], m3 514 lea dstq, [dstq+strideq*4] 515 sub hd, 4 516 jg .w8 517 RET 518.w16: 519 sub tlq, 4 520 movd m1, [tlq] 521 pshufb m0, m1, m3 522 pshufb m1, m2 523 mova [dstq+strideq*0+16*0], m0 524 mova 
[dstq+strideq*0+16*1], m0 525 mova [dstq+strideq*1+16*0], m1 526 mova [dstq+strideq*1+16*1], m1 527 lea dstq, [dstq+strideq*2] 528 sub hd, 2 529 jg .w16 530 RET 531.w32: 532 sub tlq, 4 533 movd m1, [tlq] 534 pshufb m0, m1, m3 535 pshufb m1, m2 536 mova [dstq+strideq*0+16*0], m0 537 mova [dstq+strideq*0+16*1], m0 538 mova [dstq+strideq*0+16*2], m0 539 mova [dstq+strideq*0+16*3], m0 540 mova [dstq+strideq*1+16*0], m1 541 mova [dstq+strideq*1+16*1], m1 542 mova [dstq+strideq*1+16*2], m1 543 mova [dstq+strideq*1+16*3], m1 544 lea dstq, [dstq+strideq*2] 545 sub hd, 2 546 jg .w32 547 RET 548.w64: 549 sub tlq, 2 550 movd m0, [tlq] 551 pshufb m0, m2 552 mova [dstq+16*0], m0 553 mova [dstq+16*1], m0 554 mova [dstq+16*2], m0 555 mova [dstq+16*3], m0 556 mova [dstq+16*4], m0 557 mova [dstq+16*5], m0 558 mova [dstq+16*6], m0 559 mova [dstq+16*7], m0 560 add dstq, strideq 561 dec hd 562 jg .w64 563 RET 564 565cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left 566%define base r5-ipred_paeth_16bpc_ssse3_table 567 movifnidn hd, hm 568 pshuflw m4, [tlq], q0000 569 mov leftq, tlq 570 add hd, hd 571 punpcklqdq m4, m4 ; topleft 572 sub leftq, hq 573 and wd, ~7 574 jnz .w8 575 movddup m5, [tlq+2] ; top 576 psubw m6, m5, m4 577 pabsw m7, m6 578.w4_loop: 579 movd m1, [leftq+hq-4] 580 punpcklwd m1, m1 581 punpckldq m1, m1 ; left 582%macro PAETH 0 583 paddw m0, m6, m1 584 psubw m2, m4, m0 ; tldiff 585 psubw m0, m5 ; tdiff 586 pabsw m2, m2 587 pabsw m0, m0 588 pminsw m2, m0 589 pcmpeqw m0, m2 590 pand m3, m5, m0 591 pandn m0, m4 592 por m0, m3 593 pcmpgtw m3, m7, m2 594 pand m0, m3 595 pandn m3, m1 596 por m0, m3 597%endmacro 598 PAETH 599 movhps [dstq+strideq*0], m0 600 movq [dstq+strideq*1], m0 601 lea dstq, [dstq+strideq*2] 602 sub hd, 2*2 603 jg .w4_loop 604 RET 605.w8: 606%if ARCH_X86_32 607 PUSH r6 608 %define r7d hm 609 %assign regs_used 7 610%elif WIN64 611 movaps r4m, m8 612 PUSH r7 613 %assign regs_used 8 614%endif 615%if ARCH_X86_64 616 movddup m8, [pw_256] 617%endif 
618 lea tlq, [tlq+wq*2+2] 619 neg wq 620 mov r7d, hd 621.w8_loop0: 622 movu m5, [tlq+wq*2] 623 mov r6, dstq 624 add dstq, 16 625 psubw m6, m5, m4 626 pabsw m7, m6 627.w8_loop: 628 movd m1, [leftq+hq-2] 629%if ARCH_X86_64 630 pshufb m1, m8 631%else 632 pshuflw m1, m1, q0000 633 punpcklqdq m1, m1 634%endif 635 PAETH 636 mova [r6], m0 637 add r6, strideq 638 sub hd, 1*2 639 jg .w8_loop 640 mov hd, r7d 641 add wq, 8 642 jl .w8_loop0 643%if WIN64 644 movaps m8, r4m 645%endif 646 RET 647 648%if ARCH_X86_64 649DECLARE_REG_TMP 7 650%else 651DECLARE_REG_TMP 4 652%endif 653 654cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights 655 LEA weightsq, smooth_weights_1d_16bpc 656 mov hd, hm 657 lea weightsq, [weightsq+hq*4] 658 neg hq 659 movd m5, [tlq+hq*2] ; bottom 660 pshuflw m5, m5, q0000 661 punpcklqdq m5, m5 662 cmp wd, 4 663 jne .w8 664 movddup m4, [tlq+2] ; top 665 lea r3, [strideq*3] 666 psubw m4, m5 ; top - bottom 667.w4_loop: 668 movq m1, [weightsq+hq*2] 669 punpcklwd m1, m1 670 pshufd m0, m1, q1100 671 punpckhdq m1, m1 672 pmulhrsw m0, m4 673 pmulhrsw m1, m4 674 paddw m0, m5 675 paddw m1, m5 676 movq [dstq+strideq*0], m0 677 movhps [dstq+strideq*1], m0 678 movq [dstq+strideq*2], m1 679 movhps [dstq+r3 ], m1 680 lea dstq, [dstq+strideq*4] 681 add hq, 4 682 jl .w4_loop 683 RET 684.w8: 685%if ARCH_X86_32 686 PUSH r6 687 %assign regs_used 7 688 mov hm, hq 689 %define hq hm 690%elif WIN64 691 PUSH r7 692 %assign regs_used 8 693%endif 694.w8_loop0: 695 mov t0, hq 696 movu m4, [tlq+2] 697 add tlq, 16 698 mov r6, dstq 699 add dstq, 16 700 psubw m4, m5 701.w8_loop: 702 movq m3, [weightsq+t0*2] 703 punpcklwd m3, m3 704 pshufd m0, m3, q0000 705 pshufd m1, m3, q1111 706 pshufd m2, m3, q2222 707 pshufd m3, m3, q3333 708 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 709 REPX {paddw x, m5}, m0, m1, m2, m3 710 mova [r6+strideq*0], m0 711 mova [r6+strideq*1], m1 712 lea r6, [r6+strideq*2] 713 mova [r6+strideq*0], m2 714 mova [r6+strideq*1], m3 715 lea r6, [r6+strideq*2] 716 
add t0, 4 717 jl .w8_loop 718 sub wd, 8 719 jg .w8_loop0 720 RET 721 722cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights 723 LEA weightsq, smooth_weights_1d_16bpc 724 mov wd, wm 725 movifnidn hd, hm 726 movd m5, [tlq+wq*2] ; right 727 sub tlq, 8 728 add hd, hd 729 pshuflw m5, m5, q0000 730 sub tlq, hq 731 punpcklqdq m5, m5 732 cmp wd, 4 733 jne .w8 734 movddup m4, [weightsq+4*2] 735 lea r3, [strideq*3] 736.w4_loop: 737 movq m1, [tlq+hq] ; left 738 punpcklwd m1, m1 739 psubw m1, m5 ; left - right 740 pshufd m0, m1, q3322 741 punpckldq m1, m1 742 pmulhrsw m0, m4 743 pmulhrsw m1, m4 744 paddw m0, m5 745 paddw m1, m5 746 movhps [dstq+strideq*0], m0 747 movq [dstq+strideq*1], m0 748 movhps [dstq+strideq*2], m1 749 movq [dstq+r3 ], m1 750 lea dstq, [dstq+strideq*4] 751 sub hd, 4*2 752 jg .w4_loop 753 RET 754.w8: 755 lea weightsq, [weightsq+wq*4] 756 neg wq 757%if ARCH_X86_32 758 PUSH r6 759 %assign regs_used 7 760 %define hd hm 761%elif WIN64 762 PUSH r7 763 %assign regs_used 8 764%endif 765.w8_loop0: 766 mov t0d, hd 767 mova m4, [weightsq+wq*2] 768 mov r6, dstq 769 add dstq, 16 770.w8_loop: 771 movq m3, [tlq+t0*(1+ARCH_X86_32)] 772 punpcklwd m3, m3 773 psubw m3, m5 774 pshufd m0, m3, q3333 775 pshufd m1, m3, q2222 776 pshufd m2, m3, q1111 777 pshufd m3, m3, q0000 778 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 779 REPX {paddw x, m5}, m0, m1, m2, m3 780 mova [r6+strideq*0], m0 781 mova [r6+strideq*1], m1 782 lea r6, [r6+strideq*2] 783 mova [r6+strideq*0], m2 784 mova [r6+strideq*1], m3 785 lea r6, [r6+strideq*2] 786 sub t0d, 4*(1+ARCH_X86_64) 787 jg .w8_loop 788 add wq, 8 789 jl .w8_loop0 790 RET 791 792%if ARCH_X86_64 793DECLARE_REG_TMP 10 794%else 795DECLARE_REG_TMP 3 796%endif 797 798cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ 799 h_weights, v_weights, top 800 LEA h_weightsq, smooth_weights_2d_16bpc 801 mov wd, wm 802 mov hd, hm 803 movd m7, [tlq+wq*2] ; right 804 lea v_weightsq, [h_weightsq+hq*8] 805 neg hq 806 movd m6, [tlq+hq*2] ; 
bottom 807 pshuflw m7, m7, q0000 808 pshuflw m6, m6, q0000 809 cmp wd, 4 810 jne .w8 811 movq m4, [tlq+2] ; top 812 mova m5, [h_weightsq+4*4] 813 punpcklwd m4, m6 ; top, bottom 814 pxor m6, m6 815.w4_loop: 816 movq m1, [v_weightsq+hq*4] 817 sub tlq, 4 818 movd m3, [tlq] ; left 819 pshufd m0, m1, q0000 820 pshufd m1, m1, q1111 821 pmaddwd m0, m4 822 punpcklwd m3, m7 ; left, right 823 pmaddwd m1, m4 824 pshufd m2, m3, q1111 825 pshufd m3, m3, q0000 826 pmaddwd m2, m5 827 pmaddwd m3, m5 828 paddd m0, m2 829 paddd m1, m3 830 psrld m0, 8 831 psrld m1, 8 832 packssdw m0, m1 833 pavgw m0, m6 834 movq [dstq+strideq*0], m0 835 movhps [dstq+strideq*1], m0 836 lea dstq, [dstq+strideq*2] 837 add hq, 2 838 jl .w4_loop 839 RET 840.w8: 841%if ARCH_X86_32 842 lea h_weightsq, [h_weightsq+wq*4] 843 mov t0, tlq 844 mov r1m, tlq 845 mov r2m, hq 846 %define m8 [h_weightsq+16*0] 847 %define m9 [h_weightsq+16*1] 848%else 849%if WIN64 850 movaps r4m, m8 851 movaps r6m, m9 852 PUSH r7 853 PUSH r8 854%endif 855 PUSH r9 856 PUSH r10 857 %assign regs_used 11 858 lea h_weightsq, [h_weightsq+wq*8] 859 lea topq, [tlq+wq*2] 860 neg wq 861 mov r8, tlq 862 mov r9, hq 863%endif 864 punpcklqdq m6, m6 865.w8_loop0: 866%if ARCH_X86_32 867 movu m5, [t0+2] 868 add t0, 16 869 mov r0m, t0 870%else 871 movu m5, [topq+wq*2+2] 872 mova m8, [h_weightsq+wq*4+16*0] 873 mova m9, [h_weightsq+wq*4+16*1] 874%endif 875 mov t0, dstq 876 add dstq, 16 877 punpcklwd m4, m5, m6 878 punpckhwd m5, m6 879.w8_loop: 880 movd m1, [v_weightsq+hq*4] 881 sub tlq, 2 882 movd m3, [tlq] ; left 883 pshufd m1, m1, q0000 884 pmaddwd m0, m4, m1 885 pshuflw m3, m3, q0000 886 pmaddwd m1, m5 887 punpcklwd m3, m7 ; left, right 888 pmaddwd m2, m8, m3 889 pmaddwd m3, m9 890 paddd m0, m2 891 paddd m1, m3 892 psrld m0, 8 893 psrld m1, 8 894 packssdw m0, m1 895 pxor m1, m1 896 pavgw m0, m1 897 mova [t0], m0 898 add t0, strideq 899 inc hq 900 jl .w8_loop 901%if ARCH_X86_32 902 mov t0, r0m 903 mov tlq, r1m 904 add h_weightsq, 16*2 905 mov hq, r2m 
906 sub dword wm, 8 907 jg .w8_loop0 908%else 909 mov tlq, r8 910 mov hq, r9 911 add wq, 8 912 jl .w8_loop0 913%endif 914%if WIN64 915 movaps m8, r4m 916 movaps m9, r6m 917%endif 918 RET 919 920%if ARCH_X86_64 921cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx 922 %define base r7-$$ 923 %define bdmaxm r8m 924 lea r7, [$$] 925%else 926cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx 927 %define base r1-$$ 928 %define stridemp [rsp+4*0] 929 %define bdmaxm [rsp+4*1] 930 mov r3, r8m 931 mov stridemp, r1 932 mov bdmaxm, r3 933 LEA r1, $$ 934%endif 935 tzcnt wd, wm 936 movifnidn angled, anglem 937 movifnidn hd, hm 938 add tlq, 2 939 movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] 940 mov dxd, angled 941 movddup m0, [base+pw_256] 942 and dxd, 0x7e 943 movddup m7, [base+pw_62] 944 add angled, 165 ; ~90 945 lea wq, [base+wq+ipred_z1_16bpc_ssse3_table] 946 movzx dxd, word [base+dr_intra_derivative+dxq] 947 xor angled, 0x4ff ; d = 90 - angle 948 jmp wq 949.w4: 950 lea r3d, [angleq+88] 951 test r3d, 0x480 952 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 953 sar r3d, 9 954 add r3d, hd 955 cmp r3d, 8 956 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) 957 movd m3, [tlq+14] 958 movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8 959 movd m1, bdmaxm 960 pshufb m3, m0 961 palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8 962 paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7 963 add dxd, dxd 964 mova [rsp+32], m3 965 palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8 966 pshufb m1, m0 967 paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d 968 psubw m5, m3, m4 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 969 movd m4, dxd 970 psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 971 paddw m3, m5 972 pxor m5, m5 973 pmaxsw m3, m5 974 mov r3d, dxd 975 pavgw m3, m5 976 pshufb m4, m0 977 pminsw m3, m1 978 punpcklwd m1, m2, m3 979 punpckhwd m2, m3 980 mova m3, [base+z_upsample] 981 movifnidn strideq, stridemp 982 mova [rsp+ 0], m1 983 paddw m5, m4, m4 984 mova [rsp+16], m2 985 
punpcklqdq m4, m5 ; xpos0 xpos1 986.w4_upsample_loop: 987 lea r2d, [r3+dxq] 988 shr r3d, 6 ; base0 989 movu m1, [rsp+r3*2] 990 lea r3d, [r2+dxq] 991 shr r2d, 6 ; base1 992 movu m2, [rsp+r2*2] 993 pshufb m1, m3 994 pshufb m2, m3 995 punpcklqdq m0, m1, m2 996 punpckhqdq m1, m2 997 pand m2, m7, m4 ; frac 998 psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 999 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) 1000 pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) 1001 paddw m4, m5 ; xpos += dx 1002 paddw m0, m1 1003 movq [dstq+strideq*0], m0 1004 movhps [dstq+strideq*1], m0 1005 lea dstq, [dstq+strideq*2] 1006 sub hd, 2 1007 jg .w4_upsample_loop 1008 RET 1009.w4_no_upsample: 1010 mov r3d, 7 ; max_base 1011 test angled, 0x400 ; !enable_intra_edge_filter 1012 jnz .w4_main 1013 lea r3d, [hq+3] 1014 movd m1, r3d 1015 movd m3, angled 1016 shr angled, 8 ; is_sm << 1 1017 pxor m2, m2 1018 pshufb m1, m2 1019 pshufb m3, m2 1020 pcmpeqb m1, [base+z_filt_wh4] 1021 pand m1, m3 1022 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] 1023 pmovmskb r5d, m1 1024 mov r3d, 7 1025 test r5d, r5d 1026 jz .w4_main ; filter_strength == 0 1027 pshuflw m1, [tlq-2], q0000 1028 movu m2, [tlq+16*0] 1029 imul r5d, 0x55555555 1030 movd m3, [tlq+r3*2] 1031 shr r5d, 30 ; filter_strength 1032 movd [rsp+12], m1 1033 pshuflw m3, m3, q0000 1034 mova [rsp+16*1], m2 1035 lea r2d, [r3+2] 1036 movq [rsp+r3*2+18], m3 1037 cmp hd, 8 1038 cmovae r3d, r2d 1039 lea tlq, [rsp+16*1] 1040 call .filter_edge 1041.w4_main: 1042 lea tlq, [tlq+r3*2] 1043 movd m4, dxd 1044 movddup m1, [base+z_base_inc] ; base_inc << 6 1045 movd m6, [tlq] ; top[max_base_x] 1046 shl r3d, 6 1047 movd m3, r3d 1048 pshufb m4, m0 1049 mov r5d, dxd ; xpos 1050 pshufb m6, m0 1051 sub r5, r3 1052 pshufb m3, m0 1053 paddw m5, m4, m4 1054 psubw m3, m1 ; max_base_x 1055 punpcklqdq m4, m5 ; xpos0 xpos1 1056 movifnidn strideq, stridemp 1057.w4_loop: 1058 lea r3, [r5+dxq] 1059 sar r5, 6 ; base0 1060 movq m0, [tlq+r5*2+0] 1061 movq m1, 
[tlq+r5*2+2] 1062 lea r5, [r3+dxq] 1063 sar r3, 6 ; base1 1064 movhps m0, [tlq+r3*2+0] 1065 movhps m1, [tlq+r3*2+2] 1066 pand m2, m7, m4 1067 psllw m2, 9 1068 psubw m1, m0 1069 pmulhrsw m1, m2 1070 pcmpgtw m2, m3, m4 ; xpos < max_base_x 1071 paddw m4, m5 ; xpos += dx 1072 paddw m0, m1 1073 pand m0, m2 1074 pandn m2, m6 1075 por m0, m2 1076 movq [dstq+strideq*0], m0 1077 movhps [dstq+strideq*1], m0 1078 sub hd, 2 1079 jz .w4_end 1080 lea dstq, [dstq+strideq*2] 1081 test r5d, r5d 1082 jl .w4_loop 1083.w4_end_loop: 1084 movq [dstq+strideq*0], m6 1085 movq [dstq+strideq*1], m6 1086 lea dstq, [dstq+strideq*2] 1087 sub hd, 2 1088 jg .w4_end_loop 1089.w4_end: 1090 RET 1091.w8: 1092 lea r3d, [angleq+88] 1093 and r3d, ~0x7f 1094 or r3d, hd 1095 cmp r3d, 8 1096 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 1097 movu m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 1098 movu m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9 1099 movu m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a 1100 paddw m5, m1 1101 paddw m3, [tlq- 2] ; 0 1 2 3 4 5 6 7 1102 psubw m2, m5, m3 1103 movu m6, [tlq+18] ; a b c d e f g _ 1104 psraw m2, 3 1105 movu m3, [tlq+20] ; b c d e f g _ _ 1106 paddw m5, m2 1107 movu m2, [tlq+16] ; 9 a b c d e f g 1108 paddw m6, m2 1109 add dxd, dxd 1110 cmp hd, 4 1111 jne .w8_upsample_h8 ; awkward single-pixel edge case 1112 pshuflw m3, m3, q1110 ; b c c _ _ _ _ _ 1113.w8_upsample_h8: 1114 paddw m3, [tlq+14] ; 8 9 a b c d e f 1115 psubw m4, m6, m3 1116 movd m3, bdmaxm 1117 psraw m4, 3 1118 mov r3d, dxd 1119 paddw m6, m4 1120 pxor m4, m4 1121 pmaxsw m5, m4 1122 pmaxsw m6, m4 1123 pshufb m3, m0 1124 pavgw m5, m4 1125 pavgw m6, m4 1126 movd m4, dxd 1127 pminsw m5, m3 1128 pminsw m6, m3 1129 mova m3, [base+z_upsample] 1130 pshufb m4, m0 1131 movifnidn strideq, stridemp 1132 punpcklwd m0, m1, m5 1133 mova [rsp+ 0], m0 1134 punpckhwd m1, m5 1135 mova [rsp+16], m1 1136 punpcklwd m0, m2, m6 1137 mova [rsp+32], m0 1138 punpckhwd m2, m6 1139 mova [rsp+48], m2 1140 mova m5, m4 1141.w8_upsample_loop: 1142 
mov r2d, r3d 1143 shr r2d, 6 1144 movu m1, [rsp+r2*2+ 0] 1145 movu m2, [rsp+r2*2+16] 1146 add r3d, dxd 1147 pshufb m1, m3 1148 pshufb m2, m3 1149 punpcklqdq m0, m1, m2 1150 punpckhqdq m1, m2 1151 pand m2, m7, m4 1152 psllw m2, 9 1153 psubw m1, m0 1154 pmulhrsw m1, m2 1155 paddw m4, m5 1156 paddw m0, m1 1157 mova [dstq], m0 1158 add dstq, strideq 1159 dec hd 1160 jg .w8_upsample_loop 1161 RET 1162.w8_no_upsample: 1163 lea r3d, [hq+7] 1164 movd m1, r3d 1165 and r3d, 7 1166 or r3d, 8 ; imin(h+7, 15) 1167 test angled, 0x400 1168 jnz .w8_main 1169 movd m3, angled 1170 shr angled, 8 ; is_sm << 1 1171 pxor m2, m2 1172 pshufb m1, m2 1173 pshufb m3, m2 1174 movu m2, [base+z_filt_wh8] 1175 psrldq m4, [base+z_filt_t_w48+angleq*8], 4 1176 pcmpeqb m2, m1 1177 pand m2, m3 1178 pcmpgtb m2, m4 1179 pmovmskb r5d, m2 1180 test r5d, r5d 1181 jz .w8_main ; filter_strength == 0 1182 pshuflw m1, [tlq-2], q0000 1183 movu m2, [tlq+16*0] 1184 imul r5d, 0x55555555 1185 movu m3, [tlq+16*1] 1186 movd m4, [tlq+r3*2] 1187 shr r5d, 30 ; filter_strength 1188 movd [rsp+12], m1 1189 mova [rsp+16*1], m2 1190 pshuflw m4, m4, q0000 1191 mova [rsp+16*2], m3 1192 lea r2d, [r3+2] 1193 movq [rsp+r3*2+18], m4 1194 cmp hd, 16 1195 cmovae r3d, r2d 1196 lea tlq, [rsp+16*1] 1197 call .filter_edge 1198.w8_main: 1199 lea tlq, [tlq+r3*2] 1200 movd m5, dxd 1201 mova m4, [base+z_base_inc] 1202 shl r3d, 6 1203 movd m6, [tlq] ; top[max_base_x] 1204 movd m1, r3d 1205 pshufb m5, m0 1206 mov r5d, dxd ; xpos 1207 pshufb m1, m0 1208 sub r5, r3 1209 psubw m4, m1 ; max_base_x 1210 pshufb m6, m0 1211 paddw m4, m5 1212 movifnidn strideq, stridemp 1213.w8_loop: 1214 mov r3, r5 1215 sar r3, 6 1216 movu m0, [tlq+r3*2+0] 1217 movu m1, [tlq+r3*2+2] 1218 pand m2, m7, m4 1219 psllw m2, 9 1220 psubw m1, m0 1221 pmulhrsw m1, m2 1222 psraw m2, m4, 15 ; xpos < max_base_x 1223 paddw m4, m5 ; xpos += dx 1224 paddw m0, m1 1225 pand m0, m2 1226 pandn m2, m6 1227 por m0, m2 1228 mova [dstq], m0 1229 dec hd 1230 jz .w8_end 1231 add dstq, 
strideq 1232 add r5, dxq 1233 jl .w8_loop 1234.w8_end_loop: 1235 mova [dstq], m6 1236 add dstq, strideq 1237 dec hd 1238 jg .w8_end_loop 1239.w8_end: 1240 RET 1241.w16: 1242%if ARCH_X86_32 1243 %define strideq r3 1244%endif 1245 lea r3d, [hq+15] 1246 movd m1, r3d 1247 and r3d, 15 1248 or r3d, 16 ; imin(h+15, 31) 1249 test angled, 0x400 1250 jnz .w16_main 1251 movd m3, angled 1252 shr angled, 8 ; is_sm << 1 1253 pxor m2, m2 1254 pshufb m1, m2 1255 pshufb m3, m2 1256 movq m4, [base+z_filt_t_w16+angleq*4] 1257 pcmpeqb m1, [base+z_filt_wh16] 1258 pand m1, m3 1259 pcmpgtb m1, m4 1260 pmovmskb r5d, m1 1261 test r5d, r5d 1262 jz .w16_main ; filter_strength == 0 1263 pshuflw m1, [tlq-2], q0000 1264 movu m2, [tlq+16*0] 1265 imul r5d, 0x24924924 1266 movu m3, [tlq+16*1] 1267 movu m4, [tlq+16*2] 1268 shr r5d, 30 1269 movu m5, [tlq+16*3] 1270 movd m6, [tlq+r3*2] 1271 adc r5d, -1 ; filter_strength 1272 movd [rsp+12], m1 1273 mova [rsp+16*1], m2 1274 mova [rsp+16*2], m3 1275 pshuflw m6, m6, q0000 1276 mova [rsp+16*3], m4 1277 mova [rsp+16*4], m5 1278 lea r2d, [r3+2] 1279 movq [rsp+r3*2+18], m6 1280 cmp hd, 32 1281 cmovae r3d, r2d 1282 lea tlq, [rsp+16*1] 1283 call .filter_edge 1284.w16_main: 1285 lea tlq, [tlq+r3*2] 1286 movd m5, dxd 1287 mova m4, [base+z_base_inc] 1288 shl r3d, 6 1289 movd m6, [tlq] ; top[max_base_x] 1290 movd m1, r3d 1291 pshufb m5, m0 1292 mov r5d, dxd ; xpos 1293 pshufb m1, m0 1294 sub r5, r3 1295 psubw m4, m1 ; max_base_x 1296 pshufb m6, m0 1297 paddw m4, m5 1298.w16_loop: 1299 mov r3, r5 1300 sar r3, 6 1301 movu m0, [tlq+r3*2+ 0] 1302 movu m2, [tlq+r3*2+ 2] 1303 pand m3, m7, m4 1304 psllw m3, 9 1305 psubw m2, m0 1306 pmulhrsw m2, m3 1307 movu m1, [tlq+r3*2+16] 1308 paddw m0, m2 1309 movu m2, [tlq+r3*2+18] 1310 psubw m2, m1 1311 pmulhrsw m2, m3 1312 movddup m3, [base+pw_m512] 1313 paddw m1, m2 1314 psraw m2, m4, 15 1315 pcmpgtw m3, m4 1316 paddw m4, m5 1317 pand m0, m2 1318 pandn m2, m6 1319 pand m1, m3 1320 pandn m3, m6 1321 por m0, m2 1322 mova 
[dstq+16*0], m0 1323 por m1, m3 1324 mova [dstq+16*1], m1 1325 dec hd 1326 jz .w16_end 1327 movifnidn strideq, stridemp 1328 add dstq, strideq 1329 add r5, dxq 1330 jl .w16_loop 1331.w16_end_loop: 1332 mova [dstq+16*0], m6 1333 mova [dstq+16*1], m6 1334 add dstq, strideq 1335 dec hd 1336 jg .w16_end_loop 1337.w16_end: 1338 RET 1339.w32: 1340 lea r3d, [hq+31] 1341 and r3d, 31 1342 or r3d, 32 ; imin(h+31, 63) 1343 test angled, 0x400 ; !enable_intra_edge_filter 1344 jnz .w32_main 1345 call .filter_copy 1346 lea r5d, [r3+2] 1347 cmp hd, 64 1348 cmove r3d, r5d 1349 call .filter_edge_s3 1350.w32_main: 1351 lea tlq, [tlq+r3*2] 1352 movd m5, dxd 1353 mova m4, [base+z_base_inc] 1354 shl r3d, 6 1355 movd m6, [tlq] ; top[max_base_x] 1356 movd m1, r3d 1357 pshufb m5, m0 1358 mov r5d, dxd ; xpos 1359 pshufb m1, m0 1360 sub r5, r3 1361 psubw m4, m1 ; max_base_x 1362 pshufb m6, m0 1363 paddw m4, m5 1364.w32_loop: 1365 mov r3, r5 1366 sar r3, 6 1367 movu m0, [tlq+r3*2+ 0] 1368 movu m2, [tlq+r3*2+ 2] 1369 pand m3, m7, m4 1370 psllw m3, 9 1371 psubw m2, m0 1372 pmulhrsw m2, m3 1373 movu m1, [tlq+r3*2+16] 1374 paddw m0, m2 1375 movu m2, [tlq+r3*2+18] 1376 psubw m2, m1 1377 pmulhrsw m2, m3 1378 paddw m1, m2 1379 psraw m2, m4, 15 1380 pand m0, m2 1381 pandn m2, m6 1382 por m0, m2 1383 movddup m2, [base+pw_m512] 1384 pcmpgtw m2, m4 1385 pand m1, m2 1386 pandn m2, m6 1387 mova [dstq+16*0], m0 1388 por m1, m2 1389 mova [dstq+16*1], m1 1390 movu m0, [tlq+r3*2+32] 1391 movu m2, [tlq+r3*2+34] 1392 psubw m2, m0 1393 pmulhrsw m2, m3 1394 movu m1, [tlq+r3*2+48] 1395 paddw m0, m2 1396 movu m2, [tlq+r3*2+50] 1397 psubw m2, m1 1398 pmulhrsw m2, m3 1399 paddw m1, m2 1400 movddup m2, [base+pw_m1024] 1401 movddup m3, [base+pw_m1536] 1402 pcmpgtw m2, m4 1403 pcmpgtw m3, m4 1404 paddw m4, m5 1405 pand m0, m2 1406 pandn m2, m6 1407 pand m1, m3 1408 pandn m3, m6 1409 por m0, m2 1410 mova [dstq+16*2], m0 1411 por m1, m3 1412 mova [dstq+16*3], m1 1413 dec hd 1414 jz .w32_end 1415 movifnidn strideq, 
stridemp 1416 add dstq, strideq 1417 add r5, dxq 1418 jl .w32_loop 1419.w32_end_loop: 1420 REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3 1421 add dstq, strideq 1422 dec hd 1423 jg .w32_end_loop 1424.w32_end: 1425 RET 1426.w64: 1427 lea r3d, [hq+63] 1428 test angled, 0x400 ; !enable_intra_edge_filter 1429 jnz .w64_main 1430 call .filter_copy 1431 call .filter_edge_s3 1432.w64_main: 1433 lea tlq, [tlq+r3*2] 1434 movd m5, dxd 1435 mova m4, [base+z_base_inc] 1436 shl r3d, 6 1437 movd m6, [tlq] ; top[max_base_x] 1438 movd m1, r3d 1439 pshufb m5, m0 1440 mov r5d, dxd ; xpos 1441 pshufb m1, m0 1442 sub r5, r3 1443 psubw m4, m1 ; max_base_x 1444 pshufb m6, m0 1445 paddw m4, m5 1446.w64_loop: 1447 mov r3, r5 1448 sar r3, 6 1449 movu m0, [tlq+r3*2+ 0] 1450 movu m2, [tlq+r3*2+ 2] 1451 pand m3, m7, m4 1452 psllw m3, 9 1453 psubw m2, m0 1454 pmulhrsw m2, m3 1455 movu m1, [tlq+r3*2+16] 1456 paddw m0, m2 1457 movu m2, [tlq+r3*2+18] 1458 psubw m2, m1 1459 pmulhrsw m2, m3 1460 paddw m1, m2 1461 psraw m2, m4, 15 1462 pand m0, m2 1463 pandn m2, m6 1464 por m0, m2 1465 movddup m2, [base+pw_m512] 1466 pcmpgtw m2, m4 1467 pand m1, m2 1468 pandn m2, m6 1469 mova [dstq+16*0], m0 1470 por m1, m2 1471 mova [dstq+16*1], m1 1472 movu m0, [tlq+r3*2+32] 1473 movu m2, [tlq+r3*2+34] 1474 psubw m2, m0 1475 pmulhrsw m2, m3 1476 movu m1, [tlq+r3*2+48] 1477 paddw m0, m2 1478 movu m2, [tlq+r3*2+50] 1479 psubw m2, m1 1480 pmulhrsw m2, m3 1481 paddw m1, m2 1482 movddup m2, [base+pw_m1024] 1483 pcmpgtw m2, m4 1484 pand m0, m2 1485 pandn m2, m6 1486 por m0, m2 1487 movddup m2, [base+pw_m1536] 1488 pcmpgtw m2, m4 1489 pand m1, m2 1490 pandn m2, m6 1491 mova [dstq+16*2], m0 1492 por m1, m2 1493 mova [dstq+16*3], m1 1494 movu m0, [tlq+r3*2+64] 1495 movu m2, [tlq+r3*2+66] 1496 psubw m2, m0 1497 pmulhrsw m2, m3 1498 movu m1, [tlq+r3*2+80] 1499 paddw m0, m2 1500 movu m2, [tlq+r3*2+82] 1501 psubw m2, m1 1502 pmulhrsw m2, m3 1503 paddw m1, m2 1504 movddup m2, [base+pw_m2048] 1505 pcmpgtw m2, m4 1506 pand m0, m2 1507 
pandn m2, m6 1508 por m0, m2 1509 movddup m2, [base+pw_m2560] 1510 pcmpgtw m2, m4 1511 pand m1, m2 1512 pandn m2, m6 1513 mova [dstq+16*4], m0 1514 por m1, m2 1515 mova [dstq+16*5], m1 1516 movu m0, [tlq+r3*2+96] 1517 movu m2, [tlq+r3*2+98] 1518 psubw m2, m0 1519 pmulhrsw m2, m3 1520 movu m1, [tlq+r3*2+112] 1521 paddw m0, m2 1522 movu m2, [tlq+r3*2+114] 1523 psubw m2, m1 1524 pmulhrsw m2, m3 1525 paddw m1, m2 1526 movddup m2, [base+pw_m3072] 1527 movddup m3, [base+pw_m3584] 1528 pcmpgtw m2, m4 1529 pcmpgtw m3, m4 1530 paddw m4, m5 1531 pand m0, m2 1532 pandn m2, m6 1533 pand m1, m3 1534 pandn m3, m6 1535 por m0, m2 1536 mova [dstq+16*6], m0 1537 por m1, m3 1538 mova [dstq+16*7], m1 1539 dec hd 1540 jz .w64_end 1541 movifnidn strideq, stridemp 1542 add dstq, strideq 1543 add r5, dxq 1544 jl .w64_loop 1545.w64_end_loop: 1546 REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 1547 add dstq, strideq 1548 dec hd 1549 jg .w64_end_loop 1550.w64_end: 1551 RET 1552ALIGN function_align 1553.filter_copy: 1554 pshuflw m2, [tlq-2], q0000 1555 pshuflw m3, [tlq+r3*2], q0000 1556 xor r5d, r5d 1557 movd [rsp+gprsize+12], m2 1558.filter_copy_loop: 1559 movu m1, [tlq+r5*2+16*0] 1560 movu m2, [tlq+r5*2+16*1] 1561 add r5d, 16 1562 mova [rsp+r5*2+gprsize-16*1], m1 1563 mova [rsp+r5*2+gprsize-16*0], m2 1564 cmp r5d, r3d 1565 jle .filter_copy_loop 1566 lea tlq, [rsp+gprsize+16*1] 1567 movq [tlq+r3*2+2], m3 1568 ret 1569.filter_edge: 1570 cmp r5d, 3 1571 je .filter_edge_s3 1572 movddup m4, [base+z_filt_k+r5*8-8] 1573 movddup m5, [base+z_filt_k+r5*8+8] 1574 xor r5d, r5d 1575 movddup m6, [base+pw_8] 1576 movu m2, [tlq-2] 1577 jmp .filter_edge_start 1578.filter_edge_loop: 1579 movu m2, [tlq+r5*2-2] 1580 mova [tlq+r5*2-16], m1 1581.filter_edge_start: 1582 pmullw m1, m4, [tlq+r5*2] 1583 movu m3, [tlq+r5*2+2] 1584 paddw m2, m3 1585 pmullw m2, m5 1586 add r5d, 8 1587 paddw m1, m6 1588 paddw m1, m2 1589 psrlw m1, 4 1590 cmp r5d, r3d 1591 jl .filter_edge_loop 1592 mova [tlq+r5*2-16], m1 1593 ret 
; --- tail of ipred_z1_16bpc (function starts earlier in the file) ---
; Strength-3 edge filter: out[i] = (p[-2] + 2*p[-1] + 2*p[0] + 2*p[1] + p[2] + 4) >> 3,
; computed here as ((p[-1]+p[0]+p[1]) + avg(p[-2]+3, p[2])) >> 2 on 8 pixels per
; iteration. In: tlq = edge base, r3d = pixel count. Clobbers r5, m1-m5.
; Also entered from ipred_z2_16bpc via mangle(...ipred_z1_16bpc_ssse3).filter_edge_s3.
.filter_edge_s3:
    movddup              m5, [base+pw_3]
    xor                 r5d, r5d
    movu                 m2, [tlq-2]           ; p[-1]
    movu                 m3, [tlq-4]           ; p[-2]
    jmp .filter_edge_s3_start
.filter_edge_s3_loop:
    movu                 m2, [tlq+r5*2-2]
    movu                 m3, [tlq+r5*2-4]
    mova      [tlq+r5*2-16], m1                ; store previous iteration's result
.filter_edge_s3_start:
    paddw                m2, [tlq+r5*2+0]      ; p[-1] + p[0]
    paddw                m3, m5                ; p[-2] + 3 (rounding for pavgw below)
    movu                 m1, [tlq+r5*2+2]      ; p[1]
    movu                 m4, [tlq+r5*2+4]      ; p[2]
    add                 r5d, 8
    paddw                m1, m2
    pavgw                m3, m4                ; (p[-2]+p[2]+4) >> 1, biased by +3 above
    paddw                m1, m3
    psrlw                m1, 2
    cmp                 r5d, r3d
    jl .filter_edge_s3_loop
    mova      [tlq+r5*2-16], m1
    ret

;-----------------------------------------------------------------------
; ipred_z2_16bpc (SSSE3): directional intra prediction, zone 2
; (90 < angle < 180) — predicts from BOTH the top row and the left
; column, selecting per-pixel with base_x/base_y position counters.
; dx = derivative for 180-angle (x step), dy = derivative for angle-90
; (y step), both looked up from dr_intra_derivative.
;
; Stack layout (16-byte units, established below):
;   rsp+16*5 .. 16*12 : left edge (copied from tlq-16*8 .. tlq-16*1)
;   rsp+16*13         : broadcast topleft pixel
;   rsp+16*14 ..      : top row (tlq+2 ...), extended per block width
; On x86-32 the top/left shuffle masks and the r9-r11 scalar state live
; in rsp+16*24 .. 16*26 (see the %define aliases).
; r9d packs (width-4)<<6 | height and counts remaining 8-col strips;
; r10d = current xpos, r11d = left-only threshold for xpos.
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy
    %define            base r7-$$
    %define           maxwm r6m
    %define           maxhm r7m
    %define          bdmaxm r8m
    lea                  r7, [$$]
    mov                  hd, hm
    movddup              m8, [base+pw_62]
    lea                 r9d, [wq-4]
    shl                 r9d, 6
    mova                 m9, [base+z2_top_shufA]
    or                  r9d, hd
    mova                m10, [base+z2_left_shufA]
%else
cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx
    %define            base r1-$$
    %define             r9b byte  [rsp+16*26+4*0]
    %define             r9d dword [rsp+16*26+4*0]
    %define            r10d dword [rsp+16*26+4*1]
    %define            r11d dword [rsp+16*26+4*2]
    %define           maxwm [rsp+16*2+4*0]
    %define           maxhm [rsp+16*2+4*1]
    %define          bdmaxm [rsp+16*2+4*2]
    %define        stridemp [rsp+16*26+4*3]
    %define         strideq r3
    %define             dyd r4
    %define             dyq r4
    ; Spill stride and the stack-passed args (maxw/maxh/bdmax) since
    ; x86-32 has too few registers to keep them live.
    mov            stridemp, r1
    mov                 r1d, r6m
    mov                 r4d, r7m
    mov                 r5d, r8m
    mov               maxwm, r1d
    mov               maxhm, r4d
    mov              bdmaxm, r5d
    LEA                  r1, $$
    lea                  hd, [wq-4]
    mova                 m0, [base+z2_top_shufA]
    shl                  hd, 6
    mova                 m1, [base+z2_left_shufA]
    or                   hd, hm
    mova         [rsp+16*24], m0
    mov                 r9d, hd
    mova         [rsp+16*25], m1
%endif
    tzcnt                wd, wd
    movifnidn        angled, anglem
    ; Copy the left edge (8 vectors below tlq) onto the stack so it can
    ; be extended/filtered in place.
    mova                 m0, [tlq-16*8]
    mova                 m1, [tlq-16*7]
    mova                 m2, [tlq-16*6]
    mova                 m3, [tlq-16*5]
    movsxd               wq, [base+ipred_z2_16bpc_ssse3_table+wq*4]
%if ARCH_X86_64
    movzx               dxd, angleb
%else
    movzx               dxd, byte anglem
%endif
    mova                 m4, [tlq-16*4]
    mova                 m5, [tlq-16*3]
    mova                 m6, [tlq-16*2]
    mova                 m7, [tlq-16*1]
    mova         [rsp+16* 5], m0
    xor              angled, 0x400          ; toggle enable_intra_edge_filter bit
    mova         [rsp+16* 6], m1
    mov                 dyd, dxd
    mova         [rsp+16* 7], m2
    neg                 dxq
    mova         [rsp+16* 8], m3
    and                 dyd, ~1             ; dr_intra_derivative entries are word-indexed
    mova         [rsp+16* 9], m4
    and                 dxq, ~1
    mova         [rsp+16*10], m5
    lea                  wq, [base+ipred_z2_16bpc_ssse3_table+wq]
    mova         [rsp+16*11], m6
    pxor                 m3, m3
    mova         [rsp+16*12], m7
    movzx               dyd, word [base+dr_intra_derivative+dyq-90]  ; angle - 90
    movzx               dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
    movddup              m0, [base+pw_256] ; 4<<6
    movd                 m4, [tlq]          ; topleft pixel
    movu                 m5, [tlq+16*0+2]   ; top row
    movu                 m6, [tlq+16*1+2]
    movsldup             m1, [base+z2_dy_offset]
    pshufb               m4, m0             ; broadcast topleft
    movq                 m7, [base+z_base_inc+2]
    mov                r11d, (112-4)<<6
    mova         [rsp+16*13], m4
    neg                 dxd
    mova         [rsp+16*14], m5
    or                  dyd, 4<<16          ; pack upsample_left shift with dy
    mova         [rsp+16*15], m6
%if ARCH_X86_64
    lea                r10d, [dxq+(112<<6)] ; xpos
%else
    mov            [rsp+8*3], dyd
    lea                 r4d, [dxq+(112<<6)]
    mov                r10d, r4d
    movzx                hd, r9b
%endif
    movq           [rsp+8*0], m1            ; dy offset
    movq           [rsp+8*1], m0            ; x-advance per 4 columns
    movq           [rsp+8*2], m7            ; base x increments
    jmp                  wq
.w4:
    test             angled, 0x400
    jnz .w4_main                            ; edge filtering disabled
    lea                 r3d, [hq+2]
    add              angled, 1022
    pshuflw              m1, m5, q3333
    shl                 r3d, 6
    movq      [rsp+16*14+8], m1             ; extend top row
    test                r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    call .upsample_above
    sub              angled, 1075 ; angle - 53
    lea                 r3d, [hq+3]
    xor              angled, 0x7f ; 180 - angle
    movd                 m2, r3d
    movd                 m7, angled
    shr              angled, 8 ; is_sm << 1
    pshufb               m2, m3
    pshufb               m7, m3
    pcmpeqb              m2, [base+z_filt_wh4]
    pand                 m7, m2
    pcmpgtb              m7, [base+z_filt_t_w48+angleq*8]
    jmp .w8_filter_left
; Doubles the horizontal resolution of the top edge via the 8-tap
; (-1,9,9,-1)/16 upsampler; shared by w4 and w8 (note: reached by call,
; hence the gprsize adjustment on all rsp-relative addresses).
.upsample_above: ; w4/w8
    paddw                m2, m5, [tlq]
    movu                 m1, [rsp+gprsize+16*14+2]
    movu                 m4, [rsp+gprsize+16*14-4]
%if ARCH_X86_64
    movd                 m6, r9m ; bdmax, offset due to call
%else
    movd                 m6, [rsp+gprsize+16*2+4*2]
%endif
    paddw                m4, m1
    psubw                m1, m2, m4
    pshufb               m6, m0
    psraw                m1, 3
    paddw                m2, m1
    add                 dxd, dxd            ; double dx to account for upsampling
    pmaxsw               m2, m3
    paddw                m7, m7
    pavgw                m2, m3
    pminsw               m2, m6             ; clamp interpolated pixels to bdmax
%if ARCH_X86_64
    mova                 m9, [base+z2_top_shufB]
    lea                r10d, [dxq+(113<<6)]
    mov                r11d, (112-7)<<6
%else
    mova                 m1, [base+z2_top_shufB]
    lea                 r3d, [dxq+(113<<6)]
    mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6
    mov [rsp+gprsize+16*26+4*1], r3d
    mova [rsp+gprsize+16*24], m1
%endif
    punpcklwd            m1, m2, m5         ; interleave new half-pels with originals
    punpckhwd            m2, m5
    movq  [rsp+gprsize+8*2], m7
    mova [rsp+gprsize+16*14], m1
    mova [rsp+gprsize+16*15], m2
    ret
.w4_no_upsample_above:
    lea                 r3d, [hq+3]
    mov           [rsp+16*4], angled
    sub              angled, 1112 ; angle - 90
    movd                 m2, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    mova                 m4, [base+z_filt_wh4]
    movd                 m7, r3d
    mova                 m5, [base+z_filt_t_w48+angleq*8]
    mov                 r3d, 4
    call .w8_filter_top
    mov              angled, [rsp+16*4]
    lea                 r3d, [hq+2]
    sub              angled, 139
    shl                 r3d, 6
    test                r3d, angled
    jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
; Doubles the vertical resolution of the left edge, same filter as
; .upsample_above; shared by w4 and w8.
.upsample_left: ; w4/w8
    mova                 m2, [tlq-16]
    lea                 r3d, [hq-4]
    movu                 m3, [tlq-14]
    movu                 m4, [rsp+16*12+4]
    pshufb               m1, m2, [base+z2_upsample_l+r3*4]
    movd                 m6, bdmaxm
    pxor                 m5, m5
    paddw                m3, m2
    paddw                m4, m1
    psubw                m1, m3, m4
    movshdup             m4, [base+z2_dy_offset]
    psraw                m1, 3
    pshufb               m6, m0
    paddw                m3, m1
    pmaxsw               m3, m5
    pavgw                m3, m5
    pminsw               m3, m6
%if ARCH_X86_64
    mova                m10, [base+z2_left_shufB]
    add                 dyd, dyd            ; double dy to account for upsampling
%else
    mova                 m1, [base+z2_left_shufB]
    shl dword [rsp+8*3], 1
    mova         [rsp+16*25], m1
%endif
    punpckhwd            m1, m2, m3
    punpcklwd            m2, m3
    movq           [rsp+8*0], m4
    mova         [rsp+16*12], m1
    mova         [rsp+16*11], m2
; Main loop; all widths funnel through here, processing 4 columns x 4
; rows per iteration (wider blocks loop over 8-column strips via r9d).
.w4_main:
    movd                 m6, dxd
%if ARCH_X86_64
    movd                 m3, dyd
%else
    movd                 m3, [rsp+8*3]
%endif
    pshufb               m6, m0
    movddup              m0, [rsp+8*2]
    paddw                m7, m6, m6         ; m7 = 2*dx (xpos step for a row pair)
    movq                 m5, [base+pw_m1to4]
    pshuflw              m4, m3, q0000
    punpcklqdq           m6, m7
    pmullw               m4, m5             ; ypos for rows 1..4
    pshuflw              m3, m3, q1111
    paddw                m6, m0
    mov                 r2d, r10d
    pshuflw              m0, m4, q3333
    psubw                m4, [rsp+8*0]
    movq           [rsp+8*3], m3
    movq           [rsp+8*5], m0 ; dy*4
    mov                  r5, dstq
.w4_loop0:
    mova          [rsp+16*4], m6
    movq           [rsp+8*4], m4
%if ARCH_X86_64
    pand                 m0, m8, m4
%else
    movq                 m0, [base+pw_62]
    pand                 m0, m4
%endif
    psraw                m4, 6
    psllw                m0, 9 ; frac_y << 9
    movq           [rsp+8*7], m0
    pabsw                m4, m4             ; |ypos>>6| = base_y indices
    movq           [rsp+8*6], m4
    movzx                hd, r9b
.w4_loop:
    ; Compute four base_x indices (xpos>>6) and gather top-row pixel
    ; pairs for the horizontal interpolation.
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6 ; base_x0
    movu                 m2, [rsp+r2*2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6 ; base_x1
    movu                 m1, [rsp+r3*2]
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6 ; base_x2
    movu                 m3, [rsp+r2*2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6 ; base_x3
    movu                 m4, [rsp+r3*2]
%if ARCH_X86_64
    REPX   {pshufb x, m9}, m2, m1, m3, m4
%else
    mova                 m0, [rsp+16*24]
    REPX   {pshufb x, m0}, m2, m1, m3, m4
%endif
    punpcklqdq           m0, m2, m1
    punpckhqdq           m2, m1
    punpcklqdq           m1, m3, m4
    punpckhqdq           m3, m4
%if ARCH_X86_64
    pand                 m5, m8, m6
%else
    movddup              m5, [base+pw_62]
    pand                 m5, m6
%endif
    psllw                m5, 9              ; frac_x << 9
    psubw                m2, m0
    pmulhrsw             m2, m5             ; linear interpolation along x
    paddw                m5, m6, m7
    psubw                m3, m1
    paddw                m0, m2
%if ARCH_X86_64
    pand                 m2, m8, m5
%else
    movddup              m2, [base+pw_62]
    pand                 m2, m5
%endif
    psllw                m2, 9
    pmulhrsw             m3, m2
    paddw                m1, m3
    cmp                 r3d, 111 ; topleft
    jge .w4_toponly                         ; all 4 pixels come from the top row
    ; Some lanes fall left of topleft: also compute the left-column
    ; prediction, then blend per-lane below.
    mova         [rsp+16*22], m0
    mova         [rsp+16*23], m1
    movzx               r3d, byte [rsp+8*6+0] ; base_y0
    movu                 m3, [rsp+r3*2]
    movzx               r3d, byte [rsp+8*6+2] ; base_y1
    movu                 m2, [rsp+r3*2]
    movzx               r3d, byte [rsp+8*6+4] ; base_y2
    movu                 m4, [rsp+r3*2]
    movzx               r3d, byte [rsp+8*6+6] ; base_y3
    movu                 m0, [rsp+r3*2]
%if ARCH_X86_64
    REPX  {pshufb x, m10}, m3, m2, m4, m0
%else
    mova                 m1, [rsp+16*25]
    REPX   {pshufb x, m1}, m3, m2, m4, m0
%endif
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2 ; 01
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0 ; 23
    punpckldq            m0, m1, m2 ; y0 y1
    punpckhdq            m1, m2 ; y2 y3
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    movddup              m4, [rsp+8*7]
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4             ; linear interpolation along y
    pmulhrsw             m3, m4
    psraw                m6, 15 ; base_x < topleft
    psraw                m4, m5, 15
    paddw                m0, m2
    paddw                m1, m3
    ; Select left-column result where xpos went negative, top-row
    ; result elsewhere.
    pand                 m0, m6
    pandn                m6, [rsp+16*22]
    pand                 m1, m4
    pandn                m4, [rsp+16*23]
    por                  m0, m6
    por                  m1, m4
.w4_toponly:
    movifnidn       strideq, stridemp
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jz .w4_end
    movq                 m4, [rsp+8*6]
    paddsw               m6, m5, m7 ; xpos += dx
    movq                 m5, [rsp+8*3]
    psubw                m4, m5             ; base_y += dy (indices shrink toward edge)
    lea                dstq, [dstq+strideq*2]
    movq           [rsp+8*6], m4
    cmp                 r2d, r11d
    jge .w4_loop
; xpos has moved entirely past the top edge: remaining rows are pure
; left-column prediction, no per-lane blend needed.
.w4_leftonly_loop:
    movzx               r2d, byte [rsp+8*6+0] ; base_y0
    movu                 m3, [rsp+r2*2]
    movzx               r2d, byte [rsp+8*6+2] ; base_y1
    movu                 m2, [rsp+r2*2]
    movzx               r2d, byte [rsp+8*6+4] ; base_y2
    movu                 m6, [rsp+r2*2]
    movzx               r2d, byte [rsp+8*6+6] ; base_y3
    movu                 m0, [rsp+r2*2]
    psubw                m4, m5
%if ARCH_X86_64
    REPX  {pshufb x, m10}, m3, m2, m6, m0
%else
    mova                 m1, [rsp+16*25]
    REPX   {pshufb x, m1}, m3, m2, m6, m0
%endif
    movq           [rsp+8*6], m4
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m6, m0
    punpckhwd            m6, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m6
    punpckhdq            m3, m6
    movddup              m6, [rsp+8*7]
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m6
    pmulhrsw             m3, m6
    paddw                m0, m2
    paddw                m1, m3
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w4_leftonly_loop
.w4_end:
    ; Advance to the next 8-pixel-wide column strip (r9d high bits
    ; count remaining strips for w >= 8).
    sub                 r9d, 1<<8
    jl .w4_ret
    movq                 m4, [rsp+8*5]
    add                  r5, 8
    mov                dstq, r5
    paddw                m4, [rsp+8*4] ; base_y += 4*dy
    movzx               r2d, word [rsp+8*1]
    movddup              m6, [rsp+8*1]
    paddw                m6, [rsp+16*4] ; base_x += (4 << upsample_above)
    add                 r2d, r10d
    mov                r10d, r2d
    jmp .w4_loop0
.w4_ret:
    RET
.w8:
    test             angled, 0x400
    jnz .w4_main
    lea                 r3d, [angleq+126]
    pshufhw              m1, m5, q3333
%if ARCH_X86_64
    mov                 r3b, hb
%else
    xor                 r3b, r3b
    or                  r3d, hd
%endif
    movhps     [rsp+16*15], m1              ; extend top row
    cmp                 r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    call .upsample_above
    sub              angled, 53
    lea                 r3d, [hq+7]
    xor              angled, 0x7f ; 180 - angle
    movu                 m1, [base+z_filt_wh8]
    movd                 m2, r3d
    movd                 m7, angled
    shr              angled, 8 ; is_sm << 1
    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
    pshufb               m2, m3
    pshufb               m7, m3
    pcmpeqb              m2, m1
    movq                 m1, [base+pw_512]
    pand                 m7, m2
    pcmpgtb              m7, m4
    movq           [rsp+8*1], m1 ; 8<<6
    jmp .w8_filter_left
.w8_no_upsample_above:
    lea                 r3d, [hq+7]
    mov           [rsp+16*4], angled
    sub              angled, 90
    movd                 m2, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    movu                 m4, [base+z_filt_wh8]
    movd                 m7, r3d
    psrldq               m5, [base+z_filt_t_w48+angleq*8], 4
    mov                 r3d, 8
    call .w8_filter_top
    mov                 r3d, [rsp+16*4]
    sub                 r3d, 141
%if ARCH_X86_64
    mov                 r3b, hb
%else
    xor                 r3b, r3b
    or                  r3d, hd
%endif
    cmp                 r3d, 8
    jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
; Filter the left edge when the w4/w8 strength test (mask in m7) fired;
; reuses ipred_z3's edge filter on the stack copy of the left column.
.w8_filter_left:
    pmovmskb            r5d, m7
    test                r5d, r5d
    jz .w4_main
    imul                r5d, 0x55555555
    neg                  hq
    mov                  r3, tlq
    movd                 m1, [tlq+hq*2]
    shr                 r5d, 30 ; filter_strength
    lea                 tlq, [rsp+16*13-2]
    pshuflw              m1, m1, q0000
    movq      [tlq+hq*2-6], m1              ; pad below edge with bottom pixel
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
    jmp .filter_left_end
; Shared top-edge strength computation + filter for w4/w8.
; In: m1 = 180-angle bcast src, m2 = size, m7 = mirrored angle, m3 = 0,
; m4/m5 = threshold tables, r3d = width. Out: m7 = left-filter mask.
.w8_filter_top:
    REPX   {pshufb x, m3}, m2, m1, m7
    pcmpeqb              m2, m4
    pand                 m1, m2
    pand                 m7, m2
    pcmpgtb              m1, m5
    pcmpgtb              m7, m5
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .w8_filter_top_end ; filter_strength == 0
    imul                r5d, 0x55555555
    mov              [dstq], tlq            ; stash tlq (dst not written yet)
    lea                 tlq, [rsp+16*14+gprsize]
    shr                 r5d, 30 ; filter_strength
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
%if ARCH_X86_64
    mov                 r3d, r7m ; maxw, offset due to call
%else
    mov                 r3d, [rsp+16*2+4*1]
%endif
    mov                 tlq, [dstq]
    cmp                 r3d, 8
    jge .w8_filter_top_end
    ; Restore unfiltered pixels beyond maxw (they lie outside the frame
    ; and must not be smoothed into the prediction).
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14+gprsize], m1
    movu [rsp+r3*2+16*15+gprsize], m2
.w8_filter_top_end:
    ret
.w16:
    test             angled, 0x400
    jnz .w4_main
    lea                 r3d, [hq+15]
    sub              angled, 90
    movd                 m2, r3d
    mov                 r3d, 90
    movd                 m1, angled
    sub                 r3d, angled ; 180 - angle
    shr              angled, 8 ; is_sm << 1
    movd                 m7, r3d
    REPX   {pshufb x, m3}, m2, m1, m7
    movq                 m4, [base+z_filt_t_w16+angleq*4]
    pcmpeqb              m2, [base+z_filt_wh16]
    pand                 m1, m2
    pand                 m7, m2
    pcmpgtb              m1, m4
    pcmpgtb              m7, m4
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .w16_filter_left ; filter_strength == 0
    imul                r5d, 0x24924924     ; 3-bit-stride mask reduction
    pshufhw              m6, m6, q3333
    mov              [dstq], tlq
    lea                 tlq, [rsp+16*14]
    shr                 r5d, 30
    movhps       [tlq+16*2], m6             ; extend top row
    adc                 r5d, -1 ; filter_strength
    mov                 r3d, 16
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
    mov                 r3d, maxwm
    mov                 tlq, [dstq]
    cmp                 r3d, 16
    jge .w16_filter_left
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14], m1
    movu [rsp+r3*2+16*15], m2
.w16_filter_left:
    pmovmskb            r5d, m7
    test                r5d, r5d
    jz .w4_main
    imul                r5d, 0x24924924
    neg                  hq
    mov                  r3, tlq
    movd                 m1, [tlq+hq*2]
    shr                 r5d, 30
    lea                 tlq, [rsp+16*13-2]
    pshuflw              m1, m1, q0000
    adc                 r5d, -1 ; filter_strength
    movq      [tlq+hq*2-6], m1
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
    jmp .filter_left_end
.w32:
    ; Copy the extra top-row vectors (pixels 16..31) onto the stack.
    movu                 m1, [tlq+16*2+2]
    movu                 m2, [tlq+16*3+2]
    mova         [rsp+16*16], m1
    mova         [rsp+16*17], m2
    test             angled, 0x400
    jnz .w4_main
    mov              [dstq], tlq
    lea                 tlq, [rsp+16*14]
    pshufhw              m2, m2, q3333
    mov                 r3d, 32
    movhps       [tlq+16*4], m2             ; extend top row
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
    mov                 r3d, maxwm
    mov                 tlq, [dstq]
    cmp                 r3d, 32
    jge .filter_left
    ; Restore unfiltered out-of-frame pixels beyond maxw.
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14], m1
    movu [rsp+r3*2+16*15], m2
    cmp                 r3d, 16
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*2+2]
    movu                 m2, [tlq+r3*2+16*3+2]
    movu [rsp+r3*2+16*16], m1
    movu [rsp+r3*2+16*17], m2
; Strength-3 left-edge filtering for w32/w64.
.filter_left:
    neg                  hq
    mov                  r3, tlq
    pshuflw              m1, [tlq+hq*2], q0000
    lea                 tlq, [rsp+16*13-2]
    movq      [tlq+hq*2-6], m1
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3
.filter_left_end:
    ; Restore unfiltered left-edge pixels below maxh (out of frame),
    ; 32 pixels (two vectors) per step.
    mov                 r2d, maxhm
    cmp                 r2d, hd
    jge .w4_main
    neg                  r2
    movu                 m1, [r3+r2*2-16*1]
    movu                 m2, [r3+r2*2-16*2]
    movu [rsp+r2*2+16*12], m1
    movu [rsp+r2*2+16*11], m2
    cmp                 r2d, -48
    jle .w4_main
    movu                 m1, [r3+r2*2-16*3]
    movu                 m2, [r3+r2*2-16*4]
    movu [rsp+r2*2+16*10], m1
    movu [rsp+r2*2+16* 9], m2
    cmp                 r2d, -32
    jle .w4_main
    movu                 m1, [r3+r2*2-16*5]
    movu                 m2, [r3+r2*2-16*6]
    movu [rsp+r2*2+16* 8], m1
    movu [rsp+r2*2+16* 7], m2
    cmp                 r2d, -16
    jle .w4_main
    movu                 m1, [r3+r2*2-16*7]
    movu                 m2, [r3+r2*2-16*8]
    movu [rsp+r2*2+16* 6], m1
    movu [rsp+r2*2+16* 5], m2
    jmp .w4_main
.w64:
    ; Copy the extra top-row vectors (pixels 16..63) onto the stack.
    movu                 m1, [tlq+16*2+2]
    movu                 m2, [tlq+16*3+2]
    movu                 m3, [tlq+16*4+2]
    movu                 m4, [tlq+16*5+2]
    movu                 m5, [tlq+16*6+2]
    movu                 m6, [tlq+16*7+2]
    mov              [dstq], tlq
    lea                 tlq, [rsp+16*14]
    mova         [tlq+16*2], m1
    mova         [tlq+16*3], m2
    mova         [tlq+16*4], m3
    mova         [tlq+16*5], m4
    mova         [tlq+16*6], m5
    mova         [tlq+16*7], m6
    test             angled, 0x400
    jnz .w4_main
    pshufhw              m6, m6, q3333
    mov                 r3d, 64
    movhps       [tlq+16*8], m6             ; extend top row
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
    mov                 r3d, maxwm
    mov                 tlq, [dstq]
    cmp                 r3d, 64
    jge .filter_left
    ; Restore unfiltered out-of-frame pixels beyond maxw.
    movu                 m1, [tlq+r3*2+16*0+2]
    movu                 m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14], m1
    movu [rsp+r3*2+16*15], m2
    cmp                 r3d, 48
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*2+2]
    movu                 m2, [tlq+r3*2+16*3+2]
    movu [rsp+r3*2+16*16], m1
    movu [rsp+r3*2+16*17], m2
    cmp                 r3d, 32
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*4+2]
    movu                 m2, [tlq+r3*2+16*5+2]
    movu [rsp+r3*2+16*18], m1
    movu [rsp+r3*2+16*19], m2
    cmp                 r3d, 16
    jge .filter_left
    movu                 m1, [tlq+r3*2+16*6+2]
    movu                 m2, [tlq+r3*2+16*7+2]
    movu [rsp+r3*2+16*20], m1
    movu [rsp+r3*2+16*21], m2
    jmp .filter_left

; --- start of ipred_z3_16bpc (zone 3, angle > 180: left-column only);
;     body continues past this chunk ---
%if ARCH_X86_64
cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w
    %define            base r7-$$
    lea                  r7, [$$]
    mov              org_wd, wd
%else
cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy
    %define            base r1-$$
    %define          org_wd r5
    %define          org_wq r5
    movd                 m6, r8m ; pixel_max
    mov          [dstq+4*0], strideq        ; stash stride/width (dst not written yet)
    LEA                  r1, $$
    mov          [dstq+4*1], wd
%endif
    tzcnt                hd, hm
    movifnidn        angled, anglem
    sub                 tlq, 2
    movsxd               hq, [base+ipred_z3_16bpc_ssse3_table+hq*4]
    sub              angled, 180
2308 movddup m0, [base+pw_256] 2309 mov dyd, angled 2310 neg dyd 2311 xor angled, 0x400 2312 movddup m7, [base+pw_62] 2313 or dyq, ~0x7e 2314 lea hq, [base+ipred_z3_16bpc_ssse3_table+hq] 2315 movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] 2316 jmp hq 2317.h4: 2318 lea r4d, [angleq+88] 2319 test r4d, 0x480 2320 jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 2321 sar r4d, 9 2322 add r4d, wd 2323 cmp r4d, 8 2324 jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) 2325 mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0 2326 movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1 2327%if ARCH_X86_64 2328 movd m6, r8m 2329%endif 2330 pshufb m4, m2, m0 2331 mov tlq, rsp 2332 palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2 2333 add dyd, dyd 2334 palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3 2335 paddw m1, m2 2336 paddw m3, m5 2337 psubw m5, m1, m3 2338 mova m3, [base+z_upsample] 2339 mova [tlq+ 0], m4 2340 movd m4, dyd 2341 psraw m5, 3 2342 neg dyd 2343 paddw m1, m5 2344 pxor m5, m5 2345 lea r5d, [dyq+(16<<6)+63] ; ypos 2346 pmaxsw m1, m5 2347 pshufb m6, m0 2348 shl wd, 3 2349 pavgw m1, m5 2350 pshufb m4, m0 2351 pminsw m1, m6 2352 sub rsp, wq 2353 punpckhwd m0, m1, m2 2354 paddw m5, m4, m4 2355 punpcklwd m1, m2 2356 mova [tlq+32], m0 2357 movsd m4, m5 2358 mova [tlq+16], m1 2359.h4_upsample_loop: 2360 lea r4d, [r5+dyq] 2361 sar r5d, 6 2362 movu m2, [tlq+r5*2] 2363 lea r5d, [r4+dyq] 2364 sar r4d, 6 2365 movu m1, [tlq+r4*2] 2366 pshufb m2, m3 2367 pshufb m1, m3 2368 punpckhqdq m0, m1, m2 2369 punpcklqdq m1, m2 2370 pand m2, m7, m4 2371 psllw m2, 9 2372 psubw m1, m0 2373 pmulhrsw m1, m2 2374 paddw m4, m5 2375 paddw m0, m1 2376 mova [rsp+wq-16], m0 2377 sub wd, 16 2378 jg .h4_upsample_loop 2379 or r3d, 4*2 2380 jmp .end_transpose 2381.h4_no_upsample: 2382 mov r4d, 7 2383 test angled, 0x400 ; !enable_intra_edge_filter 2384 jnz .h4_main 2385 lea r4d, [wq+3] 2386 movd m1, r4d 2387 movd m3, angled 2388 shr angled, 8 ; is_sm << 1 2389 pxor m2, m2 2390 pshufb m1, m2 2391 pshufb m3, m2 2392 pcmpeqb m1, 
[base+z_filt_wh4] 2393 pand m1, m3 2394 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] 2395 pmovmskb r5d, m1 2396 mov r4d, 7 2397 test r5d, r5d 2398 jz .h4_main ; filter_strength == 0 2399 pshuflw m1, [tlq+2], q0000 2400 imul r5d, 0x55555555 2401 mova m2, [tlq-14] 2402 neg r4 2403 movd m3, [tlq+r4*2] 2404 shr r5d, 30 2405 movd [rsp+16*17], m1 2406 pshuflw m3, m3, q0000 2407 mova [rsp+16*16], m2 2408 lea r2, [r4-2] 2409 movq [rsp+16*17+r4*2-10], m3 2410 cmp wd, 8 2411 cmovae r4, r2 2412 lea tlq, [rsp+16*17-2] 2413 call .filter_edge 2414.h4_main: 2415 movd m4, dyd 2416 sub tlq, r4 2417 movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6 2418 sub tlq, r4 2419 shl r4d, 6 2420 movd m6, [tlq] 2421 movd m3, r4d 2422 pshufb m4, m0 2423 neg dyq 2424 pshufb m6, m0 2425 lea r5, [dyq+r4+63] ; ypos 2426 pshufb m3, m0 2427 shl wd, 3 2428 paddw m5, m4, m4 2429 sub rsp, wq 2430 psubw m3, m1 ; max_base_y 2431 movsd m4, m5 ; ypos1 ypos0 2432.h4_loop: 2433 lea r4, [r5+dyq] 2434 sar r5, 6 2435 movddup m0, [tlq+r5*2-6] 2436 movddup m1, [tlq+r5*2-8] 2437 lea r5, [r4+dyq] 2438 sar r4, 6 2439 movlps m0, [tlq+r4*2-6] 2440 movlps m1, [tlq+r4*2-8] 2441 pand m2, m7, m4 2442 psllw m2, 9 2443 psubw m1, m0 2444 pmulhrsw m1, m2 2445 pcmpgtw m2, m3, m4 2446 paddw m4, m5 2447 paddw m0, m1 2448 pand m0, m2 2449 pandn m2, m6 2450 por m0, m2 2451 mova [rsp+wq-16], m0 2452 sub wd, 16 2453 jz .h4_transpose 2454 test r5d, r5d 2455 jg .h4_loop 2456.h4_end_loop: 2457 mova [rsp+wq-16], m6 2458 sub wd, 16 2459 jg .h4_end_loop 2460.h4_transpose: 2461 or r3d, 4*2 2462 jmp .end_transpose 2463.h8: 2464 lea r4d, [angleq+88] 2465 and r4d, ~0x7f 2466 or r4d, wd 2467 cmp r4d, 8 2468 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 2469 mova m2, [tlq-30] ; g f e d c b a 9 2470 movu m1, [tlq-32] ; _ g f e d c b a 2471 movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2 2472 paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1 2473 pshufd m4, m2, q2100 ; _ _ g f e d c b 2474 paddw m1, m2 2475 movu m5, [tlq-28] ; f e d c b a 9 8 
2476 add dyd, dyd 2477 cmp wd, 8 2478 je .h8_upsample_w8 2479 pshufhw m4, m2, q1000 ; _ _ _ _ c c c b 2480.h8_upsample_w8: 2481 paddw m4, m5 2482 psubw m5, m1, m4 2483 movu m4, [tlq-18] ; a 9 8 7 6 5 4 3 2484 psraw m5, 3 2485 paddw m1, m5 2486 movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0 2487%if ARCH_X86_64 2488 movd m6, r8m ; pixel_max 2489%endif 2490 paddw m4, m5 2491 shl wd, 4 2492 psubw m5, m3, m4 2493 movd m4, dyd 2494 psraw m5, 3 2495 neg dyd 2496 paddw m3, m5 2497 pshufb m6, m0 2498 mova m5, [tlq-14] 2499 pshufb m4, m0 2500 pxor m0, m0 2501 pmaxsw m1, m0 2502 pmaxsw m3, m0 2503 mov tlq, rsp 2504 pavgw m1, m0 2505 pavgw m3, m0 2506 sub rsp, wq 2507 pminsw m1, m6 2508 pminsw m6, m3 2509 mova m3, [base+z_upsample] 2510 lea r5d, [dyq+(16<<6)+63] ; ypos 2511 punpcklwd m0, m1, m2 2512 mova [tlq+16*0], m0 2513 punpckhwd m1, m2 2514 mova [tlq+16*1], m1 2515 punpcklwd m0, m6, m5 2516 mova [tlq+16*2], m0 2517 punpckhwd m6, m5 2518 mova [tlq+16*3], m6 2519 mova m5, m4 2520.h8_upsample_loop: 2521 mov r4d, r5d 2522 sar r4d, 6 2523 movu m1, [tlq+r4*2+16*0] 2524 movu m2, [tlq+r4*2+16*1] 2525 add r5d, dyd 2526 pshufb m2, m3 2527 pshufb m1, m3 2528 punpckhqdq m0, m1, m2 2529 punpcklqdq m1, m2 2530 pand m2, m7, m4 2531 psllw m2, 9 2532 psubw m1, m0 2533 pmulhrsw m1, m2 2534 paddw m4, m5 2535 paddw m0, m1 2536 mova [rsp+wq-16], m0 2537 sub wd, 16 2538 jg .h8_upsample_loop 2539 or r3d, 8*2 2540 jmp .end_transpose 2541.h8_no_upsample: 2542 lea r4d, [wq+7] 2543 movd m1, r4d 2544 and r4d, 7 2545 or r4d, 8 ; imin(w+7, 15) 2546 test angled, 0x400 2547 jnz .h8_main 2548 movd m3, angled 2549 shr angled, 8 ; is_sm << 1 2550 pxor m2, m2 2551 pshufb m1, m2 2552 pshufb m3, m2 2553 movu m2, [base+z_filt_wh8] 2554 psrldq m4, [base+z_filt_t_w48+angleq*8], 4 2555 pcmpeqb m2, m1 2556 pand m2, m3 2557 pcmpgtb m2, m4 2558 pmovmskb r5d, m2 2559 test r5d, r5d 2560 jz .h8_main ; filter_strength == 0 2561 pshuflw m1, [tlq+2], q0000 2562 imul r5d, 0x55555555 2563 mova m2, [tlq-16*1+2] 2564 neg r4 2565 mova 
m3, [tlq-16*2+2] 2566 shr r5d, 30 2567 movd m4, [tlq+r4*2] 2568 movd [rsp+16*17], m1 2569 mova [rsp+16*16], m2 2570 pshuflw m4, m4, q0000 2571 mova [rsp+16*15], m3 2572 lea r2, [r4-2] 2573 movq [rsp+16*17+r4*2-10], m4 2574 cmp wd, 16 2575 cmovae r4, r2 2576 lea tlq, [rsp+16*17-2] 2577 call .filter_edge 2578.h8_main: 2579 sub tlq, r4 2580 movd m4, dyd 2581 sub tlq, r4 2582 shl r4d, 6 2583 movd m6, [tlq] 2584 movd m3, r4d 2585 pshufb m4, m0 2586 neg dyq 2587 pshufb m6, m0 2588 lea r5, [dyq+r4+63] 2589 pshufb m3, m0 2590 shl wd, 4 2591 mova m5, m4 2592 sub rsp, wq 2593 psubw m3, [base+z_base_inc_z2] 2594.h8_loop: 2595 mov r4, r5 2596 sar r4, 6 2597 movu m0, [tlq+r4*2-14] 2598 movu m1, [tlq+r4*2-16] 2599 pand m2, m7, m4 2600 psllw m2, 9 2601 psubw m1, m0 2602 pmulhrsw m1, m2 2603 pcmpgtw m2, m3, m4 2604 paddw m4, m5 2605 paddw m0, m1 2606 pand m0, m2 2607 pandn m2, m6 2608 por m0, m2 2609 mova [rsp+wq-16], m0 2610 sub wd, 8*2 2611 jz .h8_transpose 2612 add r5, dyq 2613 jg .h8_loop 2614.h8_end_loop: 2615 mova [rsp+wq-16], m6 2616 sub wd, 8*2 2617 jg .h8_end_loop 2618.h8_transpose: 2619 or r3d, 8*2 2620 jmp .end_transpose 2621.h16: 2622 lea r4d, [wq+15] 2623 movd m1, r4d 2624 and r4d, 15 2625 or r4d, 16 ; imin(w+15, 31) 2626 test angled, 0x400 2627 jnz .h16_main 2628 movd m3, angled 2629 shr angled, 8 ; is_sm << 1 2630 pxor m2, m2 2631 pshufb m1, m2 2632 pshufb m3, m2 2633 movq m4, [base+z_filt_t_w16+angleq*4] 2634 pcmpeqb m1, [base+z_filt_wh16] 2635 pand m1, m3 2636 pcmpgtb m1, m4 2637 pmovmskb r5d, m1 2638 test r5d, r5d 2639 jz .h16_main ; filter_strength == 0 2640 pshuflw m1, [tlq+2], q0000 2641 mova m2, [tlq-16*1+2] 2642 imul r5d, 0x24924924 2643 mova m3, [tlq-16*2+2] 2644 neg r4 2645 mova m4, [tlq-16*3+2] 2646 shr r5d, 30 2647 mova m5, [tlq-16*4+2] 2648 movd m6, [tlq+r4*2] 2649 adc r5d, -1 ; filter_strength 2650 movd [rsp+16*17], m1 2651 mova [rsp+16*16], m2 2652 mova [rsp+16*15], m3 2653 pshuflw m6, m6, q0000 2654 mova [rsp+16*14], m4 2655 mova [rsp+16*13], m5 2656 
lea r2, [r4-2] 2657 movq [rsp+16*17+r4*2-10], m6 2658 cmp wd, 32 2659 cmovae r4, r2 2660 lea tlq, [rsp+16*17-2] 2661 call .filter_edge 2662.h16_main: 2663 sub tlq, r4 2664 movd m5, dyd 2665 sub tlq, r4 2666 shl r4d, 6 2667 movd m6, [tlq] 2668 movd m3, r4d 2669 pshufb m5, m0 2670 neg dyq 2671 pshufb m6, m0 2672 lea r5, [dyq+r4+63] 2673 pshufb m3, m0 2674 shl wd, 5 2675 paddw m4, m5, [base+z_base_inc_z2] 2676 sub rsp, wq 2677 psubw m4, m3 2678.h16_loop: 2679 mov r4, r5 2680 sar r4, 6 2681 movu m0, [tlq+r4*2-14] 2682 movu m2, [tlq+r4*2-16] 2683 pand m3, m7, m4 2684 psllw m3, 9 2685 psubw m2, m0 2686 pmulhrsw m2, m3 2687 movu m1, [tlq+r4*2-30] 2688 paddw m0, m2 2689 movu m2, [tlq+r4*2-32] 2690 psubw m2, m1 2691 pmulhrsw m2, m3 2692 movddup m3, [base+pw_m512] 2693 paddw m1, m2 2694 psraw m2, m4, 15 2695 pcmpgtw m3, m4 2696 paddw m4, m5 2697 pand m0, m2 2698 pandn m2, m6 2699 pand m1, m3 2700 pandn m3, m6 2701 por m0, m2 2702 mova [rsp+wq-16*1], m0 2703 por m1, m3 2704 mova [rsp+wq-16*2], m1 2705 sub wd, 16*2 2706 jz .h16_transpose 2707 add r5, dyq 2708 jg .h16_loop 2709.h16_end_loop: 2710 mova [rsp+wq-16*1], m6 2711 mova [rsp+wq-16*2], m6 2712 sub wd, 16*2 2713 jg .h16_end_loop 2714.h16_transpose: 2715 or r3d, 16*2 2716 jmp .end_transpose 2717.h32: 2718 lea r4d, [wq+31] 2719 and r4d, 31 2720 or r4d, 32 ; imin(w+31, 63) 2721 test angled, 0x400 ; !enable_intra_edge_filter 2722 jnz .h32_main 2723 call .filter_copy 2724 lea r5, [r4-2] 2725 cmp wd, 64 2726 cmove r4, r5 2727 call .filter_edge_s3 2728.h32_main: 2729 sub tlq, r4 2730 movd m5, dyd 2731 sub tlq, r4 2732 shl r4d, 6 2733 movd m6, [tlq] 2734 movd m3, r4d 2735 pshufb m5, m0 2736 neg dyq 2737 pshufb m6, m0 2738 lea r5, [dyq+r4+63] 2739 pshufb m3, m0 2740 paddw m4, m5, [base+z_base_inc_z2] 2741 psubw m4, m3 2742.h32_loop: 2743 mov r4, r5 2744 sar r4, 6 2745 movu m0, [tlq+r4*2-14] 2746 movu m3, [tlq+r4*2-16] 2747 pand m2, m7, m4 2748 psllw m2, 9 2749 psubw m3, m0 2750 pmulhrsw m3, m2 2751 movu m1, [tlq+r4*2-30] 2752 
paddw m0, m3 2753 movu m3, [tlq+r4*2-32] 2754 psubw m3, m1 2755 pmulhrsw m3, m2 2756 sub rsp, 16*4 2757 paddw m1, m3 2758 psraw m3, m4, 15 2759 pand m0, m3 2760 pandn m3, m6 2761 por m0, m3 2762 movddup m3, [base+pw_m512] 2763 pcmpgtw m3, m4 2764 pand m1, m3 2765 pandn m3, m6 2766 mova [rsp+16*3], m0 2767 por m1, m3 2768 mova [rsp+16*2], m1 2769 movu m0, [tlq+r4*2-46] 2770 movu m3, [tlq+r4*2-48] 2771 psubw m3, m0 2772 pmulhrsw m3, m2 2773 movu m1, [tlq+r4*2-62] 2774 paddw m0, m3 2775 movu m3, [tlq+r4*2-64] 2776 psubw m3, m1 2777 pmulhrsw m3, m2 2778 movddup m2, [base+pw_m1024] 2779 paddw m1, m3 2780 movddup m3, [base+pw_m1536] 2781 pcmpgtw m2, m4 2782 pcmpgtw m3, m4 2783 paddw m4, m5 2784 pand m0, m2 2785 pandn m2, m6 2786 pand m1, m3 2787 pandn m3, m6 2788 por m0, m2 2789 mova [rsp+16*1], m0 2790 por m1, m3 2791 mova [rsp+16*0], m1 2792 dec wd 2793 jz .h32_transpose 2794 add r5, dyq 2795 jg .h32_loop 2796.h32_end_loop: 2797 sub rsp, 16*4 2798 REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0 2799 dec wd 2800 jg .h32_end_loop 2801.h32_transpose: 2802 or r3d, 32*2 2803 jmp .end_transpose 2804.h64: 2805 lea r4d, [wq+63] 2806 test angled, 0x400 ; !enable_intra_edge_filter 2807 jnz .h64_main 2808 call .filter_copy 2809 call .filter_edge_s3 2810.h64_main: 2811 sub tlq, r4 2812 movd m5, dyd 2813 sub tlq, r4 2814 shl r4d, 6 2815 movd m6, [tlq] 2816 movd m3, r4d 2817 pshufb m5, m0 2818 neg dyq 2819 pshufb m6, m0 2820 lea r5, [dyq+r4+63] 2821 pshufb m3, m0 2822 paddw m4, m5, [base+z_base_inc_z2] 2823 psubw m4, m3 2824.h64_loop: 2825 mov r4, r5 2826 sar r4, 6 2827 movu m0, [tlq+r4*2- 14] 2828 movu m3, [tlq+r4*2- 16] 2829 pand m2, m7, m4 2830 psllw m2, 9 2831 psubw m3, m0 2832 pmulhrsw m3, m2 2833 movu m1, [tlq+r4*2- 30] 2834 paddw m0, m3 2835 movu m3, [tlq+r4*2- 32] 2836 psubw m3, m1 2837 pmulhrsw m3, m2 2838 sub rsp, 16*8 2839 paddw m1, m3 2840 psraw m3, m4, 15 2841 pand m0, m3 2842 pandn m3, m6 2843 por m0, m3 2844 movddup m3, [base+pw_m512] 2845 pcmpgtw m3, m4 2846 pand m1, m3 2847 
pandn m3, m6 2848 mova [rsp+16*7], m0 2849 por m1, m3 2850 mova [rsp+16*6], m1 2851 movu m0, [tlq+r4*2- 46] 2852 movu m3, [tlq+r4*2- 48] 2853 psubw m3, m0 2854 pmulhrsw m3, m2 2855 movu m1, [tlq+r4*2- 62] 2856 paddw m0, m3 2857 movu m3, [tlq+r4*2- 64] 2858 psubw m3, m1 2859 pmulhrsw m3, m2 2860 paddw m1, m3 2861 movddup m3, [base+pw_m1024] 2862 pcmpgtw m3, m4 2863 pand m0, m3 2864 pandn m3, m6 2865 por m0, m3 2866 movddup m3, [base+pw_m1536] 2867 pcmpgtw m3, m4 2868 pand m1, m3 2869 pandn m3, m6 2870 mova [rsp+16*5], m0 2871 por m1, m3 2872 mova [rsp+16*4], m1 2873 movu m0, [tlq+r4*2- 78] 2874 movu m3, [tlq+r4*2- 80] 2875 psubw m3, m0 2876 pmulhrsw m3, m2 2877 movu m1, [tlq+r4*2- 94] 2878 paddw m0, m3 2879 movu m3, [tlq+r4*2- 96] 2880 psubw m3, m1 2881 pmulhrsw m3, m2 2882 paddw m1, m3 2883 movddup m3, [base+pw_m2048] 2884 pcmpgtw m3, m4 2885 pand m0, m3 2886 pandn m3, m6 2887 por m0, m3 2888 movddup m3, [base+pw_m2560] 2889 pcmpgtw m3, m4 2890 pand m1, m3 2891 pandn m3, m6 2892 mova [rsp+16*3], m0 2893 por m1, m3 2894 mova [rsp+16*2], m1 2895 movu m0, [tlq+r4*2-110] 2896 movu m3, [tlq+r4*2-112] 2897 psubw m3, m0 2898 pmulhrsw m3, m2 2899 movu m1, [tlq+r4*2-126] 2900 paddw m0, m3 2901 movu m3, [tlq+r4*2-128] 2902 psubw m3, m1 2903 pmulhrsw m3, m2 2904 movddup m2, [base+pw_m3072] 2905 paddw m1, m3 2906 movddup m3, [base+pw_m3584] 2907 pcmpgtw m2, m4 2908 pcmpgtw m3, m4 2909 paddw m4, m5 2910 pand m0, m2 2911 pandn m2, m6 2912 pand m1, m3 2913 pandn m3, m6 2914 por m0, m2 2915 mova [rsp+16*1], m0 2916 por m1, m3 2917 mova [rsp+16*0], m1 2918 dec wd 2919 jz .h64_transpose 2920 add r5, dyq 2921 jg .h64_loop 2922.h64_end_loop: 2923 sub rsp, 16*8 2924 REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0 2925 dec wd 2926 jg .h64_end_loop 2927.h64_transpose: 2928 add r3d, 64*2 2929.end_transpose: 2930%if ARCH_X86_64 2931 lea r7, [strideq*3] 2932%else 2933 mov strideq, [dstq+4*0] 2934 mov org_wd, [dstq+4*1] 2935%endif 2936 lea r4d, [r3*3] 2937.end_transpose_loop: 2938 lea r2, 
[rsp+r3-8] 2939 lea r6, [dstq+org_wq*2-8] 2940.end_transpose_loop_y: 2941 movq m0, [r2+r4 ] 2942 movq m1, [r2+r3*2] 2943 movq m2, [r2+r3*1] 2944 movq m3, [r2+r3*0] 2945 sub r2, 8 2946 punpcklwd m0, m1 2947 punpcklwd m2, m3 2948 punpckhdq m1, m0, m2 2949 punpckldq m0, m2 2950 movhps [r6+strideq*0], m1 2951 movq [r6+strideq*1], m1 2952%if ARCH_X86_64 2953 movhps [r6+strideq*2], m0 2954 movq [r6+r7 ], m0 2955 lea r6, [r6+strideq*4] 2956%else 2957 lea r6, [r6+strideq*2] 2958 movhps [r6+strideq*0], m0 2959 movq [r6+strideq*1], m0 2960 lea r6, [r6+strideq*2] 2961%endif 2962 cmp r2, rsp 2963 jae .end_transpose_loop_y 2964 lea rsp, [rsp+r3*4] 2965 sub org_wd, 4 2966 jg .end_transpose_loop 2967 RET 2968.filter_copy: 2969 neg r4 2970 pshuflw m2, [tlq+2], q0000 2971 xor r5d, r5d 2972 pshuflw m3, [tlq+r4*2], q0000 2973 movq [rsp+gprsize+16*17], m2 2974.filter_copy_loop: 2975 mova m1, [tlq+r5*2-16*1+2] 2976 mova m2, [tlq+r5*2-16*2+2] 2977 sub r5, 16 2978 mova [rsp+r5*2+gprsize+16*18], m1 2979 mova [rsp+r5*2+gprsize+16*17], m2 2980 cmp r5d, r4d 2981 jg .filter_copy_loop 2982 lea tlq, [rsp+gprsize+16*17-2] 2983 movq [tlq+r4*2-8], m3 2984 ret 2985.filter_edge: 2986 cmp r5d, 3 2987 je .filter_edge_s3 2988 movddup m4, [base+z_filt_k+r5*8-8] 2989 movddup m5, [base+z_filt_k+r5*8+8] 2990 xor r5d, r5d 2991 movddup m6, [base+pw_8] 2992 movu m2, [tlq-12] 2993 jmp .filter_edge_start 2994.filter_edge_loop: 2995 movu m2, [tlq+r5*2-12] 2996 mova [tlq+r5*2+2], m1 2997.filter_edge_start: 2998 pmullw m1, m4, [tlq+r5*2-14] 2999 movu m3, [tlq+r5*2-16] 3000 sub r5, 8 3001 paddw m2, m3 3002 pmullw m2, m5 3003 paddw m1, m6 3004 paddw m1, m2 3005 psrlw m1, 4 3006 cmp r5d, r4d 3007 jg .filter_edge_loop 3008 mova [tlq+r5*2+2], m1 3009 neg r4d 3010 ret 3011.filter_edge_s3: 3012 movddup m5, [base+pw_3] 3013 xor r5d, r5d 3014 movu m2, [tlq-12] 3015 movu m3, [tlq-10] 3016 jmp .filter_edge_s3_start 3017.filter_edge_s3_loop: 3018 movu m2, [tlq+r5*2-12] 3019 movu m3, [tlq+r5*2-10] 3020 mova [tlq+r5*2+2], m1 
; Tail of the .filter_edge_s3 helper (strength-3 intra-edge smoothing,
; (x[-2] + 2*(x[-1]+x[0]+x[1]) + ... style 5-tap average, >> 2).
; The entry point and the companion .filter_edge helper are above this chunk.
.filter_edge_s3_start:
    paddw                m2, [tlq+r5*2-14]
    paddw                m3, m5                 ; m5 = pw_3, rounding bias
    movu                 m1, [tlq+r5*2-16]
    movu                 m4, [tlq+r5*2-18]
    sub                  r5, 8                  ; 8 pixels per iteration
    paddw                m1, m2
    pavgw                m3, m4
    paddw                m1, m3
    psrlw                m1, 2
    cmp                 r5d, r4d
    jg .filter_edge_s3_loop
    mova  [tlq+r5*2+2], m1
    neg                 r4d
    ret

;-----------------------------------------------------------------------
; ipred_filter_16bpc (SSSE3)
; Filter (recursive) intra prediction. Each 4x2 output block is a
; weighted sum of 7 neighbouring pixels (top-left, 4 top, 2 left) using
; the 8-bit tap set selected by the filter index.
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
%else
; x86-32 has only 8 xmm regs; spill the 8 tap registers to the stack.
cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
%define  m8 [esp+16*0]
%define  m9 [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%endif
%define base r6-$$
    movifnidn            hd, hm
    movd                 m6, r8m               ; bitdepth_max
%ifidn filterd, filterm
    movzx           filterd, filterb
%else
    movzx           filterd, byte filterm
%endif
    LEA                  r6, $$
    shl             filterd, 6                 ; 64 bytes of taps per filter set
    movu                 m0, [tlq-6]           ; __ l1 l0 tl t0 t1 t2 t3
    mova                 m1, [base+filter_intra_taps+filterq+16*0]
    mova                 m2, [base+filter_intra_taps+filterq+16*1]
    mova                 m3, [base+filter_intra_taps+filterq+16*2]
    mova                 m4, [base+filter_intra_taps+filterq+16*3]
    pxor                 m5, m5
%if ARCH_X86_64
    punpcklbw            m8, m5, m1            ; place 8-bit coefficients in the upper
    punpckhbw            m9, m5, m1            ; half of each 16-bit word to avoid
    punpcklbw           m10, m5, m2            ; having to perform sign-extension.
    punpckhbw           m11, m5, m2
    punpcklbw           m12, m5, m3
    punpckhbw           m13, m5, m3
    punpcklbw           m14, m5, m4
    punpckhbw           m15, m5, m4
%else
    ; same expansion, bounced through m7 into the stack slots
    punpcklbw            m7, m5, m1
    mova                 m8, m7
    punpckhbw            m7, m5, m1
    mova                 m9, m7
    punpcklbw            m7, m5, m2
    mova                m10, m7
    punpckhbw            m7, m5, m2
    mova                m11, m7
    punpcklbw            m7, m5, m3
    mova                m12, m7
    punpckhbw            m7, m5, m3
    mova                m13, m7
    punpcklbw            m7, m5, m4
    mova                m14, m7
    punpckhbw            m7, m5, m4
    mova                m15, m7
%endif
    mova                 m7, [base+filter_shuf]
    add                  hd, hd                ; h *= 2 (byte count per column)
    mov                  r5, dstq
    pshuflw              m6, m6, q0000
    mov                  r6, tlq
    punpcklqdq           m6, m6                ; broadcast bitdepth_max
    sub                 tlq, hq
; First (left) column of 4x2 blocks: neighbours come from tl[] directly.
.left_loop:
    pshufb               m0, m7                ; tl t0 t1 t2 t3 l0 l1 __
    pshufd               m1, m0, q0000
    pmaddwd              m2, m8, m1
    pmaddwd              m1, m9
    pshufd               m4, m0, q1111
    pmaddwd              m3, m10, m4
    pmaddwd              m4, m11
    paddd                m2, m3
    paddd                m1, m4
    pshufd               m4, m0, q2222
    pmaddwd              m3, m12, m4
    pmaddwd              m4, m13
    paddd                m2, m3
    paddd                m1, m4
    pshufd               m3, m0, q3333
    pmaddwd              m0, m14, m3
    pmaddwd              m3, m15
    paddd                m0, m2
    paddd                m1, m3
    psrad                m0, 11                ; x >> 3
    psrad                m1, 11
    packssdw             m0, m1
    pmaxsw               m0, m5                ; clamp to 0
    pavgw                m0, m5                ; (x + 8) >> 4
    pminsw               m0, m6                ; clamp to bitdepth_max
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movlps               m0, [tlq+hq-10]       ; next two left pixels
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2*2
    jg .left_loop
    sub                  wd, 4
    jz .end
    sub                 tld, r6d               ; -h*2
    sub                  r6, r5                ; tl-dst
; Remaining columns: left neighbours are the previous column's output,
; re-read from dst via pinsrw/movddup.
.right_loop0:
    add                  r5, 8
    mov                  hd, tld
    movu                 m0, [r5+r6]           ; tl t0 t1 t2 t3 __ __ __
    mov                dstq, r5
.right_loop:
    pshufd               m2, m0, q0000
    pmaddwd              m1, m8, m2
    pmaddwd              m2, m9
    pshufd               m4, m0, q1111
    pmaddwd              m3, m10, m4
    pmaddwd              m4, m11
    pinsrw               m0, [dstq+strideq*0-2], 5  ; l0 from previous column
    paddd                m1, m3
    paddd                m2, m4
    pshufd               m0, m0, q2222
    movddup              m4, [dstq+strideq*1-8]     ; row below previous column
    pmaddwd              m3, m12, m0
    pmaddwd              m0, m13
    paddd                m1, m3
    paddd                m0, m2
    pshuflw              m2, m4, q3333              ; l1
    punpcklwd            m2, m5
    pmaddwd              m3, m14, m2
    pmaddwd              m2, m15
    paddd                m1, m3
    paddd                m0, m2
    psrad                m1, 11
    psrad                m0, 11
    packssdw             m0, m1
    pmaxsw               m0, m5
    pavgw                m0, m5
    pminsw               m0, m6
    movhps [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m0
    palignr              m0, m4, 14            ; roll output into neighbour reg
    lea                dstq, [dstq+strideq*2]
    add                  hd, 2*2
    jl .right_loop
    sub                  wd, 4
    jg .right_loop0
.end:
    RET

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

;-----------------------------------------------------------------------
; ipred_cfl_top_16bpc: CfL DC from the top edge only. Sets up the same
; state as ipred_cfl_left and reuses its .start path (tlq advanced past
; the top-left pixel, m4 = w for rounding, m5 = log2(w) shift).
;-----------------------------------------------------------------------
cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    movd                 m4, wd
    tzcnt                wd, wd
    movifnidn            hd, hm
    add                 tlq, 2
    movsxd               r6, [t0+wq*4]
    movd                 m5, wd
    jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)

;-----------------------------------------------------------------------
; ipred_cfl_left_16bpc: CfL DC from the left edge only.
; DC = (sum(edge) + n/2) >> log2(n), then dispatch into the splat table.
;-----------------------------------------------------------------------
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    tzcnt                wd, wm
    lea                 r6d, [hq*2]
    movd                 m4, hd
    sub                 tlq, r6                ; point at start of left edge
    tzcnt               r6d, hd
    movd                 m5, r6d
    movsxd               r6, [t0+r6*4]
.start:
    movd                 m7, r7m               ; bitdepth_max
    movu                 m0, [tlq]
    add                  r6, t0
    add                  t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
    movsxd               wq, [t0+wq*4]
    pxor                 m6, m6
    pshuflw              m7, m7, q0000
    pcmpeqw              m3, m3                ; all -1, used by pmaddwd to negate sums
    add                  wq, t0
    movifnidn           acq, acmp
    pavgw                m4, m6                ; n/2 rounding term
    punpcklqdq           m7, m7
    jmp r6
; .h32/.h16/.h8/.h4 fall through, accumulating the edge sum into m0.
.h32:
    movu                 m1, [tlq+48]
    movu                 m2, [tlq+32]
    paddw                m0, m1
    paddw                m0, m2
.h16:
    movu                 m1, [tlq+16]
    paddw                m0, m1
.h8:
    pshufd               m1, m0, q1032
    paddw                m0, m1
.h4:
    pmaddwd              m0, m3                ; -horizontal pair sums
    psubd                m4, m0                ; n/2 + sum
    pshuflw              m0, m4, q1032
    paddd                m0, m4
    psrld                m0, m5                ; >> log2(n)
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                ; broadcast DC
    jmp wq

; Apply CfL to one vector of AC coefficients:
; dst = clamp(dc + sign(ac) * ((|ac| * |alpha|<<9 + rnd) >> 15), 0, max)
; m0 = dc, m1 = alpha (signed), m2 = |alpha| << 9, m6 = 0, m7 = max.
%macro IPRED_CFL 2 ; dst, src
    pabsw               m%1, m%2
    pmulhrsw            m%1, m2
    psignw              m%2, m1
    psignw              m%1, m%2
    paddw               m%1, m0
    pmaxsw              m%1, m6
    pminsw              m%1, m7
%endmacro

;-----------------------------------------------------------------------
; ipred_cfl_16bpc (SSSE3)
; Full CfL prediction: DC over both top and left edges, then
; dc + alpha*ac per pixel via the IPRED_CFL macro.
; When w != h the divisor w+h is not a power of two; the sum is then
; scaled by a fixed-point reciprocal (0xAAAB ~ 1/3, 0x6667 ~ 1/5 after
; the accompanying shifts) instead of a plain shift.
;-----------------------------------------------------------------------
cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]
    movd                 m4, t0d
    tzcnt               t0d, t0d
    movd                 m5, t0d               ; log2(w+h) shift
    LEA                  t0, ipred_cfl_16bpc_ssse3_table
    tzcnt                wd, wd
    movd                 m7, r7m               ; bitdepth_max
    movsxd               r6, [t0+r6*4]         ; .h* handler (left edge)
    movsxd               wq, [t0+wq*4+4*4]     ; .w* handler (top edge)
    psrlw                m4, 1                 ; (w+h)/2 rounding term
    pxor                 m6, m6
    pshuflw              m7, m7, q0000
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    pcmpeqw              m3, m3                ; all -1 for pmaddwd negated sums
    punpcklqdq           m7, m7
    jmp r6
.h4:
    movq                 m0, [tlq-8]
    jmp wq
.w4:
    movq                 m1, [tlq+2]
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0                ; rnd + sum(top) + sum(left)
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    cmp                  hd, 4
    jg .w4_mul
    psrld                m0, 3                 ; w == h == 4: plain shift
    jmp .w4_end
.w4_mul:
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 16
    cmove               r6d, r2d
    movd                 m1, r6d
    psrld                m0, 2
    pmulhuw              m0, m1                ; divide by non-pow2 count
    psrlw                m0, 1
.w4_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0                ; broadcast DC
.s4:
    movd                 m1, alpham
    lea                  r6, [strideq*3]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9                 ; |alpha| << 9 for pmulhrsw
.s4_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    movq   [dstq+strideq*0], m3
    movhps [dstq+strideq*1], m3
    movq   [dstq+strideq*2], m4
    movhps [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4_loop
    RET
.h8:
    mova                 m0, [tlq-16]
    jmp wq
.w8:
    movu                 m1, [tlq+2]
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 32
    cmove               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s8:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s8_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova [dstq+strideq*0], m3
    mova [dstq+strideq*1], m4
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s8_loop
    RET
.h16:
    mova                 m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp wq
.w16:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s16:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s16_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    add                dstq, strideq
    dec                  hd
    jg .s16_loop
    RET
.h32:
    mova                 m0, [tlq-64]
    paddw                m0, [tlq-48]
    paddw                m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp wq
.w32:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    movu                 m2, [tlq+34]
    paddw                m1, m2
    movu                 m2, [tlq+50]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 32
    je .w32_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 8
    cmove               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s32:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s32_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    mova                 m4, [acq+16*2]
    mova                 m5, [acq+16*3]
    add                 acq, 16*4
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*2], m3
    mova        [dstq+16*3], m4
    add                dstq, strideq
    dec                  hd
    jg .s32_loop
    RET

;-----------------------------------------------------------------------
; ipred_cfl_128_16bpc: CfL with a fixed mid-grey DC of 1 << (bd - 1),
; fetched from the pw_512 constant row indexed by bitdepth (r7m >> 11).
;-----------------------------------------------------------------------
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
    tzcnt                wd, wm
    LEA                  t0, ipred_cfl_splat_16bpc_ssse3_table
    mov                 r6d, r7m
    movifnidn            hd, hm
    shr                 r6d, 11                ; bitdepth selector (10/12 bit)
    movd                 m7, r7m
    movsxd               wq, [t0+wq*4]
    movddup              m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
    pshuflw              m7, m7, q0000
    pxor                 m6, m6
    add                  wq, t0
    movifnidn           acq, acmp
    punpcklqdq           m7, m7
    jmp wq

;-----------------------------------------------------------------------
; ipred_cfl_ac_420_16bpc (SSSE3)
; 2x2 subsample the luma plane into the AC buffer (pmaddwd with pw_2
; does the horizontal pair sum, two rows are then added), accumulating
; the grand total in m4; the shared .dc epilogue subtracts the mean.
; wpad/hpad request replication padding on the right/bottom edges.
;-----------------------------------------------------------------------
cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; build the pw_2 constant without a data load (no base reg available)
    pcmpeqw              m5, m5
    pabsw                m5, m5
    paddw                m5, m5
%else
    movddup              m5, [pw_2]
%endif
    mov                  hd, hm
    shl               hpadd, 2
    pxor                 m4, m4                ; sum accumulator
    sub                  hd, hpadd
    cmp            dword wm, 8
    mov                  r5, acq               ; remember buffer start for .dc
    jg .w16
    je .w8
    lea                  r3, [strideq*3]
.w4_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    pmaddwd              m2, m5, [ypxq+strideq*2]
    pmaddwd              m3, m5, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    paddd                m0, m1
    paddd                m2, m3
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .dc
    punpckhqdq           m0, m0                ; replicate last output row
    pslld                m2, 2
.w4_hpad:
    mova        [acq+16*0], m0
    paddd                m4, m2
    mova        [acq+16*1], m0
    add                 acq, 16*2
    sub               hpadd, 4
    jg .w4_hpad
    jmp .dc
.w8:
%if ARCH_X86_32
    cmp          dword wpadm, 0
%else
    test              wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m2, m5, [ypxq+strideq*1+16*0]
    pmaddwd              m1, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m2
    paddd                m1, m3
    paddd                m2, m0, m1            ; row sum for hpad replication
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz .dc
    pslld                m2, 2
    mova                 m1, m0
    jmp .hpad
.w8_wpad1:
    ; right half padded: compute left 4 outputs, replicate the last one
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1
    pshufd               m1, m0, q3333
    paddd                m2, m0, m1
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_wpad1
    jmp .w8_hpad
; wpad variants replicate the last valid output across the padded lanes.
.w16_wpad3:
    pshufd               m3, m0, q3333
    mova                 m1, m3
    mova                 m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd               m1, m3, q3333
    mova                 m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd               m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    WIN64_SPILL_XMM 7
.w16_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*0]
    paddd                m0, m6
    cmp               wpadd, 2
    jg .w16_wpad3
    pmaddwd              m3, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*1]
    paddd                m3, m6
    je .w16_wpad2
    pmaddwd              m1, m5, [ypxq+strideq*0+16*2]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*2]
    paddd                m1, m6
    jp .w16_wpad1                              ; parity flag distinguishes wpad==1
    pmaddwd              m2, m5, [ypxq+strideq*0+16*3]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*3]
    paddd                m2, m6
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    paddd                m6, m0, m3
    packssdw             m0, m3
    paddd                m6, m1
    mova        [acq+16*0], m0
    packssdw             m1, m2
    paddd                m2, m6
    mova        [acq+16*1], m1
    add                 acq, 16*2
    paddd                m4, m2
    dec                  hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add               hpadd, hpadd
    jz .dc
    paddd                m2, m2
; Shared bottom-padding loop: repeat the last row (m0:m1), keep the sum
; in m4 up to date. Also jumped into from the 422/444 variants.
.hpad:
    mova        [acq+16*0], m0
    mova        [acq+16*1], m1
    paddd                m4, m2
    mova        [acq+16*2], m0
    mova        [acq+16*3], m1
    add                 acq, 16*4
    sub               hpadd, 4
    jg .hpad
; Subtract the rounded mean from every entry so the buffer holds pure AC.
.dc:
    sub                  r5, acq               ; -w*h*2
    pshufd               m2, m4, q1032
    tzcnt               r1d, r5d
    paddd                m2, m4
    sub                 r1d, 2                 ; log2(w*h) - tzcnt adjustment
    pshufd               m4, m2, q2301
    movd                 m0, r1d
    paddd                m2, m4                ; horizontal total
    psrld                m2, m0
    pxor                 m0, m0
    pavgw                m2, m0                ; round
    packssdw             m2, m2                ; broadcast mean
.dc_loop:
    mova                 m0, [acq+r5+16*0]
    mova                 m1, [acq+r5+16*1]
    psubw                m0, m2
    psubw                m1, m2
    mova    [acq+r5+16*0], m0
    mova    [acq+r5+16*1], m1
    add                  r5, 16*2
    jl .dc_loop
    RET

;-----------------------------------------------------------------------
; ipred_cfl_ac_422_16bpc (SSSE3)
; Horizontal-only 2:1 subsampling (pmaddwd with pw_4), one source row
; per output row. Shares .dc/.hpad with the 420 version via mangle().
;-----------------------------------------------------------------------
cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; build pw_4 without a data load
    pcmpeqw              m5, m5
    pabsw                m5, m5
    psllw                m5, 2
%else
    movddup              m5, [pw_4]
%endif
    mov                  hd, hm
    shl               hpadd, 2
    pxor                 m4, m4                ; sum accumulator
    sub                  hd, hpadd
    cmp            dword wm, 8
    mov                  r5, acq
    jg .w16
    je .w8
    lea                  r3, [strideq*3]
.w4_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m3, m5, [ypxq+strideq*1]
    pmaddwd              m1, m5, [ypxq+strideq*2]
    pmaddwd              m2, m5, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    paddd                m4, m0
    packssdw             m0, m3
    paddd                m3, m1
    packssdw             m1, m2
    paddd                m4, m2
    paddd                m4, m3
    mova        [acq+16*0], m0
    mova        [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq           m1, m1                ; replicate last row
    pslld                m2, 3
    mova        [acq+16*0], m1
    mova        [acq+16*1], m1
    paddd                m4, m2
    mova        [acq+16*2], m1
    mova        [acq+16*3], m1
    add                 acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
%if ARCH_X86_32
    cmp          dword wpadm, 0
%else
    test              wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m2, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m1, m5, [ypxq+strideq*1+16*0]
    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    mova        [acq+16*0], m0
    paddd                m2, m1, m3
    packssdw             m1, m3
    paddd                m4, m2
    mova        [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                m2, 2
    mova                 m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w8_wpad1:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    pshufd               m2, m0, q3333         ; replicate into padded lanes
    pshufd               m3, m1, q3333
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    paddd                m2, m1, m3
    packssdw             m1, m3
    paddd                m4, m2
    mova        [acq+16*0], m0
    mova        [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad3:
    pshufd               m3, m0, q3333
    mova                 m1, m3
    mova                 m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd               m1, m3, q3333
    mova                 m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd               m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    WIN64_SPILL_XMM 7
.w16_loop:
    pmaddwd              m0, m5, [ypxq+16*0]
    cmp               wpadd, 2
    jg .w16_wpad3
    pmaddwd              m3, m5, [ypxq+16*1]
    je .w16_wpad2
    pmaddwd              m1, m5, [ypxq+16*2]
    jp .w16_wpad1
    pmaddwd              m2, m5, [ypxq+16*3]
.w16_wpad_end:
    add                ypxq, strideq
    paddd                m6, m0, m3
    packssdw             m0, m3
    mova        [acq+16*0], m0
    paddd                m6, m1
    packssdw             m1, m2
    paddd                m2, m6
    mova        [acq+16*1], m1
    add                 acq, 16*2
    paddd                m4, m2
    dec                  hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add               hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad

;-----------------------------------------------------------------------
; ipred_cfl_ac_444_16bpc (SSSE3)
; No subsampling: each luma sample is scaled by 8 (psllw 3); row sums
; for the DC pass are gathered with pmaddwd against pw_1.
; Shares .dc/.hpad with the 420 version via mangle().
;-----------------------------------------------------------------------
cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
    LEA                  r6, ipred_cfl_ac_444_16bpc_ssse3_table
    tzcnt                wd, wm
    movifnidn         hpadd, hpadm
    pxor                 m4, m4                ; sum accumulator
    movsxd               wq, [r6+wq*4]
    movddup              m5, [base+pw_1]
    add                  wq, r6
    mov                  hd, hm
    shl               hpadd, 2
    sub                  hd, hpadd
    jmp wq
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq               ; remember buffer start for .dc
.w4_loop:
    movq                 m0, [ypxq+strideq*0]
    movhps               m0, [ypxq+strideq*1]
    movq                 m1, [ypxq+strideq*2]
    movhps               m1, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    psllw                m0, 3                 ; scale samples by 8
    psllw                m1, 3
    mova        [acq+16*0], m0
    pmaddwd              m0, m5                ; row sums for the mean
    mova        [acq+16*1], m1
    pmaddwd              m2, m5, m1
    add                 acq, 16*2
    paddd                m4, m0
    paddd                m4, m2
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq           m1, m1                ; replicate last row downward
    mova        [acq+16*0], m1
    pslld                m2, 2
    mova        [acq+16*1], m1
    punpckhqdq           m2, m2
    mova        [acq+16*2], m1
    paddd                m4, m2
    mova        [acq+16*3], m1
    add                 acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
    mov                  r5, acq
.w8_loop:
    mova                 m0, [ypxq+strideq*0]
    mova                 m1, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    psllw                m0, 3
    psllw                m1, 3
    mova        [acq+16*0], m0
    pmaddwd              m0, m5
    mova        [acq+16*1], m1
    pmaddwd              m2, m5, m1
    add                 acq, 16*2
    paddd                m4, m0
    paddd                m4, m2
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                m2, 2
    mova                 m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w16_wpad2:
    ; right half padded: replicate the last valid sample of each row
    pshufhw              m3, m2, q3333
    pshufhw              m1, m0, q3333
    punpckhqdq           m3, m3
    punpckhqdq           m1, m1
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    mov                  r5, acq
.w16_loop:
    mova                 m2, [ypxq+strideq*0+16*0]
    mova                 m0, [ypxq+strideq*1+16*0]
    psllw                m2, 3
    psllw                m0, 3
    test              wpadd, wpadd
    jnz .w16_wpad2
    mova                 m3, [ypxq+strideq*0+16*1]
    mova                 m1, [ypxq+strideq*1+16*1]
    psllw                m3, 3
    psllw                m1, 3
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    mova        [acq+16*0], m2
    pmaddwd              m2, m5
    mova        [acq+16*1], m3
    pmaddwd              m3, m5
    paddd                m4, m2
    pmaddwd              m2, m5, m0
    mova        [acq+16*2], m0
    paddd                m4, m3
    pmaddwd              m3, m5, m1
    mova        [acq+16*3], m1
    add                 acq, 16*4
    paddd                m2, m3
    paddd                m4, m2
    sub                  hd, 2
    jg .w16_loop
    add               hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
; wpad variants: fill padded 8-sample groups with the last valid sample.
.w32_wpad6:
    pshufhw              m1, m0, q3333
    punpckhqdq           m1, m1
    mova                 m2, m1
    mova                 m3, m1
    jmp .w32_wpad_end
.w32_wpad4:
    pshufhw              m2, m1, q3333
    punpckhqdq           m2, m2
    mova                 m3, m2
    jmp .w32_wpad_end
.w32_wpad2:
    pshufhw              m3, m2, q3333
    punpckhqdq           m3, m3
    jmp .w32_wpad_end
.w32:
    movifnidn         wpadd, wpadm
    mov                  r5, acq
    WIN64_SPILL_XMM 8
.w32_loop:
    mova                 m0, [ypxq+16*0]
    psllw                m0, 3
    cmp               wpadd, 4
    jg .w32_wpad6
    mova                 m1, [ypxq+16*1]
    psllw                m1, 3
    je .w32_wpad4
    mova                 m2, [ypxq+16*2]
    psllw                m2, 3
    jnp .w32_wpad2                             ; parity flag distinguishes wpad==2
    mova                 m3, [ypxq+16*3]
    psllw                m3, 3
.w32_wpad_end:
    add                ypxq, strideq
    pmaddwd              m6, m5, m0
    mova        [acq+16*0], m0
    pmaddwd              m7, m5, m1
    mova        [acq+16*1], m1
    paddd                m6, m7
    pmaddwd              m7, m5, m2
    mova        [acq+16*2], m2
    paddd                m6, m7
    pmaddwd              m7, m5, m3
    mova        [acq+16*3], m3
    add                 acq, 16*4
    paddd                m6, m7                ; m6 = row sum (kept for hpad)
    paddd                m4, m6
    dec                  hd
    jg .w32_loop
%if WIN64
    ; keep the row sum (m6) alive across the xmm6/xmm7 restore
    mova                 m5, m6
    WIN64_RESTORE_XMM
    SWAP 5, 6
%endif
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w32_hpad_loop:
    mova        [acq+16*0], m0
    mova        [acq+16*1], m1
    paddd                m4, m6
    mova        [acq+16*2], m2
    mova        [acq+16*3], m3
    add                 acq, 16*4
    dec               hpadd
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
;-----------------------------------------------------------------------
; pal_pred_16bpc (SSSE3)
; Palette prediction: expand packed 4-bit palette indices into 16-bit
; pixels. The 8-entry 16-bit palette is split byte-wise via
; pal_pred_shuf: m4 holds (low bytes | high bytes), m5 the swapped
; halves, so a pshufb on each yields the low/high byte of every looked
; up pixel, recombined with punpcklbw/punpckhbw.
;-----------------------------------------------------------------------
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
    %define hd r2d
%endif
    mova                 m4, [palq]
    LEA                  r2, pal_pred_16bpc_ssse3_table
    tzcnt                wd, wm
    pshufb               m4, [base+pal_pred_shuf]
    movsxd               wq, [r2+wq*4]
    pshufd               m5, m4, q1032         ; high-byte lookup table
    add                  wq, r2
    movifnidn            hd, hm
    jmp wq
.w4:
    movq                 m0, [idxq]            ; 16 packed 4-bit indices
    add                idxq, 8
    psrlw                m1, m0, 4
    punpcklbw            m0, m1                ; one index per byte
    pshufb               m1, m4, m0            ; low bytes
    pshufb               m2, m5, m0            ; high bytes
    punpcklbw            m0, m1, m2            ; interleave into 16-bit pixels
    punpckhbw            m1, m2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    movu                 m3, [idxq]            ; 32 indices = 4 rows of 8
    add                idxq, 16
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    movu                 m3, [idxq]            ; 2 rows of 16
    add                idxq, 16
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+ 0], m0
    mova        [dstq+16], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova [dstq+strideq+ 0], m0
    mova [dstq+strideq+16], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32:
    movu                 m3, [idxq]            ; 1 row of 32
    add                idxq, 16
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
.w64:
    movu                 m3, [idxq+16*0]       ; 1 row of 64 needs two loads
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    movu                 m3, [idxq+16*1]
    add                idxq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    psrlw                m1, m3, 4
    punpcklbw            m0, m3, m1
    punpckhbw            m3, m1
    pshufb               m1, m4, m0
    pshufb               m2, m5, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    pshufb               m1, m4, m3
    pshufb               m2, m5, m3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    add                dstq, strideq
    dec                  hd
    jg .w64
    RET