1; Copyright © 2018-2021, VideoLAN and dav1d authors 2; Copyright © 2018, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; Emit interleaved (w - 128, 127 - w) signed byte pairs for each argument so
; that a single pmaddubsw can evaluate w*a + (256-w)*b (pmaddubsw requires
; one signed operand, hence the -128/+127 offsets).
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro

; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
smooth_weights: SMOOTH_WEIGHT_TABLE \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4

; pshufb control vectors and per-mode constants
ipred_v_shuf:     db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
ipred_h_shuf:     db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
ipred_paeth_shuf: db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
z_upsample1:      db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
z_upsample2:      db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
z_transpose4:     db  8, 12,  0,  4,  9, 13,  1,  5, 10, 14,  2,  6, 11, 15,  3,  7
z3_shuf:          db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
z3_shuf_h4:       db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
filter_shuf1:     db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
filter_shuf2:     db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
z_filter_wh4:     db  7,  7, 19,  7,
z_filter_wh8:     db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
pd_32768:         dd 32768
z3_filter_k_tail: db 64,  0, 64,  0, 64,  0, 56,  8
z1_shuf_w4:       db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
pb_0to15:         db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
pb_15to0:         db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
z_base_inc:       dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
z3_base_inc:      dw  7*64,  6*64,  5*64,  4*64,  3*64,  2*64,  1*64,  0*64
z_filter_wh16:    db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
z_filter_t_w48:   db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
                  db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
z_filter_t_w16:   db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
z_filter_s:       db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
                  db  7,  8,  8,  9,  9, 10, 10, 11
z_filter_k_tail:  db  0, 64,  0, 64,  8, 56,  0, 64
z2_h_shuf:        db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
z2_upsample:      db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
z2_dy_offset:     dw 88*64, 88*64, 87*64, 87*64
pw_m1to4:         dw -1, -2, -3, -4
z_filter_k:       times  4 db  0, 16
                  times  4 db  0, 20
                  times  4 db  8, 16
                  times  4 db 32, 16
                  times  4 db 24, 20
                  times  4 db 16, 16
                  times  4 db  0,  0
                  times  4 db  0,  0
pw_8:        times  8 db  8,  0
pb_3:        times 16 db  3
pb_16:       times 16 db 16
pw_62:       times  8 dw 62
pw_64:       times  8 dw 64
pw_256:      times  8 dw 256
pw_512:      times  8 dw 512
pw_m256:     times  8 dw -256
pb_2:        times  8 db  2
pb_4:        times  8 db  4
pb_8:        times  8 db  8
pb_128:      times  8 db 128
pb_m16:      times  8 db -16
pw_128:      times  4 dw 128
pw_255:      times  4 dw 255
pb_36_m4:    times  4 db 36, -4
pb_127_m127: times  4 db 127, -127

; Build a relative jump table of .label offsets for a function; entries are
; stored as 32-bit displacements from (table - 2*4) so tables for different
; entry-point counts can share a base.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; The splat tables alias into the dc/cfl tables past their h*/w* entries.
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)

JMP_TABLE ipred_h,        ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc,       ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left,  ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_smooth,   ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,    ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1,       ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2,       ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3,       ssse3, h4, h8, h16, h32, h64
JMP_TABLE pal_pred,       ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl,      ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
JMP_TABLE ipred_filter,   ssse3, w4, w8, w16, w32

cextern dr_intra_derivative
cextern filter_intra_taps

SECTION .text

;---------------------------------------------------------------------------------------
;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                        const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Store one left-pixel value broadcast across a whole row of width %1,
; at row offset %2; %3 selects which of the 4 loaded left pixels to splat.
%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8
    pshuflw              m1, m0, %3 ; extend 8 byte for 2 pos
    punpcklqdq           m1, m1
    mova      [dstq + %2], m1
%if %1 > 16
    mova [dstq + 16 + %2], m1
%endif
%if %1 > 32
    mova [dstq + 32 + %2], m1
    mova [dstq + 48 + %2], m1
%endif
%endmacro

; Emit the body of one width case of ipred_h: process 4 rows per iteration,
; reading 4 left pixels (topleft[-4..-1]) and broadcasting each across a row.
%macro IPRED_H 1 ; width
    sub                 tlq, 4
    movd                 m0, [tlq] ; get 4 bytes of topleft data
    punpcklbw            m0, m0    ; extend 2 byte
%if %1 == 4
    pshuflw              m1, m0, q2233
    movd   [dstq+strideq*0], m1
    psrlq                m1, 32
    movd   [dstq+strideq*1], m1
    pshuflw              m0, m0, q0011
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+stride3q ], m0
%elif %1 == 8
    punpcklwd            m0, m0
    punpckhdq            m1, m0, m0
    punpckldq            m0, m0
    movq   [dstq+strideq*1], m1
    movhps [dstq+strideq*0], m1
    movq   [dstq+stride3q ], m0
    movhps [dstq+strideq*2], m0
%else
    IPRED_SET %1, 0,         q3333
    IPRED_SET %1, strideq,   q2222
    IPRED_SET %1, strideq*2, q1111
    IPRED_SET %1, stride3q,  q0000
%endif
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w%1
    RET
%endmacro

INIT_XMM ssse3
cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_h_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    IPRED_H 4
.w8:
    IPRED_H 8
.w16:
    IPRED_H 16
.w32:
    IPRED_H 32
.w64:
    IPRED_H 64

;---------------------------------------------------------------------------------------
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                        const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Loads up to 64 top pixels into m0-m3, then tail-jumps into the dc splat
; store loops (.s4/.s8/...), which replicate m0-m3 down every row.
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_ssse3_table
    tzcnt                wd, wm
    movu                 m0, [tlq+ 1]
    movu                 m1, [tlq+17]
    movu                 m2, [tlq+33]
    movu                 m3, [tlq+49]
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                         const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Two-stage dispatch: first jump to .h4/.h8/... to sum the left edge (bytes
; summed via pmaddubsw with m3 = all -1, so sums are negated), then jump to
; .w4/.w8/... to add the top edge, average, and fall into the .s* store loop.
; m4 = (width + height) >> 1 (rounding bias); m5 = ctz(width + height).
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    movifnidn            wd, wm
    tzcnt               r6d, hd
    lea                 r5d, [wq+hq]
    movd                 m4, r5d
    tzcnt               r5d, r5d
    movd                 m5, r5d
    LEA                  r5, ipred_dc_ssse3_table
    tzcnt                wd, wd
    movsxd               r6, [r5+r6*4]
    movsxd               wq, [r5+wq*4+20]
    pcmpeqd              m3, m3
    psrlw                m4, 1 ; dc = (width + height) >> 1;
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  r6
.h4:
    movd                 m0, [tlq-4]
    pmaddubsw            m0, m3
    jmp                  wq
.w4:
    movd                 m1, [tlq+1]
    pmaddubsw            m1, m3
    psubw                m0, m4
    paddw                m0, m1
    pmaddwd              m0, m3
    cmp                  hd, 4
    jg .w4_mul
    psrlw                m0, 3 ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul:
    ; Non-square blocks: dc = sum * (0x5556 or 0x3334) >> 16,
    ; i.e. fixed-point multiply by 2/3 or 2/5.
    punpckhqdq           m1, m0, m0
    paddw                m0, m1
    psrlq                m1, m0, 32
    paddw                m0, m1
    psrlw                m0, 2
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 8
    cmovz               r6d, r2d
    movd                 m5, r6d
    pmulhuw              m0, m5
.w4_end:
    pxor                 m1, m1
    pshufb               m0, m1 ; broadcast dc byte to all lanes
.s4:
    movd   [dstq+strideq*0], m0
    movd   [dstq+strideq*1], m0
    movd   [dstq+strideq*2], m0
    movd   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    movq                 m0, [tlq-8]
    pmaddubsw            m0, m3
    jmp                  wq
.w8:
    movq                 m1, [tlq+1]
    pmaddubsw            m1, m3
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    paddw                m0, m1
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w8_end:
    pxor                 m1, m1
    pshufb               m0, m1
.s8:
    movq   [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m0
    movq   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-16]
    pmaddubsw            m0, m3
    jmp                  wq
.w16:
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w16_end:
    pxor                 m1, m1
    pshufb               m0, m1
.s16:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-32]
    pmaddubsw            m0, m3
    mova                 m2, [tlq-16]
    pmaddubsw            m2, m3
    paddw                m0, m2
    jmp                  wq
.w32:
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    movu                 m2, [tlq+17]
    pmaddubsw            m2, m3
    paddw                m1, m2
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 32
    je .w32_end
    ; fix: removed dead "lea r2d, [hq*2]" — r2d was unconditionally
    ; overwritten by the 0x3334 load below before being read (leftover
    ; from the shrx-based variant of this sequence).
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 64|16
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w32_end:
    pxor                 m1, m1
    pshufb               m0, m1
    mova                 m1, m0
.s32:
    mova             [dstq], m0
    mova          [dstq+16], m1
    mova     [dstq+strideq], m0
    mova  [dstq+strideq+16], m1
    mova   [dstq+strideq*2], m0
    mova [dstq+strideq*2+16], m1
    mova    [dstq+stride3q], m0
    mova [dstq+stride3q+16], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                 m0, [tlq-64]
    mova                 m1, [tlq-48]
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    paddw                m0, m1
    mova                 m1, [tlq-32]
    pmaddubsw            m1, m3
    paddw                m0, m1
    mova                 m1, [tlq-16]
    pmaddubsw            m1, m3
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+ 1]
    movu                 m2, [tlq+17]
    pmaddubsw            m1, m3
    pmaddubsw            m2, m3
    paddw                m1, m2
    movu                 m2, [tlq+33]
    pmaddubsw            m2, m3
    paddw                m1, m2
    movu                 m2, [tlq+49]
    pmaddubsw            m2, m3
    paddw                m1, m2
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 64
    je .w64_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w64_end:
    pxor                 m1, m1
    pshufb               m0, m1
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
.s64:
    mova             [dstq], m0
    mova          [dstq+16], m1
    mova          [dstq+32], m2
    mova          [dstq+48], m3
    mova     [dstq+strideq], m0
    mova  [dstq+strideq+16], m1
    mova  [dstq+strideq+32], m2
    mova  [dstq+strideq+48], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s64
    RET

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                              const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; dc from the left edge only: sum h left pixels, round via pmulhrsw with
; m3 = 32768 >> ctz(h), then reuse the dc splat store loops for the width.
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_left_ssse3_table
    mov                  hd, hm ; zero upper half
    tzcnt               r6d, hd
    sub                 tlq, hq
    tzcnt                wd, wm
    movu                 m0, [tlq]
    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd                 m2, r6d
    psrld                m3, m2
    movsxd               r6, [r5+r6*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2
    add                  r6, r5
    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6
.h64:
    movu                 m1, [tlq+48] ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h32:
    movu                 m1, [tlq+16] ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h16:
    pshufd               m1, m0, q3232 ; psrlq m1, m0, 16
    paddw                m0, m1
.h8:
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
.h4:
    pmaddwd              m0, m2
    pmulhrsw             m0, m3
    lea            stride3q, [strideq*3]
    pxor                 m1, m1
    pshufb               m0, m1
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                             const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Fill the block with the constant 128 (mid-grey for 8bpc) by splatting pb_128
; into m0-m3 and tail-jumping into the dc splat store loops.
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    movddup              m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                             const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; dc from the top edge only: sums w top pixels, then reuses ipred_dc_left's
; .h* accumulation tail (hence the "unaligned when jumping here from dc_top"
; notes there) and the dc splat store loops.
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
    LEA                  r5, ipred_dc_left_ssse3_table
    tzcnt                wd, wm
    inc                 tlq
    movu                 m0, [tlq]
    movifnidn            hd, hm
    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
    movd                 m2, wd
    psrld                m3, m2 ; m3 = 32768 >> ctz(w), rounding factor for pmulhrsw
    movsxd               r6, [r5+wq*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2 ; sum pairs of top bytes (negated, m2 = all -1)
    add                  r6, r5
    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6

;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                               const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Blend of (top, bottom) pixel pairs in m%1/m%2 with (w-128, 127-w) weight
; pairs in m%3/m%4, plus precomputed bias terms in m%5/m%6; result in m6.
%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
    ;            w * a         = (w - 128) * a + 128 * a
    ;            (256 - w) * b = (127 - w) * b + 129 * b
    ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
    pmaddubsw            m6, m%3, m%1
    pmaddubsw            m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b
    paddw                m6, m%5
    paddw                m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
    psrlw                m6, 8
    psrlw                m0, 8
    packuswb             m6, m0
%endmacro

; Vertical smooth prediction: each row is a weighted blend of the top row and
; the bottom-left pixel, with per-row weights from smooth_weights[h].
; hq is negated and counts up to 0 in the loops.
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
    LEA                  r6, ipred_smooth_v_ssse3_table
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    lea            weightsq, [base+smooth_weights+hq*4]
    neg                  hq
    movd                 m5, [tlq+hq] ; bottom-left pixel
    pxor                 m2, m2
    pshufb               m5, m2 ; broadcast bottom
    add                  wq, r6
    jmp                  wq
.w4:
    movd                 m2, [tlq+1]
    punpckldq            m2, m2
    punpcklbw            m2, m5 ; top, bottom
    lea                  r3, [strideq*3]
    mova                 m4, [base+ipred_v_shuf]
    mova                 m5, m4
    punpckldq            m4, m4
    punpckhdq            m5, m5
    pmaddubsw            m3, m2, m0 ; m3: 127 * top - 127 * bottom
    paddw                m1, m2     ; m1: 1 * top + 256 * bottom + 128, overflow is ok
    paddw                m3, m1     ; m3: 128 * top + 129 * bottom + 128
.w4_loop:
    movu                 m1, [weightsq+hq*2]
    pshufb               m0, m1, m4 ; m2, m3, m4 and m5 should be stable in loop
    pshufb               m1, m5
    SMOOTH 0, 1, 2, 2, 3, 3
    movd   [dstq+strideq*0], m6
    pshuflw              m1, m6, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m6, m6
    movd   [dstq+strideq*2], m6
    psrlq                m6, 32
    movd   [dstq+r3       ], m6
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w4_loop
    RET
ALIGN function_align
.w8:
    movq                 m2, [tlq+1]
    punpcklbw            m2, m5 ; top, bottom
    mova                 m5, [base+ipred_v_shuf]
    lea                  r3, [strideq*3]
    pshufd               m4, m5, q0000
    pshufd               m5, m5, q1111
    pmaddubsw            m3, m2, m0
    paddw                m1, m2
    paddw                m3, m1 ; m3 is output for loop
.w8_loop:
    movq                 m1, [weightsq+hq*2]
    pshufb               m0, m1, m4
    pshufb               m1, m5
    SMOOTH 0, 1, 2, 2, 3, 3
    movq   [dstq+strideq*0], m6
    movhps [dstq+strideq*1], m6
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m5 ; top, bottom (low half)
    punpckhbw            m3, m5     ; top, bottom (high half)
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1 ; m4 and m5 is output for loop
.w16_loop:
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1 ; broadcast this row's weight pair
    SMOOTH 1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                dstq, strideq
    add                  hq, 1
    jl .w16_loop
    RET
ALIGN function_align
.w32:
    ; Wider blocks recompute the per-16-column setup each iteration since
    ; there aren't enough registers to keep all column state live.
    WIN64_PUSH_XMM 8, 7
    mova                 m7, m5 ; keep broadcast bottom pixel in m7
.w32_loop_init:
    mov                 r3d, 2
.w32_loop:
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    SMOOTH 1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                 tlq, 16
    add                dstq, 16
    dec                 r3d
    jg .w32_loop
    lea                dstq, [dstq-32+strideq]
    sub                 tlq, 32
    add                  hq, 1
    jl .w32_loop_init
    RET
ALIGN function_align
.w64:
    WIN64_PUSH_XMM 8, 7
    mova                 m7, m5
.w64_loop_init:
    mov                 r3d, 4
.w64_loop:
    movddup              m0, [base+pb_127_m127]
    movddup              m1, [base+pw_128]
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
    movd                 m1, [weightsq+hq*2]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    SMOOTH 1, 1, 2, 3, 4, 5
    mova             [dstq], m6
    add                 tlq, 16
    add                dstq, 16
    dec                 r3d
    jg .w64_loop
    lea                dstq, [dstq-64+strideq]
    sub                 tlq, 64
    add                  hq, 1
    jl .w64_loop_init
    RET

;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                               const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Horizontal smooth prediction: each column is a weighted blend of the left
; pixel of that row and the top-right pixel, with per-column weights from
; smooth_weights[w]. m3 = broadcast right pixel, m4/m5 = blend constants.
cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
    LEA                  r6, ipred_smooth_h_ssse3_table
    mov                  wd, wm
    movd                 m3, [tlq+wq]
    pxor                 m1, m1
    pshufb               m3, m1 ; right
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m4, [base+pb_127_m127]
    movddup              m5, [base+pw_128]
    add                  wq, r6
    jmp                  wq
.w4:
    movddup              m6, [base+smooth_weights+4*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    sub                 tlq, hq ; tlq points below the left edge; [tlq+hq] walks it upward
    lea                  r3, [strideq*3]
.w4_loop:
    movd                 m2, [tlq+hq] ; left
    pshufb               m2, m7
    punpcklbw            m1, m2, m3 ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m0, m1 ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r3       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    mova                 m6, [base+smooth_weights+8*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    sub                 tlq, hq
    punpckldq            m7, m7 ; 2 rows per iteration -> duplicate row selectors
.w8_loop:
    movd                 m2, [tlq+hq] ; left
    pshufb               m2, m7
    punpcklbw            m1, m2, m3 ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m0, m1 ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    mova                 m6, [base+smooth_weights+16*2]
    mova                 m7, [base+smooth_weights+16*3]
    sub                 tlq, 1
    sub                 tlq, hq
.w16_loop:
    pxor                 m1, m1
    movd                 m2, [tlq+hq] ; left
    pshufb               m2, m1 ; broadcast this row's left pixel
    punpcklbw            m1, m2, m3 ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m0, m1 ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m7
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    mova             [dstq], m0
    lea                dstq, [dstq+strideq]
    sub                  hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    sub                 tlq, 1
    sub                 tlq, hq
    pxor                 m6, m6
.w32_loop_init:
    mov                  r5, 2
    lea                  r3, [base+smooth_weights+16*4] ; r3 walks the per-column weights
.w32_loop:
    mova                 m7, [r3]
    add                  r3, 16
    movd                 m2, [tlq+hq] ; left
    pshufb               m2, m6
    punpcklbw            m1, m2, m3 ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m0, m1 ; 128 * left + 129 * right
    pmaddubsw            m1, m7
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    mova                 m7, [r3]
    add                  r3, 16
    pmaddubsw            m2, m7
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, 16
    dec                  r5
    jg .w32_loop
    lea                dstq, [dstq-32+strideq]
    sub                  hd, 1
    jg .w32_loop_init
    RET
ALIGN function_align
.w64:
    sub                 tlq, 1
    sub                 tlq, hq
    pxor                 m6, m6
.w64_loop_init:
    mov                  r5, 4
    lea                  r3, [base+smooth_weights+16*8]
.w64_loop:
    mova                 m7, [r3]
    add                  r3, 16
    movd                 m2, [tlq+hq] ; left
    pshufb               m2, m6
    punpcklbw            m1, m2, m3 ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m0, m1 ; 128 * left + 129 * right
    pmaddubsw            m1, m7
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    mova                 m7, [r3]
    add                  r3, 16
    pmaddubsw            m2, m7
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, 16
    dec                  r5
    jg .w64_loop
    lea                dstq, [dstq-64+strideq]
    sub                  hd, 1
    jg .w64_loop_init
    RET
;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                             const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Final combine for 2-D smooth: averages the vertical blend (src/mul pair 1)
; and the horizontal blend (src/mul pair 2) with pavgw against m2/m3, then
; packs to bytes in m0. %5/%6/%7 may be register numbers or memory operands.
%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3
    pmaddubsw            m6, m%3, m%1
    mova                 m0, m6
    pmaddubsw            m6, m%4, m%2
    mova                 m1, m6
%ifnum %5
    paddw                m0, m%5
%else
    paddw                m0, %5
%endif
%ifnum %6
    paddw                m1, m%6
%else
    paddw                m1, %6
%endif
%ifnum %7
%else
    mova                 m3, %7
%endif
    pavgw                m0, m2
    pavgw                m1, m3
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
%endmacro

; Produce one 16-pixel-wide output row for the w32/w64 cases. Stack slots
; %2-%5 hold the per-column vertical terms, %6/%7 are the horizontal weight
; tables for this column group, and %10-%12 restore clobbered constants
; (bottom, right, pb_127_m127) afterwards.
%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
    mova                 m1, [rsp+16*%1] ; top
    punpckhbw            m6, m1, m0 ; top, bottom
    punpcklbw            m1, m0 ; top, bottom
    pmaddubsw            m2, m1, m5
    mova        [rsp+16*%2], m1
    paddw                m1, m3 ; 1 * top + 255 * bottom + 255
    paddw                m2, m1 ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%3], m2
    pmaddubsw            m2, m6, m5
    mova        [rsp+16*%4], m6
    paddw                m6, m3 ; 1 * top + 255 * bottom + 255
    paddw                m2, m6 ; 128 * top + 129 * bottom + 255
    mova        [rsp+16*%5], m2
    movd                 m1, [tlq+hq] ; left
    pshufb               m1, [base+pb_3] ; topleft[-(1 + y)]
    punpcklbw            m1, m4 ; left, right
    pmaddubsw            m2, m1, m5 ; 127 * left - 127 * right
    paddw                m2, m1 ; 128 * left + 129 * right
    mova                 m3, m2
    pmaddubsw            m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, %7
    paddw                m2, m3, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
    mova                 m7, [rsp+16*%9]
    pshufb               m1, m7
    mova        [rsp+16*%8], m3
    mova                 m4, [rsp+16*%2]
    mova                 m5, [rsp+16*%3]
    mova                 m3, [rsp+16*%4]
    mova                 m7, [rsp+16*%5]
    SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8]
    mova             [dstq], m0
    movddup              m3, [base+pw_255]  ; recovery
    mova                 m0, [rsp+16*%10]   ; recovery
    mova                 m4, [rsp+16*%11]   ; recovery
    mova                 m5, [rsp+16*%12]   ; recovery
%endmacro

; 2-D smooth prediction: average of vertical (top/bottom) and horizontal
; (left/right) blends. Register pressure forces heavy spilling to the 13*16
; byte stack buffer; the [rsp+16*N] slot assignments differ per width case.
cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
    mov                  wd, wm
    mov                  hd, hm
    LEA                  r6, ipred_smooth_ssse3_table
    movd                 m4, [tlq+wq] ; right
    pxor                 m2, m2
    pshufb               m4, m2
    tzcnt                wd, wd
    mov                  r5, tlq
    sub                  r5, hq
    movsxd               wq, [r6+wq*4]
    movddup              m5, [base+pb_127_m127]
    movd                 m0, [r5]
    pshufb               m0, m2 ; bottom
    movddup              m3, [base+pw_255]
    add                  wq, r6
    lea           v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
    jmp                  wq
.w4:
    mova                 m7, [base+ipred_v_shuf]
    movd                 m1, [tlq+1] ; top row (comment previously said "left")
    pshufd               m1, m1, q0000
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
    punpcklbw            m1, m0 ; top, bottom
    pshufd               m6, m7, q1100
    pshufd               m7, m7, q3322
    pmaddubsw            m2, m1, m5
    paddw                m3, m1 ; 1 * top + 255 * bottom + 255
    paddw                m2, m3 ; 128 * top + 129 * bottom + 255
    mova          [rsp+16*0], m1
    mova          [rsp+16*1], m2
    movq                 m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width];
    punpcklqdq           m1, m1
    mova          [rsp+16*2], m1
    mova          [rsp+16*3], m4
    mova          [rsp+16*4], m6
    mova          [rsp+16*5], m5
.w4_loop:
    movd                 m1, [tlq+hq] ; left
    pshufb               m1, [base+ipred_h_shuf]
    punpcklbw            m0, m1, m4 ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5 ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0 ; 128 * left + 129 * right
    paddw                m3, m1
    mova                 m4, [rsp+16*2]
    pmaddubsw            m0, m4
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movq                 m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
    add           v_weightsq, 8
    pshufb               m0, m1, m6
    pshufb               m1, m7
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r3       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    mova                 m7, [base+ipred_v_shuf]
    movq                 m1, [tlq+1] ; top row (comment previously said "left")
    punpcklqdq           m1, m1
    sub                 tlq, 4
    sub                 tlq, hq
    punpcklbw            m1, m0
    pshufd               m6, m7, q0000
    pshufd               m7, m7, q1111
    pmaddubsw            m2, m1, m5
    paddw                m3, m1
    paddw                m2, m3
    mova          [rsp+16*0], m1
    mova          [rsp+16*1], m2
    mova                 m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
    mova          [rsp+16*2], m1
    mova          [rsp+16*3], m4
    mova          [rsp+16*4], m6
    mova          [rsp+16*5], m5
.w8_loop:
    movd                 m1, [tlq+hq] ; left
    pshufb               m1, [base+ipred_h_shuf]
    pshufd               m1, m1, q1100
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5
    pmaddubsw            m3, m1, m5
    paddw                m2, m0
    paddw                m3, m1
    mova                 m4, [rsp+16*2]
    pmaddubsw            m0, m4
    pmaddubsw            m1, m4
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
    add           v_weightsq, 4
    pshufb               m0, m1, m6
    pshufb               m1, m7
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
    mova                 m4, [rsp+16*3]
    mova                 m6, [rsp+16*4]
    mova                 m5, [rsp+16*5]
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    mova                 m7, [base+ipred_v_shuf]
    movu                 m1, [tlq+1] ; top row (comment previously said "left")
    sub                 tlq, 4
    sub                 tlq, hq
    punpckhbw            m6, m1, m0 ; top, bottom
    punpcklbw            m1, m0 ; top, bottom
    pshufd               m7, m7, q0000
    mova          [rsp+16*2], m7
    pmaddubsw            m2, m6, m5
    mova          [rsp+16*5], m6
    paddw                m6, m3 ; 1 * top + 255 * bottom + 255
    paddw                m2, m6 ; 128 * top + 129 * bottom + 255
    mova          [rsp+16*6], m2
    pmaddubsw            m2, m1, m5
    paddw                m3, m1 ; 1 * top + 255 * bottom + 255
    mova          [rsp+16*0], m1
    paddw                m2, m3 ; 128 * top + 129 * bottom + 255
    mova          [rsp+16*1], m2
    mova          [rsp+16*3], m4
    mova          [rsp+16*4], m5
.w16_loop:
    movd                 m1, [tlq+hq] ; left
    pshufb               m1, [base+pb_3] ; topleft[-(1 + y)]
    punpcklbw            m1, m4 ; left, right
    pmaddubsw            m2, m1, m5 ; 127 * left - 127 * right
    paddw                m2, m1 ; 128 * left + 129 * right
    mova                 m0, m1
    mova                 m3, m2
    pmaddubsw            m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width];
    pmaddubsw            m1, [base+smooth_weights+16*3]
    paddw                m2, m0
    paddw                m3, m1
    movd                 m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
    add           v_weightsq, 2
    mova                 m7, [rsp+16*2]
    pshufb               m1, m7
    mova          [rsp+16*7], m3
    mova                 m4, [rsp+16*0]
    mova                 m5, [rsp+16*1]
    mova                 m3, [rsp+16*5]
    mova                 m7, [rsp+16*6]
    SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
    mova                 m4, [rsp+16*3]
    mova                 m5, [rsp+16*4]
    mova             [dstq], m0
    lea                dstq, [dstq+strideq]
    sub                  hd, 1
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    movu                 m1, [tlq+1]  ; top topleft[1 + x]
    movu                 m2, [tlq+17] ; top
    mova          [rsp+16*0], m1
    mova          [rsp+16*1], m2
    sub                 tlq, 4
    sub                 tlq, hq
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova          [rsp+16*2], m7
    mova          [rsp+16*3], m0
    mova          [rsp+16*4], m4
    mova          [rsp+16*5], m5
.w32_loop:
    SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
    lea                dstq, [dstq-16+strideq]
    add           v_weightsq, 2
    sub                  hd, 1
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    movu                 m1, [tlq+1]  ; top topleft[1 + x]
    movu                 m2, [tlq+17] ; top
    mova          [rsp+16*0], m1
    mova          [rsp+16*1], m2
    movu                 m1, [tlq+33] ; top
    movu                 m2, [tlq+49] ; top
    mova         [rsp+16*11], m1
    mova         [rsp+16*12], m2
    sub                 tlq, 4
    sub                 tlq, hq
    mova                 m7, [base+ipred_v_shuf]
    pshufd               m7, m7, q0000
    mova          [rsp+16*2], m7
    mova          [rsp+16*3], m0
    mova          [rsp+16*4], m4
    mova          [rsp+16*5], m5
.w64_loop:
    SMOOTH_OUTPUT_16B  0, 6, 7, 8, 9, [base+smooth_weights+16*8],  [base+smooth_weights+16*9],  10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B  1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
    add                dstq, 16
    SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
    lea                dstq, [dstq-48+strideq]
    add           v_weightsq, 2
    sub                  hd, 1
    jg .w64_loop
    RET

; Directional (zone 1) prediction, angles < 90 degrees: pixels are
; interpolated from the top edge along the prediction direction (dx is the
; per-row x increment in 1/64 pel units from dr_intra_derivative).
; On x86-32, m8-m10 and the stride are kept in stack/memory instead.
%if ARCH_X86_64
cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
    %define base r7-$$
    lea                  r7, [$$]
    mova                 m8, [base+pw_62]
    mova                 m9, [base+pw_64]
    mova                m10, [base+pw_512]
%else
cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
    %define base r1-$$
    %define m8 [base+pw_62]
    %define m9 [base+pw_64]
    %define m10 [base+pw_512]
    %define strideq r3
    %define stridemp dword [rsp+16*12]
    mov stridemp, r1
    LEA                  r1, $$
%endif
    tzcnt                wd, wm
    movifnidn        angled, anglem
    movifnidn            hd, hm
    inc                 tlq
    movsxd               wq, [base+ipred_z1_ssse3_table+wq*4]
    mov                 dxd, angled
    and                 dxd, 0x7e
    add              angled, 165 ; ~90
    lea                  wq, [base+wq+ipred_z1_ssse3_table]
    movzx               dxd, word [base+dr_intra_derivative+dxq]
    xor              angled, 0x4ff ; d = 90 - angle
    jmp                  wq
.w4:
    lea                 r3d, [angleq+88]
    test                r3d, 0x480
    jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
    sar                 r3d, 9
    add                 r3d, hd
    cmp                 r3d, 8
    jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
    ; Edge upsampling: 2x interpolate the top edge with a (-4,36,36,-4)/64
    ; 4-tap filter, interleave with the original pixels, and halve dx.
    mova                 m1, [tlq-1]
    pshufb               m0, m1, [base+z_upsample1]
    pshufb               m1, [base+z_upsample2]
    movddup              m2, [base+pb_36_m4]
    add                 dxd, dxd
    pmaddubsw            m0, m2
    pshufd               m7, m1, q3333
    movd           [rsp+16], m7 ; top[max_base_x]
    pmaddubsw            m1, m2
    movd                 m6, dxd
    mov                 r5d, dxd ; xpos
    pshufb               m6, [base+pw_256]
    paddw                m1, m0
    movq                 m0, [tlq]
    pmulhrsw             m1, m10
    paddw                m7, m6, m6
    punpcklqdq           m6, m7 ; xpos0 xpos1
    packuswb             m1, m1
    punpcklbw            m0, m1
    movifnidn       strideq, stridemp
    mova              [rsp], m0
.w4_upsample_loop:
    lea                 r2d, [r5+dxq]
    shr                 r5d, 6 ; base0
    movq                 m0, [rsp+r5]
    lea                 r5d, [r2+dxq]
    shr                 r2d, 6 ; base1
    movhps               m0, [rsp+r2]
    pand                 m2, m8, m6 ; frac
    psubw                m1, m9, m2 ; 64-frac
    psllw                m2, 8
    por                  m1, m2 ; 64-frac, frac
    pmaddubsw            m0, m1
    paddw                m6, m7 ; xpos += dx
    pmulhrsw             m0, m10
    packuswb             m0, m0
    movd   [dstq+strideq*0], m0
    pshuflw              m0, m0, q1032
    movd   [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w4_upsample_loop
    RET
.w4_no_upsample:
    mov                 r3d, 7 ; max_base
    test             angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    ; Decide edge-filter strength from block size and angle via the
    ; z_filter_wh4 / z_filter_t_w48 threshold tables.
    lea                 r3d, [hq+3]
    movd                 m0, r3d
    movd                 m2, angled
    shr              angled, 8 ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    pcmpeqb              m1, m0, [base+z_filter_wh4]
    pand                 m1, m2
    pcmpgtb              m1, [base+z_filter_t_w48+angleq*8]
    pmovmskb            r5d, m1
    mov                 r3d, 7
    test                r5d, r5d
    jz .w4_main ; filter_strength == 0
    ; Apply a 5-tap smoothing filter (kernel selected by filter_strength)
    ; to the top/topleft edge before projection.
    mova                 m3, [tlq-1]
    imul                r5d, 0x55555555
    movu                 m7, [base+z_filter_s+8]
    shr                 r5d, 30 ; filter_strength
    movddup              m0, [base+pb_8]
    pminub               m7, m0
    pshufb               m0, m3, [base+z_filter_s]
    movddup              m4, [base+z_filter_k-8+r5*8+24*0]
    pshufb               m3, m7
    movddup              m5, [base+z_filter_k-8+r5*8+24*1]
    shufps               m2, m0, m3, q2121
    movddup              m6, [base+z_filter_k-8+r5*8+24*2]
pmaddubsw m0, m4 1341 pmaddubsw m1, m2, m4 1342 pmaddubsw m2, m5 1343 paddd m5, m6 1344 pmaddubsw m4, m3, m5 1345 pmaddubsw m3, m6 1346 paddw m0, m2 1347 paddw m1, m4 1348 paddw m0, m3 1349 pshufd m1, m1, q3333 1350 pmulhrsw m0, m10 1351 pmulhrsw m1, m10 1352 mov r5d, 9 1353 mov tlq, rsp 1354 cmp hd, 4 1355 cmovne r3d, r5d 1356 packuswb m0, m1 1357 mova [tlq], m0 1358.w4_main: 1359 add tlq, r3 1360 movd m5, dxd 1361 movddup m0, [base+z_base_inc] ; base_inc << 6 1362 movd m7, [tlq] ; top[max_base_x] 1363 shl r3d, 6 1364 movd m4, r3d 1365 pshufb m5, [base+pw_256] 1366 mov r5d, dxd ; xpos 1367 pshufb m7, [base+pw_m256] 1368 sub r5, r3 1369 pshufb m4, [base+pw_256] 1370 mova m3, [base+z1_shuf_w4] 1371 paddw m6, m5, m5 1372 psubw m4, m0 ; max_base_x 1373 punpcklqdq m5, m6 ; xpos0 xpos1 1374.w4_loop: 1375 lea r3, [r5+dxq] 1376 sar r5, 6 ; base0 1377 movq m0, [tlq+r5] 1378 lea r5, [r3+dxq] 1379 sar r3, 6 ; base1 1380 movhps m0, [tlq+r3] 1381 pand m2, m8, m5 ; frac 1382 psubw m1, m9, m2 ; 64-frac 1383 psllw m2, 8 1384 pshufb m0, m3 1385 por m1, m2 ; 64-frac, frac 1386 pmaddubsw m0, m1 1387 movifnidn strideq, stridemp 1388 pcmpgtw m1, m4, m5 ; base < max_base_x 1389 pmulhrsw m0, m10 1390 paddw m5, m6 ; xpos += dx 1391 pand m0, m1 1392 pandn m1, m7 1393 por m0, m1 1394 packuswb m0, m0 1395 movd [dstq+strideq*0], m0 1396 pshuflw m0, m0, q1032 1397 movd [dstq+strideq*1], m0 1398 sub hd, 2 1399 jz .w4_end 1400 lea dstq, [dstq+strideq*2] 1401 test r5d, r5d 1402 jl .w4_loop 1403 packuswb m7, m7 1404.w4_end_loop: 1405 movd [dstq+strideq*0], m7 1406 movd [dstq+strideq*1], m7 1407 lea dstq, [dstq+strideq*2] 1408 sub hd, 2 1409 jg .w4_end_loop 1410.w4_end: 1411 RET 1412.w8: 1413 lea r3d, [angleq+88] 1414 and r3d, ~0x7f 1415 or r3d, hd 1416 cmp r3d, 8 1417 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 1418 mova m5, [base+z_upsample1] 1419 movu m3, [base+z_filter_s+6] 1420 movd m4, hd 1421 mova m0, [tlq-1] 1422 movu m1, [tlq+7] 1423 pxor m7, m7 1424 pshufb 
m4, m7 1425 movddup m7, [base+pb_36_m4] 1426 pminub m4, m3 1427 add dxd, dxd 1428 pshufb m2, m0, m5 1429 pmaddubsw m2, m7 1430 pshufb m0, m3 1431 pmaddubsw m0, m7 1432 movd m6, dxd 1433 pshufb m3, m1, m5 1434 pmaddubsw m3, m7 1435 pshufb m1, m4 1436 pmaddubsw m1, m7 1437 pshufb m6, [base+pw_256] 1438 mov r5d, dxd 1439 paddw m2, m0 1440 paddw m7, m6, m6 1441 paddw m3, m1 1442 punpcklqdq m6, m7 ; xpos0 xpos1 1443 movu m1, [tlq] 1444 pmulhrsw m2, m10 1445 pmulhrsw m3, m10 1446 packuswb m2, m3 1447 punpcklbw m0, m1, m2 1448 punpckhbw m1, m2 1449 movifnidn strideq, stridemp 1450 mova [rsp+16*0], m0 1451 mova [rsp+16*1], m1 1452.w8_upsample_loop: 1453 lea r2d, [r5+dxq] 1454 shr r5d, 6 ; base0 1455 movu m0, [rsp+r5] 1456 lea r5d, [r2+dxq] 1457 shr r2d, 6 ; base1 1458 movu m1, [rsp+r2] 1459 pand m2, m8, m6 1460 psubw m3, m9, m2 1461 psllw m2, 8 1462 por m3, m2 1463 punpcklqdq m2, m3, m3 ; frac0 1464 pmaddubsw m0, m2 1465 punpckhqdq m3, m3 ; frac1 1466 pmaddubsw m1, m3 1467 paddw m6, m7 1468 pmulhrsw m0, m10 1469 pmulhrsw m1, m10 1470 packuswb m0, m1 1471 movq [dstq+strideq*0], m0 1472 movhps [dstq+strideq*1], m0 1473 lea dstq, [dstq+strideq*2] 1474 sub hd, 2 1475 jg .w8_upsample_loop 1476 RET 1477.w8_no_upsample: 1478 lea r3d, [hq+7] 1479 movd m0, r3d 1480 and r3d, 7 1481 or r3d, 8 ; imin(h+7, 15) 1482 test angled, 0x400 1483 jnz .w8_main 1484 movd m2, angled 1485 shr angled, 8 ; is_sm << 1 1486 pxor m1, m1 1487 pshufb m0, m1 1488 pshufb m2, m1 1489 movu m1, [base+z_filter_wh8] 1490 psrldq m3, [base+z_filter_t_w48+angleq*8], 4 1491 pcmpeqb m1, m0 1492 pand m1, m2 1493 pcmpgtb m1, m3 1494 pmovmskb r5d, m1 1495 test r5d, r5d 1496 jz .w8_main ; filter_strength == 0 1497 movd m3, [tlq-1] 1498 movu m0, [tlq+16*0] 1499 imul r5d, 0x55555555 1500 movu m1, [tlq+16*1] 1501 shr r5d, 30 ; filter_strength 1502 movd m2, [tlq+r3] 1503 lea tlq, [rsp+16*4] 1504 sub r5, 3 1505 mova [tlq-16*1], m0 1506 pxor m7, m7 1507 mova [tlq+16*0], m1 1508 pshufb m3, m7 1509 pshufb m2, m7 1510 mova 
[tlq-16*2], m3 1511 movq [tlq+r3-15], m2 1512 call .filter_edge 1513 sar r5d, 1 1514 add r5d, 17 1515 cmp hd, 8 1516 cmova r3d, r5d 1517.w8_main: 1518 add tlq, r3 1519 movd m5, dxd 1520 movd m7, [tlq] 1521 shl r3d, 6 1522 movu m3, [base+z_filter_s+2] 1523 movd m4, r3d 1524 pshufb m5, [base+pw_256] 1525 mov r5d, dxd 1526 pshufb m7, [base+pw_m256] 1527 sub r5, r3 1528 pshufb m4, [base+pw_256] 1529 psubw m4, [base+z_base_inc] 1530 mova m6, m5 1531.w8_loop: 1532 mov r3, r5 1533 sar r3, 6 1534 movu m0, [tlq+r3] 1535 pand m1, m8, m5 1536 psubw m2, m9, m1 1537 psllw m1, 8 1538 pshufb m0, m3 1539 por m1, m2 1540 pmaddubsw m0, m1 1541 pcmpgtw m1, m4, m5 1542 paddw m5, m6 1543 pmulhrsw m0, m10 1544 pand m0, m1 1545 pandn m1, m7 1546 por m0, m1 1547 packuswb m0, m0 1548 movq [dstq], m0 1549 dec hd 1550 jz .w8_end 1551 movifnidn strideq, stridemp 1552 add dstq, strideq 1553 add r5, dxq 1554 jl .w8_loop 1555 packuswb m7, m7 1556.w8_end_loop: 1557 movq [dstq], m7 1558 add dstq, strideq 1559 dec hd 1560 jg .w8_end_loop 1561.w8_end: 1562 RET 1563.w16: 1564 lea r3d, [hq+15] 1565 movd m0, r3d 1566 and r3d, 15 1567 or r3d, 16 ; imin(h+15, 31) 1568 test angled, 0x400 1569 jnz .w16_main 1570 movd m2, angled 1571 shr angled, 8 ; is_sm << 1 1572 pxor m1, m1 1573 pshufb m0, m1 1574 pshufb m2, m1 1575 movq m3, [base+z_filter_t_w16+angleq*4] 1576 pcmpeqb m0, [base+z_filter_wh16] 1577 pand m0, m2 1578 pcmpgtb m0, m3 1579 pmovmskb r5d, m0 1580 test r5d, r5d 1581 jz .w16_main ; filter_strength == 0 1582 movd m4, [tlq-1] 1583 movu m0, [tlq+16*0] 1584 imul r5d, 0x24924924 1585 movu m1, [tlq+16*1] 1586 shr r5d, 30 1587 movd m2, [tlq+30] 1588 adc r5, -4 ; filter_strength-3 1589 movd m3, [tlq+r3] 1590 lea tlq, [rsp+16*4] 1591 mova [tlq-16*1], m0 1592 pxor m7, m7 1593 mova [tlq+16*0], m1 1594 pshufb m4, m7 1595 movd [rsp], m2 1596 pshufb m3, m7 1597 mova [tlq-16*2], m4 1598 movd [tlq+r3-16], m3 1599 call .filter_edge 1600 cmp hd, 16 1601 jle .w16_main 1602 pshuflw m0, [rsp], q0000 1603 sar r5, 1 
1604 movd m1, [base+z_filter_k_tail+4+r5*4] 1605 lea r3d, [r5+33] 1606 pmaddubsw m0, m1 1607%if ARCH_X86_64 1608 pmulhrsw m0, m10 1609%else 1610 pmulhrsw m0, m4 1611%endif 1612 packuswb m0, m0 1613 movd [tlq+32], m0 1614.w16_main: 1615 add tlq, r3 1616 movd m5, dxd 1617 movd m7, [tlq] 1618 movd m4, r3d 1619 shl r3d, 6 1620 pshufb m5, [base+pw_256] 1621 pxor m6, m6 1622 pshufb m7, m6 1623 mov r5d, dxd 1624 pshufb m4, m6 1625 sub r5, r3 1626 psubb m4, [base+pb_0to15] 1627 mova m6, m5 1628.w16_loop: 1629 mov r3, r5 1630 sar r3, 6 1631 movu m1, [tlq+r3+0] 1632 pand m0, m8, m5 1633 movu m2, [tlq+r3+1] 1634 psubw m3, m9, m0 1635 psllw m0, 8 1636 por m3, m0 1637 punpcklbw m0, m1, m2 1638 pmaddubsw m0, m3 1639 punpckhbw m1, m2 1640 pmaddubsw m1, m3 1641 psrlw m3, m5, 6 1642 packsswb m3, m3 1643 pmulhrsw m0, m10 1644 pmulhrsw m1, m10 1645 paddw m5, m6 1646 pcmpgtb m2, m4, m3 1647 packuswb m0, m1 1648 pand m0, m2 1649 pandn m2, m7 1650 por m0, m2 1651 mova [dstq], m0 1652 dec hd 1653 jz .w16_end 1654 movifnidn strideq, stridemp 1655 add dstq, strideq 1656 add r5, dxq 1657 jl .w16_loop 1658.w16_end_loop: 1659 mova [dstq], m7 1660 add dstq, strideq 1661 dec hd 1662 jg .w16_end_loop 1663.w16_end: 1664 RET 1665.w32: 1666 lea r3d, [hq+31] 1667 and r3d, 31 1668 or r3d, 32 ; imin(h+31, 63) 1669 test angled, 0x400 ; !enable_intra_edge_filter 1670 jnz .w32_main 1671 movd m6, [tlq-1] 1672 movu m0, [tlq+16*0] 1673 movu m1, [tlq+16*1] 1674 movu m2, [tlq+16*2] 1675 movu m3, [tlq+16*3] 1676 movd m4, [tlq+62] 1677 movd m5, [tlq+r3] 1678 lea tlq, [rsp+16*6] 1679 mova [tlq-16*3], m0 1680 pxor m7, m7 1681 mova [tlq-16*2], m1 1682 pshufb m6, m7 1683 mova [tlq-16*1], m2 1684 xor r5d, r5d ; filter_strength = 3 1685 mova [tlq+16*0], m3 1686 movd [rsp], m4 1687 pshufb m5, m7 1688 mova [tlq-16*4], m6 1689 movd [tlq+r3-48], m5 1690 call .filter_edge 1691 sub tlq, 16*2 1692 call .filter_edge 1693 cmp hd, 32 1694 jle .w32_main 1695 pshuflw m0, [rsp], q0000 1696 movd m1, [base+z_filter_k_tail+4] 1697 
add r3d, 2 1698 pmaddubsw m0, m1 1699%if ARCH_X86_64 1700 pmulhrsw m0, m10 1701%else 1702 pmulhrsw m0, m4 1703%endif 1704 packuswb m0, m0 1705 movd [tlq+64], m0 1706.w32_main: 1707 add tlq, r3 1708 movd m0, r3d 1709 movd m7, [tlq] 1710 shl r3d, 6 1711 movd m5, dxd 1712 pxor m6, m6 1713 mov r5d, dxd 1714 pshufb m0, m6 1715 pshufb m5, [base+pw_256] 1716 sub r5, r3 1717 pshufb m7, m6 1718 psubb m0, [base+pb_0to15] 1719 movddup m1, [base+pb_m16] 1720 mova [rsp+16*0], m0 1721 paddb m0, m1 1722 mova [rsp+16*1], m0 1723 mova m6, m5 1724.w32_loop: 1725 mov r3, r5 1726 sar r3, 6 1727 movu m1, [tlq+r3+16*0+0] 1728 pand m0, m8, m5 1729 movu m2, [tlq+r3+16*0+1] 1730 psubw m3, m9, m0 1731 psllw m0, 8 1732 por m3, m0 1733 punpcklbw m0, m1, m2 1734 pmaddubsw m0, m3 1735 punpckhbw m1, m2 1736 pmaddubsw m1, m3 1737 psrlw m4, m5, 6 1738 pmulhrsw m0, m10 1739 pmulhrsw m1, m10 1740 packsswb m4, m4 1741 pcmpgtb m2, [rsp+16*0], m4 1742 packuswb m0, m1 1743 pand m0, m2 1744 pandn m2, m7 1745 por m0, m2 1746 movu m1, [tlq+r3+16*1+0] 1747 movu m2, [tlq+r3+16*1+1] 1748 mova [dstq+16*0], m0 1749 punpcklbw m0, m1, m2 1750 pmaddubsw m0, m3 1751 punpckhbw m1, m2 1752 pmaddubsw m1, m3 1753 paddw m5, m6 1754 pmulhrsw m0, m10 1755 pmulhrsw m1, m10 1756 pcmpgtb m2, [rsp+16*1], m4 1757 packuswb m0, m1 1758 pand m0, m2 1759 pandn m2, m7 1760 por m0, m2 1761 mova [dstq+16*1], m0 1762 dec hd 1763 jz .w32_end 1764 movifnidn strideq, stridemp 1765 add dstq, strideq 1766 add r5, dxq 1767 jl .w32_loop 1768.w32_end_loop: 1769 mova [dstq+16*0], m7 1770 mova [dstq+16*1], m7 1771 add dstq, strideq 1772 dec hd 1773 jg .w32_end_loop 1774.w32_end: 1775 RET 1776.w64: 1777 lea r3d, [hq+63] 1778 test angled, 0x400 ; !enable_intra_edge_filter 1779 jnz .w64_main 1780 movd m4, [tlq-1] 1781 movu m0, [tlq+16*0] 1782 movu m1, [tlq+16*1] 1783 movu m2, [tlq+16*2] 1784 movu m3, [tlq+16*3] 1785 mova [rsp+16*3], m0 1786 pxor m7, m7 1787 mova [rsp+16*4], m1 1788 pshufb m4, m7 1789 mova [rsp+16*5], m2 1790 mova [rsp+16*6], m3 
1791 mova [rsp+16*2], m4 1792 movu m0, [tlq+16*4] 1793 movu m1, [tlq+16*5] 1794 movu m2, [tlq+16*6] 1795 movu m3, [tlq+16*7] 1796 movd m4, [tlq+r3] 1797 lea tlq, [rsp+16*10] 1798 mova [tlq-16*3], m0 1799 xor r5d, r5d ; filter_strength = 3 1800 mova [tlq-16*2], m1 1801 pshufb m4, m7 1802 mova [tlq-16*1], m2 1803 mova [tlq+16*0], m3 1804 movd [tlq+r3-16*7], m4 1805 cmp hd, 64 1806 jl .w64_filter96 ; skip one call if the last 32 bytes aren't used 1807 call .filter_edge 1808.w64_filter96: 1809 sub tlq, 16*2 1810 call .filter_edge 1811 sub tlq, 16*2 1812 call .filter_edge 1813 sub tlq, 16*2 1814 call .filter_edge 1815.w64_main: 1816 add tlq, r3 1817 movd m0, r3d 1818 movd m7, [tlq] 1819 shl r3d, 6 1820 movd m5, dxd 1821 pxor m6, m6 1822 mov r5d, dxd 1823 pshufb m0, m6 1824 sub r5, r3 1825 pshufb m5, [base+pw_256] 1826 pshufb m7, m6 1827 psubb m0, [base+pb_0to15] 1828 movddup m1, [base+pb_m16] 1829 mova [rsp+16*0], m0 1830 paddb m0, m1 1831 mova [rsp+16*1], m0 1832 paddb m0, m1 1833 mova [rsp+16*2], m0 1834 paddb m0, m1 1835 mova [rsp+16*3], m0 1836 mova m6, m5 1837.w64_loop: 1838 mov r3, r5 1839 sar r3, 6 1840 movu m1, [tlq+r3+16*0+0] 1841 pand m0, m8, m5 1842 movu m2, [tlq+r3+16*0+1] 1843 psubw m3, m9, m0 1844 psllw m0, 8 1845 por m3, m0 1846 punpcklbw m0, m1, m2 1847 pmaddubsw m0, m3 1848 punpckhbw m1, m2 1849 pmaddubsw m1, m3 1850 psrlw m4, m5, 6 1851 pmulhrsw m0, m10 1852 pmulhrsw m1, m10 1853 packsswb m4, m4 1854 pcmpgtb m2, [rsp+16*0], m4 1855 packuswb m0, m1 1856 pand m0, m2 1857 pandn m2, m7 1858 por m0, m2 1859 movu m1, [tlq+r3+16*1+0] 1860 movu m2, [tlq+r3+16*1+1] 1861 mova [dstq+16*0], m0 1862 punpcklbw m0, m1, m2 1863 pmaddubsw m0, m3 1864 punpckhbw m1, m2 1865 pmaddubsw m1, m3 1866 pmulhrsw m0, m10 1867 pmulhrsw m1, m10 1868 pcmpgtb m2, [rsp+16*1], m4 1869 packuswb m0, m1 1870 pand m0, m2 1871 pandn m2, m7 1872 por m0, m2 1873 movu m1, [tlq+r3+16*2+0] 1874 movu m2, [tlq+r3+16*2+1] 1875 mova [dstq+16*1], m0 1876 punpcklbw m0, m1, m2 1877 pmaddubsw m0, m3 
1878 punpckhbw m1, m2 1879 pmaddubsw m1, m3 1880 pmulhrsw m0, m10 1881 pmulhrsw m1, m10 1882 pcmpgtb m2, [rsp+16*2], m4 1883 packuswb m0, m1 1884 pand m0, m2 1885 pandn m2, m7 1886 por m0, m2 1887 movu m1, [tlq+r3+16*3+0] 1888 movu m2, [tlq+r3+16*3+1] 1889 mova [dstq+16*2], m0 1890 punpcklbw m0, m1, m2 1891 pmaddubsw m0, m3 1892 punpckhbw m1, m2 1893 pmaddubsw m1, m3 1894 paddw m5, m6 1895 pmulhrsw m0, m10 1896 pmulhrsw m1, m10 1897 pcmpgtb m2, [rsp+16*3], m4 1898 packuswb m0, m1 1899 pand m0, m2 1900 pandn m2, m7 1901 por m0, m2 1902 mova [dstq+16*3], m0 1903 dec hd 1904 jz .w64_end 1905 movifnidn strideq, stridemp 1906 add dstq, strideq 1907 add r5, dxq 1908 jl .w64_loop 1909.w64_end_loop: 1910 mova [dstq+16*0], m7 1911 mova [dstq+16*1], m7 1912 mova [dstq+16*2], m7 1913 mova [dstq+16*3], m7 1914 add dstq, strideq 1915 dec hd 1916 jg .w64_end_loop 1917.w64_end: 1918 RET 1919ALIGN function_align 1920.filter_edge: ; 32 pixels/iteration 1921 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] 1922 movu m2, [tlq-18] 1923 movu m1, [tlq-17] 1924 movu m3, [tlq- 2] 1925 movu m4, [tlq- 1] 1926 punpcklbw m0, m2, m1 1927 pmaddubsw m0, m7 1928 punpckhbw m2, m1 1929 pmaddubsw m2, m7 1930 punpcklbw m1, m3, m4 1931 pmaddubsw m1, m7 1932 punpckhbw m3, m4 1933 pmaddubsw m3, m7 1934 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] 1935 mova m5, [tlq-16] 1936 movu m6, [tlq-15] 1937 punpcklbw m4, m5, m6 1938 pmaddubsw m4, m7 1939 punpckhbw m5, m6 1940 pmaddubsw m5, m7 1941 paddw m0, m4 1942 paddw m2, m5 1943 mova m5, [tlq+ 0] 1944 movu m6, [tlq+ 1] 1945 punpcklbw m4, m5, m6 1946 pmaddubsw m4, m7 1947 punpckhbw m5, m6 1948 pmaddubsw m5, m7 1949 paddw m1, m4 1950 paddw m3, m5 1951 test r5d, r5d 1952 jnz .filter_end ; 3-tap 1953 movddup m7, [base+z_filter_k+8*8] 1954 movu m5, [tlq-14] 1955 movu m6, [tlq+ 2] 1956 punpcklbw m4, m5, m5 1957 pmaddubsw m4, m7 1958 punpckhbw m5, m5 1959 pmaddubsw m5, m7 1960 paddw m0, m4 1961 paddw m2, m5 1962 punpcklbw m5, m6, m6 1963 pmaddubsw m5, m7 1964 punpckhbw m6, 
m6 1965 pmaddubsw m6, m7 1966 paddw m1, m5 1967 paddw m3, m6 1968.filter_end: 1969%if ARCH_X86_64 1970 REPX {pmulhrsw x, m10}, m0, m2, m1, m3 1971%else 1972 mova m4, m10 1973 REPX {pmulhrsw x, m4 }, m0, m2, m1, m3 1974%endif 1975 packuswb m0, m2 1976 packuswb m1, m3 1977 mova [tlq+16*0], m0 1978 mova [tlq+16*1], m1 1979 ret 1980 1981%if ARCH_X86_64 1982cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy 1983 %define base r7-$$ 1984 %define maxwm r6m 1985 %define maxhm r7m 1986 lea r7, [$$] 1987 mov hd, hm 1988 mova m8, [base+pw_62] 1989 mova m9, [base+pw_64] 1990 lea r9d, [wq-4] 1991 mova m10, [base+pw_512] 1992 shl r9d, 6 1993 mova m11, [base+z1_shuf_w4] 1994 or r9d, hd 1995 mova m12, [base+z2_h_shuf] 1996%else 1997cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx 1998 %define base r1-$$ 1999 %define m8 [base+pw_62] 2000 %define m9 [base+pw_64] 2001 %define m10 [base+pw_512] 2002 %define m11 [rsp+16*16] 2003 %define m12 [rsp+16*17] 2004 %define r9b byte [rsp+16*18+4*0] 2005 %define r9d dword [rsp+16*18+4*0] 2006 %define r10d dword [rsp+16*18+4*1] 2007 %define r11d dword [rsp+16*18+4*2] 2008 %define maxwm [rsp+16*18+4*3] 2009 %define maxhm [rsp+16*19+4*0] 2010 %define stridemp [rsp+16*19+4*1] 2011 %define strideq r3 2012 %define dyd r4 2013 %define dyq r4 2014 mov stridemp, r1 2015 mov r1d, r6m 2016 mov r4d, r7m 2017 mov maxwm, r1d 2018 mov maxhm, r4d 2019 LEA r1, $$ 2020 lea hd, [wq-4] 2021 mova m0, [base+z1_shuf_w4] 2022 shl hd, 6 2023 mova m1, [base+z2_h_shuf] 2024 or hd, hm 2025 mova m11, m0 2026 mov r9d, hd 2027 mova m12, m1 2028%endif 2029 tzcnt wd, wd 2030 movifnidn angled, anglem 2031 movsxd wq, [base+ipred_z2_ssse3_table+wq*4] 2032%if ARCH_X86_64 2033 movzx dxd, angleb 2034%else 2035 movzx dxd, byte anglem 2036%endif 2037 xor angled, 0x400 2038 mova m0, [tlq-16*4] 2039 mov dyd, dxd 2040 mova m1, [tlq-16*3] 2041 neg dxq 2042 mova m2, [tlq-16*2] 2043 and dyd, ~1 2044 mova m3, [tlq-16*1] 2045 and dxq, ~1 2046 
movd m4, [tlq] 2047 movu m5, [tlq+16*0+1] 2048 movu m6, [tlq+16*1+1] 2049 movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 2050 movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle 2051 mova [rsp+16*2], m0 2052 pxor m7, m7 2053 mova [rsp+16*3], m1 2054 pshufb m4, m7 2055 mova [rsp+16*4], m2 2056 lea wq, [base+ipred_z2_ssse3_table+wq] 2057 mova [rsp+16*5], m3 2058 neg dxd 2059 mova [rsp+16*6], m4 2060 or dyd, 4<<16 2061 mova [rsp+16*7], m4 2062 mova [rsp+16*8], m5 2063 mova [rsp+16*9], m6 2064 movq m0, [base+z_base_inc+2] 2065 movsldup m1, [base+z2_dy_offset] 2066 movq m2, [base+pw_256] ; 4<<6 2067 movq [rsp+16*14+8*0], m0 2068 movq [rsp+16*15+8*0], m1 2069 movq [rsp+16*15+8*1], m2 2070%if ARCH_X86_64 2071 lea r10d, [dxq+(128<<6)] ; xpos 2072%else 2073 mov [rsp+16*7+4*1], dyd 2074 lea r4d, [dxq+(128<<6)] 2075 mov r10d, r4d 2076 movzx hd, r9b 2077%endif 2078 mov r11d, (128-4)<<6 2079 jmp wq 2080.w4: 2081 test angled, 0x400 2082 jnz .w4_main 2083 movd m5, [tlq+4] 2084 lea r3d, [hq+2] 2085 add angled, 1022 2086 pshufb m5, m7 2087 shl r3d, 6 2088 movd [rsp+16*8+4], m5 2089 test r3d, angled 2090 jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) 2091 call .upsample_above 2092 sub angled, 1075 ; angle - 53 2093 lea r3d, [hq+3] 2094 xor angled, 0x7f ; 180 - angle 2095 movd m0, r3d 2096 movd m6, angled 2097 shr angled, 8 ; is_sm << 1 2098 pshufb m0, m7 2099 pshufb m6, m7 2100 pcmpeqb m0, [base+z_filter_wh4] 2101 pand m6, m0 2102 pcmpgtb m6, [base+z_filter_t_w48+angleq*8] 2103 jmp .w8_filter_left 2104.upsample_above: ; w4/w8 2105 movq m3, [rsp+gprsize+16*8-2] 2106 movq m1, [rsp+gprsize+16*8-1] 2107 movq m0, [rsp+gprsize+16*8+0] 2108 movq m4, [rsp+gprsize+16*8+1] 2109 movddup m5, [base+pb_36_m4] 2110 punpcklbw m1, m3 2111 punpcklbw m2, m0, m4 2112 pmaddubsw m1, m5 2113 pmaddubsw m2, m5 2114%if ARCH_X86_64 2115 mova m11, [base+pb_0to15] 2116 lea r10d, [r10+dxq+(1<<6)] 2117 mov r11d, (128-7)<<6 2118%else 2119 mova m3, 
[base+pb_0to15] 2120 mov r3d, [rsp+gprsize+16*18+4*1] 2121 mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 2122 lea r3d, [r3+dxq+(1<<6)] 2123 mov [rsp+gprsize+16*18+4*1], r3d 2124 mova [rsp+gprsize+16*16], m3 2125%endif 2126 add dxd, dxd 2127 paddw m1, m2 2128 pmulhrsw m1, m10 2129 movq m2, [rsp+gprsize+16*14] 2130 paddw m2, m2 2131 movq [rsp+gprsize+16*14], m2 2132 packuswb m1, m1 2133 punpcklbw m1, m0 2134 mova [rsp+gprsize+16*8], m1 2135 ret 2136.w4_no_upsample_above: 2137 lea r3d, [hq+3] 2138 mov [rsp], angled 2139 sub angled, 1112 ; angle - 90 2140 movd m0, r3d 2141 mov r3d, 90 2142 movd m1, angled 2143 sub r3d, angled ; 180 - angle 2144 shr angled, 8 ; is_sm << 1 2145 movu m3, [base+z_filter_wh4] 2146 mova m4, [base+z_filter_t_w48+angleq*8] 2147 call .w8_filter_top 2148 mov angled, [rsp] 2149 lea r3d, [hq+2] 2150 sub angled, 139 2151 shl r3d, 6 2152 test r3d, angled 2153 jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) 2154.upsample_left: ; w4/w8 2155 neg hq 2156 movd m0, [tlq+hq] 2157 pshufb m0, m7 2158 movd [rsp+16*6+hq-4], m0 2159 movq m3, [rsp+16*5+7] 2160 movq m0, [rsp+16*5+8] 2161 movq m2, [rsp+16*5+9] 2162 movq m4, [rsp+16*5+10] 2163 movddup m5, [base+pb_36_m4] 2164 punpcklbw m1, m0, m3 2165 punpcklbw m2, m4 2166 pmaddubsw m1, m5 2167 pmaddubsw m2, m5 2168 movshdup m3, [base+z2_dy_offset] 2169%if ARCH_X86_64 2170 mova m12, [base+z2_upsample] 2171 add dyd, dyd 2172%else 2173 mova m4, [base+z2_upsample] 2174 shl dword [rsp+16*7+4*1], 1 2175 mova m12, m4 2176%endif 2177 paddw m1, m2 2178 pmulhrsw m1, m10 2179 movq [rsp+16*15], m3 2180 packuswb m1, m1 2181 punpcklbw m0, m1 2182 mova [rsp+16*5], m0 2183.w4_main: 2184 movd m6, dxd 2185%if ARCH_X86_64 2186 movd m3, dyd 2187%else 2188 movd m3, [rsp+16*7+4*1] 2189%endif 2190 movddup m0, [rsp+16*14+8*0] 2191 pshufb m6, [base+pw_256] 2192 paddw m7, m6, m6 2193 movq m5, [base+pw_m1to4] 2194 pshuflw m4, m3, q0000 2195 punpcklqdq m6, m7 2196 pmullw m4, m5 2197 pshuflw m3, m3, q1111 2198 paddw m6, m0 
2199 mov r2d, r10d 2200 pshuflw m0, m4, q3333 2201 psubw m4, [rsp+16*15] 2202 movq [rsp+16*6+8*1], m3 2203 movq [rsp+8*1], m0 ; dy*4 2204 mov r5, dstq 2205.w4_loop0: 2206 mova [rsp+16*12], m6 2207 movq [rsp+8*0], m4 2208 pand m0, m4, m8 2209 psraw m4, 6 2210 psubw m1, m9, m0 2211 psllw m0, 8 2212 por m0, m1 ; 64-frac_y, frac_y 2213 movq [rsp+8*3], m0 2214 pabsw m4, m4 2215 movq [rsp+8*2], m4 2216 movzx hd, r9b 2217.w4_loop: 2218 lea r3d, [r2+dxq] 2219 shr r2d, 6 ; base_x0 2220 movq m0, [rsp+r2] 2221 lea r2d, [r3+dxq] 2222 shr r3d, 6 ; base_x1 2223 movhps m0, [rsp+r3] 2224 lea r3d, [r2+dxq] 2225 shr r2d, 6 ; base_x2 2226 movq m1, [rsp+r2] 2227 lea r2d, [r3+dxq] 2228 shr r3d, 6 ; base_x3 2229 movhps m1, [rsp+r3] 2230 pand m2, m8, m6 2231 paddsw m5, m6, m7 2232 psubw m3, m9, m2 2233 psllw m2, 8 2234 pshufb m0, m11 2235 por m2, m3 2236 pmaddubsw m0, m2 2237 pand m2, m8, m5 2238 psubw m3, m9, m2 2239 psllw m2, 8 2240 pshufb m1, m11 2241 por m2, m3 2242 pmaddubsw m1, m2 2243 cmp r3d, 127 ; topleft 2244 jge .w4_toponly 2245 movzx r3d, byte [rsp+8*2+0] ; base_y0 2246 movq m3, [rsp+r3] 2247 movzx r3d, byte [rsp+8*2+2] ; base_y1 2248 movhps m3, [rsp+r3] 2249 movzx r3d, byte [rsp+8*2+4] ; base_y2 2250 movq m4, [rsp+r3] 2251 movzx r3d, byte [rsp+8*2+6] ; base_y3 2252 movhps m4, [rsp+r3] 2253 pshufb m3, m12 2254 pshufb m4, m12 2255 punpckldq m2, m3, m4 2256 punpckhdq m3, m4 2257 movddup m4, [rsp+8*3] 2258 pmaddubsw m2, m4 2259 pmaddubsw m3, m4 2260 psraw m6, 15 ; base_x < topleft 2261 pand m2, m6 2262 pandn m6, m0 2263 por m0, m2, m6 2264 psraw m6, m5, 15 2265 pand m3, m6 2266 pandn m6, m1 2267 por m1, m3, m6 2268.w4_toponly: 2269 pmulhrsw m0, m10 2270 pmulhrsw m1, m10 2271 movifnidn strideq, stridemp 2272 packuswb m0, m1 2273 movd [dstq+strideq*0], m0 2274 pshuflw m1, m0, q1032 2275 movd [dstq+strideq*1], m1 2276 lea dstq, [dstq+strideq*2] 2277 punpckhqdq m0, m0 2278 movd [dstq+strideq*0], m0 2279 psrlq m0, 32 2280 movd [dstq+strideq*1], m0 2281 sub hd, 4 2282 jz .w4_end 2283 
movq m4, [rsp+8*2] 2284 movq m3, [rsp+16*6+8*1] 2285 paddw m6, m5, m7 ; xpos += dx 2286 psubw m4, m3 2287 movq [rsp+8*2], m4 2288 lea dstq, [dstq+strideq*2] 2289 cmp r2d, r11d 2290 jge .w4_loop 2291 movddup m5, [rsp+8*3] 2292.w4_leftonly_loop: 2293 movzx r2d, byte [rsp+8*2+0] ; base_y0 2294 movq m1, [rsp+r2] 2295 movzx r2d, byte [rsp+8*2+2] ; base_y1 2296 movhps m1, [rsp+r2] 2297 movzx r2d, byte [rsp+8*2+4] ; base_y2 2298 movq m2, [rsp+r2] 2299 movzx r2d, byte [rsp+8*2+6] ; base_y3 2300 movhps m2, [rsp+r2] 2301 psubw m4, m3 2302 pshufb m1, m12 2303 pshufb m2, m12 2304 movq [rsp+8*2], m4 2305 punpckldq m0, m1, m2 2306 punpckhdq m1, m2 2307 pmaddubsw m0, m5 2308 pmaddubsw m1, m5 2309 pmulhrsw m0, m10 2310 pmulhrsw m1, m10 2311 packuswb m0, m1 2312 movd [dstq+strideq*0], m0 2313 pshuflw m1, m0, q1032 2314 movd [dstq+strideq*1], m1 2315 lea dstq, [dstq+strideq*2] 2316 punpckhqdq m0, m0 2317 movd [dstq+strideq*0], m0 2318 psrlq m0, 32 2319 movd [dstq+strideq*1], m0 2320 lea dstq, [dstq+strideq*2] 2321 sub hd, 4 2322 jg .w4_leftonly_loop 2323.w4_end: 2324 sub r9d, 1<<8 2325 jl .w4_ret 2326 movq m4, [rsp+8*1] 2327 add r5, 4 2328 mov dstq, r5 2329 paddw m4, [rsp+8*0] ; base_y += 4*dy 2330 movzx r2d, word [rsp+16*15+8*1] 2331 movddup m6, [rsp+16*15+8*1] 2332 paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above) 2333 add r2d, r10d 2334 mov r10d, r2d 2335 jmp .w4_loop0 2336.w4_ret: 2337 RET 2338.w8: 2339 test angled, 0x400 2340 jnz .w4_main 2341 movd m5, [tlq+8] 2342 lea r3d, [angleq+126] 2343 pshufb m5, m7 2344%if ARCH_X86_64 2345 mov r3b, hb 2346%else 2347 xor r3b, r3b 2348 or r3d, hd 2349%endif 2350 movd [rsp+16*8+8], m5 2351 cmp r3d, 8 2352 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm 2353 call .upsample_above 2354 sub angled, 53 2355 lea r3d, [hq+7] 2356 xor angled, 0x7f ; 180 - angle 2357 movu m1, [base+z_filter_wh8] 2358 movd m0, r3d 2359 movd m6, angled 2360 shr angled, 8 ; is_sm << 1 2361 psrldq m2, [base+z_filter_t_w48+angleq*8], 4 2362 pshufb m0, m7 
2363 pshufb m6, m7 2364 pcmpeqb m0, m1 2365 pand m6, m0 2366 pcmpgtb m6, m2 2367%if ARCH_X86_64 2368 movq [rsp+16*15+8*1], m10 ; 8<<6 2369%else 2370 movq m0, m10 2371 movq [rsp+16*15+8*1], m0 2372%endif 2373 jmp .w8_filter_left 2374.w8_no_upsample_above: 2375 lea r3d, [hq+7] 2376 mov [rsp], angled 2377 sub angled, 90 2378 movd m0, r3d 2379 mov r3d, 90 2380 movd m1, angled 2381 sub r3d, angled ; 180 - angle 2382 shr angled, 8 ; is_sm << 1 2383 movu m3, [base+z_filter_wh8] 2384 psrldq m4, [base+z_filter_t_w48+angleq*8], 4 2385 call .w8_filter_top 2386 mov r3d, [rsp] 2387 sub r3d, 141 2388%if ARCH_X86_64 2389 mov r3b, hb 2390%else 2391 xor r3b, r3b 2392 or r3d, hd 2393%endif 2394 cmp r3d, 8 2395 jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm 2396.w8_filter_left: 2397 pmovmskb r5d, m6 2398 test r5d, r5d 2399 jz .w4_main 2400 imul r5d, 0x55555555 2401 mov r3, tlq 2402 shr r5d, 30 2403 sub r5, 3 ; filter_strength-3 2404 jmp .filter_left 2405.w8_filter_top: 2406 movd m6, r3d 2407 REPX {pshufb x, m7}, m0, m1, m6 2408 pcmpeqb m0, m3 2409 pand m1, m0 2410 pand m6, m0 2411 pcmpgtb m1, m4 2412 pcmpgtb m6, m4 2413 pmovmskb r5d, m1 2414 test r5d, r5d 2415 jz .w8_filter_top_end ; filter_strength == 0 2416 imul r5d, 0x55555555 2417 movq m0, [rsp+gprsize+16*8-2] 2418 shr r5d, 30 2419 movq m1, [rsp+gprsize+16*8-1] 2420 sub r5, 3 ; filter_strength-3 2421 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] 2422 punpcklbw m0, m1 2423 pmaddubsw m0, m7 2424 movq m1, [rsp+gprsize+16*8+0] 2425 movq m2, [rsp+gprsize+16*8+1] 2426 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] 2427 punpcklbw m1, m2 2428 pmaddubsw m1, m7 2429 movq m2, [rsp+gprsize+16*8+2] 2430 movddup m7, [base+z_filter_k+8*2+r5*8+24*2] 2431 punpcklbw m2, m2 2432 pmaddubsw m2, m7 2433 paddw m0, m1 2434 paddw m0, m2 2435%if ARCH_X86_64 2436 mov r3d, r7m ; maxw, offset due to call 2437%else 2438 mov r3d, [rsp+gprsize+16*18+4*3] 2439%endif 2440 pmulhrsw m0, m10 2441 pmulhrsw m1, m10 2442 packuswb m0, m1 2443 movq [rsp+gprsize+16*8], 
m0 2444 cmp r3d, 8 2445 jge .w8_filter_top_end 2446 movq m0, [tlq+r3+1] 2447 movq [rsp+gprsize+r3+16*8], m0 2448.w8_filter_top_end: 2449 ret 2450.w16: 2451 test angled, 0x400 2452 jnz .w4_main 2453 lea r3d, [hq+15] 2454 sub angled, 90 2455 movd m0, r3d 2456 mov r3d, 90 2457 movd m1, angled 2458 sub r3d, angled ; 180 - angle 2459 shr angled, 8 ; is_sm << 1 2460 movd m6, r3d 2461 REPX {pshufb x, m7}, m0, m1, m6 2462 movq m3, [base+z_filter_t_w16+angleq*4] 2463 pcmpeqb m0, [base+z_filter_wh16] 2464 pand m1, m0 2465 pand m6, m0 2466 pcmpgtb m1, m3 2467 pcmpgtb m6, m3 2468 pmovmskb r5d, m1 2469 mov r3, tlq 2470 test r5d, r5d 2471 jz .w16_filter_left ; filter_strength == 0 2472 imul r5d, 0x24924924 2473 pshufb m5, [base+z_filter_t_w16] ; tlq[16] 2474 shr r5d, 30 2475 adc r5, -4 ; filter_strength-3 2476 movd [rsp+16*9], m5 2477 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] 2478 movu m1, [rsp+16*8-2] 2479 movu m2, [rsp+16*8-1] 2480 punpcklbw m0, m1, m2 2481 pmaddubsw m0, m7 2482 punpckhbw m1, m2 2483 pmaddubsw m1, m7 2484 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] 2485 mova m3, [rsp+16*8+0] 2486 movu m4, [rsp+16*8+1] 2487 punpcklbw m2, m3, m4 2488 pmaddubsw m2, m7 2489 punpckhbw m3, m4 2490 pmaddubsw m3, m7 2491 paddw m0, m2 2492 paddw m1, m3 2493 test r5d, r5d 2494 jnz .w16_filter_end ; 3-tap 2495 movddup m7, [base+z_filter_k+8*8] 2496 movu m3, [rsp+16*8+2] 2497 punpcklbw m2, m3, m3 2498 pmaddubsw m2, m7 2499 punpckhbw m3, m3 2500 pmaddubsw m3, m7 2501 paddw m0, m2 2502 paddw m1, m3 2503.w16_filter_end: 2504 mov r2d, maxwm 2505 pmulhrsw m0, m10 2506 pmulhrsw m1, m10 2507 packuswb m0, m1 2508 mova [rsp+16*8], m0 2509 cmp r2d, 16 2510 jge .w16_filter_left 2511 movu m0, [r3+r2+1] 2512 movu [rsp+r2+16*8], m0 2513.w16_filter_left: 2514 pmovmskb r5d, m6 2515 test r5d, r5d 2516 jz .w4_main 2517 imul r5d, 0x24924924 2518 shr r5d, 30 2519 adc r5, -4 ; filter_strength-3 2520 jmp .filter_left 2521.w32: 2522 test angled, 0x400 2523 jnz .w4_main 2524 pshufb m6, [base+z_filter_t_w16] ; 
tlq[32] 2525 mov r3, tlq 2526 lea tlq, [rsp+16*9] 2527 movd [tlq+16*1], m6 2528 xor r5d, r5d ; filter_strength = 3 2529 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 2530 mova m0, [tlq+16*0] 2531 mova m1, [tlq+16*1] 2532 mov r2d, maxwm 2533 mova [rsp+16*8], m0 2534 mova [rsp+16*9], m1 2535 cmp r2d, 32 2536 jge .filter_left 2537 movu m0, [r3+r2+16*0+1] 2538 movu m1, [r3+r2+16*1+1] 2539 movu [rsp+r2+16*8], m0 2540 movu [rsp+r2+16*9], m1 2541 jmp .filter_left 2542.w64: 2543 movu m0, [tlq+16*2+1] 2544 movu m1, [tlq+16*3+1] 2545 mova [rsp+16*10], m0 2546 mova [rsp+16*11], m1 2547 test angled, 0x400 2548 jnz .w4_main 2549 pshufb m1, [base+z_filter_t_w16] ; tlq[64] 2550 mov r3, tlq 2551 lea tlq, [rsp+16*11] 2552 movd [tlq+16*1], m1 2553 xor r5d, r5d ; filter_strength = 3 2554 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 2555 sub tlq, 16*2 2556 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 2557 mova m0, [tlq+16*0] 2558 mova m1, [tlq+16*1] 2559 mova m2, [tlq+16*2] 2560 mova m3, [tlq+16*3] 2561 mov r2d, maxwm 2562 mova [rsp+16* 8], m0 2563 mova [rsp+16* 9], m1 2564 mova [rsp+16*10], m2 2565 mova [rsp+16*11], m3 2566 cmp r2d, 64 2567 jge .filter_left 2568 movu m0, [r3+r2+16*0+1] 2569 movu m1, [r3+r2+16*1+1] 2570 movu [rsp+r2+16* 8], m0 2571 movu [rsp+r2+16* 9], m1 2572 cmp r2d, 32 2573 jge .filter_left 2574 movu m0, [r3+r2+16*2+1] 2575 movu m1, [r3+r2+16*3+1] 2576 movu [rsp+r2+16*10], m0 2577 movu [rsp+r2+16*11], m1 2578.filter_left: 2579 neg hq 2580 movd m0, [r3+hq] 2581 pxor m1, m1 2582 pshufb m0, m1 2583 movd [rsp+16*6+hq-4], m0 2584 lea tlq, [rsp+16*5] 2585 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 2586 cmp hd, -32 2587 jge .filter_left_end 2588 sub tlq, 16*2 2589 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge 2590 mova m0, [tlq+16*0] 2591 mova m1, [tlq+16*1] 2592 mova [rsp+16*2], m0 2593 mova [rsp+16*3], m1 2594.filter_left_end: 2595 mov r2d, maxhm 2596 mova m0, [rsp+16*5] 2597 
    ; --- tail of the preceding ipred_z2 path (function begins before this
    ; chunk): restore the unfiltered edge bytes beyond max_width/max_height
    ; over the filtered copies, then re-enter the shared .w4_main renderer ---
    mova                 m1, [rsp+16*6]
    mova                 m2, [rsp+16*7]
    neg                  r2
    mova         [rsp+16*4], m0
    mova         [rsp+16*5], m1
    mova         [rsp+16*6], m2
    cmp                 r2d, hd
    jle .w4_main
    movu                 m0, [r3+r2-16*2]
    movu                 m1, [r3+r2-16*1]
    movu      [rsp+r2+16*4], m0
    movu      [rsp+r2+16*5], m1
    cmp                 r2d, -32
    jle .w4_main
    movu                 m0, [r3+r2-16*4]
    movu                 m1, [r3+r2-16*3]
    movu      [rsp+r2+16*2], m0
    movu      [rsp+r2+16*3], m1
    jmp .w4_main

;---------------------------------------------------------------------------------------
; void dav1d_ipred_z3_8bpc_ssse3(pixel *dst, ptrdiff_t stride,
;                                const pixel *topleft, int width, int height,
;                                int angle, ...);
; Z3 directional intra prediction (angle > 180): pixels are predicted from
; the left edge.  Columns are rendered into a stack buffer and transposed
; into dst at the end.  Dispatch is by height via ipred_z3_ssse3_table.
; On x86-64, m8/m9/m10 cache pw_62/pw_64/pw_512; on x86-32 they are memory
; operands, and stride/org_w are spilled into the first two dst rows
; (those rows are rewritten by the transpose stages before the function
; returns).
;---------------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
    %define            base  r7-$$
    lea                  r7, [$$]
    mova                 m8, [base+pw_62]
    mova                 m9, [base+pw_64]
    mova                m10, [base+pw_512]
    mov              org_wd, wd
%else
cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
    %define            base  r1-$$
    %define              m8  [base+pw_62]
    %define              m9  [base+pw_64]
    %define             m10  [base+pw_512]
    %define          org_wd  r5
    %define          org_wq  r5
    mov   [dstq+strideq*0], strideq    ; spill stride into dst row 0
    mov   [dstq+strideq*1], wd         ; spill org_w into dst row 1
    LEA                  r1, $$
%endif
    tzcnt                hd, hm
    movifnidn        angled, anglem
    dec                 tlq
    movsxd               hq, [base+ipred_z3_ssse3_table+hq*4]
    sub              angled, 180
    mov                 dyd, angled
    neg                 dyd
    xor              angled, 0x400
    or                  dyq, ~0x7e
    lea                  hq, [base+ipred_z3_ssse3_table+hq]
    movzx               dyd, word [base+dr_intra_derivative+45*2-1+dyq]
    jmp                  hq
.h4:                                   ; height == 4
    lea                 r4d, [angleq+88]
    test                r4d, 0x480
    jnz .h4_no_upsample                ; !enable_intra_edge_filter || angle >= 40
    sar                 r4d, 9
    add                 r4d, wd
    cmp                 r4d, 8
    jg .h4_no_upsample                 ; w > 8 || (w == 8 && is_sm)
    ; 2x upsample of the left edge, then render at doubled dy
    movu                 m3, [tlq-7]
    movu                 m1, [base+z_upsample1-4]
    movu                 m4, [base+z_filter_s+2]
    pshufb               m0, m3, m1
    pxor                 m1, m1
    pshufb               m2, m3, m1
    pshufb               m1, m3, m4
    mova           [rsp+16], m2        ; top[max_base_y]
    movddup              m2, [base+pb_36_m4]
    add                 dyd, dyd
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    movd                 m5, dyd
    mov                 r5d, dyd
    pshufb               m5, [base+pw_256]
    paddw                m0, m1
    pmulhrsw             m0, m10
    shl                  wd, 2
    mov                 tlq, rsp
    sub                 rsp, wq
    packuswb             m0, m0
    punpcklbw            m0, m3
    paddw                m6, m5, m5
    punpcklqdq           m5, m6
    pshufb               m0, [base+pb_15to0]
    mova              [tlq], m0
.h4_upsample_loop:                     ; two columns of 4 per iteration
    lea                 r4d, [r5+dyq]
    shr                 r5d, 6
    movq                 m0, [tlq+r5]
    lea                 r5d, [r4+dyq]
    shr                 r4d, 6
    movhps               m0, [tlq+r4]
    pand                 m2, m8, m5
    psubw                m1, m9, m2
    psllw                m2, 8
    por                  m1, m2
    pmaddubsw            m0, m1
    paddw                m5, m6
    pmulhrsw             m0, m10
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jg .h4_upsample_loop
    jmp .h4_transpose
.h4_no_upsample:
    mov                 r4d, 7
    test             angled, 0x400     ; !enable_intra_edge_filter
    jnz .h4_main
    ; decide edge filter strength from angle/size tables
    lea                 r4d, [wq+3]
    movd                 m0, r4d
    movd                 m2, angled
    shr              angled, 8         ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    pcmpeqb              m1, m0, [base+z_filter_wh4]
    pand                 m1, m2
    pcmpgtb              m1, [base+z_filter_t_w48+angleq*8]
    pmovmskb            r5d, m1
    mov                 r4d, 7
    test                r5d, r5d
    jz .h4_main                        ; filter_strength == 0
    ; filter the edge with the z_filter_k kernel for this strength
    movu                 m2, [tlq-7]
    imul                r5d, 0x55555555
    movu                 m3, [base+z_filter_s-2]
    shr                 r5d, 30        ; filter_strength
    mova                 m4, [base+z_upsample2]
    movddup              m5, [base+z_filter_k-8+r5*8+24*0]
    movddup              m6, [base+z_filter_k-8+r5*8+24*1]
    movddup              m7, [base+z_filter_k-8+r5*8+24*2]
    pshufb               m0, m2, m3
    shufps               m3, m4, q2121
    pmaddubsw            m1, m0, m5
    pmaddubsw            m0, m6
    pshufb               m5, m2, m3
    pmaddubsw            m3, m5, m6
    pmaddubsw            m5, m7
    pshufb               m2, m4
    pmaddubsw            m2, m7
    paddw                m0, m1
    paddw                m1, m3
    paddw                m0, m5
    paddw                m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    lea                 r2d, [r4+2]
    cmp                  wd, 4
    cmovne              r4d, r2d
    pshufd               m0, m0, q0000
    lea                 tlq, [rsp+15]
    packuswb             m0, m1
    mova              [rsp], m0
.h4_main:
    movd                 m5, dyd
    movddup              m0, [base+z_base_inc] ; base_inc << 6
    sub                 tlq, r4
    shl                 r4d, 6
    movd                 m7, [tlq]
    movd                 m4, r4d
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, [base+pw_m256]
    mova                 m3, [base+z3_shuf_h4]
    lea                  r5, [dyq+r4+63]  ; ypos
    pshufb               m4, [base+pw_256]
    psubw                m4, m0           ; max_base_y
    shl                  wd, 2
    paddw                m6, m5, m5
    sub                 rsp, wq
    punpcklqdq           m5, m6
.h4_loop:                              ; interpolate two columns per iteration
    lea                  r4, [r5+dyq]
    sar                  r5, 6
    movq                 m0, [tlq+r5-4]
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    movhps               m0, [tlq+r4-4]
    pand                 m2, m8, m5
    psubw                m1, m9, m2
    psllw                m2, 8
    pshufb               m0, m3
    por                  m1, m2
    pmaddubsw            m0, m1
    pcmpgtw              m1, m4, m5     ; ypos < max_base_y ?
    paddw                m5, m6
    pmulhrsw             m0, m10
    pand                 m0, m1         ; select interpolated vs. clamped edge
    pandn                m1, m7
    por                  m0, m1
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jz .h4_transpose
    test                r5d, r5d
    jg .h4_loop
    packuswb             m7, m7
.h4_end_loop:                          ; remaining columns are all edge pixel
    movq         [rsp+wq-8], m7
    sub                  wd, 8
    jg .h4_end_loop
.h4_transpose:                         ; 4-row transpose from stack to dst
    mova                 m1, [base+z_transpose4]
%if ARCH_X86_32
    mov             strideq, [dstq]    ; reload spilled stride/org_w
    mov              org_wd, [dstq+strideq]
%endif
    lea                  r2, [strideq*3]
    lea                dstq, [dstq+org_wq-4]
.h4_transpose_loop:
    mova                 m0, [rsp]
    add                 rsp, 16
    pshufb               m0, m1
    movd   [dstq+strideq*0], m0
    pshuflw              m2, m0, q1032
    movd   [dstq+strideq*1], m2
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    sub                dstq, 4
    sub              org_wd, 4
    jg .h4_transpose_loop
    RET
.h8:                                   ; height == 8
    lea                 r4d, [angleq+88]
    and                 r4d, ~0x7f
    or                  r4d, wd
    cmp                 r4d, 8
    ja .h8_no_upsample                 ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
    ; 2x upsample of the left edge
    mova                 m4, [tlq-15]
    and                 r4d, 4
    movu                 m3, [tlq- 9]
    movd                 m1, r4d
    movu                 m2, [base+z_filter_s+2]
    pxor                 m0, m0
    movu                 m5, [base+z_filter_s+6]
    movddup              m7, [base+pb_36_m4]
    pshufb               m1, m0        ; w & 4
    movu                 m0, [base+z_upsample1-4]
    pmaxub               m1, m0        ; clip 4x8
    add                 dyd, dyd
    pshufb               m0, m4, m1
    pmaddubsw            m0, m7
    pshufb               m1, m4, m2
    pmaddubsw            m1, m7
    pshufb               m2, m3, [base+z_upsample1]
    pmaddubsw            m2, m7
    pshufb               m3, m5
    pmaddubsw            m3, m7
    movd                 m5, dyd
    neg                 dyq
    paddw                m1, m0
    paddw                m2, m3
    pmulhrsw             m1, m10
    pmulhrsw             m2, m10
    shl                  wd, 3
    lea                 tlq, [rsp+16]
    pshufb               m5, [base+pw_256]
    sub                 rsp, wq
    packuswb             m1, m2
    lea                  r5, [dyq+63]
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    mova          [tlq-16*1], m0
    mova          [tlq-16*0], m1
    paddw                m6, m5, m5
    punpcklqdq           m5, m6
.h8_upsample_loop:                     ; two columns of 8 per iteration
    lea                  r4, [r5+dyq]
    sar                  r5, 6
    movu                 m0, [tlq+r5]
    lea                  r5, [r4+dyq]
    sar                  r4, 6
    movu                 m1, [tlq+r4]
    pand                 m3, m8, m5
    psubw                m2, m9, m3
    psllw                m2, 8
    por                  m3, m2
    pshufd               m2, m3, q1010
    pmaddubsw            m0, m2
    punpckhqdq           m3, m3
    pmaddubsw            m1, m3
    paddw                m5, m6
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packuswb             m1, m0
    mova        [rsp+wq-16], m1
    sub                  wd, 16
    jg .h8_upsample_loop
    jmp .h8_transpose
.h8_no_upsample:
    lea                 r4d, [wq+7]
    movd                 m0, r4d
    and                 r4d, 7
    or                  r4d, 8         ; imin(w+7, 15)
    test             angled, 0x400
    jnz .h8_main
    movd                 m2, angled
    shr              angled, 8         ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    movu                 m1, [base+z_filter_wh8]
    psrldq               m3, [base+z_filter_t_w48+angleq*8], 4
    pcmpeqb              m1, m0
    pand                 m1, m2
    pcmpgtb              m1, m3
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .h8_main                        ; filter_strength == 0
    ; build padded edge buffer on stack, then filter it
    mova                 m0, [tlq-15]
    imul                r5d, 0x55555555
    movd                 m1, [tlq+1]
    neg                  r4
    movd                 m2, [tlq+r4]
    shr                 r5d, 30
    pxor                 m7, m7
    lea                 tlq, [rsp+16*2]
    sub                  r5, 3         ; filter_strength-3
    mova          [tlq+16*0], m0
    pshufb               m1, m7
    mova          [tlq+16*1], m1
    pshufb               m2, m7
    movq         [tlq+r4+8], m2
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sar                 r5d, 1
    add                 tlq, 31
    add                 r5d, 17
    cmp                  wd, 8
    cmova               r4d, r5d
.h8_main:
    movd                 m5, dyd
    sub                 tlq, r4
    shl                 r4d, 6
    movd                 m7, [tlq]
    movd                 m4, r4d
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, [base+pw_m256]
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]
    pshufb               m4, [base+pw_256]
    psubw                m4, [base+z3_base_inc]
    shl                  wd, 3
    mova                 m6, m5
    sub                 rsp, wq
.h8_loop:                              ; one column of 8 per iteration
    mov                  r4, r5
    sar                  r4, 6
    movu                 m0, [tlq+r4-8]
    pand                 m2, m8, m5
    psubw                m1, m9, m2
    psllw                m2, 8
    pshufb               m0, m3
    por                  m1, m2
    pmaddubsw            m0, m1
    pcmpgtw              m1, m4, m5
    paddw                m5, m6
    pmulhrsw             m0, m10
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    packuswb             m0, m0
    movq         [rsp+wq-8], m0
    sub                  wd, 8
    jz .h8_transpose
    add                  r5, dyq
    jg .h8_loop
    packuswb             m7, m7
.h8_end_loop:
    movq         [rsp+wq-8], m7
    sub                  wd, 8
    jg .h8_end_loop
.h8_transpose:
%if ARCH_X86_32
    mov             strideq, [dstq]    ; reload spilled stride/org_w
    mov              org_wd, [dstq+strideq]
%endif
    or                  r3d, 8
    cmp              org_wd, 4
%if ARCH_X86_64
    jne .end_transpose_main
%else
    jne .end_transpose_loop
%endif
    ; w == 4: dedicated 4x8 transpose
    mova                 m1, [rsp+16*1]
    mova                 m0, [rsp+16*0]
    lea                  r2, [strideq*3]
    add                 rsp, 16*2
    punpcklbw            m2, m1, m0
    punpckhbw            m1, m0
    punpckhbw            m0, m1, m2
    punpcklbw            m1, m2
.write_4x8_end:
    call .write_4x8
    RET
.write_4x8:                            ; helper: write two 4x4 halves (m0, m1)
    movd      [dstq+r2       ], m0
    pshuflw              m4, m0, q1032
    movd      [dstq+strideq*2], m4
    punpckhqdq           m0, m0
    movd      [dstq+strideq*1], m0
    psrlq                m0, 32
    movd      [dstq+strideq*0], m0
    lea                dstq, [dstq+strideq*4]
    movd      [dstq+r2       ], m1
    pshuflw              m4, m1, q1032
    movd      [dstq+strideq*2], m4
    punpckhqdq           m1, m1
    movd      [dstq+strideq*1], m1
    psrlq                m1, 32
    movd      [dstq+strideq*0], m1
    ret
.h16:                                  ; height == 16
    lea                 r4d, [wq+15]
    movd                 m0, r4d
    and                 r4d, 15
    or                  r4d, 16        ; imin(w+15, 31)
    test             angled, 0x400
    jnz .h16_main
    movd                 m2, angled
    shr              angled, 8         ; is_sm << 1
    pxor                 m1, m1
    pshufb               m0, m1
    pshufb               m2, m1
    movq                 m3, [base+z_filter_t_w16+angleq*4]
    pcmpeqb              m1, m0, [base+z_filter_wh16]
    pand                 m1, m2
    pcmpgtb              m1, m3
    pmovmskb            r5d, m1
    test                r5d, r5d
    jz .h16_main                       ; filter_strength == 0
    mova                 m0, [tlq-16*2+1]
    imul                r5d, 0x24924924
    mova                 m1, [tlq-16*1+1]
    neg                  r4
    movd                 m2, [tlq-16*0+1]
    shr                 r5d, 30
    movd                 m3, [tlq+r4]
    adc                  r5, -4        ; filter_strength-3
    pxor                 m7, m7
    lea                 tlq, [rsp+16*2]
    mova          [tlq-16*1], m0
    pshufb               m2, m7
    mova          [tlq+16*0], m1
    pshufb               m3, m7
    mova          [tlq+16*1], m2
    movq         [tlq+r4+8], m3
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    add                 tlq, 31
    cmp                  wd, 16
    jle .h16_main
    ; w > 16: extend the filtered edge with the tail kernel
    pshuflw              m0, [tlq-47], q0000
    sar                  r5, 1
    movq                 m1, [base+z3_filter_k_tail+r5*4]
    lea                 r4d, [r5+33]
    pmaddubsw            m0, m1
%if ARCH_X86_64
    pmulhrsw             m0, m10
%else
    pmulhrsw             m0, m4
%endif
    packuswb             m0, m0
    movd           [tlq-35], m0
.h16_main:
    movd                 m5, dyd
    sub                 tlq, r4
    movd                 m4, r4d
    shl                 r4d, 6
    movd                 m7, [tlq]
    pxor                 m6, m6
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, m6
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]
    pshufb               m4, m6
    psubb                m4, [base+pb_15to0]
    shl                  wd, 4
    mova                 m6, m5
    sub                 rsp, wq
.h16_loop:                             ; one column of 16 per iteration
    mov                  r4, r5
    pand                 m2, m8, m5
    sar                  r4, 6
    psubw                m1, m9, m2
    psllw                m2, 8
    movu                 m0, [tlq+r4-8*2]
    por                  m2, m1
    movu                 m1, [tlq+r4-8*1]
    pshufb               m0, m3
    pmaddubsw            m0, m2
    pshufb               m1, m3
    pmaddubsw            m1, m2
    psrlw                m2, m5, 6
    paddw                m5, m6
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packsswb             m2, m2
    packuswb             m0, m1
    pcmpgtb              m1, m4, m2
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    mova        [rsp+wq-16], m0
    sub                  wd, 16
    jz .h16_transpose
    add                  r5, dyq
    jg .h16_loop
.h16_end_loop:
    mova        [rsp+wq-16], m7
    sub                  wd, 16
    jg .h16_end_loop
.h16_transpose:
%if ARCH_X86_32
    mov             strideq, [dstq]    ; reload spilled stride/org_w
    mov              org_wd, [dstq+strideq]
%endif
    or                  r3d, 16
    cmp              org_wd, 4
%if ARCH_X86_64
    jne .end_transpose_main
%else
    jne .end_transpose_loop
%endif
.h16_transpose_w4:                     ; w == 4: two 4x8 writes
    mova                 m2, [rsp+16*3]
    mova                 m4, [rsp+16*2]
    mova                 m3, [rsp+16*1]
    mova                 m0, [rsp+16*0]
    lea                  r2, [strideq*3]
    add                 rsp, 16*4
    punpckhbw            m1, m2, m4
    punpcklbw            m2, m4
    punpckhbw            m4, m3, m0
    punpcklbw            m3, m0
    punpckhwd            m0, m1, m4
    punpcklwd            m1, m4
    call .write_4x8
    lea                dstq, [dstq+strideq*4]
    punpckhwd            m0, m2, m3
    punpcklwd            m1, m2, m3
    jmp .write_4x8_end
.h32:                                  ; height == 32
    lea                 r4d, [wq+31]
    and                 r4d, 31
    or                  r4d, 32        ; imin(w+31, 63)
    test             angled, 0x400     ; !enable_intra_edge_filter
    jnz .h32_main
    mova                 m0, [tlq-16*4+1]
    mova                 m1, [tlq-16*3+1]
    mova                 m2, [tlq-16*2+1]
    mova                 m3, [tlq-16*1+1]
    movd                 m4, [tlq-16*0+1]
    neg                  r4
    movd                 m5, [tlq+r4]
    pxor                 m7, m7
    lea                 tlq, [rsp+16*4]
    mova          [tlq-16*3], m0
    mova          [tlq-16*2], m1
    xor                 r5d, r5d       ; filter_strength = 3
    mova          [tlq-16*1], m2
    pshufb               m4, m7
    mova          [tlq+16*0], m3
    pshufb               m5, m7
    mova          [tlq+16*1], m4
    movq         [tlq+r4+8], m5
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sub                 tlq, 16*2
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    add                 tlq, 63
    cmp                  wd, 32
    jle .h32_main
    pshuflw              m0, [tlq-79], q0000
    movq                 m1, [base+z3_filter_k_tail]
    add                 r4d, 2
    pmaddubsw            m0, m1
%if ARCH_X86_64
    pmulhrsw             m0, m10
%else
    pmulhrsw             m0, m4
%endif
    packuswb             m0, m0
    movd           [tlq-67], m0
.h32_main:
    movd                 m5, dyd
    sub                 tlq, r4
    movd                 m4, r4d
    shl                 r4d, 6
    movd                 m7, [tlq]
    pxor                 m6, m6
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, m6
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]
    pshufb               m4, m6
    psubb                m4, [base+pb_15to0]
    mova                 m6, m5
.h32_loop:                             ; one column of 32 per iteration
    mov                  r4, r5
    pand                 m2, m8, m5
    sar                  r4, 6
    psubw                m1, m9, m2
    psllw                m2, 8
    movu                 m0, [tlq+r4-8*4]
    por                  m2, m1
    movu                 m1, [tlq+r4-8*3]
    pshufb               m0, m3
    pmaddubsw            m0, m2
    pshufb               m1, m3
    pmaddubsw            m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    sub                 rsp, 32
    packuswb             m0, m1
    mova         [rsp+16*0], m0
    movu                 m0, [tlq+r4-8*2]
    movu                 m1, [tlq+r4-8*1]
    pshufb               m0, m3
    pshufb               m1, m3
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    psrlw                m2, m5, 6
    paddw                m5, m6
    packsswb             m2, m2
    packuswb             m0, m1
    pcmpgtb              m1, m4, m2
    paddsb               m2, [base+pb_16]
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    pcmpgtb              m1, m4, m2
    mova         [rsp+16*1], m0
    pand                 m0, m1, [rsp+16*0]
    pandn                m1, m7
    por                  m0, m1
    mova         [rsp+16*0], m0
    dec                  wd
    jz .h32_transpose
    add                  r5, dyq
    jg .h32_loop
.h32_end_loop:
    sub                 rsp, 32
    mova         [rsp+16*1], m7
    mova         [rsp+16*0], m7
    dec                  wd
    jg .h32_end_loop
.h32_transpose:
    or                  r3d, 32
    jmp .end_transpose_main
.h64:                                  ; height == 64
    lea                 r4d, [wq+63]
    test             angled, 0x400     ; !enable_intra_edge_filter
    jnz .h64_main
    mova                 m0, [tlq-16*8+1]
    mova                 m1, [tlq-16*7+1]
    mova                 m2, [tlq-16*6+1]
    mova                 m3, [tlq-16*5+1]
    mova         [rsp+16*1], m0
    mova         [rsp+16*2], m1
    mova         [rsp+16*3], m2
    mova         [rsp+16*4], m3
    mova                 m0, [tlq-16*4+1]
    mova                 m1, [tlq-16*3+1]
    mova                 m2, [tlq-16*2+1]
    mova                 m3, [tlq-16*1+1]
    movd                 m4, [tlq-16*0+1]
    neg                  r4
    movd                 m5, [tlq+r4]
    pxor                 m7, m7
    lea                 tlq, [rsp+16*8]
    mova          [tlq-16*3], m0
    mova          [tlq-16*2], m1
    xor                 r5d, r5d       ; filter_strength = 3
    mova          [tlq-16*1], m2
    pshufb               m4, m7
    mova          [tlq+16*0], m3
    pshufb               m5, m7
    mova          [tlq+16*1], m4
    movq         [tlq+r4+8], m5
    neg                 r4d
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sub                 tlq, 16*2
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sub                 tlq, 16*2
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
    sub                 tlq, 16*2
    cmp                  wd, 64
    jl .h64_filter96                   ; skip one call if the last 32 bytes aren't used
    call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
.h64_filter96:
    add                 tlq, 127
.h64_main:
    movd                 m5, dyd
    sub                 tlq, r4
    movd                 m4, r4d
    shl                 r4d, 6
    movd                 m7, [tlq]
    pxor                 m6, m6
    pshufb               m5, [base+pw_256]
    neg                 dyq
    pshufb               m7, m6
    mova                 m3, [base+z3_shuf]
    lea                  r5, [dyq+r4+63]
    pshufb               m4, m6
    psubb                m4, [base+pb_15to0]
    mova                 m6, m5
.h64_loop:                             ; one column of 64 per iteration
    mov                  r4, r5
    pand                 m2, m8, m5
    sar                  r4, 6
    psubw                m1, m9, m2
    psllw                m2, 8
    movu                 m0, [tlq+r4-8*8]
    por                  m2, m1
    movu                 m1, [tlq+r4-8*7]
    pshufb               m0, m3
    pmaddubsw            m0, m2
    pshufb               m1, m3
    pmaddubsw            m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    sub                 rsp, 64
    packuswb             m0, m1
    mova         [rsp+16*0], m0
    movu                 m0, [tlq+r4-8*6]
    movu                 m1, [tlq+r4-8*5]
    pshufb               m0, m3
    pshufb               m1, m3
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packuswb             m0, m1
    mova         [rsp+16*1], m0
    movu                 m0, [tlq+r4-8*4]
    movu                 m1, [tlq+r4-8*3]
    pshufb               m0, m3
    pshufb               m1, m3
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    packuswb             m0, m1
    mova         [rsp+16*2], m0
    movu                 m0, [tlq+r4-8*2]
    movu                 m1, [tlq+r4-8*1]
    pshufb               m0, m3
    pshufb               m1, m3
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pmulhrsw             m0, m10
    pmulhrsw             m1, m10
    psrlw                m2, m5, 6
    paddw                m5, m6
    packsswb             m2, m2
    packuswb             m0, m1
    ; clamp each 16-pixel group against max_base_y independently
    pcmpgtb              m1, m4, m2
    paddsb               m2, [base+pb_16]
    pand                 m0, m1
    pandn                m1, m7
    por                  m0, m1
    pcmpgtb              m1, m4, m2
    paddsb               m2, [base+pb_16]
    mova         [rsp+16*3], m0
    pand                 m0, m1, [rsp+16*2]
    pandn                m1, m7
    por                  m0, m1
    pcmpgtb              m1, m4, m2
    paddsb               m2, [base+pb_16]
    mova         [rsp+16*2], m0
    pand                 m0, m1, [rsp+16*1]
    pandn                m1, m7
    por                  m0, m1
    pcmpgtb              m1, m4, m2
    mova         [rsp+16*1], m0
    pand                 m0, m1, [rsp+16*0]
    pandn                m1, m7
    por                  m0, m1
    mova         [rsp+16*0], m0
    dec                  wd
    jz .h64_transpose
    add                  r5, dyq
    jg .h64_loop
.h64_end_loop:
    sub                 rsp, 64
    mova         [rsp+16*3], m7
    mova         [rsp+16*2], m7
    mova         [rsp+16*1], m7
    mova         [rsp+16*0], m7
    dec                  wd
    jg .h64_end_loop
.h64_transpose:
    or                  r3d, 64
.end_transpose_main:                   ; generic 8x8-tile transpose, r3 = h
%if ARCH_X86_64
    lea                  r5, [r3*3]
    lea                  r7, [strideq*3]
%else
    mov             strideq, [dstq]    ; reload spilled stride/org_w
    mov              org_wd, [dstq+strideq]
%endif
.end_transpose_loop:
    lea                  r4, [rsp+r3-8]
    lea                  r6, [dstq+org_wq-8]
.end_transpose_loop_y:
    movq                 m0, [r4+r3*1]
    movq                 m4, [r4+r3*0]
%if ARCH_X86_64
    movq                 m1, [r4+r5  ]
    movq                 m5, [r4+r3*2]
    lea                  r2, [r4+r3*4]
%else
    lea                  r2, [r4+r3*2]
    movq                 m1, [r2+r3*1]
    movq                 m5, [r2+r3*0]
    lea                  r2, [r2+r3*2]
%endif
    movq                 m2, [r2+r3*1]
    movq                 m6, [r2+r3*0]
%if ARCH_X86_64
    movq                 m3, [r2+r5  ]
    movq                 m7, [r2+r3*2]
%else
    lea                  r2, [r2+r3*2]
    movq                 m3, [r2+r3*1]
    movq                 m7, [r2+r3*0]
%endif
    sub                  r4, 8
    punpcklbw            m0, m4
    punpcklbw            m1, m5
    punpcklbw            m2, m6
    punpcklbw            m3, m7
    punpckhwd            m4, m1, m0
    punpcklwd            m1, m0
    punpckhwd            m0, m3, m2
    punpcklwd            m3, m2
    punpckhdq            m2, m3, m1
    punpckldq            m3, m1
    punpckldq            m1, m0, m4
    punpckhdq            m0, m4
    movhps  [r6+strideq*0], m0
    movq    [r6+strideq*1], m0
%if ARCH_X86_64
    movhps  [r6+strideq*2], m1
    movq    [r6+r7       ], m1
    lea                  r6, [r6+strideq*4]
%else
    lea                  r6, [r6+strideq*2]
    movhps  [r6+strideq*0], m1
    movq    [r6+strideq*1], m1
    lea                  r6, [r6+strideq*2]
%endif
    movhps  [r6+strideq*0], m2
    movq    [r6+strideq*1], m2
%if ARCH_X86_64
    movhps  [r6+strideq*2], m3
    movq    [r6+r7       ], m3
    lea                  r6, [r6+strideq*4]
%else
    lea                  r6, [r6+strideq*2]
    movhps  [r6+strideq*0], m3
    movq    [r6+strideq*1], m3
    lea                  r6, [r6+strideq*2]
%endif
    cmp                  r4, rsp
    jae .end_transpose_loop_y
    lea                 rsp, [rsp+r3*8]
    sub              org_wd, 8
    jg .end_transpose_loop
    RET

;-------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                                   const uint8_t *idx, int w, int h);
; Palette prediction: idx holds packed 4-bit palette indices (two per byte);
; m4 caches the 8-entry palette and pshufb does the lookup.
;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    movq                 m4, [palq]
    LEA                  r2, pal_pred_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4]
    add                  wq, r2
    lea                  r2, [strideq*3]
    jmp                  wq
.w4:                                   ; 4 rows per iteration
    movq                 m1, [idxq]
    add                idxq, 8
    psrlw                m0, m1, 4
    punpcklbw            m1, m0
    pshufb               m0, m4, m1
    movd   [dstq+strideq*0], m0
    pshuflw              m1, m0, q1032
    movd   [dstq+strideq*1], m1
    punpckhqdq           m0, m0
    movd   [dstq+strideq*2], m0
    psrlq                m0, 32
    movd   [dstq+r2       ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:                                   ; 4 rows per iteration
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0    ; low nibbles
    psrlw                m0, 4
    pshufb               m2, m4, m0    ; high nibbles
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+r2       ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:                                  ; 2 rows per iteration
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32:                                  ; 1 row per iteration
    movu                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
.w64:                                  ; pal_pred_8bpc: 1 row of 64 per iteration
    movu                 m0, [idxq+16*0]
    movu                 m2, [idxq+16*1]
    add                idxq, 32
    pshufb               m1, m4, m0
    psrlw                m0, 4
    pshufb               m3, m4, m0
    punpcklbw            m0, m1, m3
    punpckhbw            m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m2
    psrlw                m2, 4
    pshufb               m3, m4, m2
    punpcklbw            m0, m1, m3
    punpckhbw            m1, m3
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    sub                  hd, 1
    jg .w64
    RET

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                           const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    ; dst = dc + round((|ac| * alpha_scaled) with the sign of alpha*ac)
    ; m0 = dc splat, m1 = alpha splat, m2 = |alpha| << 9
    psignw              m3, m%1, m1
    pabsw              m%1, m%1
    pmulhrsw           m%1, m2
    psignw             m%1, m3
    paddw              m%1, m0
%endmacro

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    ; dual dispatch: r6 -> .hN (sum left edge), which jumps to wq -> .wN
    ; (sum top edge, finish dc), which falls into .sN (apply alpha to ac)
    movifnidn            wd, wm
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]
    movd                 m4, t0d
    tzcnt               t0d, t0d
    movd                 m5, t0d
    LEA                  t0, ipred_cfl_ssse3_table
    tzcnt                wd, wd
    movsxd               r6, [t0+r6*4]
    movsxd               wq, [t0+wq*4+16]
    pcmpeqd              m3, m3        ; all-ones; pmaddubsw operand for summing
    psrlw                m4, 1        ; (width + height) >> 1, rounding bias
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h4:                                   ; sum 4 left-edge pixels
    movd                 m0, [tlq-4]
    pmaddubsw            m0, m3
    jmp                  wq
.w4:                                   ; add 4 top-edge pixels, compute dc
    movd                 m1, [tlq+1]
    pmaddubsw            m1, m3
    psubw                m0, m4
    paddw                m0, m1
    pmaddwd              m0, m3
    cmp                  hd, 4
    jg .w4_mul
    psrlw                m0, 3        ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul:                               ; non-power-of-2 divisor: fixed-point mul
    punpckhqdq           m1, m0, m0
    paddw                m0, m1
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    psrlw                m0, 2
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 8
    cmovz               r6d, r2d
    movd                 m5, r6d
    pmulhuw              m0, m5
.w4_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s4:                                   ; 4 rows of dc+alpha*ac per iteration
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9
.s4_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movd   [dstq+strideq*0], m4
    pshuflw              m4, m4, q1032
    movd   [dstq+strideq*1], m4
    punpckhqdq           m4, m4
    movd   [dstq+strideq*2], m4
    psrlq                m4, 32
    movd   [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:                                   ; sum 8 left-edge pixels
    movq                 m0, [tlq-8]
    pmaddubsw            m0, m3
    jmp                  wq
.w8:                                   ; add 8 top-edge pixels, compute dc
    movq                 m1, [tlq+1]
    pmaddubsw            m1, m3
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    paddw                m0, m1
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s8:                                   ; 4 rows per iteration
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9
.s8_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movq      [dstq        ], m4
    movhps    [dstq+strideq], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    movq   [dstq+strideq*2], m4
    movhps [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:                                  ; sum 16 left-edge pixels
    mova                 m0, [tlq-16]
    pmaddubsw            m0, m3
    jmp                  wq
.w16:                                  ; add 16 top-edge pixels, compute dc
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s16:                                  ; 2 rows per iteration
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s16_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova             [dstq], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova     [dstq+strideq], m4
    lea                dstq, [dstq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:                                  ; sum 32 left-edge pixels
    mova                 m0, [tlq-32]
    pmaddubsw            m0, m3
    mova                 m2, [tlq-16]
    pmaddubsw            m2, m3
    paddw                m0, m2
    jmp                  wq
.w32:                                  ; add 32 top-edge pixels, compute dc
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    movu                 m2, [tlq+17]
    pmaddubsw            m2, m3
    paddw                m1, m2
    paddw                m0, m1
    psubw                m4, m0
    punpckhqdq           m0, m0
    psubw                m0, m4
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
    pmaddwd              m0, m3
    psrlw                m0, m5
    cmp                  hd, 32
    je .w32_end
    lea                 r2d, [hq*2]    ; NOTE(review): dead — r2d is overwritten
                                       ; by the mov below before any use; confirm
                                       ; against upstream before removing
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hd, 64|16
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s32:                                  ; 1 row of 32 per iteration
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s32_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+16]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova             [dstq], m4
    mova                 m4, [acq+32]
    mova                 m5, [acq+48]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    mova          [dstq+16], m4
    add                dstq, strideq
    add                 acq, 64
    dec                  hd
    jg .s32_loop
    RET

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                                const int width, const int height,
;                                const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    ; dc from the left edge only; falls through .h32 -> .h16 -> .h8 -> .h4
    ; to sum progressively, then tail-jumps into the shared cfl splat code
    mov                  hd, hm        ; zero upper half
    tzcnt               r6d, hd
    sub                 tlq, hq
    tzcnt                wd, wm
    movu                 m0, [tlq]
    mov                 t0d, 0x8000
    movd                 m3, t0d
    movd                 m2, r6d
    psrld                m3, m2        ; rounding multiplier = 0x8000 >> log2(h)
    LEA                  t0, ipred_cfl_left_ssse3_table
    movsxd               r6, [t0+r6*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2
    add                  r6, t0
    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd               wq, [t0+wq*4]
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h32:
    movu                 m1, [tlq+16]  ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h16:
    pshufd               m1, m0, q3232 ; psrlq m1, m0, 16
    paddw                m0, m1
.h8:
    pshuflw              m1, m0, q1032 ; psrlq m1, m0, 32
    paddw                m0, m1
.h4:
    pmaddwd              m0, m2
    pmulhrsw             m0, m3
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
    jmp                  wq

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                               const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    ; dc from the top edge only; reuses ipred_cfl_left's .hN summing chain
    ; (indexed by width instead of height)
    LEA                  t0, ipred_cfl_left_ssse3_table
    tzcnt                wd, wm
    inc                 tlq
    movu                 m0, [tlq]
    movifnidn            hd, hm
    mov                 r6d, 0x8000
    movd                 m3, r6d
    movd                 m2, wd
    psrld                m3, m2        ; rounding multiplier = 0x8000 >> log2(w)
    movsxd               r6, [t0+wq*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2
    add                  r6, t0
    add                  t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd               wq, [t0+wq*4]
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6

;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                               const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    ; fixed dc of 128 (no edge available); jumps straight to the splat code
    tzcnt                wd, wm
    movifnidn            hd, hm
    LEA                  r6, ipred_cfl_splat_ssse3_table
    movsxd               wq, [r6+wq*4]
    movddup              m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]
    add                  wq, r6
    movifnidn           acq, acmp
    jmp                  wq

%macro RELOAD_ACQ_32 1
    ; restore the ac pointer saved at function entry (reg on x86-64,
    ; stack slot on x86-32 via the ac_bakq define)
    mov                 acq, ac_bakq   ; restore acq
%endmacro

;---------------------------------------------------------------------------------------
; ipred_cfl_ac_420: 2x2-subsampled luma -> ac buffer, with edge padding,
; then subtract the average so the buffer is zero-mean.
; m2 = pb_2 (pmaddubsw weights), m4 = running sum, m5 = w*h.
;---------------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
DECLARE_REG_TMP 7
    movddup              m2, [pb_2]
%else
cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
DECLARE_REG_TMP 4
%define ac_bakq acmp
    mov                 t0d, 0x02020202
    movd                 m2, t0d
    pshufd               m2, m2, q0000
%endif
    movifnidn            wd, wm
    mov                 t0d, hm
    mov                  hd, t0d
    imul                t0d, wd
    movd                 m5, t0d       ; w*h, used by .calc_avg
    movifnidn         hpadd, hpadm
%if ARCH_X86_64
    mov             ac_bakq, acq
%endif
    shl               hpadd, 2
    sub                  hd, hpadd
    pxor                 m4, m4        ; sum accumulator
    cmp                  wd, 8
    jg .w16
    je .w8
    ; fall-through
%if ARCH_X86_64
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
    lea            stride3q, [strideq*3]
.w4_loop:                              ; 2 output rows (4 luma rows) per iter
    movq                 m0, [yq]
    movq                 m1, [yq+strideq]
    movhps               m0, [yq+strideq*2]
    movhps               m1, [yq+stride3q]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*4]
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg_4_8
    punpckhqdq           m0, m0        ; replicate last row for bottom padding
.w4_hpad_loop:                         ; bottom padding: repeat last row
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 16
    sub               hpadd, 2
    jg .w4_hpad_loop
    jmp .calc_avg_4_8
.w8:
    lea            stride3q, [strideq*3]
    test              wpadd, wpadd
    jnz .w8_wpad
.w8_loop:                              ; 2 output rows per iteration
    mova                 m0, [yq]
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    mova                 m0, [yq+strideq*2]
    mova                 m1, [yq+stride3q]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova           [acq+16], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 2
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg_4_8
    jmp .w8_hpad
.w8_wpad: ; wpadd=1                    ; right padding: replicate last column
    movddup              m0, [yq]
    movddup              m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    pshufhw              m0, m0, q3333
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 16
    sub                  hd, 1
    jg .w8_wpad
    test              hpadd, hpadd
    jz .calc_avg_4_8
.w8_hpad:                              ; bottom padding: repeat last row
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 16
    sub               hpadd, 1
    jg .w8_hpad
    jmp .calc_avg_4_8
.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:                             ; 1 output row per iteration
    mova                 m0, [yq]
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    mova                 m6, [yq+16]
    mova                 m1, [yq+strideq+16]
    pmaddubsw            m6, m2
    pmaddubsw            m1, m2
    paddw                m6, m1
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg16
    jmp .w16_hpad_loop
.w16_wpad:                             ; dispatch on wpad = 1..3
    cmp               wpadd, 2
    jl .w16_pad1
    je .w16_pad2
.w16_pad3:                             ; only 4 valid output columns
    movddup              m0, [yq]
    movddup              m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    pshufhw              m0, m0, q3333
    mova              [acq], m0
    paddw                m4, m0
    mova                 m6, m0
    punpckhqdq           m6, m0, m0
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_pad3
    jmp .w16_wpad_done
.w16_pad2:                             ; 8 valid output columns
    mova                 m0, [yq]
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    pshufhw              m6, m0, q3333
    punpckhqdq           m6, m6
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_pad2
    jmp .w16_wpad_done
.w16_pad1:                             ; 12 valid output columns
    mova                 m0, [yq]
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    movddup              m6, [yq+16]
    movddup              m1, [yq+strideq+16]
    pmaddubsw            m6, m2
    pmaddubsw            m1, m2
    paddw                m6, m1
    pshufhw              m6, m6, q3333
    mova           [acq+16], m6
    paddw                m4, m6
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_pad1
.w16_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg16
.w16_hpad_loop:                        ; bottom padding: repeat last row pair
    mova              [acq], m0
    paddw                m4, m0
    mova           [acq+16], m6
    paddw                m4, m6
    add                 acq, 32
    dec               hpadd
    jg .w16_hpad_loop
    jmp .calc_avg16

%if ARCH_X86_64
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif
.calc_avg_4_8:                         ; horizontal-sum m4 via pmaddwd
    psrlw                m2, 9
    pmaddwd              m4, m2
    jmp .calc_avg
.calc_avg16:                           ; widen word sums to dwords first
    psrld                m0, m4, 16
    pslld                m4, 16
    psrld                m4, 16
    paddd                m4, m0
.calc_avg:                             ; avg = (sum + sz/2) >> log2(sz); subtract
    movd                szd, m5
    psrad                m5, 1
    tzcnt               r1d, szd
    paddd                m4, m5
    movd                 m1, r1d
    pshufd               m0, m4, q2301
    paddd                m0, m4
    pshufd               m4, m0, q1032
    paddd                m0, m4
    psrad                m0, m1        ; sum >>= log2sz;
    packssdw             m0, m0
    RELOAD_ACQ_32       acq
.sub_loop:                             ; make the ac buffer zero-mean
    mova                 m1, [acq]
    psubw                m1, m0        ; ac[x] -= sum;
    mova              [acq], m1
    add                 acq, 16
    sub                 szd, 8
    jg .sub_loop
    RET

; --- ipred_cfl_ac_422_8bpc begins here; definition continues past the end of
; this chunk and is left untouched ---
%if ARCH_X86_64
cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
    movddup              m2, [pb_4]
%else
cglobal
ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h 4155 mov t0d, 0x04040404 4156 movd m2, t0d 4157 pshufd m2, m2, q0000 4158%endif 4159 movifnidn wd, wm 4160 mov t0d, hm 4161 mov hd, t0d 4162 imul t0d, wd 4163 movd m6, t0d 4164 movifnidn hpadd, hpadm 4165%if ARCH_X86_64 4166 mov ac_bakq, acq 4167%endif 4168 shl hpadd, 2 4169 sub hd, hpadd 4170 pxor m4, m4 4171 pxor m5, m5 4172 cmp wd, 8 4173 jg .w16 4174 je .w8 4175 ; fall-through 4176 4177%if ARCH_X86_64 4178 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak 4179%else 4180 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h 4181%endif 4182.w4: 4183 lea stride3q, [strideq*3] 4184.w4_loop: 4185 movq m1, [yq] 4186 movhps m1, [yq+strideq] 4187 movq m0, [yq+strideq*2] 4188 movhps m0, [yq+stride3q] 4189 pmaddubsw m0, m2 4190 pmaddubsw m1, m2 4191 mova [acq], m1 4192 mova [acq+16], m0 4193 paddw m4, m0 4194 paddw m5, m1 4195 lea yq, [yq+strideq*4] 4196 add acq, 32 4197 sub hd, 4 4198 jg .w4_loop 4199 test hpadd, hpadd 4200 jz .calc_avg_4 4201 punpckhqdq m0, m0 4202.w4_hpad_loop: 4203 mova [acq], m0 4204 paddw m4, m0 4205 add acq, 16 4206 sub hpadd, 2 4207 jg .w4_hpad_loop 4208 jmp .calc_avg_4 4209.w8: 4210 lea stride3q, [strideq*3] 4211 test wpadd, wpadd 4212 jnz .w8_wpad 4213.w8_loop: 4214 mova m1, [yq] 4215 mova m0, [yq+strideq] 4216 pmaddubsw m0, m2 4217 pmaddubsw m1, m2 4218 mova [acq], m1 4219 mova [acq+16], m0 4220 paddw m4, m0 4221 paddw m5, m1 4222 mova m1, [yq+strideq*2] 4223 mova m0, [yq+stride3q] 4224 pmaddubsw m0, m2 4225 pmaddubsw m1, m2 4226 mova [acq+32], m1 4227 mova [acq+48], m0 4228 paddw m4, m0 4229 paddw m5, m1 4230 lea yq, [yq+strideq*4] 4231 add acq, 64 4232 sub hd, 4 4233 jg .w8_loop 4234 test hpadd, hpadd 4235 jz .calc_avg_8_16 4236 jmp .w8_hpad 4237.w8_wpad: 4238 movddup m1, [yq] 4239 pmaddubsw m1, m2 4240 pshufhw m1, m1, q3333 4241 mova [acq], m1 4242 paddw m5, m1 4243 movddup m0, [yq+strideq] 4244 pmaddubsw m0, m2 4245 pshufhw m0, m0, q3333 4246 mova [acq+16], m0 4247 paddw 
m4, m0 4248 lea yq, [yq+strideq*2] 4249 add acq, 32 4250 sub hd, 2 4251 jg .w8_wpad 4252 test hpadd, hpadd 4253 jz .calc_avg_8_16 4254.w8_hpad: 4255 mova [acq], m0 4256 paddw m4, m0 4257 mova [acq+16], m0 4258 paddw m4, m0 4259 add acq, 32 4260 sub hpadd, 2 4261 jg .w8_hpad 4262 jmp .calc_avg_8_16 4263.w16: 4264 test wpadd, wpadd 4265 jnz .w16_wpad 4266.w16_loop: 4267 mova m1, [yq] 4268 mova m0, [yq+16] 4269 pmaddubsw m0, m2 4270 pmaddubsw m1, m2 4271 mova [acq], m1 4272 mova [acq+16], m0 4273 paddw m5, m0 4274 paddw m5, m1 4275 mova m1, [yq+strideq] 4276 mova m0, [yq+strideq+16] 4277 pmaddubsw m0, m2 4278 pmaddubsw m1, m2 4279 mova [acq+32], m1 4280 mova [acq+48], m0 4281 paddw m4, m0 4282 paddw m4, m1 4283 lea yq, [yq+strideq*2] 4284 add acq, 64 4285 sub hd, 2 4286 jg .w16_loop 4287 test hpadd, hpadd 4288 jz .calc_avg_8_16 4289 jmp .w16_hpad_loop 4290.w16_wpad: 4291 cmp wpadd, 2 4292 jl .w16_pad1 4293 je .w16_pad2 4294.w16_pad3: 4295 movddup m1, [yq] 4296 pmaddubsw m1, m2 4297 pshufhw m1, m1, q3333 4298 mova [acq], m1 4299 paddw m5, m1 4300 punpckhqdq m1, m1 4301 mova [acq+16], m1 4302 paddw m5, m1 4303 movddup m1, [yq+strideq] 4304 pmaddubsw m1, m2 4305 pshufhw m1, m1, q3333 4306 mova [acq+32], m1 4307 paddw m4, m1 4308 punpckhqdq m0, m1, m1 4309 mova [acq+48], m0 4310 paddw m4, m0 4311 lea yq, [yq+strideq*2] 4312 add acq, 64 4313 sub hd, 2 4314 jg .w16_pad3 4315 jmp .w16_wpad_done 4316.w16_pad2: 4317 mova m1, [yq] 4318 pmaddubsw m1, m2 4319 mova [acq], m1 4320 paddw m5, m1 4321 pshufhw m1, m1, q3333 4322 punpckhqdq m1, m1 4323 mova [acq+16], m1 4324 paddw m5, m1 4325 mova m1, [yq+strideq] 4326 pmaddubsw m1, m2 4327 mova [acq+32], m1 4328 paddw m4, m1 4329 mova m0, m1 4330 pshufhw m0, m0, q3333 4331 punpckhqdq m0, m0 4332 mova [acq+48], m0 4333 paddw m4, m0 4334 lea yq, [yq+strideq*2] 4335 add acq, 64 4336 sub hd, 2 4337 jg .w16_pad2 4338 jmp .w16_wpad_done 4339.w16_pad1: 4340 mova m1, [yq] 4341 pmaddubsw m1, m2 4342 mova [acq], m1 4343 paddw m5, m1 4344 movddup 
m0, [yq+16] 4345 pmaddubsw m0, m2 4346 pshufhw m0, m0, q3333 4347 mova [acq+16], m0 4348 paddw m5, m0 4349 mova m1, [yq+strideq] 4350 pmaddubsw m1, m2 4351 mova [acq+32], m1 4352 paddw m4, m1 4353 movddup m0, [yq+strideq+16] 4354 pmaddubsw m0, m2 4355 pshufhw m0, m0, q3333 4356 mova [acq+48], m0 4357 paddw m4, m0 4358 lea yq, [yq+strideq*2] 4359 add acq, 64 4360 sub hd, 2 4361 jg .w16_pad1 4362.w16_wpad_done: 4363 test hpadd, hpadd 4364 jz .calc_avg_8_16 4365.w16_hpad_loop: 4366 mova [acq], m1 4367 mova [acq+16], m0 4368 paddw m4, m1 4369 paddw m5, m0 4370 mova [acq+32], m1 4371 mova [acq+48], m0 4372 paddw m4, m1 4373 paddw m5, m0 4374 add acq, 64 4375 sub hpadd, 2 4376 jg .w16_hpad_loop 4377 jmp .calc_avg_8_16 4378 4379%if ARCH_X86_64 4380 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak 4381%else 4382 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h 4383%endif 4384.calc_avg_4: 4385 psrlw m2, 10 4386 pmaddwd m5, m2 4387 pmaddwd m0, m4, m2 4388 jmp .calc_avg 4389.calc_avg_8_16: 4390 mova m0, m5 4391 psrld m5, 16 4392 pslld m0, 16 4393 psrld m0, 16 4394 paddd m5, m0 4395 mova m0, m4 4396 psrld m0, 16 4397 pslld m4, 16 4398 psrld m4, 16 4399 paddd m0, m4 4400.calc_avg: 4401 paddd m5, m0 4402 movd szd, m6 4403 psrad m6, 1 4404 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); 4405 paddd m5, m6 4406 movd m1, r1d 4407 pshufd m0, m5, q2301 4408 paddd m0, m5 4409 pshufd m5, m0, q1032 4410 paddd m0, m5 4411 psrad m0, m1 ; sum >>= log2sz; 4412 packssdw m0, m0 4413 RELOAD_ACQ_32 acq ; ac = ac_orig 4414.sub_loop: 4415 mova m1, [acq] 4416 psubw m1, m0 4417 mova [acq], m1 4418 add acq, 16 4419 sub szd, 8 4420 jg .sub_loop 4421 RET 4422 4423%if ARCH_X86_64 4424cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak 4425 movddup m2, [pb_4] 4426%else 4427cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h 4428%define ac_bakq [rsp+16*4] 4429 mov t0d, 0x04040404 4430 movd m2, t0d 4431 pshufd m2, m2, q0000 4432%endif 
4433 movifnidn wd, wm 4434 movifnidn hpadd, hpadm 4435 movd m0, hpadd 4436 mov t0d, hm 4437 mov hd, t0d 4438 imul t0d, wd 4439 movd m6, t0d 4440 movd hpadd, m0 4441 mov ac_bakq, acq 4442 shl hpadd, 2 4443 sub hd, hpadd 4444 pxor m5, m5 4445 pxor m4, m4 4446 cmp wd, 16 4447 jg .w32 4448 cmp wd, 8 4449 jg .w16 4450 je .w8 4451 ; fall-through 4452 4453%if ARCH_X86_64 4454 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak 4455%else 4456 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h 4457%endif 4458.w4: 4459 lea stride3q, [strideq*3] 4460.w4_loop: 4461 movd m1, [yq] 4462 movd m3, [yq+strideq] 4463 punpckldq m1, m3 4464 punpcklbw m1, m1 4465 movd m0, [yq+strideq*2] 4466 movd m3, [yq+stride3q] 4467 punpckldq m0, m3 4468 punpcklbw m0, m0 4469 pmaddubsw m1, m2 4470 pmaddubsw m0, m2 4471 mova [acq], m1 4472 mova [acq+16], m0 4473 paddw m5, m0 4474 paddw m5, m1 4475 lea yq, [yq+strideq*4] 4476 add acq, 32 4477 sub hd, 4 4478 jg .w4_loop 4479 test hpadd, hpadd 4480 jz .calc_avg_4 4481 punpckhqdq m0, m0 4482.w4_hpad_loop: 4483 mova [acq], m0 4484 paddw m5, m0 4485 add acq, 16 4486 sub hpadd, 2 4487 jg .w4_hpad_loop 4488.calc_avg_4: 4489 psrlw m2, 10 4490 pmaddwd m5, m2 4491 jmp .calc_avg 4492 4493.w8: 4494 lea stride3q, [strideq*3] 4495 test wpadd, wpadd 4496 jnz .w8_wpad 4497.w8_loop: 4498 movq m1, [yq] 4499 punpcklbw m1, m1 4500 pmaddubsw m1, m2 4501 mova [acq], m1 4502 paddw m5, m1 4503 movq m0, [yq+strideq] 4504 punpcklbw m0, m0 4505 pmaddubsw m0, m2 4506 mova [acq+16], m0 4507 paddw m5, m0 4508 movq m1, [yq+strideq*2] 4509 punpcklbw m1, m1 4510 pmaddubsw m1, m2 4511 mova [acq+32], m1 4512 paddw m4, m1 4513 movq m0, [yq+stride3q] 4514 punpcklbw m0, m0 4515 pmaddubsw m0, m2 4516 mova [acq+48], m0 4517 paddw m4, m0 4518 lea yq, [yq+strideq*4] 4519 add acq, 64 4520 sub hd, 4 4521 jg .w8_loop 4522 test hpadd, hpadd 4523 jz .calc_avg_8_16 4524 jmp .w8_hpad 4525.w8_wpad: 4526 movd m1, [yq] 4527 punpcklbw m1, m1 4528 punpcklqdq m1, m1 4529 pmaddubsw m1, m2 4530 pshufhw 
m1, m1, q3333 4531 mova [acq], m1 4532 paddw m5, m1 4533 movd m0, [yq+strideq] 4534 punpcklbw m0, m0 4535 punpcklqdq m0, m0 4536 pmaddubsw m0, m2 4537 pshufhw m0, m0, q3333 4538 mova [acq+16], m0 4539 paddw m4, m0 4540 lea yq, [yq+strideq*2] 4541 add acq, 32 4542 sub hd, 2 4543 jg .w8_wpad 4544 test hpadd, hpadd 4545 jz .calc_avg_8_16 4546.w8_hpad: 4547 mova [acq], m0 4548 paddw m5, m0 4549 mova [acq+16], m0 4550 paddw m4, m0 4551 add acq, 32 4552 sub hpadd, 2 4553 jg .w8_hpad 4554 jmp .calc_avg_8_16 4555 4556.w16: 4557 test wpadd, wpadd 4558 jnz .w16_wpad 4559.w16_loop: 4560 mova m0, [yq] 4561 mova m1, m0 4562 punpcklbw m1, m1 4563 pmaddubsw m1, m2 4564 mova [acq], m1 4565 paddw m5, m1 4566 punpckhbw m0, m0 4567 pmaddubsw m0, m2 4568 mova [acq+16], m0 4569 paddw m5, m0 4570 mova m0, [yq+strideq] 4571 mova m1, m0 4572 punpcklbw m1, m1 4573 pmaddubsw m1, m2 4574 mova [acq+32], m1 4575 paddw m4, m1 4576 punpckhbw m0, m0 4577 pmaddubsw m0, m2 4578 mova [acq+48], m0 4579 paddw m4, m0 4580 lea yq, [yq+strideq*2] 4581 add acq, 64 4582 sub hd, 2 4583 jg .w16_loop 4584 test hpadd, hpadd 4585 jz .calc_avg_8_16 4586 jmp .w16_hpad_loop 4587.w16_wpad: 4588 cmp wpadd, 2 4589 jl .w16_pad1 4590 je .w16_pad2 4591.w16_pad3: 4592 movd m1, [yq] 4593 punpcklbw m1, m1 4594 punpcklqdq m1, m1 4595 pshufhw m1, m1, q3333 4596 pmaddubsw m1, m2 4597 mova [acq], m1 4598 paddw m5, m1 4599 punpckhqdq m1, m1 4600 mova [acq+16], m1 4601 paddw m5, m1 4602 movd m1, [yq+strideq] 4603 punpcklbw m1, m1 4604 punpcklqdq m1, m1 4605 pshufhw m1, m1, q3333 4606 pmaddubsw m1, m2 4607 mova [acq+32], m1 4608 paddw m4, m1 4609 punpckhqdq m0, m1, m1 4610 mova [acq+48], m0 4611 paddw m4, m0 4612 lea yq, [yq+strideq*2] 4613 add acq, 64 4614 sub hd, 2 4615 jg .w16_pad3 4616 jmp .w16_wpad_done 4617.w16_pad2: 4618 movq m1, [yq] 4619 punpcklbw m1, m1 4620 pmaddubsw m1, m2 4621 mova [acq], m1 4622 paddw m5, m1 4623 pshufhw m1, m1, q3333 4624 punpckhqdq m1, m1 4625 mova [acq+16], m1 4626 paddw m5, m1 4627 movq m1, 
[yq+strideq] 4628 punpcklbw m1, m1 4629 pmaddubsw m1, m2 4630 mova [acq+32], m1 4631 paddw m4, m1 4632 mova m0, m1 4633 pshufhw m0, m0, q3333 4634 punpckhqdq m0, m0 4635 mova [acq+48], m0 4636 paddw m4, m0 4637 lea yq, [yq+strideq*2] 4638 add acq, 64 4639 sub hd, 2 4640 jg .w16_pad2 4641 jmp .w16_wpad_done 4642.w16_pad1: 4643 mova m0, [yq] 4644 mova m1, m0 4645 punpcklbw m1, m1 4646 pmaddubsw m1, m2 4647 mova [acq], m1 4648 paddw m5, m1 4649 punpckhbw m0, m0 4650 punpcklqdq m0, m0 4651 pshufhw m0, m0, q3333 4652 pmaddubsw m0, m2 4653 mova [acq+16], m0 4654 paddw m5, m0 4655 mova m0, [yq+strideq] 4656 mova m1, m0 4657 punpcklbw m1, m1 4658 pmaddubsw m1, m2 4659 mova [acq+32], m1 4660 paddw m4, m1 4661 punpckhbw m0, m0 4662 punpcklqdq m0, m0 4663 pshufhw m0, m0, q3333 4664 pmaddubsw m0, m2 4665 mova [acq+48], m0 4666 paddw m4, m0 4667 lea yq, [yq+strideq*2] 4668 add acq, 64 4669 sub hd, 2 4670 jg .w16_pad1 4671.w16_wpad_done: 4672 test hpadd, hpadd 4673 jz .calc_avg_8_16 4674.w16_hpad_loop: 4675 mova [acq], m1 4676 mova [acq+16], m0 4677 paddw m4, m1 4678 paddw m5, m0 4679 mova [acq+32], m1 4680 mova [acq+48], m0 4681 paddw m4, m1 4682 paddw m5, m0 4683 add acq, 64 4684 sub hpadd, 2 4685 jg .w16_hpad_loop 4686.calc_avg_8_16: 4687 mova m0, m5 4688 psrld m5, 16 4689 pslld m0, 16 4690 psrld m0, 16 4691 paddd m5, m0 4692 mova m0, m4 4693 psrld m0, 16 4694 pslld m4, 16 4695 psrld m4, 16 4696 paddd m0, m4 4697 paddd m5, m0 4698 jmp .calc_avg 4699 4700.w32: 4701 pxor m0, m0 4702 mova [rsp ], m0 4703 mova [rsp+16], m0 4704 mova [rsp+32], m0 4705 mova [rsp+48], m0 4706 test wpadd, wpadd 4707 jnz .w32_wpad 4708.w32_loop: 4709 mova m0, [yq] 4710 mova m1, m0 4711 punpcklbw m1, m1 4712 pmaddubsw m1, m2 4713 mova [acq], m1 4714 paddw m5, m1, [rsp] 4715 mova [rsp ], m5 4716 punpckhbw m0, m0 4717 pmaddubsw m0, m2 4718 mova [acq+16], m0 4719 paddw m5, m0, [rsp+16] 4720 mova [rsp+16], m5 4721 mova m4, [yq+16] 4722 mova m3, m4 4723 punpcklbw m3, m3 4724 pmaddubsw m3, m2 4725 mova 
[acq+32], m3 4726 paddw m5, m3, [rsp+32] 4727 mova [rsp+32], m5 4728 punpckhbw m4, m4 4729 pmaddubsw m4, m2 4730 mova [acq+48], m4 4731 paddw m5, m4, [rsp+48] 4732 mova [rsp+48], m5 4733 lea yq, [yq+strideq] 4734 add acq, 64 4735 sub hd, 1 4736 jg .w32_loop 4737 test hpadd, hpadd 4738 jz .calc_avg_32 4739 jmp .w32_hpad_loop 4740.w32_wpad: 4741 cmp wpadd, 2 4742 jl .w32_pad1 4743 je .w32_pad2 4744 cmp wpadd, 4 4745 jl .w32_pad3 4746 je .w32_pad4 4747 cmp wpadd, 6 4748 jl .w32_pad5 4749 je .w32_pad6 4750.w32_pad7: 4751 movd m1, [yq] 4752 punpcklbw m1, m1 4753 punpcklqdq m1, m1 4754 pshufhw m1, m1, q3333 4755 pmaddubsw m1, m2 4756 mova [acq], m1 4757 paddw m5, m1, [rsp] 4758 mova [rsp ], m5 4759 mova m0, m1 4760 punpckhqdq m0, m0 4761 mova [acq+16], m0 4762 paddw m5, m0, [rsp+16] 4763 mova [rsp+16], m5 4764 mova m3, m0 4765 mova [acq+32], m3 4766 paddw m5, m3, [rsp+32] 4767 mova [rsp+32], m5 4768 mova m4, m3 4769 mova [acq+48], m4 4770 paddw m5, m4, [rsp+48] 4771 mova [rsp+48], m5 4772 lea yq, [yq+strideq] 4773 add acq, 64 4774 sub hd, 1 4775 jg .w32_pad7 4776 jmp .w32_wpad_done 4777.w32_pad6: 4778 mova m0, [yq] 4779 mova m1, m0 4780 punpcklbw m1, m1 4781 pmaddubsw m1, m2 4782 mova [acq], m1 4783 paddw m5, m1, [rsp] 4784 mova [rsp ], m5 4785 pshufhw m0, m1, q3333 4786 punpckhqdq m0, m0 4787 mova [acq+16], m0 4788 paddw m5, m0, [rsp+16] 4789 mova [rsp+16], m5 4790 mova m3, m0 4791 mova [acq+32], m3 4792 paddw m5, m3, [rsp+32] 4793 mova [rsp+32], m5 4794 mova m4, m3 4795 mova [acq+48], m4 4796 paddw m5, m4, [rsp+48] 4797 mova [rsp+48], m5 4798 lea yq, [yq+strideq] 4799 add acq, 64 4800 sub hd, 1 4801 jg .w32_pad6 4802 jmp .w32_wpad_done 4803.w32_pad5: 4804 mova m0, [yq] 4805 mova m1, m0 4806 punpcklbw m1, m1 4807 pmaddubsw m1, m2 4808 mova [acq], m1 4809 mova m5, [rsp] 4810 paddw m5, m1 4811 mova [rsp ], m5 4812 punpckhbw m0, m0 4813 punpcklqdq m0, m0 4814 pshufhw m0, m0, q3333 4815 pmaddubsw m0, m2 4816 mova [acq+16], m0 4817 paddw m5, m0, [rsp+16] 4818 mova [rsp+16], 
m5 4819 mova m3, m0 4820 punpckhqdq m3, m3 4821 mova [acq+32], m3 4822 paddw m5, m3, [rsp+32] 4823 mova [rsp+32], m5 4824 mova m4, m3 4825 mova [acq+48], m4 4826 paddw m5, m4, [rsp+48] 4827 mova [rsp+48], m5 4828 lea yq, [yq+strideq] 4829 add acq, 64 4830 sub hd, 1 4831 jg .w32_pad5 4832 jmp .w32_wpad_done 4833.w32_pad4: 4834 mova m0, [yq] 4835 mova m1, m0 4836 punpcklbw m1, m1 4837 pmaddubsw m1, m2 4838 mova [acq], m1 4839 paddw m5, m1, [rsp] 4840 mova [rsp ], m5 4841 punpckhbw m0, m0 4842 pmaddubsw m0, m2 4843 mova [acq+16], m0 4844 paddw m5, m0, [rsp+16] 4845 mova [rsp+16], m5 4846 mova m3, m0 4847 pshufhw m3, m3, q3333 4848 punpckhqdq m3, m3 4849 mova [acq+32], m3 4850 paddw m5, m3, [rsp+32] 4851 mova [rsp+32], m5 4852 mova m4, m3 4853 mova [acq+48], m4 4854 paddw m5, m4, [rsp+48] 4855 mova [rsp+48], m5 4856 lea yq, [yq+strideq] 4857 add acq, 64 4858 sub hd, 1 4859 jg .w32_pad4 4860 jmp .w32_wpad_done 4861.w32_pad3: 4862 mova m0, [yq] 4863 mova m1, m0 4864 punpcklbw m1, m1 4865 pmaddubsw m1, m2 4866 mova [acq], m1 4867 paddw m5, m1, [rsp] 4868 mova [rsp ], m5 4869 punpckhbw m0, m0 4870 pmaddubsw m0, m2 4871 mova [acq+16], m0 4872 paddw m5, m0, [rsp+16] 4873 mova [rsp+16], m5 4874 movd m3, [yq+16] 4875 punpcklbw m3, m3 4876 punpcklqdq m3, m3 4877 pshufhw m3, m3, q3333 4878 pmaddubsw m3, m2 4879 mova [acq+32], m3 4880 paddw m5, m3, [rsp+32] 4881 mova [rsp+32], m5 4882 mova m4, m3 4883 punpckhqdq m4, m4 4884 mova [acq+48], m4 4885 paddw m5, m4, [rsp+48] 4886 mova [rsp+48], m5 4887 lea yq, [yq+strideq] 4888 add acq, 64 4889 sub hd, 1 4890 jg .w32_pad3 4891 jmp .w32_wpad_done 4892.w32_pad2: 4893 mova m0, [yq] 4894 mova m1, m0 4895 punpcklbw m1, m1 4896 pmaddubsw m1, m2 4897 mova [acq], m1 4898 paddw m5, m1, [rsp] 4899 mova [rsp ], m5 4900 punpckhbw m0, m0 4901 pmaddubsw m0, m2 4902 mova [acq+16], m0 4903 paddw m5, m0, [rsp+16] 4904 mova [rsp+16], m5 4905 mova m3, [yq+16] 4906 punpcklbw m3, m3 4907 pmaddubsw m3, m2 4908 mova [acq+32], m3 4909 paddw m5, m3, [rsp+32] 
4910 mova [rsp+32], m5 4911 pshufhw m4, m3, q3333 4912 punpckhqdq m4, m4 4913 mova [acq+48], m4 4914 paddw m5, m4, [rsp+48] 4915 mova [rsp+48], m5 4916 lea yq, [yq+strideq] 4917 add acq, 64 4918 sub hd, 1 4919 jg .w32_pad2 4920 jmp .w32_wpad_done 4921.w32_pad1: 4922 mova m0, [yq] 4923 mova m1, m0 4924 punpcklbw m1, m1 4925 pmaddubsw m1, m2 4926 mova [acq], m1 4927 paddw m5, m1, [rsp] 4928 mova [rsp ], m5 4929 punpckhbw m0, m0 4930 pmaddubsw m0, m2 4931 mova [acq+16], m0 4932 paddw m5, m0, [rsp+16] 4933 mova [rsp+16], m5 4934 mova m4, [yq+16] 4935 mova m3, m4 4936 punpcklbw m3, m3 4937 pmaddubsw m3, m2 4938 mova [acq+32], m3 4939 paddw m5, m3, [rsp+32] 4940 mova [rsp+32], m5 4941 punpckhbw m4, m4 4942 punpcklqdq m4, m4 4943 pshufhw m4, m4, q3333 4944 pmaddubsw m4, m2 4945 mova [acq+48], m4 4946 paddw m5, m4, [rsp+48] 4947 mova [rsp+48], m5 4948 lea yq, [yq+strideq] 4949 add acq, 64 4950 sub hd, 1 4951 jg .w32_pad1 4952.w32_wpad_done: 4953 test hpadd, hpadd 4954 jz .calc_avg_32 4955.w32_hpad_loop: 4956 mova [acq], m1 4957 mova [acq+16], m0 4958 paddw m5, m1, [rsp] 4959 mova [rsp ], m5 4960 paddw m5, m0, [rsp+16] 4961 mova [rsp+16], m5 4962 mova [acq+32], m3 4963 mova [acq+48], m4 4964 paddw m5, m3, [rsp+32] 4965 mova [rsp+32], m5 4966 paddw m5, m4, [rsp+48] 4967 mova [rsp+48], m5 4968 add acq, 64 4969 sub hpadd, 1 4970 jg .w32_hpad_loop 4971 4972%if ARCH_X86_64 4973 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak 4974%else 4975 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h 4976%endif 4977 4978.calc_avg_32: 4979 mova m5, [rsp] 4980 mova m0, m5 4981 psrld m5, 16 4982 pslld m0, 16 4983 psrld m0, 16 4984 paddd m5, m0 4985 mova m0, [rsp+16] 4986 mova m3, m0 4987 psrld m0, 16 4988 pslld m3, 16 4989 psrld m3, 16 4990 paddd m0, m3 4991 paddd m5, m0 4992 mova m0, [rsp+32] 4993 mova m3, m0 4994 psrld m0, 16 4995 pslld m3, 16 4996 psrld m3, 16 4997 paddd m0, m3 4998 mova m1, [rsp+48] 4999 mova m3, m1 5000 psrld m1, 16 5001 pslld m3, 16 5002 psrld m3, 16 5003 paddd m1, m3 5004 
paddd m1, m0 5005 paddd m5, m1 5006.calc_avg: 5007 movd szd, m6 5008 psrad m6, 1 5009 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); 5010 paddd m5, m6 5011 movd m1, r1d 5012 pshufd m0, m5, q2301 5013 paddd m0, m5 5014 pshufd m5, m0, q1032 5015 paddd m0, m5 5016 psrad m0, m1 ; sum >>= log2sz; 5017 packssdw m0, m0 5018 RELOAD_ACQ_32 acq ; ac = ac_orig 5019.sub_loop: 5020 mova m1, [acq] 5021 psubw m1, m0 5022 mova [acq], m1 5023 add acq, 16 5024 sub szd, 8 5025 jg .sub_loop 5026 RET 5027 5028; %1 simd register that hold the mask and will hold the result 5029; %2 simd register that holds the "true" values 5030; %3 location of the "false" values (simd register/memory) 5031%macro BLEND 3 ; mask, true, false 5032 pand %2, %1 5033 pandn %1, %3 5034 por %1, %2 5035%endmacro 5036 5037%macro PAETH 2 ; top, ldiff 5038 pavgb m1, m%1, m3 5039 pxor m0, m%1, m3 5040 pand m0, m4 5041 psubusb m2, m5, m1 5042 psubb m1, m0 5043 psubusb m1, m5 5044 por m1, m2 5045 paddusb m1, m1 5046 por m1, m0 ; min(tldiff, 255) 5047 psubusb m2, m5, m3 5048 psubusb m0, m3, m5 5049 por m2, m0 ; tdiff 5050%ifnum %2 5051 pminub m2, m%2 5052 pcmpeqb m0, m%2, m2 ; ldiff <= tdiff 5053%else 5054 mova m0, %2 5055 pminub m2, m0 5056 pcmpeqb m0, m2 5057%endif 5058 pminub m1, m2 5059 pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff 5060 mova m2, m3 5061 BLEND m0, m2, m%1 5062 BLEND m1, m0, m5 5063%endmacro 5064 5065cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h 5066%define base r5-ipred_paeth_ssse3_table 5067 tzcnt wd, wm 5068 movifnidn hd, hm 5069 pxor m0, m0 5070 movd m5, [tlq] 5071 pshufb m5, m0 5072 LEA r5, ipred_paeth_ssse3_table 5073 movsxd wq, [r5+wq*4] 5074 movddup m4, [base+ipred_paeth_shuf] 5075 add wq, r5 5076 jmp wq 5077.w4: 5078 movd m6, [tlq+1] ; top 5079 pshufd m6, m6, q0000 5080 lea r3, [strideq*3] 5081 psubusb m7, m5, m6 5082 psubusb m0, m6, m5 5083 por m7, m0 ; ldiff 5084.w4_loop: 5085 sub tlq, 4 5086 movd m3, [tlq] 5087 mova m1, [base+ipred_h_shuf] 5088 
pshufb m3, m1 ; left 5089 PAETH 6, 7 5090 movd [dstq ], m1 5091 pshuflw m0, m1, q1032 5092 movd [dstq+strideq ], m0 5093 punpckhqdq m1, m1 5094 movd [dstq+strideq*2], m1 5095 psrlq m1, 32 5096 movd [dstq+r3 ], m1 5097 lea dstq, [dstq+strideq*4] 5098 sub hd, 4 5099 jg .w4_loop 5100 RET 5101ALIGN function_align 5102.w8: 5103 movddup m6, [tlq+1] 5104 psubusb m7, m5, m6 5105 psubusb m0, m6, m5 5106 por m7, m0 5107.w8_loop: 5108 sub tlq, 2 5109 movd m3, [tlq] 5110 pshufb m3, [base+ipred_paeth_shuf] 5111 PAETH 6, 7 5112 movq [dstq ], m1 5113 movhps [dstq+strideq], m1 5114 lea dstq, [dstq+strideq*2] 5115 sub hd, 2 5116 jg .w8_loop 5117 RET 5118ALIGN function_align 5119.w16: 5120 movu m6, [tlq+1] 5121 psubusb m7, m5, m6 5122 psubusb m0, m6, m5 5123 por m7, m0 5124.w16_loop: 5125 sub tlq, 1 5126 movd m3, [tlq] 5127 pxor m1, m1 5128 pshufb m3, m1 5129 PAETH 6, 7 5130 mova [dstq], m1 5131 add dstq, strideq 5132 sub hd, 1 5133 jg .w16_loop 5134 RET 5135ALIGN function_align 5136.w32: 5137 movu m6, [tlq+1] 5138 psubusb m7, m5, m6 5139 psubusb m0, m6, m5 5140 por m7, m0 5141 mova [rsp ], m6 5142 mova [rsp+16], m7 5143 movu m6, [tlq+17] 5144 psubusb m7, m5, m6 5145 psubusb m0, m6, m5 5146 por m7, m0 5147 mova [rsp+32], m6 5148.w32_loop: 5149 dec tlq 5150 movd m3, [tlq] 5151 pxor m1, m1 5152 pshufb m3, m1 5153 mova m6, [rsp] 5154 PAETH 6, [rsp+16] 5155 mova [dstq ], m1 5156 mova m6, [rsp+32] 5157 PAETH 6, 7 5158 mova [dstq+16], m1 5159 add dstq, strideq 5160 dec hd 5161 jg .w32_loop 5162 RET 5163ALIGN function_align 5164.w64: 5165 movu m6, [tlq+1] 5166 psubusb m7, m5, m6 5167 psubusb m0, m6, m5 5168 por m7, m0 5169 mova [rsp ], m6 5170 mova [rsp+16], m7 5171 movu m6, [tlq+17] 5172 psubusb m7, m5, m6 5173 psubusb m0, m6, m5 5174 por m7, m0 5175 mova [rsp+32], m6 5176 mova [rsp+48], m7 5177 movu m6, [tlq+33] 5178 psubusb m7, m5, m6 5179 psubusb m0, m6, m5 5180 por m7, m0 5181 mova [rsp+64], m6 5182 mova [rsp+80], m7 5183 movu m6, [tlq+49] 5184 psubusb m7, m5, m6 5185 psubusb m0, m6, 
m5 5186 por m7, m0 5187 mova [rsp+96], m6 5188.w64_loop: 5189 dec tlq 5190 movd m3, [tlq] 5191 pxor m1, m1 5192 pshufb m3, m1 5193 mova m6, [rsp] 5194 PAETH 6, [rsp+16] 5195 mova [dstq ], m1 5196 mova m6, [rsp+32] 5197 PAETH 6, [rsp+48] 5198 mova [dstq+16], m1 5199 mova m6, [rsp+64] 5200 PAETH 6, [rsp+80] 5201 mova [dstq+32], m1 5202 mova m6, [rsp+96] 5203 PAETH 6, 7 5204 mova [dstq+48], m1 5205 add dstq, strideq 5206 dec hd 5207 jg .w64_loop 5208 RET 5209 5210 5211%macro FILTER 4 ;dst, src, tmp, shuf 5212%ifnum %4 5213 pshufb m%2, m%4 5214%else 5215 pshufb m%2, %4 5216%endif 5217 pshufd m%1, m%2, q0000 ;p0 p1 5218 pmaddubsw m%1, m2 5219 pshufd m%3, m%2, q1111 ;p2 p3 5220 pmaddubsw m%3, m3 5221 paddw m%1, [base+pw_8] 5222 paddw m%1, m%3 5223 pshufd m%3, m%2, q2222 ;p4 p5 5224 pmaddubsw m%3, m4 5225 paddw m%1, m%3 5226 pshufd m%3, m%2, q3333 ;p6 __ 5227 pmaddubsw m%3, m5 5228 paddw m%1, m%3 5229 psraw m%1, 4 5230 packuswb m%1, m%1 5231%endmacro 5232 5233cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter 5234%define base r6-$$ 5235 LEA r6, $$ 5236 tzcnt wd, wm 5237%ifidn filterd, filterm 5238 movzx filterd, filterb 5239%else 5240 movzx filterd, byte filterm 5241%endif 5242 shl filterd, 6 5243 lea filterq, [base+filter_intra_taps+filterq] 5244 movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 5245 movsxd wq, [base+ipred_filter_ssse3_table+wq*4] 5246 mova m2, [filterq+16*0] 5247 mova m3, [filterq+16*1] 5248 mova m4, [filterq+16*2] 5249 mova m5, [filterq+16*3] 5250 lea wq, [base+ipred_filter_ssse3_table+wq] 5251 mov hd, hm 5252 jmp wq 5253.w4: 5254 mova m1, [base+filter_shuf1] 5255 sub tlq, 3 5256 sub tlq, hq 5257 jmp .w4_loop_start 5258.w4_loop: 5259 movd m0, [tlq+hq] 5260 punpckldq m0, m6 5261 lea dstq, [dstq+strideq*2] 5262.w4_loop_start: 5263 FILTER 6, 0, 7, 1 5264 movd [dstq+strideq*0], m6 5265 pshuflw m6, m6, q1032 5266 movd [dstq+strideq*1], m6 5267 sub hd, 2 5268 jg .w4_loop 5269 RET 5270 5271ALIGN function_align 5272.w8: 5273 movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 
5274 sub tlq, 5 5275 sub tlq, hq 5276 5277.w8_loop: 5278 FILTER 7, 0, 1, [base+filter_shuf1] 5279 punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5280 FILTER 0, 6, 1, [base+filter_shuf2] 5281 5282 punpckldq m6, m7, m0 5283 movq [dstq+strideq*0], m6 5284 punpckhqdq m6, m6 5285 movq [dstq+strideq*1], m6 5286 5287 movd m0, [tlq+hq] ;_ 6 5 0 5288 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 5289 5290 lea dstq, [dstq+strideq*2] 5291 sub hd, 2 5292 jg .w8_loop 5293 RET 5294 5295ALIGN function_align 5296.w16: 5297 movu m6, [tlq+1] ;top row 5298 sub tlq, 5 5299 sub tlq, hq 5300 5301.w16_loop: 5302 FILTER 7, 0, 1, [base+filter_shuf1] 5303 punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5304 movd [dstq+strideq*0], m7 5305 psrlq m7, 32 5306 palignr m7, m6, 4 5307 5308 FILTER 6, 0, 1, [base+filter_shuf2] 5309 punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5310 movd [dstq+4+strideq*0], m6 5311 psrlq m6, 32 5312 palignr m6, m7, 4 5313 5314 FILTER 7, 0, 1, [base+filter_shuf2] 5315 punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5316 movd [dstq+8+strideq*0], m7 5317 psrlq m7, 32 5318 palignr m7, m6, 4 5319 5320 FILTER 6, 0, 1, [base+filter_shuf2] 5321 movd [dstq+12+strideq*0], m6 5322 psrlq m6, 32 5323 palignr m6, m7, 4 5324 mova [dstq+strideq*1], m6 5325 5326 movd m0, [tlq+hq] ;_ 6 5 0 5327 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 5328 5329 lea dstq, [dstq+strideq*2] 5330 sub hd, 2 5331 jg .w16_loop 5332 RET 5333 5334ALIGN function_align 5335.w32: 5336 movu m6, [tlq+1] ;top row 5337 lea filterq, [tlq+17] 5338 sub tlq, 5 5339 sub tlq, hq 5340 5341.w32_loop: 5342 FILTER 7, 0, 1, [base+filter_shuf1] 5343 punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5344 movd [dstq+strideq*0], m7 5345 psrlq m7, 32 5346 palignr m7, m6, 4 5347 5348 FILTER 6, 0, 1, [base+filter_shuf2] 5349 punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5350 movd [dstq+4+strideq*0], m6 5351 psrlq m6, 32 5352 palignr m6, m7, 4 5353 5354 FILTER 7, 0, 1, [base+filter_shuf2] 5355 punpcklqdq m0, m6, 
m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5356 movd [dstq+8+strideq*0], m7 5357 psrlq m7, 32 5358 palignr m7, m6, 4 5359 5360 FILTER 6, 0, 1, [base+filter_shuf2] 5361 movu m1, [filterq] 5362 punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ 5363 punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5364 movd [dstq+12+strideq*0], m6 5365 psrlq m6, 32 5366 palignr m6, m7, 4 5367 mova [dstq+strideq*1], m6 5368 5369 mova m6, m1 5370 5371 FILTER 7, 0, 6, [base+filter_shuf2] 5372 punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5373 movd [dstq+16+strideq*0], m7 5374 psrlq m7, 32 5375 palignr m7, m1, 4 5376 5377 FILTER 6, 0, 1, [base+filter_shuf2] 5378 punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5379 movd [dstq+20+strideq*0], m6 5380 psrlq m6, 32 5381 palignr m6, m7, 4 5382 5383 FILTER 7, 0, 1, [base+filter_shuf2] 5384 punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 5385 movd [dstq+24+strideq*0], m7 5386 psrlq m7, 32 5387 palignr m7, m6, 4 5388 5389 FILTER 6, 0, 1, [base+filter_shuf2] 5390 movd [dstq+28+strideq*0], m6 5391 psrlq m6, 32 5392 palignr m6, m7, 4 5393 mova [dstq+16+strideq*1], m6 5394 5395 mova m6, [dstq+strideq*1] 5396 movd m0, [tlq+hq] ;_ 6 5 0 5397 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 5398 lea filterq, [dstq+16+strideq*1] 5399 lea dstq, [dstq+strideq*2] 5400 sub hd, 2 5401 jg .w32_loop 5402 RET 5403