; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018-2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 32 32 33; dav1d_obmc_masks[] with 64-x interleaved 34obmc_masks: db 0, 0, 0, 0 35 ; 2 36 db 45, 19, 64, 0 37 ; 4 38 db 39, 25, 50, 14, 59, 5, 64, 0 39 ; 8 40 db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 41 ; 16 42 db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 43 db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 44 ; 32 45 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 46 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 47 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 48 db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 49 50warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 51 db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 52warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 53 db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 54subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 55 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 56subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 57subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 58subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 59subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 60subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 61subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 62bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 63bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 64deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 65blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 66pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 67bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 68wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 
10, 11, 8, 9, 14, 15, 12, 13 69resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 70rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 71 72wm_420_sign: dd 0x01020102, 0x01010101 73wm_422_sign: dd 0x80808080, 0x7f7f7f7f 74 75pb_64: times 4 db 64 76pw_m256: times 2 dw -256 77pw_15: times 2 dw 15 78pw_32: times 2 dw 32 79pw_34: times 2 dw 34 80pw_258: times 2 dw 258 81pw_512: times 2 dw 512 82pw_1024: times 2 dw 1024 83pw_2048: times 2 dw 2048 84pw_6903: times 2 dw 6903 85pw_8192: times 2 dw 8192 86pd_32: dd 32 87pd_63: dd 63 88pd_512: dd 512 89pd_32768: dd 32768 90pd_0x3ff: dd 0x3ff 91pd_0x4000: dd 0x4000 92pq_0x40000000: dq 0x40000000 93 94cextern mc_subpel_filters 95cextern mc_warp_filter2 96cextern resize_filter 97cextern z_filter_s 98 99%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) 100 101%macro BASE_JMP_TABLE 3-* 102 %xdefine %1_%2_table (%%table - %3) 103 %xdefine %%base %1_%2 104 %%table: 105 %rep %0 - 2 106 dw %%base %+ _w%3 - %%base 107 %rotate 1 108 %endrep 109%endmacro 110 111%macro HV_JMP_TABLE 5-* 112 %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) 113 %xdefine %%base %1_%3 114 %assign %%types %4 115 %if %%types & 1 116 %xdefine %1_%2_h_%3_table (%%h - %5) 117 %%h: 118 %rep %0 - 4 119 dw %%prefix %+ .h_w%5 - %%base 120 %rotate 1 121 %endrep 122 %rotate 4 123 %endif 124 %if %%types & 2 125 %xdefine %1_%2_v_%3_table (%%v - %5) 126 %%v: 127 %rep %0 - 4 128 dw %%prefix %+ .v_w%5 - %%base 129 %rotate 1 130 %endrep 131 %rotate 4 132 %endif 133 %if %%types & 4 134 %xdefine %1_%2_hv_%3_table (%%hv - %5) 135 %%hv: 136 %rep %0 - 4 137 dw %%prefix %+ .hv_w%5 - %%base 138 %rotate 1 139 %endrep 140 %endif 141%endmacro 142 143%macro BIDIR_JMP_TABLE 2-* 144 %xdefine %1_%2_table (%%table - 2*%3) 145 %xdefine %%base %1_%2_table 146 %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) 147 %%table: 148 %rep %0 - 2 149 dd %%prefix %+ .w%3 - %%base 150 %rotate 1 151 %endrep 152%endmacro 153 154%macro SCALED_JMP_TABLE 2-* 155 %xdefine 
%1_%2_table (%%table - %3) 156 %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) 157%%table: 158 %rep %0 - 2 159 dw %%base %+ .w%3 - %%base 160 %rotate 1 161 %endrep 162 %rotate 2 163%%dy_1024: 164 %xdefine %1_%2_dy1_table (%%dy_1024 - %3) 165 %rep %0 - 2 166 dw %%base %+ .dy1_w%3 - %%base 167 %rotate 1 168 %endrep 169 %rotate 2 170%%dy_2048: 171 %xdefine %1_%2_dy2_table (%%dy_2048 - %3) 172 %rep %0 - 2 173 dw %%base %+ .dy2_w%3 - %%base 174 %rotate 1 175 %endrep 176%endmacro 177 178%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put) 179%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep) 180 181%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX 182 183BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 184BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 185HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 186HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 187HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 188HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 189HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64, 128 190HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 191SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 192SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 193BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 194BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 195BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 196BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 197BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 198BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 199BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 200BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 201BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32 202 203SECTION .text 204 205INIT_XMM avx2 206cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy 207 movifnidn mxyd, r6m ; mx 208 lea r7, [put_avx2] 209 tzcnt wd, wm 210 
movifnidn hd, hm 211 test mxyd, mxyd 212 jnz .h 213 mov mxyd, r7m ; my 214 test mxyd, mxyd 215 jnz .v 216.put: 217 movzx wd, word [r7+wq*2+table_offset(put,)] 218 add wq, r7 219 jmp wq 220.put_w2: 221 movzx r6d, word [srcq+ssq*0] 222 movzx r7d, word [srcq+ssq*1] 223 lea srcq, [srcq+ssq*2] 224 mov [dstq+dsq*0], r6w 225 mov [dstq+dsq*1], r7w 226 lea dstq, [dstq+dsq*2] 227 sub hd, 2 228 jg .put_w2 229 RET 230.put_w4: 231 mov r6d, [srcq+ssq*0] 232 mov r7d, [srcq+ssq*1] 233 lea srcq, [srcq+ssq*2] 234 mov [dstq+dsq*0], r6d 235 mov [dstq+dsq*1], r7d 236 lea dstq, [dstq+dsq*2] 237 sub hd, 2 238 jg .put_w4 239 RET 240.put_w8: 241 mov r6, [srcq+ssq*0] 242 mov r7, [srcq+ssq*1] 243 lea srcq, [srcq+ssq*2] 244 mov [dstq+dsq*0], r6 245 mov [dstq+dsq*1], r7 246 lea dstq, [dstq+dsq*2] 247 sub hd, 2 248 jg .put_w8 249 RET 250.put_w16: 251 movu m0, [srcq+ssq*0] 252 movu m1, [srcq+ssq*1] 253 lea srcq, [srcq+ssq*2] 254 mova [dstq+dsq*0], m0 255 mova [dstq+dsq*1], m1 256 lea dstq, [dstq+dsq*2] 257 sub hd, 2 258 jg .put_w16 259 RET 260INIT_YMM avx2 261.put_w32: 262 movu m0, [srcq+ssq*0] 263 movu m1, [srcq+ssq*1] 264 lea srcq, [srcq+ssq*2] 265 mova [dstq+dsq*0], m0 266 mova [dstq+dsq*1], m1 267 lea dstq, [dstq+dsq*2] 268 sub hd, 2 269 jg .put_w32 270 RET 271.put_w64: 272 movu m0, [srcq+ssq*0+32*0] 273 movu m1, [srcq+ssq*0+32*1] 274 movu m2, [srcq+ssq*1+32*0] 275 movu m3, [srcq+ssq*1+32*1] 276 lea srcq, [srcq+ssq*2] 277 mova [dstq+dsq*0+32*0], m0 278 mova [dstq+dsq*0+32*1], m1 279 mova [dstq+dsq*1+32*0], m2 280 mova [dstq+dsq*1+32*1], m3 281 lea dstq, [dstq+dsq*2] 282 sub hd, 2 283 jg .put_w64 284 RET 285.put_w128: 286 movu m0, [srcq+32*0] 287 movu m1, [srcq+32*1] 288 movu m2, [srcq+32*2] 289 movu m3, [srcq+32*3] 290 add srcq, ssq 291 mova [dstq+32*0], m0 292 mova [dstq+32*1], m1 293 mova [dstq+32*2], m2 294 mova [dstq+32*3], m3 295 add dstq, dsq 296 dec hd 297 jg .put_w128 298 RET 299.h: 300 ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 301 ; = ((16 - mx) * src[x] + mx * src[x + 
1] + 8) >> 4 302 imul mxyd, 255 303 vbroadcasti128 m4, [z_filter_s+2] 304 add mxyd, 16 305 movd xm5, mxyd 306 mov mxyd, r7m ; my 307 vpbroadcastw m5, xm5 308 test mxyd, mxyd 309 jnz .hv 310 movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] 311 vpbroadcastd m3, [pw_2048] 312 add wq, r7 313 jmp wq 314.h_w2: 315 movd xm0, [srcq+ssq*0] 316 pinsrd xm0, [srcq+ssq*1], 1 317 lea srcq, [srcq+ssq*2] 318 pshufb xm0, xm4 319 pmaddubsw xm0, xm5 320 pmulhrsw xm0, xm3 321 packuswb xm0, xm0 322 pextrw [dstq+dsq*0], xm0, 0 323 pextrw [dstq+dsq*1], xm0, 2 324 lea dstq, [dstq+dsq*2] 325 sub hd, 2 326 jg .h_w2 327 RET 328.h_w4: 329 mova xm4, [bilin_h_shuf4] 330.h_w4_loop: 331 movq xm0, [srcq+ssq*0] 332 movhps xm0, [srcq+ssq*1] 333 lea srcq, [srcq+ssq*2] 334 pshufb xm0, xm4 335 pmaddubsw xm0, xm5 336 pmulhrsw xm0, xm3 337 packuswb xm0, xm0 338 movd [dstq+dsq*0], xm0 339 pextrd [dstq+dsq*1], xm0, 1 340 lea dstq, [dstq+dsq*2] 341 sub hd, 2 342 jg .h_w4_loop 343 RET 344.h_w8: 345 movu xm0, [srcq+ssq*0] 346 movu xm1, [srcq+ssq*1] 347 lea srcq, [srcq+ssq*2] 348 pshufb xm0, xm4 349 pshufb xm1, xm4 350 pmaddubsw xm0, xm5 351 pmaddubsw xm1, xm5 352 pmulhrsw xm0, xm3 353 pmulhrsw xm1, xm3 354 packuswb xm0, xm1 355 movq [dstq+dsq*0], xm0 356 movhps [dstq+dsq*1], xm0 357 lea dstq, [dstq+dsq*2] 358 sub hd, 2 359 jg .h_w8 360 RET 361.h_w16: 362 movu xm0, [srcq+ssq*0+8*0] 363 vinserti128 m0, [srcq+ssq*1+8*0], 1 364 movu xm1, [srcq+ssq*0+8*1] 365 vinserti128 m1, [srcq+ssq*1+8*1], 1 366 lea srcq, [srcq+ssq*2] 367 pshufb m0, m4 368 pshufb m1, m4 369 pmaddubsw m0, m5 370 pmaddubsw m1, m5 371 pmulhrsw m0, m3 372 pmulhrsw m1, m3 373 packuswb m0, m1 374 mova [dstq+dsq*0], xm0 375 vextracti128 [dstq+dsq*1], m0, 1 376 lea dstq, [dstq+dsq*2] 377 sub hd, 2 378 jg .h_w16 379 RET 380.h_w32: 381 movu m0, [srcq+8*0] 382 movu m1, [srcq+8*1] 383 add srcq, ssq 384 pshufb m0, m4 385 pshufb m1, m4 386 pmaddubsw m0, m5 387 pmaddubsw m1, m5 388 pmulhrsw m0, m3 389 pmulhrsw m1, m3 390 packuswb m0, m1 391 mova [dstq], 
m0 392 add dstq, dsq 393 dec hd 394 jg .h_w32 395 RET 396.h_w64: 397 movu m0, [srcq+8*0] 398 movu m1, [srcq+8*1] 399 pshufb m0, m4 400 pshufb m1, m4 401 pmaddubsw m0, m5 402 pmaddubsw m1, m5 403 pmulhrsw m0, m3 404 pmulhrsw m1, m3 405 packuswb m0, m1 406 movu m1, [srcq+8*4] 407 movu m2, [srcq+8*5] 408 add srcq, ssq 409 pshufb m1, m4 410 pshufb m2, m4 411 pmaddubsw m1, m5 412 pmaddubsw m2, m5 413 pmulhrsw m1, m3 414 pmulhrsw m2, m3 415 packuswb m1, m2 416 mova [dstq+32*0], m0 417 mova [dstq+32*1], m1 418 add dstq, dsq 419 dec hd 420 jg .h_w64 421 RET 422.h_w128: 423 mov r6, -32*3 424.h_w128_loop: 425 movu m0, [srcq+r6+32*3+8*0] 426 movu m1, [srcq+r6+32*3+8*1] 427 pshufb m0, m4 428 pshufb m1, m4 429 pmaddubsw m0, m5 430 pmaddubsw m1, m5 431 pmulhrsw m0, m3 432 pmulhrsw m1, m3 433 packuswb m0, m1 434 mova [dstq+r6+32*3], m0 435 add r6, 32 436 jle .h_w128_loop 437 add srcq, ssq 438 add dstq, dsq 439 dec hd 440 jg .h_w128 441 RET 442.v: 443 movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] 444 imul mxyd, 255 445 vpbroadcastd m5, [pw_2048] 446 add mxyd, 16 447 add wq, r7 448 movd xm4, mxyd 449 vpbroadcastw m4, xm4 450 jmp wq 451.v_w2: 452 movd xm0, [srcq+ssq*0] 453.v_w2_loop: 454 pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 455 lea srcq, [srcq+ssq*2] 456 pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 457 pshuflw xm1, xm1, q2301 ; 1 0 458 punpcklbw xm1, xm0 459 pmaddubsw xm1, xm4 460 pmulhrsw xm1, xm5 461 packuswb xm1, xm1 462 pextrw [dstq+dsq*0], xm1, 1 463 pextrw [dstq+dsq*1], xm1, 0 464 lea dstq, [dstq+dsq*2] 465 sub hd, 2 466 jg .v_w2_loop 467 RET 468.v_w4: 469 movd xm0, [srcq+ssq*0] 470.v_w4_loop: 471 vpbroadcastd xm2, [srcq+ssq*1] 472 lea srcq, [srcq+ssq*2] 473 vpblendd xm1, xm2, xm0, 0x01 ; 0 1 474 vpbroadcastd xm0, [srcq+ssq*0] 475 vpblendd xm2, xm0, 0x02 ; 1 2 476 punpcklbw xm1, xm2 477 pmaddubsw xm1, xm4 478 pmulhrsw xm1, xm5 479 packuswb xm1, xm1 480 movd [dstq+dsq*0], xm1 481 pextrd [dstq+dsq*1], xm1, 1 482 lea dstq, [dstq+dsq*2] 483 sub hd, 2 484 jg .v_w4_loop 485 RET 
486.v_w8: 487 movq xm0, [srcq+ssq*0] 488.v_w8_loop: 489 movq xm2, [srcq+ssq*1] 490 lea srcq, [srcq+ssq*2] 491 punpcklbw xm1, xm0, xm2 492 movq xm0, [srcq+ssq*0] 493 punpcklbw xm2, xm0 494 pmaddubsw xm1, xm4 495 pmaddubsw xm2, xm4 496 pmulhrsw xm1, xm5 497 pmulhrsw xm2, xm5 498 packuswb xm1, xm2 499 movq [dstq+dsq*0], xm1 500 movhps [dstq+dsq*1], xm1 501 lea dstq, [dstq+dsq*2] 502 sub hd, 2 503 jg .v_w8_loop 504 RET 505.v_w16: 506 movu xm0, [srcq+ssq*0] 507.v_w16_loop: 508 vbroadcasti128 m3, [srcq+ssq*1] 509 lea srcq, [srcq+ssq*2] 510 vpblendd m2, m3, m0, 0x0f ; 0 1 511 vbroadcasti128 m0, [srcq+ssq*0] 512 vpblendd m3, m0, 0xf0 ; 1 2 513 punpcklbw m1, m2, m3 514 punpckhbw m2, m3 515 pmaddubsw m1, m4 516 pmaddubsw m2, m4 517 pmulhrsw m1, m5 518 pmulhrsw m2, m5 519 packuswb m1, m2 520 mova [dstq+dsq*0], xm1 521 vextracti128 [dstq+dsq*1], m1, 1 522 lea dstq, [dstq+dsq*2] 523 sub hd, 2 524 jg .v_w16_loop 525 RET 526.v_w32: 527%macro PUT_BILIN_V_W32 0 528 movu m0, [srcq+ssq*0] 529%%loop: 530 movu m3, [srcq+ssq*1] 531 lea srcq, [srcq+ssq*2] 532 punpcklbw m1, m0, m3 533 punpckhbw m2, m0, m3 534 movu m0, [srcq+ssq*0] 535 pmaddubsw m1, m4 536 pmaddubsw m2, m4 537 pmulhrsw m1, m5 538 pmulhrsw m2, m5 539 packuswb m1, m2 540 punpcklbw m2, m3, m0 541 punpckhbw m3, m0 542 pmaddubsw m2, m4 543 pmaddubsw m3, m4 544 pmulhrsw m2, m5 545 pmulhrsw m3, m5 546 packuswb m2, m3 547 mova [dstq+dsq*0], m1 548 mova [dstq+dsq*1], m2 549 lea dstq, [dstq+dsq*2] 550 sub hd, 2 551 jg %%loop 552%endmacro 553 PUT_BILIN_V_W32 554 RET 555.v_w64: 556 movu m0, [srcq+32*0] 557 movu m1, [srcq+32*1] 558.v_w64_loop: 559 add srcq, ssq 560 movu m3, [srcq+32*0] 561 punpcklbw m2, m0, m3 562 punpckhbw m0, m3 563 pmaddubsw m2, m4 564 pmaddubsw m0, m4 565 pmulhrsw m2, m5 566 pmulhrsw m0, m5 567 packuswb m2, m0 568 mova m0, m3 569 movu m3, [srcq+32*1] 570 mova [dstq+32*0], m2 571 punpcklbw m2, m1, m3 572 punpckhbw m1, m3 573 pmaddubsw m2, m4 574 pmaddubsw m1, m4 575 pmulhrsw m2, m5 576 pmulhrsw m1, m5 577 packuswb 
m2, m1 578 mova m1, m3 579 mova [dstq+32*1], m2 580 add dstq, dsq 581 dec hd 582 jg .v_w64_loop 583 RET 584.v_w128: 585 lea r6d, [hq+(3<<8)] 586 mov r4, srcq 587 mov r7, dstq 588.v_w128_loop: 589 PUT_BILIN_V_W32 590 add r4, 32 591 add r7, 32 592 movzx hd, r6b 593 mov srcq, r4 594 mov dstq, r7 595 sub r6d, 1<<8 596 jg .v_w128_loop 597 RET 598.hv: 599 ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 600 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 601 movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] 602 WIN64_SPILL_XMM 8 603 shl mxyd, 11 ; can't shift by 12 due to signed overflow 604 vpbroadcastd m7, [pw_15] 605 movd xm6, mxyd 606 add wq, r7 607 paddb m5, m5 608 vpbroadcastw m6, xm6 609 jmp wq 610.hv_w2: 611 vpbroadcastd xm0, [srcq+ssq*0] 612 pshufb xm0, xm4 613 pmaddubsw xm0, xm5 614.hv_w2_loop: 615 movd xm1, [srcq+ssq*1] 616 lea srcq, [srcq+ssq*2] 617 pinsrd xm1, [srcq+ssq*0], 1 618 pshufb xm1, xm4 619 pmaddubsw xm1, xm5 ; 1 _ 2 _ 620 shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ 621 mova xm0, xm1 622 psubw xm1, xm2 623 pmulhw xm1, xm6 624 pavgw xm2, xm7 625 paddw xm1, xm2 626 psrlw xm1, 4 627 packuswb xm1, xm1 628 pextrw [dstq+dsq*0], xm1, 0 629 pextrw [dstq+dsq*1], xm1, 2 630 lea dstq, [dstq+dsq*2] 631 sub hd, 2 632 jg .hv_w2_loop 633 RET 634.hv_w4: 635 mova xm4, [bilin_h_shuf4] 636 movddup xm0, [srcq+ssq*0] 637 pshufb xm0, xm4 638 pmaddubsw xm0, xm5 639.hv_w4_loop: 640 movq xm1, [srcq+ssq*1] 641 lea srcq, [srcq+ssq*2] 642 movhps xm1, [srcq+ssq*0] 643 pshufb xm1, xm4 644 pmaddubsw xm1, xm5 ; 1 2 645 shufps xm2, xm0, xm1, q1032 ; 0 1 646 mova xm0, xm1 647 psubw xm1, xm2 648 pmulhw xm1, xm6 649 pavgw xm2, xm7 650 paddw xm1, xm2 651 psrlw xm1, 4 652 packuswb xm1, xm1 653 movd [dstq+dsq*0], xm1 654 pextrd [dstq+dsq*1], xm1, 1 655 lea dstq, [dstq+dsq*2] 656 sub hd, 2 657 jg .hv_w4_loop 658 RET 659.hv_w8: 660 vbroadcasti128 m0, [srcq+ssq*0] 661 pshufb m0, m4 662 pmaddubsw m0, m5 663.hv_w8_loop: 664 movu xm1, [srcq+ssq*1] 665 lea 
srcq, [srcq+ssq*2] 666 vinserti128 m1, [srcq+ssq*0], 1 667 pshufb m1, m4 668 pmaddubsw m1, m5 ; 1 2 669 vperm2i128 m2, m0, m1, 0x21 ; 0 1 670 mova m0, m1 671 psubw m1, m2 672 pmulhw m1, m6 673 pavgw m2, m7 674 paddw m1, m2 675 psrlw m1, 4 676 vextracti128 xm2, m1, 1 677 packuswb xm1, xm2 678 movq [dstq+dsq*0], xm1 679 movhps [dstq+dsq*1], xm1 680 lea dstq, [dstq+dsq*2] 681 sub hd, 2 682 jg .hv_w8_loop 683 RET 684.hv_w16: 685 movu m0, [srcq+ssq*0+8*0] 686 vinserti128 m0, [srcq+ssq*0+8*1], 1 687 pshufb m0, m4 688 pmaddubsw m0, m5 689.hv_w16_loop: 690 movu xm2, [srcq+ssq*1+8*0] 691 vinserti128 m2, [srcq+ssq*1+8*1], 1 692 lea srcq, [srcq+ssq*2] 693 movu xm3, [srcq+ssq*0+8*0] 694 vinserti128 m3, [srcq+ssq*0+8*1], 1 695 pshufb m2, m4 696 pshufb m3, m4 697 pmaddubsw m2, m5 698 psubw m1, m2, m0 699 pmulhw m1, m6 700 pavgw m0, m7 701 paddw m1, m0 702 pmaddubsw m0, m3, m5 703 psubw m3, m0, m2 704 pmulhw m3, m6 705 pavgw m2, m7 706 paddw m3, m2 707 psrlw m1, 4 708 psrlw m3, 4 709 packuswb m1, m3 710 vpermq m1, m1, q3120 711 mova [dstq+dsq*0], xm1 712 vextracti128 [dstq+dsq*1], m1, 1 713 lea dstq, [dstq+dsq*2] 714 sub hd, 2 715 jg .hv_w16_loop 716 RET 717.hv_w128: 718 lea r6d, [hq+(3<<16)] 719 jmp .hv_w32_start 720.hv_w64: 721 lea r6d, [hq+(1<<16)] 722.hv_w32_start: 723 mov r4, srcq 724 mov r7, dstq 725.hv_w32: 726%if WIN64 727 movaps r4m, xmm8 728%endif 729.hv_w32_loop0: 730 movu m0, [srcq+8*0] 731 movu m1, [srcq+8*1] 732 pshufb m0, m4 733 pshufb m1, m4 734 pmaddubsw m0, m5 735 pmaddubsw m1, m5 736.hv_w32_loop: 737 add srcq, ssq 738 movu m2, [srcq+8*0] 739 movu m3, [srcq+8*1] 740 pshufb m2, m4 741 pshufb m3, m4 742 pmaddubsw m2, m5 743 pmaddubsw m3, m5 744 psubw m8, m2, m0 745 pmulhw m8, m6 746 pavgw m0, m7 747 paddw m8, m0 748 mova m0, m2 749 psubw m2, m3, m1 750 pmulhw m2, m6 751 pavgw m1, m7 752 paddw m2, m1 753 mova m1, m3 754 psrlw m8, 4 755 psrlw m2, 4 756 packuswb m8, m2 757 mova [dstq], m8 758 add dstq, dsq 759 dec hd 760 jg .hv_w32_loop 761 add r4, 32 762 add r7, 32 
763 movzx hd, r6b 764 mov srcq, r4 765 mov dstq, r7 766 sub r6d, 1<<16 767 jg .hv_w32_loop0 768%if WIN64 769 movaps xmm8, r4m 770%endif 771 RET 772 773cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 774 movifnidn mxyd, r5m ; mx 775 lea r6, [prep%+SUFFIX] 776 tzcnt wd, wm 777 movifnidn hd, hm 778 test mxyd, mxyd 779 jnz .h 780 mov mxyd, r6m ; my 781 test mxyd, mxyd 782 jnz .v 783.prep: 784 movzx wd, word [r6+wq*2+table_offset(prep,)] 785 add wq, r6 786 lea stride3q, [strideq*3] 787 jmp wq 788.prep_w4: 789 movd xm0, [srcq+strideq*0] 790 pinsrd xm0, [srcq+strideq*1], 1 791 pinsrd xm0, [srcq+strideq*2], 2 792 pinsrd xm0, [srcq+stride3q ], 3 793 lea srcq, [srcq+strideq*4] 794 pmovzxbw m0, xm0 795 psllw m0, 4 796 mova [tmpq], m0 797 add tmpq, 32 798 sub hd, 4 799 jg .prep_w4 800 RET 801.prep_w8: 802 movq xm0, [srcq+strideq*0] 803 movhps xm0, [srcq+strideq*1] 804 movq xm1, [srcq+strideq*2] 805 movhps xm1, [srcq+stride3q ] 806 lea srcq, [srcq+strideq*4] 807 pmovzxbw m0, xm0 808 pmovzxbw m1, xm1 809 psllw m0, 4 810 psllw m1, 4 811 mova [tmpq+32*0], m0 812 mova [tmpq+32*1], m1 813 add tmpq, 32*2 814 sub hd, 4 815 jg .prep_w8 816 RET 817.prep_w16: 818 pmovzxbw m0, [srcq+strideq*0] 819 pmovzxbw m1, [srcq+strideq*1] 820 pmovzxbw m2, [srcq+strideq*2] 821 pmovzxbw m3, [srcq+stride3q ] 822 lea srcq, [srcq+strideq*4] 823 psllw m0, 4 824 psllw m1, 4 825 psllw m2, 4 826 psllw m3, 4 827 mova [tmpq+32*0], m0 828 mova [tmpq+32*1], m1 829 mova [tmpq+32*2], m2 830 mova [tmpq+32*3], m3 831 add tmpq, 32*4 832 sub hd, 4 833 jg .prep_w16 834 RET 835.prep_w32: 836 pmovzxbw m0, [srcq+strideq*0+16*0] 837 pmovzxbw m1, [srcq+strideq*0+16*1] 838 pmovzxbw m2, [srcq+strideq*1+16*0] 839 pmovzxbw m3, [srcq+strideq*1+16*1] 840 lea srcq, [srcq+strideq*2] 841 psllw m0, 4 842 psllw m1, 4 843 psllw m2, 4 844 psllw m3, 4 845 mova [tmpq+32*0], m0 846 mova [tmpq+32*1], m1 847 mova [tmpq+32*2], m2 848 mova [tmpq+32*3], m3 849 add tmpq, 32*4 850 sub hd, 2 851 jg .prep_w32 852 RET 
853.prep_w64: 854 pmovzxbw m0, [srcq+16*0] 855 pmovzxbw m1, [srcq+16*1] 856 pmovzxbw m2, [srcq+16*2] 857 pmovzxbw m3, [srcq+16*3] 858 add srcq, strideq 859 psllw m0, 4 860 psllw m1, 4 861 psllw m2, 4 862 psllw m3, 4 863 mova [tmpq+32*0], m0 864 mova [tmpq+32*1], m1 865 mova [tmpq+32*2], m2 866 mova [tmpq+32*3], m3 867 add tmpq, 32*4 868 dec hd 869 jg .prep_w64 870 RET 871.prep_w128: 872 pmovzxbw m0, [srcq+16*0] 873 pmovzxbw m1, [srcq+16*1] 874 pmovzxbw m2, [srcq+16*2] 875 pmovzxbw m3, [srcq+16*3] 876 psllw m0, 4 877 psllw m1, 4 878 psllw m2, 4 879 psllw m3, 4 880 mova [tmpq+32*0], m0 881 mova [tmpq+32*1], m1 882 mova [tmpq+32*2], m2 883 mova [tmpq+32*3], m3 884 pmovzxbw m0, [srcq+16*4] 885 pmovzxbw m1, [srcq+16*5] 886 pmovzxbw m2, [srcq+16*6] 887 pmovzxbw m3, [srcq+16*7] 888 add tmpq, 32*8 889 add srcq, strideq 890 psllw m0, 4 891 psllw m1, 4 892 psllw m2, 4 893 psllw m3, 4 894 mova [tmpq-32*4], m0 895 mova [tmpq-32*3], m1 896 mova [tmpq-32*2], m2 897 mova [tmpq-32*1], m3 898 dec hd 899 jg .prep_w128 900 RET 901.h: 902 ; 16 * src[x] + (mx * (src[x + 1] - src[x])) 903 ; = (16 - mx) * src[x] + mx * src[x + 1] 904 imul mxyd, 255 905 vbroadcasti128 m4, [z_filter_s+2] 906 add mxyd, 16 907 movd xm5, mxyd 908 mov mxyd, r6m ; my 909 vpbroadcastw m5, xm5 910 test mxyd, mxyd 911 jnz .hv 912 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] 913 add wq, r6 914 lea stride3q, [strideq*3] 915 jmp wq 916.h_w4: 917 vbroadcasti128 m4, [bilin_h_shuf4] 918.h_w4_loop: 919 movq xm0, [srcq+strideq*0] 920 movhps xm0, [srcq+strideq*1] 921 movq xm1, [srcq+strideq*2] 922 movhps xm1, [srcq+stride3q ] 923 lea srcq, [srcq+strideq*4] 924 vinserti128 m0, xm1, 1 925 pshufb m0, m4 926 pmaddubsw m0, m5 927 mova [tmpq], m0 928 add tmpq, 32 929 sub hd, 4 930 jg .h_w4_loop 931 RET 932.h_w8: 933.h_w8_loop: 934 movu xm0, [srcq+strideq*0] 935 vinserti128 m0, [srcq+strideq*1], 1 936 movu xm1, [srcq+strideq*2] 937 vinserti128 m1, [srcq+stride3q ], 1 938 lea srcq, [srcq+strideq*4] 939 pshufb m0, m4 940 
pshufb m1, m4 941 pmaddubsw m0, m5 942 pmaddubsw m1, m5 943 mova [tmpq+32*0], m0 944 mova [tmpq+32*1], m1 945 add tmpq, 32*2 946 sub hd, 4 947 jg .h_w8_loop 948 RET 949.h_w16: 950.h_w16_loop: 951 movu xm0, [srcq+strideq*0+8*0] 952 vinserti128 m0, [srcq+strideq*0+8*1], 1 953 movu xm1, [srcq+strideq*1+8*0] 954 vinserti128 m1, [srcq+strideq*1+8*1], 1 955 movu xm2, [srcq+strideq*2+8*0] 956 vinserti128 m2, [srcq+strideq*2+8*1], 1 957 movu xm3, [srcq+stride3q +8*0] 958 vinserti128 m3, [srcq+stride3q +8*1], 1 959 lea srcq, [srcq+strideq*4] 960 pshufb m0, m4 961 pshufb m1, m4 962 pshufb m2, m4 963 pshufb m3, m4 964 pmaddubsw m0, m5 965 pmaddubsw m1, m5 966 pmaddubsw m2, m5 967 pmaddubsw m3, m5 968 mova [tmpq+32*0], m0 969 mova [tmpq+32*1], m1 970 mova [tmpq+32*2], m2 971 mova [tmpq+32*3], m3 972 add tmpq, 32*4 973 sub hd, 4 974 jg .h_w16_loop 975 RET 976.h_w32: 977.h_w32_loop: 978 movu xm0, [srcq+strideq*0+8*0] 979 vinserti128 m0, [srcq+strideq*0+8*1], 1 980 movu xm1, [srcq+strideq*0+8*2] 981 vinserti128 m1, [srcq+strideq*0+8*3], 1 982 movu xm2, [srcq+strideq*1+8*0] 983 vinserti128 m2, [srcq+strideq*1+8*1], 1 984 movu xm3, [srcq+strideq*1+8*2] 985 vinserti128 m3, [srcq+strideq*1+8*3], 1 986 lea srcq, [srcq+strideq*2] 987 pshufb m0, m4 988 pshufb m1, m4 989 pshufb m2, m4 990 pshufb m3, m4 991 pmaddubsw m0, m5 992 pmaddubsw m1, m5 993 pmaddubsw m2, m5 994 pmaddubsw m3, m5 995 mova [tmpq+32*0], m0 996 mova [tmpq+32*1], m1 997 mova [tmpq+32*2], m2 998 mova [tmpq+32*3], m3 999 add tmpq, 32*4 1000 sub hd, 2 1001 jg .h_w32_loop 1002 RET 1003.h_w64: 1004 movu xm0, [srcq+8*0] 1005 vinserti128 m0, [srcq+8*1], 1 1006 movu xm1, [srcq+8*2] 1007 vinserti128 m1, [srcq+8*3], 1 1008 movu xm2, [srcq+8*4] 1009 vinserti128 m2, [srcq+8*5], 1 1010 movu xm3, [srcq+8*6] 1011 vinserti128 m3, [srcq+8*7], 1 1012 add srcq, strideq 1013 pshufb m0, m4 1014 pshufb m1, m4 1015 pshufb m2, m4 1016 pshufb m3, m4 1017 pmaddubsw m0, m5 1018 pmaddubsw m1, m5 1019 pmaddubsw m2, m5 1020 pmaddubsw m3, m5 1021 
mova [tmpq+32*0], m0 1022 mova [tmpq+32*1], m1 1023 mova [tmpq+32*2], m2 1024 mova [tmpq+32*3], m3 1025 add tmpq, 32*4 1026 dec hd 1027 jg .h_w64 1028 RET 1029.h_w128: 1030 movu xm0, [srcq+8*0] 1031 vinserti128 m0, [srcq+8*1], 1 1032 movu xm1, [srcq+8*2] 1033 vinserti128 m1, [srcq+8*3], 1 1034 movu xm2, [srcq+8*4] 1035 vinserti128 m2, [srcq+8*5], 1 1036 movu xm3, [srcq+8*6] 1037 vinserti128 m3, [srcq+8*7], 1 1038 pshufb m0, m4 1039 pshufb m1, m4 1040 pshufb m2, m4 1041 pshufb m3, m4 1042 pmaddubsw m0, m5 1043 pmaddubsw m1, m5 1044 pmaddubsw m2, m5 1045 pmaddubsw m3, m5 1046 mova [tmpq+32*0], m0 1047 mova [tmpq+32*1], m1 1048 mova [tmpq+32*2], m2 1049 mova [tmpq+32*3], m3 1050 movu xm0, [srcq+8* 8] 1051 vinserti128 m0, [srcq+8* 9], 1 1052 movu xm1, [srcq+8*10] 1053 vinserti128 m1, [srcq+8*11], 1 1054 movu xm2, [srcq+8*12] 1055 vinserti128 m2, [srcq+8*13], 1 1056 movu xm3, [srcq+8*14] 1057 vinserti128 m3, [srcq+8*15], 1 1058 add tmpq, 32*8 1059 add srcq, strideq 1060 pshufb m0, m4 1061 pshufb m1, m4 1062 pshufb m2, m4 1063 pshufb m3, m4 1064 pmaddubsw m0, m5 1065 pmaddubsw m1, m5 1066 pmaddubsw m2, m5 1067 pmaddubsw m3, m5 1068 mova [tmpq-32*4], m0 1069 mova [tmpq-32*3], m1 1070 mova [tmpq-32*2], m2 1071 mova [tmpq-32*1], m3 1072 dec hd 1073 jg .h_w128 1074 RET 1075.v: 1076 WIN64_SPILL_XMM 7 1077 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] 1078 imul mxyd, 255 1079 add mxyd, 16 1080 add wq, r6 1081 lea stride3q, [strideq*3] 1082 movd xm6, mxyd 1083 vpbroadcastw m6, xm6 1084 jmp wq 1085.v_w4: 1086 movd xm0, [srcq+strideq*0] 1087.v_w4_loop: 1088 vpbroadcastd m1, [srcq+strideq*2] 1089 vpbroadcastd xm2, [srcq+strideq*1] 1090 vpbroadcastd m3, [srcq+stride3q ] 1091 lea srcq, [srcq+strideq*4] 1092 vpblendd m1, m0, 0x05 ; 0 2 2 2 1093 vpbroadcastd m0, [srcq+strideq*0] 1094 vpblendd m3, m2, 0x0f ; 1 1 3 3 1095 vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 1096 vpblendd m1, m3, 0xaa ; 0 1 2 3 1097 vpblendd m2, m3, 0x55 ; 1 2 3 4 1098 punpcklbw m1, m2 1099 pmaddubsw m1, m6 1100 
mova [tmpq], m1 1101 add tmpq, 32 1102 sub hd, 4 1103 jg .v_w4_loop 1104 RET 1105.v_w8: 1106 movq xm0, [srcq+strideq*0] 1107.v_w8_loop: 1108 vpbroadcastq m1, [srcq+strideq*2] 1109 vpbroadcastq m2, [srcq+strideq*1] 1110 vpbroadcastq m3, [srcq+stride3q ] 1111 lea srcq, [srcq+strideq*4] 1112 vpblendd m1, m0, 0x03 ; 0 2 2 2 1113 vpbroadcastq m0, [srcq+strideq*0] 1114 vpblendd m2, m3, 0xcc ; 1 3 1 3 1115 vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 1116 vpblendd m2, m1, 0x0f ; 0 2 1 3 1117 vpblendd m3, m0, 0xc0 ; 1 3 2 4 1118 punpcklbw m1, m2, m3 1119 punpckhbw m2, m3 1120 pmaddubsw m1, m6 1121 pmaddubsw m2, m6 1122 mova [tmpq+32*0], m1 1123 mova [tmpq+32*1], m2 1124 add tmpq, 32*2 1125 sub hd, 4 1126 jg .v_w8_loop 1127 RET 1128.v_w16: 1129 vbroadcasti128 m0, [srcq+strideq*0] 1130.v_w16_loop: 1131 vbroadcasti128 m1, [srcq+strideq*1] 1132 vbroadcasti128 m2, [srcq+strideq*2] 1133 vbroadcasti128 m3, [srcq+stride3q ] 1134 lea srcq, [srcq+strideq*4] 1135 shufpd m4, m0, m2, 0x0c ; 0 2 1136 vbroadcasti128 m0, [srcq+strideq*0] 1137 shufpd m1, m3, 0x0c ; 1 3 1138 shufpd m2, m0, 0x0c ; 2 4 1139 punpcklbw m3, m4, m1 1140 punpcklbw m5, m1, m2 1141 punpckhbw m4, m1 1142 punpckhbw m1, m2 1143 pmaddubsw m3, m6 1144 pmaddubsw m5, m6 1145 pmaddubsw m4, m6 1146 pmaddubsw m1, m6 1147 mova [tmpq+32*0], m3 1148 mova [tmpq+32*1], m5 1149 mova [tmpq+32*2], m4 1150 mova [tmpq+32*3], m1 1151 add tmpq, 32*4 1152 sub hd, 4 1153 jg .v_w16_loop 1154 RET 1155.v_w32: 1156 vpermq m0, [srcq+strideq*0], q3120 1157.v_w32_loop: 1158 vpermq m1, [srcq+strideq*1], q3120 1159 vpermq m2, [srcq+strideq*2], q3120 1160 vpermq m3, [srcq+stride3q ], q3120 1161 lea srcq, [srcq+strideq*4] 1162 punpcklbw m4, m0, m1 1163 punpckhbw m5, m0, m1 1164 vpermq m0, [srcq+strideq*0], q3120 1165 pmaddubsw m4, m6 1166 pmaddubsw m5, m6 1167 mova [tmpq+32*0], m4 1168 mova [tmpq+32*1], m5 1169 punpcklbw m4, m1, m2 1170 punpckhbw m1, m2 1171 pmaddubsw m4, m6 1172 pmaddubsw m1, m6 1173 punpcklbw m5, m2, m3 1174 punpckhbw m2, m3 1175 pmaddubsw 
m5, m6 1176 pmaddubsw m2, m6 1177 mova [tmpq+32*2], m4 1178 mova [tmpq+32*3], m1 1179 add tmpq, 32*8 1180 punpcklbw m1, m3, m0 1181 punpckhbw m3, m0 1182 pmaddubsw m1, m6 1183 pmaddubsw m3, m6 1184 mova [tmpq-32*4], m5 1185 mova [tmpq-32*3], m2 1186 mova [tmpq-32*2], m1 1187 mova [tmpq-32*1], m3 1188 sub hd, 4 1189 jg .v_w32_loop 1190 RET 1191.v_w64: 1192 vpermq m0, [srcq+strideq*0+32*0], q3120 1193 vpermq m1, [srcq+strideq*0+32*1], q3120 1194.v_w64_loop: 1195 vpermq m2, [srcq+strideq*1+32*0], q3120 1196 vpermq m3, [srcq+strideq*1+32*1], q3120 1197 lea srcq, [srcq+strideq*2] 1198 punpcklbw m4, m0, m2 1199 punpckhbw m0, m2 1200 pmaddubsw m4, m6 1201 pmaddubsw m0, m6 1202 mova [tmpq+32*0], m4 1203 mova [tmpq+32*1], m0 1204 punpcklbw m4, m1, m3 1205 punpckhbw m5, m1, m3 1206 vpermq m0, [srcq+strideq*0+32*0], q3120 1207 vpermq m1, [srcq+strideq*0+32*1], q3120 1208 pmaddubsw m4, m6 1209 pmaddubsw m5, m6 1210 mova [tmpq+32*2], m4 1211 mova [tmpq+32*3], m5 1212 add tmpq, 32*8 1213 punpcklbw m4, m2, m0 1214 punpckhbw m2, m0 1215 punpcklbw m5, m3, m1 1216 punpckhbw m3, m1 1217 pmaddubsw m4, m6 1218 pmaddubsw m2, m6 1219 pmaddubsw m5, m6 1220 pmaddubsw m3, m6 1221 mova [tmpq-32*4], m4 1222 mova [tmpq-32*3], m2 1223 mova [tmpq-32*2], m5 1224 mova [tmpq-32*1], m3 1225 sub hd, 2 1226 jg .v_w64_loop 1227 RET 1228.v_w128: 1229 lea r6d, [hq+(3<<8)] 1230 mov r3, srcq 1231 mov r5, tmpq 1232.v_w128_loop0: 1233 vpermq m0, [srcq+strideq*0], q3120 1234.v_w128_loop: 1235 vpermq m1, [srcq+strideq*1], q3120 1236 lea srcq, [srcq+strideq*2] 1237 punpcklbw m2, m0, m1 1238 punpckhbw m3, m0, m1 1239 vpermq m0, [srcq+strideq*0], q3120 1240 pmaddubsw m2, m6 1241 pmaddubsw m3, m6 1242 punpcklbw m4, m1, m0 1243 punpckhbw m1, m0 1244 pmaddubsw m4, m6 1245 pmaddubsw m1, m6 1246 mova [tmpq+32*0], m2 1247 mova [tmpq+32*1], m3 1248 mova [tmpq+32*8], m4 1249 mova [tmpq+32*9], m1 1250 add tmpq, 32*16 1251 sub hd, 2 1252 jg .v_w128_loop 1253 add r3, 32 1254 add r5, 64 1255 movzx hd, r6b 1256 mov srcq, r3 
1257 mov tmpq, r5 1258 sub r6d, 1<<8 1259 jg .v_w128_loop0 1260 RET 1261.hv: 1262 ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 1263 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) 1264 WIN64_SPILL_XMM 7 1265 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] 1266 shl mxyd, 11 1267 movd xm6, mxyd 1268 vpbroadcastw m6, xm6 1269 add wq, r6 1270 lea stride3q, [strideq*3] 1271 jmp wq 1272.hv_w4: 1273 vbroadcasti128 m4, [bilin_h_shuf4] 1274 vpbroadcastq m0, [srcq+strideq*0] 1275 pshufb m0, m4 1276 pmaddubsw m0, m5 1277.hv_w4_loop: 1278 movq xm1, [srcq+strideq*1] 1279 movhps xm1, [srcq+strideq*2] 1280 movq xm2, [srcq+stride3q ] 1281 lea srcq, [srcq+strideq*4] 1282 movhps xm2, [srcq+strideq*0] 1283 vinserti128 m1, xm2, 1 1284 pshufb m1, m4 1285 pmaddubsw m1, m5 ; 1 2 3 4 1286 vpblendd m2, m1, m0, 0xc0 1287 vpermq m2, m2, q2103 ; 0 1 2 3 1288 mova m0, m1 1289 psubw m1, m2 1290 pmulhrsw m1, m6 1291 paddw m1, m2 1292 mova [tmpq], m1 1293 add tmpq, 32 1294 sub hd, 4 1295 jg .hv_w4_loop 1296 RET 1297.hv_w8: 1298 vbroadcasti128 m0, [srcq+strideq*0] 1299 pshufb m0, m4 1300 pmaddubsw m0, m5 1301.hv_w8_loop: 1302 movu xm1, [srcq+strideq*1] 1303 vinserti128 m1, [srcq+strideq*2], 1 1304 movu xm2, [srcq+stride3q ] 1305 lea srcq, [srcq+strideq*4] 1306 vinserti128 m2, [srcq+strideq*0], 1 1307 pshufb m1, m4 1308 pshufb m2, m4 1309 pmaddubsw m1, m5 ; 1 2 1310 vperm2i128 m3, m0, m1, 0x21 ; 0 1 1311 pmaddubsw m0, m2, m5 ; 3 4 1312 vperm2i128 m2, m1, m0, 0x21 ; 2 3 1313 psubw m1, m3 1314 pmulhrsw m1, m6 1315 paddw m1, m3 1316 psubw m3, m0, m2 1317 pmulhrsw m3, m6 1318 paddw m3, m2 1319 mova [tmpq+32*0], m1 1320 mova [tmpq+32*1], m3 1321 add tmpq, 32*2 1322 sub hd, 4 1323 jg .hv_w8_loop 1324 RET 1325.hv_w16: 1326 movu xm0, [srcq+strideq*0+8*0] 1327 vinserti128 m0, [srcq+strideq*0+8*1], 1 1328 pshufb m0, m4 1329 pmaddubsw m0, m5 1330.hv_w16_loop: 1331 movu xm1, [srcq+strideq*1+8*0] 1332 vinserti128 m1, [srcq+strideq*1+8*1], 1 1333 lea srcq, 
[srcq+strideq*2] 1334 movu xm2, [srcq+strideq*0+8*0] 1335 vinserti128 m2, [srcq+strideq*0+8*1], 1 1336 pshufb m1, m4 1337 pshufb m2, m4 1338 pmaddubsw m1, m5 1339 psubw m3, m1, m0 1340 pmulhrsw m3, m6 1341 paddw m3, m0 1342 pmaddubsw m0, m2, m5 1343 psubw m2, m0, m1 1344 pmulhrsw m2, m6 1345 paddw m2, m1 1346 mova [tmpq+32*0], m3 1347 mova [tmpq+32*1], m2 1348 add tmpq, 32*2 1349 sub hd, 2 1350 jg .hv_w16_loop 1351 RET 1352.hv_w32: 1353 movu xm0, [srcq+8*0] 1354 vinserti128 m0, [srcq+8*1], 1 1355 movu xm1, [srcq+8*2] 1356 vinserti128 m1, [srcq+8*3], 1 1357 pshufb m0, m4 1358 pshufb m1, m4 1359 pmaddubsw m0, m5 1360 pmaddubsw m1, m5 1361.hv_w32_loop: 1362 add srcq, strideq 1363 movu xm2, [srcq+8*0] 1364 vinserti128 m2, [srcq+8*1], 1 1365 pshufb m2, m4 1366 pmaddubsw m2, m5 1367 psubw m3, m2, m0 1368 pmulhrsw m3, m6 1369 paddw m3, m0 1370 mova m0, m2 1371 movu xm2, [srcq+8*2] 1372 vinserti128 m2, [srcq+8*3], 1 1373 pshufb m2, m4 1374 pmaddubsw m2, m5 1375 mova [tmpq+32*0], m3 1376 psubw m3, m2, m1 1377 pmulhrsw m3, m6 1378 paddw m3, m1 1379 mova m1, m2 1380 mova [tmpq+32*1], m3 1381 add tmpq, 32*2 1382 dec hd 1383 jg .hv_w32_loop 1384 RET 1385.hv_w128: 1386 lea r3d, [hq+(7<<8)] 1387 mov r6d, 256 1388 jmp .hv_w64_start 1389.hv_w64: 1390 lea r3d, [hq+(3<<8)] 1391 mov r6d, 128 1392.hv_w64_start: 1393%if WIN64 1394 PUSH r7 1395%endif 1396 mov r5, srcq 1397 mov r7, tmpq 1398.hv_w64_loop0: 1399 movu xm0, [srcq+strideq*0+8*0] 1400 vinserti128 m0, [srcq+strideq*0+8*1], 1 1401 pshufb m0, m4 1402 pmaddubsw m0, m5 1403.hv_w64_loop: 1404 movu xm1, [srcq+strideq*1+8*0] 1405 vinserti128 m1, [srcq+strideq*1+8*1], 1 1406 lea srcq, [srcq+strideq*2] 1407 movu xm2, [srcq+strideq*0+8*0] 1408 vinserti128 m2, [srcq+strideq*0+8*1], 1 1409 pshufb m1, m4 1410 pshufb m2, m4 1411 pmaddubsw m1, m5 1412 psubw m3, m1, m0 1413 pmulhrsw m3, m6 1414 paddw m3, m0 1415 pmaddubsw m0, m2, m5 1416 psubw m2, m0, m1 1417 pmulhrsw m2, m6 1418 paddw m2, m1 1419 mova [tmpq+r6*0], m3 1420 mova [tmpq+r6*1], m2 
; ---- tail of the bilinear prep .hv_w64/.hv_w128 column loop (the function
; begins above this chunk): r3d packs (columns remaining << 8) | row count,
; r5/r7 hold the saved src/tmp base pointers advanced one column per pass.
    lea tmpq, [tmpq+r6*2]
    sub hd, 2
    jg .hv_w64_loop
    add r5, 16
    add r7, 32
    movzx hd, r3b                  ; reload row count for the next column
    mov srcq, r5
    mov tmpq, r7
    sub r3d, 1<<8                  ; one column done
    jg .hv_w64_loop0
%if WIN64
    POP r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; FN fn, type, type_h, type_v[, jmp_to]
; Emits the public entry point <fn>_<type>_8bpc: loads the packed filter-table
; offsets for the horizontal filter into t0d and the vertical filter into t1d,
; then either falls through (last entry of a group) or tail-jumps into the
; shared implementation <jmp_to>.
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

%define PUT_8TAP_FN FN put_8tap,
; combinations without a SHARP component route to the 6-tap kernel below
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR

; put_6tap_8bpc(dst, ds, src, ss, w, h, mx, my)
; 8bpc motion-compensated put with 6-tap subpel interpolation (narrow widths
; and small filter indices use the 4-tap entries at subpel_filters+2).
; t0d/t1d carry the filter-table offsets loaded by the FN stubs above;
; ns is set to -ssq by the v/hv paths to address rows above the block.
; h paths round with pw_34 then >>6; v paths round via pmulhrsw pw_512;
; hv paths accumulate in dwords and round with pd_512 then >>10.
cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r8, [put_avx2]
    mov wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.put: ; no subpel filtering in either direction: plain block copy
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(put,)]
    add wq, r8
    lea r6, [ssq*3]
    lea r7, [dsq*3]
%if WIN64
    pop r8
%endif
    jmp wq
.h_w2: ; w2/w4 entry; ZF is still live from the caller's "cmp wd, 4"
    movzx mxd, mxb
    lea srcq, [srcq-1]
    vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2]
    je .h_w4
    mova xm3, [subpel_h_shuf4]
.h_w2_loop:
    movq xm0, [srcq+ssq*0]
    movhps xm0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb xm0, xm3
    pmaddubsw xm0, xm4
    phaddw xm0, xm0
    paddw xm0, xm5
    psraw xm0, 6
    packuswb xm0, xm0
    pextrw [dstq+dsq*0], xm0, 0
    pextrw [dstq+dsq*1], xm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    mova xm3, [subpel_h_shufA]
.h_w4_loop:
    movq xm0, [srcq+ssq*0]
    movq xm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb xm0, xm3
    pshufb xm1, xm3
    pmaddubsw xm0, xm4
    pmaddubsw xm1, xm4
    phaddw xm0, xm1
    paddw xm0, xm5
    psraw xm0, 6
    packuswb xm0, xm0
    movd [dstq+dsq*0], xm0
    pextrd [dstq+dsq*1], xm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4_loop
    RET
.h: ; horizontal-only filtering
    test myd, 0xf00
    jnz .hv
    vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
    cmp wd, 4
    jle .h_w2
    WIN64_SPILL_XMM 11
    tzcnt wd, wd
    vbroadcasti128 m4, [z_filter_s+ 2] ; 01
    shr mxd, 16
    vbroadcasti128 m6, [z_filter_s+ 6] ; 23
    sub srcq, 2
    vbroadcasti128 m7, [z_filter_s+10] ; 45
    lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
    movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)]
    vpbroadcastw m8, [mxq+0]
    vpbroadcastw m9, [mxq+2]
    add wq, r8
    vpbroadcastw m10, [mxq+4]
    jmp wq
.h_w8:
; 6-tap horizontal filter of one register of pixels; m4/m6/m7 = pair
; shuffles, m8-m10 = coefficient pairs, m5 = rounding bias.
%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
    pshufb m%2, m%1, m4
    pmaddubsw m%2, m8
    pshufb m%3, m%1, m6
    pmaddubsw m%3, m9
    pshufb m%1, m7
    pmaddubsw m%1, m10
    paddw m%2, m5
    paddw m%1, m%3
    paddw m%1, m%2
    psraw m%1, 6
%endmacro
    movu xm0, [srcq+ssq*0]
    vinserti128 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    PUT_6TAP_H 0, 1, 2
    vextracti128 xm1, m0, 1
    packuswb xm0, xm1
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    movu xm0, [srcq+ssq*0+8*0]
    vinserti128 m0, [srcq+ssq*1+8*0], 1
    movu xm1, [srcq+ssq*0+8*1]
    vinserti128 m1, [srcq+ssq*1+8*1], 1
    PUT_6TAP_H 0, 2, 3
    lea srcq, [srcq+ssq*2]
    PUT_6TAP_H 1, 2, 3
    packuswb m0, m1
    mova [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16
    RET
.h_w32: ; w32/w64/w128 share one loop; r6 = negative column offset
    xor r6d, r6d
    jmp .h_start
.h_w64:
    mov r6, -32*1
    jmp .h_start
.h_w128:
    mov r6, -32*3
.h_start:
    sub srcq, r6
    sub dstq, r6
    mov r4, r6
.h_loop:
    movu m0, [srcq+r6+8*0]
    movu m1, [srcq+r6+8*1]
    PUT_6TAP_H 0, 2, 3
    PUT_6TAP_H 1, 2, 3
    packuswb m0, m1
    mova [dstq+r6], m0
    add r6, 32
    jle .h_loop
    add srcq, ssq
    add dstq, dsq
    mov r6, r4
    dec hd
    jg .h_loop
    RET
.v: ; vertical-only filtering; m5-m7 = coefficient pairs, m8 = pw_512
    WIN64_SPILL_XMM 9, 12
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd ; short blocks use the 4-tap variant
    tzcnt r6d, wd
    movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
    vpbroadcastd m8, [pw_512]
    lea myq, [r8+myq*8+subpel_filters+1-put_avx2]
    vpbroadcastw m5, [myq+0]
    vpbroadcastw m6, [myq+2]
    vpbroadcastw m7, [myq+4]
    add r6, r8
    mov nsq, ssq
    neg nsq
    jmp r6
.v_w2:
    movd xm2, [srcq+nsq*2]
    pinsrw xm2, [srcq+nsq*1], 2
    pinsrw xm2, [srcq+ssq*0], 4
    pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xm0, [srcq+ssq*0]
    palignr xm3, xm0, xm2, 4 ; 1 2 3 4
    punpcklbw xm1, xm2, xm3 ; 01 12
    punpckhbw xm2, xm3 ; 23 34
.v_w2_loop:
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xm3, xm1, xm5 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm6 ; a1 b1
    paddw xm3, xm2
    vpblendd xm2, xm0, xm4, 0x02 ; 4 5
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm4, xm0, 0x02 ; 5 6
    punpcklbw xm2, xm4 ; 45 56
    pmaddubsw xm4, xm2, xm7 ; a2 b2
    paddw xm3, xm4
    pmulhrsw xm3, xm8
    packuswb xm3, xm3
    pextrw [dstq+dsq*0], xm3, 0
    pextrw [dstq+dsq*1], xm3, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd xm2, [srcq+nsq*2]
    pinsrd xm2, [srcq+nsq*1], 1
    pinsrd xm2, [srcq+ssq*0], 2
    pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xm0, [srcq+ssq*0]
    palignr xm3, xm0, xm2, 4 ; 1 2 3 4
    punpcklbw xm1, xm2, xm3 ; 01 12
    punpckhbw xm2, xm3 ; 23 34
.v_w4_loop:
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xm3, xm1, xm5 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm6 ; a1 b1
    paddw xm3, xm2
    vpblendd xm2, xm0, xm4, 0x02 ; 4 5
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm4, xm0, 0x02 ; 5 6
    punpcklbw xm2, xm4 ; 45 56
    pmaddubsw xm4, xm2, xm7 ; a2 b2
    paddw xm3, xm4
    pmulhrsw xm3, xm8
    packuswb xm3, xm3
    movd [dstq+dsq*0], xm3
    pextrd [dstq+dsq*1], xm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq xm1, [srcq+nsq*2]
    vpbroadcastq m3, [srcq+nsq*1]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m1, m3, 0x30
    vpblendd m3, m2, 0x30
    punpcklbw m1, m3 ; 01 12
    vpblendd m2, m4, 0x30
    vpblendd m4, m0, 0x30
    punpcklbw m2, m4 ; 23 34
.v_w8_loop:
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m3, m1, m5 ; a0 b0
    mova m1, m2
    pmaddubsw m2, m6 ; a1 b1
    paddw m3, m2
    vpblendd m2, m0, m4, 0x30
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m4, m0, 0x30
    punpcklbw m2, m4 ; 45 56
    pmaddubsw m4, m2, m7 ; a2 b2
    paddw m3, m4
    pmulhrsw m3, m8
    vextracti128 xm4, m3, 1
    packuswb xm3, xm4
    movq [dstq+dsq*0], xm3
    movhps [dstq+dsq*1], xm3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16: ; wide sizes iterate 16-pixel columns; r6d = (cols << 8) | rows
.v_w32:
.v_w64:
.v_w128:
    lea r6d, [wq*8-128]
    WIN64_PUSH_XMM 12
    lea r6d, [hq+r6*2]
.v_w16_loop0:
    vbroadcasti128 m3, [srcq+nsq*2]
    vbroadcasti128 m4, [srcq+nsq*1]
    lea r4, [srcq+ssq*2]
    vbroadcasti128 m0, [srcq+ssq*0]
    vbroadcasti128 m1, [srcq+ssq*1]
    mov r7, dstq
    vbroadcasti128 m2, [r4+ssq*0]
    shufpd m3, m0, 0x0c
    shufpd m4, m1, 0x0c
    punpcklbw m1, m3, m4 ; 01
    punpckhbw m3, m4 ; 23
    shufpd m0, m2, 0x0c
    punpcklbw m2, m4, m0 ; 12
    punpckhbw m4, m0 ; 34
.v_w16_loop:
    vbroadcasti128 m9, [r4+ssq*1]
    pmaddubsw m10, m1, m5 ; a0
    lea r4, [r4+ssq*2]
    pmaddubsw m11, m2, m5 ; b0
    mova m1, m3
    pmaddubsw m3, m6 ; a1
    mova m2, m4
    pmaddubsw m4, m6 ; b1
    paddw m10, m3
    vbroadcasti128 m3, [r4+ssq*0]
    paddw m11, m4
    shufpd m4, m0, m9, 0x0d
    shufpd m0, m9, m3, 0x0c
    punpcklbw m3, m4, m0 ; 45
    punpckhbw m4, m0 ; 56
    pmaddubsw m9, m3, m7 ; a2
    paddw m10, m9
    pmaddubsw m9, m4, m7 ; b2
    paddw m11, m9
    pmulhrsw m10, m8
    pmulhrsw m11, m8
    packuswb m10, m11
    vpermq m10, m10, q3120
    mova [r7+dsq*0], xm10
    vextracti128 [r7+dsq*1], m10, 1
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    add srcq, 16
    add dstq, 16
    movzx hd, r6b
    sub r6d, 1<<8
    jg .v_w16_loop0
    RET
.hv: ; combined horizontal+vertical filtering
    WIN64_SPILL_XMM 12, 16
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    dec srcq
    vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
    vpbroadcastd m7, [pw_8192]
    punpcklbw m0, m0
    vpbroadcastd m8, [pd_512]
    psraw m0, 8 ; sign-extend
    mov nsq, ssq
    pshufd m9, m0, q0000
    neg nsq
    pshufd m10, m0, q1111
    pshufd m11, m0, q2222
    cmp wd, 4
    je .hv_w4
    vbroadcasti128 m5, [subpel_h_shuf4]
    movq xm2, [srcq+nsq*2]
    movhps xm2, [srcq+nsq*1]
    movq xm0, [srcq+ssq*0]
    movhps xm0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m1, [srcq+ssq*0]
    vpblendd m2, m1, 0x30
    pshufb m2, m5
    pshufb xm0, xm5
    pmaddubsw m2, m6
    pmaddubsw xm0, xm6
    phaddw m2, m0
    pmulhrsw m2, m7
    vextracti128 xm0, m2, 1
    palignr xm0, xm2, 4
    punpcklwd xm1, xm2, xm0 ; 01 12
    punpckhwd xm2, xm0 ; 23 34
.hv_w2_loop:
    movq xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xm4, [srcq+ssq*0]
    pshufb xm4, xm5
    pmaddubsw xm4, xm6
    pmaddwd xm3, xm9, xm1 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm10 ; a1 b1
    phaddw xm4, xm4
    paddd xm3, xm2
    pmulhrsw xm4, xm7
    palignr xm2, xm4, xm0, 12
    mova xm0, xm4
    punpcklwd xm2, xm4 ; 45 56
    pmaddwd xm4, xm11, xm2 ; a2 b2
    paddd xm3, xm8
    paddd xm3, xm4
    psrad xm3, 10
    packssdw xm3, xm3
    packuswb xm3, xm3
    pextrw [dstq+dsq*0], xm3, 0
    pextrw [dstq+dsq*1], xm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova m5, [subpel_h_shuf4]
    vpbroadcastq m2, [srcq+nsq*2]
    vpbroadcastq m4, [srcq+nsq*1]
    vpbroadcastq m1, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m2, m4, 0xcc ; 0 1
    vpblendd m1, m3, 0xcc ; 2 3
    pshufb m2, m5
    pshufb m1, m5
    pshufb m0, m5
    pmaddubsw m2, m6
    pmaddubsw m1, m6
    pmaddubsw m0, m6
    phaddw m2, m1
    phaddw m0, m0
    pmulhrsw m2, m7
    pmulhrsw m0, m7
    palignr m3, m0, m2, 4
    punpcklwd m1, m2, m3 ; 01 12
    punpckhwd m2, m3 ; 23 34
.hv_w4_loop:
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m3, m9, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m10 ; a1 b1
    paddd m3, m2
    vpbroadcastq m2, [srcq+ssq*0]
    vpblendd m4, m2, 0xcc ; 5 6
    pshufb m4, m5
    pmaddubsw m4, m6
    phaddw m4, m4
    pmulhrsw m4, m7
    palignr m2, m4, m0, 12
    mova m0, m4
    punpcklwd m2, m4 ; 45 56
    pmaddwd m4, m11, m2 ; a2 b2
    paddd m3, m8
    paddd m3, m4
    psrad m3, 10
    vextracti128 xm4, m3, 1
    packssdw xm3, xm4
    packuswb xm3, xm3
    pshuflw xm3, xm3, q3120
    movd [dstq+dsq*0], xm3
    pextrd [dstq+dsq*1], xm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8: ; w>=8: iterate 8-pixel columns; r6d = (cols << 8) | rows
    shr mxd, 16
    sub srcq, 2
    lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
    WIN64_PUSH_XMM 16
    vpbroadcastw m10, [mxq+0]
    vpbroadcastw m11, [mxq+2]
    vpbroadcastw m12, [mxq+4]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
    lea r6d, [wq*8-64]
    vbroadcasti128 m8, [z_filter_s+ 6]
    punpcklbw m0, m0
    vbroadcasti128 m9, [z_filter_s+10]
    psraw m0, 8 ; sign-extend
    mov nsq, ssq
    pshufd m13, m0, q0000
    neg nsq
    pshufd m14, m0, q1111
    lea r6d, [hq+r6*4]
    pshufd m15, m0, q2222
.hv_w8_loop0:
    vbroadcasti128 m7, [z_filter_s+2]
    movu xm3, [srcq+nsq*2]
    lea r4, [srcq+ssq*2]
    movu xm4, [srcq+nsq*1]
    vbroadcasti128 m0, [srcq+ssq*0]
    mov r7, dstq
    vinserti128 m4, [srcq+ssq*1], 1 ; 1 3
    vpblendd m3, m0, 0xf0 ; 0 2
    vinserti128 m0, [r4+ssq*0], 1 ; 2 4
    vpbroadcastd m5, [pw_8192]
; 6-tap horizontal pass of one register for the hv path; m10-m12 =
; coefficient pairs, shuffles are passed in so callers can reuse registers.
%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3]
    pshufb %2, %1, %4
    pmaddubsw %2, m10
    pshufb %3, %1, %5
    pmaddubsw %3, m11
    pshufb %1, %6
    pmaddubsw %1, m12
    paddw %2, %3
    paddw %1, %2
%endmacro
    HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9
    HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9
    HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9
    vpermq m3, m3, q3120
    vpermq m4, m4, q3120
    vpermq m0, m0, q3120
    pmulhrsw m3, m5
    pmulhrsw m4, m5
    pmulhrsw m0, m5
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.hv_w8_loop:
    movu xm7, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti128 m7, [r4+ssq*0], 1 ; 5 6
    pmaddwd m5, m13, m1 ; a0
    mova m1, m3
    pmaddwd m6, m13, m2 ; b0
    mova m2, m4
    pmaddwd m3, m14 ; a1
    pmaddwd m4, m14 ; b1
    paddd m5, m3
    vbroadcasti128 m3, [z_filter_s+2]
    paddd m6, m4
    ; m3 doubles as tmp1 and shuf1 here; the shuffle is consumed before
    ; the first write to the temporary.
    HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9
    vpbroadcastd m3, [pw_8192]
    vpbroadcastd m4, [pd_512]
    pmulhrsw m7, m3
    paddd m5, m4
    paddd m6, m4
    mova m4, m0
    vpermq m0, m7, q3120
    shufpd m4, m0, 0x05
    punpcklwd m3, m4, m0 ; 45
    pmaddwd m7, m15, m3 ; a2
    punpckhwd m4, m0 ; 56
    paddd m5, m7
    pmaddwd m7, m15, m4 ; b2
    paddd m6, m7
    psrad m5, 10
    psrad m6, 10
    packssdw m5, m6
    vextracti128 xm6, m5, 1
    packuswb xm5, xm6
    pshufd xm5, xm5, q3120
    movq [r7+dsq*0], xm5
    movhps [r7+dsq*1], xm5
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 8
    add dstq, 8
    movzx hd, r6b
    sub r6d, 1<<8
    jg .hv_w8_loop0
    RET

; combinations with a SHARP component need the full 8-tap kernel
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc
PUT_8TAP_FN sharp, SHARP, SHARP

; put_8tap_8bpc(dst, ds, src, ss, w, h, mx, my)
; 8bpc motion-compensated put with 8-tap subpel interpolation; shares its
; unfiltered copy path and narrow horizontal paths with put_6tap_8bpc.
; ss3 = 3*ssq is used to reach the three rows above the block.
cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r8, [put_avx2]
    movsxd wq, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put
.v: ; vertical 8-tap; m8-m11 = coefficient pairs, m7 = pw_512
    WIN64_SPILL_XMM 12, 15
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    tzcnt r6d, wd
    movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
    vpbroadcastd m7, [pw_512]
    lea myq, [r8+myq*8+subpel_filters-put_avx2]
    vpbroadcastw m8, [myq+0]
    vpbroadcastw m9, [myq+2]
    vpbroadcastw m10, [myq+4]
    vpbroadcastw m11, [myq+6]
    add r6, r8
    lea ss3q, [ssq*3]
    sub srcq, ss3q
    jmp r6
.v_w2:
    movd xm2, [srcq+ssq*0]
    ; (continuation of put_8tap .v_w2: gather rows 0-6 into word pairs)
    pinsrw xm2, [srcq+ssq*1], 2
    pinsrw xm2, [srcq+ssq*2], 4
    add srcq, ss3q
    pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
    movd xm3, [srcq+ssq*1]
    vpbroadcastd xm1, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm3, xm1, 0x02 ; 4 5
    vpblendd xm1, xm0, 0x02 ; 5 6
    palignr xm4, xm3, xm2, 4 ; 1 2 3 4
    punpcklbw xm3, xm1 ; 45 56
    punpcklbw xm1, xm2, xm4 ; 01 12
    punpckhbw xm2, xm4 ; 23 34
.v_w2_loop:
    pmaddubsw xm5, xm1, xm8 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm9 ; a1 b1
    paddw xm5, xm2
    mova xm2, xm3
    pmaddubsw xm3, xm10 ; a2 b2
    paddw xm5, xm3
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd xm3, xm0, xm4, 0x02 ; 6 7
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm4, xm0, 0x02 ; 7 8
    punpcklbw xm3, xm4 ; 67 78
    pmaddubsw xm4, xm3, xm11 ; a3 b3
    paddw xm5, xm4
    pmulhrsw xm5, xm7
    packuswb xm5, xm5
    pextrw [dstq+dsq*0], xm5, 0
    pextrw [dstq+dsq*1], xm5, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd xm2, [srcq+ssq*0]
    pinsrd xm2, [srcq+ssq*1], 1
    pinsrd xm2, [srcq+ssq*2], 2
    add srcq, ss3q
    pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
    movd xm3, [srcq+ssq*1]
    vpbroadcastd xm1, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm3, xm1, 0x02 ; 4 5
    vpblendd xm1, xm0, 0x02 ; 5 6
    palignr xm4, xm3, xm2, 4 ; 1 2 3 4
    punpcklbw xm3, xm1 ; 45 56
    punpcklbw xm1, xm2, xm4 ; 01 12
    punpckhbw xm2, xm4 ; 23 34
.v_w4_loop:
    pmaddubsw xm5, xm1, xm8 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm9 ; a1 b1
    paddw xm5, xm2
    mova xm2, xm3
    pmaddubsw xm3, xm10 ; a2 b2
    paddw xm5, xm3
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd xm3, xm0, xm4, 0x02 ; 6 7
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm4, xm0, 0x02 ; 7 8
    punpcklbw xm3, xm4 ; 67 78
    pmaddubsw xm4, xm3, xm11 ; a3 b3
    paddw xm5, xm4
    pmulhrsw xm5, xm7
    packuswb xm5, xm5
    movd [dstq+dsq*0], xm5
    pextrd [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq xm1, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    vpbroadcastq m2, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m5, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vpbroadcastq m6, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m1, m4, 0x30
    vpblendd m4, m2, 0x30
    punpcklbw m1, m4 ; 01 12
    vpblendd m2, m5, 0x30
    vpblendd m5, m3, 0x30
    punpcklbw m2, m5 ; 23 34
    vpblendd m3, m6, 0x30
    vpblendd m6, m0, 0x30
    punpcklbw m3, m6 ; 45 56
.v_w8_loop:
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m5, m1, m8 ; a0 b0
    mova m1, m2
    pmaddubsw m2, m9 ; a1 b1
    paddw m5, m2
    mova m2, m3
    pmaddubsw m3, m10 ; a2 b2
    paddw m5, m3
    vpblendd m3, m0, m4, 0x30
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m4, m0, 0x30
    punpcklbw m3, m4 ; 67 78
    pmaddubsw m4, m3, m11 ; a3 b3
    paddw m5, m4
    pmulhrsw m5, m7
    vextracti128 xm4, m5, 1
    packuswb xm5, xm4
    movq [dstq+dsq*0], xm5
    movhps [dstq+dsq*1], xm5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16: ; wide sizes iterate 16-pixel columns; r6d = (cols << 8) | rows
.v_w32:
.v_w64:
.v_w128:
    lea r6d, [wq*8-128]
    WIN64_PUSH_XMM 15
    lea r6d, [hq+r6*2]
.v_w16_loop0:
    vbroadcasti128 m4, [srcq+ssq*0]
    vbroadcasti128 m5, [srcq+ssq*1]
    lea r4, [srcq+ss3q]
    vbroadcasti128 m6, [srcq+ssq*2]
    vbroadcasti128 m0, [r4+ssq*0]
    mov r7, dstq
    vbroadcasti128 m1, [r4+ssq*1]
    vbroadcasti128 m2, [r4+ssq*2]
    add r4, ss3q
    vbroadcasti128 m3, [r4+ssq*0]
    shufpd m4, m0, 0x0c
    shufpd m5, m1, 0x0c
    punpcklbw m1, m4, m5 ; 01
    punpckhbw m4, m5 ; 34
    shufpd m6, m2, 0x0c
    punpcklbw m2, m5, m6 ; 12
    punpckhbw m5, m6 ; 45
    shufpd m0, m3, 0x0c
    punpcklbw m3, m6, m0 ; 23
    punpckhbw m6, m0 ; 56
.v_w16_loop:
    vbroadcasti128 m12, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    pmaddubsw m13, m1, m8 ; a0
    pmaddubsw m14, m2, m8 ; b0
    mova m1, m3
    mova m2, m4
    pmaddubsw m3, m9 ; a1
    pmaddubsw m4, m9 ; b1
    paddw m13, m3
    paddw m14, m4
    mova m3, m5
    mova m4, m6
    pmaddubsw m5, m10 ; a2
    pmaddubsw m6, m10 ; b2
    paddw m13, m5
    vbroadcasti128 m5, [r4+ssq*0]
    paddw m14, m6
    shufpd m6, m0, m12, 0x0d
    shufpd m0, m12, m5, 0x0c
    punpcklbw m5, m6, m0 ; 67
    punpckhbw m6, m0 ; 78
    pmaddubsw m12, m5, m11 ; a3
    paddw m13, m12
    pmaddubsw m12, m6, m11 ; b3
    paddw m14, m12
    pmulhrsw m13, m7
    pmulhrsw m14, m7
    packuswb m13, m14
    vpermq m13, m13, q3120
    mova [r7+dsq*0], xm13
    vextracti128 [r7+dsq*1], m13, 1
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    add srcq, 16
    add dstq, 16
    movzx hd, r6b
    sub r6d, 1<<8
    jg .v_w16_loop0
    RET
.h: ; horizontal 8-tap; w<=4 reuses the shared 4-tap path in put_6tap
.h_w2:
.h_w4:
    test myd, 0xf00
    jnz .hv
    vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
    cmp wd, 4
    jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2
    WIN64_SPILL_XMM 11
    tzcnt wd, wd
    vbroadcasti128 m6, [subpel_h_shufA]
    shr mxd, 16
    vbroadcasti128 m7, [subpel_h_shufB]
    sub srcq, 3
    vbroadcasti128 m8, [subpel_h_shufC]
    movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
    vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
    vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
    add wq, r8
    jmp wq
.h_w8:
; 8-tap horizontal filter of one register of pixels; m6-m8 = pair
; shuffles, m9/m10 = coefficient quads, m5 = rounding bias.
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
    pshufb m%2, m%1, m7
    pshufb m%3, m%1, m8
    pshufb m%1, m6
    pmaddubsw m%4, m%2, m9
    pmaddubsw m%2, m10
    pmaddubsw m%3, m10
    pmaddubsw m%1, m9
    paddw m%3, m%4
    paddw m%1, m%2
    phaddw m%1, m%3
    paddw m%1, m5
    psraw m%1, 6
%endmacro
    movu xm0, [srcq+ssq*0]
    vinserti128 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    PUT_8TAP_H 0, 1, 2, 3
    vextracti128 xm1, m0, 1
    packuswb xm0, xm1
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    movu xm0, [srcq+ssq*0+8*0]
    vinserti128 m0, [srcq+ssq*1+8*0], 1
    movu xm1, [srcq+ssq*0+8*1]
    vinserti128 m1, [srcq+ssq*1+8*1], 1
    PUT_8TAP_H 0, 2, 3, 4
    lea srcq, [srcq+ssq*2]
    PUT_8TAP_H 1, 2, 3, 4
    packuswb m0, m1
    mova [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16
    RET
.h_w32: ; w32/w64/w128 share one loop; r6 = negative column offset
    xor r6d, r6d
    jmp .h_start
.h_w64:
    mov r6, -32*1
    jmp .h_start
.h_w128:
    mov r6, -32*3
.h_start:
    sub srcq, r6
    sub dstq, r6
    mov r4, r6
.h_loop:
    movu m0, [srcq+r6+8*0]
    movu m1, [srcq+r6+8*1]
    PUT_8TAP_H 0, 2, 3, 4
    PUT_8TAP_H 1, 2, 3, 4
    packuswb m0, m1
    mova [dstq+r6], m0
    add r6, 32
    jle .h_loop
    add srcq, ssq
    add dstq, dsq
    mov r6, r4
    dec hd
    jg .h_loop
    RET
.hv: ; combined 8-tap horizontal+vertical filtering
    WIN64_SPILL_XMM 14, 16
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    dec srcq
    vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
    lea ss3q, [ssq*3]
    sub srcq, ss3q
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    vpbroadcastd m8, [pw_8192]
    vpbroadcastd m9, [pd_512]
    pshufd m10, m0, q0000
    pshufd m11, m0, q1111
    pshufd m12, m0, q2222
    pshufd m13, m0, q3333
    cmp wd, 4
    je .hv_w4
    vbroadcasti128 m6, [subpel_h_shuf4]
    movq xm2, [srcq+ssq*0]
    movhps xm2, [srcq+ssq*1]
    movq xm0, [srcq+ssq*2]
    add srcq, ss3q
    movhps xm0, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vpbroadcastq m4, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m1, [srcq+ssq*0]
    vpblendd m2, m3, 0x30
    vpblendd m0, m1, 0x30
    vpblendd m2, m4, 0xc0
    pshufb m2, m6
    pshufb m0, m6
    pmaddubsw m2, m7
    pmaddubsw m0, m7
    phaddw m2, m0
    pmulhrsw m2, m8
    vextracti128 xm3, m2, 1
    palignr xm4, xm3, xm2, 4
    punpcklwd xm1, xm2, xm4 ; 01 12
    punpckhwd xm2, xm4 ; 23 34
    pshufd xm0, xm3, q2121
    punpcklwd xm3, xm0 ; 45 56
.hv_w2_loop:
    movq xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xm4, [srcq+ssq*0]
    pshufb xm4, xm6
    pmaddubsw xm4, xm7
    pmaddwd xm5, xm1, xm10 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm11 ; a1 b1
    paddd xm5, xm2
    mova xm2, xm3
    pmaddwd xm3, xm12 ; a2 b2
    phaddw xm4, xm4
    pmulhrsw xm4, xm8
    paddd xm5, xm3
    palignr xm3, xm4, xm0, 12
    mova xm0, xm4
    punpcklwd xm3, xm0 ; 67 78
    pmaddwd xm4, xm3, xm13 ; a3 b3
    paddd xm5, xm9
    paddd xm5, xm4
    psrad xm5, 10
    packssdw xm5, xm5
    packuswb xm5, xm5
    pextrw [dstq+dsq*0], xm5, 0
    pextrw [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova m6, [subpel_h_shuf4]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    vpbroadcastq m0, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m5, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vpblendd m2, m4, 0xcc ; 0 1
    vpbroadcastq m4, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m1, [srcq+ssq*0]
    vpblendd m0, m5, 0xcc ; 2 3
    vpblendd m3, m4, 0xcc ; 4 5
    pshufb m2, m6
    pshufb m0, m6
    pshufb m3, m6
    pshufb m1, m6
    pmaddubsw m2, m7
    pmaddubsw m0, m7
    pmaddubsw m3, m7
    pmaddubsw m1, m7
    phaddw m2, m0
    phaddw m3, m1
    pmulhrsw m2, m8
    pmulhrsw m3, m8
    palignr m4, m3, m2, 4
    punpcklwd m1, m2, m4 ; 01 12
    punpckhwd m2, m4 ; 23 34
    pshufd m0, m3, q2121
    punpcklwd m3, m0 ; 45 56
.hv_w4_loop:
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m5, m1, m10 ; a0 b0
    mova m1, m2
    pmaddwd m2, m11 ; a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, m12 ; a2 b2
    paddd m5, m3
    vpbroadcastq m3, [srcq+ssq*0]
    vpblendd m4, m3, 0xcc ; 7 8
    pshufb m4, m6
    pmaddubsw m4, m7
    phaddw m4, m4
    pmulhrsw m4, m8
    palignr m3, m4, m0, 12
    mova m0, m4
    punpcklwd m3, m0 ; 67 78
    pmaddwd m4, m3, m13 ; a3 b3
    paddd m5, m9
    paddd m5, m4
    psrad m5, 10
    vextracti128 xm4, m5, 1
    packssdw xm5, xm4
    packuswb xm5, xm5
    pshuflw xm5, xm5, q3120
    movd [dstq+dsq*0], xm5
    pextrd [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8: ; w>=8: iterate 8-pixel columns; r6d = (cols << 8) | rows
    WIN64_PUSH_XMM 16
    shr mxd, 16
    sub srcq, 3
    vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
    vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
    lea ss3q, [ssq*3]
    sub srcq, ss3q
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m12, m0, q0000
    pshufd m13, m0, q1111
    pshufd m14, m0, q2222
    pshufd m15, m0, q3333
    lea r6d, [wq*8-64]
    lea r6d, [hq+r6*4]
.hv_w8_loop0:
    vbroadcasti128 m7, [subpel_h_shufA]
    movu xm4, [srcq+ssq*0]
    lea r4, [srcq+ss3q]
    vbroadcasti128 m8, [subpel_h_shufB]
    movu xm5, [srcq+ssq*1]
    mov r7, dstq
    vbroadcasti128 m9, [subpel_h_shufC]
    movu xm6, [srcq+ssq*2]
    vbroadcasti128 m0, [r4+ssq*0]
    vpblendd m4, m0, 0xf0 ; 0 3
    vinserti128 m5, [r4+ssq*1], 1 ; 1 4
    vinserti128 m6, [r4+ssq*2], 1 ; 2 5
    add r4, ss3q
    vinserti128 m0, [r4+ssq*0], 1 ; 3 6
; 8-tap horizontal pass of one register for the hv path; m10/m11 =
; coefficient quads, shuffles passed in so callers can reuse registers.
%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
    pshufb %3, %1, %6
    pshufb %4, %1, %7
    pshufb %1, %5
    pmaddubsw %2, %3, m10
    pmaddubsw %4, m11
    pmaddubsw %3, m11
    pmaddubsw %1, m10
    paddw %2, %4
    paddw %1, %3
    phaddw %1, %2
%endmacro
    HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9
    vpbroadcastd m7, [pw_8192]
    vpermq m4, m4, q3120
    vpermq m5, m5, q3120
    vpermq m6, m6, q3120
    pmulhrsw m0, m7
    pmulhrsw m4, m7
    pmulhrsw m5, m7
    pmulhrsw m6, m7
    vpermq m7, m0, q3120
    punpcklwd m1, m4, m5 ; 01
    punpckhwd m4, m5 ; 34
    punpcklwd m2, m5, m6 ; 12
    punpckhwd m5, m6 ; 45
    punpcklwd m3, m6, m7 ; 23
    punpckhwd m6, m7 ; 56
.hv_w8_loop:
    vextracti128 r6m, m0, 1 ; not enough registers
    movu xm0, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti128 m0, [r4+ssq*0], 1 ; 7 8
    pmaddwd m8, m1, m12 ; a0
    pmaddwd m9, m2, m12 ; b0
    mova m1, m3
    mova m2, m4
    pmaddwd m3, m13 ; a1
    pmaddwd m4, m13 ; b1
    paddd m8, m3
    paddd m9, m4
    mova m3, m5
    mova m4, m6
    pmaddwd m5, m14 ; a2
    pmaddwd m6, m14 ; b2
    paddd m8, m5
    paddd m9, m6
    vbroadcasti128 m6, [subpel_h_shufB]
    vbroadcasti128 m7, [subpel_h_shufC]
    vbroadcasti128 m5, [subpel_h_shufA]
    HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7
    vpbroadcastd m5, [pw_8192]
    vpbroadcastd m7, [pd_512]
    vbroadcasti128 m6, r6m
    pmulhrsw m0, m5
    paddd m8, m7
    paddd m9, m7
    vpermq m7, m0, q3120 ; 7 8
    shufpd m6, m7, 0x04 ; 6 7
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7 ; 78
    pmaddwd m7, m5, m15 ; a3
    paddd m8, m7
    pmaddwd m7, m6, m15 ; b3
    paddd m7, m9
    psrad m8, 10
    psrad m7, 10
    packssdw m8, m7
    vextracti128 xm7, m8, 1
    packuswb xm8, xm7
    pshufd xm7, xm8, q3120
    movq [r7+dsq*0], xm7
    movhps [r7+dsq*1], xm7
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
add srcq, 8 2590 add dstq, 8 2591 movzx hd, r6b 2592 sub r6d, 1<<8 2593 jg .hv_w8_loop0 2594 RET 2595 2596%if WIN64 2597DECLARE_REG_TMP 6, 4 2598%else 2599DECLARE_REG_TMP 6, 7 2600%endif 2601 2602%define PREP_8TAP_FN FN prep_8tap, 2603PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc 2604PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc 2605PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc 2606PREP_8TAP_FN regular, REGULAR, REGULAR 2607 2608cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my, ns 2609 imul mxd, mxm, 0x010101 2610 add mxd, t0d ; 6tap_h, mx, 4tap_h 2611 imul myd, mym, 0x010101 2612 add myd, t1d ; 6tap_v, my, 4tap_v 2613 lea r7, [prep%+SUFFIX] 2614 mov wd, wm 2615 movifnidn hd, hm 2616 test mxd, 0xf00 2617 jnz .h 2618 test myd, 0xf00 2619 jnz .v 2620.prep: 2621 tzcnt wd, wd 2622 movzx wd, word [r7+wq*2+table_offset(prep,)] 2623 add wq, r7 2624 lea r6, [ssq*3] 2625%if WIN64 2626 pop r7 2627%endif 2628 jmp wq 2629.v: 2630 WIN64_SPILL_XMM 10, 12 2631 movzx mxd, myb 2632 shr myd, 16 2633 cmp hd, 4 2634 cmove myd, mxd 2635 lea myq, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] 2636 vpbroadcastd m9, [pw_8192] 2637 vpbroadcastw m6, [myq+0] 2638 mov nsq, ssq 2639 vpbroadcastw m7, [myq+2] 2640 neg nsq 2641 vpbroadcastw m8, [myq+4] 2642 cmp wd, 8 2643 jg .v_w16 2644 je .v_w8 2645.v_w4: 2646 movd xm2, [srcq+nsq*2] 2647 pinsrd xm2, [srcq+nsq*1], 1 2648 vpbroadcastd m1, [srcq+ssq*0] 2649 vpbroadcastd m3, [srcq+ssq*1] 2650 vpbroadcastd m0, [srcq+ssq*2] 2651 vbroadcasti128 m5, [deint_shuf4] 2652 vpblendd m1, m2, 0xeb 2653 punpcklqdq m3, m0 2654 vpblendd m1, m3, 0x60 ; 0 1 2 _ 2 3 4 _ 2655 pshufb m1, m5 ; 01 12 23 34 2656.v_w4_loop: 2657 lea srcq, [srcq+ssq*4] 2658 pinsrd xm0, [srcq+nsq*1], 1 2659 vpbroadcastd m2, [srcq+ssq*0] 2660 vpbroadcastd m3, [srcq+ssq*1] 2661 vpblendd m2, m0, 0xeb 2662 vpbroadcastd m0, [srcq+ssq*2] 2663 punpcklqdq m3, m0 2664 vpblendd m2, m3, 0x60 ; 4 5 6 _ 6 7 8 _ 2665 pshufb m2, m5 ; 45 56 67 78 2666 pmaddubsw m3, 
m1, m6 ; a0 b0 c0 d0 2667 vperm2i128 m1, m2, 0x21 ; 23 34 45 56 2668 pmaddubsw m4, m2, m8 ; a2 b2 c2 d2 2669 pmaddubsw m1, m7 ; a1 b1 c1 d1 2670 paddw m3, m4 2671 paddw m3, m1 2672 pmulhrsw m3, m9 2673 mova m1, m2 2674 mova [tmpq], m3 2675 add tmpq, 32 2676 sub hd, 4 2677 jg .v_w4_loop 2678 RET 2679.v_w8: 2680 movq xm1, [srcq+nsq*2] 2681 vpbroadcastq m3, [srcq+nsq*1] 2682 vpbroadcastq m2, [srcq+ssq*0] 2683 vpbroadcastq m4, [srcq+ssq*1] 2684 vpbroadcastq m0, [srcq+ssq*2] 2685 vpblendd m1, m3, 0x30 2686 vpblendd m3, m2, 0x30 2687 punpcklbw m1, m3 ; 01 12 2688 vpblendd m2, m4, 0x30 2689 vpblendd m4, m0, 0x30 2690 punpcklbw m2, m4 ; 23 34 2691.v_w8_loop: 2692 lea srcq, [srcq+ssq*4] 2693 pmaddubsw m1, m6 ; a0 2694 vpbroadcastq m3, [srcq+nsq*1] 2695 pmaddubsw m4, m2, m7 ; a1 2696 pmaddubsw m5, m2, m6 ; b0 2697 vpbroadcastq m2, [srcq+ssq*0] 2698 vpblendd m0, m3, 0x30 2699 vpblendd m3, m2, 0x30 2700 paddw m4, m1 2701 punpcklbw m1, m0, m3 ; 45 56 2702 vpbroadcastq m3, [srcq+ssq*1] 2703 vpbroadcastq m0, [srcq+ssq*2] 2704 vpblendd m2, m3, 0x30 2705 vpblendd m3, m0, 0x30 2706 punpcklbw m2, m3 ; 67 78 2707 pmaddubsw m3, m1, m7 ; b1 2708 paddw m5, m3 2709 pmaddubsw m3, m1, m8 ; a2 2710 paddw m4, m3 2711 pmaddubsw m3, m2, m8 ; b2 2712 paddw m5, m3 2713 pmulhrsw m4, m9 2714 pmulhrsw m5, m9 2715 mova [tmpq+32*0], m4 2716 mova [tmpq+32*1], m5 2717 add tmpq, 32*2 2718 sub hd, 4 2719 jg .v_w8_loop 2720 RET 2721.v_w16: 2722 lea r6d, [wq*2-32] 2723 lea srcq, [srcq+nsq*2] 2724 WIN64_PUSH_XMM 12 2725 lea r6d, [hq+r6*8] 2726.v_w16_loop0: 2727 vbroadcasti128 m3, [srcq+ssq*0] 2728 lea r5, [srcq+ssq*2] 2729 vbroadcasti128 m4, [srcq+ssq*1] 2730 mov r7, tmpq 2731 vbroadcasti128 m0, [r5+ssq*0] 2732 vbroadcasti128 m1, [r5+ssq*1] 2733 lea r5, [r5+ssq*2] 2734 vbroadcasti128 m2, [r5+ssq*0] 2735 shufpd m3, m0, 0x0c 2736 shufpd m4, m1, 0x0c 2737 punpcklbw m1, m3, m4 ; 01 2738 punpckhbw m3, m4 ; 23 2739 shufpd m0, m2, 0x0c 2740 punpcklbw m2, m4, m0 ; 12 2741 punpckhbw m4, m0 ; 34 2742.v_w16_loop: 2743 
vbroadcasti128 m5, [r5+ssq*1] 2744 pmaddubsw m10, m1, m6 ; a0 2745 lea r5, [r5+ssq*2] 2746 pmaddubsw m11, m2, m6 ; b0 2747 mova m1, m3 2748 pmaddubsw m3, m7 ; a1 2749 mova m2, m4 2750 pmaddubsw m4, m7 ; b1 2751 paddw m10, m3 2752 vbroadcasti128 m3, [r5+ssq*0] 2753 paddw m11, m4 2754 shufpd m4, m0, m5, 0x0d 2755 shufpd m0, m5, m3, 0x0c 2756 punpcklbw m3, m4, m0 ; 45 2757 punpckhbw m4, m0 ; 56 2758 pmaddubsw m5, m3, m8 ; a2 2759 paddw m10, m5 2760 pmaddubsw m5, m4, m8 ; b2 2761 paddw m11, m5 2762 pmulhrsw m10, m9 2763 pmulhrsw m11, m9 2764 mova [r7+wq*0], m10 2765 mova [r7+wq*2], m11 2766 lea r7, [r7+wq*4] 2767 sub hd, 2 2768 jg .v_w16_loop 2769 add srcq, 16 2770 add tmpq, 32 2771 movzx hd, r6b 2772 sub r6d, 1<<8 2773 jg .v_w16_loop0 2774 RET 2775.h_w4: 2776 RESET_STACK_STATE 2777 movzx mxd, mxb 2778 vbroadcasti128 m3, [subpel_h_shufA] 2779 dec srcq 2780 vpbroadcastd m5, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] 2781 lea r3, [ssq*3] 2782.h_w4_loop: 2783 movq xm0, [srcq+ssq*0] 2784 vpbroadcastq m2, [srcq+ssq*2] 2785 movq xm1, [srcq+ssq*1] 2786 vpblendd m0, m2, 0x30 2787 vpbroadcastq m2, [srcq+r3 ] 2788 lea srcq, [srcq+ssq*4] 2789 vpblendd m1, m2, 0x30 2790 pshufb m0, m3 2791 pshufb m1, m3 2792 pmaddubsw m0, m5 2793 pmaddubsw m1, m5 2794 phaddw m0, m1 2795 pmulhrsw m0, m4 2796 mova [tmpq], m0 2797 add tmpq, 32 2798 sub hd, 4 2799 jg .h_w4_loop 2800 RET 2801.h: 2802 test myd, 0xf00 2803 jnz .hv 2804 vpbroadcastd m4, [pw_8192] 2805 cmp wd, 4 2806 je .h_w4 2807 WIN64_SPILL_XMM 10 2808 tzcnt wd, wd 2809 vbroadcasti128 m3, [z_filter_s+ 2] 2810 shr mxd, 16 2811 vbroadcasti128 m5, [z_filter_s+ 6] 2812 sub srcq, 2 2813 vbroadcasti128 m6, [z_filter_s+10] 2814 lea mxq, [r7+mxq*8+subpel_filters+1-prep%+SUFFIX] 2815 movzx wd, word [r7+wq*2+table_offset(prep, _6tap_h)] 2816 vpbroadcastw m7, [mxq+0] 2817 vpbroadcastw m8, [mxq+2] 2818 add wq, r7 2819 vpbroadcastw m9, [mxq+4] 2820 jmp wq 2821.h_w8: 2822 movu xm0, [srcq+ssq*0] 2823 vinserti128 m0, [srcq+ssq*1], 1 2824 lea srcq, 
[srcq+ssq*2] 2825%macro PREP_6TAP_H 0 2826 pshufb m1, m0, m3 2827 pmaddubsw m1, m7 2828 pshufb m2, m0, m5 2829 pmaddubsw m2, m8 2830 pshufb m0, m6 2831 pmaddubsw m0, m9 2832 paddw m1, m2 2833 paddw m0, m1 2834 pmulhrsw m0, m4 2835%endmacro 2836 PREP_6TAP_H 2837 mova [tmpq], m0 2838 add tmpq, 32 2839 sub hd, 2 2840 jg .h_w8 2841 RET 2842.h_w16: 2843 movu xm0, [srcq+ssq*0+8*0] 2844 vinserti128 m0, [srcq+ssq*0+8*1], 1 2845 PREP_6TAP_H 2846 mova [tmpq+32*0], m0 2847 movu xm0, [srcq+ssq*1+8*0] 2848 vinserti128 m0, [srcq+ssq*1+8*1], 1 2849 lea srcq, [srcq+ssq*2] 2850 PREP_6TAP_H 2851 mova [tmpq+32*1], m0 2852 add tmpq, 32*2 2853 sub hd, 2 2854 jg .h_w16 2855 RET 2856.h_w32: 2857 xor r6d, r6d 2858 jmp .h_start 2859.h_w64: 2860 mov r6, -32*1 2861 jmp .h_start 2862.h_w128: 2863 mov r6, -32*3 2864.h_start: 2865 sub srcq, r6 2866 mov r5, r6 2867.h_loop: 2868 movu xm0, [srcq+r6+8*0] 2869 vinserti128 m0, [srcq+r6+8*1], 1 2870 PREP_6TAP_H 2871 mova [tmpq+32*0], m0 2872 movu xm0, [srcq+r6+8*2] 2873 vinserti128 m0, [srcq+r6+8*3], 1 2874 PREP_6TAP_H 2875 mova [tmpq+32*1], m0 2876 add tmpq, 32*2 2877 add r6, 32 2878 jle .h_loop 2879 add srcq, ssq 2880 mov r6, r5 2881 dec hd 2882 jg .h_loop 2883 RET 2884.hv: 2885 WIN64_SPILL_XMM 14, 16 2886 cmp wd, 4 2887 jne .hv_w8 2888.hv_w4: 2889 movzx mxd, mxb 2890 dec srcq 2891 vpbroadcastd m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] 2892 movzx mxd, myb 2893 shr myd, 16 2894 cmp hd, 4 2895 cmove myd, mxd 2896 mova m6, [subpel_h_shuf4] 2897 vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep%+SUFFIX] 2898 mov nsq, ssq 2899 pmovzxbd m13, [deint_shuf4] 2900 neg nsq 2901 vpbroadcastd m8, [pw_8192] 2902 vpbroadcastd m9, [pd_32] 2903 punpcklbw m0, m0 2904 vpbroadcastq m2, [srcq+nsq*2] 2905 psraw m0, 8 ; sign-extend 2906 vpbroadcastq m4, [srcq+nsq*1] 2907 pshufd m10, m0, q0000 2908 vpbroadcastq m1, [srcq+ssq*0] 2909 pshufd m11, m0, q1111 2910 vpbroadcastq m3, [srcq+ssq*1] 2911 pshufd m12, m0, q2222 2912 vpbroadcastq m0, [srcq+ssq*2] 2913 vpblendd m2, m4, 
0xcc ; 0 1 2914 vpblendd m1, m3, 0xcc ; 2 3 2915 pshufb m2, m6 2916 pshufb m1, m6 2917 pshufb m0, m6 2918 pmaddubsw m2, m7 2919 pmaddubsw m1, m7 2920 pmaddubsw m0, m7 2921 phaddw m2, m1 ; 0 1 2 3 2922 phaddw m0, m0 ; 4 2923 pmulhrsw m2, m8 2924 pmulhrsw m0, m8 2925 palignr m0, m2, 4 2926 punpcklwd m1, m2, m0 ; 01 12 2927 punpckhwd m2, m0 ; 23 34 2928.hv_w4_loop: 2929 pmaddwd m4, m10, m1 ; a0 b0 2930 lea srcq, [srcq+ssq*4] 2931 pmaddwd m5, m2, m10 ; c0 d0 2932 vpbroadcastq m1, [srcq+nsq*1] 2933 pmaddwd m2, m11 ; a1 b1 2934 vpbroadcastq m3, [srcq+ssq*0] 2935 paddd m4, m2 2936 vpbroadcastq m2, [srcq+ssq*1] 2937 vpblendd m1, m3, 0xcc ; 5 6 2938 vpbroadcastq m3, [srcq+ssq*2] 2939 vpblendd m2, m3, 0xcc ; 7 8 2940 pshufb m1, m6 2941 pshufb m2, m6 2942 pmaddubsw m1, m7 2943 pmaddubsw m2, m7 2944 phaddw m1, m2 ; 5 6 7 8 2945 pmulhrsw m1, m8 2946 paddd m5, m9 2947 paddd m4, m9 2948 palignr m2, m1, m0, 12 2949 mova m0, m1 2950 punpcklwd m1, m2, m0 ; 45 56 2951 punpckhwd m2, m0 ; 67 78 2952 pmaddwd m3, m11, m1 ; c1 d1 2953 paddd m5, m3 2954 pmaddwd m3, m12, m1 ; a2 b2 2955 paddd m4, m3 2956 pmaddwd m3, m12, m2 ; c2 d2 2957 paddd m5, m3 2958 psrad m4, 6 2959 psrad m5, 6 2960 packssdw m4, m5 2961 vpermd m4, m13, m4 2962 mova [tmpq], m4 2963 add tmpq, 32 2964 sub hd, 4 2965 jg .hv_w4_loop 2966 RET 2967.hv_w8: 2968 shr mxd, 16 2969 lea mxq, [r7+mxq*8+subpel_filters+1-prep_avx2] 2970 WIN64_PUSH_XMM 16 2971 vpbroadcastw m10, [mxq+0] 2972 vpbroadcastw m11, [mxq+2] 2973 vpbroadcastw m12, [mxq+4] 2974 movzx mxd, myb 2975 shr myd, 16 2976 cmp hd, 6 2977 cmovs myd, mxd 2978 vpbroadcastq m0, [r7+myq*8+subpel_filters+1-prep_avx2] 2979 lea r7, [ssq*2+2] 2980 vbroadcasti128 m8, [z_filter_s+ 6] 2981 punpcklbw m0, m0 2982 vbroadcasti128 m9, [z_filter_s+10] 2983 psraw m0, 8 ; sign-extend 2984 lea r6d, [wq*8-64] 2985 pshufd m13, m0, q0000 2986 sub srcq, r7 2987 pshufd m14, m0, q1111 2988 lea r6d, [hq+r6*4] 2989 pshufd m15, m0, q2222 2990.hv_w8_loop0: 2991 vbroadcasti128 m7, [z_filter_s+2] 2992 
movu xm3, [srcq+ssq*0] 2993 lea r5, [srcq+ssq*2] 2994 movu xm4, [srcq+ssq*1] 2995 vbroadcasti128 m0, [r5+ssq*0] 2996 mov r7, tmpq 2997 vinserti128 m4, [r5+ssq*1], 1 ; 1 3 2998 lea r5, [r5+ssq*2] 2999 vpblendd m3, m0, 0xf0 ; 0 2 3000 vinserti128 m0, [r5+ssq*0], 1 ; 2 4 3001 vpbroadcastd m5, [pw_8192] 3002 HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9 3003 HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9 3004 HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9 3005 vpermq m3, m3, q3120 3006 vpermq m4, m4, q3120 3007 vpermq m0, m0, q3120 3008 pmulhrsw m3, m5 3009 pmulhrsw m4, m5 3010 pmulhrsw m0, m5 3011 punpcklwd m1, m3, m4 ; 01 3012 punpckhwd m3, m4 ; 23 3013 punpcklwd m2, m4, m0 ; 12 3014 punpckhwd m4, m0 ; 34 3015.hv_w8_loop: 3016 movu xm7, [r5+ssq*1] 3017 lea r5, [r5+ssq*2] 3018 vinserti128 m7, [r5+ssq*0], 1 ; 5 6 3019 pmaddwd m5, m13, m1 ; a0 3020 mova m1, m3 3021 pmaddwd m6, m13, m2 ; b0 3022 mova m2, m4 3023 pmaddwd m3, m14 ; a1 3024 pmaddwd m4, m14 ; b1 3025 paddd m5, m3 3026 vbroadcasti128 m3, [z_filter_s+2] 3027 paddd m6, m4 3028 HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9 3029 vpbroadcastd m3, [pw_8192] 3030 vpbroadcastd m4, [pd_32] 3031 pmulhrsw m7, m3 3032 paddd m5, m4 3033 paddd m6, m4 3034 mova m4, m0 3035 vpermq m0, m7, q3120 3036 shufpd m4, m0, 0x05 3037 punpcklwd m3, m4, m0 ; 45 3038 pmaddwd m7, m15, m3 ; a2 3039 punpckhwd m4, m0 ; 67 3040 paddd m5, m7 3041 pmaddwd m7, m15, m4 ; b2 3042 paddd m6, m7 3043 psrad m5, 6 3044 psrad m6, 6 3045 packssdw m5, m6 3046 vpermq m5, m5, q3120 3047 mova [r7+wq*0], xm5 3048 vextracti128 [r7+wq*2], m5, 1 3049 lea r7, [r7+wq*4] 3050 sub hd, 2 3051 jg .hv_w8_loop 3052 add srcq, 8 3053 add tmpq, 16 3054 movzx hd, r6b 3055 sub r6d, 1<<8 3056 jg .hv_w8_loop0 3057 RET 3058 3059PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc 3060PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc 3061PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc 3062PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc 3063PREP_8TAP_FN sharp, SHARP, SHARP 3064 
3065cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 3066 imul mxd, mxm, 0x010101 3067 add mxd, t0d ; 8tap_h, mx, 4tap_h 3068 imul myd, mym, 0x010101 3069 add myd, t1d ; 8tap_v, my, 4tap_v 3070 lea r7, [prep%+SUFFIX] 3071 mov wd, wm 3072 movifnidn hd, hm 3073 test mxd, 0xf00 3074 jnz .h 3075 test myd, 0xf00 3076 jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep 3077.v: 3078 WIN64_SPILL_XMM 12, 15 3079 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. 3080 shr myd, 16 ; Note that the code is 8-tap only, having 3081 cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 3082 cmove myd, mxd ; had a negligible effect on performance. 3083 lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] 3084 lea stride3q, [strideq*3] 3085 sub srcq, stride3q 3086 vpbroadcastd m7, [pw_8192] 3087 vpbroadcastw m8, [myq+0] 3088 vpbroadcastw m9, [myq+2] 3089 vpbroadcastw m10, [myq+4] 3090 vpbroadcastw m11, [myq+6] 3091 cmp wd, 8 3092 jg .v_w16 3093 je .v_w8 3094.v_w4: 3095 movd xm0, [srcq+strideq*0] 3096 vpbroadcastd m1, [srcq+strideq*2] 3097 vpbroadcastd xm2, [srcq+strideq*1] 3098 add srcq, stride3q 3099 vpbroadcastd m3, [srcq+strideq*0] 3100 vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ 3101 vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ 3102 vpbroadcastd m0, [srcq+strideq*1] 3103 vpbroadcastd m2, [srcq+strideq*2] 3104 vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ 3105 vpbroadcastd m0, [srcq+stride3q ] 3106 vbroadcasti128 m5, [deint_shuf4] 3107 vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 3108 vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 3109 vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ 3110 punpcklbw m1, m2, m3 ; 01 12 23 34 3111 vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 3112 punpckhbw m2, m3 ; 23 34 45 56 3113.v_w4_loop: 3114 lea srcq, [srcq+strideq*4] 3115 pinsrd xm0, [srcq+strideq*0], 1 3116 vpbroadcastd m3, [srcq+strideq*1] 3117 vpbroadcastd m4, [srcq+strideq*2] 3118 vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ 3119 vpbroadcastd m0, [srcq+stride3q ] 3120 vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 
_ _ 3121 vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ 3122 pshufb m3, m5 ; 67 78 89 9a 3123 pmaddubsw m4, m1, m8 3124 vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 3125 pmaddubsw m2, m9 3126 paddw m4, m2 3127 mova m2, m3 3128 pmaddubsw m3, m11 3129 paddw m3, m4 3130 pmaddubsw m4, m1, m10 3131 paddw m3, m4 3132 pmulhrsw m3, m7 3133 mova [tmpq], m3 3134 add tmpq, 32 3135 sub hd, 4 3136 jg .v_w4_loop 3137 RET 3138.v_w8: 3139 movq xm1, [srcq+strideq*0] 3140 vpbroadcastq m4, [srcq+strideq*1] 3141 vpbroadcastq m2, [srcq+strideq*2] 3142 vpbroadcastq m5, [srcq+stride3q ] 3143 lea srcq, [srcq+strideq*4] 3144 vpbroadcastq m3, [srcq+strideq*0] 3145 vpbroadcastq m6, [srcq+strideq*1] 3146 vpbroadcastq m0, [srcq+strideq*2] 3147 vpblendd m1, m4, 0x30 3148 vpblendd m4, m2, 0x30 3149 punpcklbw m1, m4 ; 01 12 3150 vpblendd m2, m5, 0x30 3151 vpblendd m5, m3, 0x30 3152 punpcklbw m2, m5 ; 23 34 3153 vpblendd m3, m6, 0x30 3154 vpblendd m6, m0, 0x30 3155 punpcklbw m3, m6 ; 45 56 3156.v_w8_loop: 3157 vpbroadcastq m4, [srcq+stride3q ] 3158 lea srcq, [srcq+strideq*4] 3159 pmaddubsw m5, m2, m9 ; a1 3160 pmaddubsw m6, m2, m8 ; b0 3161 vpblendd m2, m0, m4, 0x30 3162 vpbroadcastq m0, [srcq+strideq*0] 3163 vpblendd m4, m0, 0x30 3164 punpcklbw m2, m4 ; 67 78 3165 pmaddubsw m1, m8 ; a0 3166 pmaddubsw m4, m3, m9 ; b1 3167 paddw m5, m1 3168 mova m1, m3 3169 pmaddubsw m3, m10 ; a2 3170 paddw m6, m4 3171 paddw m5, m3 3172 vpbroadcastq m4, [srcq+strideq*1] 3173 vpblendd m3, m0, m4, 0x30 3174 vpbroadcastq m0, [srcq+strideq*2] 3175 vpblendd m4, m0, 0x30 3176 punpcklbw m3, m4 ; 89 9a 3177 pmaddubsw m4, m2, m11 ; a3 3178 paddw m5, m4 3179 pmaddubsw m4, m2, m10 ; b2 3180 paddw m6, m4 3181 pmaddubsw m4, m3, m11 ; b3 3182 paddw m6, m4 3183 pmulhrsw m5, m7 3184 pmulhrsw m6, m7 3185 mova [tmpq+32*0], m5 3186 mova [tmpq+32*1], m6 3187 add tmpq, 32*2 3188 sub hd, 4 3189 jg .v_w8_loop 3190 RET 3191.v_w16: 3192 lea r6d, [wq*2-32] 3193 WIN64_PUSH_XMM 15 3194 lea r6d, [hq+r6*8] 3195.v_w16_loop0: 3196 vbroadcasti128 m4, 
[srcq+strideq*0] 3197 vbroadcasti128 m5, [srcq+strideq*1] 3198 lea r5, [srcq+strideq*2] 3199 vbroadcasti128 m0, [r5+strideq*1] 3200 vbroadcasti128 m6, [r5+strideq*0] 3201 lea r5, [r5+strideq*2] 3202 vbroadcasti128 m1, [r5+strideq*0] 3203 vbroadcasti128 m2, [r5+strideq*1] 3204 lea r5, [r5+strideq*2] 3205 vbroadcasti128 m3, [r5+strideq*0] 3206 mov r7, tmpq 3207 shufpd m4, m0, 0x0c 3208 shufpd m5, m1, 0x0c 3209 punpcklbw m1, m4, m5 ; 01 3210 punpckhbw m4, m5 ; 34 3211 shufpd m6, m2, 0x0c 3212 punpcklbw m2, m5, m6 ; 12 3213 punpckhbw m5, m6 ; 45 3214 shufpd m0, m3, 0x0c 3215 punpcklbw m3, m6, m0 ; 23 3216 punpckhbw m6, m0 ; 56 3217.v_w16_loop: 3218 vbroadcasti128 m12, [r5+strideq*1] 3219 lea r5, [r5+strideq*2] 3220 pmaddubsw m13, m1, m8 ; a0 3221 pmaddubsw m14, m2, m8 ; b0 3222 mova m1, m3 3223 mova m2, m4 3224 pmaddubsw m3, m9 ; a1 3225 pmaddubsw m4, m9 ; b1 3226 paddw m13, m3 3227 paddw m14, m4 3228 mova m3, m5 3229 mova m4, m6 3230 pmaddubsw m5, m10 ; a2 3231 pmaddubsw m6, m10 ; b2 3232 paddw m13, m5 3233 vbroadcasti128 m5, [r5+strideq*0] 3234 paddw m14, m6 3235 shufpd m6, m0, m12, 0x0d 3236 shufpd m0, m12, m5, 0x0c 3237 punpcklbw m5, m6, m0 ; 67 3238 punpckhbw m6, m0 ; 78 3239 pmaddubsw m12, m5, m11 ; a3 3240 paddw m13, m12 3241 pmaddubsw m12, m6, m11 ; b3 3242 paddw m14, m12 3243 pmulhrsw m13, m7 3244 pmulhrsw m14, m7 3245 mova [r7+wq*0], m13 3246 mova [r7+wq*2], m14 3247 lea r7, [r7+wq*4] 3248 sub hd, 2 3249 jg .v_w16_loop 3250 add srcq, 16 3251 add tmpq, 32 3252 movzx hd, r6b 3253 sub r6d, 1<<8 3254 jg .v_w16_loop0 3255 RET 3256.h: 3257.h_w4: 3258 test myd, 0xf00 3259 jnz .hv 3260 vpbroadcastd m4, [pw_8192] 3261 cmp wd, 4 3262 je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 3263 WIN64_SPILL_XMM 10 3264 vbroadcasti128 m5, [subpel_h_shufA] 3265 tzcnt wd, wd 3266 vbroadcasti128 m6, [subpel_h_shufB] 3267 vbroadcasti128 m7, [subpel_h_shufC] 3268 shr mxd, 16 3269 sub srcq, 3 3270 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] 3271 vpbroadcastd m8, 
[r7+mxq*8+subpel_filters-prep%+SUFFIX+0] 3272 vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] 3273 add wq, r7 3274 jmp wq 3275.h_w8: 3276 movu xm0, [srcq+strideq*0] 3277 vinserti128 m0, [srcq+strideq*1], 1 3278 lea srcq, [srcq+strideq*2] 3279%macro PREP_8TAP_H 0 3280 pshufb m1, m0, m5 3281 pshufb m2, m0, m6 3282 pshufb m3, m0, m7 3283 pmaddubsw m1, m8 3284 pmaddubsw m0, m2, m8 3285 pmaddubsw m2, m9 3286 pmaddubsw m3, m9 3287 paddw m1, m2 3288 paddw m0, m3 3289 phaddw m0, m1, m0 3290 pmulhrsw m0, m4 3291%endmacro 3292 PREP_8TAP_H 3293 mova [tmpq], m0 3294 add tmpq, 32 3295 sub hd, 2 3296 jg .h_w8 3297 RET 3298.h_w16: 3299 movu xm0, [srcq+strideq*0+8*0] 3300 vinserti128 m0, [srcq+strideq*0+8*1], 1 3301 PREP_8TAP_H 3302 mova [tmpq+32*0], m0 3303 movu xm0, [srcq+strideq*1+8*0] 3304 vinserti128 m0, [srcq+strideq*1+8*1], 1 3305 lea srcq, [srcq+strideq*2] 3306 PREP_8TAP_H 3307 mova [tmpq+32*1], m0 3308 add tmpq, 32*2 3309 sub hd, 2 3310 jg .h_w16 3311 RET 3312.h_w32: 3313 xor r6d, r6d 3314 jmp .h_start 3315.h_w64: 3316 mov r6, -32*1 3317 jmp .h_start 3318.h_w128: 3319 mov r6, -32*3 3320.h_start: 3321 sub srcq, r6 3322 mov r5, r6 3323.h_loop: 3324 movu xm0, [srcq+r6+8*0] 3325 vinserti128 m0, [srcq+r6+8*1], 1 3326 PREP_8TAP_H 3327 mova [tmpq+32*0], m0 3328 movu xm0, [srcq+r6+8*2] 3329 vinserti128 m0, [srcq+r6+8*3], 1 3330 PREP_8TAP_H 3331 mova [tmpq+32*1], m0 3332 add tmpq, 32*2 3333 add r6, 32 3334 jle .h_loop 3335 add srcq, strideq 3336 mov r6, r5 3337 dec hd 3338 jg .h_loop 3339 RET 3340.hv: 3341 WIN64_SPILL_XMM 16 3342 cmp wd, 4 3343 je .hv_w4 3344 shr mxd, 16 3345 sub srcq, 3 3346 vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] 3347 vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] 3348 movzx mxd, myb 3349 shr myd, 16 3350 cmp hd, 4 3351 cmove myd, mxd 3352 vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] 3353 lea stride3q, [strideq*3] 3354 sub srcq, stride3q 3355 punpcklbw m0, m0 3356 psraw m0, 8 ; sign-extend 3357 pshufd m12, m0, 
q0000 3358 pshufd m13, m0, q1111 3359 pshufd m14, m0, q2222 3360 pshufd m15, m0, q3333 3361 jmp .hv_w8 3362.hv_w4: 3363 movzx mxd, mxb 3364 dec srcq 3365 vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] 3366 movzx mxd, myb 3367 shr myd, 16 3368 cmp hd, 4 3369 cmove myd, mxd 3370 vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] 3371 lea stride3q, [strideq*3] 3372 sub srcq, stride3q 3373 mova m7, [subpel_h_shuf4] 3374 pmovzxbd m9, [deint_shuf4] 3375 vpbroadcastd m10, [pw_8192] 3376 punpcklbw m0, m0 3377 psraw m0, 8 ; sign-extend 3378 vpbroadcastd m11, [pd_32] 3379 pshufd m12, m0, q0000 3380 pshufd m13, m0, q1111 3381 pshufd m14, m0, q2222 3382 pshufd m15, m0, q3333 3383 vpbroadcastq m2, [srcq+strideq*0] 3384 vpbroadcastq m4, [srcq+strideq*1] 3385 vpbroadcastq m0, [srcq+strideq*2] 3386 vpbroadcastq m5, [srcq+stride3q ] 3387 lea srcq, [srcq+strideq*4] 3388 vpbroadcastq m3, [srcq+strideq*0] 3389 vpbroadcastq m6, [srcq+strideq*1] 3390 vpbroadcastq m1, [srcq+strideq*2] 3391 vpblendd m2, m4, 0xcc ; 0 1 3392 vpblendd m0, m5, 0xcc ; 2 3 3393 vpblendd m3, m6, 0xcc ; 4 5 3394 pshufb m2, m7 ; 00 01 10 11 02 03 12 13 3395 pshufb m0, m7 ; 20 21 30 31 22 23 32 33 3396 pshufb m3, m7 ; 40 41 50 51 42 43 52 53 3397 pshufb m1, m7 ; 60 61 60 61 62 63 62 63 3398 pmaddubsw m2, m8 3399 pmaddubsw m0, m8 3400 pmaddubsw m3, m8 3401 pmaddubsw m1, m8 3402 phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b 3403 phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ 3404 pmulhrsw m2, m10 3405 pmulhrsw m3, m10 3406 palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b 3407 punpcklwd m1, m2, m4 ; 01 12 3408 punpckhwd m2, m4 ; 23 34 3409 pshufd m0, m3, q2121 3410 punpcklwd m3, m0 ; 45 56 3411.hv_w4_loop: 3412 pmaddwd m5, m1, m12 ; a0 b0 3413 pmaddwd m6, m2, m12 ; c0 d0 3414 pmaddwd m2, m13 ; a1 b1 3415 pmaddwd m4, m3, m13 ; c1 d1 3416 mova m1, m3 3417 pmaddwd m3, m14 ; a2 b2 3418 paddd m5, m2 3419 vpbroadcastq m2, [srcq+stride3q ] 3420 lea srcq, [srcq+strideq*4] 3421 paddd m6, m4 3422 vpbroadcastq m4, 
[srcq+strideq*0] 3423 paddd m5, m3 3424 vpbroadcastq m3, [srcq+strideq*1] 3425 vpblendd m2, m4, 0xcc 3426 vpbroadcastq m4, [srcq+strideq*2] 3427 vpblendd m3, m4, 0xcc 3428 pshufb m2, m7 3429 pshufb m3, m7 3430 pmaddubsw m2, m8 3431 pmaddubsw m3, m8 3432 phaddw m2, m3 3433 pmulhrsw m2, m10 3434 palignr m3, m2, m0, 12 3435 mova m0, m2 3436 punpcklwd m2, m3, m0 ; 67 78 3437 punpckhwd m3, m0 ; 89 9a 3438 pmaddwd m4, m2, m14 ; c2 d2 3439 paddd m6, m11 3440 paddd m5, m11 3441 paddd m6, m4 3442 pmaddwd m4, m2, m15 ; a3 b3 3443 paddd m5, m4 3444 pmaddwd m4, m3, m15 ; c3 d3 3445 paddd m6, m4 3446 psrad m5, 6 3447 psrad m6, 6 3448 packssdw m5, m6 3449 vpermd m5, m9, m5 3450 mova [tmpq], m5 3451 add tmpq, 32 3452 sub hd, 4 3453 jg .hv_w4_loop 3454 RET 3455.hv_w8: 3456 lea r6d, [wq*8-64] 3457 lea r6d, [hq+r6*4] 3458.hv_w8_loop0: 3459 vbroadcasti128 m7, [subpel_h_shufA] 3460 movu xm4, [srcq+strideq*0] 3461 lea r5, [srcq+strideq*2] 3462 vbroadcasti128 m8, [subpel_h_shufB] 3463 movu xm5, [srcq+strideq*1] 3464 mov r7, tmpq 3465 vbroadcasti128 m9, [subpel_h_shufC] 3466 movu xm6, [r5+strideq*0] 3467 vbroadcasti128 m0, [r5+strideq*1] 3468 lea r5, [r5+strideq*2] 3469 vpblendd m4, m0, 0xf0 ; 0 3 3470 vinserti128 m5, [r5+strideq*0], 1 ; 1 4 3471 vinserti128 m6, [r5+strideq*1], 1 ; 2 5 3472 lea r5, [r5+strideq*2] 3473 vinserti128 m0, [r5+strideq*0], 1 ; 3 6 3474 HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 3475 HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 3476 HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 3477 HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 3478 vpbroadcastd m7, [pw_8192] 3479 vpermq m4, m4, q3120 3480 vpermq m5, m5, q3120 3481 vpermq m6, m6, q3120 3482 pmulhrsw m0, m7 3483 pmulhrsw m4, m7 3484 pmulhrsw m5, m7 3485 pmulhrsw m6, m7 3486 vpermq m7, m0, q3120 3487 punpcklwd m1, m4, m5 ; 01 3488 punpckhwd m4, m5 ; 34 3489 punpcklwd m2, m5, m6 ; 12 3490 punpckhwd m5, m6 ; 45 3491 punpcklwd m3, m6, m7 ; 23 3492 punpckhwd m6, m7 ; 56 3493.hv_w8_loop: 3494 vextracti128 [r7], m0, 1 ; not enough 
registers 3495 movu xm0, [r5+strideq*1] 3496 lea r5, [r5+strideq*2] 3497 vinserti128 m0, [r5+strideq*0], 1 ; 7 8 3498 pmaddwd m8, m1, m12 ; a0 3499 pmaddwd m9, m2, m12 ; b0 3500 mova m1, m3 3501 mova m2, m4 3502 pmaddwd m3, m13 ; a1 3503 pmaddwd m4, m13 ; b1 3504 paddd m8, m3 3505 paddd m9, m4 3506 mova m3, m5 3507 mova m4, m6 3508 pmaddwd m5, m14 ; a2 3509 pmaddwd m6, m14 ; b2 3510 paddd m8, m5 3511 paddd m9, m6 3512 vbroadcasti128 m6, [subpel_h_shufB] 3513 vbroadcasti128 m7, [subpel_h_shufC] 3514 vbroadcasti128 m5, [subpel_h_shufA] 3515 HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 3516 vpbroadcastd m5, [pw_8192] 3517 vpbroadcastd m7, [pd_32] 3518 vbroadcasti128 m6, [r7] 3519 pmulhrsw m0, m5 3520 paddd m8, m7 3521 paddd m9, m7 3522 vpermq m7, m0, q3120 ; 7 8 3523 shufpd m6, m7, 0x04 ; 6 7 3524 punpcklwd m5, m6, m7 ; 67 3525 punpckhwd m6, m7 ; 78 3526 pmaddwd m7, m5, m15 ; a3 3527 paddd m8, m7 3528 pmaddwd m7, m6, m15 ; b3 3529 paddd m7, m9 3530 psrad m8, 6 3531 psrad m7, 6 3532 packssdw m8, m7 3533 vpermq m7, m8, q3120 3534 mova [r7+wq*0], xm7 3535 vextracti128 [r7+wq*2], m7, 1 3536 lea r7, [r7+wq*4] 3537 sub hd, 2 3538 jg .hv_w8_loop 3539 add srcq, 8 3540 add tmpq, 16 3541 movzx hd, r6b 3542 sub r6d, 1<<8 3543 jg .hv_w8_loop0 3544 RET 3545 3546%macro movifprep 2 3547 %if isprep 3548 mov %1, %2 3549 %endif 3550%endmacro 3551 3552%macro REMAP_REG 2 3553 %xdefine r%1 r%2 3554 %xdefine r%1q r%2q 3555 %xdefine r%1d r%2d 3556%endmacro 3557 3558%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 3559 %if isprep 3560 %xdefine r14_save r14 3561 %assign %%i 14 3562 %rep 14 3563 %assign %%j %%i-1 3564 REMAP_REG %%i, %%j 3565 %assign %%i %%i-1 3566 %endrep 3567 %endif 3568%endmacro 3569 3570%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 3571 %if isprep 3572 %assign %%i 1 3573 %rep 13 3574 %assign %%j %%i+1 3575 REMAP_REG %%i, %%j 3576 %assign %%i %%i+1 3577 %endrep 3578 %xdefine r14 r14_save 3579 %undef r14_save 3580 %endif 3581%endmacro 3582 3583%macro MC_8TAP_SCALED_RET 0-1 1 ; 
leave_mapping_unchanged 3584 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 3585 RET 3586 %if %1 3587 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 3588 %endif 3589%endmacro 3590 3591%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] 3592 movq xm%1, [srcq+ r4] 3593 movq xm%2, [srcq+ r6] 3594 movhps xm%1, [srcq+ r7] 3595 movhps xm%2, [srcq+ r9] 3596 vinserti128 m%1, [srcq+r10], 1 3597 vinserti128 m%2, [srcq+r11], 1 3598 vpbroadcastq m%5, [srcq+r13] 3599 vpbroadcastq m%6, [srcq+ rX] 3600 add srcq, ssq 3601 movq xm%3, [srcq+ r4] 3602 movq xm%4, [srcq+ r6] 3603 movhps xm%3, [srcq+ r7] 3604 movhps xm%4, [srcq+ r9] 3605 vinserti128 m%3, [srcq+r10], 1 3606 vinserti128 m%4, [srcq+r11], 1 3607 vpbroadcastq m%7, [srcq+r13] 3608 vpbroadcastq m%8, [srcq+ rX] 3609 add srcq, ssq 3610 vpblendd m%1, m%5, 0xc0 3611 vpblendd m%2, m%6, 0xc0 3612 vpblendd m%3, m%7, 0xc0 3613 vpblendd m%4, m%8, 0xc0 3614 pmaddubsw m%1, m15 3615 pmaddubsw m%2, m10 3616 pmaddubsw m%3, m15 3617 pmaddubsw m%4, m10 3618 phaddw m%1, m%2 3619 phaddw m%3, m%4 3620 phaddw m%1, m%3 3621 pmulhrsw m%1, m12 3622%endmacro 3623 3624%macro MC_8TAP_SCALED 1 3625%ifidn %1, put 3626 %assign isprep 0 3627cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy 3628 %xdefine base_reg r12 3629 %define rndshift 10 3630%else 3631 %assign isprep 1 3632cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy 3633 %define tmp_stridem qword [rsp+120] 3634 %xdefine base_reg r11 3635 %define rndshift 6 3636%endif 3637 lea base_reg, [%1_8tap_scaled_8bpc_avx2] 3638%define base base_reg-%1_8tap_scaled_8bpc_avx2 3639 tzcnt wd, wm 3640 vpbroadcastd m8, dxm 3641%if isprep && UNIX64 3642 movd xm14, mxd 3643 vpbroadcastd m14, xm14 3644 mov r5d, t0d 3645 DECLARE_REG_TMP 5, 7 3646%else 3647 vpbroadcastd m14, mxm 3648%endif 3649 mov dyd, dym 3650%ifidn %1, put 3651 %if WIN64 3652 mov r8d, hm 3653 DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 3654 %define hm r5m 3655 %define dxm r8m 3656 %else 3657 DEFINE_ARGS 
dst, ds, src, ss, w, h, _, my, dx, dy, ss3 3658 %define hm r6m 3659 %endif 3660 %define dsm [rsp+112] 3661 %define rX r1 3662 %define rXd r1d 3663%else ; prep 3664 %if WIN64 3665 mov r7d, hm 3666 DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 3667 %define hm r4m 3668 %define dxm r7m 3669 %else 3670 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 3671 %define hm [rsp+112] 3672 %endif 3673 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 3674 %define rX r14 3675 %define rXd r14d 3676%endif 3677 vpbroadcastd m10, [base+pd_0x3ff] 3678 vpbroadcastd m12, [base+pw_8192] 3679%ifidn %1, put 3680 vpbroadcastd m13, [base+pd_512] 3681%else 3682 vpbroadcastd m13, [base+pd_32] 3683%endif 3684 pxor m9, m9 3685 lea ss3q, [ssq*3] 3686 movzx r7d, t1b 3687 shr t1d, 16 3688 cmp hd, 6 3689 cmovs t1d, r7d 3690 sub srcq, ss3q 3691 cmp dyd, 1024 3692 je .dy1 3693 cmp dyd, 2048 3694 je .dy2 3695 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] 3696 add wq, base_reg 3697 jmp wq 3698%ifidn %1, put 3699.w2: 3700 mov myd, mym 3701 movzx t0d, t0b 3702 dec srcq 3703 movd xm15, t0d 3704 punpckldq m8, m9, m8 3705 paddd m14, m8 ; mx+dx*[0,1] 3706 vpbroadcastd m11, [base+pd_0x4000] 3707 vpbroadcastd xm15, xm15 3708 pand m8, m14, m10 3709 psrld m8, 6 3710 paddd xm15, xm8 3711 movd r4d, xm15 3712 pextrd r6d, xm15, 1 3713 vbroadcasti128 m5, [base+bdct_lb_dw] 3714 vbroadcasti128 m6, [base+subpel_s_shuf2] 3715 vpbroadcastd m15, [base+subpel_filters+r4*8+2] 3716 vpbroadcastd m7, [base+subpel_filters+r6*8+2] 3717 pcmpeqd m8, m9 3718 psrld m14, 10 3719 movq xm0, [srcq+ssq*0] 3720 movq xm1, [srcq+ssq*2] 3721 movhps xm0, [srcq+ssq*1] 3722 movhps xm1, [srcq+ss3q ] 3723 lea srcq, [srcq+ssq*4] 3724 pshufb m14, m5 3725 paddb m14, m6 3726 vinserti128 m0, [srcq+ssq*0], 1 3727 vinserti128 m1, [srcq+ssq*2], 1 3728 vpbroadcastq m2, [srcq+ssq*1] 3729 vpbroadcastq m3, [srcq+ss3q ] 3730 lea srcq, [srcq+ssq*4] 3731 vpblendd m15, m7, 0xaa 3732 vpblendd m0, m2, 0xc0 ; 0 1 4 5 3733 vpblendd m1, m3, 0xc0 ; 2 3 6 7 3734 pblendvb 
m15, m11, m8 3735 pshufb m0, m14 3736 pshufb m1, m14 3737 pmaddubsw m0, m15 3738 pmaddubsw m1, m15 3739 phaddw m0, m1 3740 pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 3741 vextracti128 xm1, m0, 1 ; 4 5 6 7 3742 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 3743 punpcklwd xm3, xm0, xm2 ; 01 12 3744 punpckhwd xm0, xm2 ; 23 34 3745 pshufd xm4, xm1, q0321 ; 5 6 7 _ 3746 punpcklwd xm2, xm1, xm4 ; 45 56 3747 punpckhwd xm4, xm1, xm4 ; 67 __ 3748.w2_loop: 3749 and myd, 0x3ff 3750 mov r6d, 64 << 24 3751 mov r4d, myd 3752 shr r4d, 6 3753 lea r4d, [t1+r4] 3754 cmovnz r6q, [base+subpel_filters+r4*8] 3755 movq xm11, r6q 3756 pmovsxbw xm11, xm11 3757 pshufd xm8, xm11, q0000 3758 pshufd xm9, xm11, q1111 3759 pshufd xm10, xm11, q2222 3760 pshufd xm11, xm11, q3333 3761 pmaddwd xm5, xm3, xm8 3762 pmaddwd xm6, xm0, xm9 3763 pmaddwd xm7, xm2, xm10 3764 pmaddwd xm8, xm4, xm11 3765 paddd xm5, xm6 3766 paddd xm7, xm8 3767 paddd xm5, xm13 3768 paddd xm5, xm7 3769 psrad xm5, 10 3770 packssdw xm5, xm5 3771 packuswb xm5, xm5 3772 pextrw [dstq], xm5, 0 3773 add dstq, dsq 3774 dec hd 3775 jz .ret 3776 add myd, dyd 3777 test myd, ~0x3ff 3778 jz .w2_loop 3779 movq xm5, [srcq] 3780 test myd, 0x400 3781 jz .w2_skip_line 3782 add srcq, ssq 3783 shufps xm3, xm0, q1032 ; 01 12 3784 shufps xm0, xm2, q1032 ; 23 34 3785 shufps xm2, xm4, q1032 ; 45 56 3786 pshufb xm5, xm14 3787 pmaddubsw xm5, xm15 3788 phaddw xm5, xm5 3789 pmulhrsw xm5, xm12 3790 palignr xm1, xm5, xm1, 12 3791 punpcklqdq xm1, xm1 ; 6 7 6 7 3792 punpcklwd xm4, xm1, xm5 ; 67 __ 3793 jmp .w2_loop 3794.w2_skip_line: 3795 movhps xm5, [srcq+ssq*1] 3796 lea srcq, [srcq+ssq*2] 3797 mova xm3, xm0 ; 01 12 3798 mova xm0, xm2 ; 23 34 3799 pshufb xm5, xm14 3800 pmaddubsw xm5, xm15 3801 phaddw xm5, xm5 3802 pmulhrsw xm5, xm12 ; 6 7 6 7 3803 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 3804 pshufd xm5, xm1, q0321 ; 5 6 7 _ 3805 punpcklwd xm2, xm1, xm5 ; 45 56 3806 punpckhwd xm4, xm1, xm5 ; 67 __ 3807 jmp .w2_loop 3808%endif 3809.w4: 3810 mov myd, mym 3811 vbroadcasti128 m7, 
[base+rescale_mul] 3812 movzx t0d, t0b 3813 dec srcq 3814 movd xm15, t0d 3815 pmaddwd m8, m7 3816 vpbroadcastd m11, [base+pd_0x4000] 3817 vpbroadcastd xm15, xm15 3818 paddd m14, m8 ; mx+dx*[0-3] 3819 pand m0, m14, m10 3820 psrld m0, 6 3821 paddd xm15, xm0 3822 movd r4d, xm15 3823 pextrd r6d, xm15, 1 3824 pextrd r11d, xm15, 2 3825 pextrd r13d, xm15, 3 3826 movd xm15, [base+subpel_filters+r4*8+2] 3827 vbroadcasti128 m5, [base+bdct_lb_dw] 3828 vpbroadcastq m6, [base+subpel_s_shuf2] 3829 pinsrd xm15, [base+subpel_filters+r6*8+2], 1 3830 pcmpeqd m0, m9 3831 psrld m14, 10 3832 movu xm7, [srcq+ssq*0] 3833 movu xm9, [srcq+ssq*1] 3834 pinsrd xm15, [base+subpel_filters+r11*8+2], 2 3835 movu xm8, [srcq+ssq*2] 3836 movu xm10, [srcq+ss3q ] 3837 pinsrd xm15, [base+subpel_filters+r13*8+2], 3 3838 lea srcq, [srcq+ssq*4] 3839 pshufb m14, m5 3840 paddb m14, m6 3841 vinserti128 m7, [srcq+ssq*0], 1 3842 vinserti128 m9, [srcq+ssq*1], 1 3843 vinserti128 m15, xm15, 1 3844 vinserti128 m8, [srcq+ssq*2], 1 3845 vinserti128 m10, [srcq+ss3q ], 1 3846 lea srcq, [srcq+ssq*4] 3847 pblendvb m15, m11, m0 3848 pshufb m7, m14 3849 pshufb m9, m14 3850 pshufb m8, m14 3851 pshufb m10, m14 3852 pmaddubsw m7, m15 3853 pmaddubsw m9, m15 3854 pmaddubsw m8, m15 3855 pmaddubsw m10, m15 3856 phaddw m7, m9 3857 phaddw m8, m10 3858 pmulhrsw m7, m12 ; 0 1 4 5 3859 pmulhrsw m8, m12 ; 2 3 6 7 3860 vextracti128 xm9, m7, 1 ; 4 5 3861 vextracti128 xm3, m8, 1 ; 6 7 3862 shufps xm4, xm7, xm8, q1032 ; 1 2 3863 shufps xm5, xm8, xm9, q1032 ; 3 4 3864 shufps xm6, xm9, xm3, q1032 ; 5 6 3865 psrldq xm11, xm3, 8 ; 7 _ 3866 punpcklwd xm0, xm7, xm4 ; 01 3867 punpckhwd xm7, xm4 ; 12 3868 punpcklwd xm1, xm8, xm5 ; 23 3869 punpckhwd xm8, xm5 ; 34 3870 punpcklwd xm2, xm9, xm6 ; 45 3871 punpckhwd xm9, xm6 ; 56 3872 punpcklwd xm3, xm11 ; 67 3873 mova [rsp+0x00], xm7 3874 mova [rsp+0x10], xm8 3875 mova [rsp+0x20], xm9 3876.w4_loop: 3877 and myd, 0x3ff 3878 mov r6d, 64 << 24 3879 mov r4d, myd 3880 shr r4d, 6 3881 lea r4d, [t1+r4] 3882 
cmovnz r6q, [base+subpel_filters+r4*8] 3883 movq xm10, r6q 3884 pmovsxbw xm10, xm10 3885 pshufd xm7, xm10, q0000 3886 pshufd xm8, xm10, q1111 3887 pshufd xm9, xm10, q2222 3888 pshufd xm10, xm10, q3333 3889 pmaddwd xm4, xm0, xm7 3890 pmaddwd xm5, xm1, xm8 3891 pmaddwd xm6, xm2, xm9 3892 pmaddwd xm7, xm3, xm10 3893 paddd xm4, xm5 3894 paddd xm6, xm7 3895 paddd xm4, xm13 3896 paddd xm4, xm6 3897 psrad xm4, rndshift 3898 packssdw xm4, xm4 3899%ifidn %1, put 3900 packuswb xm4, xm4 3901 movd [dstq], xm4 3902 add dstq, dsq 3903%else 3904 movq [tmpq], xm4 3905 add tmpq, 8 3906%endif 3907 dec hd 3908 jz .ret 3909 add myd, dyd 3910 test myd, ~0x3ff 3911 jz .w4_loop 3912 movu xm4, [srcq] 3913 test myd, 0x400 3914 jz .w4_skip_line 3915 mova xm0, [rsp+0x00] 3916 mova [rsp+0x00], xm1 3917 mova xm1, [rsp+0x10] 3918 mova [rsp+0x10], xm2 3919 mova xm2, [rsp+0x20] 3920 mova [rsp+0x20], xm3 3921 pshufb xm4, xm14 3922 pmaddubsw xm4, xm15 3923 phaddw xm4, xm4 3924 pmulhrsw xm4, xm12 3925 punpcklwd xm3, xm11, xm4 3926 mova xm11, xm4 3927 add srcq, ssq 3928 jmp .w4_loop 3929.w4_skip_line: 3930 movu xm5, [srcq+ssq*1] 3931 movu m6, [rsp+0x10] 3932 pshufb xm4, xm14 3933 pshufb xm5, xm14 3934 pmaddubsw xm4, xm15 3935 pmaddubsw xm5, xm15 3936 movu [rsp+0x00], m6 3937 phaddw xm4, xm5 3938 pmulhrsw xm4, xm12 3939 punpcklwd xm9, xm11, xm4 3940 mova [rsp+0x20], xm9 3941 psrldq xm11, xm4, 8 3942 mova xm0, xm1 3943 mova xm1, xm2 3944 mova xm2, xm3 3945 punpcklwd xm3, xm4, xm11 3946 lea srcq, [srcq+ssq*2] 3947 jmp .w4_loop 3948.w8: 3949 mov dword [rsp+48], 1 3950 movifprep tmp_stridem, 16 3951 jmp .w_start 3952.w16: 3953 mov dword [rsp+48], 2 3954 movifprep tmp_stridem, 32 3955 jmp .w_start 3956.w32: 3957 mov dword [rsp+48], 4 3958 movifprep tmp_stridem, 64 3959 jmp .w_start 3960.w64: 3961 mov dword [rsp+48], 8 3962 movifprep tmp_stridem, 128 3963 jmp .w_start 3964.w128: 3965 mov dword [rsp+48], 16 3966 movifprep tmp_stridem, 256 3967.w_start: 3968%ifidn %1, put 3969 movifnidn dsm, dsq 3970%endif 
3971 shr t0d, 16 3972 sub srcq, 3 3973 pmaddwd m8, [base+rescale_mul] 3974 movd xm15, t0d 3975 mov [rsp+72], t0d 3976 mov [rsp+56], srcq 3977 mov [rsp+64], r0q ; dstq / tmpq 3978%if UNIX64 3979 mov hm, hd 3980%endif 3981 shl dword dxm, 3 ; dx*8 3982 vpbroadcastd m15, xm15 3983 paddd m14, m8 ; mx+dx*[0-7] 3984 jmp .hloop 3985.hloop_prep: 3986 dec dword [rsp+48] 3987 jz .ret 3988 add qword [rsp+64], 8*(isprep+1) 3989 mov hd, hm 3990 vpbroadcastd m8, dxm 3991 vpbroadcastd m10, [base+pd_0x3ff] 3992 paddd m14, m8, [rsp+16] 3993 vpbroadcastd m15, [rsp+72] 3994 pxor m9, m9 3995 mov srcq, [rsp+56] 3996 mov r0q, [rsp+64] ; dstq / tmpq 3997.hloop: 3998 vpbroadcastq m11, [base+pq_0x40000000] 3999 pand m6, m14, m10 4000 psrld m6, 6 4001 paddd m15, m6 4002 pcmpeqd m6, m9 4003 vextracti128 xm7, m15, 1 4004 movd r4d, xm15 4005 pextrd r6d, xm15, 2 4006 pextrd r7d, xm15, 1 4007 pextrd r9d, xm15, 3 4008 movd r10d, xm7 4009 pextrd r11d, xm7, 2 4010 pextrd r13d, xm7, 1 4011 pextrd rXd, xm7, 3 4012 movu [rsp+16], m14 4013 movq xm15, [base+subpel_filters+ r4*8] 4014 movq xm10, [base+subpel_filters+ r6*8] 4015 movhps xm15, [base+subpel_filters+ r7*8] 4016 movhps xm10, [base+subpel_filters+ r9*8] 4017 vinserti128 m15, [base+subpel_filters+r10*8], 1 4018 vinserti128 m10, [base+subpel_filters+r11*8], 1 4019 vpbroadcastq m9, [base+subpel_filters+r13*8] 4020 vpbroadcastq m8, [base+subpel_filters+ rX*8] 4021 psrld m14, 10 4022 vextracti128 xm7, m14, 1 4023 mova [rsp], xm14 4024 movd r4d, xm14 4025 pextrd r6d, xm14, 2 4026 pextrd r7d, xm14, 1 4027 pextrd r9d, xm14, 3 4028 movd r10d, xm7 4029 pextrd r11d, xm7, 2 4030 pextrd r13d, xm7, 1 4031 pextrd rXd, xm7, 3 4032 pshufd m5, m6, q1100 4033 pshufd m6, m6, q3322 4034 vpblendd m15, m9, 0xc0 4035 vpblendd m10, m8, 0xc0 4036 pblendvb m15, m11, m5 4037 pblendvb m10, m11, m6 4038 vbroadcasti128 m14, [base+subpel_s_shuf8] 4039 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b 4040 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b 4041 
MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b 4042 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b 4043 mov myd, mym 4044 mov dyd, dym 4045 pshufb m0, m14 ; 01a 01b 4046 pshufb m1, m14 ; 23a 23b 4047 pshufb m2, m14 ; 45a 45b 4048 pshufb m3, m14 ; 67a 67b 4049 vbroadcasti128 m14, [base+wswap] 4050.vloop: 4051 and myd, 0x3ff 4052 mov r6d, 64 << 24 4053 mov r4d, myd 4054 shr r4d, 6 4055 lea r4d, [t1+r4] 4056 cmovnz r6q, [base+subpel_filters+r4*8] 4057 movq xm11, r6q 4058 punpcklqdq xm11, xm11 4059 pmovsxbw m11, xm11 4060 pshufd m8, m11, q0000 4061 pshufd m9, m11, q1111 4062 pmaddwd m4, m0, m8 4063 pmaddwd m5, m1, m9 4064 pshufd m8, m11, q2222 4065 pshufd m11, m11, q3333 4066 pmaddwd m6, m2, m8 4067 pmaddwd m7, m3, m11 4068 paddd m4, m5 4069 paddd m6, m7 4070 paddd m4, m13 4071 paddd m4, m6 4072 psrad m4, rndshift 4073 vextracti128 xm5, m4, 1 4074 packssdw xm4, xm5 4075%ifidn %1, put 4076 packuswb xm4, xm4 4077 movq [dstq], xm4 4078 add dstq, dsm 4079%else 4080 mova [tmpq], xm4 4081 add tmpq, tmp_stridem 4082%endif 4083 dec hd 4084 jz .hloop_prep 4085 add myd, dyd 4086 test myd, ~0x3ff 4087 jz .vloop 4088 test myd, 0x400 4089 mov [rsp+52], myd 4090 mov r4d, [rsp+ 0] 4091 mov r6d, [rsp+ 8] 4092 mov r7d, [rsp+ 4] 4093 mov r9d, [rsp+12] 4094 jz .skip_line 4095 vpbroadcastq m6, [srcq+r13] 4096 vpbroadcastq m7, [srcq+ rX] 4097 movq xm4, [srcq+ r4] 4098 movq xm5, [srcq+ r6] 4099 movhps xm4, [srcq+ r7] 4100 movhps xm5, [srcq+ r9] 4101 vinserti128 m4, [srcq+r10], 1 4102 vinserti128 m5, [srcq+r11], 1 4103 add srcq, ssq 4104 mov myd, [rsp+52] 4105 mov dyd, dym 4106 pshufb m0, m14 4107 pshufb m1, m14 4108 pshufb m2, m14 4109 pshufb m3, m14 4110 vpblendd m4, m6, 0xc0 4111 vpblendd m5, m7, 0xc0 4112 pmaddubsw m4, m15 4113 pmaddubsw m5, m10 4114 phaddw m4, m5 4115 pslld m5, m4, 16 4116 paddw m4, m5 4117 pmulhrsw m4, m12 4118 pblendw m0, m1, 0xaa 4119 pblendw m1, m2, 0xaa 4120 pblendw m2, m3, 0xaa 4121 pblendw m3, m4, 0xaa 4122 jmp .vloop 4123.skip_line: 4124 mova m0, 
m1 4125 mova m1, m2 4126 mova m2, m3 4127 vpbroadcastq m7, [srcq+r13] 4128 vpbroadcastq m8, [srcq+ rX] 4129 movq xm3, [srcq+ r4] 4130 movq xm4, [srcq+ r6] 4131 movhps xm3, [srcq+ r7] 4132 movhps xm4, [srcq+ r9] 4133 vinserti128 m3, [srcq+r10], 1 4134 vinserti128 m4, [srcq+r11], 1 4135 add srcq, ssq 4136 movq xm5, [srcq+ r4] 4137 movq xm6, [srcq+ r6] 4138 movhps xm5, [srcq+ r7] 4139 movhps xm6, [srcq+ r9] 4140 vinserti128 m5, [srcq+r10], 1 4141 vinserti128 m6, [srcq+r11], 1 4142 vpbroadcastq m9, [srcq+r13] 4143 vpbroadcastq m11, [srcq+ rX] 4144 add srcq, ssq 4145 mov myd, [rsp+52] 4146 mov dyd, dym 4147 vpblendd m3, m7, 0xc0 4148 vpblendd m4, m8, 0xc0 4149 vpblendd m5, m9, 0xc0 4150 vpblendd m6, m11, 0xc0 4151 pmaddubsw m3, m15 4152 pmaddubsw m4, m10 4153 pmaddubsw m5, m15 4154 pmaddubsw m6, m10 4155 phaddw m3, m4 4156 phaddw m5, m6 4157 psrld m4, m3, 16 4158 pslld m6, m5, 16 4159 paddw m3, m4 4160 paddw m5, m6 4161 pblendw m3, m5, 0xaa 4162 pmulhrsw m3, m12 4163 jmp .vloop 4164.dy1: 4165 movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] 4166 add wq, base_reg 4167 jmp wq 4168%ifidn %1, put 4169.dy1_w2: 4170 mov myd, mym 4171 movzx t0d, t0b 4172 dec srcq 4173 movd xm15, t0d 4174 punpckldq m8, m9, m8 4175 paddd m14, m8 ; mx+dx*[0-1] 4176 vpbroadcastd m11, [base+pd_0x4000] 4177 vpbroadcastd xm15, xm15 4178 pand m8, m14, m10 4179 psrld m8, 6 4180 paddd xm15, xm8 4181 movd r4d, xm15 4182 pextrd r6d, xm15, 1 4183 vbroadcasti128 m5, [base+bdct_lb_dw] 4184 vbroadcasti128 m6, [base+subpel_s_shuf2] 4185 vpbroadcastd m15, [base+subpel_filters+r4*8+2] 4186 vpbroadcastd m7, [base+subpel_filters+r6*8+2] 4187 pcmpeqd m8, m9 4188 psrld m14, 10 4189 movq xm0, [srcq+ssq*0] 4190 movq xm1, [srcq+ssq*2] 4191 movhps xm0, [srcq+ssq*1] 4192 movhps xm1, [srcq+ss3q ] 4193 lea srcq, [srcq+ssq*4] 4194 shr myd, 6 4195 mov r4d, 64 << 24 4196 lea myd, [t1+myq] 4197 cmovnz r4q, [base+subpel_filters+myq*8] 4198 pshufb m14, m5 4199 paddb m14, m6 4200 vinserti128 m0, [srcq+ssq*0], 1 4201 
vinserti128 m1, [srcq+ssq*2], 1 4202 vpbroadcastq m2, [srcq+ssq*1] 4203 add srcq, ss3q 4204 movq xm10, r4q 4205 pmovsxbw xm10, xm10 4206 vpblendd m15, m7, 0xaa 4207 pblendvb m15, m11, m8 4208 pshufd xm8, xm10, q0000 4209 pshufd xm9, xm10, q1111 4210 pshufd xm11, xm10, q3333 4211 pshufd xm10, xm10, q2222 4212 vpblendd m0, m2, 0xc0 4213 pshufb m1, m14 4214 pshufb m0, m14 4215 pmaddubsw m1, m15 4216 pmaddubsw m0, m15 4217 phaddw m0, m1 4218 pmulhrsw m0, m12 4219 vextracti128 xm1, m0, 1 4220 palignr xm2, xm1, xm0, 4 4221 pshufd xm4, xm1, q2121 4222 punpcklwd xm3, xm0, xm2 ; 01 12 4223 punpckhwd xm0, xm2 ; 23 34 4224 punpcklwd xm2, xm1, xm4 ; 45 56 4225.dy1_w2_loop: 4226 movq xm1, [srcq+ssq*0] 4227 movhps xm1, [srcq+ssq*1] 4228 lea srcq, [srcq+ssq*2] 4229 pmaddwd xm5, xm3, xm8 4230 pmaddwd xm6, xm0, xm9 4231 pmaddwd xm7, xm2, xm10 4232 mova xm3, xm0 4233 mova xm0, xm2 4234 paddd xm5, xm13 4235 paddd xm6, xm7 4236 pshufb xm1, xm14 4237 pmaddubsw xm1, xm15 4238 phaddw xm1, xm1 4239 pmulhrsw xm1, xm12 4240 palignr xm7, xm1, xm4, 12 4241 punpcklwd xm2, xm7, xm1 ; 67 78 4242 pmaddwd xm7, xm2, xm11 4243 mova xm4, xm1 4244 paddd xm5, xm6 4245 paddd xm5, xm7 4246 psrad xm5, rndshift 4247 packssdw xm5, xm5 4248 packuswb xm5, xm5 4249 pextrw [dstq+dsq*0], xm5, 0 4250 pextrw [dstq+dsq*1], xm5, 1 4251 lea dstq, [dstq+dsq*2] 4252 sub hd, 2 4253 jg .dy1_w2_loop 4254 RET 4255%endif 4256.dy1_w4: 4257 mov myd, mym 4258 vbroadcasti128 m7, [base+rescale_mul] 4259 movzx t0d, t0b 4260 dec srcq 4261 movd xm15, t0d 4262 pmaddwd m8, m7 4263 vpbroadcastd m11, [base+pd_0x4000] 4264 vpbroadcastd xm15, xm15 4265 paddd m14, m8 ; mx+dx*[0-3] 4266 pand m8, m14, m10 4267 psrld m8, 6 4268 paddd xm15, xm8 4269 vpermq m8, m8, q3120 4270 movd r4d, xm15 4271 pextrd r6d, xm15, 2 4272 pextrd r11d, xm15, 1 4273 pextrd r13d, xm15, 3 4274 movd xm15, [base+subpel_filters+r4*8+2] 4275 vpbroadcastd m7, [base+subpel_filters+r6*8+2] 4276 movu xm2, [srcq+ssq*0] 4277 movu xm3, [srcq+ssq*2] 4278 vbroadcasti128 m5, 
[base+bdct_lb_dw] 4279 vpbroadcastq m6, [base+subpel_s_shuf2] 4280 pcmpeqd m8, m9 4281 psrld m14, 10 4282 pinsrd xm15, [base+subpel_filters+r11*8+2], 1 4283 vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 4284 vinserti128 m2, [srcq+ssq*1], 1 4285 vinserti128 m3, [srcq+ss3q ], 1 4286 lea srcq, [srcq+ssq*4] 4287 shr myd, 6 4288 mov r4d, 64 << 24 4289 lea myd, [t1+myq] 4290 cmovnz r4q, [base+subpel_filters+myq*8] 4291 pshufb m14, m5 4292 paddb m14, m6 4293 movu xm4, [srcq+ssq*0] 4294 movu xm5, [srcq+ssq*2] 4295 vinserti128 m4, [srcq+ssq*1], 1 4296 add srcq, ss3q 4297 vpblendd m15, m7, 0x30 4298 punpcklqdq m15, m15 4299 pblendvb m15, m11, m8 4300 movq xm10, r4q 4301 punpcklqdq xm10, xm10 4302 pmovsxbw m10, xm10 4303 pshufb m2, m14 4304 pshufb m3, m14 4305 pshufb m4, m14 4306 pshufb xm5, xm14 4307 vpermq m2, m2, q3120 4308 vpermq m3, m3, q3120 4309 vpermq m4, m4, q3120 4310 vpermq m5, m5, q3120 4311 pshufd m7, m10, q0000 4312 pshufd m8, m10, q1111 4313 pshufd m9, m10, q2222 4314 pshufd m10, m10, q3333 4315 pmaddubsw m2, m15 4316 pmaddubsw m3, m15 4317 pmaddubsw m4, m15 4318 pmaddubsw m5, m15 4319 phaddw m2, m3 4320 phaddw m4, m5 4321 pmulhrsw m2, m12 4322 pmulhrsw m4, m12 4323 palignr m5, m4, m2, 4 4324 pshufd m3, m4, q2121 4325 punpcklwd m0, m2, m5 ; 01 12 4326 punpckhwd m1, m2, m5 ; 23 34 4327 punpcklwd m2, m4, m3 ; 45 56 4328.dy1_w4_loop: 4329 movu xm11, [srcq+ssq*0] 4330 vinserti128 m11, [srcq+ssq*1], 1 4331 lea srcq, [srcq+ssq*2] 4332 pmaddwd m4, m0, m7 4333 pmaddwd m5, m1, m8 4334 pmaddwd m6, m2, m9 4335 mova m0, m1 4336 mova m1, m2 4337 paddd m4, m13 4338 paddd m5, m6 4339 pshufb m11, m14 4340 vpermq m11, m11, q3120 4341 pmaddubsw m11, m15 4342 phaddw m11, m11 4343 pmulhrsw m11, m12 4344 palignr m6, m11, m3, 12 4345 punpcklwd m2, m6, m11 ; 67 78 4346 mova m3, m11 4347 pmaddwd m6, m2, m10 4348 paddd m4, m5 4349 paddd m4, m6 4350 psrad m4, rndshift 4351 vextracti128 xm5, m4, 1 4352 packssdw xm4, xm5 4353%ifidn %1, put 4354 packuswb xm4, xm4 4355 pshuflw xm4, 
xm4, q3120 4356 movd [dstq+dsq*0], xm4 4357 pextrd [dstq+dsq*1], xm4, 1 4358 lea dstq, [dstq+dsq*2] 4359%else 4360 pshufd xm4, xm4, q3120 4361 mova [tmpq], xm4 4362 add tmpq, 16 4363%endif 4364 sub hd, 2 4365 jg .dy1_w4_loop 4366 MC_8TAP_SCALED_RET 4367.dy1_w8: 4368 mov dword [rsp+72], 1 4369 movifprep tmp_stridem, 16 4370 jmp .dy1_w_start 4371.dy1_w16: 4372 mov dword [rsp+72], 2 4373 movifprep tmp_stridem, 32 4374 jmp .dy1_w_start 4375.dy1_w32: 4376 mov dword [rsp+72], 4 4377 movifprep tmp_stridem, 64 4378 jmp .dy1_w_start 4379.dy1_w64: 4380 mov dword [rsp+72], 8 4381 movifprep tmp_stridem, 128 4382 jmp .dy1_w_start 4383.dy1_w128: 4384 mov dword [rsp+72], 16 4385 movifprep tmp_stridem, 256 4386.dy1_w_start: 4387 mov myd, mym 4388%ifidn %1, put 4389 movifnidn dsm, dsq 4390%endif 4391 shr t0d, 16 4392 sub srcq, 3 4393 shr myd, 6 4394 mov r4d, 64 << 24 4395 lea myd, [t1+myq] 4396 cmovnz r4q, [base+subpel_filters+myq*8] 4397 pmaddwd m8, [base+rescale_mul] 4398 movd xm15, t0d 4399 mov [rsp+76], t0d 4400 mov [rsp+80], srcq 4401 mov [rsp+88], r0q ; dstq / tmpq 4402%if UNIX64 4403 mov hm, hd 4404%endif 4405 shl dword dxm, 3 ; dx*8 4406 vpbroadcastd m15, xm15 4407 paddd m14, m8 ; mx+dx*[0-7] 4408 movq xm0, r4q 4409 pmovsxbw xm0, xm0 4410 mova [rsp+96], xm0 4411 jmp .dy1_hloop 4412.dy1_hloop_prep: 4413 dec dword [rsp+72] 4414 jz .ret 4415 add qword [rsp+88], 8*(isprep+1) 4416 mov hd, hm 4417 vpbroadcastd m8, dxm 4418 vpbroadcastd m10, [base+pd_0x3ff] 4419 paddd m14, m8, [rsp+32] 4420 vpbroadcastd m15, [rsp+76] 4421 pxor m9, m9 4422 mov srcq, [rsp+80] 4423 mov r0q, [rsp+88] ; dstq / tmpq 4424.dy1_hloop: 4425 vpbroadcastq m11, [base+pq_0x40000000] 4426 pand m6, m14, m10 4427 psrld m6, 6 4428 paddd m15, m6 4429 pcmpeqd m6, m9 4430 vextracti128 xm7, m15, 1 4431 movd r4d, xm15 4432 pextrd r6d, xm15, 2 4433 pextrd r7d, xm15, 1 4434 pextrd r9d, xm15, 3 4435 movd r10d, xm7 4436 pextrd r11d, xm7, 2 4437 pextrd r13d, xm7, 1 4438 pextrd rXd, xm7, 3 4439 movu [rsp+32], m14 4440 movq 
xm15, [base+subpel_filters+ r4*8] 4441 movq xm10, [base+subpel_filters+ r6*8] 4442 movhps xm15, [base+subpel_filters+ r7*8] 4443 movhps xm10, [base+subpel_filters+ r9*8] 4444 vinserti128 m15, [base+subpel_filters+r10*8], 1 4445 vinserti128 m10, [base+subpel_filters+r11*8], 1 4446 vpbroadcastq m9, [base+subpel_filters+r13*8] 4447 vpbroadcastq m8, [base+subpel_filters+ rX*8] 4448 psrld m14, 10 4449 vextracti128 xm7, m14, 1 4450 movq [rsp+64], xm14 4451 movd r4d, xm14 4452 pextrd r6d, xm14, 2 4453 pextrd r7d, xm14, 1 4454 pextrd r9d, xm14, 3 4455 movd r10d, xm7 4456 pextrd r11d, xm7, 2 4457 pextrd r13d, xm7, 1 4458 pextrd rXd, xm7, 3 4459 pshufd m5, m6, q1100 4460 pshufd m6, m6, q3322 4461 vpblendd m15, m9, 0xc0 4462 vpblendd m10, m8, 0xc0 4463 pblendvb m15, m11, m5 4464 pblendvb m10, m11, m6 4465 vbroadcasti128 m14, [base+subpel_s_shuf8] 4466 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b 4467 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b 4468 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b 4469 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b 4470 movu [rsp], m10 4471 vpbroadcastd m8, [rsp+0x60] 4472 vpbroadcastd m9, [rsp+0x64] 4473 vpbroadcastd m10, [rsp+0x68] 4474 vpbroadcastd m11, [rsp+0x6c] 4475 pshufb m0, m14 ; 01a 01b 4476 pshufb m1, m14 ; 23a 23b 4477 pshufb m2, m14 ; 45a 45b 4478 pshufb m3, m14 ; 67a 67b 4479 vbroadcasti128 m14, [base+wswap] 4480.dy1_vloop: 4481 pmaddwd m4, m0, m8 4482 pmaddwd m5, m1, m9 4483 pmaddwd m6, m2, m10 4484 pmaddwd m7, m3, m11 4485 paddd m4, m5 4486 paddd m6, m7 4487 paddd m4, m13 4488 paddd m4, m6 4489 psrad m4, rndshift 4490 vextracti128 xm5, m4, 1 4491 packssdw xm4, xm5 4492%ifidn %1, put 4493 packuswb xm4, xm4 4494 movq [dstq], xm4 4495 add dstq, dsm 4496%else 4497 mova [tmpq], xm4 4498 add tmpq, tmp_stridem 4499%endif 4500 dec hd 4501 jz .dy1_hloop_prep 4502 movq xm4, [srcq+ r4] 4503 movq xm5, [srcq+ r6] 4504 movhps xm4, [srcq+ r7] 4505 movhps xm5, [srcq+ r9] 4506 vinserti128 m4, [srcq+r10], 1 4507 
vinserti128 m5, [srcq+r11], 1 4508 vpbroadcastq m6, [srcq+r13] 4509 vpbroadcastq m7, [srcq+ rX] 4510 add srcq, ssq 4511 pshufb m0, m14 4512 pshufb m1, m14 4513 pshufb m2, m14 4514 pshufb m3, m14 4515 vpblendd m4, m6, 0xc0 4516 vpblendd m5, m7, 0xc0 4517 pmaddubsw m4, m15 4518 pmaddubsw m5, [rsp] 4519 phaddw m4, m5 4520 pslld m5, m4, 16 4521 paddw m4, m5 4522 pmulhrsw m4, m12 4523 pblendw m0, m1, 0xaa 4524 pblendw m1, m2, 0xaa 4525 pblendw m2, m3, 0xaa 4526 pblendw m3, m4, 0xaa 4527 jmp .dy1_vloop 4528.dy2: 4529 movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] 4530 add wq, base_reg 4531 jmp wq 4532%ifidn %1, put 4533.dy2_w2: 4534 mov myd, mym 4535 movzx t0d, t0b 4536 dec srcq 4537 movd xm15, t0d 4538 punpckldq m8, m9, m8 4539 paddd m14, m8 ; mx+dx*[0-1] 4540 vpbroadcastd m11, [base+pd_0x4000] 4541 vpbroadcastd xm15, xm15 4542 pand m8, m14, m10 4543 psrld m8, 6 4544 paddd xm15, xm8 4545 movd r4d, xm15 4546 pextrd r6d, xm15, 1 4547 vbroadcasti128 m5, [base+bdct_lb_dw] 4548 vbroadcasti128 m6, [base+subpel_s_shuf2] 4549 vpbroadcastd m15, [base+subpel_filters+r4*8+2] 4550 vpbroadcastd m7, [base+subpel_filters+r6*8+2] 4551 pcmpeqd m8, m9 4552 psrld m14, 10 4553 movq xm0, [srcq+ssq*0] 4554 vpbroadcastq m2, [srcq+ssq*1] 4555 movhps xm0, [srcq+ssq*2] 4556 vpbroadcastq m3, [srcq+ss3q ] 4557 lea srcq, [srcq+ssq*4] 4558 pshufb m14, m5 4559 paddb m14, m6 4560 vpblendd m15, m7, 0xaa 4561 pblendvb m15, m11, m8 4562 movhps xm1, [srcq+ssq*0] 4563 vpbroadcastq m4, [srcq+ssq*1] 4564 lea srcq, [srcq+ssq*2] 4565 shr myd, 6 4566 mov r4d, 64 << 24 4567 lea myd, [t1+myq] 4568 cmovnz r4q, [base+subpel_filters+myq*8] 4569 vpblendd m0, m2, 0x30 4570 vpblendd m1, m4, 0xc0 4571 vpblendd m0, m3, 0xc0 4572 pshufb m0, m14 4573 pshufb m1, m14 4574 pmaddubsw m0, m15 4575 pmaddubsw m1, m15 4576 movq xm11, r4q 4577 pmovsxbw xm11, xm11 4578 phaddw m0, m1 4579 pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 4580 pshufd xm8, xm11, q0000 4581 pshufd xm9, xm11, q1111 4582 pshufd xm10, xm11, q2222 4583 pshufd 
xm11, xm11, q3333 4584 pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 4585 vextracti128 xm1, m2, 1 4586 punpcklwd xm3, xm2, xm1 ; 01 23 4587 punpckhwd xm2, xm1 ; 23 45 4588.dy2_w2_loop: 4589 movq xm6, [srcq+ssq*0] 4590 vpbroadcastq m7, [srcq+ssq*1] 4591 movhps xm6, [srcq+ssq*2] 4592 vpbroadcastq m1, [srcq+ss3q ] 4593 lea srcq, [srcq+ssq*4] 4594 pmaddwd xm4, xm3, xm8 4595 pmaddwd xm5, xm2, xm9 4596 vpblendd m6, m7, 0x30 4597 vpblendd m6, m1, 0xc0 4598 pshufb m6, m14 4599 pmaddubsw m6, m15 4600 phaddw m6, m6 4601 pmulhrsw m6, m12 4602 palignr m0, m6, m0, 8 4603 pshufd m2, m0, q3221 4604 vextracti128 xm1, m2, 1 4605 punpcklwd xm3, xm2, xm1 ; 45 67 4606 punpckhwd xm2, xm1 ; 67 89 4607 pmaddwd xm6, xm3, xm10 4608 pmaddwd xm7, xm2, xm11 4609 paddd xm4, xm5 4610 paddd xm4, xm13 4611 paddd xm6, xm7 4612 paddd xm4, xm6 4613 psrad xm4, rndshift 4614 packssdw xm4, xm4 4615 packuswb xm4, xm4 4616 pextrw [dstq+dsq*0], xm4, 0 4617 pextrw [dstq+dsq*1], xm4, 1 4618 lea dstq, [dstq+dsq*2] 4619 sub hd, 2 4620 jg .dy2_w2_loop 4621 RET 4622%endif 4623.dy2_w4: 4624 mov myd, mym 4625 vbroadcasti128 m7, [base+rescale_mul] 4626 movzx t0d, t0b 4627 dec srcq 4628 movd xm15, t0d 4629 pmaddwd m8, m7 4630 vpbroadcastd m11, [base+pd_0x4000] 4631 vpbroadcastd xm15, xm15 4632 paddd m14, m8 ; mx+dx*[0-3] 4633 pand m8, m14, m10 4634 psrld m8, 6 4635 paddd xm15, xm8 4636 movd r4d, xm15 4637 pextrd r6d, xm15, 1 4638 pextrd r11d, xm15, 2 4639 pextrd r13d, xm15, 3 4640 movd xm15, [base+subpel_filters+r4*8+2] 4641 vbroadcasti128 m5, [base+bdct_lb_dw] 4642 vpbroadcastq m6, [base+subpel_s_shuf2] 4643 pinsrd xm15, [base+subpel_filters+r6*8+2], 1 4644 pcmpeqd m8, m9 4645 psrld m14, 10 4646 movu xm0, [srcq+ssq*0] 4647 movu xm2, [srcq+ssq*2] 4648 pinsrd xm15, [base+subpel_filters+r11*8+2], 2 4649 movu xm1, [srcq+ssq*1] 4650 movu xm3, [srcq+ss3q ] 4651 pinsrd xm15, [base+subpel_filters+r13*8+2], 3 4652 lea srcq, [srcq+ssq*4] 4653 shr myd, 6 4654 mov r4d, 64 << 24 4655 lea myd, [t1+myq] 4656 cmovnz r4q, 
[base+subpel_filters+myq*8] 4657 vinserti128 m15, xm15, 1 4658 pshufb m14, m5 4659 paddb m14, m6 4660 vinserti128 m2, [srcq+ssq*0], 1 4661 vinserti128 m3, [srcq+ssq*1], 1 4662 lea srcq, [srcq+ssq*2] 4663 pblendvb m15, m11, m8 4664 pshufb xm0, xm14 4665 pshufb m2, m14 4666 pshufb xm1, xm14 4667 pshufb m3, m14 4668 pmaddubsw xm0, xm15 4669 pmaddubsw m2, m15 4670 pmaddubsw xm1, xm15 4671 pmaddubsw m3, m15 4672 movq xm11, r4q 4673 punpcklqdq xm11, xm11 4674 pmovsxbw m11, xm11 4675 phaddw m0, m2 4676 phaddw m1, m3 4677 pmulhrsw m0, m12 ; 0 2 _ 4 4678 pmulhrsw m1, m12 ; 1 3 _ 5 4679 pshufd m8, m11, q0000 4680 pshufd m9, m11, q1111 4681 pshufd m10, m11, q2222 4682 pshufd m11, m11, q3333 4683 punpcklwd xm2, xm0, xm1 4684 punpckhwd m1, m0, m1 ; 23 45 4685 vinserti128 m0, m2, xm1, 1 ; 01 23 4686.dy2_w4_loop: 4687 movu xm6, [srcq+ssq*0] 4688 movu xm7, [srcq+ssq*1] 4689 vinserti128 m6, [srcq+ssq*2], 1 4690 vinserti128 m7, [srcq+ss3q ], 1 4691 lea srcq, [srcq+ssq*4] 4692 pmaddwd m4, m0, m8 4693 pmaddwd m5, m1, m9 4694 pshufb m6, m14 4695 pshufb m7, m14 4696 pmaddubsw m6, m15 4697 pmaddubsw m7, m15 4698 psrld m2, m6, 16 4699 pslld m3, m7, 16 4700 paddw m6, m2 4701 paddw m7, m3 4702 pblendw m6, m7, 0xaa ; 67 89 4703 pmulhrsw m6, m12 4704 paddd m4, m5 4705 vperm2i128 m0, m1, m6, 0x21 ; 45 67 4706 mova m1, m6 4707 pmaddwd m6, m0, m10 4708 pmaddwd m7, m1, m11 4709 paddd m4, m13 4710 paddd m6, m7 4711 paddd m4, m6 4712 psrad m4, rndshift 4713 vextracti128 xm5, m4, 1 4714 packssdw xm4, xm5 4715%ifidn %1, put 4716 packuswb xm4, xm4 4717 movd [dstq+dsq*0], xm4 4718 pextrd [dstq+dsq*1], xm4, 1 4719 lea dstq, [dstq+dsq*2] 4720%else 4721 mova [tmpq], xm4 4722 add tmpq, 16 4723%endif 4724 sub hd, 2 4725 jg .dy2_w4_loop 4726 MC_8TAP_SCALED_RET 4727.dy2_w8: 4728 mov dword [rsp+40], 1 4729 movifprep tmp_stridem, 16 4730 jmp .dy2_w_start 4731.dy2_w16: 4732 mov dword [rsp+40], 2 4733 movifprep tmp_stridem, 32 4734 jmp .dy2_w_start 4735.dy2_w32: 4736 mov dword [rsp+40], 4 4737 movifprep 
tmp_stridem, 64 4738 jmp .dy2_w_start 4739.dy2_w64: 4740 mov dword [rsp+40], 8 4741 movifprep tmp_stridem, 128 4742 jmp .dy2_w_start 4743.dy2_w128: 4744 mov dword [rsp+40], 16 4745 movifprep tmp_stridem, 256 4746.dy2_w_start: 4747 mov myd, mym 4748%ifidn %1, put 4749 movifnidn dsm, dsq 4750%endif 4751 shr t0d, 16 4752 sub srcq, 3 4753 shr myd, 6 4754 mov r4d, 64 << 24 4755 lea myd, [t1+myq] 4756 cmovnz r4q, [base+subpel_filters+myq*8] 4757 pmaddwd m8, [base+rescale_mul] 4758 movd xm15, t0d 4759 mov [rsp+64], t0d 4760 mov [rsp+48], srcq 4761 mov [rsp+56], r0q ; dstq / tmpq 4762%if UNIX64 4763 mov hm, hd 4764%endif 4765 shl dword dxm, 3 ; dx*8 4766 vpbroadcastd m15, xm15 4767 paddd m14, m8 ; mx+dx*[0-7] 4768 movq xm0, r4q 4769 pmovsxbw xm0, xm0 4770 mova [rsp+0x50], xm0 4771 jmp .dy2_hloop 4772.dy2_hloop_prep: 4773 dec dword [rsp+40] 4774 jz .ret 4775 add qword [rsp+56], 8*(isprep+1) 4776 mov hd, hm 4777 vpbroadcastd m8, dxm 4778 vpbroadcastd m10, [base+pd_0x3ff] 4779 paddd m14, m8, [rsp] 4780 vpbroadcastd m15, [rsp+64] 4781 pxor m9, m9 4782 mov srcq, [rsp+48] 4783 mov r0q, [rsp+56] ; dstq / tmpq 4784.dy2_hloop: 4785 vpbroadcastq m11, [base+pq_0x40000000] 4786 pand m6, m14, m10 4787 psrld m6, 6 4788 paddd m15, m6 4789 pcmpeqd m6, m9 4790 vextracti128 xm7, m15, 1 4791 movd r4d, xm15 4792 pextrd r6d, xm15, 2 4793 pextrd r7d, xm15, 1 4794 pextrd r9d, xm15, 3 4795 movd r10d, xm7 4796 pextrd r11d, xm7, 2 4797 pextrd r13d, xm7, 1 4798 pextrd rXd, xm7, 3 4799 movu [rsp], m14 4800 movq xm15, [base+subpel_filters+ r4*8] 4801 movq xm10, [base+subpel_filters+ r6*8] 4802 movhps xm15, [base+subpel_filters+ r7*8] 4803 movhps xm10, [base+subpel_filters+ r9*8] 4804 vinserti128 m15, [base+subpel_filters+r10*8], 1 4805 vinserti128 m10, [base+subpel_filters+r11*8], 1 4806 vpbroadcastq m9, [base+subpel_filters+r13*8] 4807 vpbroadcastq m8, [base+subpel_filters+ rX*8] 4808 psrld m14, 10 4809 vextracti128 xm7, m14, 1 4810 movd r4d, xm14 4811 pextrd r6d, xm14, 2 4812 pextrd r7d, xm14, 1 
4813 pextrd r9d, xm14, 3 4814 movd r10d, xm7 4815 pextrd r11d, xm7, 2 4816 pextrd r13d, xm7, 1 4817 pextrd rXd, xm7, 3 4818 pshufd m5, m6, q1100 4819 pshufd m6, m6, q3322 4820 vpblendd m15, m9, 0xc0 4821 vpblendd m10, m8, 0xc0 4822 pblendvb m15, m11, m5 4823 pblendvb m10, m11, m6 4824 vbroadcasti128 m14, [base+subpel_s_shuf8] 4825 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b 4826 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b 4827 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b 4828 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b 4829 vpbroadcastd m8, [rsp+0x50] 4830 vpbroadcastd m9, [rsp+0x54] 4831 vpbroadcastd m11, [rsp+0x58] 4832 vpbroadcastd m4, [rsp+0x5c] 4833 pshufb m0, m14 ; 01a 01b 4834 pshufb m1, m14 ; 23a 23b 4835 pshufb m2, m14 ; 45a 45b 4836 pshufb m3, m14 ; 67a 67b 4837 SWAP m14, m4 4838.dy2_vloop: 4839 pmaddwd m4, m0, m8 4840 pmaddwd m5, m1, m9 4841 pmaddwd m6, m2, m11 4842 pmaddwd m7, m3, m14 4843 paddd m4, m5 4844 paddd m6, m7 4845 paddd m4, m13 4846 paddd m4, m6 4847 psrad m4, rndshift 4848 vextracti128 xm5, m4, 1 4849 packssdw xm4, xm5 4850%ifidn %1, put 4851 packuswb xm4, xm4 4852 movq [dstq], xm4 4853 add dstq, dsm 4854%else 4855 mova [tmpq], xm4 4856 add tmpq, tmp_stridem 4857%endif 4858 dec hd 4859 jz .dy2_hloop_prep 4860 mova m0, m1 4861 mova m1, m2 4862 mova m2, m3 4863 movq xm3, [srcq+ r4] 4864 movq xm4, [srcq+ r6] 4865 movhps xm3, [srcq+ r7] 4866 movhps xm4, [srcq+ r9] 4867 vinserti128 m3, [srcq+r10], 1 4868 vinserti128 m4, [srcq+r11], 1 4869 vpbroadcastq m5, [srcq+r13] 4870 vpbroadcastq m6, [srcq+ rX] 4871 add srcq, ssq 4872 vpblendd m3, m5, 0xc0 4873 vpblendd m4, m6, 0xc0 4874 pmaddubsw m3, m15 4875 pmaddubsw m4, m10 4876 phaddw m3, m4 4877 movq xm4, [srcq+ r4] 4878 movq xm5, [srcq+ r6] 4879 movhps xm4, [srcq+ r7] 4880 movhps xm5, [srcq+ r9] 4881 vinserti128 m4, [srcq+r10], 1 4882 vinserti128 m5, [srcq+r11], 1 4883 vpbroadcastq m6, [srcq+r13] 4884 vpbroadcastq m7, [srcq+ rX] 4885 add srcq, ssq 4886 vpblendd 
; ---------------------------------------------------------------------
; Tail of the MC_8TAP_SCALED macro (.dy2 vertical-loop row refill),
; followed by the BILIN_SCALED_FN stub macro and the instantiation of
; all put/prep 8tap_scaled entry points.
; NOTE(review): this dump splits a statement here — the operand list on
; the first line belongs to a "vpblendd" at the end of the previous line.
; ---------------------------------------------------------------------
m4, m6, 0xc0
    vpblendd     m5, m7, 0xc0           ; merge the two vpbroadcastq rows into the top lane
    pmaddubsw    m4, m15                ; horizontal 8-tap with first filter set
    pmaddubsw    m5, m10                ; horizontal 8-tap with second filter set
    phaddw       m4, m5
    psrld        m5, m3, 16
    pslld        m6, m4, 16
    paddw        m3, m5                 ; fold adjacent partial sums together
    paddw        m4, m6
    pblendw      m3, m4, 0xaa           ; interleave old/new rows -> refreshed 67 context
    pmulhrsw     m3, m12                ; round to intermediate precision
    jmp .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%undef isprep
%endmacro

; Bilinear-scaled stub: %1 = put/prep. Loads t0/t1 with 5*15 packed in
; both 16-bit halves (the mx/my filter-index encoding used by the scaled
; 8tap code — presumably the bilinear coefficient slot; verify against
; subpel_filters layout) and tail-calls the shared 8tap_scaled entry.
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_8bpc
    mov          t0d, (5*15 << 16) | 5*15
    mov          t1d, t0d
    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro

; Scratch registers differ per ABI (x86inc's DECLARE_REG_TMP).
%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif

%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,

; put variants: one entry point per (horizontal, vertical) filter pair.
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; prep variants (intermediate int16 output instead of pixels).
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_8bpc
; NOTE(review): dump splits here; this macro's arguments are on the next line.
PREP_8TAP_SCALED_FN
; NOTE(review): continuation — the macro name for this argument list is
; at the end of the previous line of this dump.
smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

; WARP_V: vertical pass of the 8x8 affine warp.
; %1 = accumulator dst, %2 = rows 0/2, %3 = rows 4/6, %4 = rows 1/3,
; %5 = rows 5/7. Fetches one 8-tap vertical filter per output column
; (filter index = my >> 10; my advances by delta per column and by gamma
; at the end), interleaves the coefficients so a single pmaddwd chain
; covers all four columns of both lanes, and sums into m%1.
; m11 is zero here (punpck*bw against it gives coefficients << 8).
%macro WARP_V 5 ; dst, 02, 46, 13, 57
    ; Can be done using gathers, but that's terribly slow on many CPUs
    lea          tmp1d, [myq+deltaq*4]
    lea          tmp2d, [myq+deltaq*1]
    shr          myd, 10                ; my >> 10 = filter table index
    shr          tmp1d, 10
    movq         xm8, [filterq+myq *8]
    vinserti128  m8, [filterq+tmp1q*8], 1 ; a e
    lea          tmp1d, [tmp2q+deltaq*4]
    lea          myd, [tmp2q+deltaq*1]
    shr          tmp2d, 10
    shr          tmp1d, 10
    movq         xm0, [filterq+tmp2q*8]
    vinserti128  m0, [filterq+tmp1q*8], 1 ; b f
    lea          tmp1d, [myq+deltaq*4]
    lea          tmp2d, [myq+deltaq*1]
    shr          myd, 10
    shr          tmp1d, 10
    movq         xm9, [filterq+myq *8]
    vinserti128  m9, [filterq+tmp1q*8], 1 ; c g
    lea          tmp1d, [tmp2q+deltaq*4]
    lea          myd, [tmp2q+gammaq]    ; my += gamma
    shr          tmp2d, 10
    shr          tmp1d, 10
    punpcklwd    m8, m0
    movq         xm0, [filterq+tmp2q*8]
    vinserti128  m0, [filterq+tmp1q*8], 1 ; d h
    punpcklwd    m0, m9, m0
    punpckldq    m9, m8, m0
    punpckhdq    m0, m8, m0
    punpcklbw    m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw    m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    pmaddwd      m%2, m8
    pmaddwd      m9, m%3
    punpcklbw    m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw    m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd      m8, m%4
    pmaddwd      m0, m%5
    paddd        m%2, m9
    paddd        m0, m8
    paddd        m%1, m0, m%2
%endmacro

; warp_affine_8x8t: prep-precision variant of the 8x8 affine warp.
; Stores rounded int16 intermediates to tmp (stride ts in units of 2
; rows per store pair) and reuses warp_affine_8x8's .main/.main2/.end.
cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
%if WIN64
    sub          rsp, 0xa0              ; matches the stack frame set up by .main
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
.loop:
    psrad        m7, 13
    psrad        m0, 13
    packssdw     m7, m0
    pmulhrsw     m7, m14                ; (x + (1 << 6)) >> 7
    vpermq       m7, m7, q3120
    mova         [tmpq+tsq*0], xm7
    vextracti128 [tmpq+tsq*2], m7, 1
    dec          r4d
    ; NOTE(review): dump splits here; the jump target continues on the
    ; next line ("..._warp_affine_8x8_8bpc_avx2).end").
    jz mangle(private_prefix %+
; NOTE(review): continuation — "jz mangle(private_prefix %+" ends the
; previous line of this dump.
_warp_affine_8x8_8bpc_avx2).end
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
    lea          tmpq, [tmpq+tsq*4]
    jmp .loop

; 8x8 affine warp producing 8-bit pixels.
; abcd points at four int16 deltas {alpha, beta, delta, gamma} (loaded
; with movsx below); mx/my are the starting positions; filter rows come
; from mc_warp_filter2. Two output rows are produced per .main2 call,
; four iterations total (r4d = 4).
cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
                                        beta, filter, tmp1, delta, my, gamma
%if WIN64
    %assign xmm_regs_used 16
    %assign stack_size_padded 0xa0
    SUB          rsp, stack_size_padded
%endif
    call .main
    jmp .start
.loop:
    call .main2
    lea          dstq, [dstq+dsq*2]
.start:
    psrad        m7, 18
    psrad        m0, 18
    packusdw     m7, m0
    pavgw        m7, m11                ; (x + (1 << 10)) >> 11
    vextracti128 xm0, m7, 1
    packuswb     xm7, xm0
    pshufd       xm7, xm7, q3120
    movq         [dstq+dsq*0], xm7
    movhps       [dstq+dsq*1], xm7
    dec          r4d
    jg .loop
.end:
    RET
ALIGN function_align
.main:
    ; Stack is offset due to call
    %assign stack_offset stack_offset + gprsize
    %assign stack_size stack_size + gprsize
    %assign stack_size_padded stack_size_padded + gprsize
    movifnidn    abcdq, abcdmp
    movifnidn    mxd, mxm
    WIN64_PUSH_XMM
    movsx        alphad, word [abcdq+2*0]
    movsx        betad, word [abcdq+2*1]
    mova         m12, [warp_8x8_shufA]
    mova         m13, [warp_8x8_shufB]
    vpbroadcastd m14, [pw_8192]
    vpbroadcastd m15, [pd_32768]
    pxor         m11, m11               ; zero, used for coefficient unpacking and pavgw bias
    lea          filterq, [mc_warp_filter2]
    lea          tmp1q, [ssq*3+3]
    add          mxd, 512+(64<<10)
    lea          tmp2d, [alphaq*3]
    sub          srcq, tmp1q            ; src -= src_stride*3 + 3
    sub          betad, tmp2d           ; beta -= alpha*3
    mov          myd, r6m
    ; Prime the 7-row vertical context: each .h call filters one source
    ; row; rows are interleaved pairwise into m1..m5 (labels = row pairs).
    call .h
    psrld        m1, m0, 16
    call .h
    psrld        m4, m0, 16
    call .h
    pblendw      m1, m0, 0xaa ; 02
    call .h
    pblendw      m4, m0, 0xaa ; 13
    call .h
    psrld        m2, m1, 16
    pblendw      m2, m0, 0xaa ; 24
    call .h
    psrld        m5, m4, 16
    pblendw      m5, m0, 0xaa ; 35
    call .h
    psrld        m3, m2, 16
    pblendw      m3, m0, 0xaa ; 46
    movsx        deltad, word [abcdq+2*2]
    movsx        gammad, word [abcdq+2*3]
    add          myd, 512+(64<<10)
    mov          r4d, 4                 ; 4 iterations x 2 rows = 8 output rows
    ; (tail of warp_affine_8x8 .main)
    lea          tmp1d, [deltaq*3]
    sub          gammad, tmp1d          ; gamma -= delta*3
.main2:
    ; Produce two more output rows: shift the row window down by two,
    ; running WARP_V on each phase of the interleaved context.
    call .h
    psrld        m6, m5, 16
    pblendw      m6, m0, 0xaa ; 57
    WARP_V       7, 1, 3, 4, 6
    call .h
    mova         m1, m2
    mova         m2, m3
    psrld        m3, 16
    pblendw      m3, m0, 0xaa ; 68
    WARP_V       0, 4, 6, 1, 3
    mova         m4, m5
    mova         m5, m6
    ret
ALIGN function_align
.h:
    ; Horizontal pass for one source row: per-column 8-tap filter chosen
    ; by mx >> 10 (mx advances by alpha per column, beta per row).
    ; Returns the filtered row in m0; advances srcq by one stride.
    lea          tmp1d, [mxq+alphaq*4]
    lea          tmp2d, [mxq+alphaq*1]
    vbroadcasti128 m10, [srcq]
    shr          mxd, 10
    shr          tmp1d, 10
    movq         xm8, [filterq+mxq *8]
    vinserti128  m8, [filterq+tmp1q*8], 1
    lea          tmp1d, [tmp2q+alphaq*4]
    lea          mxd, [tmp2q+alphaq*1]
    shr          tmp2d, 10
    shr          tmp1d, 10
    movq         xm0, [filterq+tmp2q*8]
    vinserti128  m0, [filterq+tmp1q*8], 1
    lea          tmp1d, [mxq+alphaq*4]
    lea          tmp2d, [mxq+alphaq*1]
    shr          mxd, 10
    shr          tmp1d, 10
    movq         xm9, [filterq+mxq *8]
    vinserti128  m9, [filterq+tmp1q*8], 1
    lea          tmp1d, [tmp2q+alphaq*4]
    lea          mxd, [tmp2q+betaq]     ; mx += beta
    shr          tmp2d, 10
    shr          tmp1d, 10
    punpcklqdq   m8, m0 ; 0 1 4 5
    movq         xm0, [filterq+tmp2q*8]
    vinserti128  m0, [filterq+tmp1q*8], 1
    punpcklqdq   m9, m0 ; 2 3 6 7
    pshufb       m0, m10, m12           ; gather source bytes for even taps
    pmaddubsw    m0, m8
    pshufb       m10, m13               ; gather source bytes for odd taps
    pmaddubsw    m10, m9
    add          srcq, ssq
    phaddw       m0, m10
    pmaddwd      m0, m14 ; 17-bit intermediate, upshifted by 13
    paddd        m0, m15 ; rounded 14-bit result in upper 16 bits of dword
    ret

; BIDIR_FN: shared store loop for the bidirectional compositors (avg,
; w_avg, mask). %1 is an op macro that leaves 32 packed pixels in m0
; read from the tmp buffers at a given offset; %1_INC_PTR advances the
; source pointers. Dispatch is via jmp wq into per-width entry points.
%macro BIDIR_FN 1 ; op
    %1           0
    lea          stride3q, [strideq*3]
    jmp          wq
.w4:
    vextracti128 xm1, m0, 1
    movd         [dstq          ], xm0
    pextrd       [dstq+strideq*1], xm0, 1
    movd         [dstq+strideq*2], xm1
    pextrd       [dstq+stride3q ], xm1, 1
    cmp          hd, 4
    je .ret
    lea          dstq, [dstq+strideq*4]
    pextrd       [dstq          ], xm0, 2
    pextrd       [dstq+strideq*1], xm0, 3
    pextrd       [dstq+strideq*2], xm1, 2
    pextrd       [dstq+stride3q ], xm1, 3
    cmp          hd, 8
    je .ret
    %1           2
    lea          dstq, [dstq+strideq*4]
    vextracti128 xm1, m0, 1
    ; NOTE(review): dump splits here; this movd's operands are on the
    ; next line.
    movd
; NOTE(review): continuation — the "movd" mnemonic for this operand list
; is at the end of the previous line of this dump.
[dstq          ], xm0
    pextrd       [dstq+strideq*1], xm0, 1
    movd         [dstq+strideq*2], xm1
    pextrd       [dstq+stride3q ], xm1, 1
    lea          dstq, [dstq+strideq*4]
    pextrd       [dstq          ], xm0, 2
    pextrd       [dstq+strideq*1], xm0, 3
    pextrd       [dstq+strideq*2], xm1, 2
    pextrd       [dstq+stride3q ], xm1, 3
.ret:
    RET
.w8_loop:
    %1_INC_PTR   2
    %1           0
    lea          dstq, [dstq+strideq*4]
.w8:
    ; 8 pixels x 4 rows per iteration.
    vextracti128 xm1, m0, 1
    movq         [dstq          ], xm0
    movq         [dstq+strideq*1], xm1
    movhps       [dstq+strideq*2], xm0
    movhps       [dstq+stride3q ], xm1
    sub          hd, 4
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR   4
    %1           0
    lea          dstq, [dstq+strideq*4]
.w16:
    ; 16 pixels x 4 rows per iteration (two op invocations).
    vpermq       m0, m0, q3120          ; undo the lane interleave from packuswb
    mova         [dstq          ], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    %1           2
    vpermq       m0, m0, q3120
    mova         [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    sub          hd, 4
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR   4
    %1           0
    lea          dstq, [dstq+strideq*2]
.w32:
    vpermq       m0, m0, q3120
    mova         [dstq+strideq*0], m0
    %1           2
    vpermq       m0, m0, q3120
    mova         [dstq+strideq*1], m0
    sub          hd, 2
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR   4
    %1           0
    add          dstq, strideq
.w64:
    vpermq       m0, m0, q3120
    mova         [dstq], m0
    %1           2
    vpermq       m0, m0, q3120
    mova         [dstq+32], m0
    dec          hd
    jg .w64_loop
    RET
.w128_loop:
    %1           0
    add          dstq, strideq
.w128:
    ; One full 128-pixel row per iteration; pointers are advanced in the
    ; middle, so the last two op offsets are negative.
    vpermq       m0, m0, q3120
    mova         [dstq+0*32], m0
    %1           2
    vpermq       m0, m0, q3120
    mova         [dstq+1*32], m0
    %1_INC_PTR   8
    %1           -4
    vpermq       m0, m0, q3120
    mova         [dstq+2*32], m0
    %1           -2
    vpermq       m0, m0, q3120
    mova         [dstq+3*32], m0
    dec          hd
    jg .w128_loop
    RET
%endmacro

; AVG op for BIDIR_FN: (tmp1 + tmp2) rounded back to 8-bit via
; pmulhrsw with the scale preloaded in m2 (pw_1024).
%macro AVG 1 ; src_offset
    mova         m0, [tmp1q+(%1+0)*32]
    paddw        m0, [tmp2q+(%1+0)*32]
    mova         m1, [tmp1q+(%1+1)*32]
    paddw        m1, [tmp2q+(%1+1)*32]
    pmulhrsw     m0, m2
    pmulhrsw     m1, m2
    packuswb     m0, m1
%endmacro
; Advance both intermediate-buffer pointers by %1*32 bytes
; (one 32-byte ymm row per unit); used by BIDIR_FN as AVG's pointer step.
%macro AVG_INC_PTR 1
    add               tmp1q, %1*32
    add               tmp2q, %1*32
%endmacro

; dst = (tmp1 + tmp2 + rounding) >> shift, via pmulhrsw with pw_1024.
; tmp1/tmp2 hold 16-bit intermediate predictions (presumably compensated
; prediction values — NOTE(review): confirm against the C reference).
; Dispatch on log2(width) through the per-width jump table; BIDIR_FN
; supplies the store loops.
cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg %+ SUFFIX %+ _table
    lea                  r6, [avg %+ SUFFIX %+ _table]
    tzcnt                wd, wm               ; wd = log2(w) -> table index
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4]  ; table entries are 32-bit offsets
    vpbroadcastd         m2, [base+pw_1024]   ; pmulhrsw rounding/scale factor
    add                  wq, r6               ; absolute jump target
    BIDIR_FN             AVG

%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
    mova                 m0, [tmp1q+(%1+0)*32]
    psubw                m2, m0, [tmp2q+(%1+0)*32]
    mova                 m1, [tmp1q+(%1+1)*32]
    psubw                m3, m1, [tmp2q+(%1+1)*32]
    pmulhw               m2, m4               ; high 16 bits of (a-b)*(weight<<12)
    pmulhw               m3, m4
    paddw                m0, m2
    paddw                m1, m3
    pmulhrsw             m0, m5               ; + rounding, >> 4 (pw_2048)
    pmulhrsw             m1, m5
    packuswb             m0, m1               ; clamp to 8-bit
%endmacro

; W_AVG steps through the buffers exactly like AVG does.
%define W_AVG_INC_PTR AVG_INC_PTR

; Weighted average of two intermediate buffers; weight is the 7th argument
; (r6m). For weight <= 7 the operands are swapped and the weight negated so
; the multiplier magnitude stays in pmulhw's useful range.
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg %+ SUFFIX %+ _table
    lea                  r6, [w_avg %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m4, r6m ; weight
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
    add                  wq, r6
    cmp           dword r6m, 7
    jg .weight_gt7
    ; low weight: swap tmp1/tmp2 and negate the multiplier (4th identity above)
    mov                  r6, tmp1q
    pxor                 m0, m0
    mov               tmp1q, tmp2q
    psubw                m4, m0, m4 ; -weight
    mov               tmp2q, r6
.weight_gt7:
    BIDIR_FN             W_AVG

; Per-pixel masked blend of two intermediate buffers; 8-bit mask values
; are expanded to a signed 16-bit multiplier (-m << 9) for pmulhw.
; Continues past this span with the odd half and rounding.
%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
    vpermq               m3, [maskq+%1*16], q3120
    mova                 m0, [tmp2q+(%1+0)*32]
    psubw                m1, m0, [tmp1q+(%1+0)*32]
    psubb                m3, m4, m3           ; m4 = 0, so m3 = -m (bytes)
    paddw                m1, m1 ; (b - a) << 1
    paddb                m3, m3
    punpcklbw            m2, m4, m3 ; -m << 9
    pmulhw               m1, m2
    paddw                m0, m1
    ; (continuation of the MASK macro: second 32-byte row, then round/pack)
    mova                 m1, [tmp2q+(%1+1)*32]
    psubw                m2, m1, [tmp1q+(%1+1)*32]
    paddw                m2, m2
    punpckhbw            m3, m4, m3           ; -m << 9 for the high bytes
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw             m0, m5               ; + rounding, >> 4 (pw_2048)
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

; Advance mask (8-bit, 16 bytes/row) and both tmp pointers (16-bit, 32 bytes/row).
%macro MASK_INC_PTR 1
    add               maskq, %1*16
    add               tmp2q, %1*32
    add               tmp1q, %1*32
%endmacro

; Masked blend of two intermediate buffers using an explicit per-pixel mask
; (7th argument). m4 = 0 is used by MASK to negate/widen mask bytes.
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask %+ SUFFIX %+ _table
    lea                  r7, [mask %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    pxor                 m4, m4
    add                  wq, r7
    BIDIR_FN             MASK

; Blend two rows and simultaneously derive the blend mask from the absolute
; difference of the predictions: mask = (pw_6903 - |a-b|) >> 8, i.e. larger
; differences give smaller weights. With %5 != 0 (4:4:4) the raw per-pixel
; mask (64 - m) is packed and output; otherwise pairs are phaddw'd for
; later 4:2:x subsampling.
%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
    mova                m%1, [tmp1q+32*%3]
    mova                 m1, [tmp2q+32*%3]
    psubw                m1, m%1
    pabsw               m%2, m1
    psubusw             m%2, m6, m%2          ; saturating: floor at 0
    psrlw               m%2, 8 ; 64 - m
    psllw                m2, m%2, 10
    pmulhw               m1, m2
    paddw               m%1, m1
    mova                 m1, [tmp1q+32*%4]
    mova                 m2, [tmp2q+32*%4]
    psubw                m2, m1
    pabsw                m3, m2
    psubusw              m3, m6, m3
    psrlw                m3, 8
%if %5
    packuswb            m%2, m3
    psubb               m%2, m5, m%2          ; m5 = pb_64: convert to m
    vpermq              m%2, m%2, q3120       ; undo packuswb lane interleave
%else
    phaddw              m%2, m3
%endif
    psllw                m3, 10
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw            m%1, m7               ; pw_2048 rounding
    pmulhrsw             m1, m7
    packuswb            m%1, m1
%endmacro

; OBMC-style blend: dst = (dst*(64-m) + tmp*m + 512) >> 10 via pmaddubsw
; on interleaved (64-m, m) byte pairs. tmp is pre-biased by maskq so that
; [maskq+tmpq] addresses the tmp row alongside the mask row.
cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx2_table
    lea                  r6, [blend_avx2_table]
    tzcnt                wd, wm
    movifnidn         maskq, maskmp
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m4, [base+pb_64]
    vpbroadcastd         m5, [base+pw_512]
    sub                tmpq, maskq            ; so [maskq+tmpq] = tmp row
    add                  wq, r6
    lea                  r6, [dsq*3]          ; r6 = 3*stride
    jmp                  wq
.w4:
    ; 4 rows of 4 pixels per iteration
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    vpbroadcastd        xm1, [dstq+dsq*2]
    pinsrd              xm1, [dstq+r6   ], 3
    mova                xm6, [maskq]
    psubb               xm3, xm4, xm6         ; 64 - m
    punpcklbw           xm2, xm3, xm6         ; interleave (64-m, m)
    punpckhbw           xm3, xm6
    mova                xm6, [maskq+tmpq]     ; tmp pixels
    add               maskq, 4*4
    punpcklbw           xm0, xm6              ; interleave (dst, tmp)
    punpckhbw           xm1, xm6
    pmaddubsw           xm0, xm2              ; dst*(64-m) + tmp*m
    pmaddubsw           xm1, xm3
    pmulhrsw            xm0, xm5              ; (+512) >> 10
    pmulhrsw            xm1, xm5
    packuswb            xm0, xm1
    movd   [dstq+dsq*0], xm0
    pextrd [dstq+dsq*1], xm0, 1
    pextrd [dstq+dsq*2], xm0, 2
    pextrd [dstq+r6   ], xm0, 3
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w4
    RET
ALIGN function_align
.w8:
    ; 4 rows of 8 pixels per iteration
    movq                xm1, [dstq+dsq*0]
    movhps              xm1, [dstq+dsq*1]
    vpbroadcastq         m2, [dstq+dsq*2]
    vpbroadcastq         m3, [dstq+r6   ]
    mova                 m0, [maskq]
    mova                 m6, [maskq+tmpq]
    add               maskq, 8*4
    vpblendd             m1, m2, 0x30          ; gather 4 rows into one ymm
    vpblendd             m1, m3, 0xc0
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movq   [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    movq   [dstq+dsq*2], xm1
    movhps [dstq+r6   ], xm1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w8
    RET
ALIGN function_align
.w16:
    ; 2 rows of 16 pixels per iteration
    mova                 m0, [maskq]
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    mova                 m6, [maskq+tmpq]
    add               maskq, 16*2
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova   [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16
    RET
ALIGN function_align
.w32:
    ; 1 row of 32 pixels per iteration
    mova                 m0, [maskq]
    mova                 m1, [dstq]
    mova                 m6, [maskq+tmpq]
    add               maskq, 32
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    ; (continuation of blend_8bpc .w32 loop)
    dec                  hd
    jg .w32
    RET

; Vertical-edge OBMC blend: one constant mask column per width, taken from
; the interleaved obmc_masks table (see rodata: 64-x interleaved with x),
; applied to every row. pmaddubsw computes dst*(64-m) + tmp*m per pair.
cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_avx2_table
    lea                  r5, [blend_v_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r5+wq*4]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r5
    add               maskq, obmc_masks-blend_v_avx2_table ; maskq -> obmc_masks
    jmp                  wq
.w2:
    vpbroadcastd        xm2, [maskq+2*2]      ; w=2 entry of obmc_masks
.w2_s0_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrw              xm0, [dstq+dsq*1], 1
    movd                xm1, [tmpq]
    add                tmpq, 2*2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5              ; (+512) >> 10
    packuswb            xm0, xm0
    pextrw [dstq+dsq*0], xm0, 0
    pextrw [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w2_s0_loop
    RET
ALIGN function_align
.w4:
    vpbroadcastq        xm2, [maskq+4*2]
.w4_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    movq                xm1, [tmpq]
    add                tmpq, 4*2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    movd   [dstq+dsq*0], xm0
    pextrd [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    mova                xm3, [maskq+8*2]
.w8_loop:
    movq                xm0, [dstq+dsq*0]
    vpbroadcastq        xm1, [dstq+dsq*1]
    mova                xm2, [tmpq]
    add                tmpq, 8*2
    punpcklbw           xm0, xm2
    punpckhbw           xm1, xm2
    pmaddubsw           xm0, xm3
    pmaddubsw           xm1, xm3
    pmulhrsw            xm0, xm5
    pmulhrsw            xm1, xm5
    packuswb            xm0, xm1
    movq   [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    ; two interleaved mask registers cover the 16 (64-m, m) pairs
    vbroadcasti128       m3, [maskq+16*2]
    vbroadcasti128       m4, [maskq+16*3]
.w16_loop:
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    mova                 m2, [tmpq]
    add                tmpq, 16*2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    ; (continuation of blend_v_8bpc .w16_loop: pack and store two rows)
    packuswb             m0, m1
    mova   [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    ; fixed mask pairs for all 32 columns, split across two ymm regs
    mova                xm3, [maskq+16*4]
    vinserti128          m3, [maskq+16*6], 1
    mova                xm4, [maskq+16*5]
    vinserti128          m4, [maskq+16*7], 1
.w32_loop:
    mova                 m1, [dstq]
    mova                 m2, [tmpq]
    add                tmpq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .w32_loop
    RET

; Horizontal-edge OBMC blend: one mask value per row, broadcast across the
; row. Only the top 3/4 of the block is blended (hd = h*3/4); hq counts up
; from -h*3/4 to 0 and also indexes the obmc_masks row entries.
cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea                  r5, [blend_h_avx2_table]
    mov                 r6d, wd               ; keep full width for w32/w64/w128
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, dword [r5+wq*4]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r5
    lea               maskq, [base+obmc_masks+hq*2]
    lea                  hd, [hq*3]
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]     ; point past the rows we'll use
    neg                  hq                   ; negative row counter
    jmp                  wq
.w2:
    movd                xm0, [dstq+dsq*0]
    pinsrw              xm0, [dstq+dsq*1], 1
    movd                xm2, [maskq+hq*2]     ; (64-m, m) pair for this row
    movd                xm1, [tmpq]
    add                tmpq, 2*2
    punpcklwd           xm2, xm2              ; replicate pair per pixel
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    pextrw [dstq+dsq*0], xm0, 0
    pextrw [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
ALIGN function_align
.w4:
    mova                xm3, [blend_shuf]     ; broadcasts row mask pairs
.w4_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    movd                xm2, [maskq+hq*2]
    movq                xm1, [tmpq]
    add                tmpq, 4*2
    pshufb              xm2, xm3
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    movd   [dstq+dsq*0], xm0
    pextrd [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x03          ; select per-row mask pattern
.w8_loop:
    ; (continuation of blend_h_8bpc .w8_loop: two 8-pixel rows per iteration)
    vpbroadcastq         m1, [dstq+dsq*0]
    movq                xm0, [dstq+dsq*1]
    vpblendd             m0, m1, 0x30
    vpbroadcastd         m3, [maskq+hq*2]
    movq                xm1, [tmpq+8*1]
    vinserti128          m1, [tmpq+8*0], 1
    add                tmpq, 8*2
    pshufb               m3, m4
    punpcklbw            m0, m1
    pmaddubsw            m0, m3
    pmulhrsw             m0, m5
    vextracti128        xm1, m0, 1
    packuswb            xm0, xm1
    movhps [dstq+dsq*0], xm0
    movq   [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x0c
.w16_loop:
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    vpbroadcastd         m3, [maskq+hq*2]
    mova                 m2, [tmpq]
    add                tmpq, 16*2
    pshufb               m3, m4
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova   [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w16_loop
    RET
ALIGN function_align
.w32: ; w32/w64/w128
    sub                 dsq, r6               ; stride minus width: row rewind
.w32_loop0:
    vpbroadcastw         m3, [maskq+hq*2]     ; one (64-m, m) pair per row
    mov                  wd, r6d
.w32_loop:
    mova                 m1, [dstq]
    mova                 m2, [tmpq]
    add                tmpq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, 32
    sub                  wd, 32
    jg .w32_loop
    add                dstq, dsq
    inc                  hq
    jl .w32_loop0
    RET

; Edge emulation: copy a bw x bh block from a (iw x ih) reference with the
; source position clamped to the picture, then replicate the border pixels
; into the left/right/top/bottom extension regions.
cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                                  bottomext, rightext
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor                r12d, r12d             ; r12 = 0, reused by all clamps
    lea                 r10, [ihq-1]
    cmp                  yq, ihq
    cmovs               r10, yq
    test                 yq, yq
    cmovs               r10, r12
    imul                r10, sstrideq
    add                srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea                 r10, [iwq-1]
    cmp                  xq, iwq
    cmovs               r10, xq
    test                 xq, xq
    cmovs               r10, r12
    add                srcq, r10

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea          bottomextq, [yq+bhq]
    sub          bottomextq, ihq
    lea                  r3, [bhq-1]
    cmovs        bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, r12
    cmp          bottomextq, bhq
    cmovns       bottomextq, r3
    cmp             topextq, bhq
    cmovg           topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea           rightextq, [xq+bwq]
    sub           rightextq, iwq
    lea                  r2, [bwq-1]
    cmovs         rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg            leftextq
    cmovs          leftextq, r12
    cmp           rightextq, bwq
    cmovns        rightextq, r2
    cmp            leftextq, bwq
    cmovns         leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea                  r3, [bottomextq+topextq]
    sub            centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov                  r2, topextq
    imul                 r2, dstrideq
    add                dstq, r2
    mov                 r9m, dstq             ; remember first center row for .top

    ; center_w = bw - left_ext - right_ext
    mov            centerwq, bwq
    lea                  r3, [rightextq+leftextq]
    sub            centerwq, r3

; Copy the center rows, with compile-time variants for whether left and/or
; right border replication is needed (avoids per-row branches).
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension: replicate the leftmost source pixel
    xor                  r3, r3
    vpbroadcastb         m0, [srcq]
.left_loop_%3:
    mova          [dstq+r3], m0
    add                  r3, 32
    cmp                  r3, leftextq
    jl .left_loop_%3

    ; body
    lea                 r12, [dstq+leftextq]
%endif
    xor                  r3, r3
.body_loop_%3:
    movu                 m0, [srcq+r3]
%if %1
    movu           [r12+r3], m0
%else
    movu          [dstq+r3], m0
%endif
    add                  r3, 32
    cmp                  r3, centerwq
    jl .body_loop_%3

%if %2
    ; right extension: replicate the rightmost source pixel
%if %1
    add                 r12, centerwq
%else
    lea                 r12, [dstq+centerwq]
%endif
    xor                  r3, r3
    vpbroadcastb         m0, [srcq+centerwq-1]
.right_loop_%3:
    movu           [r12+r3], m0
    add                  r3, 32
    cmp                  r3, rightextq
    jl .right_loop_%3

%endif
    add                dstq, dstrideq
    add                srcq, sstrideq
    dec            centerhq
    jg .v_loop_%3
%endmacro

    ; pick the variant matching which horizontal extensions are needed
    test           leftextq, leftextq
    jnz .need_left_ext
    test          rightextq, rightextq
    jnz .need_right_ext
    v_loop                0, 0, 0
    jmp .body_done

.need_left_ext:
    test          rightextq, rightextq
    jnz .need_left_right_ext
    v_loop                1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop                1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop                0, 1, 3

.body_done:
    ; bottom edge extension: replicate the last written row downwards
    test         bottomextq, bottomextq
    jz .top
    mov                srcq, dstq
    sub                srcq, dstrideq
    xor                  r1, r1
.bottom_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, bottomextq
.bottom_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .bottom_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension: replicate the first center row upwards
    test            topextq, topextq
    jz .end
    mov                srcq, r9m
    mov                dstq, dstm
    xor                  r1, r1
.top_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, topextq
.top_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .top_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl .top_x_loop

.end:
    RET

; Horizontal 8-tap resize: for each output x, gather 8 source pixels at
; mx>>14, apply resize_filter[(mx>>8)&63], round with pmulhrsw. Edge pixels
; are handled by a pshufb-based clamp (resize_shuf) instead of reading
; out of bounds.
cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
    sub          dword mx0m, 4<<14            ; pre-bias mx by -4 pixels (14.2 fixed point)
    sub        dword src_wm, 8
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm

    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                  r7, $$
%define base r7-$$

    vpbroadcastd        xm3, [base+pw_m256]
    vpbroadcastd         m7, [base+pd_63]
    vbroadcasti128      m15, [base+pb_8x0_8x8]
    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    pslld                m5, 3                      ; dx*8
    pslld                m6, 14
    paddd                m8, m2                     ; mx+[0..7]*dx
    pxor                 m2, m2

    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8

.loop_y:
    xor                  xd, xd
    mova                 m4, m8 ; per-line working version of mx

.loop_x:
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8  ; filter offset (unmasked)
    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0 ; pshufb offset
    psrad                m0, 14     ; clipped src_x offset
    psrad                m1, 14     ; pshufb edge_emu offset
    pand                 m9, m7     ; filter offset (masked)

    ; load source pixels - this ugly code is vpgatherdq emulation since
    ; directly using vpgatherdq on Haswell is quite a bit slower :(
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vextracti128        xm0, m0, 1
    movq               xm12, [srcq+r8]
    movq               xm13, [srcq+r10]
    movhps             xm12, [srcq+r9]
    movhps             xm13, [srcq+r11]
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vinserti128         m12, [srcq+r8], 1
    vinserti128         m13, [srcq+r10], 1
    vpbroadcastq        m10, [srcq+r9]
    vpbroadcastq        m11, [srcq+r11]
    vpblendd            m12, m10, 11000000b
    vpblendd            m13, m11, 11000000b

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    vptest               m1, m1
    jz .filter

    ; gather per-lane clamp shuffles from resize_shuf (offset by the
    ; out-of-range amount) and apply them to the loaded pixels
    movq                 r9, xm1
    pextrq              r11, xm1, 1
    movsxd               r8, r9d
    sar                  r9, 32
    movsxd              r10, r11d
    sar                 r11, 32
    vextracti128        xm1, m1, 1
    movq               xm14, [base+resize_shuf+4+r8]
    movq                xm0, [base+resize_shuf+4+r10]
    movhps             xm14, [base+resize_shuf+4+r9]
    movhps              xm0, [base+resize_shuf+4+r11]
    movq                 r9, xm1
    pextrq              r11, xm1, 1
    movsxd               r8, r9d
    sar                  r9, 32
    movsxd              r10, r11d
    sar                 r11, 32
    vinserti128         m14, [base+resize_shuf+4+r8], 1
    vinserti128          m0, [base+resize_shuf+4+r10], 1
    vpbroadcastq        m10, [base+resize_shuf+4+r9]
    vpbroadcastq        m11, [base+resize_shuf+4+r11]
    vpblendd            m14, m10, 11000000b
    vpblendd             m0, m11, 11000000b

    paddb               m14, m15               ; lane-local byte offsets (0 / 8)
    paddb                m0, m15
    pshufb              m12, m14
    pshufb              m13, m0

.filter:
    ; gather the 8-tap filters selected by (mx>>8)&63 for each lane
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vextracti128        xm9, m9, 1
    movq               xm10, [base+resize_filter+r8*8]
    movq               xm11, [base+resize_filter+r10*8]
    movhps             xm10, [base+resize_filter+r9*8]
    movhps             xm11, [base+resize_filter+r11*8]
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vinserti128         m10, [base+resize_filter+r8*8], 1
    vinserti128         m11, [base+resize_filter+r10*8], 1
    vpbroadcastq        m14, [base+resize_filter+r9*8]
    vpbroadcastq         m1, [base+resize_filter+r11*8]
    vpblendd            m10, m14, 11000000b
    vpblendd            m11, m1, 11000000b

    pmaddubsw           m12, m10
    pmaddubsw           m13, m11
    phaddw              m12, m13
    vextracti128       xm13, m12, 1
    phaddsw            xm12, xm13
    pmulhrsw           xm12, xm3 ; x=(x+64)>>7
    packuswb           xm12, xm12
    movq          [dstq+xq], xm12

    paddd                m4, m5                ; mx += 8*dx
    add                  xd, 8
    cmp                  xd, dst_wd
    jl .loop_x

    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET

; w_mask with 4:2:0 mask output: blend via W_MASK and write a mask
; subsampled 2x2 (sums of 4 mask values, rescaled by (258-sign), >> 2).
cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea                  r7, [w_mask_420_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd             m9, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_420_sign+r6*4] ; 258 - sign
    add                  wq, r7
    W_MASK                0, 4, 0, 1
    mov               maskq, maskmp
    lea            stride3q, [strideq*3]
    jmp                  wq
; (w_mask_420_8bpc width-specific store paths; m4/m5 hold phaddw'd (64-m)
; sums from W_MASK, combined pairwise and rescaled to the 2x2-subsampled
; 4:2:0 mask: (258-sign - sum_of_4) >> 2.)
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    jg .w4_h16
.w4_end:
    ; combine vertically adjacent mask pairs within xm4
    vextracti128        xm0, m4, 1
    vpblendd            xm1, xm4, xm0, 0x05
    vpblendd            xm4, xm0, 0x0a
    pshufd              xm1, xm1, q2301
    psubw               xm4, xm8, xm4         ; (258-sign) - sum
    psubw               xm4, xm1
    psrlw               xm4, 2
    packuswb            xm4, xm4
    movq            [maskq], xm4
    RET
.w4_h16:
    W_MASK                0, 5, 2, 3
    lea                dstq, [dstq+strideq*4]
    phaddd               m4, m5
    vextracti128        xm1, m0, 1
    psubw                m4, m8, m4
    psrlw                m4, 2
    vpermd               m4, m9, m4           ; deinterleave dword lanes
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova            [maskq], xm4
    RET
.w8_loop:
    add               tmp1q, 2*32
    add               tmp2q, 2*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 8
.w8:
    vextracti128        xm2, m4, 1
    vextracti128        xm1, m0, 1
    psubw               xm4, xm8, xm4
    psubw               xm4, xm2
    psrlw               xm4, 2
    packuswb            xm4, xm4
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    movq            [maskq], xm4
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w16:
    vpermq               m0, m0, q3120        ; undo packuswb lane interleave
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    W_MASK                0, 5, 2, 3
    punpckhqdq           m1, m4, m5
    punpcklqdq           m4, m5
    psubw                m1, m8, m1
    psubw                m1, m4
    psrlw                m1, 2
    vpermq               m0, m0, q3120
    packuswb             m1, m1
    vpermd               m1, m9, m1
    mova   [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    mova            [maskq], xm1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    W_MASK                0, 5, 2, 3
    psubw                m4, m8, m4
    psubw                m4, m5
    psrlw                m4, 2
    vpermq               m0, m0, q3120
    packuswb             m4, m4
    vpermd               m4, m9, m4
    mova   [dstq+strideq*1], m0
    mova            [maskq], xm4
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop_even:
    ; even rows: stash (258-sign)-sum in m10/m11, combine on the odd row
    psubw               m10, m8, m4
    psubw               m11, m8, m5
    dec                  hd
.w64_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    test                 hd, 1
    jz .w64_loop_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova            [maskq], m4
    add               maskq, 32
    dec                  hd
    jg .w64_loop
    RET
.w128_loop_even:
    psubw               m12, m8, m4
    psubw               m13, m8, m5
    dec                  hd
.w128_loop:
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    add               tmp1q, 8*32
    add               tmp2q, 8*32
    test                 hd, 1
    jz .w128_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova       [maskq+32*0], m4
    jmp .w128_odd
.w128_even:
    psubw               m10, m8, m4
    psubw               m11, m8, m5
.w128_odd:
    W_MASK                0, 4, -4, -3        ; negative offsets: ptrs already advanced
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    W_MASK                0, 5, -2, -1
    vpermq               m0, m0, q3120
    mova        [dstq+32*3], m0
    test                 hd, 1
    jz .w128_loop_even
    psubw                m4, m12, m4
    psubw                m5, m13, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova       [maskq+32*1], m4
    add               maskq, 64
    dec                  hd
    jg .w128_loop
    RET

; w_mask with 4:2:2 mask output: like 4:2:0 but the mask is only
; subsampled horizontally; pavgb against (128-sign) yields
; ((128-sign) - pair_sum + 1) >> 1 per output byte.
cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
    lea                  r7, [w_mask_422_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    pxor                 m9, m9
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd            m10, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_422_sign+r6*4] ; 128 - sign
    add                  wq, r7
    mov               maskq, maskmp
    W_MASK                0, 4, 0, 1
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    jg .w4_h16
.w4_end:
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    psubb               xm5, xm8, xm4
    pavgb               xm5, xm9              ; (x + 0 + 1) >> 1
    pshufd              xm5, xm5, q3120
    mova            [maskq], xm5
    RET
.w4_h16:
    W_MASK                0, 5, 2, 3
    lea                dstq, [dstq+strideq*4]
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermd               m5, m10, m5
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova            [maskq], m5
    RET
; (w_mask_422_8bpc wider store paths: blend rows via W_MASK, pack the
; horizontal mask-pair sums and rescale with pavgb against (128-sign).)
.w8_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w8:
    vextracti128        xm5, m4, 1
    vextracti128        xm1, m0, 1
    packuswb            xm4, xm5
    psubb               xm5, xm8, xm4
    pavgb               xm5, xm9              ; ((128-sign) - sum + 1) >> 1
    pshufd              xm5, xm5, q3120
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    mova            [maskq], xm5
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w16:
    vpermq               m0, m0, q3120        ; undo packuswb lane interleave
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova   [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    mova            [maskq], m5
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova   [dstq+strideq*1], m0
    mova            [maskq], m5
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
    add               maskq, 32
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*1], m0
    mova            [maskq], m5
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    add               tmp1q, 32*8
    add               tmp2q, 32*8
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
    add               maskq, 32*2
.w128:
    vpermq               m0, m0, q3120
    ; (continuation of w_mask_422_8bpc .w128: two 64-pixel halves per row)
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*1], m0
    mova       [maskq+32*0], m5
    W_MASK                0, 4, 4, 5
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    W_MASK                0, 5, 6, 7
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*3], m0
    mova       [maskq+32*1], m5
    dec                  hd
    jg .w128_loop
    RET

; w_mask with full-resolution (4:4:4) mask output: W_MASK's 5th argument
; makes it emit the per-pixel mask bytes directly into m4 (pb_64 - (64-m)),
; so the store paths just write m4 alongside the blended pixels.
cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx2_table
    lea                  r7, [w_mask_444_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m5, [base+pb_64]
    vpbroadcastd         m7, [base+pw_2048]
    add                  wq, r7
    W_MASK                0, 4, 0, 1, 1
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    mova       [maskq+32*0], m4
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    je .w4_end
    W_MASK                0, 4, 2, 3, 1       ; h == 16: second batch of 8 rows
    lea                dstq, [dstq+strideq*4]
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova       [maskq+32*1], m4
.w4_end:
    RET
.w8_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    lea                dstq, [dstq+strideq*4]
    ; (w_mask_444_8bpc store paths, continued: mask rows are written
    ; full-resolution straight from m4)
    add               maskq, 32
.w8:
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    mova            [maskq], m4
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w16:
    vpermq               m0, m0, q3120        ; undo packuswb lane interleave
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova            [maskq], m4
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova             [dstq], m0
    mova            [maskq], m4
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32*2
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    mova       [maskq+32*0], m4
    W_MASK                0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    mova       [maskq+32*1], m4
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    add               tmp1q, 32*8
    add               tmp2q, 32*8
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32*4
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    mova       [maskq+32*0], m4
    W_MASK                0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    mova       [maskq+32*1], m4
    W_MASK                0, 4, 4, 5, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    mova       [maskq+32*2], m4
    W_MASK                0, 4, 6, 7, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*3], m0
    mova       [maskq+32*3], m4
    dec                  hd
    jg .w128_loop
    RET

%endif ; ARCH_X86_64