; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Emits one signed byte pair (w - 128, 127 - w) for every weight w passed in.
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro

; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
;
; Each weight occupies two bytes. The code in this file addresses the weights
; for a block dimension n at smooth_weights+n*2 (e.g. +4*2, +8*2, +16*2), so
; the run for dimension n starts at entry n of the list below; the two leading
; 0 entries appear to exist only to make that indexing work.
smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4

; Byte ramps. pb_32to1 only stores 32..17 itself and runs straight on into
; pb_16to1, so the two labels must stay adjacent and in this order.
pb_1to32:     db  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
              db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
pb_32to1:     db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
pb_16to1:     db 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
              db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
; The last row of z_filter_k doubles as pw_8 (z_filter_k+32 = bytes 8, 0, 8, 0).
z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
              db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
              db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
; The trailing run of 15s in z_filter_s doubles as pb_15 (z_filter_s+32).
const \
z_filter_s,   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
              db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
pb_128:       times 4 db 128 ; those are just placed here for alignment.
pb_36_m4:     times 2 db 36, -4
z3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
z_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
z2_upsample:  db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
z1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
z2_shuf_h2:   db  3,  2,  7,  6, 11, 10, 15, 14,  2,  1,  6,  5, 10,  9, 14, 13
z2_shuf_h4:   db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
z3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
z_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
z_base_inc:   dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
              dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
z2_base_inc:  dw  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64,  8*64
              dw  9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64
z2_ymul:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
; Several pb_N/pw_1 aliases defined further down point into this table, so its
; byte values and order must not change.
z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
              db 32, 32, 32, 32, 12, 12, 12, 12,  1,  0,  1,  0,  5, -1, -1, -1 ; 0, 4, 1, 5
; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
              db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
filter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
; filter_shuf3 stores only 12 bytes; the commented-out tail is supplied by the
; pb_127_m127 bytes that follow. NOTE(review): presumably this is consumed by
; pshufb, where 127 selects byte 15 and -127 zeroes the lane - confirm at the
; use site. Keep the two labels adjacent.
filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
pb_127_m127:  times 2 db 127, -127
ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
              db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
; ipred_h_shuf stores only 28 bytes; the commented-out tail (0, 0, 0, 0) is
; supplied by pw_64 below (bytes 64, 0, 64, 0: as pshufb indices only the low
; four bits and bit 7 matter, so each selects byte 0). Keep them adjacent.
ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
pw_64:        times 2 dw 64

cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
                             times 9 db 7, -1
cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                        ; w=8, w_pad=1 as well as second half of previous one
cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
                        times 5 db 6, 7
                        ; w=16,w_pad=2
                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                        times 8 db 14, 15
                        ; w=16,w_pad=3
                        db 0, 1, 2, 3, 4, 5
                        times 13 db 6, 7
pb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

; Small constants that are not stored separately: each name below aliases four
; bytes that already exist inside one of the tables above.
%define pb_0to15 cfl_ac_w16_pad_shuffle
%define pb_1  (ipred_h_shuf+12)
%define pb_2  (ipred_h_shuf+20)
%define pb_3  (ipred_h_shuf+ 4)
%define pb_4  (ipred_h_shuf+24)
%define pb_5  (ipred_h_shuf+ 8)
%define pb_7  (ipred_h_shuf+ 0)
%define pb_8  (z_upsample2 +12)
%define pb_12 (z2_y_shuf_h4+20)
%define pb_14 (z2_y_shuf_h4+ 4)
%define pb_15 (z_filter_s  +32)
%define pb_27 (z2_y_shuf_h4+ 8)
%define pb_31 (z2_y_shuf_h4+12)
%define pb_32 (z2_y_shuf_h4+16)
%define pb_90 (z2_y_shuf_h4+ 0)
%define pw_1  (z2_y_shuf_h4+24)
%define pw_8  (z_filter_k  +32)

pw_62:    times 2 dw 62
pw_128:   times 2 dw 128
pw_255:   times 2 dw 255
pw_512:   times 2 dw 512

; JMP_TABLE name, cpu_suffix, label...
; Emits a table of 32-bit offsets, one per listed local label of the function
; <private_prefix>_<name>_8bpc_<cpu_suffix>, and defines <name>_<cpu_suffix>_table.
; Both the table symbol and the offsets are expressed relative to
; (%%table - 2*4): the functions in this file index the table with
; tzcnt(size)*4, which is 2*4 for size 4, so index 2 lands on the first entry
; and adding the loaded offset to the table symbol yields the label address.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

; The "splat" tables are sub-tables of ipred_dc/ipred_cfl that start at the
; .sN entries. Those entries are written as sN-10*4 (resp. sN-8*4) so that the
; stored offsets are relative to the sub-table symbol defined here.
%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)

JMP_TABLE ipred_smooth,     avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,      avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_filter,     avx2, w4, w8, w16, w32
JMP_TABLE ipred_dc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                  s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
163JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 164JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 165JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 166JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 167JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 168JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ 169 s4-8*4, s8-8*4, s16-8*4, s32-8*4 170JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 171JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 172JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 173JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 174JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 175 176cextern dr_intra_derivative 177cextern filter_intra_taps 178 179SECTION .text 180 181INIT_YMM avx2 182cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h 183 lea r5, [ipred_dc_left_avx2_table] 184 tzcnt wd, wm 185 inc tlq 186 movu m0, [tlq] 187 movifnidn hd, hm 188 mov r6d, 0x8000 189 shrx r6d, r6d, wd 190 movd xm3, r6d 191 movsxd r6, [r5+wq*4] 192 pcmpeqd m2, m2 193 pmaddubsw m0, m2 194 add r6, r5 195 add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table 196 movsxd wq, [r5+wq*4] 197 add wq, r5 198 jmp r6 199 200cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 201 mov hd, hm ; zero upper half 202 tzcnt r6d, hd 203 sub tlq, hq 204 tzcnt wd, wm 205 movu m0, [tlq] 206 mov r5d, 0x8000 207 shrx r5d, r5d, r6d 208 movd xm3, r5d 209 lea r5, [ipred_dc_left_avx2_table] 210 movsxd r6, [r5+r6*4] 211 pcmpeqd m2, m2 212 pmaddubsw m0, m2 213 add r6, r5 214 add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table 215 movsxd wq, [r5+wq*4] 216 add wq, r5 217 jmp r6 218.h64: 219 movu m1, [tlq+32] ; unaligned when jumping here from dc_top 220 pmaddubsw m1, m2 221 paddw m0, m1 222.h32: 223 vextracti128 xm1, m0, 1 224 paddw xm0, xm1 225.h16: 226 punpckhqdq xm1, xm0, xm0 227 paddw xm0, xm1 228.h8: 229 psrlq xm1, xm0, 32 230 paddw xm0, xm1 231.h4: 232 pmaddwd xm0, xm2 233 pmulhrsw 
xm0, xm3 234 lea stride3q, [strideq*3] 235 vpbroadcastb m0, xm0 236 mova m1, m0 237 jmp wq 238 239cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 240 movifnidn hd, hm 241 movifnidn wd, wm 242 tzcnt r6d, hd 243 lea r5d, [wq+hq] 244 movd xm4, r5d 245 tzcnt r5d, r5d 246 movd xm5, r5d 247 lea r5, [ipred_dc_avx2_table] 248 tzcnt wd, wd 249 movsxd r6, [r5+r6*4] 250 movsxd wq, [r5+wq*4+5*4] 251 pcmpeqd m3, m3 252 psrlw xm4, 1 253 add r6, r5 254 add wq, r5 255 lea stride3q, [strideq*3] 256 jmp r6 257.h4: 258 movd xm0, [tlq-4] 259 pmaddubsw xm0, xm3 260 jmp wq 261.w4: 262 movd xm1, [tlq+1] 263 pmaddubsw xm1, xm3 264 psubw xm0, xm4 265 paddw xm0, xm1 266 pmaddwd xm0, xm3 267 cmp hd, 4 268 jg .w4_mul 269 psrlw xm0, 3 270 jmp .w4_end 271.w4_mul: 272 punpckhqdq xm1, xm0, xm0 273 lea r2d, [hq*2] 274 mov r6d, 0x55563334 275 paddw xm0, xm1 276 shrx r6d, r6d, r2d 277 psrlq xm1, xm0, 32 278 paddw xm0, xm1 279 movd xm1, r6d 280 psrlw xm0, 2 281 pmulhuw xm0, xm1 282.w4_end: 283 vpbroadcastb xm0, xm0 284.s4: 285 movd [dstq+strideq*0], xm0 286 movd [dstq+strideq*1], xm0 287 movd [dstq+strideq*2], xm0 288 movd [dstq+stride3q ], xm0 289 lea dstq, [dstq+strideq*4] 290 sub hd, 4 291 jg .s4 292 RET 293ALIGN function_align 294.h8: 295 movq xm0, [tlq-8] 296 pmaddubsw xm0, xm3 297 jmp wq 298.w8: 299 movq xm1, [tlq+1] 300 vextracti128 xm2, m0, 1 301 pmaddubsw xm1, xm3 302 psubw xm0, xm4 303 paddw xm0, xm2 304 punpckhqdq xm2, xm0, xm0 305 paddw xm0, xm2 306 paddw xm0, xm1 307 psrlq xm1, xm0, 32 308 paddw xm0, xm1 309 pmaddwd xm0, xm3 310 psrlw xm0, xm5 311 cmp hd, 8 312 je .w8_end 313 mov r6d, 0x5556 314 mov r2d, 0x3334 315 cmp hd, 32 316 cmove r6d, r2d 317 movd xm1, r6d 318 pmulhuw xm0, xm1 319.w8_end: 320 vpbroadcastb xm0, xm0 321.s8: 322 movq [dstq+strideq*0], xm0 323 movq [dstq+strideq*1], xm0 324 movq [dstq+strideq*2], xm0 325 movq [dstq+stride3q ], xm0 326 lea dstq, [dstq+strideq*4] 327 sub hd, 4 328 jg .s8 329 RET 330ALIGN function_align 331.h16: 332 mova xm0, [tlq-16] 333 
pmaddubsw xm0, xm3 334 jmp wq 335.w16: 336 movu xm1, [tlq+1] 337 vextracti128 xm2, m0, 1 338 pmaddubsw xm1, xm3 339 psubw xm0, xm4 340 paddw xm0, xm2 341 paddw xm0, xm1 342 punpckhqdq xm1, xm0, xm0 343 paddw xm0, xm1 344 psrlq xm1, xm0, 32 345 paddw xm0, xm1 346 pmaddwd xm0, xm3 347 psrlw xm0, xm5 348 cmp hd, 16 349 je .w16_end 350 mov r6d, 0x5556 351 mov r2d, 0x3334 352 test hb, 8|32 353 cmovz r6d, r2d 354 movd xm1, r6d 355 pmulhuw xm0, xm1 356.w16_end: 357 vpbroadcastb xm0, xm0 358.s16: 359 mova [dstq+strideq*0], xm0 360 mova [dstq+strideq*1], xm0 361 mova [dstq+strideq*2], xm0 362 mova [dstq+stride3q ], xm0 363 lea dstq, [dstq+strideq*4] 364 sub hd, 4 365 jg .s16 366 RET 367ALIGN function_align 368.h32: 369 mova m0, [tlq-32] 370 pmaddubsw m0, m3 371 jmp wq 372.w32: 373 movu m1, [tlq+1] 374 pmaddubsw m1, m3 375 paddw m0, m1 376 vextracti128 xm1, m0, 1 377 psubw xm0, xm4 378 paddw xm0, xm1 379 punpckhqdq xm1, xm0, xm0 380 paddw xm0, xm1 381 psrlq xm1, xm0, 32 382 paddw xm0, xm1 383 pmaddwd xm0, xm3 384 psrlw xm0, xm5 385 cmp hd, 32 386 je .w32_end 387 lea r2d, [hq*2] 388 mov r6d, 0x33345556 389 shrx r6d, r6d, r2d 390 movd xm1, r6d 391 pmulhuw xm0, xm1 392.w32_end: 393 vpbroadcastb m0, xm0 394.s32: 395 mova [dstq+strideq*0], m0 396 mova [dstq+strideq*1], m0 397 mova [dstq+strideq*2], m0 398 mova [dstq+stride3q ], m0 399 lea dstq, [dstq+strideq*4] 400 sub hd, 4 401 jg .s32 402 RET 403ALIGN function_align 404.h64: 405 mova m0, [tlq-64] 406 mova m1, [tlq-32] 407 pmaddubsw m0, m3 408 pmaddubsw m1, m3 409 paddw m0, m1 410 jmp wq 411.w64: 412 movu m1, [tlq+ 1] 413 movu m2, [tlq+33] 414 pmaddubsw m1, m3 415 pmaddubsw m2, m3 416 paddw m0, m1 417 paddw m0, m2 418 vextracti128 xm1, m0, 1 419 psubw xm0, xm4 420 paddw xm0, xm1 421 punpckhqdq xm1, xm0, xm0 422 paddw xm0, xm1 423 psrlq xm1, xm0, 32 424 paddw xm0, xm1 425 pmaddwd xm0, xm3 426 psrlw xm0, xm5 427 cmp hd, 64 428 je .w64_end 429 mov r6d, 0x33345556 430 shrx r6d, r6d, hd 431 movd xm1, r6d 432 pmulhuw xm0, xm1 
433.w64_end: 434 vpbroadcastb m0, xm0 435 mova m1, m0 436.s64: 437 mova [dstq+strideq*0+32*0], m0 438 mova [dstq+strideq*0+32*1], m1 439 mova [dstq+strideq*1+32*0], m0 440 mova [dstq+strideq*1+32*1], m1 441 mova [dstq+strideq*2+32*0], m0 442 mova [dstq+strideq*2+32*1], m1 443 mova [dstq+stride3q +32*0], m0 444 mova [dstq+stride3q +32*1], m1 445 lea dstq, [dstq+strideq*4] 446 sub hd, 4 447 jg .s64 448 RET 449 450cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 451 lea r5, [ipred_dc_splat_avx2_table] 452 tzcnt wd, wm 453 movifnidn hd, hm 454 movsxd wq, [r5+wq*4] 455 vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] 456 mova m1, m0 457 add wq, r5 458 lea stride3q, [strideq*3] 459 jmp wq 460 461cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 462 lea r5, [ipred_dc_splat_avx2_table] 463 tzcnt wd, wm 464 movu m0, [tlq+ 1] 465 movu m1, [tlq+33] 466 movifnidn hd, hm 467 movsxd wq, [r5+wq*4] 468 add wq, r5 469 lea stride3q, [strideq*3] 470 jmp wq 471 472%macro IPRED_H 2 ; w, store_type 473 vpbroadcastb m0, [tlq-1] 474 vpbroadcastb m1, [tlq-2] 475 vpbroadcastb m2, [tlq-3] 476 sub tlq, 4 477 vpbroadcastb m3, [tlq+0] 478 mov%2 [dstq+strideq*0], m0 479 mov%2 [dstq+strideq*1], m1 480 mov%2 [dstq+strideq*2], m2 481 mov%2 [dstq+stride3q ], m3 482 lea dstq, [dstq+strideq*4] 483 sub hd, 4 484 jg .w%1 485 RET 486ALIGN function_align 487%endmacro 488 489INIT_XMM avx2 490cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 491 lea r5, [ipred_h_avx2_table] 492 tzcnt wd, wm 493 movifnidn hd, hm 494 movsxd wq, [r5+wq*4] 495 add wq, r5 496 lea stride3q, [strideq*3] 497 jmp wq 498.w4: 499 IPRED_H 4, d 500.w8: 501 IPRED_H 8, q 502.w16: 503 IPRED_H 16, a 504INIT_YMM avx2 505.w32: 506 IPRED_H 32, a 507.w64: 508 vpbroadcastb m0, [tlq-1] 509 vpbroadcastb m1, [tlq-2] 510 vpbroadcastb m2, [tlq-3] 511 sub tlq, 4 512 vpbroadcastb m3, [tlq+0] 513 mova [dstq+strideq*0+32*0], m0 514 mova [dstq+strideq*0+32*1], m0 515 mova [dstq+strideq*1+32*0], m1 516 mova 
[dstq+strideq*1+32*1], m1 517 mova [dstq+strideq*2+32*0], m2 518 mova [dstq+strideq*2+32*1], m2 519 mova [dstq+stride3q +32*0], m3 520 mova [dstq+stride3q +32*1], m3 521 lea dstq, [dstq+strideq*4] 522 sub hd, 4 523 jg .w64 524 RET 525 526%macro PAETH 2 ; top, ldiff 527 pavgb m1, m%1, m3 ; Calculating tldiff normally requires 528 pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it 529 pand m0, m4 ; in 8-bit with some tricks which avoids 530 psubusb m2, m5, m1 ; having to unpack everything to 16-bit. 531 psubb m1, m0 532 psubusb m1, m5 533 por m1, m2 534 paddusb m1, m1 535 por m1, m0 ; min(tldiff, 255) 536 psubusb m2, m5, m3 537 psubusb m0, m3, m5 538 por m2, m0 ; tdiff 539 pminub m2, m%2 540 pcmpeqb m0, m%2, m2 ; ldiff <= tdiff 541 vpblendvb m0, m%1, m3, m0 542 pminub m1, m2 543 pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff 544 vpblendvb m0, m5, m0, m1 545%endmacro 546 547cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h 548%define base r5-ipred_paeth_avx2_table 549 lea r5, [ipred_paeth_avx2_table] 550 tzcnt wd, wm 551 vpbroadcastb m5, [tlq] ; topleft 552 movifnidn hd, hm 553 movsxd wq, [r5+wq*4] 554 vpbroadcastd m4, [base+pb_1] 555 add wq, r5 556 jmp wq 557.w4: 558 vpbroadcastd m6, [tlq+1] ; top 559 mova m8, [base+ipred_h_shuf] 560 lea r3, [strideq*3] 561 psubusb m7, m5, m6 562 psubusb m0, m6, m5 563 por m7, m0 ; ldiff 564.w4_loop: 565 sub tlq, 8 566 vpbroadcastq m3, [tlq] 567 pshufb m3, m8 ; left 568 PAETH 6, 7 569 vextracti128 xm1, m0, 1 570 movd [dstq+strideq*0], xm0 571 movd [dstq+strideq*1], xm1 572 pextrd [dstq+strideq*2], xm0, 2 573 pextrd [dstq+r3 ], xm1, 2 574 cmp hd, 4 575 je .ret 576 lea dstq, [dstq+strideq*4] 577 pextrd [dstq+strideq*0], xm0, 1 578 pextrd [dstq+strideq*1], xm1, 1 579 pextrd [dstq+strideq*2], xm0, 3 580 pextrd [dstq+r3 ], xm1, 3 581 lea dstq, [dstq+strideq*4] 582 sub hd, 8 583 jg .w4_loop 584.ret: 585 RET 586ALIGN function_align 587.w8: 588 vpbroadcastq m6, [tlq+1] 589 mova m8, [base+ipred_h_shuf] 590 lea r3, 
[strideq*3] 591 psubusb m7, m5, m6 592 psubusb m0, m6, m5 593 por m7, m0 594.w8_loop: 595 sub tlq, 4 596 vpbroadcastd m3, [tlq] 597 pshufb m3, m8 598 PAETH 6, 7 599 vextracti128 xm1, m0, 1 600 movq [dstq+strideq*0], xm0 601 movq [dstq+strideq*1], xm1 602 movhps [dstq+strideq*2], xm0 603 movhps [dstq+r3 ], xm1 604 lea dstq, [dstq+strideq*4] 605 sub hd, 4 606 jg .w8_loop 607 RET 608ALIGN function_align 609.w16: 610 vbroadcasti128 m6, [tlq+1] 611 mova xm8, xm4 ; lower half = 1, upper half = 0 612 psubusb m7, m5, m6 613 psubusb m0, m6, m5 614 por m7, m0 615.w16_loop: 616 sub tlq, 2 617 vpbroadcastd m3, [tlq] 618 pshufb m3, m8 619 PAETH 6, 7 620 mova [dstq+strideq*0], xm0 621 vextracti128 [dstq+strideq*1], m0, 1 622 lea dstq, [dstq+strideq*2] 623 sub hd, 2 624 jg .w16_loop 625 RET 626ALIGN function_align 627.w32: 628 movu m6, [tlq+1] 629 psubusb m7, m5, m6 630 psubusb m0, m6, m5 631 por m7, m0 632.w32_loop: 633 dec tlq 634 vpbroadcastb m3, [tlq] 635 PAETH 6, 7 636 mova [dstq], m0 637 add dstq, strideq 638 dec hd 639 jg .w32_loop 640 RET 641ALIGN function_align 642.w64: 643 movu m6, [tlq+ 1] 644 movu m7, [tlq+33] 645%if WIN64 646 movaps r4m, xmm9 647%endif 648 psubusb m8, m5, m6 649 psubusb m0, m6, m5 650 psubusb m9, m5, m7 651 psubusb m1, m7, m5 652 por m8, m0 653 por m9, m1 654.w64_loop: 655 dec tlq 656 vpbroadcastb m3, [tlq] 657 PAETH 6, 8 658 mova [dstq+32*0], m0 659 PAETH 7, 9 660 mova [dstq+32*1], m0 661 add dstq, strideq 662 dec hd 663 jg .w64_loop 664%if WIN64 665 movaps xmm9, r4m 666%endif 667 RET 668 669%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] 670 ; w * a = (w - 128) * a + 128 * a 671 ; (256 - w) * b = (127 - w) * b + 129 * b 672 pmaddubsw m0, m%3, m%1 673 pmaddubsw m1, m%4, m%2 674 paddw m0, m%5 675 paddw m1, m%6 676 psrlw m0, 8 677 psrlw m1, 8 678 packuswb m0, m1 679%endmacro 680 681cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights 682%define base r6-ipred_smooth_v_avx2_table 683 lea r6, [ipred_smooth_v_avx2_table] 684 tzcnt wd, wm 
685 mov hd, hm 686 movsxd wq, [r6+wq*4] 687 vpbroadcastd m0, [base+pb_127_m127] 688 vpbroadcastd m1, [base+pw_128] 689 lea weightsq, [base+smooth_weights+hq*4] 690 neg hq 691 vpbroadcastb m5, [tlq+hq] ; bottom 692 add wq, r6 693 jmp wq 694.w4: 695 vpbroadcastd m2, [tlq+1] 696 punpcklbw m2, m5 ; top, bottom 697 mova m5, [base+ipred_v_shuf] 698 lea r3, [strideq*3] 699 punpckldq m4, m5, m5 700 punpckhdq m5, m5 701 pmaddubsw m3, m2, m0 702 paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok 703 paddw m3, m1 ; 128 * top + 129 * bottom + 128 704.w4_loop: 705 vbroadcasti128 m1, [weightsq+hq*2] 706 pshufb m0, m1, m4 707 pshufb m1, m5 708 SMOOTH 0, 1, 2, 2, 3, 3 709 vextracti128 xm1, m0, 1 710 movd [dstq+strideq*0], xm0 711 movd [dstq+strideq*1], xm1 712 pextrd [dstq+strideq*2], xm0, 1 713 pextrd [dstq+r3 ], xm1, 1 714 cmp hd, -4 715 je .ret 716 lea dstq, [dstq+strideq*4] 717 pextrd [dstq+strideq*0], xm0, 2 718 pextrd [dstq+strideq*1], xm1, 2 719 pextrd [dstq+strideq*2], xm0, 3 720 pextrd [dstq+r3 ], xm1, 3 721 lea dstq, [dstq+strideq*4] 722 add hq, 8 723 jl .w4_loop 724.ret: 725 RET 726ALIGN function_align 727.w8: 728 vpbroadcastq m2, [tlq+1] 729 punpcklbw m2, m5 730 mova m5, [base+ipred_v_shuf] 731 lea r3, [strideq*3] 732 pshufd m4, m5, q0000 733 pshufd m5, m5, q1111 734 pmaddubsw m3, m2, m0 735 paddw m1, m2 736 paddw m3, m1 737.w8_loop: 738 vpbroadcastq m1, [weightsq+hq*2] 739 pshufb m0, m1, m4 740 pshufb m1, m5 741 SMOOTH 0, 1, 2, 2, 3, 3 742 vextracti128 xm1, m0, 1 743 movq [dstq+strideq*0], xm0 744 movq [dstq+strideq*1], xm1 745 movhps [dstq+strideq*2], xm0 746 movhps [dstq+r3 ], xm1 747 lea dstq, [dstq+strideq*4] 748 add hq, 4 749 jl .w8_loop 750 RET 751ALIGN function_align 752.w16: 753 WIN64_SPILL_XMM 7 754 vbroadcasti128 m3, [tlq+1] 755 mova m6, [base+ipred_v_shuf] 756 punpcklbw m2, m3, m5 757 punpckhbw m3, m5 758 pmaddubsw m4, m2, m0 759 pmaddubsw m5, m3, m0 760 paddw m0, m1, m2 761 paddw m1, m3 762 paddw m4, m0 763 paddw m5, m1 764.w16_loop: 765 
vpbroadcastd m1, [weightsq+hq*2] 766 pshufb m1, m6 767 SMOOTH 1, 1, 2, 3, 4, 5 768 mova [dstq+strideq*0], xm0 769 vextracti128 [dstq+strideq*1], m0, 1 770 lea dstq, [dstq+strideq*2] 771 add hq, 2 772 jl .w16_loop 773 RET 774ALIGN function_align 775.w32: 776 WIN64_SPILL_XMM 6 777 movu m3, [tlq+1] 778 punpcklbw m2, m3, m5 779 punpckhbw m3, m5 780 pmaddubsw m4, m2, m0 781 pmaddubsw m5, m3, m0 782 paddw m0, m1, m2 783 paddw m1, m3 784 paddw m4, m0 785 paddw m5, m1 786.w32_loop: 787 vpbroadcastw m1, [weightsq+hq*2] 788 SMOOTH 1, 1, 2, 3, 4, 5 789 mova [dstq], m0 790 add dstq, strideq 791 inc hq 792 jl .w32_loop 793 RET 794ALIGN function_align 795.w64: 796 WIN64_SPILL_XMM 11 797 movu m4, [tlq+ 1] 798 movu m8, [tlq+33] 799 punpcklbw m3, m4, m5 800 punpckhbw m4, m5 801 punpcklbw m7, m8, m5 802 punpckhbw m8, m5 803 pmaddubsw m5, m3, m0 804 pmaddubsw m6, m4, m0 805 pmaddubsw m9, m7, m0 806 pmaddubsw m10, m8, m0 807 paddw m2, m1, m3 808 paddw m5, m2 809 paddw m2, m1, m4 810 paddw m6, m2 811 paddw m0, m1, m7 812 paddw m9, m0 813 paddw m1, m8 814 paddw m10, m1 815.w64_loop: 816 vpbroadcastw m2, [weightsq+hq*2] 817 SMOOTH 2, 2, 3, 4, 5, 6 818 mova [dstq+32*0], m0 819 SMOOTH 2, 2, 7, 8, 9, 10 820 mova [dstq+32*1], m0 821 add dstq, strideq 822 inc hq 823 jl .w64_loop 824 RET 825 826cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h 827%define base r5-ipred_smooth_h_avx2_table 828 lea r5, [ipred_smooth_h_avx2_table] 829 mov wd, wm 830 vpbroadcastb m3, [tlq+wq] ; right 831 tzcnt wd, wd 832 mov hd, hm 833 movsxd wq, [r5+wq*4] 834 vpbroadcastd m4, [base+pb_127_m127] 835 vpbroadcastd m5, [base+pw_128] 836 add wq, r5 837 jmp wq 838.w4: 839 WIN64_SPILL_XMM 8 840 vpbroadcastq m6, [base+smooth_weights+4*2] 841 mova m7, [base+ipred_h_shuf] 842 sub tlq, 8 843 sub tlq, hq 844 lea r3, [strideq*3] 845.w4_loop: 846 vpbroadcastq m2, [tlq+hq] 847 pshufb m2, m7 848 punpcklbw m1, m2, m3 ; left, right 849 punpckhbw m2, m3 850 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right 851 paddw m0, m1 ; 
128 * left + 129 * right 852 pmaddubsw m1, m6 853 paddw m1, m5 854 paddw m0, m1 855 pmaddubsw m1, m2, m4 856 paddw m1, m2 857 pmaddubsw m2, m6 858 paddw m2, m5 859 paddw m1, m2 860 psrlw m0, 8 861 psrlw m1, 8 862 packuswb m0, m1 863 vextracti128 xm1, m0, 1 864 movd [dstq+strideq*0], xm0 865 movd [dstq+strideq*1], xm1 866 pextrd [dstq+strideq*2], xm0, 2 867 pextrd [dstq+r3 ], xm1, 2 868 cmp hd, 4 869 je .ret 870 lea dstq, [dstq+strideq*4] 871 pextrd [dstq+strideq*0], xm0, 1 872 pextrd [dstq+strideq*1], xm1, 1 873 pextrd [dstq+strideq*2], xm0, 3 874 pextrd [dstq+r3 ], xm1, 3 875 lea dstq, [dstq+strideq*4] 876 sub hd, 8 877 jg .w4_loop 878.ret: 879 RET 880ALIGN function_align 881.w8: 882 WIN64_SPILL_XMM 8 883 vbroadcasti128 m6, [base+smooth_weights+8*2] 884 mova m7, [base+ipred_h_shuf] 885 sub tlq, 4 886 lea r3, [strideq*3] 887 sub tlq, hq 888.w8_loop: 889 vpbroadcastd m2, [tlq+hq] 890 pshufb m2, m7 891 punpcklbw m1, m2, m3 892 punpckhbw m2, m3 893 pmaddubsw m0, m1, m4 894 paddw m0, m1 895 pmaddubsw m1, m6 896 paddw m1, m5 897 paddw m0, m1 898 pmaddubsw m1, m2, m4 899 paddw m1, m2 900 pmaddubsw m2, m6 901 paddw m2, m5 902 paddw m1, m2 903 psrlw m0, 8 904 psrlw m1, 8 905 packuswb m0, m1 906 vextracti128 xm1, m0, 1 907 movq [dstq+strideq*0], xm0 908 movq [dstq+strideq*1], xm1 909 movhps [dstq+strideq*2], xm0 910 movhps [dstq+r3 ], xm1 911 lea dstq, [dstq+strideq*4] 912 sub hd, 4 913 jg .w8_loop 914 RET 915ALIGN function_align 916.w16: 917 ALLOC_STACK 32*4, 8 918 lea r3, [rsp+64*2-4] 919 call .prep ; only worthwhile for for w16 and above 920 sub tlq, 2 921 vpbroadcastd xm6, [base+pb_1] 922 mova xm7, [base+ipred_v_shuf+16] 923 vinserti128 m7, [base+ipred_v_shuf+ 0], 1 924 vbroadcasti128 m4, [base+smooth_weights+16*2] 925 vbroadcasti128 m5, [base+smooth_weights+16*3] 926.w16_loop: 927 vpbroadcastd m1, [tlq+hq] 928 vpbroadcastd m2, [r3+hq*2] 929 pshufb m1, m6 930 punpcklbw m1, m3 931 pshufb m2, m7 932 SMOOTH 4, 5, 1, 1, 2, 2 933 mova [dstq+strideq*0], xm0 934 vextracti128 
[dstq+strideq*1], m0, 1 935 lea dstq, [dstq+strideq*2] 936 sub hd, 2 937 jg .w16_loop 938 RET 939ALIGN function_align 940.w32: 941 ALLOC_STACK 32*4 942 lea r3, [rsp+64*2-2] 943 call .prep 944 dec tlq 945 mova xm4, [base+smooth_weights+16*4] 946 vinserti128 m4, [base+smooth_weights+16*6], 1 947 mova xm5, [base+smooth_weights+16*5] 948 vinserti128 m5, [base+smooth_weights+16*7], 1 949.w32_loop: 950 vpbroadcastb m1, [tlq+hq] 951 punpcklbw m1, m3 952 vpbroadcastw m2, [r3+hq*2] 953 SMOOTH 4, 5, 1, 1, 2, 2 954 mova [dstq], m0 955 add dstq, strideq 956 dec hd 957 jg .w32_loop 958 RET 959ALIGN function_align 960.w64: 961 ALLOC_STACK 32*4, 9 962 lea r3, [rsp+64*2-2] 963 call .prep 964 add r5, smooth_weights+16*15-ipred_smooth_h_avx2_table 965 dec tlq 966 mova xm5, [r5-16*7] 967 vinserti128 m5, [r5-16*5], 1 968 mova xm6, [r5-16*6] 969 vinserti128 m6, [r5-16*4], 1 970 mova xm7, [r5-16*3] 971 vinserti128 m7, [r5-16*1], 1 972 mova xm8, [r5-16*2] 973 vinserti128 m8, [r5-16*0], 1 974.w64_loop: 975 vpbroadcastb m2, [tlq+hq] 976 punpcklbw m2, m3 977 vpbroadcastw m4, [r3+hq*2] 978 SMOOTH 5, 6, 2, 2, 4, 4 979 mova [dstq+32*0], m0 980 SMOOTH 7, 8, 2, 2, 4, 4 981 mova [dstq+32*1], m0 982 add dstq, strideq 983 dec hd 984 jg .w64_loop 985 RET 986ALIGN function_align 987.prep: 988 vpermq m2, [tlq-32*1], q3120 989 punpckhbw m1, m2, m3 990 punpcklbw m2, m3 991 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right 992 paddw m1, m5 ; 1 * left + 256 * right + 128 993 paddw m0, m1 ; 128 * left + 129 * right + 128 994 pmaddubsw m1, m2, m4 995 paddw m2, m5 996 paddw m1, m2 997 vpermq m2, [tlq-32*2], q3120 998 mova [rsp+gprsize+32*3], m0 999 mova [rsp+gprsize+32*2], m1 1000 punpckhbw m1, m2, m3 1001 punpcklbw m2, m3 1002 pmaddubsw m0, m1, m4 1003 paddw m1, m5 1004 paddw m0, m1 1005 pmaddubsw m1, m2, m4 1006 paddw m2, m5 1007 paddw m1, m2 1008 mova [rsp+gprsize+32*1], m0 1009 mova [rsp+gprsize+32*0], m1 1010 sub r3, hq 1011 sub tlq, hq 1012 sub r3, hq 1013 ret 1014 1015%macro SMOOTH_2D_END 6 ; src[1-2], 
mul[1-2], add[1-2] 1016 pmaddubsw m0, m%3, m%1 1017 pmaddubsw m1, m%4, m%2 1018%ifnum %5 1019 paddw m0, m%5 1020%else 1021 paddw m0, %5 1022%endif 1023%ifnum %6 1024 paddw m1, m%6 1025%else 1026 paddw m1, %6 1027%endif 1028 pavgw m0, m2 1029 pavgw m1, m3 1030 psrlw m0, 8 1031 psrlw m1, 8 1032 packuswb m0, m1 1033%endmacro 1034 1035cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights 1036%define base r6-ipred_smooth_avx2_table 1037 lea r6, [ipred_smooth_avx2_table] 1038 mov wd, wm 1039 vpbroadcastb m4, [tlq+wq] ; right 1040 tzcnt wd, wd 1041 mov hd, hm 1042 mov r5, tlq 1043 sub r5, hq 1044 movsxd wq, [r6+wq*4] 1045 vpbroadcastd m5, [base+pb_127_m127] 1046 vpbroadcastb m0, [r5] ; bottom 1047 vpbroadcastd m3, [base+pw_255] 1048 add wq, r6 1049 lea v_weightsq, [base+smooth_weights+hq*2] 1050 jmp wq 1051.w4: 1052 WIN64_SPILL_XMM 12 1053 mova m10, [base+ipred_h_shuf] 1054 vpbroadcastq m11, [base+smooth_weights+4*2] 1055 mova m7, [base+ipred_v_shuf] 1056 vpbroadcastd m8, [tlq+1] 1057 sub tlq, 8 1058 lea r3, [strideq*3] 1059 sub tlq, hq 1060 punpcklbw m8, m0 ; top, bottom 1061 pshufd m6, m7, q2200 1062 pshufd m7, m7, q3311 1063 pmaddubsw m9, m8, m5 1064 paddw m3, m8 ; 1 * top + 255 * bottom + 255 1065 paddw m9, m3 ; 128 * top + 129 * bottom + 255 1066.w4_loop: 1067 vpbroadcastq m1, [tlq+hq] 1068 pshufb m1, m10 1069 punpcklbw m0, m1, m4 ; left, right 1070 punpckhbw m1, m4 1071 pmaddubsw m2, m0, m5 ; 127 * left - 127 * right 1072 pmaddubsw m3, m1, m5 1073 paddw m2, m0 ; 128 * left + 129 * right 1074 paddw m3, m1 1075 pmaddubsw m0, m11 1076 pmaddubsw m1, m11 1077 paddw m2, m0 1078 paddw m3, m1 1079 vbroadcasti128 m1, [v_weightsq] 1080 add v_weightsq, 16 1081 pshufb m0, m1, m6 1082 pshufb m1, m7 1083 SMOOTH_2D_END 0, 1, 8, 8, 9, 9 1084 vextracti128 xm1, m0, 1 1085 movd [dstq+strideq*0], xm0 1086 movd [dstq+strideq*1], xm1 1087 pextrd [dstq+strideq*2], xm0, 2 1088 pextrd [dstq+r3 ], xm1, 2 1089 cmp hd, 4 1090 je .ret 1091 lea dstq, [dstq+strideq*4] 1092 pextrd 
[dstq+strideq*0], xm0, 1 1093 pextrd [dstq+strideq*1], xm1, 1 1094 pextrd [dstq+strideq*2], xm0, 3 1095 pextrd [dstq+r3 ], xm1, 3 1096 lea dstq, [dstq+strideq*4] 1097 sub hd, 8 1098 jg .w4_loop 1099.ret: 1100 RET 1101ALIGN function_align 1102.w8: 1103 WIN64_SPILL_XMM 12 1104 mova m10, [base+ipred_h_shuf] 1105 vbroadcasti128 m11, [base+smooth_weights+8*2] 1106 mova m7, [base+ipred_v_shuf] 1107 vpbroadcastq m8, [tlq+1] 1108 sub tlq, 4 1109 lea r3, [strideq*3] 1110 sub tlq, hq 1111 punpcklbw m8, m0 1112 pshufd m6, m7, q0000 1113 pshufd m7, m7, q1111 1114 pmaddubsw m9, m8, m5 1115 paddw m3, m8 1116 paddw m9, m3 1117.w8_loop: 1118 vpbroadcastd m1, [tlq+hq] 1119 pshufb m1, m10 1120 punpcklbw m0, m1, m4 1121 punpckhbw m1, m4 1122 pmaddubsw m2, m0, m5 1123 pmaddubsw m3, m1, m5 1124 paddw m2, m0 1125 paddw m3, m1 1126 pmaddubsw m0, m11 1127 pmaddubsw m1, m11 1128 paddw m2, m0 1129 paddw m3, m1 1130 vpbroadcastq m1, [v_weightsq] 1131 add v_weightsq, 8 1132 pshufb m0, m1, m6 1133 pshufb m1, m7 1134 SMOOTH_2D_END 0, 1, 8, 8, 9, 9 1135 vextracti128 xm1, m0, 1 1136 movq [dstq+strideq*0], xm0 1137 movq [dstq+strideq*1], xm1 1138 movhps [dstq+strideq*2], xm0 1139 movhps [dstq+r3 ], xm1 1140 lea dstq, [dstq+strideq*4] 1141 sub hd, 4 1142 jg .w8_loop 1143 RET 1144ALIGN function_align 1145.w16: 1146 %assign regs_used 4 1147 ALLOC_STACK -32*4, 14 1148 %assign regs_used 7 1149 vbroadcasti128 m11, [tlq+1] 1150 lea r3, [rsp+64*2-4] 1151 punpcklbw m10, m11, m0 ; top, bottom 1152 punpckhbw m11, m0 1153 call .prep_v 1154 sub tlq, 2 1155 pmaddubsw m12, m10, m5 1156 pmaddubsw m13, m11, m5 1157 vpbroadcastd xm5, [base+pb_1] 1158 mova m9, [base+ipred_v_shuf] 1159 vbroadcasti128 m6, [base+smooth_weights+16*2] 1160 vbroadcasti128 m7, [base+smooth_weights+16*3] 1161 vperm2i128 m8, m9, m9, 0x01 1162 paddw m0, m10, m3 1163 paddw m3, m11 1164 paddw m12, m0 1165 paddw m13, m3 1166.w16_loop: 1167 vpbroadcastd m3, [tlq+hq] 1168 vpbroadcastd m0, [r3+hq*2] 1169 vpbroadcastd m1, [v_weightsq] 1170 add 
v_weightsq, 4 1171 pshufb m3, m5 1172 punpcklbw m3, m4 ; left, right 1173 pmaddubsw m2, m3, m6 1174 pmaddubsw m3, m7 1175 pshufb m0, m8 1176 pshufb m1, m9 1177 paddw m2, m0 1178 paddw m3, m0 1179 SMOOTH_2D_END 1, 1, 10, 11, 12, 13 1180 mova [dstq+strideq*0], xm0 1181 vextracti128 [dstq+strideq*1], m0, 1 1182 lea dstq, [dstq+strideq*2] 1183 sub hd, 2 1184 jg .w16_loop 1185 RET 1186ALIGN function_align 1187.w32: 1188 %assign regs_used 4 1189 ALLOC_STACK -32*4, 11 1190 %assign regs_used 7 1191 movu m8, [tlq+1] 1192 lea r3, [rsp+64*2-2] 1193 punpcklbw m7, m8, m0 1194 punpckhbw m8, m0 1195 call .prep_v 1196 dec tlq 1197 pmaddubsw m9, m7, m5 1198 pmaddubsw m10, m8, m5 1199 mova xm5, [base+smooth_weights+16*4] 1200 vinserti128 m5, [base+smooth_weights+16*6], 1 1201 mova xm6, [base+smooth_weights+16*5] 1202 vinserti128 m6, [base+smooth_weights+16*7], 1 1203 paddw m0, m7, m3 1204 paddw m3, m8 1205 paddw m9, m0 1206 paddw m10, m3 1207.w32_loop: 1208 vpbroadcastb m3, [tlq+hq] 1209 punpcklbw m3, m4 1210 vpbroadcastw m0, [r3+hq*2] 1211 vpbroadcastw m1, [v_weightsq] 1212 add v_weightsq, 2 1213 pmaddubsw m2, m3, m5 1214 pmaddubsw m3, m6 1215 paddw m2, m0 1216 paddw m3, m0 1217 SMOOTH_2D_END 1, 1, 7, 8, 9, 10 1218 mova [dstq], m0 1219 add dstq, strideq 1220 dec hd 1221 jg .w32_loop 1222 RET 1223ALIGN function_align 1224.w64: 1225 %assign regs_used 4 1226 ALLOC_STACK -32*8, 16 1227 %assign regs_used 7 1228 movu m13, [tlq+1 ] 1229 movu m15, [tlq+33] 1230 add r6, smooth_weights+16*15-ipred_smooth_avx2_table 1231 lea r3, [rsp+64*2-2] 1232 punpcklbw m12, m13, m0 1233 punpckhbw m13, m0 1234 punpcklbw m14, m15, m0 1235 punpckhbw m15, m0 1236 call .prep_v 1237 dec tlq 1238 pmaddubsw m0, m12, m5 1239 pmaddubsw m1, m13, m5 1240 pmaddubsw m2, m14, m5 1241 pmaddubsw m5, m15, m5 1242 mova xm8, [r6-16*7] 1243 vinserti128 m8, [r6-16*5], 1 1244 mova xm9, [r6-16*6] 1245 vinserti128 m9, [r6-16*4], 1 1246 mova xm10, [r6-16*3] 1247 vinserti128 m10, [r6-16*1], 1 1248 mova xm11, [r6-16*2] 1249 
vinserti128 m11, [r6-16*0], 1 1250 lea r6, [rsp+32*4] 1251 paddw m0, m3 1252 paddw m1, m3 1253 paddw m2, m3 1254 paddw m3, m5 1255 paddw m0, m12 1256 paddw m1, m13 1257 paddw m2, m14 1258 paddw m3, m15 1259 mova [r6+32*0], m0 1260 mova [r6+32*1], m1 1261 mova [r6+32*2], m2 1262 mova [r6+32*3], m3 1263.w64_loop: 1264 vpbroadcastb m5, [tlq+hq] 1265 punpcklbw m5, m4 1266 vpbroadcastw m6, [r3+hq*2] 1267 vpbroadcastw m7, [v_weightsq] 1268 add v_weightsq, 2 1269 pmaddubsw m2, m5, m8 1270 pmaddubsw m3, m5, m9 1271 paddw m2, m6 1272 paddw m3, m6 1273 SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] 1274 mova [dstq+32*0], m0 1275 pmaddubsw m2, m5, m10 1276 pmaddubsw m3, m5, m11 1277 paddw m2, m6 1278 paddw m3, m6 1279 SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] 1280 mova [dstq+32*1], m0 1281 add dstq, strideq 1282 dec hd 1283 jg .w64_loop 1284 RET 1285ALIGN function_align 1286.prep_v: 1287 vpermq m2, [tlq-32*1], q3120 1288 punpckhbw m1, m2, m4 1289 punpcklbw m2, m4 1290 pmaddubsw m0, m1, m5 ; 127 * left - 127 * right 1291 paddw m0, m1 ; 128 * left + 129 * right 1292 pmaddubsw m1, m2, m5 1293 paddw m1, m2 1294 vpermq m2, [tlq-32*2], q3120 1295 mova [rsp+gprsize+32*3], m0 1296 mova [rsp+gprsize+32*2], m1 1297 punpckhbw m1, m2, m4 1298 punpcklbw m2, m4 1299 pmaddubsw m0, m1, m5 1300 paddw m0, m1 1301 pmaddubsw m1, m2, m5 1302 paddw m1, m2 1303 mova [rsp+gprsize+32*1], m0 1304 mova [rsp+gprsize+32*0], m1 1305 sub r3, hq 1306 sub tlq, hq 1307 sub r3, hq 1308 ret 1309

;-----------------------------------------------------------------------------
; ipred_z1_8bpc
;
; Named operands (from the cglobal line): dst, stride, tl, w, h, angle, dx,
; maxbase.
;
; Entry sequence:
;   - tl is advanced by one byte,
;   - dx is loaded as a word from dr_intra_derivative, indexed by
;     (angle & 0x7e),
;   - angle is rewritten to (angle + 165) ^ 0x4ff ("d = 90 - angle"); bit
;     0x400 of the result is later tested as !enable_intra_edge_filter,
;   - control jumps through ipred_z1_avx2_table indexed by tzcnt(w)
;     (presumably to .w4/.w8/.w16/.w32/.w64 below; the table itself is
;     defined elsewhere).
;
; Constants that stay live on every path:
;   m3 = pw_512 (pmulhrsw rounding), m4 = pw_62 (frac mask),
;   m5 = pw_64  (used to form 64-frac).
;
; One statement per physical line: a ';' comment runs to the end of the line,
; so instructions must never share a line with a preceding comment.
;-----------------------------------------------------------------------------
cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
    lea             r6, [ipred_z1_avx2_table]
    tzcnt           wd, wm
    movifnidn       angled, anglem
    movifnidn       hd, hm
    lea             r7, [dr_intra_derivative]
    inc             tlq
    movsxd          wq, [r6+wq*4]
    add             wq, r6
    mov             dxd, angled
    and             dxd, 0x7e
    add             angled, 165 ; ~90
    movzx           dxd, word [r7+dxq]
    xor             angled, 0x4ff ; d = 90 - angle
    vpbroadcastd    m3, [pw_512]
    vpbroadcastd    m4, [pw_62]
    vpbroadcastd    m5, [pw_64]
    jmp             wq
.w4:
    cmp             angleb, 40
    jae             .w4_no_upsample
    lea             r3d, [angleq-1024]
    sar             r3d, 7
    add             r3d, hd
    jg              .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    ; Upsampled edge: the interleaved result is built at [rsp] and dx is doubled.
    ALLOC_STACK     -32, 8
    mova            xm1, [tlq-1]
    pshufb          xm0, xm1, [z_upsample1]
    pshufb          xm1, [z_upsample2]
    vpbroadcastd    xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
    add             dxd, dxd ; pw_512 (which is already in m3)
    pmaddubsw       xm0, xm2 ; for rounding instead of pw_2048
    pextrd          [rsp+16], xm1, 3 ; top[max_base_x]
    pmaddubsw       xm1, xm2
    movd            xm7, dxd
    mov             r3d, dxd ; xpos
    vpbroadcastw    m7, xm7
    paddw           xm1, xm0
    movq            xm0, [tlq]
    pmulhrsw        xm1, xm3
    pslldq          m6, m7, 8
    paddw           xm2, xm7, xm7
    lea             r2, [strideq*3]
    paddw           m6, m7
    packuswb        xm1, xm1
    paddw           m6, m2 ; xpos2 xpos3 xpos0 xpos1
    punpcklbw       xm0, xm1
    psllw           m7, 2
    mova            [rsp], xm0
.w4_upsample_loop: ; four rows per iteration
    lea             r5d, [r3+dxq]
    shr             r3d, 6 ; base0
    vpbroadcastq    m1, [rsp+r3]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base1
    vpbroadcastq    m2, [rsp+r5]
    lea             r5d, [r3+dxq]
    shr             r3d, 6 ; base2
    movq            xm0, [rsp+r3]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base3
    movhps          xm0, [rsp+r5]
    vpblendd        m1, m2, 0xc0
    pand            m2, m4, m6 ; frac
    vpblendd        m0, m1, 0xf0
    psubw           m1, m5, m2 ; 64-frac
    psllw           m2, 8
    por             m1, m2 ; 64-frac, frac
    pmaddubsw       m0, m1
    paddw           m6, m7 ; xpos += dx
    pmulhrsw        m0, m3
    packuswb        m0, m0
    vextracti128    xm1, m0, 1
    movd            [dstq+strideq*2], xm0
    pextrd          [dstq+r2], xm0, 1
    movd            [dstq+strideq*0], xm1
    pextrd          [dstq+strideq*1], xm1, 1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w4_upsample_loop
    RET
ALIGN function_align
.filter_strength: ; w4/w8/w16
    ; The C version uses a lot of branches, but we can do all the comparisons
    ; in parallel and use popcnt to get the final filter strength value.
    ;
    ; In:  maxbased, angled.
    ; Out: r5d = comparison mask (callers test it for zero, then popcnt it);
    ;      r3  = z_filter_t0, so 'base' addressing is valid after the call;
    ;      m0  = maxbase broadcast to every byte (reused by the w8/w16 callers).
    ; Clobbers: m1, m2; angled is shifted right by 8.
%define base r3-z_filter_t0
    lea             r3, [z_filter_t0]
    movd            xm0, maxbased
    movd            xm2, angled
    shr             angled, 8 ; is_sm << 1
    vpbroadcastb    m0, xm0
    vpbroadcastb    m2, xm2
    pcmpeqb         m1, m0, [base+z_filter_wh]
    pand            m1, m2
    mova            xm2, [r3+angleq*8] ; upper ymm half zero in both cases
    pcmpgtb         m1, m2
    pmovmskb        r5d, m1
    ret
.w4_no_upsample:
    ALLOC_STACK     -16, 11
    mov             maxbased, 7
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w4_main
    lea             maxbased, [hq+3]
    call            .filter_strength
    mov             maxbased, 7
    test            r5d, r5d
    jz              .w4_main ; filter_strength == 0
    popcnt          r5d, r5d
    vpbroadcastd    m7, [base+pb_8]
    vbroadcasti128  m2, [tlq-1]
    pminub          m1, m7, [base+z_filter_s]
    vpbroadcastd    m8, [base+z_filter_k-4+r5*4+12*0]
    pminub          m7, [base+z_filter_s+8]
    vpbroadcastd    m9, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd    m10, [base+z_filter_k-4+r5*4+12*2]
    pshufb          m0, m2, m1
    shufps          m1, m7, q2121
    pmaddubsw       m0, m8
    pshufb          m1, m2, m1
    pmaddubsw       m1, m9
    pshufb          m2, m7
    pmaddubsw       m2, m10
    paddw           m0, m1
    paddw           m0, m2
    pmulhrsw        m0, m3
    mov             r3d, 9
    mov             tlq, rsp ; the filtered edge on the stack replaces tl
    cmp             hd, 4
    cmovne          maxbased, r3d
    vextracti128    xm1, m0, 1
    packuswb        xm0, xm1
    mova            [tlq], xm0
.w4_main:
    movd            xm6, dxd
    vpbroadcastq    m0, [z_base_inc] ; base_inc << 6
    vpbroadcastb    m7, [tlq+maxbaseq]
    shl             maxbased, 6
    vpbroadcastw    m6, xm6
    mov             r3d, dxd ; xpos
    movd            xm9, maxbased
    vpbroadcastw    m9, xm9
    vbroadcasti128  m8, [z1_shuf_w4]
    psrlw           m7, 8 ; top[max_base_x]
    paddw           m10, m6, m6
    psubw           m9, m0 ; max_base_x
    vpblendd        m6, m10, 0xcc
    mova            xm0, xm10
    paddw           m6, m0 ; xpos2 xpos3 xpos0 xpos1
    paddw           m10, m10
.w4_loop: ; four rows per iteration
    lea             r5d, [r3+dxq]
    shr             r3d, 6 ; base0
    vpbroadcastq    m1, [tlq+r3]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base1
    vpbroadcastq    m2, [tlq+r5]
    lea             r5d, [r3+dxq]
    shr             r3d, 6 ; base2
    movq            xm0, [tlq+r3]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base3
    movhps          xm0, [tlq+r5]
    vpblendd        m1, m2, 0xc0
    pand            m2, m4, m6 ; frac
    vpblendd        m0, m1, 0xf0
    psubw           m1, m5, m2 ; 64-frac
    psllw           m2, 8
    pshufb          m0, m8
    por             m1, m2 ; 64-frac, frac
    pmaddubsw       m0, m1
    pcmpgtw         m1, m9, m6 ; base < max_base_x
    pmulhrsw        m0, m3
    paddw           m6, m10 ; xpos += dx
    lea             r5, [dstq+strideq*2]
    vpblendvb       m0, m7, m0, m1
    packuswb        m0, m0
    vextracti128    xm1, m0, 1
    movd            [r5+strideq*0], xm0
    pextrd          [r5+strideq*1], xm0, 1
    movd            [dstq+strideq*0], xm1
    pextrd          [dstq+strideq*1], xm1, 1
    sub             hd, 4
    jz              .w4_end
    lea             dstq, [dstq+strideq*4]
    cmp             r3d, maxbased
    jb              .w4_loop
    ; xpos reached maxbase<<6: the remaining rows are filled with m7.
    packuswb        xm7, xm7
    lea             r6, [strideq*3]
.w4_end_loop:
    movd            [dstq+strideq*0], xm7
    movd            [dstq+strideq*1], xm7
    movd            [dstq+strideq*2], xm7
    movd            [dstq+r6], xm7
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w4_end_loop
.w4_end:
    RET
ALIGN function_align
.w8:
    lea             r3d, [angleq+216]
    mov             r3b, hb
    cmp             r3d, 8
    ja              .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    ; Upsampled edge: the interleaved result is built at [rsp] and dx is doubled.
    ALLOC_STACK     -32, 8
    movu            xm2, [z_filter_s+6]
    mova            xm0, [tlq-1]
    movd            xm6, hd
    vinserti128     m0, [tlq+7], 1
    vpbroadcastb    xm6, xm6
    vbroadcasti128  m1, [z_upsample1]
    pminub          xm6, xm2
    vpbroadcastd    m7, [pb_36_m4]
    vinserti128     m2, xm6, 1
    add             dxd, dxd
    pshufb          m1, m0, m1
    pshufb          m2, m0, m2
    movd            xm6, dxd
    pmaddubsw       m1, m7
    pmaddubsw       m2, m7
    vpbroadcastw    m6, xm6
    mov             r3d, dxd
    psrldq          m0, 1
    lea             r2, [strideq*3]
    paddw           m7, m6, m6
    paddw           m1, m2
    vpblendd        m6, m7, 0xf0
    pmulhrsw        m1, m3
    pslldq          m2, m7, 8
    paddw           m7, m7
    paddw           m6, m2
    packuswb        m1, m1
    punpcklbw       m0, m1
    mova            [rsp], m0
.w8_upsample_loop: ; four rows per iteration
    lea             r5d, [r3+dxq]
    shr             r3d, 6 ; base0
    movu            xm0, [rsp+r3]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base1
    vinserti128     m0, [rsp+r5], 1
    lea             r5d, [r3+dxq]
    shr             r3d, 6 ; base2
    pand            m1, m4, m6
    psubw           m2, m5, m1
    psllw           m1, 8
    por             m2, m1
    punpcklqdq      m1, m2, m2 ; frac0 frac1
    pmaddubsw       m0, m1
    movu            xm1, [rsp+r3]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base3
    vinserti128     m1, [rsp+r5], 1
    punpckhqdq      m2, m2 ; frac2 frac3
    pmaddubsw       m1, m2
    pmulhrsw        m0, m3
    paddw           m6, m7
    pmulhrsw        m1, m3
    packuswb        m0, m1
    vextracti128    xm1, m0, 1
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*2], xm0
    movq            [dstq+strideq*1], xm1
    movhps          [dstq+r2], xm1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w8_upsample_loop
    RET
.w8_no_intra_edge_filter:
    and             maxbased, 7
    or              maxbased, 8 ; imin(h+7, 15)
    jmp             .w8_main
.w8_no_upsample:
    ALLOC_STACK     -32, 10
    lea             maxbased, [hq+7]
    test            angled, 0x400
    jnz             .w8_no_intra_edge_filter
    call            .filter_strength
    test            r5d, r5d
    jz              .w8_main ; filter_strength == 0
    popcnt          r5d, r5d
    movu            xm2, [tlq]
    ; xm0 still holds the byte-broadcast maxbase left by .filter_strength
    pminub          xm1, xm0, [base+z_filter_s+14]
    vinserti128     m2, [tlq-1], 1
    vinserti128     m1, [base+z_filter_s+0], 1
    vpbroadcastd    m7, [base+z_filter_k-4+r5*4+12*0]
    pminub          xm0, [base+z_filter_s+22]
    vinserti128     m0, [base+z_filter_s+8], 1
    pshufb          m6, m2, m1
    pmaddubsw       m6, m7
    vpbroadcastd    m7, [base+z_filter_k-4+r5*4+12*1]
    movzx           r3d, byte [tlq+15]
    shufps          m1, m0, q2121
    pshufb          m1, m2, m1
    pmaddubsw       m1, m7
    paddw           m1, m6
    sub             r5d, 3
    jnz             .w8_3tap
    ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
    ; which also results in an awkward edge case where out[w*2] is
    ; slightly different from out[max_base_x] when h > w.
    vpbroadcastd    m7, [z_filter_k+4*8]
    movzx           r2d, byte [tlq+14]
    pshufb          m2, m0
    pmaddubsw       m2, m7
    sub             r2d, r3d
    lea             r2d, [r2+r3*8+4]
    shr             r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
    mov             [rsp+16], r2b
    paddw           m1, m2
.w8_3tap:
    pmulhrsw        m1, m3
    sar             r5d, 1
    mov             tlq, rsp ; the filtered edge on the stack replaces tl
    add             r5d, 17 ; w*2 + (filter_strength == 3)
    cmp             hd, 16
    cmovns          maxbased, r5d
    mov             [tlq+r5], r3b
    vextracti128    xm0, m1, 1
    packuswb        xm0, xm1
    mova            [tlq], xm0
.w8_main:
    movd            xm2, dxd
    vbroadcasti128  m0, [z_base_inc]
    vpbroadcastw    m2, xm2
    vpbroadcastb    m7, [tlq+maxbaseq]
    shl             maxbased, 6
    movd            xm9, maxbased
    vbroadcasti128  m8, [z_filter_s+2]
    vpbroadcastw    m9, xm9
    psrlw           m7, 8
    psubw           m9, m0
    mov             r3d, dxd
    paddw           m6, m2, m2
    vpblendd        m2, m6, 0xf0
.w8_loop: ; two rows per iteration
    lea             r5d, [r3+dxq]
    shr             r3d, 6
    pand            m0, m4, m2
    psubw           m1, m5, m0
    psllw           m0, 8
    por             m1, m0
    movu            xm0, [tlq+r3]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base1
    vinserti128     m0, [tlq+r5], 1
    pshufb          m0, m8
    pmaddubsw       m0, m1
    pcmpgtw         m1, m9, m2
    paddw           m2, m6
    pmulhrsw        m0, m3
    vpblendvb       m0, m7, m0, m1
    vextracti128    xm1, m0, 1
    packuswb        xm0, xm1
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm0
    sub             hd, 2
    jz              .w8_end
    lea             dstq, [dstq+strideq*2]
    cmp             r3d, maxbased
    jb              .w8_loop
    ; xpos reached maxbase<<6: the remaining rows are filled with m7.
    packuswb        xm7, xm7
.w8_end_loop:
    movq            [dstq+strideq*0], xm7
    movq            [dstq+strideq*1], xm7
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w8_end_loop
.w8_end:
    RET
.w16_no_intra_edge_filter:
    and             maxbased, 15
    or              maxbased, 16 ; imin(h+15, 31)
    jmp             .w16_main
ALIGN function_align
.w16:
    ALLOC_STACK     -64, 12
    lea             maxbased, [hq+15]
    test            angled, 0x400
    jnz             .w16_no_intra_edge_filter
    call            .filter_strength
    test            r5d, r5d
    jz              .w16_main ; filter_strength == 0
    popcnt          r5d, r5d
    vpbroadcastd    m1, [base+pb_12]
    vbroadcasti128  m6, [base+z_filter_s+8]
    vinserti128     m2, m6, [base+z_filter_s], 0
    vinserti128     m6, [base+z_filter_s+16], 1
    mova            xm10, [tlq-1]
    vinserti128     m10, [tlq+3], 1
    vpbroadcastd    m9, [base+z_filter_k-4+r5*4+12*0]
    vbroadcasti128  m7, [base+z_filter_s+14]
    vinserti128     m8, m7, [base+z_filter_s+6], 0
    vinserti128     m7, [base+z_filter_s+22], 1
    ; m0 still holds the byte-broadcast maxbase left by .filter_strength
    psubw           m0, m1
    movu            xm11, [tlq+12]
    vinserti128     m11, [tlq+16], 1
    pminub          m8, m0
    pminub          m7, m0
    pshufb          m0, m10, m2
    shufps          m2, m6, q2121
    pmaddubsw       m0, m9
    pshufb          m1, m11, m8
    shufps          m8, m7, q2121
    pmaddubsw       m1, m9
    vpbroadcastd    m9, [base+z_filter_k-4+r5*4+12*1]
    movzx           r3d, byte [tlq+31]
    pshufb          m2, m10, m2
    pmaddubsw       m2, m9
    pshufb          m8, m11, m8
    pmaddubsw       m8, m9
    paddw           m0, m2
    paddw           m1, m8
    sub             r5d, 3
    jnz             .w16_3tap
    ; popcnt result was 3: add the third z_filter_k term (same handling as
    ; the corresponding branch in .w8_no_upsample)
    vpbroadcastd    m9, [z_filter_k+4*8]
    movzx           r2d, byte [tlq+30]
    pshufb          m10, m6
    pmaddubsw       m10, m9
    pshufb          m11, m7
    pmaddubsw       m11, m9
    sub             r2d, r3d
    lea             r2d, [r2+r3*8+4]
    shr             r2d, 3
    mov             [rsp+32], r2b
    paddw           m0, m10
    paddw           m1, m11
.w16_3tap:
    pmulhrsw        m0, m3
    pmulhrsw        m1, m3
    sar             r5d, 1
    mov             tlq, rsp ; the filtered edge on the stack replaces tl
    add             r5d, 33
    cmp             hd, 32
    cmovns          maxbased, r5d
    mov             [tlq+r5], r3b
    packuswb        m0, m1
    vpermq          m0, m0, q3120
    mova            [tlq], m0
.w16_main:
    movd            xm6, dxd
    vbroadcasti128  m0, [z_base_inc]
    vpbroadcastb    m7, [tlq+maxbaseq]
    shl             maxbased, 6
    vpbroadcastw    m6, xm6
    movd            xm9, maxbased
    vbroadcasti128  m8, [z_filter_s+2]
    vpbroadcastw    m9, xm9
    mov             r3d, dxd
    psubw           m9, m0
    paddw           m11, m6, m6
    psubw           m10, m9, m3 ; 64*8
    vpblendd        m6, m11, 0xf0
.w16_loop: ; two rows per iteration
    lea             r5d, [r3+dxq]
    shr             r3d, 6 ; base0
    pand            m1, m4, m6
    psubw           m2, m5, m1
    psllw           m1, 8
    por             m2, m1
    movu            xm0, [tlq+r3+0]
    movu            xm1, [tlq+r3+8]
    lea             r3d, [r5+dxq]
    shr             r5d, 6 ; base1
    vinserti128     m0, [tlq+r5+0], 1
    vinserti128     m1, [tlq+r5+8], 1
    pshufb          m0, m8
    pshufb          m1, m8
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    pmulhrsw        m0, m3
    pmulhrsw        m1, m3
    packuswb        m0, m1
    pcmpgtw         m1, m9, m6
    pcmpgtw         m2, m10, m6
    packsswb        m1, m2
    paddw           m6, m11
    vpblendvb       m0, m7, m0, m1
    mova            [dstq+strideq*0], xm0
    vextracti128    [dstq+strideq*1], m0, 1
    sub             hd, 2
    jz              .w16_end
    lea             dstq, [dstq+strideq*2]
    cmp             r3d, maxbased
    jb              .w16_loop
    ; xpos reached maxbase<<6: the remaining rows are filled with m7.
.w16_end_loop:
    mova            [dstq+strideq*0], xm7
    mova            [dstq+strideq*1], xm7
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w16_end_loop
.w16_end:
    RET
ALIGN function_align
.w32:
    ALLOC_STACK     -96, 15
    lea             r3d, [hq+31]
    mov             maxbased, 63
    cmp             hd, 32
    cmovs           maxbased, r3d
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w32_main
    ; Filter the edge into the stack buffer; .filter_strength is not called
    ; on this path and the z_filter_k offsets are fixed.
    vbroadcasti128  m0, [pb_0to15]
    sub             r3d, 29 ; h+2
    movu            xm13, [tlq+29] ; 32-39
    movd            xm1, r3d
    movu            xm14, [tlq+37] ; 40-47
    sub             r3d, 8 ; h-6
    vinserti128     m14, [tlq+51], 1 ; 56-63
    vpbroadcastb    xm1, xm1
    mova            xm11, [tlq-1] ; 0- 7
    vinserti128     m11, [tlq+13], 1 ; 16-23
    movd            xm2, r3d
    movu            xm12, [tlq+5] ; 8-15
    vinserti128     m12, [tlq+19], 1 ; 24-31
    pminub          xm1, xm0 ; clip 32x8
    mova            m7, [z_filter_s+0]
    pshufb          xm13, xm1
    vpbroadcastd    m1, [pb_12]
    vpbroadcastb    xm2, xm2
    vinserti128     m13, [tlq+43], 1 ; 48-55
    vinserti128     m8, m7, [z_filter_s+4], 1
    vpblendd        m2, m1, 0xf0
    vinserti128     m7, [z_filter_s+12], 0
    pminub          m2, m0 ; clip 32x16 and 32x(32|64)
    vpbroadcastd    m9, [z_filter_k+4*2+12*0]
    pshufb          m14, m2
    pshufb          m0, m11, m8
    shufps          m8, m7, q1021
    pmaddubsw       m0, m9
    pshufb          m2, m12, m8
    pmaddubsw       m2, m9
    pshufb          m1, m13, m8
    pmaddubsw       m1, m9
    pshufb          m6, m14, m8
    pmaddubsw       m6, m9
    vpbroadcastd    m9, [z_filter_k+4*2+12*1]
    pshufb          m10, m11, m8
    shufps          m8, m7, q2121
    pmaddubsw       m10, m9
    paddw           m0, m10
    pshufb          m10, m12, m8
    pmaddubsw       m10, m9
    paddw           m2, m10
    pshufb          m10, m13, m8
    pmaddubsw       m10, m9
    paddw           m1, m10
    pshufb          m10, m14, m8
    pmaddubsw       m10, m9
    paddw           m6, m10
    vpbroadcastd    m9, [z_filter_k+4*2+12*2]
    pshufb          m11, m8
    pmaddubsw       m11, m9
    pshufb          m12, m7
    pmaddubsw       m12, m9
    movzx           r3d, byte [tlq+63]
    movzx           r2d, byte [tlq+62]
    paddw           m0, m11
    paddw           m2, m12
    pshufb          m13, m7
    pmaddubsw       m13, m9
    pshufb          m14, m7
    pmaddubsw       m14, m9
    paddw           m1, m13
    paddw           m6, m14
    sub             r2d, r3d
    lea             r2d, [r2+r3*8+4] ; edge case for 32x64
    pmulhrsw        m0, m3
    pmulhrsw        m2, m3
    pmulhrsw        m1, m3
    pmulhrsw        m6, m3
    shr             r2d, 3
    mov             [rsp+64], r2b
    mov             tlq, rsp ; the filtered edge on the stack replaces tl
    mov             [tlq+65], r3b
    mov             r3d, 65
    cmp             hd, 64
    cmove           maxbased, r3d
    packuswb        m0, m2
    packuswb        m1, m6
    mova            [tlq+0], m0
    mova            [tlq+32], m1
.w32_main:
    movd            xm6, dxd
    vpbroadcastb    m7, [tlq+maxbaseq]
    shl             maxbased, 6
    vpbroadcastw    m6, xm6
    movd            xm9, maxbased
    vbroadcasti128  m8, [z_filter_s+2]
    vpbroadcastw    m9, xm9
    mov             r5d, dxd
    psubw           m9, [z_base_inc]
    mova            m11, m6
    psubw           m10, m9, m3 ; 64*8
.w32_loop: ; one row per iteration
    mov             r3d, r5d
    shr             r3d, 6
    pand            m1, m4, m6
    psubw           m2, m5, m1
    psllw           m1, 8
    por             m2, m1
    movu            m0, [tlq+r3+0]
    movu            m1, [tlq+r3+8]
    add             r5d, dxd
    pshufb          m0, m8
    pshufb          m1, m8
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    pmulhrsw        m0, m3
    pmulhrsw        m1, m3
    packuswb        m0, m1
    pcmpgtw         m1, m9, m6
    pcmpgtw         m2, m10, m6
    packsswb        m1, m2
    paddw           m6, m11
    vpblendvb       m0, m7, m0, m1
    mova            [dstq], m0
    dec             hd
    jz              .w32_end
    add             dstq, strideq
    cmp             r5d, maxbased
    jb              .w32_loop
    ; xpos reached maxbase<<6: fill the remaining rows with m7. The fill loop
    ; writes two rows at a time, so an odd remainder is peeled off first.
    test            hb, 1
    jz              .w32_end_loop
    mova            [dstq], m7
    add             dstq, strideq
    dec             hd
    jz              .w32_end
.w32_end_loop:
    mova            [dstq+strideq*0], m7
    mova            [dstq+strideq*1], m7
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w32_end_loop
.w32_end:
    RET
ALIGN function_align
.w64:
    ALLOC_STACK     -128, 16
    lea             maxbased, [hq+63]
    test            angled, 0x400 ; !enable_intra_edge_filter
    jnz             .w64_main
    ; Filter the edge into the stack buffer in two halves: bytes 0-63 first,
    ; then 64-127 (source offsets per the range comments on the loads).
    mova            xm11, [tlq-1] ; 0- 7
    vinserti128     m11, [tlq+13], 1 ; 16-23
    movu            xm12, [tlq+5] ; 8-15
    vinserti128     m12, [tlq+19], 1 ; 24-31
    mova            m7, [z_filter_s+0]
    vinserti128     m8, m7, [z_filter_s+4], 1
    vinserti128     m7, [z_filter_s+12], 0
    vpbroadcastd    m9, [z_filter_k+4*2+12*0]
    movu            xm13, [tlq+29] ; 32-39
    vinserti128     m13, [tlq+43], 1 ; 48-55
    movu            xm14, [tlq+37] ; 40-47
    vinserti128     m14, [tlq+51], 1 ; 56-63
    pshufb          m0, m11, m8
    shufps          m8, m7, q1021
    pmaddubsw       m0, m9
    pshufb          m2, m12, m8
    pmaddubsw       m2, m9
    pshufb          m1, m13, m8
    pmaddubsw       m1, m9
    pshufb          m6, m14, m8
    pmaddubsw       m6, m9
    vpbroadcastd    m9, [z_filter_k+4*2+12*1]
    pshufb          m10, m11, m8
    shufps          m15, m8, m7, q2121
    pmaddubsw       m10, m9
    paddw           m0, m10
    pshufb          m10, m12, m15
    pmaddubsw       m10, m9
    paddw           m2, m10
    pshufb          m10, m13, m15
    pmaddubsw       m10, m9
    paddw           m1, m10
    pshufb          m10, m14, m15
    pmaddubsw       m10, m9
    paddw           m6, m10
    vpbroadcastd    m10, [z_filter_k+4*2+12*2]
    pshufb          m11, m15
    pmaddubsw       m11, m10
    pshufb          m12, m7
    pmaddubsw       m12, m10
    pshufb          m13, m7
    pmaddubsw       m13, m10
    pshufb          m14, m7
    pmaddubsw       m14, m10
    paddw           m0, m11
    paddw           m2, m12
    paddw           m1, m13
    paddw           m6, m14
    movu            xm11, [tlq+61] ; 64- 71
    vinserti128     m11, [tlq+75], 1 ; 80- 87
    movu            xm12, [tlq+69] ; 72- 79
    vinserti128     m12, [tlq+83], 1 ; 88- 95
    movu            xm13, [tlq+93] ; 96-103
    vinserti128     m13, [tlq+107], 1 ; 112-119
    movu            xm14, [tlq+101] ; 104-111
    vinserti128     m14, [tlq+115], 1 ; 120-127
    pmulhrsw        m0, m3
    pmulhrsw        m2, m3
    pmulhrsw        m1, m3
    pmulhrsw        m6, m3
    lea             r3d, [hq-20]
    mov             tlq, rsp ; the filtered edge on the stack replaces tl
    packuswb        m0, m2
    packuswb        m1, m6
    vpbroadcastd    xm2, [pb_14]
    vbroadcasti128  m6, [pb_0to15]
    mova            [tlq+32*0], m0
    mova            [tlq+32*1], m1
    movd            xm0, r3d
    vpbroadcastd    m1, [pb_12]
    vpbroadcastb    m0, xm0
    paddb           m0, m2
    pminub          m0, m6 ; clip 64x16 and 64x32
    pshufb          m12, m0
    pminub          m1, m6 ; clip 64x64
    pshufb          m14, m1
    pshufb          m0, m11, m7
    pmaddubsw       m0, m10
    pshufb          m2, m12, m7
    pmaddubsw       m2, m10
    pshufb          m1, m13, m7
    pmaddubsw       m1, m10
    pshufb          m6, m14, m7
    pmaddubsw       m6, m10
    pshufb          m7, m11, m15
    pmaddubsw       m7, m9
    pshufb          m10, m12, m15
    pmaddubsw       m10, m9
    paddw           m0, m7
    pshufb          m7, m13, m15
    pmaddubsw       m7, m9
    paddw           m2, m10
    pshufb          m10, m14, m15
    pmaddubsw       m10, m9
    paddw           m1, m7
    paddw           m6, m10
    vpbroadcastd    m9, [z_filter_k+4*2+12*0]
    pshufb          m11, m8
    pmaddubsw       m11, m9
    pshufb          m12, m8
    pmaddubsw       m12, m9
    pshufb          m13, m8
    pmaddubsw       m13, m9
    pshufb          m14, m8
    pmaddubsw       m14, m9
    paddw           m0, m11
    paddw           m2, m12
    paddw           m1, m13
    paddw           m6, m14
    pmulhrsw        m0, m3
    pmulhrsw        m2, m3
    pmulhrsw        m1, m3
    pmulhrsw        m6, m3
    packuswb        m0, m2
    packuswb        m1, m6
    mova            [tlq+32*2], m0
    mova            [tlq+32*3], m1
.w64_main:
    movd            xm12, dxd
    vpbroadcastb    m7, [tlq+maxbaseq]
    lea             r3d, [dxq-64]
    shl             maxbased, 6
    vpbroadcastw    m12, xm12
    sub             r3d, maxbased
    vbroadcasti128  m8, [z_filter_s+2]
    movd            xm6, r3d
    mov             r5d, dxd
    mova            m10, [pb_1to32]
    vpbroadcastd    m11, [pb_32]
    vpbroadcastw    m6, xm6 ; m6 = dx - 64 - (maxbase << 6), advanced by dx per row
.w64_loop: ; one row per iteration, stored as two 32-byte halves
    mov             r3d, r5d
    shr             r3d, 6
    movu            m0, [tlq+r3+0]
    movu            m1, [tlq+r3+8]
    pand            m2, m4, m6
    psubw           m9, m5, m2
    psllw           m2, 8
    por             m9, m2
    pshufb          m0, m8
    pshufb          m1, m8
    pmaddubsw       m0, m9
    pmaddubsw       m1, m9
    psraw           m2, m6, 6
    pmulhrsw        m0, m3
    pmulhrsw        m1, m3
    packsswb        m2, m2
    paddb           m2, m10 ; per-byte selector for the vpblendvb below
    packuswb        m0, m1
    vpblendvb       m0, m7, m0, m2
    mova            [dstq+0], m0
    movu            m0, [tlq+r3+32]
    movu            m1, [tlq+r3+40]
    add             r5d, dxd
    pshufb          m0, m8
    pshufb          m1, m8
    pmaddubsw       m0, m9
    pmaddubsw       m1, m9
    paddb           m2, m11 ; selector for the second half (+pb_32)
    pmulhrsw        m0, m3
    pmulhrsw        m1, m3
    paddw           m6, m12
    packuswb        m0, m1
    vpblendvb       m0, m7, m0, m2
    mova            [dstq+32], m0
    dec             hd
    jz              .w64_end
    add             dstq, strideq
    cmp             r5d, maxbased
    jb              .w64_loop
    ; xpos reached maxbase<<6: the remaining rows are filled with m7.
.w64_end_loop:
    mova            [dstq+0], m7
    mova            [dstq+32], m7
    add             dstq, strideq
    dec             hd
    jg              .w64_end_loop
.w64_end:
    RET

2131 2132 cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy 2133%define base r9-z_filter_t0 2134 lea r9, [ipred_z2_avx2_table] 2135 tzcnt wd, wm 2136 movifnidn angled, anglem 2137 movifnidn hd, hm 2138 lea dxq, [dr_intra_derivative-90] 2139 movsxd wq, [r9+wq*4] 2140 movzx dyd, angleb 2141 xor angled, 0x400 2142 mov r8, dxq 2143 sub dxq, dyq 2144 add wq, r9 2145 add r9, z_filter_t0-ipred_z2_avx2_table 2146 mova m2, [tlq-64] 2147 mova m0, [tlq-32] 2148 mova m1, [tlq] 2149 and dyd, ~1 2150 and dxq, ~1 2151 movzx dyd, word [r8+dyq] ; angle - 90 2152 movzx dxd, word [dxq+270] ; 180 - angle 2153 vpbroadcastd m13, [base+pw_512] 2154 vpbroadcastd m14, [base+pw_62] 2155 vpbroadcastd m15, [base+pw_64] 2156 mova [rsp+ 0], m2 2157 mova [rsp+32], m0 2158 mova [rsp+64], m1 2159 neg dxd 2160 neg dyd 2161 jmp wq 2162.w4: 2163 vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 2164 vbroadcasti128 m10, [base+z1_shuf_w4] 2165 vbroadcasti128 m11, [base+z2_shuf_h4] 2166 lea r2d, [dxq+(65<<6)] ; xpos 2167 movd xm5, dyd 2168 mov r8d, (63-4)<<6 2169 mov dyq, -4 2170 pshuflw xm5, xm5, q0000 2171 pmullw xm5, [base+z2_ymul] 2172 test angled, 0x400 2173 jnz .w4_main ; !enable_intra_edge_filter 2174 lea r3d, [hq+2] 2175 add angled, 1022 2176 shl r3d, 6 2177 test
r3d, angled 2178 jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) 2179 vpbroadcastd xm3, [base+pb_4] 2180 call .upsample_above 2181 sub angled, 1075 ; angle - 53 2182 lea r3d, [hq+3] 2183 xor angled, 0x7f ; 180 - angle 2184 call .filter_strength 2185 jmp .w4_filter_left 2186ALIGN function_align 2187.filter_strength: 2188 movd xm8, r3d 2189 mov r3d, angled 2190 movd xm7, angled 2191 vpbroadcastb m8, xm8 2192 shr r3d, 8 ; is_sm << 1 2193 vpbroadcastb m7, xm7 2194 pcmpeqb m8, [base+z_filter_wh] 2195 mova xm9, [r9+r3*8] 2196 pand m0, m8, m7 2197 pcmpgtb m0, m9 2198 pmovmskb r3d, m0 2199 ret 2200ALIGN function_align 2201.upsample_above: ; w4/w8 2202 pshufb xm2, xm1, [base+z_upsample1-2] 2203 pminub xm3, [base+z_filter_s+4] 2204 vpbroadcastd xm4, [base+pb_36_m4] 2205 vbroadcasti128 m10, [base+pb_0to15] 2206 pshufb xm3, xm1, xm3 2207 pmaddubsw xm2, xm4 2208 pmaddubsw xm3, xm4 2209 lea r2d, [r2+dxq+(1<<6)] 2210 add dxd, dxd 2211 paddw xm2, xm3 2212 pmulhrsw xm2, xm13 2213 sub r8d, 3<<6 2214 paddw m6, m6 2215 packuswb xm2, xm2 2216 punpcklbw xm1, xm2 2217 mova [rsp+gprsize+64], xm1 2218 ret 2219ALIGN function_align 2220.upsample_left: ; h4/h8 2221 mov r3d, hd 2222 and r3d, 4 2223 movd xm2, [rsp+gprsize+64] 2224 movddup xm0, [rsp+gprsize+56] 2225 movd xm1, r3d 2226 palignr xm2, xm0, 1 2227 vpbroadcastb xm1, xm1 2228 pshufb xm2, [base+z_filter_s+18] 2229 vpbroadcastd xm3, [base+pb_36_m4] 2230 pmaxub xm1, [base+z_upsample1-2] 2231 pshufb xm1, xm0, xm1 2232 pmaddubsw xm2, xm3 2233 pmaddubsw xm1, xm3 2234 paddw xm5, xm5 2235 add dyq, dyq 2236 paddw xm1, xm2 2237 pmulhrsw xm1, xm13 2238 vbroadcasti128 m11, [base+z2_upsample] 2239 paddw xm5, xm15 2240 packuswb xm1, xm1 2241 punpcklbw xm0, xm1 2242 mova [rsp+gprsize+48], xm0 2243 ret 2244.w4_no_upsample_above: 2245 lea r3d, [hq+3] 2246 sub angled, 1112 ; angle - 90 2247 call .filter_strength 2248 test r3d, r3d 2249 jz .w4_no_filter_above 2250 popcnt r3d, r3d 2251 vpbroadcastd xm2, [base+pb_4] 2252 pminub xm2, 
[base+z_filter_s] 2253 vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] 2254 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 2255 pshufb xm3, xm1, xm2 ; 00 01 12 23 2256 pshufd xm2, xm2, q0321 2257 pmaddubsw xm0, xm3, xm0 2258 pshufb xm2, xm1, xm2 ; 12 23 34 44 2259 pmaddubsw xm2, xm4 2260 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] 2261 punpckhqdq xm3, xm3 ; 34 44 44 44 2262 pmaddubsw xm3, xm4 2263 vpbroadcastd xm4, r6m ; max_width 2264 packssdw xm4, xm4 2265 paddw xm0, xm2 2266 paddw xm0, xm3 2267 pmulhrsw xm0, xm13 2268 packsswb xm4, xm4 2269 psrlq xm1, 8 2270 psubb xm4, [base+pb_1to32] 2271 packuswb xm0, xm0 2272 vpblendvb xm0, xm1, xm4 2273 movd [rsp+65], xm0 2274.w4_no_filter_above: 2275 lea r3d, [hq+2] 2276 add angled, 973 ; angle + 883 2277 shl r3d, 6 2278 test r3d, angled 2279 jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) 2280 vpbroadcastd xm0, [base+pb_90] 2281 psubb xm0, xm7 ; 180 - angle 2282 pand xm0, xm8 ; reuse from previous filter_strength call 2283 pcmpgtb xm0, xm9 2284 pmovmskb r3d, xm0 2285.w4_filter_left: 2286 test r3d, r3d 2287 jz .w4_main 2288 popcnt r3d, r3d 2289 mov r5d, 10 2290 cmp hd, 16 2291 movu xm2, [rsp+49] 2292 vinserti128 m2, [rsp+43], 1 2293 cmovs r5d, hd 2294 xor r5d, 15 ; h == 16 ? 
5 : 15 - h 2295 movd xm0, r5d 2296 vbroadcasti128 m1, [base+z_filter_s+12] 2297 vbroadcasti128 m4, [base+z_filter_s+16] 2298 vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab 2299 vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd 2300 vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef 2301 vpbroadcastb m0, xm0 2302 pmaxub m0, m3 2303 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] 2304 pshufb m0, m2, m0 2305 pmaddubsw m0, m3 2306 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] 2307 pshufb m1, m2, m1 2308 pmaddubsw m1, m3 2309 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] 2310 pshufb m2, m4 2311 pmaddubsw m2, m3 2312 vpbroadcastd xm4, r7m ; max_height 2313 packssdw xm4, xm4 2314 paddw m1, m0 2315 paddw m1, m2 2316 pmulhrsw m1, m13 2317 packsswb xm4, xm4 2318 vextracti128 xm0, m1, 1 2319 psubb xm4, [base+pb_16to1] 2320 packuswb xm0, xm1 2321 vpblendvb xm0, [rsp+48], xm4 2322 mova [rsp+48], xm0 2323 jmp .w4_main 2324.w4_upsample_left: 2325 call .upsample_left 2326.w4_main: 2327 movd xm0, dxd 2328 mova m12, [base+z2_y_shuf_h4] 2329 lea r5, [rsp+56] ; left-7 2330 vpbroadcastw m0, xm0 2331 lea r9, [strideq*3] 2332 psraw xm1, xm5, 6 2333 pand xm5, xm14 ; frac_y 2334 pxor xm2, xm2 2335 paddw m7, m0, m0 2336 psubw xm4, xm2, xm1 ; base_y 2337 vpblendd m0, m7, 0xcc 2338 mova xm1, xm7 2339 punpcklwd xm4, xm2 2340 paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 2341 psubw xm1, xm15, xm5 ; 64-frac_y 2342 psllw xm5, 8 2343 paddw m7, m7 2344 paddw m6, m0 2345 por xm5, xm1 ; 64-frac_y, frac_y 2346 vpbroadcastq m5, xm5 2347.w4_loop: 2348 lea r3d, [r2+dxq] 2349 shr r2d, 6 ; base_x0 2350 vpbroadcastq m1, [rsp+r2] 2351 lea r2d, [r3+dxq] 2352 shr r3d, 6 ; base_x1 2353 vpbroadcastq m2, [rsp+r3] 2354 lea r3d, [r2+dxq] 2355 shr r2d, 6 ; base_x2 2356 movq xm0, [rsp+r2] 2357 lea r2d, [r3+dxq] 2358 shr r3d, 6 ; base_x3 2359 movhps xm0, [rsp+r3] 2360 vpblendd m1, m2, 0xc0 2361 pand m2, m14, m6 ; frac_x 
2362 vpblendd m0, m1, 0xf0 2363 psubw m1, m15, m2 ; 64-frac_x 2364 psllw m2, 8 2365 pshufb m0, m10 2366 por m1, m2 ; 64-frac_x, frac_x 2367 pmaddubsw m0, m1 2368 cmp r3d, 64 2369 jge .w4_toponly 2370 mova m1, m7 ; arbitrary negative value 2371 vpgatherdq m3, [r5+xm4], m1 2372 pshufb m1, m3, m11 2373 vpermd m1, m12, m1 2374 pmaddubsw m1, m5 2375 psraw m2, m6, 15 ; base_x < topleft 2376 vpblendvb m0, m1, m2 2377.w4_toponly: 2378 pmulhrsw m0, m13 2379 paddw m6, m7 ; xpos += dx 2380 add r5, dyq 2381 packuswb m0, m0 2382 vextracti128 xm1, m0, 1 2383 movd [dstq+strideq*2], xm0 2384 pextrd [dstq+r9 ], xm0, 1 2385 movd [dstq+strideq*0], xm1 2386 pextrd [dstq+strideq*1], xm1, 1 2387 sub hd, 4 2388 jz .w4_end 2389 lea dstq, [dstq+strideq*4] 2390 cmp r2d, r8d 2391 jge .w4_loop 2392.w4_leftonly_loop: 2393 mova m1, m7 2394 vpgatherdq m2, [r5+xm4], m1 2395 add r5, dyq 2396 pshufb m0, m2, m11 2397 vpermd m0, m12, m0 2398 pmaddubsw m0, m5 2399 pmulhrsw m0, m13 2400 packuswb m0, m0 2401 vextracti128 xm1, m0, 1 2402 movd [dstq+strideq*2], xm0 2403 pextrd [dstq+r9 ], xm0, 1 2404 movd [dstq+strideq*0], xm1 2405 pextrd [dstq+strideq*1], xm1, 1 2406 lea dstq, [dstq+strideq*4] 2407 sub hd, 4 2408 jg .w4_leftonly_loop 2409.w4_end: 2410 RET 2411.w8: 2412 vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 2413 movd xm5, dyd 2414 vbroadcasti128 m10, [base+z_filter_s+2] 2415 vbroadcasti128 m11, [base+z2_shuf_h4] 2416 lea r2d, [dxq+(65<<6)] ; xpos 2417 vpbroadcastw xm5, xm5 2418 mov r8d, (63-8)<<6 2419 mov dyq, -4 2420 pmullw xm5, [base+z2_ymul] 2421 test angled, 0x400 2422 jnz .w8_main 2423 lea r3d, [angleq+126] 2424 mov r3b, hb 2425 cmp r3d, 8 2426 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm 2427 vpbroadcastd xm3, [base+pb_8] 2428 movhps [rsp+80], xm1 2429 call .upsample_above 2430 sub angled, 53 ; angle - 53 2431 lea r3d, [hq+7] 2432 xor angled, 0x7f ; 180 - angle 2433 call .filter_strength 2434 jmp .w8_filter_left 2435.w8_no_upsample_above: 2436 lea r3d, [hq+7] 2437 sub 
angled, 90 ; angle - 90 2438 call .filter_strength 2439 test r3d, r3d 2440 jz .w8_no_filter_above 2441 popcnt r3d, r3d 2442 vpbroadcastd xm3, [base+pb_8] 2443 pminub xm3, [base+z_filter_s+8] 2444 vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] 2445 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] 2446 pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 2447 pmaddubsw xm0, xm2, xm0 2448 pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 2449 shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 2450 pmaddubsw xm2, xm4 2451 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] 2452 pmaddubsw xm3, xm4 2453 vpbroadcastd xm4, r6m ; max_width 2454 packssdw xm4, xm4 2455 paddw xm0, xm2 2456 paddw xm0, xm3 2457 pmulhrsw xm0, xm13 2458 packsswb xm4, xm4 2459 psrldq xm1, 1 2460 psubb xm4, [base+pb_1to32] 2461 packuswb xm0, xm0 2462 vpblendvb xm0, xm1, xm4 2463 movq [rsp+65], xm0 2464.w8_no_filter_above: 2465 lea r3d, [angleq-51] 2466 mov r3b, hb 2467 cmp r3d, 8 2468 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm 2469 vpbroadcastd m0, [base+pb_90] 2470 psubb m0, m7 2471 pand m0, m8 2472 pcmpgtb m0, m9 2473 pmovmskb r3d, m0 2474.w8_filter_left: 2475 test r3d, r3d 2476 jz .w8_main 2477 popcnt r3d, r3d 2478 vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] 2479 vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] 2480 vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] 2481 cmp hd, 32 2482 jne .w8_filter_left_h16 2483 movu xm2, [rsp+27] 2484 vinserti128 m2, [rsp+35], 1 2485 vpbroadcastd xm0, [base+pb_5] 2486 vbroadcasti128 m3, [base+z_filter_s+ 8] 2487 vbroadcasti128 m1, [base+z_filter_s+12] 2488 vbroadcasti128 m4, [base+z_filter_s+16] 2489 pmaxub m3, m0 2490 pshufb m3, m2, m3 2491 pmaddubsw m3, m7 2492 pshufb m1, m2, m1 2493 pmaddubsw m1, m8 2494 pshufb m2, m4 2495 pmaddubsw m2, m9 2496 paddw m3, m1 2497 paddw m3, m2 2498 pmulhrsw m3, m13 2499 jmp .w8_filter_left_top16 2500.w8_filter_left_h16: 2501 mov r5d, 10 2502 cmp hd, 16 2503 cmovs r5d, hd 2504 xor r5d, 15 ; h == 16 ? 
5 : 15 - h 2505 movd xm0, r5d 2506 vpbroadcastb m0, xm0 2507.w8_filter_left_top16: 2508 vbroadcasti128 m1, [base+z_filter_s+12] 2509 vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab 2510 vbroadcasti128 m4, [base+z_filter_s+16] 2511 vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd 2512 vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef 2513 pmaxub m0, m2 2514 movu xm2, [rsp+49] 2515 vinserti128 m2, [rsp+43], 1 2516 pshufb m0, m2, m0 2517 pmaddubsw m0, m7 2518 vpbroadcastd m7, r7m ; max_height 2519 pshufb m1, m2, m1 2520 pmaddubsw m1, m8 2521 pshufb m2, m4 2522 pmaddubsw m2, m9 2523 packssdw m7, m7 2524 paddw m1, m0 2525 packsswb m7, m7 2526 paddw m1, m2 2527 pmulhrsw m1, m13 2528 psubb m7, [base+pb_32to1] 2529 packuswb m3, m1 2530 vpermq m3, m3, q1320 2531 vpblendvb m3, [rsp+32], m7 2532 mova [rsp+32], m3 2533 jmp .w8_main 2534.w8_upsample_left: 2535 call .upsample_left 2536.w8_main: 2537 movd xm3, dxd 2538 lea r5, [rsp+56] ; left-7 2539 pshufd xm1, xm5, q3120 2540 pand xm5, xm14 2541 vpbroadcastw m3, xm3 2542 pxor xm0, xm0 2543 psubw xm2, xm15, xm5 2544 psraw xm1, 6 2545 lea r9, [strideq*3] 2546 paddw m7, m3, m3 2547 psubw xm9, xm0, xm1 ; base_y 2548 psllw xm5, 8 2549 punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 2550 vpblendd m3, m7, 0xf0 ; xpos0 xpos1 2551 por xm5, xm2 ; 64-frac_y, frac_y 2552 punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 2553 paddw m6, m3 2554 vinserti128 m12, m5, xm5, 1 2555.w8_loop: 2556 lea r3d, [r2+dxq] 2557 shr r2d, 6 ; base_x0 2558 movu xm0, [rsp+r2] 2559 lea r2d, [r3+dxq] 2560 shr r3d, 6 ; base_x1 2561 vinserti128 m0, [rsp+r3], 1 2562 lea r3d, [r2+dxq] 2563 shr r2d, 6 ; base_x2 2564 movu xm1, [rsp+r2] 2565 lea r2d, [r3+dxq] 2566 shr r3d, 6 ; base_x3 2567 vinserti128 m1, [rsp+r3], 1 2568 pand m2, m14, m6 2569 paddsw m4, m6, m7 2570 psubw m5, m15, m2 2571 psllw m2, 8 2572 pshufb m0, m10 2573 por m2, m5 2574 pmaddubsw m0, m2 2575 pand m2, m14, m4 
2576 psubw m5, m15, m2 2577 psllw m2, 8 2578 pshufb m1, m10 2579 por m2, m5 2580 pmaddubsw m1, m2 2581 cmp r3d, 64 2582 jge .w8_toponly 2583 mova m5, m7 2584 vpgatherdq m3, [r5+xm9], m7 2585 mova m7, m5 2586 vpgatherdq m2, [r5+xm8], m5 2587 pshufb m3, m11 2588 pshufb m2, m11 2589 punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 2590 punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 2591 vpermq m5, m5, q3120 ; y0 y1 2592 vpermq m2, m2, q3120 ; y2 y3 2593 pmaddubsw m5, m12 2594 pmaddubsw m2, m12 2595 psraw m6, 15 ; base_x < topleft 2596 vpblendvb m0, m5, m6 2597 psraw m3, m4, 15 2598 vpblendvb m1, m2, m3 2599.w8_toponly: 2600 pmulhrsw m0, m13 2601 pmulhrsw m1, m13 2602 paddw m6, m4, m7 ; xpos += dx 2603 add r5, dyq 2604 packuswb m0, m1 2605 vextracti128 xm1, m0, 1 2606 movq [dstq+strideq*0], xm0 2607 movhps [dstq+strideq*2], xm0 2608 movq [dstq+strideq*1], xm1 2609 movhps [dstq+r9 ], xm1 2610 sub hd, 4 2611 jz .w8_end 2612 lea dstq, [dstq+strideq*4] 2613 cmp r2d, r8d 2614 jge .w8_loop 2615.w8_leftonly_loop: 2616 mova m0, m7 2617 vpgatherdq m5, [r5+xm9], m7 2618 mova m7, m0 2619 vpgatherdq m3, [r5+xm8], m0 2620 add r5, dyq 2621 pshufb m2, m5, m11 2622 pshufb m1, m3, m11 2623 punpckldq m0, m1, m2 2624 punpckhdq m1, m2 2625 vpermq m0, m0, q3120 2626 vpermq m1, m1, q3120 2627 pmaddubsw m0, m12 2628 pmaddubsw m1, m12 2629 pmulhrsw m0, m13 2630 pmulhrsw m1, m13 2631 packuswb m0, m1 2632 vextracti128 xm1, m0, 1 2633 movq [dstq+strideq*0], xm0 2634 movhps [dstq+strideq*2], xm0 2635 movq [dstq+strideq*1], xm1 2636 movhps [dstq+r9 ], xm1 2637 lea dstq, [dstq+strideq*4] 2638 sub hd, 4 2639 jg .w8_leftonly_loop 2640.w8_end: 2641 RET 2642.w16: 2643 mov r8d, hd 2644 test angled, 0x400 2645 jnz .w16_main 2646 lea r3d, [hq+15] 2647 sub angled, 90 2648 call .filter_strength 2649 test r3d, r3d 2650 jz .w16_no_filter_above 2651 popcnt r3d, r3d 2652 vbroadcasti128 m6, [tlq+1] 2653 mova xm2, [base+z_filter_s] 2654 vinserti128 m2, 
[base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de 2655 movu xm3, [base+z_filter_s+8] 2656 vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff 2657 vpblendd m1, m6, 0xf0 2658 vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] 2659 vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] 2660 vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] 2661 pshufb m2, m1, m2 2662 pshufb m1, m3 2663 pmaddubsw m0, m2, m0 2664 shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff 2665 pmaddubsw m2, m4 2666 pmaddubsw m1, m5 2667 vpbroadcastd xm4, r6m ; max_width 2668 packssdw xm4, xm4 2669 paddw m0, m2 2670 paddw m0, m1 2671 pmulhrsw m0, m13 2672 packsswb xm4, xm4 2673 vextracti128 xm2, m0, 1 2674 psubb xm4, [base+pb_1to32] 2675 packuswb xm0, xm2 2676 vpblendvb xm0, xm6, xm4 2677 movu [rsp+65], xm0 2678.w16_no_filter_above: 2679 vpbroadcastd m0, [base+pb_90] 2680 psubb m0, m7 2681 pand m0, m8 2682 pcmpgtb m0, m9 2683 pmovmskb r3d, m0 2684 test r3d, r3d 2685 jz .w16_main 2686 popcnt r3d, r3d 2687 vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] 2688 vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] 2689 vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] 2690.w16_filter_left: 2691 vpbroadcastd m6, r7m ; max_height 2692 packssdw m6, m6 2693 packsswb m6, m6 2694 cmp hd, 32 2695 jl .w16_filter_left_h16 2696 vpbroadcastd xm0, [base+pb_5] 2697 vbroadcasti128 m10, [base+z_filter_s+ 8] 2698 vbroadcasti128 m11, [base+z_filter_s+12] 2699 vbroadcasti128 m12, [base+z_filter_s+16] 2700 je .w16_filter_left_h32 2701 movu m3, [tlq-69] 2702 movu m5, [tlq-61] 2703 pmaxub m1, m10, m0 2704 pshufb m1, m3, m1 2705 pmaddubsw m1, m7 2706 pshufb m2, m3, m11 2707 pmaddubsw m2, m8 2708 pshufb m3, m12 2709 pmaddubsw m3, m9 2710 paddw m1, m2 2711 pshufb m2, m5, m10 2712 pmaddubsw m2, m7 2713 pshufb m4, m5, m11 2714 pmaddubsw m4, m8 2715 pshufb m5, m12 2716 pmaddubsw m5, m9 2717 paddw m1, m3 2718 vpbroadcastd m3, [base+pb_32] 2719 paddb m3, [base+pb_32to1] 
2720 paddw m2, m4 2721 paddw m2, m5 2722 pmulhrsw m1, m13 2723 pmulhrsw m2, m13 2724 psubb m3, m6, m3 2725 packuswb m1, m2 2726 vpblendvb m1, [tlq-64], m3 2727 mova [rsp], m1 2728 jmp .w16_filter_left_top32 2729.w16_filter_left_h32: 2730 pmaxub m10, m0 2731.w16_filter_left_top32: 2732 movu xm2, [tlq-37] 2733 vinserti128 m2, [tlq-29], 1 2734 pshufb m3, m2, m10 2735 pshufb m1, m2, m11 2736 pshufb m2, m12 2737 pmaddubsw m3, m7 2738 pmaddubsw m1, m8 2739 pmaddubsw m2, m9 2740 paddw m3, m1 2741 paddw m3, m2 2742 pmulhrsw m3, m13 2743 jmp .w16_filter_left_top16 2744.w16_filter_left_h16: 2745 mov r5d, 10 2746 cmp hd, 16 2747 cmovs r5d, hd 2748 xor r5d, 15 ; h == 16 ? 5 : 15 - h 2749 movd xm0, r5d 2750 vpbroadcastb m0, xm0 2751.w16_filter_left_top16: 2752 movu xm2, [tlq-15] 2753 vinserti128 m2, [tlq-21], 1 2754 vbroadcasti128 m1, [base+z_filter_s+12] 2755 vbroadcasti128 m4, [base+z_filter_s+16] 2756 vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab 2757 vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd 2758 vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef 2759 pmaxub m0, m5 2760 pshufb m0, m2, m0 2761 pmaddubsw m0, m7 2762 pshufb m1, m2, m1 2763 pmaddubsw m1, m8 2764 pshufb m2, m4 2765 pmaddubsw m2, m9 2766 psubb m6, [base+pb_32to1] 2767 paddw m1, m0 2768 paddw m1, m2 2769 pmulhrsw m1, m13 2770 packuswb m3, m1 2771 vpermq m3, m3, q1320 2772 vpblendvb m3, [tlq-32], m6 2773 mova [rsp+32], m3 2774.w16_main: 2775 movd xm1, dyd 2776 vbroadcasti128 m10, [base+z_filter_s+2] 2777 movd xm7, dxd 2778 vbroadcasti128 m11, [base+z2_shuf_h2] 2779 vpbroadcastw m1, xm1 2780 vpbroadcastw m7, xm7 2781 mov r7, dstq 2782 pmullw m0, m1, [base+z2_ymul] 2783 psllw xm1, 4 2784 paddw m6, m7, [base+z2_base_inc] 2785 lea r9d, [dxq+(65<<6)] ; xpos 2786 movd [rsp+156], xm1 2787.w16_loop0: 2788 mov r2d, r9d 2789 mova [rsp+160], m0 2790 lea r5, [rsp+60] ; left-3 2791 mova [rsp+192], m6 2792 pxor m1, 
m1 2793 psraw m2, m0, 6 2794 pand m0, m14 2795 psubw m9, m1, m2 ; base_y 2796 psubw m12, m15, m0 2797 punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 2798 psllw m0, 8 2799 punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 2800 por m12, m0 ; 64-frac_y, frac_y 2801.w16_loop: 2802 lea r3d, [r2+dxq] 2803 shr r2d, 6 ; base_x0 2804 movu xm0, [rsp+r2] 2805 vinserti128 m0, [rsp+r2+8], 1 2806 lea r2d, [r3+dxq] 2807 shr r3d, 6 ; base_x1 2808 movu xm1, [rsp+r3] 2809 vinserti128 m1, [rsp+r3+8], 1 2810 pand m2, m14, m6 2811 paddsw m5, m6, m7 2812 psubw m3, m15, m2 2813 psllw m2, 8 2814 pshufb m0, m10 2815 por m2, m3 2816 pmaddubsw m0, m2 2817 pand m2, m14, m5 2818 psubw m3, m15, m2 2819 psllw m2, 8 2820 pshufb m1, m10 2821 por m2, m3 2822 pmaddubsw m1, m2 2823 cmp r3d, 64 2824 jge .w16_toponly 2825 punpckhwd m2, m5, m5 ; mask out unnecessary loads 2826 vpgatherdd m4, [r5+m9], m2 2827 punpcklwd m2, m5, m5 2828 vpgatherdd m3, [r5+m8], m2 2829 pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 2830 pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 2831 punpcklqdq m2, m3, m4 ; y0 2832 punpckhqdq m3, m4 ; y1 2833 pmaddubsw m2, m12 2834 pmaddubsw m3, m12 2835 psraw m6, 15 ; base_x < topleft 2836 vpblendvb m0, m2, m6 2837 psraw m6, m5, 15 2838 vpblendvb m1, m3, m6 2839.w16_toponly: 2840 pmulhrsw m0, m13 2841 pmulhrsw m1, m13 2842 paddw m6, m5, m7 ; xpos += dx 2843 sub r5, 2 2844 packuswb m0, m1 2845 vpermq m0, m0, q3120 2846 mova [dstq+strideq*0], xm0 2847 vextracti128 [dstq+strideq*1], m0, 1 2848 sub hd, 2 2849 jz .w16_end 2850 lea dstq, [dstq+strideq*2] 2851 cmp r2d, (63-16)<<6 2852 jge .w16_loop 2853.w16_leftonly_loop: 2854 mova m0, m7 2855 vpgatherdd m4, [r5+m9], m7 2856 mova m7, m0 2857 vpgatherdd m3, [r5+m8], m0 2858 sub r5, 2 2859 pshufb m2, m4, m11 2860 pshufb m1, m3, m11 2861 punpcklqdq m0, m1, m2 2862 punpckhqdq m1, m2 2863 pmaddubsw m0, m12 2864 pmaddubsw m1, m12 2865 pmulhrsw m0, m13 2866 pmulhrsw m1, m13 2867 packuswb m0, m1 2868 
vpermq m0, m0, q3120 2869 mova [dstq+strideq*0], xm0 2870 vextracti128 [dstq+strideq*1], m0, 1 2871 lea dstq, [dstq+strideq*2] 2872 sub hd, 2 2873 jg .w16_leftonly_loop 2874.w16_end: 2875 sub r8d, 1<<8 2876 jl .w16_ret 2877 vpbroadcastd m0, [rsp+156] 2878 paddw m0, [rsp+160] ; base_y += 16*dy 2879 paddw m6, m13, [rsp+192] 2880 add r7, 16 2881 add r9d, 16<<6 2882 movzx hd, r8b 2883 mov dstq, r7 2884 paddw m6, m13 ; base_x += 16*64 2885 jmp .w16_loop0 2886.w16_ret: 2887 RET 2888.w32: 2889 mova m2, [tlq+32] 2890 lea r8d, [hq+(1<<8)] 2891 mova [rsp+96], m2 2892 test angled, 0x400 2893 jnz .w16_main 2894 vpbroadcastd m7, [base+z_filter_k+4*2+12*0] 2895 vpbroadcastd m8, [base+z_filter_k+4*2+12*1] 2896 vpbroadcastd m9, [base+z_filter_k+4*2+12*2] 2897 mova xm5, [base+z_filter_s] 2898 vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc 2899 vinserti128 m1, [tlq+11], 1 2900 movu xm6, [base+z_filter_s+12] 2901 vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff 2902 movu xm3, [tlq+ 6] 2903 vinserti128 m3, [tlq+17], 1 2904 vpbroadcastd m10, r6m ; max_width 2905 packssdw m10, m10 2906 packsswb m10, m10 2907.w32_filter_above: 2908 pshufb m0, m1, m5 2909 shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de 2910 pmaddubsw m0, m7 2911 pshufb m2, m1, m4 2912 shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff 2913 pmaddubsw m2, m8 2914 pshufb m1, m5 2915 pmaddubsw m1, m9 2916 paddw m0, m2 2917 paddw m0, m1 2918 pshufb m1, m3, m4 2919 pmaddubsw m1, m7 2920 pshufb m2, m3, m5 2921 pmaddubsw m2, m8 2922 pshufb m3, m6 2923 pmaddubsw m3, m9 2924 paddw m1, m2 2925 paddw m1, m3 2926 pmulhrsw m0, m13 2927 pmulhrsw m1, m13 2928 psubb m10, [base+pb_1to32] 2929 packuswb m0, m1 2930 vpblendvb m0, [tlq+1], m10 2931 movu [rsp+65], m0 2932 jmp .w16_filter_left 2933.w64: 2934 mova m2, [tlq+32] 2935 mov r3d, [tlq+64] 2936 lea r8d, [hq+(3<<8)] 2937 mova [rsp+ 96], m2 2938 mov [rsp+128], 
r3d 2939 test angled, 0x400 2940 jnz .w16_main 2941 vpbroadcastd m7, [base+z_filter_k+4*2+12*0] 2942 vpbroadcastd m8, [base+z_filter_k+4*2+12*1] 2943 vpbroadcastd m9, [base+z_filter_k+4*2+12*2] 2944 movu xm6, [base+z_filter_s+ 4] 2945 vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc 2946 movu xm3, [tlq+30] 2947 vinserti128 m3, [tlq+43], 1 2948 movu xm5, [base+z_filter_s+16] 2949 vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff 2950 pshufb m0, m3, m6 2951 shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de 2952 pmaddubsw m0, m7 2953 pshufb m2, m3, m4 2954 shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff 2955 pmaddubsw m2, m8 2956 pshufb m3, m6 2957 pmaddubsw m3, m9 2958 paddw m0, m2 2959 paddw m0, m3 2960 movu xm2, [tlq+36] 2961 vinserti128 m2, [tlq+49], 1 2962 vpbroadcastd m10, r6m ; max_width 2963 pshufb m4, m2, m4 2964 pmaddubsw m4, m7 2965 pshufb m3, m2, m6 2966 pmaddubsw m3, m8 2967 pshufb m2, m5 2968 pmaddubsw m2, m9 2969 packssdw m10, m10 2970 paddw m3, m4 2971 paddw m2, m3 2972 vpbroadcastd m3, [base+pb_32] 2973 pmulhrsw m0, m13 2974 pmulhrsw m2, m13 2975 packsswb m10, m10 2976 mova xm5, [base+z_filter_s] 2977 vinserti128 m5, [base+z_filter_s+6], 1 2978 psubb m3, m10, m3 2979 psubb m3, [base+pb_1to32] 2980 vinserti128 m1, [tlq+13], 1 2981 packuswb m0, m2 2982 vpblendvb m0, [tlq+33], m3 2983 movu xm3, [tlq+ 6] 2984 vinserti128 m3, [tlq+19], 1 2985 movu [rsp+97], m0 2986 jmp .w32_filter_above 2987 2988cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase 2989 lea r6, [ipred_z3_avx2_table] 2990 tzcnt hd, hm 2991 movifnidn angled, anglem 2992 lea r7, [dr_intra_derivative+45*2-1] 2993 dec tlq 2994 movsxd hq, [r6+hq*4] 2995 sub angled, 180 2996 add hq, r6 2997 mov dyd, angled 2998 neg dyd 2999 xor angled, 0x400 3000 or dyq, ~0x7e 3001 movzx dyd, word [r7+dyq] 3002 vpbroadcastd m3, [pw_512] 3003 vpbroadcastd m4, 
[pw_62] 3004 vpbroadcastd m5, [pw_64] 3005 mov org_wd, wd 3006 jmp hq 3007.h4: 3008 lea r7, [strideq*3] 3009 cmp angleb, 40 3010 jae .h4_no_upsample 3011 lea r4d, [angleq-1024] 3012 sar r4d, 7 3013 add r4d, wd 3014 jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) 3015 ALLOC_STACK -32, 9 3016 movu xm8, [tlq-7] 3017 pshufb xm0, xm8, [z_upsample1-4] 3018 vpbroadcastb xm2, xm8 3019 pshufb xm1, xm8, [z_filter_s+2] 3020 mova [rsp+16], xm2 ; top[max_base_y] 3021 vpbroadcastd xm2, [pb_36_m4] 3022 add dyd, dyd 3023 pmaddubsw xm0, xm2 3024 pmaddubsw xm1, xm2 3025 movd xm7, dyd 3026 mov r2d, dyd 3027 vpbroadcastw m7, xm7 3028 paddw xm1, xm0 3029 pmulhrsw xm1, xm3 3030 pslldq m6, m7, 8 3031 paddw xm2, xm7, xm7 3032 paddw m6, m7 3033 packuswb xm1, xm1 3034 paddw m6, m2 3035 punpcklbw xm1, xm8 3036 mova xm8, [z_transpose4] 3037 psllw m7, 2 3038 pshufb xm1, [pb_15to0] 3039 mova [rsp], xm1 3040.h4_upsample_loop: 3041 lea r4d, [r2+dyq] 3042 shr r2d, 6 3043 vpbroadcastq m1, [rsp+r2] 3044 lea r2d, [r4+dyq] 3045 shr r4d, 6 3046 vpbroadcastq m2, [rsp+r4] 3047 lea r4d, [r2+dyq] 3048 shr r2d, 6 3049 movq xm0, [rsp+r2] 3050 lea r2d, [r4+dyq] 3051 shr r4d, 6 3052 movhps xm0, [rsp+r4] 3053 vpblendd m1, m2, 0xc0 3054 pand m2, m4, m6 3055 vpblendd m0, m1, 0xf0 3056 psubw m1, m5, m2 3057 psllw m2, 8 3058 por m1, m2 3059 pmaddubsw m0, m1 3060 paddw m6, m7 3061 pmulhrsw m0, m3 3062 vextracti128 xm1, m0, 1 3063 packuswb xm1, xm0 3064 pshufb xm1, xm8 3065 movd [dstq+strideq*0], xm1 3066 pextrd [dstq+strideq*1], xm1, 1 3067 pextrd [dstq+strideq*2], xm1, 2 3068 pextrd [dstq+r7 ], xm1, 3 3069 add dstq, 4 3070 sub wd, 4 3071 jg .h4_upsample_loop 3072 RET 3073ALIGN function_align 3074.filter_strength: ; h4/h8/h16 3075%define base r4-z_filter_t0 3076 lea r4, [z_filter_t0] 3077 movd xm0, maxbased 3078 movd xm2, angled 3079 shr angled, 8 ; is_sm << 1 3080 vpbroadcastb m0, xm0 3081 vpbroadcastb m2, xm2 3082 pcmpeqb m1, m0, [base+z_filter_wh] 3083 pand m1, m2 3084 mova xm2, 
[r4+angleq*8] 3085 pcmpgtb m1, m2 3086 pmovmskb r5d, m1 3087 ret 3088.h4_no_upsample: 3089 ALLOC_STACK -16, 12 3090 mov maxbased, 7 3091 test angled, 0x400 ; !enable_intra_edge_filter 3092 jnz .h4_main 3093 lea maxbased, [wq+3] 3094 call .filter_strength 3095 mov maxbased, 7 3096 test r5d, r5d 3097 jz .h4_main ; filter_strength == 0 3098 popcnt r5d, r5d 3099 vpbroadcastd m7, [base+pb_7] 3100 vbroadcasti128 m2, [tlq-14] 3101 pmaxub m1, m7, [base+z_filter_s-4] 3102 vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] 3103 pmaxub m7, [base+z_filter_s+4] 3104 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] 3105 vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] 3106 pshufb m0, m2, m1 3107 shufps m1, m7, q2121 3108 pmaddubsw m0, m8 3109 pshufb m1, m2, m1 3110 pmaddubsw m1, m9 3111 pshufb m2, m7 3112 pmaddubsw m2, m10 3113 paddw m0, m1 3114 paddw m0, m2 3115 pmulhrsw m0, m3 3116 mov r4d, 9 3117 lea tlq, [rsp+15] 3118 cmp wd, 4 3119 cmovne maxbased, r4d 3120 vextracti128 xm1, m0, 1 3121 packuswb xm0, xm1 3122 mova [rsp], xm0 3123.h4_main: 3124 movd xm6, dyd 3125 vpbroadcastq m0, [z_base_inc] ; base_inc << 6 3126 mov r4, tlq 3127 sub tlq, 4 3128 neg dyq 3129 vpbroadcastw m6, xm6 3130 sub r4, maxbaseq 3131 shl maxbased, 6 3132 vpbroadcastb m7, [r4] 3133 lea r4, [dyq+63] ; ypos 3134 movd xm9, maxbased 3135 not maxbased 3136 vbroadcasti128 m8, [z3_shuf_w4] 3137 add maxbased, 64 3138 vpbroadcastw m9, xm9 3139 psrlw m7, 8 ; top[max_base_y] 3140 paddw m10, m6, m6 3141 psubw m9, m0 ; max_base_y 3142 vpblendd m6, m10, 0xcc 3143 mova xm0, xm10 3144 paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 3145 paddw m10, m10 3146 mova xm11, [z_transpose4] 3147.h4_loop: 3148 lea r5, [r4+dyq] 3149 sar r4, 6 ; base0 3150 vpbroadcastq m1, [tlq+r4] 3151 lea r4, [r5+dyq] 3152 sar r5, 6 ; base1 3153 vpbroadcastq m2, [tlq+r5] 3154 lea r5, [r4+dyq] 3155 sar r4, 6 ; base2 3156 movq xm0, [tlq+r4] 3157 lea r4, [r5+dyq] 3158 sar r5, 6 ; base3 3159 movhps xm0, [tlq+r5] 3160 vpblendd m1, m2, 0xc0 3161 pand m2, m4, m6 ; frac 
3162 vpblendd m0, m1, 0xf0 3163 psubw m1, m5, m2 ; 64-frac 3164 psllw m2, 8 3165 pshufb m0, m8 3166 por m1, m2 ; 64-frac, frac 3167 pmaddubsw m0, m1 3168 pcmpgtw m1, m9, m6 ; base < max_base_y 3169 pmulhrsw m0, m3 3170 paddw m6, m10 ; ypos += dy 3171 vpblendvb m0, m7, m0, m1 3172 vextracti128 xm1, m0, 1 3173 packuswb xm1, xm0 3174 pshufb xm1, xm11 ; transpose 3175 movd [dstq+strideq*0], xm1 3176 pextrd [dstq+strideq*1], xm1, 1 3177 pextrd [dstq+strideq*2], xm1, 2 3178 pextrd [dstq+r7 ], xm1, 3 3179 sub wd, 4 3180 jz .h4_end 3181 add dstq, 4 3182 cmp r4d, maxbased 3183 jg .h4_loop 3184 packuswb xm7, xm7 3185.h4_end_loop: 3186 movd [dstq+strideq*0], xm7 3187 movd [dstq+strideq*1], xm7 3188 movd [dstq+strideq*2], xm7 3189 movd [dstq+r7 ], xm7 3190 add dstq, 4 3191 sub wd, 4 3192 jg .h4_end_loop 3193.h4_end: 3194 RET 3195ALIGN function_align 3196.h8: 3197 lea r4d, [angleq+216] 3198 mov r4b, wb 3199 cmp r4d, 8 3200 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 3201 ALLOC_STACK -32, 8 3202 and r4d, 4 3203 mova xm0, [tlq-15] 3204 vinserti128 m0, [tlq- 9], 1 3205 movd xm1, r4d 3206 movu xm2, [z_filter_s+2] 3207 vinserti128 m2, [z_filter_s+6], 1 3208 vpbroadcastb xm1, xm1 ; w & 4 3209 vpbroadcastd m7, [pb_36_m4] 3210 pmaxub xm1, [z_upsample1-4] ; clip 4x8 3211 vinserti128 m1, [z_upsample1], 1 3212 add dyd, dyd 3213 pshufb m1, m0, m1 3214 pshufb m2, m0, m2 3215 vinserti128 m0, [tlq-7], 1 3216 movd xm6, dyd 3217 pmaddubsw m1, m7 3218 pmaddubsw m2, m7 3219 vpbroadcastw m6, xm6 3220 mov r2d, dyd 3221 lea r5, [strideq*3] 3222 paddw m7, m6, m6 3223 paddw m1, m2 3224 vpblendd m6, m7, 0xf0 3225 pmulhrsw m1, m3 3226 pslldq m2, m7, 8 3227 paddw m7, m7 3228 paddw m6, m2 3229 vbroadcasti128 m2, [pb_15to0] 3230 packuswb m1, m1 3231 punpcklbw m1, m0 3232 pshufb m1, m2 3233 vextracti128 [rsp+ 0], m1, 1 3234 mova [rsp+16], xm1 3235.h8_upsample_loop: 3236 lea r4d, [r2+dyq] 3237 shr r2d, 6 ; base0 3238 movu xm0, [rsp+r2] 3239 lea r2d, [r4+dyq] 3240 shr r4d, 6 ; 
base1 3241 vinserti128 m0, [rsp+r4], 1 3242 lea r4d, [r2+dyq] 3243 shr r2d, 6 ; base2 3244 pand m1, m4, m6 3245 psubw m2, m5, m1 3246 psllw m1, 8 3247 por m2, m1 3248 punpcklqdq m1, m2, m2 ; frac0 frac1 3249 pmaddubsw m0, m1 3250 movu xm1, [rsp+r2] 3251 lea r2d, [r4+dyq] 3252 shr r4d, 6 ; base3 3253 vinserti128 m1, [rsp+r4], 1 3254 punpckhqdq m2, m2 ; frac2 frac3 3255 pmaddubsw m1, m2 3256 pmulhrsw m0, m3 3257 paddw m6, m7 3258 pmulhrsw m1, m3 3259 lea r4, [dstq+strideq*4] 3260 psllw m1, 8 3261 por m0, m1 3262 vextracti128 xm1, m0, 1 3263 punpcklbw xm2, xm0, xm1 3264 punpckhbw xm0, xm1 3265 movd [dstq+strideq*0], xm2 3266 pextrd [dstq+strideq*1], xm2, 1 3267 pextrd [dstq+strideq*2], xm2, 2 3268 pextrd [dstq+r5 ], xm2, 3 3269 movd [r4 +strideq*0], xm0 3270 pextrd [r4 +strideq*1], xm0, 1 3271 pextrd [r4 +strideq*2], xm0, 2 3272 pextrd [r4 +r5 ], xm0, 3 3273 add dstq, 4 3274 sub wd, 4 3275 jg .h8_upsample_loop 3276 RET 3277.h8_no_intra_edge_filter: 3278 and maxbased, 7 3279 or maxbased, 8 ; imin(w+7, 15) 3280 jmp .h8_main 3281.h8_no_upsample: 3282 ALLOC_STACK -32, 10 3283 lea maxbased, [wq+7] 3284 test angled, 0x400 3285 jnz .h8_no_intra_edge_filter 3286 call .filter_strength 3287 test r5d, r5d 3288 jz .h8_main ; filter_strength == 0 3289 popcnt r5d, r5d 3290 vpbroadcastd xm6, [base+pb_15] 3291 pcmpeqb xm1, xm1 3292 psubusb xm6, xm0 3293 psubb xm6, xm1 ; w == 4 ? 
5 : 1 3294 movu xm2, [tlq-16] 3295 pmaxub xm1, xm6, [base+z_filter_s] 3296 vinserti128 m2, [tlq-14], 1 3297 vinserti128 m1, [base+z_filter_s+12], 1 3298 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] 3299 pmaxub xm6, [base+z_filter_s+ 8] 3300 vinserti128 m6, [base+z_filter_s+20], 1 3301 pshufb m0, m2, m1 3302 pmaddubsw m0, m7 3303 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] 3304 movzx r4d, byte [tlq-15] 3305 shufps m1, m6, q2121 3306 pshufb m1, m2, m1 3307 pmaddubsw m1, m7 3308 paddw m0, m1 3309 sub r5d, 3 3310 jnz .h8_3tap 3311 vpbroadcastd m7, [z_filter_k+4*8] 3312 movzx r2d, byte [tlq-14] 3313 pshufb m2, m6 3314 pmaddubsw m2, m7 3315 sub r2d, r4d 3316 lea r2d, [r2+r4*8+4] 3317 shr r2d, 3 3318 mov [rsp+15], r2b 3319 paddw m0, m2 3320.h8_3tap: 3321 pmulhrsw m0, m3 3322 sar r5d, 1 3323 lea tlq, [rsp+31] 3324 add r5d, 17 3325 cmp wd, 16 3326 cmovns maxbased, r5d 3327 neg r5 3328 mov [tlq+r5], r4b 3329 vextracti128 xm1, m0, 1 3330 packuswb xm0, xm1 3331 mova [tlq-15], xm0 3332.h8_main: 3333 movd xm2, dyd 3334 vbroadcasti128 m0, [z_base_inc] 3335 mov r4, tlq 3336 sub tlq, 8 3337 neg dyq 3338 vpbroadcastw m2, xm2 3339 sub r4, maxbaseq 3340 shl maxbased, 6 3341 vpbroadcastb m7, [r4] 3342 lea r4, [dyq+63] 3343 movd xm9, maxbased 3344 not maxbased 3345 vbroadcasti128 m8, [z3_shuf] 3346 add maxbased, 64 3347 vpbroadcastw m9, xm9 3348 psrlw m7, 8 3349 psubw m9, m0 3350 paddw m6, m2, m2 3351 vpblendd m2, m6, 0x0f 3352.h8_loop: 3353 lea r5, [r4+dyq] 3354 sar r4, 6 3355 pand m0, m4, m2 3356 psubw m1, m5, m0 3357 psllw m0, 8 3358 por m1, m0 3359 vbroadcasti128 m0, [tlq+r4] 3360 lea r4, [r5+dyq] 3361 sar r5, 6 3362 vinserti128 m0, [tlq+r5], 0 3363 sub rsp, 8*2 3364 pshufb m0, m8 3365 pmaddubsw m0, m1 3366 pcmpgtw m1, m9, m2 3367 paddw m2, m6 3368 pmulhrsw m0, m3 3369 vpblendvb m0, m7, m0, m1 3370 vextracti128 xm1, m0, 1 3371 psllw xm0, 8 3372 por xm0, xm1 ; interleave rows (partial transpose) 3373 mova [rsp], xm0 3374 sub wd, 2 3375 jz .h8_transpose 3376 cmp r4d, maxbased 
3377 jg .h8_loop 3378 packuswb xm0, xm7, xm7 3379.h8_end_loop: 3380 sub rsp, 8*2 3381 mova [rsp], xm0 3382 sub wd, 2 3383 jg .h8_end_loop 3384.h8_transpose: 3385 mova xm2, [rsp+16*1] 3386 sub org_wd, 8 3387 lea r2, [strideq*3] 3388 lea r6, [dstq+org_wq] 3389 cmovns dstq, r6 3390 punpcklwd xm1, xm2, xm0 3391 punpckhwd xm2, xm0 3392 lea r6, [dstq+strideq*4] 3393 jge .h8_w8 3394 add rsp, 16*2 3395 movd [dstq+strideq*0], xm1 3396 pextrd [dstq+strideq*1], xm1, 1 3397 pextrd [dstq+strideq*2], xm1, 2 3398 pextrd [dstq+r2 ], xm1, 3 3399 movd [r6 +strideq*0], xm2 3400 pextrd [r6 +strideq*1], xm2, 1 3401 pextrd [r6 +strideq*2], xm2, 2 3402 pextrd [r6 +r2 ], xm2, 3 3403 jmp .h8_end 3404.h8_w8_loop: 3405 mova xm0, [rsp+16*0] 3406 mova xm2, [rsp+16*1] 3407 punpcklwd xm1, xm2, xm0 3408 punpckhwd xm2, xm0 3409.h8_w8: ; w8/w16/w32 3410 mova xm0, [rsp+16*2] 3411 mova xm4, [rsp+16*3] 3412 add rsp, 16*4 3413 punpcklwd xm3, xm4, xm0 3414 punpckhwd xm4, xm0 3415 punpckldq xm0, xm3, xm1 3416 punpckhdq xm3, xm1 3417 punpckldq xm1, xm4, xm2 3418 punpckhdq xm4, xm2 3419 movq [dstq+strideq*0], xm0 3420 movhps [dstq+strideq*1], xm0 3421 movq [dstq+strideq*2], xm3 3422 movhps [dstq+r2 ], xm3 3423 movq [r6 +strideq*0], xm1 3424 movhps [r6 +strideq*1], xm1 3425 movq [r6 +strideq*2], xm4 3426 movhps [r6 +r2 ], xm4 3427 sub dstq, 8 3428 sub r6, 8 3429 sub org_wd, 8 3430 jge .h8_w8_loop 3431.h8_end: 3432 RET 3433.h16_no_intra_edge_filter: 3434 and maxbased, 15 3435 or maxbased, 16 ; imin(w+15, 31) 3436 jmp .h16_main 3437ALIGN function_align 3438.h16: 3439 ALLOC_STACK -64, 12 3440 lea maxbased, [wq+15] 3441 test angled, 0x400 3442 jnz .h16_no_intra_edge_filter 3443 call .filter_strength 3444 test r5d, r5d 3445 jz .h16_main ; filter_strength == 0 3446 popcnt r5d, r5d 3447 vpbroadcastd m11, [base+pb_27] 3448 vpbroadcastd m1, [base+pb_1] 3449 vbroadcasti128 m6, [base+z_filter_s+12] 3450 vinserti128 m2, m6, [base+z_filter_s+4], 0 3451 vinserti128 m6, [base+z_filter_s+20], 1 3452 movu xm10, [tlq-18] 
3453 vinserti128 m10, [tlq-14], 1 3454 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] 3455 vbroadcasti128 m7, [base+z_filter_s+8] 3456 vinserti128 m8, m7, [base+z_filter_s+0], 0 3457 vinserti128 m7, [base+z_filter_s+16], 1 3458 psubusb m11, m0 3459 por m1, m11 3460 movu xm11, [tlq-32] 3461 vinserti128 m11, [tlq-28], 1 3462 pmaxub m8, m1 3463 pmaxub m7, m1 3464 pshufb m0, m10, m2 3465 shufps m2, m6, q2121 3466 pmaddubsw m0, m9 3467 pshufb m1, m11, m8 3468 shufps m8, m7, q2121 3469 pmaddubsw m1, m9 3470 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] 3471 movzx r4d, byte [tlq-31] 3472 pshufb m2, m10, m2 3473 pmaddubsw m2, m9 3474 pshufb m8, m11, m8 3475 pmaddubsw m8, m9 3476 paddw m0, m2 3477 paddw m1, m8 3478 sub r5d, 3 3479 jnz .h16_3tap 3480 vpbroadcastd m9, [z_filter_k+4*8] 3481 movzx r2d, byte [tlq-30] 3482 pshufb m10, m6 3483 pmaddubsw m10, m9 3484 pshufb m11, m7 3485 pmaddubsw m11, m9 3486 sub r2d, r4d 3487 lea r2d, [r2+r4*8+4] 3488 shr r2d, 3 3489 mov [rsp+31], r2b 3490 paddw m0, m10 3491 paddw m1, m11 3492.h16_3tap: 3493 pmulhrsw m0, m3 3494 pmulhrsw m1, m3 3495 sar r5d, 1 3496 lea tlq, [rsp+63] 3497 add r5d, 33 3498 cmp wd, 32 3499 cmovns maxbased, r5d 3500 neg r5 3501 mov [tlq+r5], r4b 3502 packuswb m0, m1 3503 vpermq m0, m0, q2031 3504 mova [tlq-31], m0 3505.h16_main: 3506 movd xm6, dyd 3507 vbroadcasti128 m0, [z_base_inc] 3508 mov r4, tlq 3509 sub tlq, 8 3510 neg dyq 3511 vpbroadcastw m6, xm6 3512 sub r4, maxbaseq 3513 shl maxbased, 6 3514 vpbroadcastb m7, [r4] 3515 lea r4, [dyq+63] 3516 movd xm9, maxbased 3517 not maxbased 3518 vbroadcasti128 m8, [z3_shuf] 3519 add maxbased, 64 3520 vpbroadcastw m9, xm9 3521 psubw m9, m0 3522 paddw m11, m6, m6 3523 psubw m10, m9, m3 ; 64*8 3524 vpblendd m6, m11, 0xf0 3525.h16_loop: 3526 lea r5, [r4+dyq] 3527 sar r4, 6 3528 pand m1, m4, m6 3529 psubw m2, m5, m1 3530 psllw m1, 8 3531 por m2, m1 3532 movu xm0, [tlq+r4-0] 3533 movu xm1, [tlq+r4-8] 3534 lea r4, [r5+dyq] 3535 sar r5, 6 3536 vinserti128 m0, [tlq+r5-0], 1 3537 
vinserti128 m1, [tlq+r5-8], 1 3538 sub rsp, 32 3539 pshufb m0, m8 3540 pshufb m1, m8 3541 pmaddubsw m0, m2 3542 pmaddubsw m1, m2 3543 pmulhrsw m0, m3 3544 pmulhrsw m1, m3 3545 packuswb m0, m1 3546 pcmpgtw m1, m9, m6 3547 pcmpgtw m2, m10, m6 3548 packsswb m1, m2 3549 paddw m6, m11 3550 vpblendvb m0, m7, m0, m1 3551 vpermq m0, m0, q3120 3552 mova [rsp], m0 3553 sub wd, 2 3554 jz .h16_transpose 3555 cmp r4d, maxbased 3556 jg .h16_loop 3557 mova m0, m7 3558.h16_end_loop: 3559 sub rsp, 32 3560 mova [rsp], m7 3561 sub wd, 2 3562 jg .h16_end_loop 3563.h16_transpose: 3564 mova m2, [rsp+32*1] 3565 sub org_wd, 8 3566 lea r2, [strideq*3] 3567 lea r6, [dstq+org_wq] 3568 cmovns dstq, r6 3569 punpcklbw m1, m2, m0 3570 punpckhbw m2, m0 3571 lea r3, [strideq*5] 3572 punpcklbw m0, m1, m2 3573 punpckhbw m1, m2 3574 lea r4, [strideq+r2*2] ; stride*7 3575 jge .h16_w8 3576 add rsp, 32*2 3577 movd [dstq+strideq*0], xm0 3578 pextrd [dstq+strideq*1], xm0, 1 3579 pextrd [dstq+strideq*2], xm0, 2 3580 pextrd [dstq+r2 ], xm0, 3 3581 vextracti128 xm0, m0, 1 3582 movd [dstq+strideq*4], xm1 3583 pextrd [dstq+r3 ], xm1, 1 3584 pextrd [dstq+r2*2 ], xm1, 2 3585 pextrd [dstq+r4 ], xm1, 3 3586 lea dstq, [dstq+strideq*8] 3587 vextracti128 xm1, m1, 1 3588 movd [dstq+strideq*0], xm0 3589 pextrd [dstq+strideq*1], xm0, 1 3590 pextrd [dstq+strideq*2], xm0, 2 3591 pextrd [dstq+r2 ], xm0, 3 3592 movd [dstq+strideq*4], xm1 3593 pextrd [dstq+r3 ], xm1, 1 3594 pextrd [dstq+r2*2 ], xm1, 2 3595 pextrd [dstq+r4 ], xm1, 3 3596 jmp .h16_end 3597.h16_w8_loop: 3598 mova m0, [rsp+32*0] 3599 mova m2, [rsp+32*1] 3600 punpcklbw m1, m2, m0 3601 punpckhbw m2, m0 3602 punpcklbw m0, m1, m2 3603 punpckhbw m1, m2 3604.h16_w8: 3605 mova m2, [rsp+32*2] 3606 mova m4, [rsp+32*3] 3607 lea r6, [dstq+strideq*8] 3608 add rsp, 32*4 3609 punpcklbw m3, m4, m2 3610 punpckhbw m4, m2 3611 punpcklbw m2, m3, m4 3612 punpckhbw m3, m4 3613 punpckldq m4, m2, m0 3614 punpckhdq m2, m0 3615 punpckldq m0, m3, m1 3616 punpckhdq m3, m1 3617 movq 
[dstq+strideq*0], xm4 3618 movhps [dstq+strideq*1], xm4 3619 vextracti128 xm4, m4, 1 3620 movq [dstq+strideq*2], xm2 3621 movhps [dstq+r2 ], xm2 3622 vextracti128 xm2, m2, 1 3623 movq [dstq+strideq*4], xm0 3624 movhps [dstq+r3 ], xm0 3625 vextracti128 xm0, m0, 1 3626 movq [dstq+r2*2 ], xm3 3627 movhps [dstq+r4 ], xm3 3628 vextracti128 xm3, m3, 1 3629 movq [r6+strideq*0], xm4 3630 movhps [r6+strideq*1], xm4 3631 movq [r6+strideq*2], xm2 3632 movhps [r6+r2 ], xm2 3633 movq [r6+strideq*4], xm0 3634 movhps [r6+r3 ], xm0 3635 movq [r6+r2*2 ], xm3 3636 movhps [r6+r4 ], xm3 3637 sub dstq, 8 3638 sub org_wd, 8 3639 jge .h16_w8_loop 3640.h16_end: 3641 RET 3642ALIGN function_align 3643.h32: 3644 ALLOC_STACK -96, 15 3645 lea maxbased, [wq+31] 3646 and maxbased, 31 3647 or maxbased, 32 ; imin(w+31, 63) 3648 test angled, 0x400 ; !enable_intra_edge_filter 3649 jnz .h32_main 3650 vbroadcasti128 m0, [pb_0to15] 3651 mov r4d, 21 3652 mov r5d, 3 3653 movu xm11, [tlq-66] ; 56-63 3654 vinserti128 m11, [tlq-52], 1 ; 40-47 3655 sub r4d, wd ; 21-w 3656 cmovns r5d, r4d 3657 movu xm12, [tlq-58] ; 48-55 3658 vinserti128 m12, [tlq-44], 1 ; 32-39 3659 sub r4d, 8 ; 13-w 3660 movd xm1, r5d 3661 movu xm13, [tlq-34] ; 24-31 3662 vinserti128 m13, [tlq-20], 1 ; 8-15 3663 movd xm2, r4d 3664 vpbroadcastb m1, xm1 3665 movu xm14, [tlq-28] ; 16-23 3666 vinserti128 m14, [tlq-14], 1 ; 0- 7 3667 vpbroadcastb m2, xm2 3668 pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 3669 movu m7, [z_filter_s+4] 3670 pshufb m11, m1 3671 vinserti128 m8, m7, [z_filter_s+8], 1 3672 vinserti128 m7, [z_filter_s+16], 0 3673 pmaxsb m2, m0 ; clip 8x32 3674 vpbroadcastd m9, [z_filter_k+4*2+12*0] 3675 pshufb m12, m2 3676 pshufb m0, m11, m8 3677 pmaddubsw m0, m9 3678 pshufb m2, m12, m8 3679 pmaddubsw m2, m9 3680 pshufb m1, m13, m8 3681 pmaddubsw m1, m9 3682 shufps m8, m7, q1021 3683 pshufb m6, m14, m8 3684 pmaddubsw m6, m9 3685 vpbroadcastd m9, [z_filter_k+4*2+12*1] 3686 pshufb m10, m11, m8 3687 pmaddubsw m10, m9 3688 paddw m0, m10 3689 
pshufb m10, m12, m8 3690 pmaddubsw m10, m9 3691 paddw m2, m10 3692 pshufb m10, m13, m8 3693 pmaddubsw m10, m9 3694 shufps m8, m7, q2121 3695 paddw m1, m10 3696 pshufb m10, m14, m8 3697 pmaddubsw m10, m9 3698 paddw m6, m10 3699 vpbroadcastd m9, [z_filter_k+4*2+12*2] 3700 pshufb m11, m8 3701 pmaddubsw m11, m9 3702 pshufb m12, m8 3703 pmaddubsw m12, m9 3704 movzx r4d, byte [tlq-63] 3705 movzx r2d, byte [tlq-62] 3706 paddw m0, m11 3707 paddw m2, m12 3708 pshufb m13, m8 3709 pmaddubsw m13, m9 3710 pshufb m14, m7 3711 pmaddubsw m14, m9 3712 paddw m1, m13 3713 paddw m6, m14 3714 sub r2d, r4d 3715 lea r2d, [r2+r4*8+4] ; edge case for 64x32 3716 pmulhrsw m0, m3 3717 pmulhrsw m2, m3 3718 pmulhrsw m1, m3 3719 pmulhrsw m6, m3 3720 shr r2d, 3 3721 mov [rsp+31], r2b 3722 lea tlq, [rsp+95] 3723 mov [tlq-65], r4b 3724 mov r4d, 65 3725 cmp wd, 64 3726 cmove maxbased, r4d 3727 packuswb m0, m2 3728 packuswb m1, m6 3729 mova [tlq-63], m0 3730 mova [tlq-31], m1 3731.h32_main: 3732 movd xm6, dyd 3733 mov r4, tlq 3734 sub tlq, 8 3735 neg dyq 3736 vpbroadcastw m6, xm6 3737 sub r4, maxbaseq 3738 shl maxbased, 6 3739 vpbroadcastb m7, [r4] 3740 lea r4, [dyq+63] 3741 movd xm9, maxbased 3742 not maxbased 3743 vbroadcasti128 m8, [z3_shuf] 3744 add maxbased, 64 3745 vpbroadcastw m9, xm9 3746 psubw m9, [z_base_inc] 3747 mova m11, m6 3748 psubw m10, m9, m3 ; 64*8 3749.h32_loop: 3750 mov r5, r4 3751 sar r5, 6 3752 pand m1, m4, m6 3753 psubw m2, m5, m1 3754 psllw m1, 8 3755 por m2, m1 3756 movu xm0, [tlq+r5- 0] 3757 vinserti128 m0, [tlq+r5-16], 1 3758 movu xm1, [tlq+r5- 8] 3759 vinserti128 m1, [tlq+r5-24], 1 3760 sub rsp, 32 3761 add r4, dyq 3762 pshufb m0, m8 3763 pshufb m1, m8 3764 pmaddubsw m0, m2 3765 pmaddubsw m1, m2 3766 pmulhrsw m0, m3 3767 pmulhrsw m1, m3 3768 packuswb m0, m1 3769 pcmpgtw m1, m9, m6 3770 pcmpgtw m2, m10, m6 3771 packsswb m1, m2 3772 paddw m6, m11 3773 vpblendvb m0, m7, m0, m1 3774 mova [rsp], m0 3775 dec wd 3776 jz .h32_transpose 3777 cmp r4d, maxbased 3778 jg .h32_loop 
3779.h32_end_loop: 3780 sub rsp, 32 3781 mova [rsp], m7 3782 dec wd 3783 jg .h32_end_loop 3784.h32_transpose: 3785 lea dstq, [dstq+org_wq-8] 3786 lea r2, [strideq*3] 3787 lea r3, [strideq*5] 3788 lea r4, [strideq+r2*2] ; stride*7 3789.h32_w8_loop: 3790 mova m7, [rsp+32*0] 3791 mova m6, [rsp+32*1] 3792 mova m5, [rsp+32*2] 3793 mova m4, [rsp+32*3] 3794 mova m3, [rsp+32*4] 3795 mova m2, [rsp+32*5] 3796 mova m1, [rsp+32*6] 3797 mova m0, [rsp+32*7] 3798 lea r6, [dstq+strideq*8] 3799 add rsp, 32*8 3800 punpcklbw m8, m0, m1 3801 punpckhbw m0, m1 3802 punpcklbw m1, m2, m3 3803 punpckhbw m2, m3 3804 punpcklbw m3, m4, m5 3805 punpckhbw m4, m5 3806 punpcklbw m5, m6, m7 3807 punpckhbw m6, m7 3808 punpcklwd m7, m8, m1 3809 punpckhwd m8, m1 3810 punpcklwd m1, m0, m2 3811 punpckhwd m0, m2 3812 punpcklwd m2, m3, m5 3813 punpckhwd m3, m5 3814 punpcklwd m5, m4, m6 3815 punpckhwd m4, m6 3816 punpckldq m6, m7, m2 3817 punpckhdq m7, m2 3818 punpckldq m2, m8, m3 3819 punpckhdq m8, m3 3820 punpckldq m3, m1, m5 3821 punpckhdq m1, m5 3822 punpckldq m5, m0, m4 3823 punpckhdq m0, m4 3824 movq [dstq+strideq*0], xm6 3825 movhps [dstq+strideq*1], xm6 3826 vextracti128 xm6, m6, 1 3827 movq [dstq+strideq*2], xm7 3828 movhps [dstq+r2 ], xm7 3829 vextracti128 xm7, m7, 1 3830 movq [dstq+strideq*4], xm2 3831 movhps [dstq+r3 ], xm2 3832 vextracti128 xm2, m2, 1 3833 movq [dstq+r2*2 ], xm8 3834 movhps [dstq+r4 ], xm8 3835 vextracti128 xm8, m8, 1 3836 movq [r6+strideq*0], xm3 3837 movhps [r6+strideq*1], xm3 3838 vextracti128 xm3, m3, 1 3839 movq [r6+strideq*2], xm1 3840 movhps [r6+r2 ], xm1 3841 vextracti128 xm1, m1, 1 3842 movq [r6+strideq*4], xm5 3843 movhps [r6+r3 ], xm5 3844 vextracti128 xm5, m5, 1 3845 movq [r6+r2*2 ], xm0 3846 movhps [r6+r4 ], xm0 3847 lea r6, [r6+strideq*8] 3848 vextracti128 xm0, m0, 1 3849 movq [r6+strideq*0], xm6 3850 movhps [r6+strideq*1], xm6 3851 movq [r6+strideq*2], xm7 3852 movhps [r6+r2 ], xm7 3853 movq [r6+strideq*4], xm2 3854 movhps [r6+r3 ], xm2 3855 movq [r6+r2*2 ], 
xm8 3856 movhps [r6+r4 ], xm8 3857 lea r6, [r6+strideq*8] 3858 movq [r6+strideq*0], xm3 3859 movhps [r6+strideq*1], xm3 3860 movq [r6+strideq*2], xm1 3861 movhps [r6+r2 ], xm1 3862 movq [r6+strideq*4], xm5 3863 movhps [r6+r3 ], xm5 3864 movq [r6+r2*2 ], xm0 3865 movhps [r6+r4 ], xm0 3866 sub dstq, 8 3867 sub org_wd, 8 3868 jg .h32_w8_loop 3869 RET 3870ALIGN function_align 3871.h64: 3872 ALLOC_STACK -128, 16 3873 lea maxbased, [wq+63] 3874 test angled, 0x400 ; !enable_intra_edge_filter 3875 jnz .h64_main 3876 mov r4d, 21 3877 vpbroadcastb xm11, [tlq-127] 3878 vpblendd xm11, [tlq-130], 0x0e ; 120-127 3879 sub r4d, wd ; 21-w 3880 mov r5d, 3 3881 vinserti128 m11, [tlq-116], 1 ; 104-111 3882 movu m7, [z_filter_s+4] 3883 cmp wd, 32 3884 cmove r4d, r5d 3885 vinserti128 m8, m7, [z_filter_s+8], 1 3886 vbroadcasti128 m6, [pb_0to15] 3887 movd xm1, r4d 3888 vpbroadcastd m9, [z_filter_k+4*2+12*0] 3889 movu xm12, [tlq-122] ; 112-119 3890 vinserti128 m12, [tlq-108], 1 ; 96-103 3891 vpbroadcastb m1, xm1 3892 movu xm13, [tlq- 98] ; 88- 95 3893 vinserti128 m13, [tlq- 84], 1 ; 72- 79 3894 movu xm14, [tlq- 90] ; 80- 87 3895 vinserti128 m14, [tlq- 76], 1 ; 64- 71 3896 vinserti128 m7, [z_filter_s+16], 0 3897 pshufb m0, m11, m8 3898 pmaddubsw m0, m9 3899 pshufb m2, m12, m8 3900 pmaddubsw m2, m9 3901 pmaxsb m1, m6 ; clip (16|32)x64 3902 pshufb m13, m1 3903 pshufb m1, m13, m8 3904 pmaddubsw m1, m9 3905 pshufb m6, m14, m8 3906 pmaddubsw m6, m9 3907 vpbroadcastd m9, [z_filter_k+4*2+12*1] 3908 shufps m15, m8, m7, q1021 3909 pshufb m10, m11, m15 3910 pmaddubsw m10, m9 3911 paddw m0, m10 3912 pshufb m10, m12, m15 3913 pmaddubsw m10, m9 3914 paddw m2, m10 3915 pshufb m10, m13, m15 3916 pmaddubsw m10, m9 3917 paddw m1, m10 3918 pshufb m10, m14, m15 3919 pmaddubsw m10, m9 3920 paddw m6, m10 3921 vpbroadcastd m9, [z_filter_k+4*2+12*2] 3922 shufps m10, m8, m7, q2132 3923 pshufb m11, m10 3924 pmaddubsw m11, m9 3925 pshufb m12, m10 3926 pmaddubsw m12, m9 3927 pshufb m13, m10 3928 pmaddubsw m13, m9 
3929 pshufb m14, m10 3930 pmaddubsw m14, m9 3931 paddw m0, m11 3932 paddw m2, m12 3933 paddw m1, m13 3934 paddw m6, m14 3935 movu xm11, [tlq-66] ; 56-63 3936 vinserti128 m11, [tlq-52], 1 ; 40-47 3937 movu xm12, [tlq-58] ; 48-55 3938 vinserti128 m12, [tlq-44], 1 ; 32-39 3939 movu xm13, [tlq-34] ; 24-31 3940 vinserti128 m13, [tlq-20], 1 ; 8-15 3941 movu xm14, [tlq-28] ; 16-23 3942 vinserti128 m14, [tlq-14], 1 ; 0- 7 3943 pmulhrsw m0, m3 3944 pmulhrsw m2, m3 3945 pmulhrsw m1, m3 3946 pmulhrsw m6, m3 3947 lea tlq, [rsp+127] 3948 packuswb m0, m2 3949 packuswb m1, m6 3950 mova [tlq-127], m0 3951 mova [tlq- 95], m1 3952 pshufb m0, m11, m10 3953 pmaddubsw m0, m9 3954 pshufb m2, m12, m10 3955 pmaddubsw m2, m9 3956 pshufb m1, m13, m10 3957 pmaddubsw m1, m9 3958 pshufb m6, m14, m7 3959 pmaddubsw m6, m9 3960 vpbroadcastd m9, [z_filter_k+4*2+12*1] 3961 pshufb m7, m11, m15 3962 pmaddubsw m7, m9 3963 paddw m0, m7 3964 pshufb m7, m12, m15 3965 pmaddubsw m7, m9 3966 paddw m2, m7 3967 pshufb m7, m13, m15 3968 pmaddubsw m7, m9 3969 paddw m1, m7 3970 pshufb m7, m14, m10 3971 pmaddubsw m7, m9 3972 paddw m6, m7 3973 vpbroadcastd m9, [z_filter_k+4*2+12*0] 3974 pshufb m11, m8 3975 pmaddubsw m11, m9 3976 pshufb m12, m8 3977 pmaddubsw m12, m9 3978 pshufb m13, m8 3979 pmaddubsw m13, m9 3980 pshufb m14, m15 3981 pmaddubsw m14, m9 3982 paddw m0, m11 3983 paddw m2, m12 3984 paddw m1, m13 3985 paddw m6, m14 3986 pmulhrsw m0, m3 3987 pmulhrsw m2, m3 3988 pmulhrsw m1, m3 3989 pmulhrsw m6, m3 3990 packuswb m0, m2 3991 packuswb m1, m6 3992 mova [tlq-63], m0 3993 mova [tlq-31], m1 3994.h64_main: 3995 movd xm12, dyd 3996 neg maxbaseq 3997 vbroadcasti128 m8, [z3_shuf] 3998 vpbroadcastb m7, [tlq+maxbaseq] 3999 shl maxbased, 6 4000 vpbroadcastw m12, xm12 4001 lea r5d, [dyq+maxbaseq-64] 4002 neg dyq 4003 or maxbased, 63 4004 lea r4, [dyq+63] 4005 movd xm6, r5d 4006 mova xm10, [pb_1to32+16] 4007 vinserti128 m10, [pb_1to32], 1 4008 vpbroadcastd m11, [pb_32] 4009 vpbroadcastw m6, xm6 4010.h64_loop: 4011 mov 
r5, r4 4012 sar r5, 6 4013 movu m0, [tlq+r5-24] 4014 movu m1, [tlq+r5-32] 4015 pand m2, m4, m6 4016 psubw m9, m5, m2 4017 psllw m2, 8 4018 por m9, m2 4019 pshufb m0, m8 4020 pshufb m1, m8 4021 pmaddubsw m0, m9 4022 pmaddubsw m1, m9 4023 psraw m2, m6, 6 4024 sub rsp, 64 4025 pmulhrsw m0, m3 4026 pmulhrsw m1, m3 4027 packsswb m2, m2 4028 paddb m2, m10 4029 packuswb m0, m1 4030 vpblendvb m0, m7, m0, m2 4031 mova [rsp+32], m0 4032 movu m0, [tlq+r5-56] 4033 movu m1, [tlq+r5-64] 4034 add r4, dyq 4035 pshufb m0, m8 4036 pshufb m1, m8 4037 pmaddubsw m0, m9 4038 pmaddubsw m1, m9 4039 paddb m2, m11 4040 pmulhrsw m0, m3 4041 pmulhrsw m1, m3 4042 paddw m6, m12 4043 packuswb m0, m1 4044 vpblendvb m0, m7, m0, m2 4045 mova [rsp], m0 4046 dec wd 4047 jz .h64_transpose 4048 cmp r4d, maxbased 4049 jg .h64_loop 4050.h64_end_loop: 4051 sub rsp, 64 4052 mova [rsp+32], m7 4053 mova [rsp+ 0], m7 4054 dec wd 4055 jg .h64_end_loop 4056.h64_transpose: 4057 lea r2, [strideq*3] 4058 lea r3, [strideq*5] 4059 imul r5, strideq, -8 4060 lea dstq, [dstq+org_wq-16] 4061 lea r4, [strideq+r2*2] ; stride*7 4062.h64_transpose_loop0: 4063 lea r6, [rsp+16*3] 4064.h64_transpose_loop: 4065 mova xm0, [r6+64*15] 4066 vinserti128 m0, [r6+64* 7], 1 4067 mova xm1, [r6+64*14] 4068 vinserti128 m1, [r6+64* 6], 1 4069 mova xm2, [r6+64*13] 4070 vinserti128 m2, [r6+64* 5], 1 4071 mova xm3, [r6+64*12] 4072 vinserti128 m3, [r6+64* 4], 1 4073 mova xm4, [r6+64*11] 4074 vinserti128 m4, [r6+64* 3], 1 4075 mova xm5, [r6+64*10] 4076 vinserti128 m5, [r6+64* 2], 1 4077 mova xm6, [r6+64* 9] 4078 vinserti128 m6, [r6+64* 1], 1 4079 mova xm7, [r6+64* 8] 4080 vinserti128 m7, [r6+64* 0], 1 4081 sub r6, 16 4082 punpcklbw m8, m0, m1 4083 punpckhbw m0, m1 4084 punpcklbw m1, m2, m3 4085 punpckhbw m2, m3 4086 punpcklbw m3, m4, m5 4087 punpckhbw m4, m5 4088 punpcklbw m5, m6, m7 4089 punpckhbw m6, m7 4090 punpcklwd m7, m8, m1 4091 punpckhwd m8, m1 4092 punpcklwd m1, m0, m2 4093 punpckhwd m0, m2 4094 punpcklwd m2, m3, m5 4095 punpckhwd m3, 
m5 ; last operand of the `punpckhwd m3,` that ends the previous physical line
    ; Remainder of the .h64 transpose/store loop. The routine begins before
    ; this span; the instructions below are reproduced unchanged.
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    punpckldq m6, m7, m2
    punpckhdq m7, m2
    punpckldq m2, m8, m3
    punpckhdq m8, m3
    punpckldq m3, m1, m5
    punpckhdq m1, m5
    punpckldq m5, m0, m4
    punpckhdq m0, m4
    vpermq m6, m6, q3120
    vpermq m7, m7, q3120
    vpermq m2, m2, q3120
    vpermq m8, m8, q3120
    vpermq m3, m3, q3120
    vpermq m1, m1, q3120
    vpermq m5, m5, q3120
    vpermq m0, m0, q3120
    mova [dstq+strideq*0], xm6
    vextracti128 [dstq+strideq*1], m6, 1
    mova [dstq+strideq*2], xm7
    vextracti128 [dstq+r2], m7, 1
    mova [dstq+strideq*4], xm2
    vextracti128 [dstq+r3], m2, 1
    mova [dstq+r2*2], xm8
    vextracti128 [dstq+r4], m8, 1
    sub dstq, r5
    mova [dstq+strideq*0], xm3
    vextracti128 [dstq+strideq*1], m3, 1
    mova [dstq+strideq*2], xm1
    vextracti128 [dstq+r2], m1, 1
    mova [dstq+strideq*4], xm5
    vextracti128 [dstq+r3], m5, 1
    mova [dstq+r2*2], xm0
    vextracti128 [dstq+r4], m0, 1
    sub dstq, r5
    cmp r6, rsp
    jae .h64_transpose_loop
    add rsp, 64*16
    lea dstq, [dstq+r5*8-16]
    sub org_wd, 16
    jg .h64_transpose_loop0
.h64_end:
    RET

; FILTER_XMM dst, src, tmp, shuf
;   Shuffles xm<src> in place with <shuf> (a register number, or any other
;   operand such as a memory reference when %4 is not numeric). Each of the
;   four dwords of the shuffled value is then broadcast and multiplied with
;   pmaddubsw against xm2..xm5 respectively; the products are summed together
;   with xm1, shifted right by 4 and packed to unsigned bytes in xm<dst>.
;   Expects xm1..xm5 to be preloaded by the caller.
;   Clobbers xm<src> and xm<tmp>.
%macro FILTER_XMM 4 ; dst, src, tmp, shuf
%ifnum %4
    pshufb xm%2, xm%4
%else
    pshufb xm%2, %4
%endif
    pshufd xm%1, xm%2, q0000 ; p0 p1
    pmaddubsw xm%1, xm2
    pshufd xm%3, xm%2, q1111 ; p2 p3
    pmaddubsw xm%3, xm3
    paddw xm%1, xm1
    paddw xm%1, xm%3
    pshufd xm%3, xm%2, q2222 ; p4 p5
    pmaddubsw xm%3, xm4
    paddw xm%1, xm%3
    pshufd xm%3, xm%2, q3333 ; p6 __
    pmaddubsw xm%3, xm5
    paddw xm%1, xm%3
    psraw xm%1, 4
    packuswb xm%1, xm%1
%endmacro

; FILTER_YMM dst, src, tmp, shuf
;   256-bit counterpart of FILTER_XMM (shuf is always a register number here).
;   After the >>4 the two 128-bit lanes of m<dst> are swapped into m<tmp>
;   (vperm2i128 imm 0x01) and packed against the unswapped value, so each lane
;   of the result holds its own packed bytes in the low half and the other
;   lane's packed bytes in the high half.
;   Expects m1..m5 to be preloaded; clobbers m<src> and m<tmp>.
%macro FILTER_YMM 4 ; dst, src, tmp, shuf
    pshufb m%2, m%4
    pshufd m%1, m%2, q0000
    pmaddubsw m%1, m2
    pshufd m%3, m%2, q1111
    pmaddubsw m%3, m3
    paddw m%1, m1
    paddw m%1, m%3
    pshufd m%3, m%2, q2222
    pmaddubsw m%3, m4
    paddw m%1, m%3
    pshufd m%3, m%2, q3333
    pmaddubsw m%3, m5
    paddw m%1, m%3
    psraw m%1, 4
    vperm2i128 m%3, m%1, m%1, 0x01
    packuswb m%1, m%3
%endmacro

; The ipred_filter SIMD processes 4x2 blocks in the following order which
; increases parallelism compared to doing things row by row. One redundant
; block is calculated for w8 and w16, two for w32.
;     w4     w8       w16             w32
;     1     1 2     1 2 3 5     1 2 3 5 b c d f
;     2     2 3     2 4 5 7     2 4 5 7 c e f h
;     3     3 4     4 6 7 9     4 6 7 9 e g h j
; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
;           5       8           8       i

; ipred_filter_8bpc(dst, stride, tl, w, h, filter)
;   Setup: `filter` is zero-extended from its low byte and scaled by 64 to
;   index filter_intra_taps; the four 16-byte rows found there are broadcast
;   into m2..m5 and pw_8 into m1 (the operands FILTER_XMM/FILTER_YMM expect).
;   xm0 receives the 8 bytes at tl-3. Control then jumps through
;   ipred_filter_avx2_table indexed by tzcnt(w) to .w4/.w8/.w16/.w32.
;   .w16_main and .main are internal subroutines entered with `call` and left
;   with a plain `ret`; .w32 reuses .w16_main for its left 16 columns.
cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
%define base r6-ipred_filter_avx2_table
    lea r6, [filter_intra_taps]
    tzcnt wd, wm
%ifidn filterd, filterm
    movzx filterd, filterb
%else
    movzx filterd, byte filterm
%endif
    shl filterd, 6
    WIN64_SPILL_XMM 9, 15
    add filterq, r6
    lea r6, [ipred_filter_avx2_table]
    movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
    movsxd wq, [r6+wq*4]
    vpbroadcastd m1, [base+pw_8]
    vbroadcasti128 m2, [filterq+16*0]
    vbroadcasti128 m3, [filterq+16*1]
    vbroadcasti128 m4, [filterq+16*2]
    vbroadcasti128 m5, [filterq+16*3]
    add wq, r6
    mov hd, hm
    jmp wq
.w4:
    mova xm8, [base+filter_shuf2]
    sub tlq, 3
    sub tlq, hq
    jmp .w4_loop_start
.w4_loop:
    ; xm6 still holds the previous FILTER_XMM result; dword 0 is replaced
    ; with 4 bytes loaded from [tlq+hq].
    pinsrd xm0, xm6, [tlq+hq], 0
    lea dstq, [dstq+strideq*2]
.w4_loop_start:
    FILTER_XMM 6, 0, 7, 8
    movd [dstq+strideq*0], xm6
    pextrd [dstq+strideq*1], xm6, 1
    sub hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    WIN64_PUSH_XMM 10
    mova m8, [base+filter_shuf1]
    FILTER_XMM 7, 0, 6, [base+filter_shuf2]
    vpbroadcastd m0, [tlq+4]
    vpbroadcastd m6, [tlq+5]
    sub tlq, 4
    sub tlq, hq
    vpbroadcastq m7, xm7
    vpblendd m7, m6, 0x20
.w8_loop:
    vpbroadcastd xm6, [tlq+hq]
    palignr m6, m0, 12
    vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    mova xm6, xm7
    call .main
    vpblendd xm6, xm7, 0x0c
    pshufd xm6, xm6, q3120
    movq [dstq+strideq*0], xm6
    movhps [dstq+strideq*1], xm6
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    sub hd, 2
    call .w16_main
%if WIN64
    jmp .end
%else
    RET
%endif
.w16_main:
    ; The spills are into the callers stack frame
    %assign stack_size stack_size + gprsize
    WIN64_PUSH_XMM 15, 9
    %assign stack_size stack_size - gprsize
    FILTER_XMM 12, 0, 7, [base+filter_shuf2]
    vpbroadcastd m0, [tlq+5]
    vpblendd m0, [tlq-12], 0x14
    mova m8, [base+filter_shuf1]
    vpbroadcastq m7, xm12
    vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                          ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    call .main ; c0 d0 a1 b1 a1 b1 c0 d0
    movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    vinserti128 m14, m8, [base+filter_shuf3], 0
    vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
    FILTER_XMM 6, 9, 10, 14
    vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
    vpbroadcastd m9, [tlq+13]
    vpbroadcastd m10, [tlq+12]
    psrld m11, m8, 4
    vpblendd m6, m9, 0x20 ; top
    sub tlq, 6
    sub tlq, hq
.w16_loop:
    vpbroadcastd xm9, [tlq+hq]
    palignr m9, m0, 12
    vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    mova xm13, xm7
    call .main ; e0 f0 c1 d1 c1 d1 e0 f0
    vpblendd m9, m12, m10, 0xf0
    vpblendd m12, m6, 0xc0
    pshufd m9, m9, q3333
    vpblendd m9, m6, 0xee
    vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
                               ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
    vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
    vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
    vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
    mova [dstq+strideq*0], xm9
    vextracti128 [dstq+strideq*1], m9, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16_loop
    vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
    pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
    vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
    shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
    shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
    mova [dstq+strideq*0], xm0
    mova [dstq+strideq*1], xm6
    ret
ALIGN function_align
.w32:
    sub hd, 2
    lea r3, [dstq+16]       ; r3 = dst of the right 16 columns
    lea r5d, [hq-2]         ; row counter for the right half (h is already h-2)
    call .w16_main          ; left 16 columns
    add tlq, r5
    mov dstq, r3
    lea r3, [strideq-4]
    lea r4, [r3+strideq*2]
    ; The right half takes its left-neighbour bytes from the pixels just
    ; written at [dstq-4 + n*stride] instead of from the tl buffer.
    movq xm0, [tlq+21]
    pinsrd xm0, [dstq-4], 2
    pinsrd xm0, [dstq+r3*1], 3
    FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0
    movq xm7, [dstq+r3*2]
    pinsrd xm7, [dstq+r4], 2
    palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
    vpbroadcastd m0, [tlq+28]
    vpbroadcastd m9, [tlq+29]
    vbroadcasti128 m8, [base+filter_shuf1+16]
    vpblendd m0, m9, 0x20
    vpblendd m0, m7, 0x0f
    vpbroadcastq m7, xm12
    vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    call .main ; c0 d0 a1 b1 a1 b1 c0 d0
    add r3, 2
    lea r4, [r4+strideq*2]
    movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
    FILTER_XMM 6, 9, 10, 14
    vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
    vpbroadcastd m9, [tlq+37]
    vpbroadcastd m10, [tlq+36]
    vpblendd m6, m9, 0x20 ; top
.w32_loop:
    movq xm9, [dstq+r3*4]
    pinsrd xm9, [dstq+r4], 2
.w32_loop_last:
    ; Entered directly (skipping the two loads above) for the final pass,
    ; when the r5d counter reaches exactly zero.
    palignr m9, m0, 12
    vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    mova xm13, xm7 ; c0 d0
    call .main ; e0 f0 c1 d1 c1 d1 e0 f0
    vpblendd m9, m12, m10, 0xf0
    vpblendd m12, m6, 0xc0
    pshufd m9, m9, q3333
    vpblendd m9, m6, 0xee
    vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
                               ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
    vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
    vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
    vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
    mova [dstq+strideq*0], xm9
    vextracti128 [dstq+strideq*1], m9, 1
    lea dstq, [dstq+strideq*2]
    sub r5d, 2
    jg .w32_loop
    jz .w32_loop_last
    vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
    pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
    vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
    shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
    shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
    mova [dstq+strideq*0], xm0
    mova [dstq+strideq*1], xm6
.end:
    RET
ALIGN function_align
.main:
    ; Shared 256-bit filter step: m7 = filtered(m0) using shuffle m8; clobbers m9.
    FILTER_YMM 7, 0, 9, 8
    ret

%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

; IPRED_CFL reg
;   In:  m<reg> = 16-bit ac coefficients; m0, m1 and m2 preloaded by the
;        caller (the .s* labels of ipred_cfl_8bpc load m1 = alpha and
;        m2 = |alpha| << 9, with m0 holding the broadcast dc value).
;   Out: m<reg> = m0 + sign(ac, m1) * pmulhrsw(|ac|, m2), still 16-bit;
;        the caller packs to bytes.
;   Clobbers m3.
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw m3, m%1, m1
    pabsw m%1, m%1
    pmulhrsw m%1, m2
    psignw m%1, m3
    paddw m%1, m0
%endmacro

; ipred_cfl_top_8bpc(dst, stride, tl, w, h, ac, alpha)
;   Loads 32 bytes starting at tl+1 and multiplies them by all-ones (-1)
;   bytes with pmaddubsw, which yields negated sums of adjacent byte pairs.
;   xm3 = 0x8000 >> log2(w) is the pmulhrsw factor consumed at .h4 of
;   ipred_cfl_left_8bpc. r6 is set to the ipred_cfl_left_avx2_table entry
;   for log2(w) and wq to the ipred_cfl_splat_avx2_table entry for log2(w);
;   control leaves through `jmp r6`.
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    lea t0, [ipred_cfl_left_avx2_table]
    tzcnt wd, wm
    inc tlq
    movu m0, [tlq]
    movifnidn hd, hm
    mov r6d, 0x8000
    shrx r6d, r6d, wd
    movd xm3, r6d
    movsxd r6, [t0+wq*4]
    pcmpeqd m2, m2
    pmaddubsw m0, m2
    add r6, t0
    add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
    movsxd wq, [t0+wq*4]
    add wq, t0
    movifnidn acq, acmp
    jmp r6

; ipred_cfl_left_8bpc(dst, stride, tl, w, h, ac, alpha)
;   Same scheme as ipred_cfl_top_8bpc, but the 32 bytes are loaded from
;   tl-h and both the pmulhrsw factor and the .h* entry are selected by
;   log2(h).
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    mov hd, hm ; zero upper half
    tzcnt r6d, hd
    sub tlq, hq
    tzcnt wd, wm
    movu m0, [tlq]
    mov t0d, 0x8000
    shrx t0d, t0d, r6d
    movd xm3, t0d
    lea t0, [ipred_cfl_left_avx2_table]
    movsxd r6, [t0+r6*4]
    pcmpeqd m2, m2
pmaddubsw m0, m2 4439 add r6, t0 4440 add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table 4441 movsxd wq, [t0+wq*4] 4442 add wq, t0 4443 movifnidn acq, acmp 4444 jmp r6 4445.h32: 4446 vextracti128 xm1, m0, 1 4447 paddw xm0, xm1 4448.h16: 4449 punpckhqdq xm1, xm0, xm0 4450 paddw xm0, xm1 4451.h8: 4452 psrlq xm1, xm0, 32 4453 paddw xm0, xm1 4454.h4: 4455 pmaddwd xm0, xm2 4456 pmulhrsw xm0, xm3 4457 vpbroadcastw m0, xm0 4458 jmp wq 4459 4460cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha 4461 movifnidn hd, hm 4462 movifnidn wd, wm 4463 tzcnt r6d, hd 4464 lea t0d, [wq+hq] 4465 movd xm4, t0d 4466 tzcnt t0d, t0d 4467 movd xm5, t0d 4468 lea t0, [ipred_cfl_avx2_table] 4469 tzcnt wd, wd 4470 movsxd r6, [t0+r6*4] 4471 movsxd wq, [t0+wq*4+4*4] 4472 pcmpeqd m3, m3 4473 psrlw xm4, 1 4474 add r6, t0 4475 add wq, t0 4476 movifnidn acq, acmp 4477 jmp r6 4478.h4: 4479 movd xm0, [tlq-4] 4480 pmaddubsw xm0, xm3 4481 jmp wq 4482.w4: 4483 movd xm1, [tlq+1] 4484 pmaddubsw xm1, xm3 4485 psubw xm0, xm4 4486 paddw xm0, xm1 4487 pmaddwd xm0, xm3 4488 cmp hd, 4 4489 jg .w4_mul 4490 psrlw xm0, 3 4491 jmp .w4_end 4492.w4_mul: 4493 punpckhqdq xm1, xm0, xm0 4494 lea r2d, [hq*2] 4495 mov r6d, 0x55563334 4496 paddw xm0, xm1 4497 shrx r6d, r6d, r2d 4498 psrlq xm1, xm0, 32 4499 paddw xm0, xm1 4500 movd xm1, r6d 4501 psrlw xm0, 2 4502 pmulhuw xm0, xm1 4503.w4_end: 4504 vpbroadcastw m0, xm0 4505.s4: 4506 vpbroadcastw m1, alpham 4507 lea r6, [strideq*3] 4508 pabsw m2, m1 4509 psllw m2, 9 4510.s4_loop: 4511 mova m4, [acq] 4512 IPRED_CFL 4 4513 packuswb m4, m4 4514 vextracti128 xm5, m4, 1 4515 movd [dstq+strideq*0], xm4 4516 pextrd [dstq+strideq*1], xm4, 1 4517 movd [dstq+strideq*2], xm5 4518 pextrd [dstq+r6 ], xm5, 1 4519 lea dstq, [dstq+strideq*4] 4520 add acq, 32 4521 sub hd, 4 4522 jg .s4_loop 4523 RET 4524ALIGN function_align 4525.h8: 4526 movq xm0, [tlq-8] 4527 pmaddubsw xm0, xm3 4528 jmp wq 4529.w8: 4530 movq xm1, [tlq+1] 4531 vextracti128 xm2, m0, 1 4532 pmaddubsw xm1, 
xm3 4533 psubw xm0, xm4 4534 paddw xm0, xm2 4535 punpckhqdq xm2, xm0, xm0 4536 paddw xm0, xm2 4537 paddw xm0, xm1 4538 psrlq xm1, xm0, 32 4539 paddw xm0, xm1 4540 pmaddwd xm0, xm3 4541 psrlw xm0, xm5 4542 cmp hd, 8 4543 je .w8_end 4544 mov r6d, 0x5556 4545 mov r2d, 0x3334 4546 cmp hd, 32 4547 cmove r6d, r2d 4548 movd xm1, r6d 4549 pmulhuw xm0, xm1 4550.w8_end: 4551 vpbroadcastw m0, xm0 4552.s8: 4553 vpbroadcastw m1, alpham 4554 lea r6, [strideq*3] 4555 pabsw m2, m1 4556 psllw m2, 9 4557.s8_loop: 4558 mova m4, [acq] 4559 mova m5, [acq+32] 4560 IPRED_CFL 4 4561 IPRED_CFL 5 4562 packuswb m4, m5 4563 vextracti128 xm5, m4, 1 4564 movq [dstq+strideq*0], xm4 4565 movq [dstq+strideq*1], xm5 4566 movhps [dstq+strideq*2], xm4 4567 movhps [dstq+r6 ], xm5 4568 lea dstq, [dstq+strideq*4] 4569 add acq, 64 4570 sub hd, 4 4571 jg .s8_loop 4572 RET 4573ALIGN function_align 4574.h16: 4575 mova xm0, [tlq-16] 4576 pmaddubsw xm0, xm3 4577 jmp wq 4578.w16: 4579 movu xm1, [tlq+1] 4580 vextracti128 xm2, m0, 1 4581 pmaddubsw xm1, xm3 4582 psubw xm0, xm4 4583 paddw xm0, xm2 4584 paddw xm0, xm1 4585 punpckhqdq xm1, xm0, xm0 4586 paddw xm0, xm1 4587 psrlq xm1, xm0, 32 4588 paddw xm0, xm1 4589 pmaddwd xm0, xm3 4590 psrlw xm0, xm5 4591 cmp hd, 16 4592 je .w16_end 4593 mov r6d, 0x5556 4594 mov r2d, 0x3334 4595 test hb, 8|32 4596 cmovz r6d, r2d 4597 movd xm1, r6d 4598 pmulhuw xm0, xm1 4599.w16_end: 4600 vpbroadcastw m0, xm0 4601.s16: 4602 vpbroadcastw m1, alpham 4603 pabsw m2, m1 4604 psllw m2, 9 4605.s16_loop: 4606 mova m4, [acq] 4607 mova m5, [acq+32] 4608 IPRED_CFL 4 4609 IPRED_CFL 5 4610 packuswb m4, m5 4611 vpermq m4, m4, q3120 4612 mova [dstq+strideq*0], xm4 4613 vextracti128 [dstq+strideq*1], m4, 1 4614 lea dstq, [dstq+strideq*2] 4615 add acq, 64 4616 sub hd, 2 4617 jg .s16_loop 4618 RET 4619ALIGN function_align 4620.h32: 4621 mova m0, [tlq-32] 4622 pmaddubsw m0, m3 4623 jmp wq 4624.w32: 4625 movu m1, [tlq+1] 4626 pmaddubsw m1, m3 4627 paddw m0, m1 4628 vextracti128 xm1, m0, 1 4629 psubw 
xm0, xm4 4630 paddw xm0, xm1 4631 punpckhqdq xm1, xm0, xm0 4632 paddw xm0, xm1 4633 psrlq xm1, xm0, 32 4634 paddw xm0, xm1 4635 pmaddwd xm0, xm3 4636 psrlw xm0, xm5 4637 cmp hd, 32 4638 je .w32_end 4639 lea r2d, [hq*2] 4640 mov r6d, 0x33345556 4641 shrx r6d, r6d, r2d 4642 movd xm1, r6d 4643 pmulhuw xm0, xm1 4644.w32_end: 4645 vpbroadcastw m0, xm0 4646.s32: 4647 vpbroadcastw m1, alpham 4648 pabsw m2, m1 4649 psllw m2, 9 4650.s32_loop: 4651 mova m4, [acq] 4652 mova m5, [acq+32] 4653 IPRED_CFL 4 4654 IPRED_CFL 5 4655 packuswb m4, m5 4656 vpermq m4, m4, q3120 4657 mova [dstq], m4 4658 add dstq, strideq 4659 add acq, 64 4660 dec hd 4661 jg .s32_loop 4662 RET 4663 4664cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha 4665 lea t0, [ipred_cfl_splat_avx2_table] 4666 tzcnt wd, wm 4667 movifnidn hd, hm 4668 movsxd wq, [t0+wq*4] 4669 vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] 4670 add wq, t0 4671 movifnidn acq, acmp 4672 jmp wq 4673 4674cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak 4675 movifnidn hpadd, hpadm 4676 movifnidn wd, wm 4677 mov hd, hm 4678 mov szd, wd 4679 mov ac_bakq, acq 4680 imul szd, hd 4681 shl hpadd, 2 4682 sub hd, hpadd 4683 vpbroadcastd m2, [pb_2] 4684 pxor m4, m4 4685 cmp wd, 8 4686 jg .w16 4687 je .w8 4688 ; fall-through 4689 4690 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak 4691.w4: 4692 lea stride3q, [strideq*3] 4693.w4_loop: 4694 movq xm0, [yq] 4695 movq xm1, [yq+strideq] 4696 movhps xm0, [yq+strideq*2] 4697 movhps xm1, [yq+stride3q] 4698 pmaddubsw xm0, xm2 4699 pmaddubsw xm1, xm2 4700 paddw xm0, xm1 4701 mova [acq], xm0 4702 paddw xm4, xm0 4703 lea yq, [yq+strideq*4] 4704 add acq, 16 4705 sub hd, 2 4706 jg .w4_loop 4707 test hpadd, hpadd 4708 jz .calc_avg 4709 vpermq m0, m0, q1111 4710.w4_hpad_loop: 4711 mova [acq], m0 4712 paddw m4, m0 4713 add acq, 32 4714 sub hpadd, 4 4715 jg .w4_hpad_loop 4716 jmp .calc_avg 4717 4718.w8: 4719 lea stride3q, [strideq*3] 4720 test 
wpadd, wpadd 4721 jnz .w8_wpad 4722.w8_loop: 4723 mova xm0, [yq] 4724 mova xm1, [yq+strideq] 4725 vinserti128 m0, [yq+strideq*2], 1 4726 vinserti128 m1, [yq+stride3q], 1 4727 pmaddubsw m0, m2 4728 pmaddubsw m1, m2 4729 paddw m0, m1 4730 mova [acq], m0 4731 paddw m4, m0 4732 lea yq, [yq+strideq*4] 4733 add acq, 32 4734 sub hd, 2 4735 jg .w8_loop 4736 test hpadd, hpadd 4737 jz .calc_avg 4738 jmp .w8_hpad 4739.w8_wpad: 4740 vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] 4741.w8_wpad_loop: 4742 movq xm0, [yq] 4743 movq xm1, [yq+strideq] 4744 vinserti128 m0, [yq+strideq*2], 1 4745 vinserti128 m1, [yq+stride3q], 1 4746 pmaddubsw m0, m2 4747 pmaddubsw m1, m2 4748 paddw m0, m1 4749 pshufb m0, m3 4750 mova [acq], m0 4751 paddw m4, m0 4752 lea yq, [yq+strideq*4] 4753 add acq, 32 4754 sub hd, 2 4755 jg .w8_wpad_loop 4756 test hpadd, hpadd 4757 jz .calc_avg 4758.w8_hpad: 4759 vpermq m0, m0, q3232 4760.w8_hpad_loop: 4761 mova [acq], m0 4762 paddw m4, m0 4763 add acq, 32 4764 sub hpadd, 2 4765 jg .w8_hpad_loop 4766 jmp .calc_avg 4767 4768.w16: 4769 test wpadd, wpadd 4770 jnz .w16_wpad 4771.w16_loop: 4772 mova m0, [yq] 4773 mova m1, [yq+strideq] 4774 pmaddubsw m0, m2 4775 pmaddubsw m1, m2 4776 paddw m0, m1 4777 mova [acq], m0 4778 paddw m4, m0 4779 lea yq, [yq+strideq*2] 4780 add acq, 32 4781 dec hd 4782 jg .w16_loop 4783 test hpadd, hpadd 4784 jz .calc_avg 4785 jmp .w16_hpad_loop 4786.w16_wpad: 4787 DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak 4788 lea iptrq, [ipred_cfl_ac_420_avx2_table] 4789 shl wpadd, 2 4790 mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ 4791 ipred_cfl_ac_420_avx2_table+wpadq*8-32] 4792 movsxd wpadq, [iptrq+wpadq+4] 4793 add iptrq, wpadq 4794 jmp iptrq 4795.w16_pad3: 4796 vpbroadcastq m0, [yq] 4797 vpbroadcastq m1, [yq+strideq] 4798 jmp .w16_wpad_end 4799.w16_pad2: 4800 vbroadcasti128 m0, [yq] 4801 vbroadcasti128 m1, [yq+strideq] 4802 jmp .w16_wpad_end 4803.w16_pad1: 4804 mova m0, [yq] 4805 mova m1, [yq+strideq] 4806 ; fall-through 4807.w16_wpad_end: 
4808 pmaddubsw m0, m2 4809 pmaddubsw m1, m2 4810 paddw m0, m1 4811 pshufb m0, m3 4812 mova [acq], m0 4813 paddw m4, m0 4814 lea yq, [yq+strideq*2] 4815 add acq, 32 4816 dec hd 4817 jz .w16_wpad_done 4818 jmp iptrq 4819.w16_wpad_done: 4820 test hpadd, hpadd 4821 jz .calc_avg 4822.w16_hpad_loop: 4823 mova [acq], m0 4824 paddw m4, m0 4825 add acq, 32 4826 dec hpadd 4827 jg .w16_hpad_loop 4828 ; fall-through 4829 4830.calc_avg: 4831 vpbroadcastd m2, [pw_1] 4832 pmaddwd m0, m4, m2 4833 vextracti128 xm1, m0, 1 4834 tzcnt r1d, szd 4835 paddd xm0, xm1 4836 movd xm2, r1d 4837 movd xm3, szd 4838 punpckhqdq xm1, xm0, xm0 4839 paddd xm0, xm1 4840 psrad xm3, 1 4841 psrlq xm1, xm0, 32 4842 paddd xm0, xm3 4843 paddd xm0, xm1 4844 psrad xm0, xm2 4845 vpbroadcastw m0, xm0 4846.sub_loop: 4847 mova m1, [ac_bakq] 4848 psubw m1, m0 4849 mova [ac_bakq], m1 4850 add ac_bakq, 32 4851 sub szd, 16 4852 jg .sub_loop 4853 RET 4854 4855cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak 4856 movifnidn hpadd, hpadm 4857 movifnidn wd, wm 4858 mov hd, hm 4859 mov szd, wd 4860 mov ac_bakq, acq 4861 imul szd, hd 4862 shl hpadd, 2 4863 sub hd, hpadd 4864 vpbroadcastd m2, [pb_4] 4865 pxor m4, m4 4866 pxor m5, m5 4867 cmp wd, 8 4868 jg .w16 4869 je .w8 4870 ; fall-through 4871 4872 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak 4873.w4: 4874 lea stride3q, [strideq*3] 4875.w4_loop: 4876 movq xm1, [yq] 4877 movhps xm1, [yq+strideq] 4878 movq xm0, [yq+strideq*2] 4879 movhps xm0, [yq+stride3q] 4880 pmaddubsw xm0, xm2 4881 pmaddubsw xm1, xm2 4882 mova [acq], xm1 4883 mova [acq+16], xm0 4884 paddw xm4, xm0 4885 paddw xm5, xm1 4886 lea yq, [yq+strideq*4] 4887 add acq, 32 4888 sub hd, 4 4889 jg .w4_loop 4890 test hpadd, hpadd 4891 jz .calc_avg 4892 vpermq m0, m0, q1111 4893.w4_hpad_loop: 4894 mova [acq], m0 4895 paddw m4, m0 4896 add acq, 32 4897 sub hpadd, 4 4898 jg .w4_hpad_loop 4899 jmp .calc_avg 4900 4901.w8: 4902 lea stride3q, [strideq*3] 4903 test wpadd, 
wpadd 4904 jnz .w8_wpad 4905.w8_loop: 4906 mova xm1, [yq] 4907 vinserti128 m1, [yq+strideq], 1 4908 mova xm0, [yq+strideq*2] 4909 vinserti128 m0, [yq+stride3q], 1 4910 pmaddubsw m0, m2 4911 pmaddubsw m1, m2 4912 mova [acq], m1 4913 mova [acq+32], m0 4914 paddw m4, m0 4915 paddw m5, m1 4916 lea yq, [yq+strideq*4] 4917 add acq, 64 4918 sub hd, 4 4919 jg .w8_loop 4920 test hpadd, hpadd 4921 jz .calc_avg 4922 jmp .w8_hpad 4923.w8_wpad: 4924 vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] 4925.w8_wpad_loop: 4926 movq xm1, [yq] 4927 vinserti128 m1, [yq+strideq], 1 4928 movq xm0, [yq+strideq*2] 4929 vinserti128 m0, [yq+stride3q], 1 4930 pmaddubsw m0, m2 4931 pmaddubsw m1, m2 4932 pshufb m0, m3 4933 pshufb m1, m3 4934 mova [acq], m1 4935 mova [acq+32], m0 4936 paddw m4, m0 4937 paddw m5, m1 4938 lea yq, [yq+strideq*4] 4939 add acq, 64 4940 sub hd, 4 4941 jg .w8_wpad_loop 4942 test hpadd, hpadd 4943 jz .calc_avg 4944.w8_hpad: 4945 vpermq m0, m0, q3232 4946.w8_hpad_loop: 4947 mova [acq], m0 4948 paddw m4, m0 4949 add acq, 32 4950 sub hpadd, 2 4951 jg .w8_hpad_loop 4952 jmp .calc_avg 4953 4954.w16: 4955 test wpadd, wpadd 4956 jnz .w16_wpad 4957.w16_loop: 4958 mova m1, [yq] 4959 mova m0, [yq+strideq] 4960 pmaddubsw m0, m2 4961 pmaddubsw m1, m2 4962 mova [acq], m1 4963 mova [acq+32], m0 4964 paddw m4, m0 4965 paddw m5, m1 4966 lea yq, [yq+strideq*2] 4967 add acq, 64 4968 sub hd, 2 4969 jg .w16_loop 4970 test hpadd, hpadd 4971 jz .calc_avg 4972 jmp .w16_hpad_loop 4973.w16_wpad: 4974 DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak 4975 lea iptrq, [ipred_cfl_ac_422_avx2_table] 4976 shl wpadd, 2 4977 mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ 4978 ipred_cfl_ac_422_avx2_table+wpadq*8-32] 4979 movsxd wpadq, [iptrq+wpadq+4] 4980 add iptrq, wpadq 4981 jmp iptrq 4982.w16_pad3: 4983 vpbroadcastq m1, [yq] 4984 vpbroadcastq m0, [yq+strideq] 4985 jmp .w16_wpad_end 4986.w16_pad2: 4987 vbroadcasti128 m1, [yq] 4988 vbroadcasti128 m0, [yq+strideq] 4989 jmp .w16_wpad_end 4990.w16_pad1: 
4991 mova m1, [yq] 4992 mova m0, [yq+strideq] 4993 ; fall-through 4994.w16_wpad_end: 4995 pmaddubsw m0, m2 4996 pmaddubsw m1, m2 4997 pshufb m0, m3 4998 pshufb m1, m3 4999 mova [acq], m1 5000 mova [acq+32], m0 5001 paddw m4, m0 5002 paddw m5, m1 5003 lea yq, [yq+strideq*2] 5004 add acq, 64 5005 sub hd, 2 5006 jz .w16_wpad_done 5007 jmp iptrq 5008.w16_wpad_done: 5009 test hpadd, hpadd 5010 jz .calc_avg 5011.w16_hpad_loop: 5012 mova [acq], m0 5013 mova [acq+32], m0 5014 paddw m4, m0 5015 paddw m5, m0 5016 add acq, 64 5017 sub hpadd, 2 5018 jg .w16_hpad_loop 5019 ; fall-through 5020 5021.calc_avg: 5022 vpbroadcastd m2, [pw_1] 5023 pmaddwd m5, m5, m2 5024 pmaddwd m0, m4, m2 5025 paddd m0, m5 5026 vextracti128 xm1, m0, 1 5027 tzcnt r1d, szd 5028 paddd xm0, xm1 5029 movd xm2, r1d 5030 movd xm3, szd 5031 punpckhqdq xm1, xm0, xm0 5032 paddd xm0, xm1 5033 psrad xm3, 1 5034 psrlq xm1, xm0, 32 5035 paddd xm0, xm3 5036 paddd xm0, xm1 5037 psrad xm0, xm2 5038 vpbroadcastw m0, xm0 5039.sub_loop: 5040 mova m1, [ac_bakq] 5041 psubw m1, m0 5042 mova [ac_bakq], m1 5043 add ac_bakq, 32 5044 sub szd, 16 5045 jg .sub_loop 5046 RET 5047 5048cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak 5049 movifnidn hpadd, hpadm 5050 movifnidn wd, wm 5051 mov hd, hm 5052 mov szd, wd 5053 imul szd, hd 5054 shl hpadd, 2 5055 sub hd, hpadd 5056 pxor m4, m4 5057 vpbroadcastd m5, [pw_1] 5058 tzcnt r8d, wd 5059 lea r5, [ipred_cfl_ac_444_avx2_table] 5060 movsxd r8, [r5+r8*4+12] 5061 add r5, r8 5062 5063 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak 5064 mov ac_bakq, acq 5065 jmp r5 5066 5067.w4: 5068 lea stride3q, [strideq*3] 5069 pxor xm2, xm2 5070.w4_loop: 5071 movd xm1, [yq] 5072 movd xm0, [yq+strideq*2] 5073 pinsrd xm1, [yq+strideq], 1 5074 pinsrd xm0, [yq+stride3q], 1 5075 punpcklbw xm1, xm2 5076 punpcklbw xm0, xm2 5077 psllw xm1, 3 5078 psllw xm0, 3 5079 mova [acq], xm1 5080 mova [acq+16], xm0 5081 paddw xm1, xm0 5082 paddw xm4, xm1 5083 lea yq, 
[yq+strideq*4] 5084 add acq, 32 5085 sub hd, 4 5086 jg .w4_loop 5087 test hpadd, hpadd 5088 jz .calc_avg_mul 5089 pshufd xm0, xm0, q3232 5090 paddw xm1, xm0, xm0 5091.w4_hpad_loop: 5092 mova [acq], xm0 5093 mova [acq+16], xm0 5094 paddw xm4, xm1 5095 add acq, 32 5096 sub hpadd, 4 5097 jg .w4_hpad_loop 5098 jmp .calc_avg_mul 5099 5100.w8: 5101 lea stride3q, [strideq*3] 5102 pxor m2, m2 5103.w8_loop: 5104 movq xm1, [yq] 5105 movq xm0, [yq+strideq*2] 5106 vinserti128 m1, [yq+strideq], 1 5107 vinserti128 m0, [yq+stride3q], 1 5108 punpcklbw m1, m2 5109 punpcklbw m0, m2 5110 psllw m1, 3 5111 psllw m0, 3 5112 mova [acq], m1 5113 mova [acq+32], m0 5114 paddw m1, m0 5115 paddw m4, m1 5116 lea yq, [yq+strideq*4] 5117 add acq, 64 5118 sub hd, 4 5119 jg .w8_loop 5120 test hpadd, hpadd 5121 jz .calc_avg_mul 5122 vpermq m0, m0, q3232 5123 paddw m1, m0, m0 5124.w8_hpad_loop: 5125 mova [acq], m0 5126 mova [acq+32], m0 5127 paddw m4, m1 5128 add acq, 64 5129 sub hpadd, 4 5130 jg .w8_hpad_loop 5131 jmp .calc_avg_mul 5132 5133.w16: 5134 test wpadd, wpadd 5135 jnz .w16_wpad 5136.w16_loop: 5137 pmovzxbw m1, [yq] 5138 pmovzxbw m0, [yq+strideq] 5139 psllw m1, 3 5140 psllw m0, 3 5141 mova [acq], m1 5142 mova [acq+32], m0 5143 paddw m1, m0 5144 pmaddwd m1, m5 5145 paddd m4, m1 5146 lea yq, [yq+strideq*2] 5147 add acq, 64 5148 sub hd, 2 5149 jg .w16_loop 5150 test hpadd, hpadd 5151 jz .calc_avg 5152 jmp .w16_hpad 5153.w16_wpad: 5154 mova m3, [cfl_ac_444_w16_pad1_shuffle] 5155.w16_wpad_loop: 5156 vpbroadcastq m1, [yq] 5157 vpbroadcastq m0, [yq+strideq] 5158 pshufb m1, m3 5159 pshufb m0, m3 5160 psllw m1, 3 5161 psllw m0, 3 5162 mova [acq], m1 5163 mova [acq+32], m0 5164 paddw m1, m0 5165 pmaddwd m1, m5 5166 paddd m4, m1 5167 lea yq, [yq+strideq*2] 5168 add acq, 64 5169 sub hd, 2 5170 jg .w16_wpad_loop 5171 test hpadd, hpadd 5172 jz .calc_avg 5173.w16_hpad: 5174 paddw m1, m0, m0 5175 pmaddwd m1, m5 5176.w16_hpad_loop: 5177 mova [acq], m0 5178 mova [acq+32], m0 5179 paddd m4, m1 5180 add acq, 
64 5181 sub hpadd, 2 5182 jg .w16_hpad_loop 5183 jmp .calc_avg 5184 5185.w32: 5186 test wpadd, wpadd 5187 jnz .w32_wpad 5188.w32_loop: 5189 pmovzxbw m1, [yq] 5190 pmovzxbw m0, [yq+16] 5191 psllw m1, 3 5192 psllw m0, 3 5193 mova [acq], m1 5194 mova [acq+32], m0 5195 paddw m2, m1, m0 5196 pmaddwd m2, m5 5197 paddd m4, m2 5198 add yq, strideq 5199 add acq, 64 5200 dec hd 5201 jg .w32_loop 5202 test hpadd, hpadd 5203 jz .calc_avg 5204 jmp .w32_hpad_loop 5205.w32_wpad: 5206 DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak 5207 lea iptrq, [ipred_cfl_ac_444_avx2_table] 5208 add wpadd, wpadd 5209 mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] 5210 movsxd wpadq, [iptrq+wpadq+4] 5211 add iptrq, wpadq 5212 jmp iptrq 5213.w32_pad3: 5214 vpbroadcastq m1, [yq] 5215 pshufb m1, m3 5216 vpermq m0, m1, q3232 5217 jmp .w32_wpad_end 5218.w32_pad2: 5219 pmovzxbw m1, [yq] 5220 pshufhw m0, m1, q3333 5221 vpermq m0, m0, q3333 5222 jmp .w32_wpad_end 5223.w32_pad1: 5224 pmovzxbw m1, [yq] 5225 vpbroadcastq m0, [yq+16] 5226 pshufb m0, m3 5227 ; fall-through 5228.w32_wpad_end: 5229 psllw m1, 3 5230 psllw m0, 3 5231 mova [acq], m1 5232 mova [acq+32], m0 5233 paddw m2, m1, m0 5234 pmaddwd m2, m5 5235 paddd m4, m2 5236 add yq, strideq 5237 add acq, 64 5238 dec hd 5239 jz .w32_wpad_done 5240 jmp iptrq 5241.w32_wpad_done: 5242 test hpadd, hpadd 5243 jz .calc_avg 5244.w32_hpad_loop: 5245 mova [acq], m1 5246 mova [acq+32], m0 5247 paddd m4, m2 5248 add acq, 64 5249 dec hpadd 5250 jg .w32_hpad_loop 5251 jmp .calc_avg 5252 5253.calc_avg_mul: 5254 pmaddwd m4, m5 5255.calc_avg: 5256 vextracti128 xm1, m4, 1 5257 tzcnt r1d, szd 5258 paddd xm0, xm4, xm1 5259 movd xm2, r1d 5260 movd xm3, szd 5261 punpckhqdq xm1, xm0, xm0 5262 paddd xm0, xm1 5263 psrad xm3, 1 5264 psrlq xm1, xm0, 32 5265 paddd xm0, xm3 5266 paddd xm0, xm1 5267 psrad xm0, xm2 5268 vpbroadcastw m0, xm0 5269.sub_loop: 5270 mova m1, [ac_bakq] 5271 psubw m1, m0 5272 mova [ac_bakq], m1 5273 add ac_bakq, 32 5274 
; Closing instructions of the .sub_loop that begins on the preceding line,
; reproduced unchanged.
    sub                 szd, 16
    jg .sub_loop
    RET

;-----------------------------------------------------------------------
; pal_pred_8bpc(dst, stride, pal, idx, w, h)
;
; Expands packed palette indices into 8-bit pixels.  Every idx byte holds
; two 4-bit indices: the low nibble produces the first pixel, the high
; nibble the pixel right after it.  Eight palette bytes are read from pal
; and replicated into each 64-bit lane of m4, which pshufb then uses as a
; byte lookup table.
;
; Dispatch: tzcnt(w) selects a dword from pal_pred_avx2_table (defined
; outside this block); that dword is added to the table address to form
; the jump target.
;
; r2 is pal on entry.  Once the palette sits in m4, r2 is recycled first
; as the table base and then as stride*3.
;
; NOTE(review): assumes w is a power of two with a matching table entry,
; that h is a multiple of the rows written per iteration (4, 4, 4, 2, 1
; for w = 4, 8, 16, 32, 64), and that every index is < 8 so bit 7 of a
; shuffle control byte is never set -- confirm against the callers.
; NOTE(review): the w16/w32/w64 paths store with mova, so dst rows are
; presumably 16/32-byte aligned -- confirm.
;-----------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    movifnidn            hd, hm
    tzcnt                wd, wm                 ; log2(w) -> table slot
    vpbroadcastq         m4, [palq]             ; palette LUT in every qword
    lea                  r2, [pal_pred_avx2_table] ; pal pointer is dead now
    movsxd               wq, [r2+wq*4]
    add                  wq, r2
    lea                  r2, [strideq*3]        ; r2 = stride*3 from here on
    jmp                  wq

; --- w == 4: 8 idx bytes -> 16 pixels -> 4 rows per iteration ----------
.w4:
    movq                xm0, [idxq]
    psrlw               xm1, xm0, 4             ; high nibbles -> low nibbles
    punpcklbw           xm0, xm1                ; interleave lo/hi nibble bytes
    pshufb              xm0, xm4, xm0           ; palette lookup
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    add                idxq, 8
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET

; --- w == 8: 16 idx bytes -> 32 pixels -> 4 rows per iteration ---------
.w8:
    movu                xm2, [idxq]
    pshufb              xm1, xm4, xm2           ; pixels from low nibbles
    psrlw               xm2, 4
    pshufb              xm2, xm4, xm2           ; pixels from high nibbles
    punpcklbw           xm0, xm1, xm2           ; rows 0-1
    punpckhbw           xm1, xm2                ; rows 2-3
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    add                idxq, 16
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET

; --- w == 16: 32 idx bytes -> 64 pixels -> 4 rows per iteration --------
.w16:
    movu                 m2, [idxq]
    pshufb               m1, m4, m2             ; pixels from low nibbles
    psrlw                m2, 4
    pshufb               m2, m4, m2             ; pixels from high nibbles
    punpcklbw            m0, m1, m2             ; lane 0: row 0, lane 1: row 2
    punpckhbw            m1, m2                 ; lane 0: row 1, lane 1: row 3
    mova         [dstq+strideq*0], xm0
    mova         [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+r2       ], m1, 1
    add                idxq, 32
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET

; --- w == 32: 32 idx bytes -> 64 pixels -> 2 rows per iteration --------
.w32:
    vpermq               m2, [idxq], q3120      ; qwords 0,2 | 1,3 so that the
                                                ; in-lane unpacks yield whole rows
    pshufb               m1, m4, m2             ; pixels from low nibbles
    psrlw                m2, 4
    pshufb               m2, m4, m2             ; pixels from high nibbles
    punpcklbw            m0, m1, m2             ; row 0
    punpckhbw            m1, m2                 ; row 1
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    add                idxq, 32
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32
    RET

; --- w == 64: 32 idx bytes -> 64 pixels -> 1 row per iteration ---------
.w64:
    vpermq               m2, [idxq], q3120      ; same qword pre-shuffle as .w32
    pshufb               m1, m4, m2             ; pixels from low nibbles
    psrlw                m2, 4
    pshufb               m2, m4, m2             ; pixels from high nibbles
    punpcklbw            m0, m1, m2             ; left 32 pixels
    punpckhbw            m1, m2                 ; right 32 pixels
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    add                idxq, 32
    add                dstq, strideq
    dec                  hd
    jg .w64
    RET

%endif