; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

cdef_perm:    db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21
              db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29
              db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37
              db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
end_perm4:    db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
              db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
edge_mask4:   dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
              dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
              dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
pri_taps4:    dw 64, 32, 48, 48 ; left-shifted by 4
cdef_dirs4:   dw 8, 16, 8, 15, -7, -14, 1, -6
              dw 1, 2, 1, 10, 9, 18, 8, 17
              dw 8, 16, 8, 15, -7, -14, 1, -6
deint_shuf:   db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
cdef_dirs8:   db 32, 64, 32, 62, -30, -60, 2, -28
              db 2, 4, 2, 36, 34, 68, 32, 66
              db 32, 64, 32, 62, -30, -60, 2, -28
pri_taps8:    dw 4, 4, 2, 2, 3, 3, 3, 3
sec_taps4:    dw 32, 16
pw_m16384:    times 2 dw -16384
pw_2048:      times 2 dw 2048
pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4)
edge_mask8:   dw 0x2121, 0x2020, 0x0101

SECTION .text

%macro CONSTRAIN 7 ; dst, p, px, zero, thresh, shift, tmp
    psubw         %1, %2, %3
    pabsw         %1, %1
    vpcmpgtw      k1, %3, %2
    vpsrlvw       %7, %1, %6
    psubusw       %7, %5, %7
    pminsw        %1, %7
    vpsubw        %1{k1}, %4, %1
%endmacro
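; Scalar sketch of what CONSTRAIN computes per 16-bit lane (the CDEF
; constraint function; thresh and shift are broadcast per call site):
;   diff = p - px
;   dst  = sign(diff) * min(abs(diff), max(0, thresh - (abs(diff) >> shift)))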

; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7
; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7

INIT_ZMM avx512icl
cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
        pri, sec, dir, damping, edge
%define base r6-cdef_dirs4
    lea           r6, [cdef_dirs4]
    movu          xm3, [dstq+strideq*0]
    vinserti32x4  ym3, [dstq+strideq*1], 1
    mova          xm2, [leftq]
    lea           r2, [dstq+strideq*2]
    vinserti32x4  m3, [r2+strideq*0], 2
    mova          m5, [base+cdef_perm]
    vinserti32x4  m3, [r2+strideq*1], 3
    vpermt2d      m2, m5, m3
    vinserti32x4  m1, m2, [topq+strideq*0-4], 0
    vinserti32x4  m1, [topq+strideq*1-4], 1
    mov           r3d, edgem
    movifnidn     prid, prim
    punpcklwd     m3, m3 ; px
    psrlw         m5, 8
    vpbroadcastd  m0, [base+pd_268435568]
    pxor          m12, m12
    cmp           r3d, 0x0f
    jne           .mask_edges
    vinserti32x4  m2, [botq+strideq*0-4], 2
    vinserti32x4  m2, [botq+strideq*1-4], 3
.main:
    test          prid, prid
    jz            .sec_only
    lzcnt         r4d, prid
    rorx          r3d, prid, 2
    vpbroadcastw  m13, prim
    cmp           dword r10m, 0xfff ; if (bpc == 12)
    cmove         prid, r3d         ;     pri >>= 2
    mov           r3d, dampingm
    and           prid, 4
    sub           r3d, 31
    vpbroadcastd  m15, [base+pri_taps4+priq]
    xor           prid, prid
    add           r4d, r3d
    cmovns        prid, r4d ; pri_shift
    mov           r4d, dirm
    vpbroadcastw  m14, prid
    mov           r5d, secm
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+2)*4]
    call          .constrain
    test          r5d, r5d
    jz            .end_no_clip
    lzcnt         r5d, r5d
    vpbroadcastw  m13, secm
    add           r3d, r5d
    pminuw        m6, m3, m8
    pmaxsw        m7, m3, m8
    pminuw        m6, m9
    pmaxsw        m7, m9
    call          .constrain_sec
    pminuw        m6, m8
    pmaxsw        m7, m8
    pminuw        m6, m9
    pmaxsw        m7, m9
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+0)*4]
    call          .constrain
    pminuw        m6, m8
    pmaxsw        m7, m8
    pminuw        m6, m9
    pmaxsw        m7, m9
    psrldq        m8, m6, 2
    vpshldd       m3, m0, 8
    psrldq        m9, m7, 2
    paddd         m0, m3
    pminuw        m6, m8
    psrldq        m0, 1
    pmaxsw        m7, m9
    pmaxsw        m0, m6
    pminsw        m0, m7
    vpmovdw       ym0, m0
    jmp           .end
.sec_only:
    tzcnt         r5d, secm
    mov           r3d, dampingm
    vpbroadcastw  m13, secm
    mov           r4d, dirm
    sub           r3d, r5d ; sec_shift
    call          .constrain_sec
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+0)*4]
    call          .constrain
.end_no_clip:
    mova          ym1, [base+end_perm4]
    vpshldd       m3, m0, 8 ; (px << 8) + ((sum > -8) << 4)
    paddd         m0, m3    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    vpermb        m0, m1, m0
.end:
    movq          [dstq+strideq*0], xm0
    movhps        [dstq+strideq*1], xm0
    vextracti32x4 xm0, ym0, 1
    movq          [r2+strideq*0], xm0
    movhps        [r2+strideq*1], xm0
    RET
.mask_edges:
    vpbroadcastd  m6, [base+pw_m16384]
    test          r3b, 0x08
    jz            .mask_edges_no_bottom ; avoid buffer overread
    vinserti32x4  m2, [botq+strideq*0-4], 2
    vinserti32x4  m2, [botq+strideq*1-4], 3
    kmovw         k1, [base+edge_mask4-8+r3*2]
    jmp           .mask_edges_main
.mask_edges_no_bottom:
    kmovw         k1, [base+edge_mask4+8+r3*2]
.mask_edges_main:
    or            r3d, 0x04
    vmovdqa32     m1{k1}, m6 ; edge pixels = -16384
    kmovw         k1, [base+edge_mask4-8+r3*2]
    vmovdqa32     m2{k1}, m6
    jmp           .main
.constrain_sec:
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+4)*4]
    vpbroadcastw  m14, r3d
    vpbroadcastd  m15, [base+sec_taps4]
.constrain:
    paddw         m8, m5, m9
    vpermi2w      m8, m1, m2 ; k0p0 k1p0
    psubw         m9, m5, m9
    vpermi2w      m9, m1, m2 ; k0p1 k1p1
    CONSTRAIN     m10, m8, m3, m12, m13, m14, m11
    vpdpwssd      m0, m10, m15
    CONSTRAIN     m10, m9, m3, m12, m13, m14, m11
    vpdpwssd      m0, m10, m15
    ret

; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7

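; The 4x8 variant filters the block as two stacked 4x4 halves in parallel:
; m18/m16 hold the source pixels and accumulator for rows 0-3, m19/m17 the
; same for rows 4-7, and each .constrain call feeds taps to both halves.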
cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
        pri, sec, dir, damping, edge
    lea           r6, [cdef_dirs4]
    movu          xm18, [dstq+strideq*0]
    vinserti128   ym18, [dstq+strideq*1], 1
    mova          xm1, [leftq+16*0]
    mova          xm2, [leftq+16*1]
    lea           r2, [strideq*3]
    vinserti32x4  m18, [dstq+strideq*2], 2
    mova          m5, [base+cdef_perm]
    vinserti32x4  m18, [dstq+r2 ], 3
    vpermt2d      m1, m5, m18
    vinserti32x4  m0, m1, [topq+strideq*0-4], 0
    vinserti32x4  m0, [topq+strideq*1-4], 1
    lea           r3, [dstq+strideq*4]
    movu          xm19, [r3+strideq*0]
    vinserti128   ym19, [r3+strideq*1], 1
    vinserti32x4  m19, [r3+strideq*2], 2
    vinserti32x4  m19, [r3+r2 ], 3
    mov           r3d, edgem
    movifnidn     prid, prim
    vpermt2d      m2, m5, m19
    vpbroadcastd  m16, [base+pd_268435568]
    pxor          m12, m12
    punpcklwd     m18, m18 ; px (top)
    psrlw         m5, 8
    punpcklwd     m19, m19 ; px (bottom)
    mova          m17, m16
    vshufi32x4    m1, m2, q3210
    cmp           r3d, 0x0f
    jne           .mask_edges
    vinserti32x4  m2, [botq+strideq*0-4], 2
    vinserti32x4  m2, [botq+strideq*1-4], 3
.main:
    test          prid, prid
    jz            .sec_only
    lzcnt         r4d, prid
    rorx          r3d, prid, 2
    vpbroadcastw  m13, prim
    cmp           dword r10m, 0xfff ; if (bpc == 12)
    cmove         prid, r3d         ;     pri >>= 2
    mov           r3d, dampingm
    and           prid, 4
    sub           r3d, 31
    vpbroadcastd  m15, [base+pri_taps4+priq]
    xor           prid, prid
    add           r4d, r3d
    cmovns        prid, r4d ; pri_shift
    mov           r4d, dirm
    vpbroadcastw  m14, prid
    mov           r5d, secm
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+2)*4]
    call          .constrain
    test          r5d, r5d
    jz            .end_no_clip
    lzcnt         r5d, r5d
    vpbroadcastw  m13, secm
    add           r3d, r5d
    pminuw        m3, m18, m6
    pmaxsw        m4, m18, m6
    pminuw        m20, m19, m7
    pmaxsw        m21, m19, m7
    pminuw        m3, m8
    pmaxsw        m4, m8
    pminuw        m20, m9
    pmaxsw        m21, m9
    call          .constrain_sec
    pminuw        m3, m6
    pmaxsw        m4, m6
    pminuw        m20, m7
    pmaxsw        m21, m7
    pminuw        m3, m8
    pmaxsw        m4, m8
    pminuw        m20, m9
    pmaxsw        m21, m9
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+0)*4]
    call          .constrain
    pminuw        m3, m6
    pmaxsw        m4, m6
    mov           r3, 0xcccccccccccccccc
    pminuw        m20, m7
    pmaxsw        m21, m7
    kmovq         k1, r3
    pminuw        m3, m8
    pmaxsw        m4, m8
    pminuw        m20, m9
    pmaxsw        m21, m9
    vbroadcasti32x4 m0, [base+deint_shuf]
    vpshldd       m6, m20, m3, 16
    vmovdqu8      m3{k1}, m20
    vpshldd       m18, m16, 8
    vpshldd       m7, m21, m4, 16
    vmovdqu8      m4{k1}, m21
    vpshldd       m19, m17, 8
    pminuw        m3, m6
    paddd         m16, m18
    pmaxsw        m4, m7
    paddd         m17, m19
    psrldq        m16, 1
    palignr       m16{k1}, m17, m17, 15
    lea           r6, [dstq+strideq*4]
    pmaxsw        m16, m3
    pminsw        m16, m4
    pshufb        m16, m0
    movq          [dstq+strideq*0], xm16
    movhps        [r6 +strideq*0], xm16
    vextracti128  xm17, ym16, 1
    movq          [dstq+strideq*1], xm17
    movhps        [r6 +strideq*1], xm17
    vextracti32x4 xm17, m16, 2
    movq          [dstq+strideq*2], xm17
    movhps        [r6 +strideq*2], xm17
    vextracti32x4 xm16, m16, 3
    movq          [dstq+r2 ], xm16
    movhps        [r6 +r2 ], xm16
    RET
.sec_only:
    mov           r4d, dirm
    tzcnt         r5d, secm
    mov           r3d, dampingm
    vpbroadcastw  m13, secm
    sub           r3d, r5d ; sec_shift
    call          .constrain_sec
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+0)*4]
    call          .constrain
.end_no_clip:
    mova          ym20, [base+end_perm4]
    vpshldd       m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
    vpshldd       m19, m17, 8
    paddd         m16, m18    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddd         m17, m19
    vpermb        m16, m20, m16
    vpermb        m17, m20, m17
    movq          [dstq+strideq*0], xm16
    movhps        [dstq+strideq*1], xm16
    vextracti128  xm16, ym16, 1
    movq          [dstq+strideq*2], xm16
    movhps        [dstq+r2 ], xm16
    lea           dstq, [dstq+strideq*4]
    movq          [dstq+strideq*0], xm17
    movhps        [dstq+strideq*1], xm17
    vextracti128  xm17, ym17, 1
    movq          [dstq+strideq*2], xm17
    movhps        [dstq+r2 ], xm17
    RET
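; Edge handling (shared scheme for all three block sizes): the edge argument
; is the CDEF_HAVE_* bit mask (1 = left, 2 = right, 4 = top, 8 = bottom).
; Unavailable border pixels are overwritten with -16384, which is below any
; valid pixel for the signed max (pmaxsw) and above any valid pixel for the
; unsigned min (pminuw), so it never tightens the clipping bounds, and its
; distance from the unfiltered pixel makes the constrained contribution
; clamp to zero.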
.mask_edges:
    vpbroadcastd  m6, [base+pw_m16384]
    test          r3b, 0x08
    jz            .mask_edges_no_bottom ; avoid buffer overread
    vinserti32x4  m2, [botq+strideq*0-4], 2
    vinserti32x4  m2, [botq+strideq*1-4], 3
    kmovw         k1, [base+edge_mask4-8+r3*2]
    jmp           .mask_edges_main
.mask_edges_no_bottom:
    kmovw         k1, [base+edge_mask4+8+r3*2]
.mask_edges_main:
    mov           r4d, r3d
    or            r3d, 0x0c
    vmovdqa32     m0{k1}, m6 ; edge pixels = -16384
    kmovw         k1, [base+edge_mask4-8+r3*2]
    or            r4d, 0x04
    vmovdqa32     m1{k1}, m6
    kmovw         k1, [base+edge_mask4-8+r4*2]
    vmovdqa32     m2{k1}, m6
    jmp           .main
.constrain_sec:
    vpbroadcastd  m9, [base+cdef_dirs4+(r4+4)*4]
    vpbroadcastw  m14, r3d
    vpbroadcastd  m15, [base+sec_taps4]
.constrain:
    paddw         m7, m5, m9
    mova          m6, m0
    vpermt2w      m6, m7, m1 ; k0p0 k1p0 (top)
    psubw         m9, m5, m9
    mova          m8, m0
    vpermi2w      m7, m1, m2 ; k0p0 k1p0 (bottom)
    CONSTRAIN     m10, m6, m18, m12, m13, m14, m11
    vpermt2w      m8, m9, m1 ; k0p1 k1p1 (top)
    vpdpwssd      m16, m10, m15
    CONSTRAIN     m10, m7, m19, m12, m13, m14, m11
    vpermi2w      m9, m1, m2 ; k0p1 k1p1 (bottom)
    vpdpwssd      m17, m10, m15
    CONSTRAIN     m10, m8, m18, m12, m13, m14, m11
    vpdpwssd      m16, m10, m15
    CONSTRAIN     m10, m9, m19, m12, m13, m14, m11
    vpdpwssd      m17, m10, m15
    ret

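; Unlike the 4x4/4x8 variants, which gather filter taps with in-register word
; permutes, the 8x8 variant spills the padded source rows to a 6*64-byte stack
; buffer and fetches each tap pair with unaligned loads at the signed byte
; offsets taken from cdef_dirs8 (see .constrain below).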
cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
        pri, sec, dir, damping, edge
%define base r6-cdef_dirs8
    lea           r6, [cdef_dirs8]
    movu          ym17, [dstq+strideq*0]
    vinserti32x8  m17, [dstq+strideq*1], 1
    movq          xm4, [leftq+8*0]
    movq          xm5, [leftq+8*1]
    psrld         m2, [base+cdef_perm], 16
    movq          xm6, [leftq+8*2]
    movq          xm7, [leftq+8*3]
    lea           r2, [strideq*3]
    movu          ym16, [topq+strideq*0-4]
    vinserti32x8  m16, [topq+strideq*1-4], 1
    lea           r3, [dstq+strideq*4]
    movu          ym18, [dstq+strideq*2]
    vinserti32x8  m18, [dstq+r2 ], 1
    movu          ym19, [r3+strideq*0]
    vinserti32x8  m19, [r3+strideq*1], 1
    movu          ym20, [r3+strideq*2]
    vinserti32x8  m20, [r3+r2 ], 1
    vshufi32x4    m0, m17, m18, q2020 ; px (top)
    mov           r3d, edgem
    vshufi32x4    m1, m19, m20, q2020 ; px (bottom)
    movifnidn     prid, prim
    vpermt2d      m17, m2, m4
    vpermt2d      m18, m2, m5
    pxor          m12, m12
    vpermt2d      m19, m2, m6
    vpermt2d      m20, m2, m7
    cmp           r3d, 0x0f
    jne           .mask_edges
    movu          ym21, [botq+strideq*0-4]
    vinserti32x8  m21, [botq+strideq*1-4], 1
.main:
    mova          [rsp+64*0], m16 ; top
    mova          [rsp+64*1], m17 ; 0 1
    mova          [rsp+64*2], m18 ; 2 3
    mova          [rsp+64*3], m19 ; 4 5
    mova          [rsp+64*4], m20 ; 6 7
    mova          [rsp+64*5], m21 ; bottom
    test          prid, prid
    jz            .sec_only
    lzcnt         r4d, prid
    rorx          r3d, prid, 2
    vpbroadcastw  m13, prim
    cmp           dword r10m, 0xfff ; if (bpc == 12)
    cmove         prid, r3d         ;     pri >>= 2
    mov           r3d, dampingm
    and           prid, 4
    sub           r3d, 31
    add           r4d, r3d ; pri_shift
    vpbroadcastw  m14, r4d
    mov           r4d, dirm
    vpbroadcastd  m2, [base+pri_taps8+priq*2+0]
    vpbroadcastd  m3, [base+pri_taps8+priq*2+4]
    movsx         r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
    pmaxsw        m14, m12
    call          .constrain
    mov           r5d, secm
    pmullw        m16, m8, m2
    pmullw        m17, m9, m2
    test          r5d, r5d
    jnz           .pri_sec
    movsx         r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    call          .constrain
    pmullw        m8, m3
    pmullw        m9, m3
    jmp           .end_no_clip
.pri_sec:
    lzcnt         r5d, r5d
    add           r3d, r5d ; sec_shift
    movsx         r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    pminuw        m18, m0, m4
    pmaxsw        m19, m0, m4
    pminuw        m20, m1, m5
    pmaxsw        m21, m1, m5
    call          .min_max_constrain2
    movsx         r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
    pmullw        m8, m3
    pmullw        m9, m3
    vpbroadcastw  m13, secm
    vpbroadcastw  m14, r3d
    paddw         m16, m8
    paddw         m17, m9
    call          .min_max_constrain
    movsx         r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
    mova          m2, m8
    mova          m3, m9
    call          .min_max_constrain
    movsx         r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
    paddw         m2, m8
    paddw         m3, m9
    call          .min_max_constrain
    movsx         r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
    paddw         m2, m2
    paddw         m3, m3
    paddw         m16, m8
    paddw         m17, m9
    call          .min_max_constrain
    vpbroadcastd  m10, [base+pw_2048]
    paddw         m16, m2
    paddw         m17, m3
    paddw         m16, m8
    paddw         m17, m9
    psraw         m8, m16, 15
    psraw         m9, m17, 15
    paddw         m16, m8
    paddw         m17, m9
    pmulhrsw      m16, m10
    pmulhrsw      m17, m10
    pminuw        m18, m4
    pmaxsw        m19, m4
    pminuw        m20, m5
    pmaxsw        m21, m5
    pminuw        m18, m6
    pmaxsw        m19, m6
    pminuw        m20, m7
    pmaxsw        m21, m7
    paddw         m16, m0
    paddw         m17, m1
    pmaxsw        m16, m18
    pmaxsw        m17, m20
    pminsw        m16, m19
    pminsw        m17, m21
    jmp           .end
.sec_only:
    tzcnt         r5d, secm
    mov           r4d, dirm
    mov           r3d, dampingm
    vpbroadcastw  m13, secm
    sub           r3d, r5d
    movsx         r5, byte [base+cdef_dirs8+(r4+0)*2+0]
    vpbroadcastw  m14, r3d
    call          .constrain
    movsx         r5, byte [base+cdef_dirs8+(r4+4)*2+0]
    mova          m16, m8
    mova          m17, m9
    call          .constrain
    movsx         r5, byte [base+cdef_dirs8+(r4+0)*2+1]
    paddw         m16, m8
    paddw         m17, m9
    call          .constrain
    movsx         r5, byte [base+cdef_dirs8+(r4+4)*2+1]
    paddw         m16, m16
    paddw         m17, m17
    paddw         m16, m8
    paddw         m17, m9
    call          .constrain
.end_no_clip:
    vpbroadcastd  m10, [base+pw_2048]
    paddw         m16, m8
    paddw         m17, m9
    psraw         m8, m16, 15
    psraw         m9, m17, 15
    paddw         m16, m8
    paddw         m17, m9
    pmulhrsw      m16, m10
    pmulhrsw      m17, m10
    paddw         m16, m0
    paddw         m17, m1
.end:
    mova          [dstq+strideq*0], xm16
    vextracti128  [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+r2 ], m16, 3
    lea           dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm17
    vextracti128  [dstq+strideq*1], ym17, 1
    vextracti32x4 [dstq+strideq*2], m17, 2
    vextracti32x4 [dstq+r2 ], m17, 3
    RET
.mask_edges:
    vpbroadcastd  m2, [base+pw_m16384]
    test          r3b, 0x08
    jz            .mask_edges_no_bottom ; avoid buffer overread
    movu          ym21, [botq+strideq*0-4]
    vinserti32x8  m21, [botq+strideq*1-4], 1
    jmp           .mask_edges_top
.mask_edges_no_bottom:
    mova          m21, m2
.mask_edges_top:
    test          r3b, 0x04
    jnz           .mask_edges_main
    mova          m16, m2
.mask_edges_main:
    and           r3d, 0x03
    cmp           r3d, 0x03
    je            .main
    kmovw         k1, [base+edge_mask8+r3*2]
    vmovdqa32     m16{k1}, m2 ; edge pixels = -16384
    vmovdqa32     m17{k1}, m2
    vmovdqa32     m18{k1}, m2
    vmovdqa32     m19{k1}, m2
    vmovdqa32     m20{k1}, m2
    vmovdqa32     m21{k1}, m2
    jmp           .main
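; Tap-fetch helper for the 8x8 path: r5 holds a signed byte offset from
; cdef_dirs8, so the loads at +r5 and -r5 pick up the two pixels of a tap
; pair from the rows spilled above, which are then constrained against the
; unfiltered pixels in m0/m1 and summed. The .min_max_constrain* entry
; points first fold the taps fetched by the previous call into the running
; min/max clip bounds.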
ALIGN function_align
.min_max_constrain:
    pminuw        m18, m4
    pmaxsw        m19, m4
    pminuw        m20, m5
    pmaxsw        m21, m5
.min_max_constrain2:
    pminuw        m18, m6
    pmaxsw        m19, m6
    pminuw        m20, m7
    pmaxsw        m21, m7
.constrain:
%define tmp rsp+gprsize+68
    movu          m4, [tmp+r5+64*0]
    vshufi32x4    m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
    movu          m5, [tmp+r5+64*2]
    vshufi32x4    m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
    neg           r5
    movu          m6, [tmp+r5+64*0]
    vshufi32x4    m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
    movu          m7, [tmp+r5+64*2]
    vshufi32x4    m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
    CONSTRAIN     m8, m4, m0, m12, m13, m14, m15
    CONSTRAIN     m9, m5, m1, m12, m13, m14, m15
    CONSTRAIN     m10, m6, m0, m12, m13, m14, m15
    CONSTRAIN     m11, m7, m1, m12, m13, m14, m15
    paddw         m8, m10
    paddw         m9, m11
    ret

%endif ; ARCH_X86_64