; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Interleaved permutation table. The code below reads it three ways:
;   - as dword indices for vpermt2d in the 4x4/4x8 functions (used directly),
;   - as dword indices for vpermt2d in the 8x8 function (after `psrld 16`),
;   - as per-word base indices (after `psrlw 8`) to which the cdef_dirs4
;     offsets are added/subtracted before vpermi2w/vpermt2w.
cdef_perm:      db  2, 18, 16, 18, 24, 19,  0, 19, 25, 20,  1, 20, 26, 21,  2, 21
                db  3, 26,  3, 26, 28, 27,  4, 27, 29, 28, -1, 28, 30, 29, -1, 29
                db  0, 34, 17, 34, 16, 35,  8, 35, 17, 36,  9, 36, 18, 37, 10, 37
                db  1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
; vpermb indices selecting bytes 1-2 of every dword (see .end_no_clip).
end_perm4:      db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
                db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
; 16-bit opmasks loaded with kmovw, indexed by the edge flags (see .mask_edges).
edge_mask4:     dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
                dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
                dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
pri_taps4:      dw 64, 32, 48, 48                 ; left-shifted by 4
; Word offsets indexed as [cdef_dirs4+(dir+N)*4] with N = 0, 2 or 4.
cdef_dirs4:     dw  8, 16,  8, 15, -7,-14,  1, -6
                dw  1,  2,  1, 10,  9, 18,  8, 17
                dw  8, 16,  8, 15, -7,-14,  1, -6
deint_shuf:     db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
; Signed byte offsets indexed as [cdef_dirs8+(dir+N)*2+k] with k = 0 or 1.
cdef_dirs8:     db 32, 64, 32, 62,-30,-60,  2,-28
                db  2,  4,  2, 36, 34, 68, 32, 66
                db 32, 64, 32, 62,-30,-60,  2,-28
pri_taps8:      dw  4,  4,  2,  2,  3,  3,  3,  3
sec_taps4:      dw 32, 16
pw_m16384:      times 2 dw -16384
pw_2048:        times 2 dw 2048
pd_268435568:   dd 268435568                      ; (1 << 28) + (7 << 4)
edge_mask8:     dw 0x2121, 0x2020, 0x0101

SECTION .text

;-----------------------------------------------------------------------
; CONSTRAIN dst, p, px, zero, tresh, shift, tmp
;
; Per 16-bit lane:
;   diff = p - px
;   dst  = sign(diff) * min(|diff|, max(0, tresh - (|diff| >> shift)))
;
; %4 must hold zero (it is the minuend of the masked negate).
; Clobbers: k1, %7 (tmp).
;-----------------------------------------------------------------------
%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp
    psubw           %1, %2, %3          ; diff = p - px
    pabsw           %1, %1              ; |diff|
    vpcmpgtw        k1, %3, %2          ; k1 = lanes where px > p (diff < 0)
    vpsrlvw         %7, %1, %6          ; |diff| >> shift (per-lane count)
    psubusw         %7, %5, %7          ; unsigned saturating: max(0, tresh - ...)
    pminsw          %1, %7
    vpsubw          %1{k1}, %4, %1      ; restore the sign on the k1 lanes
%endmacro

; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05  b0 b1 b2 b3 b4 b5 b6 b7
; L2 L3 10 11 12 13 14 15  B0 B1 B2 B3 B4 B5 B6 B7

;-----------------------------------------------------------------------
; cdef_filter_4x4_16bpc
;
; Named arguments: dst, stride, left, top, bot, pri, sec, dir, damping, edge.
; cglobal loads the first five into registers; pri is loaded by movifnidn
; and the others are read from their argument slots (secm, dirm, dampingm,
; edgem). One further argument, r10m, is compared against 0xfff; per the
; inline comment this identifies 12-bit content.
;
; Register roles (as used below):
;   r6        base pointer for the [base+...] table accesses
;   r2        dst + 2*stride (rows 2-3)
;   m1, m2    padded source pixels, used as vpermi2w tables
;   m3        px (each pixel word duplicated by punpcklwd)
;   m5        per-word base indices (cdef_perm >> 8)
;   m0        running sum, seeded from pd_268435568
;   m6, m7    running min / max (only on the pri+sec path)
;   m12       zero
;   m13-m15   strength, shift and taps for the current CONSTRAIN pass
;-----------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
                                         pri, sec, dir, damping, edge
%define base r6-cdef_dirs4
    lea             r6, [cdef_dirs4]
    movu            xm3, [dstq+strideq*0]
    vinserti32x4    ym3, [dstq+strideq*1], 1
    mova            xm2, [leftq]
    lea             r2, [dstq+strideq*2]
    vinserti32x4    m3, [r2+strideq*0], 2
    mova            m5, [base+cdef_perm]
    vinserti32x4    m3, [r2+strideq*1], 3
    vpermt2d        m2, m5, m3
    vinserti32x4    m1, m2, [topq+strideq*0-4], 0
    vinserti32x4    m1, [topq+strideq*1-4], 1
    mov             r3d, edgem
    movifnidn       prid, prim
    punpcklwd       m3, m3              ; px
    psrlw           m5, 8
    vpbroadcastd    m0, [base+pd_268435568]
    pxor            m12, m12
    cmp             r3d, 0x0f           ; all four edge flags set?
    jne             .mask_edges
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
.main:
    test            prid, prid
    jz              .sec_only
    lzcnt           r4d, prid
    rorx            r3d, prid, 2
    vpbroadcastw    m13, prim
    cmp             dword r10m, 0xfff   ; if (bpc == 12)
    cmove           prid, r3d           ;     pri >>= 2
    mov             r3d, dampingm
    and             prid, 4             ; byte offset 0 or 4 into pri_taps4
    sub             r3d, 31
    vpbroadcastd    m15, [base+pri_taps4+priq]
    xor             prid, prid
    add             r4d, r3d            ; damping - 31 + lzcnt(pri)
    cmovns          prid, r4d           ; pri_shift
    mov             r4d, dirm
    vpbroadcastw    m14, prid
    mov             r5d, secm
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4]
    call            .constrain
    test            r5d, r5d
    jz              .end_no_clip
    lzcnt           r5d, r5d
    vpbroadcastw    m13, secm
    add             r3d, r5d            ; damping - 31 + lzcnt(sec)
    pminuw          m6, m3, m8
    pmaxsw          m7, m3, m8
    pminuw          m6, m9
    pmaxsw          m7, m9
    call            .constrain_sec
    pminuw          m6, m8
    pmaxsw          m7, m8
    pminuw          m6, m9
    pmaxsw          m7, m9
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call            .constrain
    pminuw          m6, m8
    pmaxsw          m7, m8
    pminuw          m6, m9
    pmaxsw          m7, m9
    psrldq          m8, m6, 2
    vpshldd         m3, m0, 8
    psrldq          m9, m7, 2
    paddd           m0, m3
    pminuw          m6, m8
    psrldq          m0, 1
    pmaxsw          m7, m9
    pmaxsw          m0, m6              ; clamp the result to [min, max]
    pminsw          m0, m7
    vpmovdw         ym0, m0
    jmp             .end
.sec_only:
    tzcnt           r5d, secm
    mov             r3d, dampingm
    vpbroadcastw    m13, secm
    mov             r4d, dirm
    sub             r3d, r5d            ; sec_shift
    call            .constrain_sec
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call            .constrain
.end_no_clip:
    mova            ym1, [base+end_perm4]
    vpshldd         m3, m0, 8           ; (px << 8) + ((sum > -8) << 4)
    paddd           m0, m3              ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    vpermb          m0, m1, m0
.end:
    movq            [dstq+strideq*0], xm0
    movhps          [dstq+strideq*1], xm0
    vextracti32x4   xm0, ym0, 1
    movq            [r2+strideq*0], xm0
    movhps          [r2+strideq*1], xm0
    RET
.mask_edges:
    ; Reached when edge != 0x0f: overwrite the lanes selected by the
    ; edge_mask4 entries with -16384.
    vpbroadcastd    m6, [base+pw_m16384]
    test            r3b, 0x08
    jz              .mask_edges_no_bottom ; avoid buffer overread
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
    kmovw           k1, [base+edge_mask4-8+r3*2]
    jmp             .mask_edges_main
.mask_edges_no_bottom:
    kmovw           k1, [base+edge_mask4+8+r3*2]
.mask_edges_main:
    or              r3d, 0x04
    vmovdqa32       m1{k1}, m6          ; edge pixels = -16384
    kmovw           k1, [base+edge_mask4-8+r3*2]
    vmovdqa32       m2{k1}, m6
    jmp             .main
; Local subroutines (entered with `call`, left with plain `ret`).
;   .constrain_sec  loads the secondary offsets (dir+4), shift (r3d) and
;                   taps, then falls through into .constrain.
;   .constrain      In:  m9 = direction offsets, m13 = strength,
;                        m14 = shift, m15 = taps
;                   Out: m8, m9 = gathered tap pixels,
;                        m0 += dot products of constrained diffs and taps
;                   Clobbers: m10, m11, k1
.constrain_sec:
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
    vpbroadcastw    m14, r3d
    vpbroadcastd    m15, [base+sec_taps4]
.constrain:
    paddw           m8, m5, m9
    vpermi2w        m8, m1, m2          ; k0p0 k1p0
    psubw           m9, m5, m9
    vpermi2w        m9, m1, m2          ; k0p1 k1p1
    CONSTRAIN       m10, m8, m3, m12, m13, m14, m11
    vpdpwssd        m0, m10, m15
    CONSTRAIN       m10, m9, m3, m12, m13, m14, m11
    vpdpwssd        m0, m10, m15
    ret

; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25  Lc Ld 60 61 62 63 64 65
; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35  Le Lf 70 71 72 73 74 75
; L0 L1 00 01 02 03 04 05  L8 L9 40 41 42 43 44 45  b0 b1 b2 b3 b4 b5 b6 b7
; L2 L3 10 11 12 13 14 15  La Lb 50 51 52 53 54 55  B0 B1 B2 B3 B4 B5 B6 B7

;-----------------------------------------------------------------------
; cdef_filter_4x8_16bpc
;
; Same argument list as cdef_filter_4x4_16bpc. `base` is still the
; r6-cdef_dirs4 definition made by the 4x4 function.
;
; Register roles (as used below):
;   r2         stride*3
;   m0-m2      padded source pixels, used as vpermt2w/vpermi2w tables
;   m18, m19   px for rows 0-3 (top) / rows 4-7 (bottom)
;   m16, m17   running sums for top / bottom
;   m3, m4     running min / max (top); m20, m21 likewise (bottom)
;   m5, m12-m15  as in the 4x4 function
;-----------------------------------------------------------------------
cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
                                         pri, sec, dir, damping, edge
    lea             r6, [cdef_dirs4]
    movu            xm18, [dstq+strideq*0]
    vinserti128     ym18, [dstq+strideq*1], 1
    mova            xm1, [leftq+16*0]
    mova            xm2, [leftq+16*1]
    lea             r2, [strideq*3]
    vinserti32x4    m18, [dstq+strideq*2], 2
    mova            m5, [base+cdef_perm]
    vinserti32x4    m18, [dstq+r2], 3
    vpermt2d        m1, m5, m18
    vinserti32x4    m0, m1, [topq+strideq*0-4], 0
    vinserti32x4    m0, [topq+strideq*1-4], 1
    lea             r3, [dstq+strideq*4]
    movu            xm19, [r3+strideq*0]
    vinserti128     ym19, [r3+strideq*1], 1
    vinserti32x4    m19, [r3+strideq*2], 2
    vinserti32x4    m19, [r3+r2], 3
    mov             r3d, edgem
    movifnidn       prid, prim
    vpermt2d        m2, m5, m19
    vpbroadcastd    m16, [base+pd_268435568]
    pxor            m12, m12
    punpcklwd       m18, m18            ; px (top)
    psrlw           m5, 8
    punpcklwd       m19, m19            ; px (bottom)
    mova            m17, m16
    vshufi32x4      m1, m2, q3210
    cmp             r3d, 0x0f           ; all four edge flags set?
    jne             .mask_edges
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
.main:
    test            prid, prid
    jz              .sec_only
    lzcnt           r4d, prid
    rorx            r3d, prid, 2
    vpbroadcastw    m13, prim
    cmp             dword r10m, 0xfff   ; if (bpc == 12)
    cmove           prid, r3d           ;     pri >>= 2
    mov             r3d, dampingm
    and             prid, 4             ; byte offset 0 or 4 into pri_taps4
    sub             r3d, 31
    vpbroadcastd    m15, [base+pri_taps4+priq]
    xor             prid, prid
    add             r4d, r3d            ; damping - 31 + lzcnt(pri)
    cmovns          prid, r4d           ; pri_shift
    mov             r4d, dirm
    vpbroadcastw    m14, prid
    mov             r5d, secm
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4]
    call            .constrain
    test            r5d, r5d
    jz              .end_no_clip
    lzcnt           r5d, r5d
    vpbroadcastw    m13, secm
    add             r3d, r5d            ; damping - 31 + lzcnt(sec)
    pminuw          m3, m18, m6
    pmaxsw          m4, m18, m6
    pminuw          m20, m19, m7
    pmaxsw          m21, m19, m7
    pminuw          m3, m8
    pmaxsw          m4, m8
    pminuw          m20, m9
    pmaxsw          m21, m9
    call            .constrain_sec
    pminuw          m3, m6
    pmaxsw          m4, m6
    pminuw          m20, m7
    pmaxsw          m21, m7
    pminuw          m3, m8
    pmaxsw          m4, m8
    pminuw          m20, m9
    pmaxsw          m21, m9
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call            .constrain
    pminuw          m3, m6
    pmaxsw          m4, m6
    mov             r3, 0xcccccccccccccccc
    pminuw          m20, m7
    pmaxsw          m21, m7
    kmovq           k1, r3              ; byte mask: upper word of every dword
    pminuw          m3, m8
    pmaxsw          m4, m8
    pminuw          m20, m9
    pmaxsw          m21, m9
    vbroadcasti32x4 m0, [base+deint_shuf]
    vpshldd         m6, m20, m3, 16
    vmovdqu8        m3{k1}, m20
    vpshldd         m18, m16, 8
    vpshldd         m7, m21, m4, 16
    vmovdqu8        m4{k1}, m21
    vpshldd         m19, m17, 8
    pminuw          m3, m6
    paddd           m16, m18
    pmaxsw          m4, m7
    paddd           m17, m19
    psrldq          m16, 1
    palignr         m16{k1}, m17, m17, 15
    lea             r6, [dstq+strideq*4] ; r6 reused: no [base+...] access follows
    pmaxsw          m16, m3             ; clamp the result to [min, max]
    pminsw          m16, m4
    pshufb          m16, m0
    movq            [dstq+strideq*0], xm16
    movhps          [r6+strideq*0], xm16
    vextracti128    xm17, ym16, 1
    movq            [dstq+strideq*1], xm17
    movhps          [r6+strideq*1], xm17
    vextracti32x4   xm17, m16, 2
    movq            [dstq+strideq*2], xm17
    movhps          [r6+strideq*2], xm17
    vextracti32x4   xm16, m16, 3
    movq            [dstq+r2], xm16
    movhps          [r6+r2], xm16
    RET
.sec_only:
    mov             r4d, dirm
    tzcnt           r5d, secm
    mov             r3d, dampingm
    vpbroadcastw    m13, secm
    sub             r3d, r5d            ; sec_shift
    call            .constrain_sec
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4]
    call            .constrain
.end_no_clip:
    mova            ym20, [base+end_perm4]
    vpshldd         m18, m16, 8         ; (px << 8) + ((sum > -8) << 4)
    vpshldd         m19, m17, 8
    paddd           m16, m18            ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddd           m17, m19
    vpermb          m16, m20, m16
    vpermb          m17, m20, m17
    movq            [dstq+strideq*0], xm16
    movhps          [dstq+strideq*1], xm16
    vextracti128    xm16, ym16, 1
    movq            [dstq+strideq*2], xm16
    movhps          [dstq+r2], xm16
    lea             dstq, [dstq+strideq*4]
    movq            [dstq+strideq*0], xm17
    movhps          [dstq+strideq*1], xm17
    vextracti128    xm17, ym17, 1
    movq            [dstq+strideq*2], xm17
    movhps          [dstq+r2], xm17
    RET
.mask_edges:
    ; Reached when edge != 0x0f: overwrite the lanes selected by the
    ; edge_mask4 entries with -16384 in all three source registers.
    vpbroadcastd    m6, [base+pw_m16384]
    test            r3b, 0x08
    jz              .mask_edges_no_bottom ; avoid buffer overread
    vinserti32x4    m2, [botq+strideq*0-4], 2
    vinserti32x4    m2, [botq+strideq*1-4], 3
    kmovw           k1, [base+edge_mask4-8+r3*2]
    jmp             .mask_edges_main
.mask_edges_no_bottom:
    kmovw           k1, [base+edge_mask4+8+r3*2]
.mask_edges_main:
    mov             r4d, r3d
    or              r3d, 0x0c
    vmovdqa32       m0{k1}, m6          ; edge pixels = -16384
    kmovw           k1, [base+edge_mask4-8+r3*2]
    or              r4d, 0x04
    vmovdqa32       m1{k1}, m6
    kmovw           k1, [base+edge_mask4-8+r4*2]
    vmovdqa32       m2{k1}, m6
    jmp             .main
; Local subroutines (entered with `call`, left with plain `ret`).
;   .constrain_sec  loads the secondary offsets (dir+4), shift (r3d) and
;                   taps, then falls through into .constrain.
;   .constrain      In:  m9 = direction offsets, m13 = strength,
;                        m14 = shift, m15 = taps
;                   Out: m6/m8 = tap pixels (top), m7/m9 = tap pixels
;                        (bottom); m16 / m17 accumulate the dot products
;                   Clobbers: m10, m11, k1
.constrain_sec:
    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
    vpbroadcastw    m14, r3d
    vpbroadcastd    m15, [base+sec_taps4]
.constrain:
    paddw           m7, m5, m9
    mova            m6, m0
    vpermt2w        m6, m7, m1          ; k0p0 k1p0 (top)
    psubw           m9, m5, m9
    mova            m8, m0
    vpermi2w        m7, m1, m2          ; k0p0 k1p0 (bottom)
    CONSTRAIN       m10, m6, m18, m12, m13, m14, m11
    vpermt2w        m8, m9, m1          ; k0p1 k1p1 (top)
    vpdpwssd        m16, m10, m15
    CONSTRAIN       m10, m7, m19, m12, m13, m14, m11
    vpermi2w        m9, m1, m2          ; k0p1 k1p1 (bottom)
    vpdpwssd        m17, m10, m15
    CONSTRAIN       m10, m8, m18, m12, m13, m14, m11
    vpdpwssd        m16, m10, m15
    CONSTRAIN       m10, m9, m19, m12, m13, m14, m11
    vpdpwssd        m17, m10, m15
    ret

;-----------------------------------------------------------------------
; cdef_filter_8x8_16bpc
;
; Same argument list as the 4xN functions, plus 64*6 bytes of stack that
; .main fills with six 64-byte rows (top, 0 1, 2 3, 4 5, 6 7, bottom).
; `base` is redefined relative to cdef_dirs8. m0 / m1 hold px for the
; top / bottom half and m16 / m17 the corresponding sums.
;-----------------------------------------------------------------------
cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
                                               pri, sec, dir, damping, edge
%define base r6-cdef_dirs8
    lea             r6, [cdef_dirs8]
    movu            ym17, [dstq+strideq*0]
    vinserti32x8    m17, [dstq+strideq*1], 1
    movq            xm4, [leftq+8*0]
    movq            xm5, [leftq+8*1]
    psrld           m2, [base+cdef_perm], 16
    movq            xm6, [leftq+8*2]
    movq            xm7, [leftq+8*3]
    lea             r2, [strideq*3]
    movu            ym16, [topq+strideq*0-4]
    vinserti32x8    m16, [topq+strideq*1-4], 1
    lea             r3, [dstq+strideq*4]
    movu            ym18, [dstq+strideq*2]
    vinserti32x8    m18, [dstq+r2], 1
    movu            ym19, [r3+strideq*0]
    vinserti32x8    m19, [r3+strideq*1], 1
    movu            ym20, [r3+strideq*2]
    vinserti32x8    m20, [r3+r2], 1
    vshufi32x4      m0, m17, m18, q2020 ; px (top)
    mov             r3d, edgem
    vshufi32x4      m1, m19, m20, q2020 ; px (bottom)
    movifnidn       prid, prim
    vpermt2d        m17, m2, m4
    vpermt2d        m18, m2, m5
    pxor            m12, m12
    vpermt2d        m19, m2, m6
    vpermt2d        m20, m2, m7
    cmp             r3d, 0x0f           ; all four edge flags set?
    jne             .mask_edges
    movu            ym21, [botq+strideq*0-4]
    vinserti32x8    m21, [botq+strideq*1-4], 1
.main:
    mova            [rsp+64*0], m16     ; top
    mova            [rsp+64*1], m17     ; 0 1
    mova            [rsp+64*2], m18     ; 2 3
    mova            [rsp+64*3], m19     ; 4 5
    mova            [rsp+64*4], m20     ; 6 7
    mova            [rsp+64*5], m21     ; bottom
    test            prid, prid
    jz              .sec_only
    lzcnt           r4d, prid
    rorx            r3d, prid, 2
    vpbroadcastw    m13, prim
    cmp             dword r10m, 0xfff   ; if (bpc == 12)
    cmove           prid, r3d           ;     pri >>= 2
    mov             r3d, dampingm
    and             prid, 4
    sub             r3d, 31
    add             r4d, r3d            ; pri_shift
    vpbroadcastw    m14, r4d
    mov             r4d, dirm
    vpbroadcastd    m2, [base+pri_taps8+priq*2+0]
    vpbroadcastd    m3, [base+pri_taps8+priq*2+4]
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
    pmaxsw          m14, m12            ; clamp pri_shift to >= 0 (m12 is zero)
    call            .constrain
    mov             r5d, secm
    pmullw          m16, m8, m2
    pmullw          m17, m9, m2
    test            r5d, r5d
    jnz             .pri_sec
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    call            .constrain
    pmullw          m8, m3
    pmullw          m9, m3
    jmp             .end_no_clip
.pri_sec:
    lzcnt           r5d, r5d
    add             r3d, r5d            ; sec_shift
    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    pminuw          m18, m0, m4
    pmaxsw          m19, m0, m4
    pminuw          m20, m1, m5
    pmaxsw          m21, m1, m5
    call            .min_max_constrain2
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
    pmullw          m8, m3
    pmullw          m9, m3
    vpbroadcastw    m13, secm
    vpbroadcastw    m14, r3d
    paddw           m16, m8
    paddw           m17, m9
    call            .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
    mova            m2, m8
    mova            m3, m9
    call            .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
    paddw           m2, m8
    paddw           m3, m9
    call            .min_max_constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
    paddw           m2, m2
    paddw           m3, m3
    paddw           m16, m8
    paddw           m17, m9
    call            .min_max_constrain
    vpbroadcastd    m10, [base+pw_2048]
    paddw           m16, m2
    paddw           m17, m3
    paddw           m16, m8
    paddw           m17, m9
    psraw           m8, m16, 15         ; -1 where sum < 0, else 0
    psraw           m9, m17, 15
    paddw           m16, m8             ; sum -= (sum < 0)
    paddw           m17, m9
    pmulhrsw        m16, m10            ; (sum + 8) >> 4
    pmulhrsw        m17, m10
    pminuw          m18, m4
    pmaxsw          m19, m4
    pminuw          m20, m5
    pmaxsw          m21, m5
    pminuw          m18, m6
    pmaxsw          m19, m6
    pminuw          m20, m7
    pmaxsw          m21, m7
    paddw           m16, m0
    paddw           m17, m1
    pmaxsw          m16, m18            ; clamp the result to [min, max]
    pmaxsw          m17, m20
    pminsw          m16, m19
    pminsw          m17, m21
    jmp             .end
.sec_only:
    tzcnt           r5d, secm
    mov             r4d, dirm
    mov             r3d, dampingm
    vpbroadcastw    m13, secm
    sub             r3d, r5d
    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0]
    vpbroadcastw    m14, r3d
    call            .constrain
    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0]
    mova            m16, m8
    mova            m17, m9
Coastguard Worker call .constrain 534*c0909341SAndroid Build Coastguard Worker movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] 535*c0909341SAndroid Build Coastguard Worker paddw m16, m8 536*c0909341SAndroid Build Coastguard Worker paddw m17, m9 537*c0909341SAndroid Build Coastguard Worker call .constrain 538*c0909341SAndroid Build Coastguard Worker movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] 539*c0909341SAndroid Build Coastguard Worker paddw m16, m16 540*c0909341SAndroid Build Coastguard Worker paddw m17, m17 541*c0909341SAndroid Build Coastguard Worker paddw m16, m8 542*c0909341SAndroid Build Coastguard Worker paddw m17, m9 543*c0909341SAndroid Build Coastguard Worker call .constrain 544*c0909341SAndroid Build Coastguard Worker.end_no_clip: 545*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+pw_2048] 546*c0909341SAndroid Build Coastguard Worker paddw m16, m8 547*c0909341SAndroid Build Coastguard Worker paddw m17, m9 548*c0909341SAndroid Build Coastguard Worker psraw m8, m16, 15 549*c0909341SAndroid Build Coastguard Worker psraw m9, m17, 15 550*c0909341SAndroid Build Coastguard Worker paddw m16, m8 551*c0909341SAndroid Build Coastguard Worker paddw m17, m9 552*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m10 553*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m10 554*c0909341SAndroid Build Coastguard Worker paddw m16, m0 555*c0909341SAndroid Build Coastguard Worker paddw m17, m1 556*c0909341SAndroid Build Coastguard Worker.end: 557*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm16 558*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+strideq*1], ym16, 1 559*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m16, 2 560*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r2 ], m16, 3 561*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 562*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm17 563*c0909341SAndroid Build Coastguard Worker 
vextracti128 [dstq+strideq*1], ym17, 1 564*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m17, 2 565*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r2 ], m17, 3 566*c0909341SAndroid Build Coastguard Worker RET 567*c0909341SAndroid Build Coastguard Worker.mask_edges: 568*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [base+pw_m16384] 569*c0909341SAndroid Build Coastguard Worker test r3b, 0x08 570*c0909341SAndroid Build Coastguard Worker jz .mask_edges_no_bottom ; avoid buffer overread 571*c0909341SAndroid Build Coastguard Worker movu ym21, [botq+strideq*0-4] 572*c0909341SAndroid Build Coastguard Worker vinserti32x8 m21, [botq+strideq*1-4], 1 573*c0909341SAndroid Build Coastguard Worker jmp .mask_edges_top 574*c0909341SAndroid Build Coastguard Worker.mask_edges_no_bottom: 575*c0909341SAndroid Build Coastguard Worker mova m21, m2 576*c0909341SAndroid Build Coastguard Worker.mask_edges_top: 577*c0909341SAndroid Build Coastguard Worker test r3b, 0x04 578*c0909341SAndroid Build Coastguard Worker jnz .mask_edges_main 579*c0909341SAndroid Build Coastguard Worker mova m16, m2 580*c0909341SAndroid Build Coastguard Worker.mask_edges_main: 581*c0909341SAndroid Build Coastguard Worker and r3d, 0x03 582*c0909341SAndroid Build Coastguard Worker cmp r3d, 0x03 583*c0909341SAndroid Build Coastguard Worker je .main 584*c0909341SAndroid Build Coastguard Worker kmovw k1, [base+edge_mask8+r3*2] 585*c0909341SAndroid Build Coastguard Worker vmovdqa32 m16{k1}, m2 ; edge pixels = -16384 586*c0909341SAndroid Build Coastguard Worker vmovdqa32 m17{k1}, m2 587*c0909341SAndroid Build Coastguard Worker vmovdqa32 m18{k1}, m2 588*c0909341SAndroid Build Coastguard Worker vmovdqa32 m19{k1}, m2 589*c0909341SAndroid Build Coastguard Worker vmovdqa32 m20{k1}, m2 590*c0909341SAndroid Build Coastguard Worker vmovdqa32 m21{k1}, m2 591*c0909341SAndroid Build Coastguard Worker jmp .main 592*c0909341SAndroid Build Coastguard WorkerALIGN function_align 
; Shared tail of the 8x8 filter. Three entry points that fall through into
; one another:
;   .min_max_constrain  - fold the tap pixels still held in m4-m7 into the
;                         running min (m18/m20) and max (m19/m21), then
;                         continue into .constrain
;   .min_max_constrain2 - fold only m6/m7 (the caller has already folded
;                         m4/m5 itself), then continue into .constrain
;   .constrain          - no min/max update
;
; In:   r5      = signed byte offset of the tap; callers load it with movsx
;                 from the cdef_dirs8 table
;       m0/m1   = px (top/bottom halves of the block)
;       m12     = zero, m13 = broadcast strength, m14 = broadcast shift
;                 (all set up by the caller)
; Out:  m8/m9   = sum of the CONSTRAIN results for the taps at +r5 and -r5
;                 (top/bottom halves)
;       m4-m7   = the tap pixels that were loaded (+r5 in m4/m5, -r5 in m6/m7)
; Clobbers: r5 (negated on return), flags, m10, m11. m15 is passed to
;           CONSTRAIN as its last argument -- presumably scratch; the macro
;           is defined outside this part of the file, TODO confirm.
.min_max_constrain:
    ; The min is unsigned while the max is signed -- presumably so that the
    ; -16384 value written over unavailable edge pixels loses both
    ; comparisons (it is huge as unsigned and very negative as signed).
    pminuw          m18, m4
    pmaxsw          m19, m4
    pminuw          m20, m5
    pmaxsw          m21, m5
.min_max_constrain2:
    pminuw          m18, m6
    pmaxsw          m19, m6
    pminuw          m20, m7
    pmaxsw          m21, m7
.constrain:
    ; We are inside a `call`, so rsp points at the return address; gprsize
    ; skips it. +64 skips the "top" slot stored at [rsp+64*0]; the extra +4
    ; is presumably the 2-pixel left border of each row -- TODO confirm.
    %define         tmp  rsp+gprsize+68
    ; Unaligned loads at +r5, combined with the same q2020 lane selection
    ; the main body uses to build px in m0/m1.
    movu             m4, [tmp+r5+64*0]
    vshufi32x4       m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
    movu             m5, [tmp+r5+64*2]
    vshufi32x4       m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
    neg              r5                       ; mirrored tap on the other side
    movu             m6, [tmp+r5+64*0]
    vshufi32x4       m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
    movu             m7, [tmp+r5+64*2]
    vshufi32x4       m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
    CONSTRAIN        m8, m4, m0, m12, m13, m14, m15
    CONSTRAIN        m9, m5, m1, m12, m13, m14, m15
    CONSTRAIN       m10, m6, m0, m12, m13, m14, m15
    CONSTRAIN       m11, m7, m1, m12, m13, m14, m15
    paddw            m8, m10                  ; both taps, top half
    paddw            m9, m11                  ; both taps, bottom half
    ret

%endif ; ARCH_X86_64