; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

; JMP_TABLE name, label, ...
; Emits one dword per listed label, each holding the distance from the start
; of the table to the local label .<label> inside the function
; mangle(private_prefix_<name>_avx2). <name>_jmptable is defined as an alias
; for the table so that code can address it by function name.
%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 %rep %0 - 1
    dd %%base %+ .%2 - %%table
  %rotate 1
 %endrep
%endmacro

; CDEF_FILTER_JMP_TABLE size
; Jump table for cdef_filter_<size>_8bpc with two entries (k0, k1) per
; direction. The directions run 6, 7, 0..7, 0, 1: the 0..7 range is padded
; with two wrapped directions on either side. PREP_REGS points dirq at entry
; dir*2 and ACCUMULATE_TAP_BYTE adds tap_offset*2+k entries on top of that, so
; tap offsets 0/2/4 reach directions dir-2/dir/dir+2 without masking.
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro

SECTION_RODATA 32

pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
               dd 0x80, 0x00, 0x00
blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
               dd 0x00, 0x00
blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000
blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000, 0x0000
blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_128:        times 2 dw 128
pw_2048:       times 2 dw 2048
; tap_table layout (byte offsets):
;    +0 ..  +7  shift masks, mask[n] = 0xFF >> n; broadcast and ANDed in after
;               a word-wide psrlw so the shift behaves like a per-byte shift
;    +8 .. +13  tap weights; the filter reads the primary pair at
;               +8 + 2*(pri & 1) and the secondary pair at +12
;   +14 ..      per-direction tap offsets, two bytes (k = 0, 1) per direction,
;               each encoded as y * 16 + x; BORDER_PREP_REGS addresses this
;               part as tableq + 14 + dir * 2
tap_table:     ; masks for 8 bit shifts
               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
               ; weights
               db  4,  2,  3,  3,  2,  1
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1
               db  1 * 16 + 0,  2 * 16 + 0
               db  1 * 16 + 0,  2 * 16 - 1
               ; the last 6 are repeats of the first 6 so we don't need to & 7
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1

CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8

SECTION .text

; PREP_REGS w, h
; Register setup for the non-border (byte) filter path. Loads dir from r7m,
; points tableq at the jump table of this block size and dirq at its entry
; dir*2 (4 bytes per entry), then renames the argument registers for the
; filter loops:
;   4x4: stride3 = stride * 3
;   4x8: as 4x4, plus dst4 = dst + stride * 4
;   8x8: h = -8 (row counter that counts up towards zero), top1/top2 row
;        pointers. stride3 is only named here; it is the 12th name (r11),
;        which CDEF_FILTER has already loaded with stride * 3 for this size.
%macro PREP_REGS 2 ; w, h
    ; jump table entry for direction dir-2, tap k=0; ACCUMULATE_TAP_BYTE adds
    ; (tap_offset*2+k)*4 bytes to pick the direction and tap it needs
    mov           dird, r7m
    lea         tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea           dirq, [tableq+dirq*2*4]
%if %1 == 4
 %if %2 == 4
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, stride3, k
 %else
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, dst4, stride3, k
    lea          dst4q, [dstq+strideq*4]
 %endif
%else
  DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
              table, dir, dirjmp, top2, stride3, k
    mov             hq, -8
    lea          top1q, [top1q+strideq*0]
    lea          top2q, [top1q+strideq*1]
%endif
%if %1 == 4
    lea       stride3q, [strideq*3]
%endif
%endmacro

; LOAD_BLOCK w, h, init_min_max=0
; Starts one filter pass: k = 1, sum accumulator m15 cleared (and m12 as well
; when h == 8), current pixels gathered from dst into m4:
;   4x4: low lane = rows 0,1      high lane = rows 2,3
;   4x8: low lane = rows 0..3     high lane = rows 4..7 (read through dst4)
;   8x8: low lane = rows 0,1      high lane = rows 2,3 (four rows per pass)
; With init_min_max == 1, m7 and m8 are seeded with px. m7 is the running
; maximum (updated with pmaxub, applied with pminub) and m8 is the running
; minimum (updated with pminub, applied with pmaxub).
%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
    pxor           m15, m15                     ; sum
%if %2 == 8
    pxor           m12, m12
 %if %1 == 4
    movd           xm4, [dstq +strideq*0]
    movd           xm6, [dstq +strideq*1]
    movd           xm5, [dstq +strideq*2]
    movd           xm7, [dstq +stride3q ]
    vinserti128     m4, [dst4q+strideq*0], 1
    vinserti128     m6, [dst4q+strideq*1], 1
    vinserti128     m5, [dst4q+strideq*2], 1
    vinserti128     m7, [dst4q+stride3q ], 1
    punpckldq       m4, m6
    punpckldq       m5, m7
 %else
    movq           xm4, [dstq+strideq*0]
    movq           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq      m4, m5
%else
    movd           xm4, [dstq+strideq*0]
    movd           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
    punpckldq       m4, m5
%endif
%if %3 == 1
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
%endif
%endmacro

; ACCUMULATE_TAP_BYTE tap_offset, shift, mask, strength, mul_tap, w, h, clip=0
; One tap of the non-border filter on 8-bit pixels. Calls the jump table entry
; dir*2 + tap_offset*2 + k; the called routine is defined outside this macro
; and is expected to leave the two taps p0/p1 in m5/m6. For each tap the macro
; then adds
;   sign(p - px) * mul_tap * min(|p - px|, max(0, strength - (|p - px| >> shift)))
; to the 16-bit sums in m15 (and m12 for the upper half when h == 8).
; With clip == 1 the running max (m7) and min (m8) are updated from p0/p1.
; Clobbers m5, m6, m9 (plus m10, m11 when h == 8) and dirjmpq.
%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]
    add        dirjmpq, tableq
    call       dirjmpq

%if %8 == 1
    pmaxub          m7, m5
    pminub          m8, m5
    pmaxub          m7, m6
    pminub          m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    punpcklbw       m5, m6                      ; interleave p0/p1 bytes
    punpcklbw       m6, m4, m4                  ; px duplicated to match
    psubusb         m9, m5, m6
    psubusb         m5, m6, m5
    por             m9, m5     ; abs_diff_p01(p01 - px)
    pcmpeqb         m5, m9                      ; 0xff where p <= px
    por             m5, %5                      ; -1 there, +tap elsewhere
    psignb          m6, %5, m5                  ; tap with the sign of the diff
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m6
    paddw          m15, m5
%else
    psubusb         m9, m5, m4
    psubusb         m5, m4, m5
    psubusb        m11, m6, m4
    psubusb         m6, m4, m6
    por             m9, m5      ; abs_diff_p0(p0 - px)
    por            m11, m6      ; abs_diff_p1(p1 - px)
    pcmpeqb         m5, m9
    pcmpeqb         m6, m11
    punpckhbw      m10, m9, m11
    punpcklbw       m9, m11
    por             m5, %5
    por            m11, m6, %5
    punpckhbw       m6, m5, m11
    punpcklbw       m5, m11
    psignb         m11, %5, m6
    psrlw           m6, m10, %2 ; emulate 8-bit shift
    pand            m6, %3
    psubusb         m6, %4, m6
    pminub          m6, m10
    pmaddubsw       m6, m11
    paddw          m12, m6
    psignb         m11, %5, m5
    psrlw           m5, m9, %2  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m11
    paddw          m15, m5
%endif
%endmacro

; ADJUST_PIXEL w, h, zero, pw_2048, clip=0
; Rounds the accumulated sums and applies them to px (m4), then stores the
; result to dst. The rounding adds -1 to negative sums and multiplies with
; pmulhrsw; with %4 holding 2048 in every word that is
; (8 + sum - (sum < 0)) >> 4.
;   clip == 0: the correction is packed to signed bytes and added to px with
;              a wrapping byte add
;   clip == 1: px is widened to words, corrected, packed with unsigned
;              saturation and clamped to [m8, m7] (min, max)
; %3 must be an all-zero register on entry; it is overwritten when h == 4.
; Clobbers m5 (and m6 when h == 8).
%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw       m4, %3
 %endif
    pcmpgtw         %3, m15
    paddw          m15, %3
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m15
    paddb           m4, m15
 %else
    paddw           m4, m15
    packuswb        m4, m4 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
    movd   [dstq+strideq*0], xm4
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+stride3q ], xm5, 1
%else
    pcmpgtw         m6, %3, m12
    pcmpgtw         m5, %3, m15
    paddw          m12, m6
    paddw          m15, m5
 %if %5 == 1
    punpckhbw       m5, m4, %3
    punpcklbw       m4, %3
 %endif
    pmulhrsw       m12, %4
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m12
    paddb           m4, m15
 %else
    paddw           m5, m12
    paddw           m4, m15
    packuswb        m4, m5 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
 %if %1 == 4
    movd   [dstq +strideq*0], xm4
    movd   [dst4q+strideq*0], xm5
    pextrd [dstq +strideq*1], xm4, 1
    pextrd [dst4q+strideq*1], xm5, 1
    pextrd [dstq +strideq*2], xm4, 2
    pextrd [dst4q+strideq*2], xm5, 2
    pextrd [dstq +stride3q ], xm4, 3
    pextrd [dst4q+stride3q ], xm5, 3
 %else
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+stride3q ], xm5
 %endif
%endif
%endmacro

; BORDER_PREP_REGS w, h
; Register setup for the border (16-bit) filter path. dirq is pointed at the
; tap offset pair of direction dir inside the table addressed by tableq
; (NOTE(review): tableq is presumably tap_table here; it is set by the caller).
; stkq is pointed at px, m11 is zeroed for BORDER_ADJUST_PIXEL, and h is
; loaded with the number of passes, w*h*2/mmsize, when more than one pass is
; needed.
%macro BORDER_PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tableq+14+(dir+0/2/6)*2+k]
    ; (14 = 8 shift masks + 6 weights preceding the offsets in tap_table)
    mov           dird, r7m
    lea           dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
 %else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
 %endif
    mov             hd, %1*%2*2/mmsize
%else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
%endif
    lea           stkq, [px]
    pxor           m11, m11
%endmacro

; BORDER_LOAD_BLOCK w, h, init_min_max=0
; Starts one pass of the border path: k = 1, sum accumulator m15 cleared and
; px loaded into m4 as 16-bit words from the stk buffer, whose rows are 32
; bytes apart (four rows of 4 pixels for w == 4, otherwise two rows of 8).
; With init_min_max == 1, m7 (max) and m8 (min) are seeded with px.
%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
%if %1 == 4
    movq           xm4, [stkq+32*0]
    movhps         xm4, [stkq+32*1]
    movq           xm5, [stkq+32*2]
    movhps         xm5, [stkq+32*3]
    vinserti128     m4, xm5, 1
%else
    mova           xm4, [stkq+32*0]             ; px
    vinserti128     m4, [stkq+32*1], 1
%endif
    pxor           m15, m15                     ; sum
%if %3 == 1
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
%endif
%endmacro

; ACCUMULATE_TAP_WORD tap_offset, shift, mask, strength, mul_tap, w, clip=0
; One tap of the border filter on 16-bit pixels. Reads the signed tap offset
; from [dirq+k+tap_offset], loads p0 at +offset and p1 at -offset (in pixels)
; from the stk buffer and adds the constrained, weighted differences to the
; sums in m15. m12 must hold the byte shuffle that pairs up the p0/p1
; differences for pmaddubsw; it is set up outside this macro.
; With clip == 1 the running max (m7) and min (m8) are updated from p0/p1.
; Clobbers m5, m6, m9, m10 and offq.
%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1]       ; off1
%if %6 == 4
    movq           xm5, [stkq+offq*2+32*0]      ; p0
    movq           xm6, [stkq+offq*2+32*2]
    movhps         xm5, [stkq+offq*2+32*1]
    movhps         xm6, [stkq+offq*2+32*3]
    vinserti128     m5, xm6, 1
%else
    movu           xm5, [stkq+offq*2+32*0]      ; p0
    vinserti128     m5, [stkq+offq*2+32*1], 1
%endif
    neg           offq                          ; -off1
%if %6 == 4
    movq           xm6, [stkq+offq*2+32*0]      ; p1
    movq           xm9, [stkq+offq*2+32*2]
    movhps         xm6, [stkq+offq*2+32*1]
    movhps         xm9, [stkq+offq*2+32*3]
    vinserti128     m6, xm9, 1
%else
    movu           xm6, [stkq+offq*2+32*0]      ; p1
    vinserti128     m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5                      ; max after p0
    pminuw          m8, m5                      ; min after p0
    pmaxsw          m7, m6                      ; max after p1
    pminuw          m8, m6                      ; min after p1
%endif

    ; accumulate sum[m15] over p0/p1
    ; calculate difference before converting
    psubw           m5, m4                      ; diff_p0(p0 - px)
    psubw           m6, m4                      ; diff_p1(p1 - px)

    ; convert to 8-bits with signed saturation
    ; saturating to large diffs has no impact on the results
    packsswb        m5, m6

    ; group into pairs so we can accumulate using maddubsw
    pshufb          m5, m12
    pabsb           m9, m5
    psignb         m10, %5, m5
    psrlw           m5, m9, %2                  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5

    ; use unsigned min since abs diff can equal 0x80
    pminub          m5, m9
    pmaddubsw       m5, m10
    paddw          m15, m5
%endmacro

; BORDER_ADJUST_PIXEL w, pw_2048, clip=0
; Rounds the sum in m15 (adds -1 where negative, then pmulhrsw with %2; with
; 2048 in every word that is (8 + sum - (sum < 0)) >> 4), adds it to the
; 16-bit px in m4, optionally clamps to [m8, m7], packs to bytes and stores
; four rows of 4 pixels (w == 4) or two rows of 8 pixels to dst.
; Requires m11 == 0. Clobbers m5 and m9.
%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    pcmpgtw         m9, m11, m15
    paddw          m15, m9
    pmulhrsw       m15, %2
    paddw           m4, m15
%if %3 == 1
    pminsw          m4, m7
    pmaxsw          m4, m8
%endif
    packuswb        m4, m4
    vextracti128   xm5, m4, 1
%if %1 == 4
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+stride3q ], xm5, 1
%else
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*1], xm5
%endif
%endmacro

%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal
cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ 400*c0909341SAndroid Build Coastguard Worker pri, sec, dir, damping, edge 401*c0909341SAndroid Build Coastguard Worker mov edged, edgem 402*c0909341SAndroid Build Coastguard Worker cmp edged, 0xf 403*c0909341SAndroid Build Coastguard Worker jne .border_block 404*c0909341SAndroid Build Coastguard Worker 405*c0909341SAndroid Build Coastguard Worker PUSH r11 406*c0909341SAndroid Build Coastguard Worker PUSH r12 407*c0909341SAndroid Build Coastguard Worker%if %2 == 4 408*c0909341SAndroid Build Coastguard Worker%assign regs_used 13 409*c0909341SAndroid Build Coastguard Worker ALLOC_STACK 0x60, 16 410*c0909341SAndroid Build Coastguard Worker pmovzxbw xm0, [leftq+1] 411*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q0110 412*c0909341SAndroid Build Coastguard Worker psrldq m1, m0, 4 413*c0909341SAndroid Build Coastguard Worker vpalignr m2, m0, m0, 12 414*c0909341SAndroid Build Coastguard Worker movu [rsp+0x10], m0 415*c0909341SAndroid Build Coastguard Worker movu [rsp+0x28], m1 416*c0909341SAndroid Build Coastguard Worker movu [rsp+0x40], m2 417*c0909341SAndroid Build Coastguard Worker%elif %1 == 4 418*c0909341SAndroid Build Coastguard Worker%assign regs_used 14 419*c0909341SAndroid Build Coastguard Worker PUSH r13 420*c0909341SAndroid Build Coastguard Worker ALLOC_STACK 8*2+%1*%2*1, 16 421*c0909341SAndroid Build Coastguard Worker pmovzxwd m0, [leftq] 422*c0909341SAndroid Build Coastguard Worker mova [rsp+0x10], m0 423*c0909341SAndroid Build Coastguard Worker%else 424*c0909341SAndroid Build Coastguard Worker%assign regs_used 15 425*c0909341SAndroid Build Coastguard Worker PUSH r13 426*c0909341SAndroid Build Coastguard Worker PUSH r14 427*c0909341SAndroid Build Coastguard Worker ALLOC_STACK 8*4+%1*%2*2+32, 16 428*c0909341SAndroid Build Coastguard Worker lea r11, [strideq*3] 429*c0909341SAndroid Build Coastguard Worker movu xm4, [dstq+strideq*2] 430*c0909341SAndroid Build Coastguard Worker pmovzxwq 
m0, [leftq+0]
    ; ^ operand tail of an instruction whose mnemonic is on the preceding
    ;   line of the file; kept verbatim.
    pmovzxwq        m1, [leftq+8]
    vinserti128     m4, [dstq+r11], 1
    pmovzxbd        m2, [leftq+1]
    pmovzxbd        m3, [leftq+9]
    mov             [rsp+16], botq              ; saved; reloaded at the end of each row pass
    mova            [rsp+0x20], m0
    mova            [rsp+0x40], m1
    mova            [rsp+0x60], m2
    mova            [rsp+0x80], m3
    mova            [rsp+0xa0], m4              ; later addressed through top1q/top2q
    lea             botq, [dstq+strideq*4]
%endif

;-----------------------------------------------------------------------
; Strength / damping setup.
;   pri == 0           -> .sec_only
;   sec == 0           -> .pri_only
;   otherwise          -> combined path below
;   [rsp+0] pri_shift = max(0, lzcnt(pri) + damping - 31)
;   [rsp+8] sec_shift =        lzcnt(sec) + damping - 31   (no clamp here)
;   xm0 / xm1 keep the raw pri / sec strengths for the later broadcasts.
;-----------------------------------------------------------------------
 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
    mov             dampingd, r8m
    xor             zerod, zerod
    movifnidn       prid, prim
    sub             dampingd, 31
    movifnidn       secdmpd, secdmpm
    test            prid, prid
    jz .sec_only
    movd            xm0, prid
    lzcnt           pridmpd, prid
    add             pridmpd, dampingd
    cmovs           pridmpd, zerod              ; clamp a negative shift to 0
    mov             [rsp+0], pridmpq            ; pri_shift
    test            secdmpd, secdmpd
    jz .pri_only
    movd            xm1, secdmpd
    lzcnt           secdmpd, secdmpd
    add             secdmpd, dampingd
    mov             [rsp+8], secdmpq            ; sec_shift

 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
    lea             tableq, [tap_table]
    vpbroadcastb    m13, [tableq+pridmpq]       ; pri_shift_mask
    vpbroadcastb    m14, [tableq+secdmpq]       ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
    vpbroadcastb    m0, xm0                     ; pri_strength
    vpbroadcastb    m1, xm1                     ; sec_strength
    and             prid, 1
    lea             priq, [tableq+priq*2+8]     ; pri_taps = tap_table + 8 + 2*(pri & 1)
    lea             secq, [tableq+12]           ; sec_taps

    PREP_REGS       %1, %2
%if %1*%2 > mmsize
.v_loop:                                        ; outer loop, only emitted for this size
%endif
    LOAD_BLOCK      %1, %2, 1
.k_loop:                                        ; kq is initialised outside this view
    vpbroadcastb    m2, [priq+kq]               ; pri_taps
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
    dec             kq
    jge .k_loop                                 ; repeat while kq >= 0

    vpbroadcastd    m10, [pw_2048]
    pxor            m9, m9
    ADJUST_PIXEL    %1, %2, m9, m10, 1
%if %1*%2 > mmsize
    lea             dstq, [dstq+strideq*4]      ; advance four rows
    lea             top1q, [rsp+0xa0]           ; both halves of the m4 copy saved in setup
    lea             top2q, [rsp+0xb0]
    mov             botq, [rsp+16]              ; bot pointer saved in setup
    add             hq, 4
    jl .v_loop                                  ; loop while hq is still negative
%endif
    RET

;-----------------------------------------------------------------------
; Primary-only path (sec == 0). pri_shift is already at [rsp+0] and
; xm0 already holds the primary strength.
;-----------------------------------------------------------------------
.pri_only:
 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
    lea             tableq, [tap_table]
    vpbroadcastb    m13, [tableq+pridmpq]       ; pri_shift_mask
    ; pri/sec_taps[k] [4 total]
 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
    vpbroadcastb    m0, xm0                     ; pri_strength
    and             prid, 1
    lea             priq, [tableq+priq*2+8]     ; pri_taps
    PREP_REGS       %1, %2
    vpbroadcastd    m3, [pw_2048]
    pxor            m1, m1
%if %1*%2 > mmsize
.pri_v_loop:
%endif
    LOAD_BLOCK      %1, %2
.pri_k_loop:
    vpbroadcastb    m2, [priq+kq]               ; pri_taps
    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
    dec             kq
    jge .pri_k_loop
    ADJUST_PIXEL    %1, %2, m1, m3
%if %1*%2 > mmsize
    lea             dstq, [dstq+strideq*4]
    lea             top1q, [rsp+0xa0]
    lea             top2q, [rsp+0xb0]
    mov             botq, [rsp+16]
    add             hq, 4
    jl .pri_v_loop
%endif
    RET

;-----------------------------------------------------------------------
; Secondary-only path (pri == 0). dampingd has already had 31 subtracted
; before the branch here; sec_shift is computed and stored at [rsp+8].
;-----------------------------------------------------------------------
.sec_only:
 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
    movd            xm1, secdmpd
    lzcnt           secdmpd, secdmpd
    add             secdmpd, dampingd
    mov             [rsp+8], secdmpq            ; sec_shift
 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
    lea             tableq, [tap_table]
    vpbroadcastb    m14, [tableq+secdmpq]       ; sec_shift_mask
    ; pri/sec_taps[k] [4 total]
 DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
    vpbroadcastb    m1, xm1                     ; sec_strength
    lea             secq, [tableq+12]           ; sec_taps
    PREP_REGS       %1, %2
    vpbroadcastd    m2, [pw_2048]
    pxor            m0, m0
%if %1*%2 > mmsize
.sec_v_loop:
%endif
    LOAD_BLOCK      %1, %2
.sec_k_loop:
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
    dec             kq
    jge .sec_k_loop
    ADJUST_PIXEL    %1, %2, m0, m2
%if %1*%2 > mmsize
    lea             dstq, [dstq+strideq*4]
    lea             top1q, [rsp+0xa0]
    lea             top2q, [rsp+0xb0]
    mov             botq, [rsp+16]
    add             hq, 4
    jl .sec_v_loop
%endif
    RET

;-----------------------------------------------------------------------
; .d0k0 -- jump-table target (see the JMP_TABLE list at the top of the
; file); ends in a plain `ret`, so it is entered via call. The name
; presumably encodes direction 0 / tap k = 0 -- confirm at the call site,
; which is outside this view.
; Gathers bytes from top/dst/bot rows at x-1 and x+1 plus left-edge data
; (stack copy or leftq) into m5 and m6.
; Scratch: m9-m12 (4-wide variants), m9-m11 and r13 (other variant).
;-----------------------------------------------------------------------
.d0k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq    m6, [dstq+strideq*1-1]
    vpbroadcastq    m10, [dstq+strideq*2-1]
    movd            xm5, [topq+strideq*1+1]
    movd            xm9, [dstq+strideq*0+1]
    psrldq          m11, m6, 2
    psrldq          m12, m10, 2
    vinserti128     m6, [dstq+stride3q -1], 1
    vinserti128     m10, [botq -1], 1
    vpblendd        m5, m11, 0x10
    vpblendd        m9, m12, 0x10
    movu            m11, [blend_4x4+16]
    punpckldq       m6, m10
    punpckldq       m5, m9
    vpblendvb       m6, [rsp+gprsize+0x28], m11 ; merge bytes from the stack copy under the mask
 %else ; %2 != 4
    movd            xm5, [topq +strideq*1+1]
    movq            xm6, [dstq +strideq*1-1]
    movq            xm10, [dstq +stride3q -1]
    movq            xm11, [dst4q+strideq*1-1]
    pinsrd          xm5, [dstq +strideq*0+1], 1
    movhps          xm6, [dstq +strideq*2-1]
    movhps          xm10, [dst4q+strideq*0-1]
    movhps          xm11, [dst4q+strideq*2-1]
    psrldq          xm9, xm6, 2
    shufps          xm5, xm9, q2010             ; -1 +0 +1 +2
    shufps          xm6, xm10, q2020            ; +1 +2 +3 +4
    psrldq          xm9, xm11, 2
    psrldq          xm10, 2
    shufps          xm10, xm9, q2020            ; +3 +4 +5 +6
    movd            xm9, [dst4q+stride3q -1]
    pinsrd          xm9, [botq -1], 1
    shufps          xm11, xm9, q1020            ; +5 +6 +7 +8
    pmovzxbw        m9, [leftq+3]
    vinserti128     m6, xm11, 1
    movu            m11, [blend_4x8_0+4]
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m9, m11                 ; merge left-edge bytes under the mask
 %endif
%else ; %1 != 4
    lea             r13, [blend_8x8_0+16]
    movq            xm5, [top2q +1]
    vbroadcasti128  m10, [dstq+strideq*1-1]
    vbroadcasti128  m11, [dstq+strideq*2-1]
    movhps          xm5, [dstq+strideq*0+1]
    vinserti128     m6, m10, [dstq+stride3q-1], 1
    vinserti128     m9, m11, [botq -1], 1
    psrldq          m10, 2
    psrldq          m11, 2
    punpcklqdq      m6, m9
    movu            m9, [r13+hq*2*1+16*1]       ; mask row selected by hq
    punpcklqdq      m10, m11
    vpblendd        m5, m10, 0xF0
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
%endif
    ret
.d1k0:
.d2k0:
.d3k0:
;-----------------------------------------------------------------------
; Shared body for the alias labels ending at .d3k0 (jump-table targets;
; entered via call, returns with `ret`). Names presumably encode
; direction / tap k = 0 -- confirm at the call site, outside this view.
; Loads rows at x-1 into m6 and the same rows shifted by 2 bytes into m5,
; then merges left-edge data into m6 under a byte mask.
; Scratch: m9-m12.
;-----------------------------------------------------------------------
%if %1 == 4
 %if %2 == 4
    movq            xm6, [dstq+strideq*0-1]
    movq            xm9, [dstq+strideq*1-1]
    vinserti128     m6, [dstq+strideq*2-1], 1
    vinserti128     m9, [dstq+stride3q -1], 1
    movu            m11, [rsp+gprsize+0x10]
    pcmpeqd         m12, m12
    psrldq          m5, m6, 2
    psrldq          m10, m9, 2
    psrld           m12, 24                     ; m12 = 0x000000ff in every dword
    punpckldq       m6, m9
    punpckldq       m5, m10
    vpblendvb       m6, m11, m12
 %else ; %2 != 4
    movq            xm6, [dstq +strideq*0-1]
    movq            xm9, [dstq +strideq*2-1]
    movhps          xm6, [dstq +strideq*1-1]
    movhps          xm9, [dstq +stride3q -1]
    movq            xm10, [dst4q+strideq*0-1]
    movhps          xm10, [dst4q+strideq*1-1]
    psrldq          xm5, xm6, 2
    psrldq          xm11, xm9, 2
    shufps          xm5, xm11, q2020
    movq            xm11, [dst4q+strideq*2-1]
    movhps          xm11, [dst4q+stride3q -1]
    shufps          xm6, xm9, q2020
    shufps          xm9, xm10, xm11, q2020
    vinserti128     m6, xm9, 1
    pmovzxbw        m9, [leftq+1]
    psrldq          xm10, 2
    psrldq          xm11, 2
    shufps          xm10, xm11, q2020
    vpbroadcastd    m11, [blend_4x8_0+4]
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m9, m11
 %endif
%else ; %1 != 4
    movu            xm5, [dstq+strideq*0-1]
    movu            xm9, [dstq+strideq*1-1]
    vinserti128     m5, [dstq+strideq*2-1], 1
    vinserti128     m9, [dstq+stride3q -1], 1
    movu            m10, [blend_8x8_0+16]
    punpcklqdq      m6, m5, m9
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64], m10
    psrldq          m5, 2
    psrldq          m9, 2
    punpcklqdq      m5, m9
%endif
    ret

;-----------------------------------------------------------------------
; .d4k0 -- jump-table target; entered via call. Reads top/dst rows at
; x-1 into m6 and dst/bot rows at x+1 into m5, then merges left-edge
; data into m6 under a byte mask.
; Scratch: m9-m12 (4x4 branch), m9-m11 (4-wide %else), m9-m11 and r13
; (other variant).
;-----------------------------------------------------------------------
.d4k0:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq    m10, [dstq+strideq*1-1]
    vpbroadcastq    m11, [dstq+strideq*2-1]
    movd            xm6, [topq+strideq*1-1]
    movd            xm9, [dstq+strideq*0-1]
    psrldq          m5, m10, 2
    psrldq          m12, m11, 2
    vpblendd        m6, m10, 0x10
    vpblendd        m9, m11, 0x10
    movu            m10, [blend_4x4]
    vinserti128     m5, [dstq+stride3q +1], 1
    vinserti128     m12, [botq +1], 1
    punpckldq       m6, m9
    punpckldq       m5, m12
    vpblendvb       m6, [rsp+gprsize+0x40], m10
 %else ; %2 != 4
    movd            xm6, [topq +strideq*1-1]
    movq            xm9, [dstq +strideq*1-1]
    movq            xm10, [dstq +stride3q -1]
    movq            xm11, [dst4q+strideq*1-1]
    pinsrd          xm6, [dstq +strideq*0-1], 1
    movhps          xm9, [dstq +strideq*2-1]
    movhps          xm10, [dst4q+strideq*0-1]
    movhps          xm11, [dst4q+strideq*2-1]
    psrldq          xm5, xm9, 2
    shufps          xm6, xm9, q2010
    psrldq          xm9, xm10, 2
    shufps          xm5, xm9, q2020
    shufps          xm10, xm11, q2020
    movd            xm9, [dst4q+stride3q +1]
    vinserti128     m6, xm10, 1
    pinsrd          xm9, [botq +1], 1
    psrldq          xm11, 2
    pmovzxbw        m10, [leftq-1]
    shufps          xm11, xm9, q1020
    movu            m9, [blend_4x8_0]
    vinserti128     m5, xm11, 1
    vpblendvb       m6, m10, m9
 %endif
%else ; %1 != 4
    lea             r13, [blend_8x8_0+8]
    movq            xm6, [top2q -1]
    vbroadcasti128  m5, [dstq+strideq*1-1]
    vbroadcasti128  m9, [dstq+strideq*2-1]
    movhps          xm6, [dstq+strideq*0-1]
    movu            m11, [r13+hq*2*1+16*1]      ; mask row selected by hq
    punpcklqdq      m10, m5, m9
    vinserti128     m5, [dstq+stride3q -1], 1
    vinserti128     m9, [botq -1], 1
    vpblendd        m6, m10, 0xF0
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
    psrldq          m5, 2
    psrldq          m9, 2
    punpcklqdq      m5, m9
%endif
    ret

;-----------------------------------------------------------------------
; .d5k0/.d6k0/.d7k0 -- three jump-table labels sharing one body; entered
; via call. Reads at x+0: top/dst rows 0..2 go to m6 and dst rows 1..3
; plus bot go to m5 (the 4-wide %else branch also reads dst4 rows).
; No left-edge blend is done here.
; Scratch: m9 (all variants), m10-m11 (4-wide %else branch).
;-----------------------------------------------------------------------
.d5k0:
.d6k0:
.d7k0:
%if %1 == 4
 %if %2 == 4
    movd            xm6, [topq+strideq*1 ]
    vpbroadcastd    m5, [dstq+strideq*1 ]
    vpbroadcastd    m9, [dstq+strideq*2 ]
    vpblendd        xm6, [dstq+strideq*0-4], 0x2
    vpblendd        m5, m9, 0x22
    vpblendd        m6, m5, 0x30
    vinserti128     m5, [dstq+stride3q ], 1
    vpblendd        m5, [botq -20], 0x20        ; dword 5 of the load = the 4 bytes at botq
 %else ; %2 != 4
    movd            xm6, [topq +strideq*1]
    movd            xm5, [dstq +strideq*1]
    movd            xm9, [dstq +stride3q ]
    movd            xm10, [dst4q+strideq*1]
    movd            xm11, [dst4q+stride3q ]
    pinsrd          xm6, [dstq +strideq*0], 1
    pinsrd          xm5, [dstq +strideq*2], 1
    pinsrd          xm9, [dst4q+strideq*0], 1
    pinsrd          xm10, [dst4q+strideq*2], 1
    pinsrd          xm11, [botq ], 1
    punpcklqdq      xm6, xm5
    punpcklqdq      xm5, xm9
    punpcklqdq      xm9, xm10
    punpcklqdq      xm10, xm11
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
 %endif
%else ; %1 != 4
    movq            xm6, [top2q ]
    movq            xm5, [dstq+strideq*1]
    movq            xm9, [dstq+stride3q ]
    movhps          xm6, [dstq+strideq*0]
    movhps          xm5, [dstq+strideq*2]
    movhps          xm9, [botq ]
    vinserti128     m6, xm5, 1
    vinserti128     m5, xm9, 1
%endif
    ret

;-----------------------------------------------------------------------
; .d0k1 -- jump-table target; entered via call. Reads top/dst rows at
; x+2 into m5 and dst/bot rows at x-2 into m6, inserting left-edge data
; into m6 (pinsrw from leftq, or a masked blend from the stack copy).
; Scratch: m9-m10 (4x4 branch), m9-m11 (4-wide %else), m9-m11 and r13
; (other variant).
;-----------------------------------------------------------------------
.d0k1:
%if %1 == 4
 %if %2 == 4
    movd            xm6, [dstq+strideq*2-2]
    movd            xm9, [dstq+stride3q -2]
    movd            xm5, [topq+strideq*0+2]
    movd            xm10, [topq+strideq*1+2]
    pinsrw          xm6, [leftq+4], 0
    pinsrw          xm9, [leftq+6], 0
    vinserti128     m5, [dstq+strideq*0+2], 1
    vinserti128     m10, [dstq+strideq*1+2], 1
    vinserti128     m6, [botq+strideq*0-2], 1
    vinserti128     m9, [botq+strideq*1-2], 1
    punpckldq       m5, m10
    punpckldq       m6, m9
 %else ; %2 != 4
    movq            xm6, [dstq +strideq*2-2]
    movd            xm10, [dst4q+strideq*2-2]
    movd            xm5, [topq +strideq*0+2]
    movq            xm9, [dst4q+strideq*0-2]
    movhps          xm6, [dstq +stride3q -2]
    pinsrw          xm10, [dst4q+stride3q ], 3
    pinsrd          xm5, [topq +strideq*1+2], 1
    movhps          xm9, [dst4q+strideq*1-2]
    pinsrd          xm10, [botq +strideq*0-2], 2
    pinsrd          xm5, [dstq +strideq*0+2], 2
    pinsrd          xm10, [botq +strideq*1-2], 3
    pinsrd          xm5, [dstq +strideq*1+2], 3
    shufps          xm11, xm6, xm9, q3131
    shufps          xm6, xm9, q2020
    movu            m9, [blend_4x8_3+8]
    vinserti128     m6, xm10, 1
    vinserti128     m5, xm11, 1
    vpblendvb       m6, [rsp+gprsize+0x10+8], m9
 %endif
%else ; %1 != 4
    lea             r13, [blend_8x8_1+16]
    movq            xm6, [dstq+strideq*2-2]
    movq            xm9, [dstq+stride3q -2]
    movq            xm5, [top1q +2]
    movq            xm10, [top2q +2]
    movu            m11, [r13+hq*2*2+16*2]      ; mask row selected by hq
    vinserti128     m6, [botq+strideq*0-2], 1
    vinserti128     m9, [botq+strideq*1-2], 1
    vinserti128     m5, [dstq+strideq*0+2], 1
    vinserti128     m10, [dstq+strideq*1+2], 1
    punpcklqdq      m6, m9
    punpcklqdq      m5, m10
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
%endif
    ret

;-----------------------------------------------------------------------
; .d1k1 -- jump-table target; entered via call. Reads top/dst rows at
; x+2 into m5 and dst/bot rows at x-2 into m6, then merges left-edge
; data into m6 under a mask.
; Scratch: m9-m12 (4x4 branch), m9-m11 (4-wide %else), m9-m11 and r13
; (other variant).
;-----------------------------------------------------------------------
.d1k1:
%if %1 == 4
 %if %2 == 4
    vpbroadcastq    m6, [dstq+strideq*1-2]
    vpbroadcastq    m9, [dstq+strideq*2-2]
    movd            xm5, [topq+strideq*1+2]
    movd            xm10, [dstq+strideq*0+2]
    psrldq          m11, m6, 4
    psrldq          m12, m9, 4
    vpblendd        m5, m11, 0x10
    movq            xm11, [leftq+2]
    vinserti128     m6, [dstq+stride3q-2], 1
    punpckldq       xm11, xm11
    vpblendd        m10, m12, 0x10
    pcmpeqd         m12, m12
    pmovzxwd        m11, xm11
    psrld           m12, 16                     ; m12 = 0x0000ffff in every dword
    punpckldq       m6, m9
    vpbroadcastd    m9, [botq-2]
    vpblendvb       m6, m11, m12
    punpckldq       m5, m10
    vpblendd        m6, m9, 0x20
 %else ; %2 != 4
    movd            xm5, [topq +strideq*1+2]
    movq            xm6, [dstq +strideq*1-2]
    movq            xm9, [dstq +stride3q -2]
    movq            xm10, [dst4q+strideq*1-2]
    movd            xm11, [dst4q+stride3q -2]
    pinsrd          xm5, [dstq +strideq*0+2], 1
    movhps          xm6, [dstq +strideq*2-2]
    movhps          xm9, [dst4q+strideq*0-2]
    movhps          xm10, [dst4q+strideq*2-2]
    pinsrd          xm11, [botq -2], 1
    shufps          xm5, xm6, q3110
    shufps          xm6, xm9, q2020
    shufps          xm9, xm10, q3131
    shufps          xm10, xm11, q1020
    movu            m11, [blend_4x8_2+4]
    vinserti128     m6, xm10, 1
    vinserti128     m5, xm9, 1
    vpblendvb       m6, [rsp+gprsize+0x10+4], m11
 %endif
%else ; %1 != 4
    lea             r13, [blend_8x8_1+16]
    movq            xm5, [top2q +2]
    vbroadcasti128  m6, [dstq+strideq*1-2]
    vbroadcasti128  m9, [dstq+strideq*2-2]
    movhps          xm5, [dstq+strideq*0+2]
    shufps          m10, m6, m9, q2121
    vinserti128     m6, [dstq+stride3q -2], 1
    vinserti128     m9, [botq -2], 1
    movu            m11, [r13+hq*2*1+16*1]      ; mask row selected by hq
    vpblendd        m5, m10, 0xF0
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
%endif
    ret
Build Coastguard Worker.d2k1: 893*c0909341SAndroid Build Coastguard Worker%if %1 == 4 894*c0909341SAndroid Build Coastguard Worker %if %2 == 4 895*c0909341SAndroid Build Coastguard Worker movq xm11, [leftq] 896*c0909341SAndroid Build Coastguard Worker movq xm6, [dstq+strideq*0-2] 897*c0909341SAndroid Build Coastguard Worker movq xm9, [dstq+strideq*1-2] 898*c0909341SAndroid Build Coastguard Worker vinserti128 m6, [dstq+strideq*2-2], 1 899*c0909341SAndroid Build Coastguard Worker vinserti128 m9, [dstq+stride3q -2], 1 900*c0909341SAndroid Build Coastguard Worker punpckldq xm11, xm11 901*c0909341SAndroid Build Coastguard Worker psrldq m5, m6, 4 902*c0909341SAndroid Build Coastguard Worker psrldq m10, m9, 4 903*c0909341SAndroid Build Coastguard Worker pmovzxwd m11, xm11 904*c0909341SAndroid Build Coastguard Worker punpckldq m6, m9 905*c0909341SAndroid Build Coastguard Worker punpckldq m5, m10 906*c0909341SAndroid Build Coastguard Worker pblendw m6, m11, 0x05 907*c0909341SAndroid Build Coastguard Worker %else 908*c0909341SAndroid Build Coastguard Worker movq xm5, [dstq +strideq*0-2] 909*c0909341SAndroid Build Coastguard Worker movq xm9, [dstq +strideq*2-2] 910*c0909341SAndroid Build Coastguard Worker movq xm10, [dst4q+strideq*0-2] 911*c0909341SAndroid Build Coastguard Worker movq xm11, [dst4q+strideq*2-2] 912*c0909341SAndroid Build Coastguard Worker movhps xm5, [dstq +strideq*1-2] 913*c0909341SAndroid Build Coastguard Worker movhps xm9, [dstq +stride3q -2] 914*c0909341SAndroid Build Coastguard Worker movhps xm10, [dst4q+strideq*1-2] 915*c0909341SAndroid Build Coastguard Worker movhps xm11, [dst4q+stride3q -2] 916*c0909341SAndroid Build Coastguard Worker shufps xm6, xm5, xm9, q2020 917*c0909341SAndroid Build Coastguard Worker shufps xm5, xm9, q3131 918*c0909341SAndroid Build Coastguard Worker shufps xm9, xm10, xm11, q2020 919*c0909341SAndroid Build Coastguard Worker shufps xm10, xm11, q3131 920*c0909341SAndroid Build Coastguard Worker pmovzxwd m11, [leftq] 
921*c0909341SAndroid Build Coastguard Worker vinserti128 m6, xm9, 1 922*c0909341SAndroid Build Coastguard Worker vinserti128 m5, xm10, 1 923*c0909341SAndroid Build Coastguard Worker pblendw m6, m11, 0x55 924*c0909341SAndroid Build Coastguard Worker %endif 925*c0909341SAndroid Build Coastguard Worker%else 926*c0909341SAndroid Build Coastguard Worker mova m11, [rsp+gprsize+0x20+hq*8+64] 927*c0909341SAndroid Build Coastguard Worker movu xm5, [dstq+strideq*0-2] 928*c0909341SAndroid Build Coastguard Worker movu xm9, [dstq+strideq*1-2] 929*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [dstq+strideq*2-2], 1 930*c0909341SAndroid Build Coastguard Worker vinserti128 m9, [dstq+stride3q -2], 1 931*c0909341SAndroid Build Coastguard Worker shufps m6, m5, m9, q1010 932*c0909341SAndroid Build Coastguard Worker shufps m5, m9, q2121 933*c0909341SAndroid Build Coastguard Worker pblendw m6, m11, 0x11 934*c0909341SAndroid Build Coastguard Worker%endif 935*c0909341SAndroid Build Coastguard Worker ret 936*c0909341SAndroid Build Coastguard Worker.d3k1: 937*c0909341SAndroid Build Coastguard Worker%if %1 == 4 938*c0909341SAndroid Build Coastguard Worker %if %2 == 4 939*c0909341SAndroid Build Coastguard Worker vpbroadcastq m11, [dstq+strideq*1-2] 940*c0909341SAndroid Build Coastguard Worker vpbroadcastq m12, [dstq+strideq*2-2] 941*c0909341SAndroid Build Coastguard Worker movd xm6, [topq+strideq*1-2] 942*c0909341SAndroid Build Coastguard Worker movd xm9, [dstq+strideq*0-2] 943*c0909341SAndroid Build Coastguard Worker pblendw m11, [leftq-16+2], 0x01 944*c0909341SAndroid Build Coastguard Worker pblendw m12, [leftq-16+4], 0x01 945*c0909341SAndroid Build Coastguard Worker pinsrw xm9, [leftq- 0+0], 0 946*c0909341SAndroid Build Coastguard Worker psrldq m5, m11, 4 947*c0909341SAndroid Build Coastguard Worker psrldq m10, m12, 4 948*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [dstq+stride3q +2], 1 949*c0909341SAndroid Build Coastguard Worker vinserti128 m10, [botq +2], 1 
; --- .d3k1, continued: end of the %1 == 4 / %2 == 4 branch ------------------
    vpblendd        m6, m11, 0x10               ; dword 4 (upper lane, dword 0) <- m11
    vpblendd        m9, m12, 0x10
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    ; %1 == 4, %2 != 4: rows below the first four are addressed through dst4q
    movd            xm6, [topq +strideq*1-2]
    movq            xm5, [dstq +strideq*1-2]
    movq            xm9, [dstq +stride3q -2]
    movq            xm10, [dst4q+strideq*1-2]
    movd            xm11, [dst4q+stride3q +2]
    pinsrw          xm6, [dstq +strideq*0 ], 3
    movhps          xm5, [dstq +strideq*2-2]
    movhps          xm9, [dst4q+strideq*0-2]
    movhps          xm10, [dst4q+strideq*2-2]
    pinsrd          xm11, [botq +2], 1
    shufps          xm6, xm5, q2010
    shufps          xm5, xm9, q3131
    shufps          xm9, xm10, q2020
    shufps          xm10, xm11, q1031
    movu            m11, [blend_4x8_2]          ; byte-select mask for vpblendvb
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-4], m11
 %endif
%else
    ; %1 != 4
    lea             r13, [blend_8x8_1+8]
    movq            xm6, [top2q -2]
    vbroadcasti128  m5, [dstq+strideq*1-2]
    vbroadcasti128  m10, [dstq+strideq*2-2]
    movhps          xm6, [dstq+strideq*0-2]
    punpcklqdq      m9, m5, m10
    vinserti128     m5, [dstq+stride3q -2], 1
    vinserti128     m10, [botq -2], 1
    movu            m11, [r13+hq*2*1+16*1]      ; blend mask, indexed by hq
    vpblendd        m6, m9, 0xF0                ; upper lane <- m9
    shufps          m5, m10, q2121
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
%endif
    ret

; --- .d4k1 ------------------------------------------------------------------
; Gather routine with the same shape as its dNkM siblings: it assembles m5
; and m6 from dst/top/bot rows and the left column and ends with ret.
; NOTE(review): presumably "direction 4, tap k=1" -- confirm with the caller.
.d4k1:
%if %1 == 4
 %if %2 == 4
    ; NOTE(review): only the upper lanes of m6/m9 are written before the
    ; blends below; the lower lanes are filled by the pblendw/vpblendd pair.
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
    movd            xm5, [dstq+strideq*2+2]
    movd            xm10, [dstq+stride3q +2]
    pblendw         m6, [leftq-16+0], 0x01      ; upper-lane word 0 <- [leftq+0]
    pblendw         m9, [leftq-16+2], 0x01      ; upper-lane word 0 <- [leftq+2]
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128     m10, [botq+strideq*1+2], 1
    vpblendd        m6, [topq+strideq*0-2], 0x01 ; dword 0 <- top row 0
    vpblendd        m9, [topq+strideq*1-2], 0x01 ; dword 0 <- top row 1
    punpckldq       m5, m10
    punpckldq       m6, m9
 %else
    ; %1 == 4, %2 != 4
    movd            xm6, [topq +strideq*0-2]
    movq            xm5, [dstq +strideq*2-2]
    movq            xm9, [dst4q+strideq*0-2]
    movd            xm10, [dst4q+strideq*2+2]
    pinsrd          xm6, [topq +strideq*1-2], 1
    movhps          xm5, [dstq +stride3q -2]
    movhps          xm9, [dst4q+strideq*1-2]
    pinsrd          xm10, [dst4q+stride3q +2], 1
    pinsrd          xm6, [dstq +strideq*0-2], 2
    pinsrd          xm10, [botq +strideq*0+2], 2
    pinsrd          xm6, [dstq +strideq*1-2], 3
    pinsrd          xm10, [botq +strideq*1+2], 3
    shufps          xm11, xm5, xm9, q2020
    shufps          xm5, xm9, q3131
    movu            m9, [blend_4x8_3]           ; byte-select mask for vpblendvb
    vinserti128     m6, xm11, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-8], m9
 %endif
%else
    ; %1 != 4
    lea             r13, [blend_8x8_1]
    movu            m11, [r13+hq*2*2+16*2]      ; blend mask, indexed by hq
    movq            xm6, [top1q -2]
    movq            xm9, [top2q -2]
    movq            xm5, [dstq+strideq*2+2]
    movq            xm10, [dstq+stride3q +2]
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
; --- .d4k1, continued: end of the %1 != 4 branch ----------------------------
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128     m10, [botq+strideq*1+2], 1
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
    punpcklqdq      m5, m10
%endif
    ret

; --- .d5k1 ------------------------------------------------------------------
; Gather routine: assembles m5 and m6 from dst/top/bot rows (byte offsets
; -1/+1 here) and the left column, then returns; m9-m12 are scratch.
; NOTE(review): presumably "direction 5, tap k=1" -- confirm with the caller.
.d5k1:
%if %1 == 4
 %if %2 == 4
    movd            xm6, [topq+strideq*0-1]
    movd            xm9, [topq+strideq*1-1]
    movd            xm5, [dstq+strideq*2+1]
    movd            xm10, [dstq+stride3q +1]
    pcmpeqd         m12, m12                    ; all ones
    pmovzxbw        m11, [leftq-8+1]
    psrld           m12, 24                     ; 0x000000FF in every dword
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128     m10, [botq+strideq*1+1], 1
    punpckldq       m6, m9
    pxor            m9, m9
    vpblendd        m12, m9, 0x0F               ; clear the mask in the lower lane
    punpckldq       m5, m10
    ; m12 now selects byte 0 of every dword in the upper lane only
    vpblendvb       m6, m11, m12
 %else
    ; %1 == 4, %2 != 4
    movd            xm6, [topq +strideq*0-1]
    movq            xm5, [dstq +strideq*2-1]
    movq            xm9, [dst4q+strideq*0-1]
    movd            xm10, [dst4q+strideq*2+1]
    pinsrd          xm6, [topq +strideq*1-1], 1
    movhps          xm5, [dstq +stride3q -1]
    movhps          xm9, [dst4q+strideq*1-1]
    pinsrd          xm10, [dst4q+stride3q +1], 1
    pinsrd          xm6, [dstq +strideq*0-1], 2
    pinsrd          xm10, [botq +strideq*0+1], 2
    pinsrd          xm6, [dstq +strideq*1-1], 3
    pinsrd          xm10, [botq +strideq*1+1], 3
    shufps          xm11, xm5, xm9, q2020
    vinserti128     m6, xm11, 1
    pmovzxbw        m11, [leftq-3]
    psrldq          xm5, 2
    psrldq          xm9, 2
    shufps          xm5, xm9, q2020
    movu            m9, [blend_4x8_1]           ; byte-select mask for vpblendvb
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m11, m9
 %endif
%else
    ; %1 != 4
    lea             r13, [blend_8x8_0]
    movu            m11, [r13+hq*2*2+16*2]      ; blend mask, indexed by hq
    movq            xm6, [top1q -1]
    movq            xm9, [top2q -1]
    movq            xm5, [dstq+strideq*2+1]
    movq            xm10, [dstq+stride3q +1]
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128     m10, [botq+strideq*1+1], 1
    punpcklqdq      m6, m9
    punpcklqdq      m5, m10
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
%endif
    ret

; --- .d6k1 ------------------------------------------------------------------
; Gather routine: assembles m5 and m6 from top/dst/bot rows at byte offset 0;
; no left-column data or blend mask is used in any of its branches.
; NOTE(review): presumably "direction 6, tap k=1" -- confirm with the caller.
.d6k1:
%if %1 == 4
 %if %2 == 4
    movd            xm6, [topq+strideq*0]
    movd            xm9, [topq+strideq*1]
    movd            xm5, [dstq+strideq*2]
    movd            xm10, [dstq+stride3q ]
    vinserti128     m6, [dstq+strideq*0], 1
    vinserti128     m9, [dstq+strideq*1], 1
    vinserti128     m5, [botq+strideq*0], 1
    vinserti128     m10, [botq+strideq*1], 1
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    ; %1 == 4, %2 != 4
    movd            xm5, [dstq +strideq*2]
    movd            xm6, [topq +strideq*0]
    movd            xm9, [dst4q+strideq*2]
    pinsrd          xm5, [dstq +stride3q ], 1
    pinsrd          xm6, [topq +strideq*1], 1
; --- .d6k1, continued: rest of the %1 == 4 / %2 != 4 branch -----------------
    pinsrd          xm9, [dst4q+stride3q ], 1
    pinsrd          xm5, [dst4q+strideq*0], 2
    pinsrd          xm6, [dstq +strideq*0], 2
    pinsrd          xm9, [botq +strideq*0], 2
    pinsrd          xm5, [dst4q+strideq*1], 3
    pinsrd          xm6, [dstq +strideq*1], 3
    pinsrd          xm9, [botq +strideq*1], 3
    vinserti128     m6, xm5, 1
    vinserti128     m5, xm9, 1
 %endif
%else
    ; %1 != 4: two 8-byte rows per xmm, then pair the xmm halves
    movq            xm5, [dstq+strideq*2]
    movq            xm9, [botq+strideq*0]
    movq            xm6, [top1q ]
    movq            xm10, [dstq+strideq*0]
    movhps          xm5, [dstq+stride3q ]
    movhps          xm9, [botq+strideq*1]
    movhps          xm6, [top2q ]
    movhps          xm10, [dstq+strideq*1]
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
%endif
    ret

; --- .d7k1 ------------------------------------------------------------------
; Gather routine: assembles m5 and m6 from dst/top/bot rows (byte offsets
; +1/-1 here) and the left column, then returns. In this routine the
; left-column bytes are merged into m5.
; NOTE(review): presumably "direction 7, tap k=1" -- confirm with the caller.
.d7k1:
%if %1 == 4
 %if %2 == 4
    movd            xm5, [dstq+strideq*2-1]
    movd            xm9, [dstq+stride3q -1]
    movd            xm6, [topq+strideq*0+1]
    movd            xm10, [topq+strideq*1+1]
    pinsrb          xm5, [leftq+ 5], 0          ; byte 0 <- left column
    pinsrb          xm9, [leftq+ 7], 0
    vinserti128     m6, [dstq+strideq*0+1], 1
    vinserti128     m10, [dstq+strideq*1+1], 1
    vinserti128     m5, [botq+strideq*0-1], 1
    vinserti128     m9, [botq+strideq*1-1], 1
    punpckldq       m6, m10
    punpckldq       m5, m9
 %else
    ; %1 == 4, %2 != 4
    movd            xm6, [topq +strideq*0+1]
    movq            xm9, [dstq +strideq*2-1]
    movq            xm10, [dst4q+strideq*0-1]
    movd            xm11, [dst4q+strideq*2-1]
    pinsrd          xm6, [topq +strideq*1+1], 1
    movhps          xm9, [dstq +stride3q -1]
    movhps          xm10, [dst4q+strideq*1-1]
    pinsrd          xm11, [dst4q+stride3q -1], 1
    pinsrd          xm6, [dstq +strideq*0+1], 2
    pinsrd          xm11, [botq +strideq*0-1], 2
    pinsrd          xm6, [dstq +strideq*1+1], 3
    pinsrd          xm11, [botq +strideq*1-1], 3
    shufps          xm5, xm9, xm10, q2020
    vinserti128     m5, xm11, 1
    pmovzxbw        m11, [leftq+5]
    psrldq          xm9, 2
; --- .d7k1, continued: end of the %1 == 4 / %2 != 4 branch, then %1 != 4 ----
    psrldq          xm10, 2
    shufps          xm9, xm10, q2020
    movu            m10, [blend_4x8_1+8]        ; byte-select mask for vpblendvb
    vinserti128     m6, xm9, 1
    vpblendvb       m5, m11, m10
 %endif
%else
    ; %1 != 4
    lea             r13, [blend_8x8_0+16]
    movq            xm5, [dstq+strideq*2-1]
    movq            xm9, [botq+strideq*0-1]
    movq            xm6, [top1q +1]
    movq            xm10, [dstq+strideq*0+1]
    movhps          xm5, [dstq+stride3q -1]
    movhps          xm9, [botq+strideq*1-1]
    movhps          xm6, [top2q +1]
    movhps          xm10, [dstq+strideq*1+1]
    movu            m11, [r13+hq*2*2+16*2]      ; blend mask, indexed by hq
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
    vpblendvb       m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
%endif
    ret

; --- .border_block ----------------------------------------------------------
; Entry of the path that tests the edge flags (have_left/right/top/bottom)
; and copies the block and its available neighbours into a stack buffer
; before filtering. Register names are re-declared, and x86inc's stack
; bookkeeping (stack_offset / regs_used) is adjusted to 11 GPRs before the
; stack allocation that follows.
.border_block:
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
    RESET_STACK_STATE
    %assign stack_offset stack_offset - (regs_used - 11) * gprsize
    %assign regs_used 11
; --- .border_block: stack buffer and block body -----------------------------
; Stack layout: 2*16 bytes of scratch at rsp ([rsp+0] and [rsp+8] later hold
; pri_shift/sec_shift), followed by %2+4 rows of 32 bytes. px points at the
; first block row, so rows px-2*32 and px-1*32 hold the top neighbours and
; rows px+(%2+0)*32, px+(%2+1)*32 the bottom ones. Pixels are stored as
; 16-bit words.
    ALLOC_STACK     2*16+(%2+4)*32, 16
%define px rsp+2*16+2*32

    ; m14 = 0x8000 in every word: written wherever a neighbour is missing
    pcmpeqw         m14, m14
    psllw           m14, 15 ; 0x8000

    ; prepare pixel buffers - body/right
%if %1 == 4
    INIT_XMM avx2
%endif
%if %2 == 8
    lea             dst4q, [dstq+strideq*4]
%endif
    lea             stride3q, [strideq*3]
    test            edgeb, 2 ; have_right
    jz .no_right
    ; right neighbours exist: one widening load per row covers the block
    ; and the pixels to its right
    pmovzxbw        m1, [dstq+strideq*0]
    pmovzxbw        m2, [dstq+strideq*1]
    pmovzxbw        m3, [dstq+strideq*2]
    pmovzxbw        m4, [dstq+stride3q]
    mova            [px+0*32], m1
    mova            [px+1*32], m2
    mova            [px+2*32], m3
    mova            [px+3*32], m4
%if %2 == 8
    pmovzxbw        m1, [dst4q+strideq*0]
    pmovzxbw        m2, [dst4q+strideq*1]
    pmovzxbw        m3, [dst4q+strideq*2]
    pmovzxbw        m4, [dst4q+stride3q]
    mova            [px+4*32], m1
    mova            [px+5*32], m2
    mova            [px+6*32], m3
    mova            [px+7*32], m4
%endif
    jmp .body_done
.no_right:
    ; no right neighbours: load exactly %1 pixels per row, then store the
    ; 0x8000 marker in the two words right of each row
%if %1 == 4
    movd            xm1, [dstq+strideq*0]
    movd            xm2, [dstq+strideq*1]
    movd            xm3, [dstq+strideq*2]
    movd            xm4, [dstq+stride3q]
    pmovzxbw        xm1, xm1
    pmovzxbw        xm2, xm2
    pmovzxbw        xm3, xm3
    pmovzxbw        xm4, xm4
    movq            [px+0*32], xm1
    movq            [px+1*32], xm2
    movq            [px+2*32], xm3
    movq            [px+3*32], xm4
%else
    pmovzxbw        xm1, [dstq+strideq*0]
    pmovzxbw        xm2, [dstq+strideq*1]
    pmovzxbw        xm3, [dstq+strideq*2]
    pmovzxbw        xm4, [dstq+stride3q]
    mova            [px+0*32], xm1
    mova            [px+1*32], xm2
    mova            [px+2*32], xm3
    mova            [px+3*32], xm4
%endif
    movd            [px+0*32+%1*2], xm14
; --- .no_right, continued: 0x8000 markers right of rows 1-3, then rows 4-7 --
    movd            [px+1*32+%1*2], xm14
    movd            [px+2*32+%1*2], xm14
    movd            [px+3*32+%1*2], xm14
%if %2 == 8
 %if %1 == 4
    movd            xm1, [dst4q+strideq*0]
    movd            xm2, [dst4q+strideq*1]
    movd            xm3, [dst4q+strideq*2]
    movd            xm4, [dst4q+stride3q]
    pmovzxbw        xm1, xm1
    pmovzxbw        xm2, xm2
    pmovzxbw        xm3, xm3
    pmovzxbw        xm4, xm4
    movq            [px+4*32], xm1
    movq            [px+5*32], xm2
    movq            [px+6*32], xm3
    movq            [px+7*32], xm4
 %else
    pmovzxbw        xm1, [dst4q+strideq*0]
    pmovzxbw        xm2, [dst4q+strideq*1]
    pmovzxbw        xm3, [dst4q+strideq*2]
    pmovzxbw        xm4, [dst4q+stride3q]
    mova            [px+4*32], xm1
    mova            [px+5*32], xm2
    mova            [px+6*32], xm3
    mova            [px+7*32], xm4
 %endif
    movd            [px+4*32+%1*2], xm14
    movd            [px+5*32+%1*2], xm14
    movd            [px+6*32+%1*2], xm14
    movd            [px+7*32+%1*2], xm14
%endif
.body_done:

    ; top
    ; Fill the two rows above the block (px-2*32, px-1*32). Four cases by
    ; have_left/have_right; missing pixels get the 0x8000 marker from m14.
    test            edgeb, 4 ; have_top
    jz .no_top
    test            edgeb, 1 ; have_left
    jz .top_no_left
    test            edgeb, 2 ; have_right
    jz .top_no_right
    ; left and right both present: start the load %1/2 pixels left of the
    ; block so both sides come in with a single load per row
    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
    movu            [px-2*32-%1], m1
    movu            [px-1*32-%1], m2
    jmp .top_done
.top_no_right:
    ; left only: start %1 pixels to the left, then mark the right side
    pmovzxbw        m1, [topq+strideq*0-%1]
    pmovzxbw        m2, [topq+strideq*1-%1]
    movu            [px-2*32-%1*2], m1
    movu            [px-1*32-%1*2], m2
    movd            [px-2*32+%1*2], xm14
    movd            [px-1*32+%1*2], xm14
    jmp .top_done
.top_no_left:
    test            edgeb, 2 ; have_right
    jz .top_no_left_right
    ; right only: load from the block's own column onwards
    pmovzxbw        m1, [topq+strideq*0]
    pmovzxbw        m2, [topq+strideq*1]
; --- top rows, "right only" case continued: store rows, mark the left side --
    mova            [px-2*32+0], m1
    mova            [px-1*32+0], m2
    movd            [px-2*32-4], xm14
    movd            [px-1*32-4], xm14
    jmp .top_done
.top_no_left_right:
    ; neither side present: copy exactly %1 pixels per row, mark both sides
%if %1 == 4
    movd            xm1, [topq+strideq*0]
    pinsrd          xm1, [topq+strideq*1], 1
    pmovzxbw        xm1, xm1
    movq            [px-2*32+0], xm1
    movhps          [px-1*32+0], xm1
%else
    pmovzxbw        xm1, [topq+strideq*0]
    pmovzxbw        xm2, [topq+strideq*1]
    mova            [px-2*32+0], xm1
    mova            [px-1*32+0], xm2
%endif
    movd            [px-2*32-4], xm14
    movd            [px-1*32-4], xm14
    movd            [px-2*32+%1*2], xm14
    movd            [px-1*32+%1*2], xm14
    jmp .top_done
.no_top:
    ; no top rows at all: fill both rows with the 0x8000 marker
    movu            [px-2*32-%1], m14
    movu            [px-1*32-%1], m14
.top_done:

    ; left
    ; Each block row has two words immediately to its left (at px+row*32-4).
    ; They receive either two widened bytes from the left buffer or the
    ; 0x8000 marker.
    test            edgeb, 1 ; have_left
    jz .no_left
    pmovzxbw        xm1, [leftq+ 0]             ; left bytes for rows 0-3
%if %2 == 8
    pmovzxbw        xm2, [leftq+ 8]             ; left bytes for rows 4-7
%endif
    movd            [px+0*32-4], xm1
    pextrd          [px+1*32-4], xm1, 1
    pextrd          [px+2*32-4], xm1, 2
    pextrd          [px+3*32-4], xm1, 3
%if %2 == 8
    movd            [px+4*32-4], xm2
    pextrd          [px+5*32-4], xm2, 1
    pextrd          [px+6*32-4], xm2, 2
    pextrd          [px+7*32-4], xm2, 3
%endif
    jmp .left_done
.no_left:
    movd            [px+0*32-4], xm14
    movd            [px+1*32-4], xm14
    movd            [px+2*32-4], xm14
    movd            [px+3*32-4], xm14
%if %2 == 8
    movd            [px+4*32-4], xm14
    movd            [px+5*32-4], xm14
    movd            [px+6*32-4], xm14
    movd            [px+7*32-4], xm14
%endif
.left_done:

    ; bottom
    ; left, top and dst4 are not referenced by name from here on
    DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
; --- bottom rows --------------------------------------------------------------
; Fill the two rows below the block (px+(%2+0)*32, px+(%2+1)*32), using the
; same four have_left/have_right cases as the top rows; missing pixels get
; the 0x8000 marker from m14.
    test            edgeb, 8 ; have_bottom
    jz .no_bottom
    test            edgeb, 1 ; have_left
    jz .bottom_no_left
    test            edgeb, 2 ; have_right
    jz .bottom_no_right
    ; left and right both present: load starts %1/2 pixels left of the block
    pmovzxbw        m1, [botq+strideq*0-(%1/2)]
    pmovzxbw        m2, [botq+strideq*1-(%1/2)]
    movu            [px+(%2+0)*32-%1], m1
    movu            [px+(%2+1)*32-%1], m2
    jmp .bottom_done
.bottom_no_right:
    ; left only: load starts %1 pixels to the left, then mark the right side
    pmovzxbw        m1, [botq+strideq*0-%1]
    pmovzxbw        m2, [botq+strideq*1-%1]
    movu            [px+(%2+0)*32-%1*2], m1
    movu            [px+(%2+1)*32-%1*2], m2
%if %1 == 8
    ; the store above starts %1*2 bytes before the row and so reaches into
    ; the end of the previous row; restore that row's right-side marker
    movd            [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
%endif
    movd            [px+(%2+0)*32+%1*2], xm14
    movd            [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.bottom_no_left:
    test            edgeb, 2 ; have_right
    jz .bottom_no_left_right
    ; right only: load from the block's own column, mark the left side
    pmovzxbw        m1, [botq+strideq*0]
    pmovzxbw        m2, [botq+strideq*1]
    mova            [px+(%2+0)*32+0], m1
    mova            [px+(%2+1)*32+0], m2
    movd            [px+(%2+0)*32-4], xm14
    movd            [px+(%2+1)*32-4], xm14
    jmp .bottom_done
.bottom_no_left_right:
    ; neither side present: copy exactly %1 pixels per row, mark both sides
%if %1 == 4
    movd            xm1, [botq+strideq*0]
    pinsrd          xm1, [botq+strideq*1], 1
    pmovzxbw        xm1, xm1
    movq            [px+(%2+0)*32+0], xm1
    movhps          [px+(%2+1)*32+0], xm1
%else
    pmovzxbw        xm1, [botq+strideq*0]
    pmovzxbw        xm2, [botq+strideq*1]
    mova            [px+(%2+0)*32+0], xm1
    mova            [px+(%2+1)*32+0], xm2
%endif
    movd            [px+(%2+0)*32-4], xm14
    movd            [px+(%2+1)*32-4], xm14
    movd            [px+(%2+0)*32+%1*2], xm14
    movd            [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.no_bottom:
    ; no bottom rows at all: fill both rows with the 0x8000 marker
    movu            [px+(%2+0)*32-%1], m14
    movu            [px+(%2+1)*32-%1], m14
.bottom_done:

    ; actual filter
    INIT_YMM avx2
; --- filter setup for the border path ---------------------------------------
; Computes the shift amounts from the strengths and the damping argument:
;   pri_shift = max(0, lzcnt(pri) + damping - 31)   -> [rsp+0]
;   sec_shift =        lzcnt(sec) + damping - 31    -> [rsp+8]  (no clamp here)
; A zero primary strength branches to .border_sec_only, a zero secondary
; strength to .border_pri_only.
    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
    ; register to shuffle values into after packing
    vbroadcasti128  m12, [shufb_lohi]

    mov             dampingd, r8m
    xor             zerod, zerod
    movifnidn       prid, prim
    sub             dampingd, 31
    movifnidn       secdmpd, secdmpm
    test            prid, prid
    jz .border_sec_only
    movd            xm0, prid
    lzcnt           pridmpd, prid
    add             pridmpd, dampingd
    cmovs           pridmpd, zerod              ; clamp a negative shift to 0
    mov             [rsp+0], pridmpq ; pri_shift
    test            secdmpd, secdmpd
    jz .border_pri_only
    movd            xm1, secdmpd
    lzcnt           secdmpd, secdmpd
    add             secdmpd, dampingd
    mov             [rsp+8], secdmpq ; sec_shift

    DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
    lea             tableq, [tap_table]
    vpbroadcastb    m13, [tableq+pridmpq] ; pri_shift_mask
    vpbroadcastb    m14, [tableq+secdmpq] ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
    DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
    vpbroadcastb    m0, xm0 ; pri_strength
    vpbroadcastb    m1, xm1 ; sec_strength
    and             prid, 1
    lea             priq, [tableq+priq*2+8] ; pri_taps
    lea             secq, [tableq+12] ; sec_taps

    ; NOTE(review): kq, hd and stkq used below are presumably set up by
    ; BORDER_PREP_REGS / BORDER_LOAD_BLOCK -- their definitions are not in
    ; this span.
    BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
    ; one primary and two secondary taps per iteration, k counting down to 0
    vpbroadcastb    m2, [priq+kq] ; pri_taps
    vpbroadcastb    m3, [secq+kq] ; sec_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
    dec             kq
    jge .border_k_loop

    vpbroadcastd    m10, [pw_2048]
    BORDER_ADJUST_PIXEL %1, m10, 1
%if %1*%2*2/mmsize > 1
    ; rows handled per iteration: one register holds mmsize/(%1*2) rows of
    ; %1 words; the stack buffer advances 32 bytes per row
 %define vloop_lines (mmsize/(%1*2))
    lea             dstq, [dstq+strideq*vloop_lines]
    add             stkq, 32*vloop_lines
    dec             hd
    jg .border_v_loop
%endif
    RET

; --- .border_pri_only -------------------------------------------------------
; Same loop structure as above with the secondary taps left out; pw_2048 is
; kept in m1 because no secondary strength is needed. Entered with pri_shift
; already stored at [rsp+0] and the primary strength in xm0.
.border_pri_only:
    DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
    lea             tableq, [tap_table]
    vpbroadcastb    m13, [tableq+pridmpq] ; pri_shift_mask
    DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
    vpbroadcastb    m0, xm0 ; pri_strength
    and             prid, 1
    lea             priq, [tableq+priq*2+8] ; pri_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m1, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
    vpbroadcastb    m2, [priq+kq] ; pri_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
    dec             kq
    jge .border_pri_k_loop
    BORDER_ADJUST_PIXEL %1, m1
; NOTE: the rest of the following %if directive lies past the end of this span.
%if
%1*%2*2/mmsize > 1 1520*c0909341SAndroid Build Coastguard Worker %define vloop_lines (mmsize/(%1*2)) 1521*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*vloop_lines] 1522*c0909341SAndroid Build Coastguard Worker add stkq, 32*vloop_lines 1523*c0909341SAndroid Build Coastguard Worker dec hd 1524*c0909341SAndroid Build Coastguard Worker jg .border_pri_v_loop 1525*c0909341SAndroid Build Coastguard Worker%endif 1526*c0909341SAndroid Build Coastguard Worker RET 1527*c0909341SAndroid Build Coastguard Worker 1528*c0909341SAndroid Build Coastguard Worker.border_sec_only: 1529*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3 1530*c0909341SAndroid Build Coastguard Worker movd xm1, secdmpd 1531*c0909341SAndroid Build Coastguard Worker lzcnt secdmpd, secdmpd 1532*c0909341SAndroid Build Coastguard Worker add secdmpd, dampingd 1533*c0909341SAndroid Build Coastguard Worker mov [rsp+8], secdmpq ; sec_shift 1534*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3 1535*c0909341SAndroid Build Coastguard Worker lea tableq, [tap_table] 1536*c0909341SAndroid Build Coastguard Worker vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask 1537*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3 1538*c0909341SAndroid Build Coastguard Worker vpbroadcastb m1, xm1 ; sec_strength 1539*c0909341SAndroid Build Coastguard Worker lea secq, [tableq+12] ; sec_taps 1540*c0909341SAndroid Build Coastguard Worker BORDER_PREP_REGS %1, %2 1541*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [pw_2048] 1542*c0909341SAndroid Build Coastguard Worker%if %1*%2*2/mmsize > 1 1543*c0909341SAndroid Build Coastguard Worker.border_sec_v_loop: 1544*c0909341SAndroid Build Coastguard Worker%endif 1545*c0909341SAndroid Build Coastguard Worker BORDER_LOAD_BLOCK %1, %2 1546*c0909341SAndroid Build Coastguard Worker.border_sec_k_loop: 1547*c0909341SAndroid Build 
; one .border_sec_k_loop pass per tap index k; repeats while k >= 0 after the dec
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
    dec             kq
    jge .border_sec_k_loop
    BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_sec_v_loop
%endif
    RET
%endmacro

CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4

;-----------------------------------------------------------------------
; cdef_dir_8bpc(src, stride, var)
;
; In:   srcq    = first row of an 8-row block; 8 bytes are read per row
;       strideq = byte distance between consecutive rows
;       varq    = destination of one 32-bit store
; Out:  eax     = index (0-7) of the direction with the highest cost
;       [varq]  = (best cost - cost[idx ^ 4]) >> 10
;
; Register layout on entry to .main (one row per 128-bit lane, widened
; to 8 words):
;       m0 = row0 | row7        m1 = row1 | row6
;       m2 = row2 | row5        m3 = row3 | row4
;
; .main is declared with cglobal_label and opens with its own PROLOGUE,
; which raises the declared vector register count from 6 to 15.
; NOTE(review): presumably this lets other entry points fill m0-m3
; themselves and jump straight to .main - confirm against the callers.
;-----------------------------------------------------------------------
INIT_YMM avx2
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
    lea       stride3q, [strideq*3]
    ; rows 0-3 go to the low lanes, rows 7-4 are blended into the high lanes
    movq           xm0, [srcq+strideq*0]
    movq           xm1, [srcq+strideq*1]
    movq           xm2, [srcq+strideq*2]
    movq           xm3, [srcq+stride3q ]
    lea           srcq, [srcq+strideq*4]
    vpbroadcastq    m4, [srcq+stride3q ]
    vpbroadcastq    m5, [srcq+strideq*2]
    vpblendd        m0, m4, 0xf0
    vpblendd        m1, m5, 0xf0
    vpbroadcastq    m4, [srcq+strideq*1]
    vpbroadcastq    m5, [srcq+strideq*0]
    vpblendd        m2, m4, 0xf0
    vpblendd        m3, m5, 0xf0
    ; zero-extend the 8 bytes in each lane to words
    pxor            m4, m4
    punpcklbw       m0, m4
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
cglobal_label .main
    ; NOTE(review): pw_128 is presumably 128 in every word, centering the
    ; 8-bit samples around zero - confirm against the constant table
    vpbroadcastd    m4, [pw_128]
    PROLOGUE 3, 4, 15
    psubw           m0, m4
    psubw           m1, m4
    psubw           m2, m4
    psubw           m3, m4

    ; shuffle registers to generate partial_sum_diag[0-1] together
    vperm2i128      m7, m0, m0, 0x01
    vperm2i128      m6, m1, m1, 0x01
    vperm2i128      m5, m2, m2, 0x01
    vperm2i128      m4, m3, m3, 0x01

    ; start with partial_sum_hv[0-1]
    paddw           m8, m0, m1
    paddw           m9, m2, m3
    phaddw         m10, m0, m1
    phaddw         m11, m2, m3
    paddw           m8, m9
    phaddw         m10, m11
    vextracti128   xm9, m8, 1
    vextracti128  xm11, m10, 1
    paddw          xm8, xm9                 ; partial_sum_hv[1]
    phaddw        xm10, xm11                ; partial_sum_hv[0]
    vinserti128     m8, xm10, 1
    vpbroadcastd    m9, [div_table+44]
    pmaddwd         m8, m8
    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]

    ; create aggregates [lower half]:
    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
    ; and [upper half]:
    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m9, m1, 2
    psrldq         m10, m1, 14
    pslldq         m11, m2, 4
    psrldq         m12, m2, 12
    pslldq         m13, m3, 6
    psrldq         m14, m3, 10
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m4, 8
    psrldq         m12, m4, 8
    pslldq         m13, m5, 10
    psrldq         m14, m5, 6
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m6, 12
    psrldq         m12, m6, 4
    pslldq         m13, m7, 14
    psrldq         m14, m7, 2
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
    vbroadcasti128 m14, [shufw_6543210x]
    vbroadcasti128 m13, [div_table+16]
    vbroadcasti128 m12, [div_table+0]
    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
    pshufb         m10, m14
    punpckhwd      m11, m9, m10
    punpcklwd       m9, m10
    pmaddwd        m11, m11
    pmaddwd         m9, m9
    pmulld         m11, m13
    pmulld          m9, m12
    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]

    ; merge horizontally and vertically for partial_sum_alt[0-3]
    paddw          m10, m0, m1
    paddw          m11, m2, m3
    paddw          m12, m4, m5
    paddw          m13, m6, m7
    phaddw          m0, m4
    phaddw          m1, m5
    phaddw          m2, m6
    phaddw          m3, m7

    ; create aggregates [lower half]:
    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
    ; and [upper half]:
    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m4, m11, 2
    psrldq         m11, 14
    pslldq          m5, m12, 4
    psrldq         m12, 12
    pslldq          m6, m13, 6
    psrldq         m13, 10
    paddw           m4, m10
    paddw          m11, m12
    vpbroadcastd   m12, [div_table+44]
    paddw           m5, m6
    paddw          m11, m13                 ; partial_sum_alt[3/2] right
    vbroadcasti128 m13, [div_table+32]
    paddw           m4, m5                  ; partial_sum_alt[3/2] left
    pshuflw         m5, m11, q3012
    punpckhwd       m6, m11, m4
    punpcklwd       m4, m5
    pmaddwd         m6, m6
    pmaddwd         m4, m4
    pmulld          m6, m12
    pmulld          m4, m13
    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]

    ; create aggregates [lower half]:
    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
    ; and [upper half]:
    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m5, m1, 2
    psrldq          m1, 14
    pslldq          m6, m2, 4
    psrldq          m2, 12
    pslldq          m7, m3, 6
    psrldq          m3, 10
    paddw           m5, m0
    paddw           m1, m2
    paddw           m6, m7
    paddw           m1, m3                  ; partial_sum_alt[0/1] right
    paddw           m5, m6                  ; partial_sum_alt[0/1] left
    pshuflw         m0, m1, q3012
    punpckhwd       m1, m5
    punpcklwd       m5, m0
    pmaddwd         m1, m1
    pmaddwd         m5, m5
    pmulld          m1, m12
    pmulld          m5, m13
    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]

    ; fold the partial products and reorder them into direction order
    mova           xm0, [pd_47130256+ 16]
    mova            m1, [pd_47130256]
    phaddd          m9, m8
    phaddd          m5, m4
    phaddd          m9, m5
    vpermd          m0, m9                  ; cost[0-3]
    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]

    ; now find the best cost
    pmaxsd         xm2, xm0, xm1
    pshufd         xm3, xm2, q1032
    pmaxsd         xm2, xm3
    pshufd         xm3, xm2, q2301
    pmaxsd         xm2, xm3                 ; best cost

    ; find the idx using minpos
    ; make everything other than the best cost negative via subtraction
    ; find the min of unsigned 16-bit ints to sort out the negative values
    psubd          xm4, xm1, xm2
    psubd          xm3, xm0, xm2
    packssdw       xm3, xm4
    phminposuw     xm3, xm3

    ; convert idx to 32-bits
    psrld          xm3, 16
    movd           eax, xm3                 ; return value = best direction

    ; get idx^4 complement
    ; (m1 holds cost[4-7] | cost[0-3], so element idx of m1 is cost[idx^4])
    vpermd          m3, m1
    psubd          xm2, xm3
    psrld          xm2, 10
    movd        [varq], xm2
    RET

%endif ; ARCH_X86_64