; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2019, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; DUP8 b0, b1, ...
; Emits every argument as a run of 8 identical bytes, in argument order.
%macro DUP8 1-*
    %rep %0
        times 8 db %1
        %rotate 1
    %endrep
%endmacro

; 840 / n for n = 1..8, followed by 420 / n for n = 1..4 padded with 105.
; The sse4 table holds dwords; the ssse3 table holds each value twice as words.
div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
                 dd 420, 210, 140, 105, 105, 105, 105, 105
div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
                 dw 168, 168, 140, 140, 120, 120, 105, 105
                 dw 420, 420, 210, 210, 140, 140, 105, 105
                 dw 105, 105, 105, 105, 105, 105, 105, 105
; pshufb mask: reverses words 0..6 and leaves word 7 in place
const shufw_6543210x, \
            db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
; pshufb mask: interleaves the bytes of the low and high qwords
shufb_lohi: db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_8:      times 8 dw 8
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_2048:   times 8 dw 2048
pw_0x7FFF: times 8 dw 0x7FFF
pw_0x8000: times 8 dw 0x8000
tap_table: ; masks for 8-bit shift emulation
           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
           ; weights
           DUP8 4, 2, 3, 3, 2, 1
           ; taps indices
           ; (these start 14*8 bytes into the table: 8*8 mask bytes plus
           ;  6*8 weight bytes precede them)
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1
           db  1 * 16 + 0,  2 * 16 + 0
           db  1 * 16 + 0,  2 * 16 - 1
           ; the last 6 are repeats of the first 6 so we don't need to & 7
           db -1 * 16 + 1, -2 * 16 + 2
           db  0 * 16 + 1, -1 * 16 + 2
           db  0 * 16 + 1,  0 * 16 + 2
           db  0 * 16 + 1,  1 * 16 + 2
           db  1 * 16 + 1,  2 * 16 + 2
           db  1 * 16 + 0,  2 * 16 + 1

SECTION .text

; movif32 dst, src
; Emits `mov dst, src` on 32-bit x86 builds only; expands to nothing otherwise.
%macro movif32 2
 %if ARCH_X86_32
    mov     %1, %2
 %endif
%endmacro

; PMOVZXBW dst, src [, half]
; Loads bytes from src and widens them to words in dst.
;   half == 0 (default): 8 bytes; uses pmovzxbw when the sse4 cpuflag is set,
;                        otherwise movq + punpcklbw.
;   half == 1:           4 bytes via movd + punpcklbw (on every cpu level).
; The punpcklbw paths interleave with m7, so m7 must be zero for the result
; to be a zero-extension (CDEF_FILTER clears it with `pxor m7, m7`).
%macro PMOVZXBW 2-3 0 ; %3 = half
 %if cpuflag(sse4) && %3 == 0
    pmovzxbw        %1, %2
 %else
  %if %3 == 1
    movd            %1, %2
  %else
    movq            %1, %2
  %endif
    punpcklbw       %1, m7
 %endif
%endmacro

; PSHUFB_0 dst, mask
; With ssse3: plain `pshufb dst, mask`.
; Without ssse3: ignores the mask and broadcasts byte 0 of dst to all 16 bytes,
; which matches pshufb only for an all-zero mask.
%macro PSHUFB_0 2
 %if cpuflag(ssse3)
    pshufb          %1, %2
 %else
    punpcklbw       %1, %1          ; b0 b0 b1 b1 ...
    pshuflw         %1, %1, q0000   ; low four words = word 0
    punpcklqdq      %1, %1          ; copy low qword to high qword
 %endif
%endmacro

; MOVDDUP dst, src
; Loads the 64-bit value at src and duplicates it into both halves of dst.
; Uses movddup when the ssse3 cpuflag is set, otherwise movq + punpcklqdq.
%macro MOVDDUP 2
%if cpuflag(ssse3)
    movddup         %1, %2
%else
    movq            %1, %2
    punpcklqdq      %1, %1
%endif
%endmacro

; ACCUMULATE_TAP tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
; Handles one tap pair: p0 at +off and p1 at -off relative to stkq, where off
; is the signed byte at [dirq+kq+tap_offset+14*8] (an offset in 16-bit pixels).
;   %2, %3  shift count and byte mask that together emulate an 8-bit shift
;   %4      strength; the shifted |diff| is subtracted from it with saturation
;   %5      tap weights; given the sign of each diff before the multiply
;   %6      block width; for 4, two rows (32 bytes apart) share one register
;   %7      nonzero: also fold p0/p1 into the running max (m7) and min (m8)
; Inputs:   stkq, dirq, kq, m4 = px, m13 = byte shuffle (ssse3 path),
;           m14 = out-of-bounds marker (pre-sse4 min/max path)
; Output:   m0 += constrain(p0 - px) * tap + constrain(p1 - px) * tap
; Clobbers: offq, m3, m5, m6, m9
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1+14*8]  ; off1
 %if %6 == 4
    movq            m5, [stkq+offq*2+32*0]      ; p0
    movhps          m5, [stkq+offq*2+32*1]
 %else
    movu            m5, [stkq+offq*2+32*0]      ; p0
 %endif
    neg           offq                          ; -off1
 %if %6 == 4
    movq            m6, [stkq+offq*2+32*0]      ; p1
    movhps          m6, [stkq+offq*2+32*1]
 %else
    movu            m6, [stkq+offq*2+32*0]      ; p1
 %endif
 %if %7
  %if cpuflag(sse4)
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5
    pminuw          m8, m5
    pmaxsw          m7, m6
    pminuw          m8, m6
  %else
    ; no pminuw here: the marker in m14 is a large signed value, so it never
    ; wins the signed min, and marker lanes are zeroed before the signed max
    pcmpeqw         m3, m14, m5
    pminsw          m8, m5     ; min after p0
    pandn           m3, m5
    pmaxsw          m7, m3     ; max after p0
    pcmpeqw         m3, m14, m6
    pminsw          m8, m6     ; min after p1
    pandn           m3, m6
    pmaxsw          m7, m3     ; max after p1
  %endif
 %endif

    ; accumulate sum[m13] over p0/p1
    psubw           m5, m4     ; diff_p0(p0 - px)
    psubw           m6, m4     ; diff_p1(p1 - px)
    packsswb        m5, m6     ; convert pixel diff to 8-bit
 %if cpuflag(ssse3)
    pshufb          m5, m13    ; group diffs p0 and p1 into pairs
    pabsb           m6, m5
    psignb          m3, %5, m5
 %else
    movlhps         m6, m5
    punpckhbw       m6, m5     ; group diffs p0 and p1 into pairs
    pxor            m5, m5
    pcmpgtb         m5, m6     ; m5 = -1 where diff < 0
    paddb           m6, m5
    pxor            m6, m5     ; m6 = abs(diff)
    paddb           m3, %5, m5
    pxor            m3, m5     ; m3 = taps with the sign of diff
 %endif
    pand            m9, %3, m6 ; emulate 8-bit shift
    psrlw           m9, %2
    psubusb         m5, %4, m9
    pminub          m5, m6     ; constrain(diff_p)
 %if cpuflag(ssse3)
    pmaddubsw       m5, m3     ; constrain(diff_p) * taps
 %else
    ; pmaddubsw emulation: multiply odd and even bytes separately, then add
    psrlw           m9, m5, 8
    psraw           m6, m3, 8
    psllw           m5, 8
    psllw           m3, 8
    pmullw          m9, m6
    pmulhw          m5, m3
    paddw           m5, m9
 %endif
    paddw           m0, m5
%endmacro

; LOAD_BODY dst, src, block_width
; Reads four pixel rows from src (strideq*0, *1, *2 and stride3q), widens the
; bytes to words and stores them at dst with a row pitch of 32 bytes.
;   block_width == 4: 8 bytes per row via PMOVZXBW, 16 bytes stored per row
;   otherwise:        16 bytes per row, 32 bytes stored per row
; Requires m7 == 0. Clobbers m0-m3, plus m4 in the wide path.
%macro LOAD_BODY 3 ; dst, src, block_width
 %if %3 == 4
    PMOVZXBW        m0, [%2+strideq*0]
    PMOVZXBW        m1, [%2+strideq*1]
    PMOVZXBW        m2, [%2+strideq*2]
    PMOVZXBW        m3, [%2+stride3q]
    mova     [%1+32*0], m0
    mova     [%1+32*1], m1
    mova     [%1+32*2], m2
    mova     [%1+32*3], m3
 %else
    movu            m0, [%2+strideq*0]
    movu            m1, [%2+strideq*1]
    movu            m2, [%2+strideq*2]
    movu            m3, [%2+stride3q]
    punpcklbw       m4, m0, m7
    punpckhbw       m0, m7
    mova  [%1+32*0+ 0], m4
    mova  [%1+32*0+16], m0
    punpcklbw       m4, m1, m7
    punpckhbw       m1, m7
    mova  [%1+32*1+ 0], m4
    mova  [%1+32*1+16], m1
    punpcklbw       m4, m2, m7
    punpckhbw       m2, m7
    mova  [%1+32*2+ 0], m4
    mova  [%1+32*2+16], m2
    punpcklbw       m4, m3, m7
    punpckhbw       m3, m7
    mova  [%1+32*3+ 0], m4
    mova  [%1+32*3+16], m3
 %endif
%endmacro

; CDEF_FILTER_END w, minmax
; Turns the accumulated sum in m0 into output pixels and stores them:
;   sum -= (sum < 0); sum = (sum + 8) >> 4; out = m4 + sum
; The rounding uses m15: pmulhrsw by it on ssse3, add-then-shift otherwise
; (CDEF_FILTER loads pw_2048 resp. pw_8 into it).
; With minmax set the result is clamped via pminsw m7 / pmaxsw m8.
; w == 4 stores two 4-pixel rows and advances stkq/dstq by two rows;
; otherwise one 8-pixel row is stored and both advance by one row.
; Clobbers m0, m4, m6.
%macro CDEF_FILTER_END 2 ; w, minmax
    pxor            m6, m6
    pcmpgtw         m6, m0     ; -1 where sum < 0
    paddw           m0, m6
 %if cpuflag(ssse3)
    pmulhrsw        m0, m15
 %else
    paddw           m0, m15
    psraw           m0, 4
 %endif
    paddw           m4, m0
 %if %2
    pminsw          m4, m7
    pmaxsw          m4, m8
 %endif
    packuswb        m4, m4
 %if %1 == 4
    movd [dstq+strideq*0], m4
    psrlq           m4, 32
    movd [dstq+strideq*1], m4
    add           stkq, 32*2
    lea           dstq, [dstq+strideq*2]
 %else
    movq        [dstq], m4
    add           stkq, 32
    add           dstq, strideq
 %endif
%endmacro

; CDEF_FILTER w, h
; Emits cdef_filter_<w>x<h>_8bpc. The first stage below builds a 16-bit copy
; of the block on the stack with a 2-pixel border on every side:
;   - rows are 32 bytes apart and `px` addresses row 0 / column 0
;   - rows -2..-1 come from `top`, rows h..h+1 from `bot`,
;     columns -2..-1 from `left` (two bytes per row)
;   - border positions whose edge bit is clear are filled with
;     OUT_OF_BOUNDS_MEM (pw_0x8000 on sse4, pw_0x7FFF otherwise)
; Edge bits as tested below: 1 = left, 2 = right, 4 = top, 8 = bottom.
; m6 holds the out-of-bounds marker and m7 is zero throughout that stage.
%macro CDEF_FILTER 2 ; w, h
 %if ARCH_X86_64
cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \
                                dst, stride, left, top, bot, pri, dst4, edge, \
                                stride3
  %define px rsp+3*16+2*32
  %define base 0
 %else
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
                                dst, stride, left, edge, stride3
    ; top, bot and dst4 share r2 on x86-32; each is (re)loaded right before use
    %define topq r2
    %define botq r2
    %define dst4q r2
    LEA             r5, tap_table
  %define px esp+7*16+2*32
  %define base r5-tap_table
 %endif
    mov          edged, r9m
 %if cpuflag(sse4)
   %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
 %else
   %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
 %endif
    mova            m6, OUT_OF_BOUNDS_MEM
    pxor            m7, m7

    ; prepare pixel buffers - body/right
 %if %2 == 8
    lea          dst4q, [dstq+strideq*4]
 %endif
    lea       stride3q, [strideq*3]
    test         edgeb, 2 ; have_right
    jz .no_right
    LOAD_BODY       px, dstq, %1
 %if %2 == 8
    LOAD_BODY  px+4*32, dst4q, %1
 %endif
    jmp .body_done
.no_right:
    PMOVZXBW        m0, [dstq+strideq*0], %1 == 4
    PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
    PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
    PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
    mova     [px+32*0], m0
    mova     [px+32*1], m1
    mova     [px+32*2], m2
    mova     [px+32*3], m3
    movd [px+32*0+%1*2], m6
    movd [px+32*1+%1*2], m6
    movd [px+32*2+%1*2], m6
    movd [px+32*3+%1*2], m6
 %if %2 == 8
    PMOVZXBW        m0, [dst4q+strideq*0], %1 == 4
    PMOVZXBW        m1, [dst4q+strideq*1], %1 == 4
    PMOVZXBW        m2, [dst4q+strideq*2], %1 == 4
    PMOVZXBW        m3, [dst4q+stride3q ], %1 == 4
    mova     [px+32*4], m0
    mova     [px+32*5], m1
    mova     [px+32*6], m2
    mova     [px+32*7], m3
    movd [px+32*4+%1*2], m6
    movd [px+32*5+%1*2], m6
    movd [px+32*6+%1*2], m6
    movd [px+32*7+%1*2], m6
 %endif
.body_done:

    ; top
    movifnidn     topq, r3mp
    test         edgeb, 4 ; have_top
    jz .no_top
    test         edgeb, 1 ; have_left
    jz .top_no_left
    test         edgeb, 2 ; have_right
    jz .top_no_right
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0-2]
    PMOVZXBW        m1, [topq+strideq*1-2]
 %else
    movu            m0, [topq+strideq*0-4]
    movu            m1, [topq+strideq*1-4]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movu  [px-32*2+8], m2
    movu  [px-32*1+8], m3
 %endif
    movu  [px-32*2-%1], m0
    movu  [px-32*1-%1], m1
    jmp .top_done
.top_no_right:
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0-%1]
    PMOVZXBW        m1, [topq+strideq*1-%1]
    movu   [px-32*2-8], m0
    movu   [px-32*1-8], m1
 %else
    ; both rows start %1 (block width) bytes left of the block; the offset is
    ; horizontal, so it must not depend on the block height
    movu            m0, [topq+strideq*0-%1]
    movu            m1, [topq+strideq*1-%1]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova  [px-32*2-16], m0
    mova  [px-32*2+ 0], m2
    mova  [px-32*1-16], m1
    mova  [px-32*1+ 0], m3
 %endif
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.top_no_left:
    test         edgeb, 2 ; have_right
    jz .top_no_left_right
 %if %1 == 4
    PMOVZXBW        m0, [topq+strideq*0]
    PMOVZXBW        m1, [topq+strideq*1]
 %else
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movd  [px-32*2+16], m2
    movd  [px-32*1+16], m3
 %endif
    movd  [px-32*2- 4], m6
    movd  [px-32*1- 4], m6
    mova  [px-32*2+ 0], m0
    mova  [px-32*1+ 0], m1
    jmp .top_done
.top_no_left_right:
    PMOVZXBW        m0, [topq+strideq*0], %1 == 4
    PMOVZXBW        m1, [topq+strideq*1], %1 == 4
    movd   [px-32*2-4], m6
    movd   [px-32*1-4], m6
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    movd [px-32*2+%1*2], m6
    movd [px-32*1+%1*2], m6
    jmp .top_done
.no_top:
    movu  [px-32*2- 4], m6
    movu  [px-32*1- 4], m6
 %if %1 == 8
    movq  [px-32*2+12], m6
    movq  [px-32*1+12], m6
 %endif
.top_done:

    ; left
    test         edgeb, 1 ; have_left
    jz .no_left
    movifnidn    leftq, leftmp
 %if %2 == 4
    movq            m0, [leftq]
 %else
    movu            m0, [leftq]
 %endif
 %if %2 == 4
    punpcklbw       m0, m7
 %else
    punpckhbw       m1, m0, m7
    punpcklbw       m0, m7
    movhlps         m3, m1
    movd   [px+32*4-4], m1
    movd   [px+32*6-4], m3
    psrlq           m1, 32
    psrlq           m3, 32
    movd   [px+32*5-4], m1
    movd   [px+32*7-4], m3
 %endif
    movhlps         m2, m0
    movd   [px+32*0-4], m0
    movd   [px+32*2-4], m2
    psrlq           m0, 32
    psrlq           m2, 32
    movd   [px+32*1-4], m0
    movd   [px+32*3-4], m2
    jmp .left_done
.no_left:
    movd   [px+32*0-4], m6
    movd   [px+32*1-4], m6
    movd   [px+32*2-4], m6
    movd   [px+32*3-4], m6
 %if %2 == 8
    movd   [px+32*4-4], m6
    movd   [px+32*5-4], m6
    movd   [px+32*6-4], m6
    movd   [px+32*7-4], m6
 %endif
.left_done:

    ; bottom
    movifnidn     botq, r4mp
    test         edgeb, 8 ; have_bottom
    jz .no_bottom
    test         edgeb, 1 ; have_left
    jz .bottom_no_left
    test         edgeb, 2 ; have_right
    jz .bottom_no_right
 %if %1 == 4
    PMOVZXBW        m0, [botq+strideq*0-(%1/2)]
    PMOVZXBW        m1, [botq+strideq*1-(%1/2)]
 %else
    movu            m0, [botq+strideq*0-4]
    movu            m1, [botq+strideq*1-4]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    movu [px+32*(%2+0)+8], m2
    movu [px+32*(%2+1)+8], m3
 %endif
    movu [px+32*(%2+0)-%1], m0
    movu [px+32*(%2+1)-%1], m1
    jmp .bottom_done
.bottom_no_right:
 %if %1 == 4
    PMOVZXBW        m0, [botq+strideq*0-4]
    PMOVZXBW        m1, [botq+strideq*1-4]
    movu [px+32*(%2+0)-8], m0
    movu [px+32*(%2+1)-8], m1
 %else
    movu            m0, [botq+strideq*0-8]
    movu            m1, [botq+strideq*1-8]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova [px+32*(%2+0)-16], m0
    mova [px+32*(%2+0)+ 0], m2
    mova [px+32*(%2+1)-16], m1
    mova [px+32*(%2+1)+ 0], m3
    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
 %endif
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    jmp .bottom_done
.bottom_no_left:
    test          edgeb, 2 ; have_right
    jz .bottom_no_left_right
 %if %1 == 4
    PMOVZXBW        m0, [botq+strideq*0]
    PMOVZXBW        m1, [botq+strideq*1]
 %else
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    punpckhbw       m2, m0, m7
    punpcklbw       m0, m7
    punpckhbw       m3, m1, m7
    punpcklbw       m1, m7
    mova [px+32*(%2+0)+16], m2
    mova [px+32*(%2+1)+16], m3
 %endif
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.bottom_no_left_right:
    PMOVZXBW        m0, [botq+strideq*0], %1 == 4
    PMOVZXBW        m1, [botq+strideq*1], %1 == 4
    mova [px+32*(%2+0)+ 0], m0
    mova [px+32*(%2+1)+ 0], m1
    movd [px+32*(%2+0)+%1*2], m6
    movd [px+32*(%2+1)+%1*2], m6
    movd [px+32*(%2+0)- 4], m6
    movd [px+32*(%2+1)- 4], m6
    jmp .bottom_done
.no_bottom:
    movu [px+32*(%2+0)- 4], m6
    movu [px+32*(%2+1)- 4], m6
 %if %1 == 8
    movq [px+32*(%2+0)+12], m6
    movq [px+32*(%2+1)+12], m6
 %endif
.bottom_done:

    ; actual filter
    ; x86-64 keeps the shuffle mask, rounding constant and out-of-bounds marker
    ; in m13-m15; x86-32 has only m0-m7, so m8-m10 alias low registers and
    ; m13-m15 become memory operands
 %if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
    mova           m13, [shufb_lohi]
 %if cpuflag(ssse3)
    mova           m15, [pw_2048]
 %else
    mova           m15, [pw_8]
 %endif
    mova           m14, m6
 %else
    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
    %xdefine        m8  m1
    %xdefine        m9  m2
    %xdefine        m10 m0
    %xdefine        m13 [base+shufb_lohi]
    %xdefine        m14 OUT_OF_BOUNDS_MEM
 %if cpuflag(ssse3)
    %xdefine        m15 [base+pw_2048]
 %else
    %xdefine        m15 [base+pw_8]
 %endif
 %endif
    movifnidn     prid, r5m
    movifnidn     secd, r6m
    mov       dampingd, r8m
    movif32 [esp+0x3C], r1d
    test          prid, prid
    jz .sec_only
    movd            m1, r5m
    bsr        pridmpd, prid
    test          secd, secd
    jz .pri_only
Build Coastguard Worker movd m10, r6m 566*c0909341SAndroid Build Coastguard Worker tzcnt secd, secd 567*c0909341SAndroid Build Coastguard Worker and prid, 1 568*c0909341SAndroid Build Coastguard Worker sub pridmpd, dampingd 569*c0909341SAndroid Build Coastguard Worker sub secd, dampingd 570*c0909341SAndroid Build Coastguard Worker xor dampingd, dampingd 571*c0909341SAndroid Build Coastguard Worker add prid, prid 572*c0909341SAndroid Build Coastguard Worker neg pridmpd 573*c0909341SAndroid Build Coastguard Worker cmovs pridmpd, dampingd 574*c0909341SAndroid Build Coastguard Worker neg secd 575*c0909341SAndroid Build Coastguard Worker PSHUFB_0 m1, m7 576*c0909341SAndroid Build Coastguard Worker PSHUFB_0 m10, m7 577*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64 578*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec 579*c0909341SAndroid Build Coastguard Worker lea tapq, [tap_table] 580*c0909341SAndroid Build Coastguard Worker MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask 581*c0909341SAndroid Build Coastguard Worker MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask 582*c0909341SAndroid Build Coastguard Worker mov [rsp+0x00], pridmpq ; pri_shift 583*c0909341SAndroid Build Coastguard Worker mov [rsp+0x10], secq ; sec_shift 584*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off 585*c0909341SAndroid Build Coastguard Worker %else 586*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [tapq+pridmpq*8] 587*c0909341SAndroid Build Coastguard Worker MOVDDUP m3, [tapq+secq*8] 588*c0909341SAndroid Build Coastguard Worker mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw 589*c0909341SAndroid Build Coastguard Worker mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP 590*c0909341SAndroid Build Coastguard Worker mov [esp+0x00], pridmpd 591*c0909341SAndroid Build Coastguard Worker mov [esp+0x30], secd 592*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, dir, stk, 
pri, tap, h 593*c0909341SAndroid Build Coastguard Worker %define offq dstq 594*c0909341SAndroid Build Coastguard Worker %define kd strided 595*c0909341SAndroid Build Coastguard Worker %define kq strideq 596*c0909341SAndroid Build Coastguard Worker mova [esp+0x10], m2 597*c0909341SAndroid Build Coastguard Worker mova [esp+0x40], m3 598*c0909341SAndroid Build Coastguard Worker mova [esp+0x20], m1 599*c0909341SAndroid Build Coastguard Worker mova [esp+0x50], m10 600*c0909341SAndroid Build Coastguard Worker %endif 601*c0909341SAndroid Build Coastguard Worker mov dird, r7m 602*c0909341SAndroid Build Coastguard Worker lea stkq, [px] 603*c0909341SAndroid Build Coastguard Worker lea priq, [tapq+8*8+priq*8] ; pri_taps 604*c0909341SAndroid Build Coastguard Worker mov hd, %1*%2/8 605*c0909341SAndroid Build Coastguard Worker lea dirq, [tapq+dirq*2] 606*c0909341SAndroid Build Coastguard Worker.v_loop: 607*c0909341SAndroid Build Coastguard Worker movif32 [esp+0x38], dstd 608*c0909341SAndroid Build Coastguard Worker mov kd, 1 609*c0909341SAndroid Build Coastguard Worker %if %1 == 4 610*c0909341SAndroid Build Coastguard Worker movq m4, [stkq+32*0] 611*c0909341SAndroid Build Coastguard Worker movhps m4, [stkq+32*1] 612*c0909341SAndroid Build Coastguard Worker %else 613*c0909341SAndroid Build Coastguard Worker mova m4, [stkq+32*0] ; px 614*c0909341SAndroid Build Coastguard Worker %endif 615*c0909341SAndroid Build Coastguard Worker pxor m0, m0 ; sum 616*c0909341SAndroid Build Coastguard Worker mova m7, m4 ; max 617*c0909341SAndroid Build Coastguard Worker mova m8, m4 ; min 618*c0909341SAndroid Build Coastguard Worker.k_loop: 619*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [priq+kq*8] 620*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64 621*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 622*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [tapq+12*8+kq*8] 623*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 2*2, 
[rsp+0x10], m12, m10, m2, %1, 1 624*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 625*c0909341SAndroid Build Coastguard Worker %else 626*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 627*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [tapq+12*8+kq*8] 628*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 629*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [tapq+12*8+kq*8] 630*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 631*c0909341SAndroid Build Coastguard Worker %endif 632*c0909341SAndroid Build Coastguard Worker dec kd 633*c0909341SAndroid Build Coastguard Worker jge .k_loop 634*c0909341SAndroid Build Coastguard Worker movif32 dstq, [esp+0x38] 635*c0909341SAndroid Build Coastguard Worker movif32 strideq, [esp+0x3C] 636*c0909341SAndroid Build Coastguard Worker CDEF_FILTER_END %1, 1 637*c0909341SAndroid Build Coastguard Worker dec hd 638*c0909341SAndroid Build Coastguard Worker jg .v_loop 639*c0909341SAndroid Build Coastguard Worker RET 640*c0909341SAndroid Build Coastguard Worker 641*c0909341SAndroid Build Coastguard Worker.pri_only: 642*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 643*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap 644*c0909341SAndroid Build Coastguard Worker lea tapq, [tap_table] 645*c0909341SAndroid Build Coastguard Worker %else 646*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, pridmp, zero, damping, pri, tap 647*c0909341SAndroid Build Coastguard Worker %endif 648*c0909341SAndroid Build Coastguard Worker and prid, 1 649*c0909341SAndroid Build Coastguard Worker xor zerod, zerod 650*c0909341SAndroid Build Coastguard Worker sub dampingd, pridmpd 651*c0909341SAndroid Build Coastguard Worker cmovs dampingd, zerod 652*c0909341SAndroid Build 
Coastguard Worker add prid, prid 653*c0909341SAndroid Build Coastguard Worker PSHUFB_0 m1, m7 654*c0909341SAndroid Build Coastguard Worker MOVDDUP m7, [tapq+dampingq*8] 655*c0909341SAndroid Build Coastguard Worker mov [rsp+0x00], dampingq 656*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64 657*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off 658*c0909341SAndroid Build Coastguard Worker %else 659*c0909341SAndroid Build Coastguard Worker mov [rsp+0x04], zerod 660*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, dir, stk, pri, tap, h 661*c0909341SAndroid Build Coastguard Worker %endif 662*c0909341SAndroid Build Coastguard Worker mov dird, r7m 663*c0909341SAndroid Build Coastguard Worker lea stkq, [px] 664*c0909341SAndroid Build Coastguard Worker lea priq, [tapq+8*8+priq*8] 665*c0909341SAndroid Build Coastguard Worker mov hd, %1*%2/8 666*c0909341SAndroid Build Coastguard Worker lea dirq, [tapq+dirq*2] 667*c0909341SAndroid Build Coastguard Worker.pri_v_loop: 668*c0909341SAndroid Build Coastguard Worker movif32 [esp+0x38], dstd 669*c0909341SAndroid Build Coastguard Worker mov kd, 1 670*c0909341SAndroid Build Coastguard Worker %if %1 == 4 671*c0909341SAndroid Build Coastguard Worker movq m4, [stkq+32*0] 672*c0909341SAndroid Build Coastguard Worker movhps m4, [stkq+32*1] 673*c0909341SAndroid Build Coastguard Worker %else 674*c0909341SAndroid Build Coastguard Worker mova m4, [stkq+32*0] 675*c0909341SAndroid Build Coastguard Worker %endif 676*c0909341SAndroid Build Coastguard Worker pxor m0, m0 677*c0909341SAndroid Build Coastguard Worker.pri_k_loop: 678*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [priq+kq*8] 679*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 680*c0909341SAndroid Build Coastguard Worker dec kd 681*c0909341SAndroid Build Coastguard Worker jge .pri_k_loop 682*c0909341SAndroid Build Coastguard Worker movif32 dstq, [esp+0x38] 
683*c0909341SAndroid Build Coastguard Worker movif32 strideq, [esp+0x3C] 684*c0909341SAndroid Build Coastguard Worker CDEF_FILTER_END %1, 0 685*c0909341SAndroid Build Coastguard Worker dec hd 686*c0909341SAndroid Build Coastguard Worker jg .pri_v_loop 687*c0909341SAndroid Build Coastguard Worker RET 688*c0909341SAndroid Build Coastguard Worker 689*c0909341SAndroid Build Coastguard Worker.sec_only: 690*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 691*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec 692*c0909341SAndroid Build Coastguard Worker%else 693*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero 694*c0909341SAndroid Build Coastguard Worker%endif 695*c0909341SAndroid Build Coastguard Worker movd m1, r6m 696*c0909341SAndroid Build Coastguard Worker tzcnt secd, secd 697*c0909341SAndroid Build Coastguard Worker mov dird, r7m 698*c0909341SAndroid Build Coastguard Worker xor zerod, zerod 699*c0909341SAndroid Build Coastguard Worker sub dampingd, secd 700*c0909341SAndroid Build Coastguard Worker cmovs dampingd, zerod 701*c0909341SAndroid Build Coastguard Worker PSHUFB_0 m1, m7 702*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64 703*c0909341SAndroid Build Coastguard Worker lea tapq, [tap_table] 704*c0909341SAndroid Build Coastguard Worker %else 705*c0909341SAndroid Build Coastguard Worker mov [rsp+0x04], zerod 706*c0909341SAndroid Build Coastguard Worker %endif 707*c0909341SAndroid Build Coastguard Worker mov [rsp+0x00], dampingq 708*c0909341SAndroid Build Coastguard Worker MOVDDUP m7, [tapq+dampingq*8] 709*c0909341SAndroid Build Coastguard Worker lea dirq, [tapq+dirq*2] 710*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64 711*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k 712*c0909341SAndroid Build Coastguard Worker %else 713*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, off, stk, dir, 
tap, h 714*c0909341SAndroid Build Coastguard Worker %endif 715*c0909341SAndroid Build Coastguard Worker lea stkq, [px] 716*c0909341SAndroid Build Coastguard Worker mov hd, %1*%2/8 717*c0909341SAndroid Build Coastguard Worker.sec_v_loop: 718*c0909341SAndroid Build Coastguard Worker mov kd, 1 719*c0909341SAndroid Build Coastguard Worker %if %1 == 4 720*c0909341SAndroid Build Coastguard Worker movq m4, [stkq+32*0] 721*c0909341SAndroid Build Coastguard Worker movhps m4, [stkq+32*1] 722*c0909341SAndroid Build Coastguard Worker %else 723*c0909341SAndroid Build Coastguard Worker mova m4, [stkq+32*0] 724*c0909341SAndroid Build Coastguard Worker %endif 725*c0909341SAndroid Build Coastguard Worker pxor m0, m0 726*c0909341SAndroid Build Coastguard Worker.sec_k_loop: 727*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [tapq+12*8+kq*8] 728*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 729*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32 730*c0909341SAndroid Build Coastguard Worker MOVDDUP m2, [tapq+12*8+kq*8] 731*c0909341SAndroid Build Coastguard Worker %endif 732*c0909341SAndroid Build Coastguard Worker ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 733*c0909341SAndroid Build Coastguard Worker dec kd 734*c0909341SAndroid Build Coastguard Worker jge .sec_k_loop 735*c0909341SAndroid Build Coastguard Worker movif32 strideq, [esp+0x3C] 736*c0909341SAndroid Build Coastguard Worker CDEF_FILTER_END %1, 0 737*c0909341SAndroid Build Coastguard Worker dec hd 738*c0909341SAndroid Build Coastguard Worker jg .sec_v_loop 739*c0909341SAndroid Build Coastguard Worker RET 740*c0909341SAndroid Build Coastguard Worker%endmacro 741*c0909341SAndroid Build Coastguard Worker

;-----------------------------------------------------------------------
; MULLD dst, src
;
; Per-dword multiply: every 32-bit lane of %1 becomes the low 32 bits of
; (%1 * %2).
;
;   %1  vector register; multiplicand on entry, product on exit
;   %2  multiplier (register or memory operand)
;
; SSE4 builds emit a single pmulld. Other builds piece the product together
; from 16-bit multiplies:
;   pmullw  leaves the low  16 bits of each word product in %1
;   pmulhuw leaves the high 16 bits of each word product in the scratch reg
; The high half of the low-word product is then shifted into the upper word
; of its dword and added in. The result matches a 32-bit multiply provided
; both words of every dword in %2 carry the same unsigned 16-bit factor,
; which is how div_table_ssse3 is laid out (each divisor stored twice as dw).
;
; Clobbers: m15 (scratch). On x86-32 m15 is first aliased to m1, so m1 is
; the register that gets destroyed there. That %define is an ordinary
; single-line macro: it stays in effect after MULLD has been expanded.
;-----------------------------------------------------------------------
%macro MULLD 2
 %if cpuflag(sse4)
    pmulld          %1, %2          ; native 32x32 -> low 32
 %else
  %if ARCH_X86_32
   %define m15 m1                   ; no xmm15 on x86-32: borrow m1 as scratch
  %endif
    ; NOTE(review): three-operand form; presumably x86inc lowers this to a
    ; register copy plus the two-operand instruction on non-AVX targets.
    pmulhuw        m15, %1, %2      ; scratch = high words of the 16x16 products
    pmullw          %1, %2          ; %1      = low  words of the 16x16 products
    pslld          m15, 16          ; keep only hi(lo_word * factor), moved to the upper word
    paddd           %1, m15         ; upper word += carry-in from the low-word product
 %endif
%endmacro
755*c0909341SAndroid Build Coastguard Worker 756*c0909341SAndroid Build Coastguard Worker%macro CDEF_DIR 0 757*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64 758*c0909341SAndroid Build Coastguard Workercglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var 759*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 760*c0909341SAndroid Build Coastguard Worker movq m1, [srcq+strideq*0] 761*c0909341SAndroid Build Coastguard Worker movhps m1, [srcq+strideq*1] 762*c0909341SAndroid Build Coastguard Worker movq m3, [srcq+strideq*2] 763*c0909341SAndroid Build Coastguard Worker movhps m3, [srcq+r6 ] 764*c0909341SAndroid Build Coastguard Worker lea srcq, [srcq+strideq*4] 765*c0909341SAndroid Build Coastguard Worker movq m5, [srcq+strideq*0] 766*c0909341SAndroid Build Coastguard Worker movhps m5, [srcq+strideq*1] 767*c0909341SAndroid Build Coastguard Worker movq m7, [srcq+strideq*2] 768*c0909341SAndroid Build Coastguard Worker movhps m7, [srcq+r6 ] 769*c0909341SAndroid Build Coastguard Worker 770*c0909341SAndroid Build Coastguard Worker pxor m8, m8 771*c0909341SAndroid Build Coastguard Worker psadbw m9, m1, m8 772*c0909341SAndroid Build Coastguard Worker psadbw m2, m3, m8 773*c0909341SAndroid Build Coastguard Worker psadbw m4, m5, m8 774*c0909341SAndroid Build Coastguard Worker psadbw m6, m7, m8 775*c0909341SAndroid Build Coastguard Worker packssdw m9, m2 776*c0909341SAndroid Build Coastguard Worker packssdw m4, m6 777*c0909341SAndroid Build Coastguard Worker packssdw m9, m4 778*c0909341SAndroid
Build Coastguard Worker 779*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m8 780*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m8 781*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m8 782*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m8 783*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m8 784*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m8 785*c0909341SAndroid Build Coastguard Worker punpcklbw m6, m7, m8 786*c0909341SAndroid Build Coastguard Worker punpckhbw m7, m8 787*c0909341SAndroid Build Coastguard Workercglobal_label .main 788*c0909341SAndroid Build Coastguard Worker mova m8, [pw_128] 789*c0909341SAndroid Build Coastguard Worker psubw m0, m8 790*c0909341SAndroid Build Coastguard Worker psubw m1, m8 791*c0909341SAndroid Build Coastguard Worker psubw m2, m8 792*c0909341SAndroid Build Coastguard Worker psubw m3, m8 793*c0909341SAndroid Build Coastguard Worker psubw m4, m8 794*c0909341SAndroid Build Coastguard Worker psubw m5, m8 795*c0909341SAndroid Build Coastguard Worker psubw m6, m8 796*c0909341SAndroid Build Coastguard Worker psubw m7, m8 797*c0909341SAndroid Build Coastguard Worker psllw m8, 3 798*c0909341SAndroid Build Coastguard Worker psubw m9, m8 ; partial_sum_hv[0] 799*c0909341SAndroid Build Coastguard Worker 800*c0909341SAndroid Build Coastguard Worker paddw m8, m0, m1 801*c0909341SAndroid Build Coastguard Worker paddw m10, m2, m3 802*c0909341SAndroid Build Coastguard Worker paddw m8, m4 803*c0909341SAndroid Build Coastguard Worker paddw m10, m5 804*c0909341SAndroid Build Coastguard Worker paddw m8, m6 805*c0909341SAndroid Build Coastguard Worker paddw m10, m7 806*c0909341SAndroid Build Coastguard Worker paddw m8, m10 ; partial_sum_hv[1] 807*c0909341SAndroid Build Coastguard Worker 808*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m8 809*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m9 810*c0909341SAndroid Build Coastguard Worker phaddd m9, m8 811*c0909341SAndroid Build 
Coastguard Worker SWAP m8, m9 812*c0909341SAndroid Build Coastguard Worker MULLD m8, [div_table%+SUFFIX+48] 813*c0909341SAndroid Build Coastguard Worker 814*c0909341SAndroid Build Coastguard Worker pslldq m9, m1, 2 815*c0909341SAndroid Build Coastguard Worker psrldq m10, m1, 14 816*c0909341SAndroid Build Coastguard Worker pslldq m11, m2, 4 817*c0909341SAndroid Build Coastguard Worker psrldq m12, m2, 12 818*c0909341SAndroid Build Coastguard Worker pslldq m13, m3, 6 819*c0909341SAndroid Build Coastguard Worker psrldq m14, m3, 10 820*c0909341SAndroid Build Coastguard Worker paddw m9, m0 821*c0909341SAndroid Build Coastguard Worker paddw m10, m12 822*c0909341SAndroid Build Coastguard Worker paddw m11, m13 823*c0909341SAndroid Build Coastguard Worker paddw m10, m14 ; partial_sum_diag[0] top/right half 824*c0909341SAndroid Build Coastguard Worker paddw m9, m11 ; partial_sum_diag[0] top/left half 825*c0909341SAndroid Build Coastguard Worker pslldq m11, m4, 8 826*c0909341SAndroid Build Coastguard Worker psrldq m12, m4, 8 827*c0909341SAndroid Build Coastguard Worker pslldq m13, m5, 10 828*c0909341SAndroid Build Coastguard Worker psrldq m14, m5, 6 829*c0909341SAndroid Build Coastguard Worker paddw m9, m11 830*c0909341SAndroid Build Coastguard Worker paddw m10, m12 831*c0909341SAndroid Build Coastguard Worker paddw m9, m13 832*c0909341SAndroid Build Coastguard Worker paddw m10, m14 833*c0909341SAndroid Build Coastguard Worker pslldq m11, m6, 12 834*c0909341SAndroid Build Coastguard Worker psrldq m12, m6, 4 835*c0909341SAndroid Build Coastguard Worker pslldq m13, m7, 14 836*c0909341SAndroid Build Coastguard Worker psrldq m14, m7, 2 837*c0909341SAndroid Build Coastguard Worker paddw m9, m11 838*c0909341SAndroid Build Coastguard Worker paddw m10, m12 839*c0909341SAndroid Build Coastguard Worker paddw m9, m13 ; partial_sum_diag[0][0-7] 840*c0909341SAndroid Build Coastguard Worker paddw m10, m14 ; partial_sum_diag[0][8-14,zero] 841*c0909341SAndroid Build Coastguard Worker pshufb 
m10, [shufw_6543210x] 842*c0909341SAndroid Build Coastguard Worker punpckhwd m11, m9, m10 843*c0909341SAndroid Build Coastguard Worker punpcklwd m9, m10 844*c0909341SAndroid Build Coastguard Worker pmaddwd m11, m11 845*c0909341SAndroid Build Coastguard Worker pmaddwd m9, m9 846*c0909341SAndroid Build Coastguard Worker MULLD m11, [div_table%+SUFFIX+16] 847*c0909341SAndroid Build Coastguard Worker MULLD m9, [div_table%+SUFFIX+0] 848*c0909341SAndroid Build Coastguard Worker paddd m9, m11 ; cost[0a-d] 849*c0909341SAndroid Build Coastguard Worker 850*c0909341SAndroid Build Coastguard Worker pslldq m10, m0, 14 851*c0909341SAndroid Build Coastguard Worker psrldq m11, m0, 2 852*c0909341SAndroid Build Coastguard Worker pslldq m12, m1, 12 853*c0909341SAndroid Build Coastguard Worker psrldq m13, m1, 4 854*c0909341SAndroid Build Coastguard Worker pslldq m14, m2, 10 855*c0909341SAndroid Build Coastguard Worker psrldq m15, m2, 6 856*c0909341SAndroid Build Coastguard Worker paddw m10, m12 857*c0909341SAndroid Build Coastguard Worker paddw m11, m13 858*c0909341SAndroid Build Coastguard Worker paddw m10, m14 859*c0909341SAndroid Build Coastguard Worker paddw m11, m15 860*c0909341SAndroid Build Coastguard Worker pslldq m12, m3, 8 861*c0909341SAndroid Build Coastguard Worker psrldq m13, m3, 8 862*c0909341SAndroid Build Coastguard Worker pslldq m14, m4, 6 863*c0909341SAndroid Build Coastguard Worker psrldq m15, m4, 10 864*c0909341SAndroid Build Coastguard Worker paddw m10, m12 865*c0909341SAndroid Build Coastguard Worker paddw m11, m13 866*c0909341SAndroid Build Coastguard Worker paddw m10, m14 867*c0909341SAndroid Build Coastguard Worker paddw m11, m15 868*c0909341SAndroid Build Coastguard Worker pslldq m12, m5, 4 869*c0909341SAndroid Build Coastguard Worker psrldq m13, m5, 12 870*c0909341SAndroid Build Coastguard Worker pslldq m14, m6, 2 871*c0909341SAndroid Build Coastguard Worker psrldq m15, m6, 14 872*c0909341SAndroid Build Coastguard Worker paddw m10, m12 873*c0909341SAndroid 
Build Coastguard Worker paddw m11, m13 874*c0909341SAndroid Build Coastguard Worker paddw m10, m14 875*c0909341SAndroid Build Coastguard Worker paddw m11, m15 ; partial_sum_diag[1][8-14,zero] 876*c0909341SAndroid Build Coastguard Worker paddw m10, m7 ; partial_sum_diag[1][0-7] 877*c0909341SAndroid Build Coastguard Worker pshufb m11, [shufw_6543210x] 878*c0909341SAndroid Build Coastguard Worker punpckhwd m12, m10, m11 879*c0909341SAndroid Build Coastguard Worker punpcklwd m10, m11 880*c0909341SAndroid Build Coastguard Worker pmaddwd m12, m12 881*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m10 882*c0909341SAndroid Build Coastguard Worker MULLD m12, [div_table%+SUFFIX+16] 883*c0909341SAndroid Build Coastguard Worker MULLD m10, [div_table%+SUFFIX+0] 884*c0909341SAndroid Build Coastguard Worker paddd m10, m12 ; cost[4a-d] 885*c0909341SAndroid Build Coastguard Worker phaddd m9, m10 ; cost[0a/b,4a/b] 886*c0909341SAndroid Build Coastguard Worker 887*c0909341SAndroid Build Coastguard Worker paddw m10, m0, m1 888*c0909341SAndroid Build Coastguard Worker paddw m11, m2, m3 889*c0909341SAndroid Build Coastguard Worker paddw m12, m4, m5 890*c0909341SAndroid Build Coastguard Worker paddw m13, m6, m7 891*c0909341SAndroid Build Coastguard Worker phaddw m0, m4 892*c0909341SAndroid Build Coastguard Worker phaddw m1, m5 893*c0909341SAndroid Build Coastguard Worker phaddw m2, m6 894*c0909341SAndroid Build Coastguard Worker phaddw m3, m7 895*c0909341SAndroid Build Coastguard Worker 896*c0909341SAndroid Build Coastguard Worker ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) 897*c0909341SAndroid Build Coastguard Worker pslldq m4, m11, 2 898*c0909341SAndroid Build Coastguard Worker psrldq m5, m11, 14 899*c0909341SAndroid Build Coastguard Worker pslldq m6, m12, 4 900*c0909341SAndroid Build Coastguard Worker psrldq m7, m12, 12 901*c0909341SAndroid Build Coastguard Worker pslldq m14, m13, 6 902*c0909341SAndroid Build Coastguard Worker psrldq m15, m13, 10 
903*c0909341SAndroid Build Coastguard Worker paddw m4, m10 904*c0909341SAndroid Build Coastguard Worker paddw m5, m7 905*c0909341SAndroid Build Coastguard Worker paddw m4, m6 906*c0909341SAndroid Build Coastguard Worker paddw m5, m15 ; partial_sum_alt[3] right 907*c0909341SAndroid Build Coastguard Worker paddw m4, m14 ; partial_sum_alt[3] left 908*c0909341SAndroid Build Coastguard Worker pshuflw m6, m5, q3012 909*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m4 910*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m6 911*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 912*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 913*c0909341SAndroid Build Coastguard Worker MULLD m5, [div_table%+SUFFIX+48] 914*c0909341SAndroid Build Coastguard Worker MULLD m4, [div_table%+SUFFIX+32] 915*c0909341SAndroid Build Coastguard Worker paddd m4, m5 ; cost[7a-d] 916*c0909341SAndroid Build Coastguard Worker 917*c0909341SAndroid Build Coastguard Worker pslldq m5, m10, 6 918*c0909341SAndroid Build Coastguard Worker psrldq m6, m10, 10 919*c0909341SAndroid Build Coastguard Worker pslldq m7, m11, 4 920*c0909341SAndroid Build Coastguard Worker psrldq m10, m11, 12 921*c0909341SAndroid Build Coastguard Worker pslldq m11, m12, 2 922*c0909341SAndroid Build Coastguard Worker psrldq m12, 14 923*c0909341SAndroid Build Coastguard Worker paddw m5, m7 924*c0909341SAndroid Build Coastguard Worker paddw m6, m10 925*c0909341SAndroid Build Coastguard Worker paddw m5, m11 926*c0909341SAndroid Build Coastguard Worker paddw m6, m12 927*c0909341SAndroid Build Coastguard Worker paddw m5, m13 928*c0909341SAndroid Build Coastguard Worker pshuflw m7, m6, q3012 929*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m5 930*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m7 931*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m6 932*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 933*c0909341SAndroid Build Coastguard Worker MULLD m6, [div_table%+SUFFIX+48] 
934*c0909341SAndroid Build Coastguard Worker MULLD m5, [div_table%+SUFFIX+32] 935*c0909341SAndroid Build Coastguard Worker paddd m5, m6 ; cost[5a-d] 936*c0909341SAndroid Build Coastguard Worker 937*c0909341SAndroid Build Coastguard Worker pslldq m6, m1, 2 938*c0909341SAndroid Build Coastguard Worker psrldq m7, m1, 14 939*c0909341SAndroid Build Coastguard Worker pslldq m10, m2, 4 940*c0909341SAndroid Build Coastguard Worker psrldq m11, m2, 12 941*c0909341SAndroid Build Coastguard Worker pslldq m12, m3, 6 942*c0909341SAndroid Build Coastguard Worker psrldq m13, m3, 10 943*c0909341SAndroid Build Coastguard Worker paddw m6, m0 944*c0909341SAndroid Build Coastguard Worker paddw m7, m11 945*c0909341SAndroid Build Coastguard Worker paddw m6, m10 946*c0909341SAndroid Build Coastguard Worker paddw m7, m13 ; partial_sum_alt[3] right 947*c0909341SAndroid Build Coastguard Worker paddw m6, m12 ; partial_sum_alt[3] left 948*c0909341SAndroid Build Coastguard Worker pshuflw m10, m7, q3012 949*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m6 950*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m10 951*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 952*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m6 953*c0909341SAndroid Build Coastguard Worker MULLD m7, [div_table%+SUFFIX+48] 954*c0909341SAndroid Build Coastguard Worker MULLD m6, [div_table%+SUFFIX+32] 955*c0909341SAndroid Build Coastguard Worker paddd m6, m7 ; cost[1a-d] 956*c0909341SAndroid Build Coastguard Worker 957*c0909341SAndroid Build Coastguard Worker pshufd m0, m0, q1032 958*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q1032 959*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q1032 960*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q1032 961*c0909341SAndroid Build Coastguard Worker 962*c0909341SAndroid Build Coastguard Worker pslldq m10, m0, 6 963*c0909341SAndroid Build Coastguard Worker psrldq m11, m0, 10 964*c0909341SAndroid Build Coastguard Worker pslldq m12, 
m1, 4 965*c0909341SAndroid Build Coastguard Worker psrldq m13, m1, 12 966*c0909341SAndroid Build Coastguard Worker pslldq m14, m2, 2 967*c0909341SAndroid Build Coastguard Worker psrldq m2, 14 968*c0909341SAndroid Build Coastguard Worker paddw m10, m12 969*c0909341SAndroid Build Coastguard Worker paddw m11, m13 970*c0909341SAndroid Build Coastguard Worker paddw m10, m14 971*c0909341SAndroid Build Coastguard Worker paddw m11, m2 972*c0909341SAndroid Build Coastguard Worker paddw m10, m3 973*c0909341SAndroid Build Coastguard Worker pshuflw m12, m11, q3012 974*c0909341SAndroid Build Coastguard Worker punpckhwd m11, m10 975*c0909341SAndroid Build Coastguard Worker punpcklwd m10, m12 976*c0909341SAndroid Build Coastguard Worker pmaddwd m11, m11 977*c0909341SAndroid Build Coastguard Worker pmaddwd m10, m10 978*c0909341SAndroid Build Coastguard Worker MULLD m11, [div_table%+SUFFIX+48] 979*c0909341SAndroid Build Coastguard Worker MULLD m10, [div_table%+SUFFIX+32] 980*c0909341SAndroid Build Coastguard Worker paddd m10, m11 ; cost[3a-d] 981*c0909341SAndroid Build Coastguard Worker 982*c0909341SAndroid Build Coastguard Worker phaddd m9, m8 ; cost[0,4,2,6] 983*c0909341SAndroid Build Coastguard Worker phaddd m6, m10 984*c0909341SAndroid Build Coastguard Worker phaddd m5, m4 985*c0909341SAndroid Build Coastguard Worker phaddd m6, m5 ; cost[1,3,5,7] 986*c0909341SAndroid Build Coastguard Worker pshufd m4, m9, q3120 987*c0909341SAndroid Build Coastguard Worker 988*c0909341SAndroid Build Coastguard Worker ; now find the best cost 989*c0909341SAndroid Build Coastguard Worker %if cpuflag(sse4) 990*c0909341SAndroid Build Coastguard Worker pmaxsd m9, m6 991*c0909341SAndroid Build Coastguard Worker pshufd m0, m9, q1032 992*c0909341SAndroid Build Coastguard Worker pmaxsd m0, m9 993*c0909341SAndroid Build Coastguard Worker pshufd m1, m0, q2301 994*c0909341SAndroid Build Coastguard Worker pmaxsd m0, m1 ; best cost 995*c0909341SAndroid Build Coastguard Worker %else 996*c0909341SAndroid Build 
Coastguard Worker pcmpgtd m0, m9, m6 997*c0909341SAndroid Build Coastguard Worker pand m9, m0 998*c0909341SAndroid Build Coastguard Worker pandn m0, m6 999*c0909341SAndroid Build Coastguard Worker por m9, m0 1000*c0909341SAndroid Build Coastguard Worker pshufd m1, m9, q1032 1001*c0909341SAndroid Build Coastguard Worker pcmpgtd m0, m9, m1 1002*c0909341SAndroid Build Coastguard Worker pand m9, m0 1003*c0909341SAndroid Build Coastguard Worker pandn m0, m1 1004*c0909341SAndroid Build Coastguard Worker por m9, m0 1005*c0909341SAndroid Build Coastguard Worker pshufd m1, m9, q2301 1006*c0909341SAndroid Build Coastguard Worker pcmpgtd m0, m9, m1 1007*c0909341SAndroid Build Coastguard Worker pand m9, m0 1008*c0909341SAndroid Build Coastguard Worker pandn m0, m1 1009*c0909341SAndroid Build Coastguard Worker por m0, m9 1010*c0909341SAndroid Build Coastguard Worker %endif 1011*c0909341SAndroid Build Coastguard Worker 1012*c0909341SAndroid Build Coastguard Worker ; get direction and variance 1013*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m4, m6 1014*c0909341SAndroid Build Coastguard Worker punpckldq m4, m6 1015*c0909341SAndroid Build Coastguard Worker psubd m2, m0, m1 1016*c0909341SAndroid Build Coastguard Worker psubd m3, m0, m4 1017*c0909341SAndroid Build Coastguard Worker%if WIN64 1018*c0909341SAndroid Build Coastguard Worker WIN64_RESTORE_XMM 1019*c0909341SAndroid Build Coastguard Worker %define tmp rsp+stack_offset+8 1020*c0909341SAndroid Build Coastguard Worker%else 1021*c0909341SAndroid Build Coastguard Worker %define tmp rsp-40 1022*c0909341SAndroid Build Coastguard Worker%endif 1023*c0909341SAndroid Build Coastguard Worker mova [tmp+0x00], m2 ; emulate ymm in stack 1024*c0909341SAndroid Build Coastguard Worker mova [tmp+0x10], m3 1025*c0909341SAndroid Build Coastguard Worker pcmpeqd m1, m0 ; compute best cost mask 1026*c0909341SAndroid Build Coastguard Worker pcmpeqd m4, m0 1027*c0909341SAndroid Build Coastguard Worker packssdw m4, m1 1028*c0909341SAndroid 
; convert the best-cost mask built above into a direction index and variance
    pmovmskb        eax, m4                 ; get byte-idx from mask
    tzcnt           eax, eax
    mov             r1d, [tmp+rax*2]        ; get idx^4 complement from emulated ymm
    shr             eax, 1                  ; get direction by converting byte-idx to word-idx
    shr             r1d, 10
    mov             [varq], r1d
 %else
;-----------------------------------------------------------------------------
; cdef_dir_8bpc, stack-based variant: the %else side of a conditional that is
; opened above this point. It only uses m0-m7 and spills through esp.
;
; cglobal parameters (x86inc): 2 arguments loaded into registers (src, stride),
; 4 GPRs, 8 vector registers, 96 bytes of stack. var is declared as the third
; named argument but is only fetched (mov vard, varm) right before the store.
;
; In:   srcq    = first of 8 rows; 8 bytes are read from each row
;       strideq = byte distance between consecutive rows
;       varm    = address that receives the 32-bit variance
; Out:  eax     = direction index
;       [var]   = (best cost - cost of direction idx^4) >> 10
;
; Stack slots (16 bytes each):
;   [esp+0x00]  row 0 (words, pw_128 subtracted) -> phaddw(row 0, row 4)
;               -> cost[1a-d] -> first half of the emulated ymm
;   [esp+0x10]  row 1 -> phaddw(row 1, row 5) -> second half of the emulated ymm
;   [esp+0x20]  row 2 -> phaddw(row 2, row 6)
;   [esp+0x30]  scaled partial_sum_hv squares -> cost[0,4,2,6]
;   [esp+0x40]  cost[0a-d] -> cost[7a-d]
;   [esp+0x50]  row 3 -> cost[5a-d]
;-----------------------------------------------------------------------------
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
%define base r2-shufw_6543210x
    LEA             r2, shufw_6543210x      ; constants are addressed relative to r2 via "base"
    pxor            m0, m0
    lea             stride3q, [strideq*3]

    ; rows 0-3: two rows per register
    movq            m5, [srcq+strideq*0]
    movhps          m5, [srcq+strideq*1]
    movq            m7, [srcq+strideq*2]
    movhps          m7, [srcq+stride3q]
    mova            m1, [base+pw_128]
    psadbw          m2, m5, m0              ; byte sums of rows 0 and 1 (SAD against zero)
    psadbw          m3, m7, m0              ; byte sums of rows 2 and 3
    packssdw        m2, m3
    punpcklbw       m4, m5, m0              ; widen bytes to words: row 0
    punpckhbw       m5, m0                  ; row 1
    punpcklbw       m6, m7, m0              ; row 2
    punpckhbw       m7, m0                  ; row 3
    psubw           m4, m1
    psubw           m5, m1
    psubw           m6, m1
    psubw           m7, m1

    ; spill rows 0-3 so that m4-m7 can hold rows 4-7
    mova            [esp+0x00], m4
    mova            [esp+0x10], m5
    mova            [esp+0x20], m6
    mova            [esp+0x50], m7

    ; rows 4-7: same load/sum/widen sequence
    lea             srcq, [srcq+strideq*4]
    movq            m5, [srcq+strideq*0]
    movhps          m5, [srcq+strideq*1]
    movq            m7, [srcq+strideq*2]
    movhps          m7, [srcq+stride3q]
    psadbw          m3, m5, m0
    psadbw          m0, m7
    packssdw        m3, m0
    pxor            m0, m0                  ; m0 was used as a psadbw destination, clear it again
    punpcklbw       m4, m5, m0
    punpckhbw       m5, m0
    punpcklbw       m6, m7, m0
    punpckhbw       m7, m0
; State at .main: m4-m7 = rows 4-7 as words (pw_128 not yet subtracted),
; m1 = pw_128, m2/m3 = packed row sums of rows 0-3 / 4-7, rows 0-3 on the stack.
; NOTE(review): presumably a secondary entry point for code outside this
; function that prepares the same state - confirm against the callers.
cglobal_label .main
    psubw           m4, m1
    psubw           m5, m1
    psubw           m6, m1
    psubw           m7, m1
    packssdw        m2, m3                  ; eight row sums, one per word
    psllw           m1, 3                   ; 8 * pw_128: bias carried by a sum of 8 pixels
    psubw           m2, m1                  ; partial_sum_hv[0]
    pmaddwd         m2, m2

    mova            m3, [esp+0x50]
    mova            m0, [esp+0x00]
    paddw           m0, [esp+0x10]
    paddw           m1, m3, [esp+0x20]
    paddw           m0, m4
    paddw           m1, m5
    paddw           m0, m6
    paddw           m1, m7
    paddw           m0, m1                  ; partial_sum_hv[1]
    pmaddwd         m0, m0

    phaddd          m2, m0
    ; div_table%+SUFFIX selects the table matching the active INIT_XMM flavour
    MULLD           m2, [base+div_table%+SUFFIX+48]
    mova            [esp+0x30], m2

    ; first diagonal: each row is shifted by a per-row number of words and
    ; accumulated; m0 collects the low 8 sums, m1 the words shifted out
    mova            m1, [esp+0x10]
    pslldq          m0, m1, 2
    psrldq          m1, 14
    paddw           m0, [esp+0x00]
    pslldq          m2, m3, 6
    psrldq          m3, 10
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x20]
    pslldq          m2, m3, 4
    psrldq          m3, 12
    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
    pslldq          m2, m4, 8
    psrldq          m3, m4, 8
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m5, 10
    psrldq          m3, m5, 6
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m6, 12
    psrldq          m3, m6, 4
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m7, 14
    psrldq          m3, m7, 2
    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
    mova            m3, [esp+0x50]          ; reload row 3, m3 was consumed as scratch
    pshufb          m1, [base+shufw_6543210x]
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+16]
    MULLD           m0, [base+div_table%+SUFFIX+ 0]
    paddd           m0, m2                  ; cost[0a-d]
    mova            [esp+0x40], m0

    ; second diagonal: same scheme with the shift amounts mirrored
    mova            m1, [esp+0x00]
    pslldq          m0, m1, 14
    psrldq          m1, 2
    paddw           m0, m7
    pslldq          m2, m3, 8
    psrldq          m3, 8
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x20]
    pslldq          m2, m3, 10
    psrldq          m3, 6
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x10]
    pslldq          m2, m3, 12
    psrldq          m3, 4
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m4, 6
    psrldq          m3, m4, 10
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m5, 4
    psrldq          m3, m5, 12
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m6, 2
    psrldq          m3, m6, 14
    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
    mova            m3, [esp+0x50]          ; reload row 3 again
    pshufb          m1, [base+shufw_6543210x]
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+16]
    MULLD           m0, [base+div_table%+SUFFIX+ 0]
    paddd           m0, m2                  ; cost[4a-d]
    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
    mova            [esp+0x30], m1

    ; build the two row combinations used by the remaining four costs:
    ; horizontal pair sums (phaddw) and sums of adjacent rows (paddw)
    phaddw          m0, [esp+0x00], m4      ; pair sums of rows 0 | 4
    phaddw          m1, [esp+0x10], m5      ; pair sums of rows 1 | 5
    paddw           m4, m5                  ; rows 4+5
    mova            m2, [esp+0x20]
    paddw           m5, m2, m3              ; rows 2+3
    phaddw          m2, m6                  ; pair sums of rows 2 | 6
    paddw           m6, m7                  ; rows 6+7
    phaddw          m3, m7                  ; pair sums of rows 3 | 7
    mova            m7, [esp+0x00]
    paddw           m7, [esp+0x10]          ; rows 0+1
    mova            [esp+0x00], m0          ; the raw rows are no longer needed,
    mova            [esp+0x10], m1          ; their slots now hold the pair sums
    mova            [esp+0x20], m2

    pslldq          m1, m4, 4
    pslldq          m2, m6, 6
    pslldq          m0, m5, 2
    paddw           m1, m2
    paddw           m0, m7
    psrldq          m2, m5, 14
    paddw           m0, m1                  ; partial_sum_alt[3] left
    psrldq          m1, m4, 12
    paddw           m1, m2
    psrldq          m2, m6, 10
    paddw           m1, m2                  ; partial_sum_alt[3] right
    pshuflw         m1, m1, q3012
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m2                  ; cost[7a-d]
    mova            [esp+0x40], m0

    pslldq          m0, m7, 6
    psrldq          m7, 10
    pslldq          m1, m5, 4
    psrldq          m5, 12
    pslldq          m2, m4, 2
    psrldq          m4, 14
    paddw           m0, m6
    paddw           m7, m5
    paddw           m0, m1
    paddw           m7, m4
    paddw           m0, m2
    pshuflw         m2, m7, q3012
    punpckhwd       m7, m0
    punpcklwd       m0, m2
    pmaddwd         m7, m7
    pmaddwd         m0, m0
    MULLD           m7, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m7                  ; cost[5a-d]
    mova            [esp+0x50], m0

    mova            m7, [esp+0x10]
    mova            m2, [esp+0x20]
    pslldq          m0, m7, 2
    psrldq          m7, 14
    pslldq          m4, m2, 4
    psrldq          m2, 12
    pslldq          m5, m3, 6
    psrldq          m6, m3, 10
    paddw           m0, [esp+0x00]
    paddw           m7, m2
    paddw           m4, m5
    ; NOTE(review): the two labels below repeat "partial_sum_alt[3]" from the
    ; cost[7a-d] block although this sum feeds cost[1a-d]; likely a copy-paste
    ; leftover - confirm the intended index.
    paddw           m7, m6                  ; partial_sum_alt[3] right
    paddw           m0, m4                  ; partial_sum_alt[3] left
    pshuflw         m2, m7, q3012
    punpckhwd       m7, m0
    punpcklwd       m0, m2
    pmaddwd         m7, m7
    pmaddwd         m0, m0
    MULLD           m7, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m7                  ; cost[1a-d]
    SWAP            m0, m4                  ; assemble-time rename: m4 now names cost[1a-d]

    pshufd          m0, [esp+0x00], q1032   ; swap the 64-bit halves of the pair sums
    pshufd          m1, [esp+0x10], q1032
    pshufd          m2, [esp+0x20], q1032
    pshufd          m3, m3, q1032
    mova            [esp+0x00], m4          ; park cost[1a-d], m4 is reused below

    pslldq          m4, m0, 6
    psrldq          m0, 10
    pslldq          m5, m1, 4
    psrldq          m1, 12
    pslldq          m6, m2, 2
    psrldq          m2, 14
    paddw           m4, m3
    paddw           m0, m1
    paddw           m5, m6
    paddw           m0, m2
    paddw           m4, m5
    pshuflw         m2, m0, q3012
    punpckhwd       m0, m4
    punpcklwd       m4, m2
    pmaddwd         m0, m0
    pmaddwd         m4, m4
    MULLD           m0, [base+div_table%+SUFFIX+48]
    MULLD           m4, [base+div_table%+SUFFIX+32]
    paddd           m4, m0                  ; cost[3a-d]

    mova            m1, [esp+0x00]          ; cost[1a-d]
    mova            m2, [esp+0x50]          ; cost[5a-d]
    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
    phaddd          m1, m4
    phaddd          m2, [esp+0x40]          ; cost[1,3,5,7]
    phaddd          m1, m2
    pshufd          m2, m0, q3120

    ; now find the best cost
 %if cpuflag(sse4)
    pmaxsd          m0, m1
    pshufd          m3, m0, q1032
    pmaxsd          m3, m0
    pshufd          m0, m3, q2301
    pmaxsd          m0, m3
 %else
    ; no pmaxsd here: select the larger dword with a compare mask and and/andn/or
    pcmpgtd         m3, m0, m1
    pand            m0, m3
    pandn           m3, m1
    por             m0, m3
    pshufd          m4, m0, q1032
    pcmpgtd         m3, m0, m4
    pand            m0, m3
    pandn           m3, m4
    por             m0, m3
    pshufd          m4, m0, q2301
    pcmpgtd         m3, m0, m4
    pand            m0, m3
    pandn           m3, m4
    por             m0, m3
 %endif

    ; get direction and variance
    mov             vard, varm              ; var was not loaded by cglobal (only 2 args are)
    punpckhdq       m3, m2, m1
    punpckldq       m2, m1
    psubd           m1, m0, m3
    psubd           m4, m0, m2
    mova            [esp+0x00], m1          ; emulate ymm in stack
    mova            [esp+0x10], m4
    pcmpeqd         m3, m0                  ; compute best cost mask
    pcmpeqd         m2, m0
    packssdw        m2, m3
    pmovmskb        eax, m2                 ; get byte-idx from mask
    tzcnt           eax, eax
    mov             r1d, [esp+eax*2]        ; get idx^4 complement from emulated ymm
    shr             eax, 1                  ; get direction by converting byte-idx to word-idx
    shr             r1d, 10
    mov             [vard], r1d
 %endif

    RET
%endmacro

; Instantiate the kernels once per instruction set. CDEF_DIR is not emitted
; for sse2: its body uses pshufb/phaddw/phaddd, which are SSSE3 instructions.
INIT_XMM sse4
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM ssse3
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM sse2
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4