; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; Shuffle masks and broadcast constants shared by the filters in this file.
;
; pshufb control masks. wiener_filter7_16bpc keeps wiener_shufA/B/C/D resident
; (m6/m7/m8/m9) and uses them to gather the 16-bit taps for each pmaddwd.
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
; Left-edge masks: bytes 0-1 (the first 16-bit element) are repeated so the
; padding comes from the row itself rather than from memory before its start.
wiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
wiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
; Identity byte index; .extend_right uses it as the pminub upper bound when
; building its right-edge shuffle masks.
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

; Byte-pair, word and dword broadcasts (one full 16-byte vector each).
pb_m14_m13:    times 8 db -14,-13
pb_m10_m9:     times 8 db -10, -9
pb_m6_m5:      times 8 db  -6, -5
pb_m2_m1:      times 8 db  -2, -1
pb_2_3:        times 8 db   2,  3
pb_6_7:        times 8 db   6,  7
pw_256:        times 8 dw 256
pw_1023:       times 8 dw 1023
pd_8:          times 4 dd 8
pd_4096:       times 4 dd 4096
pd_34816:      times 4 dd 34816
pd_m262128:    times 4 dd -262128 ; (1 << 4) - (1 << 18)
pd_0xffff:     times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7
pd_0xfffffff0: times 4 dd 0xfffffff0

; Both tables are indexed with (pixel_max >> 11): wiener_shifts in 8-byte steps,
; wiener_round in 4-byte steps. Each holds two entries
; (presumably 10-bit and 12-bit content -- TODO confirm against the callers).
wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
wiener_round:  dd 1049600, 1048832

cextern sgr_x_by_x

SECTION .text

; movif64 dst, src
; Emits `mov dst, src` only when ARCH_X86_64 is set; expands to nothing
; otherwise.
%macro movif64 2 ; dst, src
 %if ARCH_X86_64
    mov             %1, %2
 %endif
%endmacro

; movif32 dst, src
; Emits `mov dst, src` only when ARCH_X86_32 is set; expands to nothing
; otherwise.
%macro movif32 2 ; dst, src
 %if ARCH_X86_32
    mov             %1, %2
 %endif
%endmacro

INIT_XMM ssse3
%if ARCH_X86_32
DECLARE_REG_TMP 5, 6
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 13*16
 %else
  %assign extra_stack 12*16
 %endif
cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
                              dst, stride, left, lpf, w, flt
 %if STACK_ALIGNMENT < 16
  %define lpfm        dword [esp+calloff+16*12+ 0]
  %define wm          dword [esp+calloff+16*12+ 4]
  %define hd          dword [esp+calloff+16*12+ 8]
  %define edgeb        byte [esp+calloff+16*12+12]
  %define edged       dword [esp+calloff+16*12+12]
 %else
  %define hd dword r5m
  %define edgeb byte r7m
 %endif
%define PICmem dword [esp+calloff+4*0] 99*c0909341SAndroid Build Coastguard Worker %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers 100*c0909341SAndroid Build Coastguard Worker %define t1m dword [esp+calloff+4*2] 101*c0909341SAndroid Build Coastguard Worker %define t2m dword [esp+calloff+4*3] 102*c0909341SAndroid Build Coastguard Worker %define t3m dword [esp+calloff+4*4] 103*c0909341SAndroid Build Coastguard Worker %define t4m dword [esp+calloff+4*5] 104*c0909341SAndroid Build Coastguard Worker %define t5m dword [esp+calloff+4*6] 105*c0909341SAndroid Build Coastguard Worker %define t6m dword [esp+calloff+4*7] 106*c0909341SAndroid Build Coastguard Worker %define t2 t2m 107*c0909341SAndroid Build Coastguard Worker %define t3 t3m 108*c0909341SAndroid Build Coastguard Worker %define t4 t4m 109*c0909341SAndroid Build Coastguard Worker %define t5 t5m 110*c0909341SAndroid Build Coastguard Worker %define t6 t6m 111*c0909341SAndroid Build Coastguard Worker %define m8 [esp+calloff+16*2] 112*c0909341SAndroid Build Coastguard Worker %define m9 [esp+calloff+16*3] 113*c0909341SAndroid Build Coastguard Worker %define m10 [esp+calloff+16*4] 114*c0909341SAndroid Build Coastguard Worker %define m11 [esp+calloff+16*5] 115*c0909341SAndroid Build Coastguard Worker %define m12 [esp+calloff+16*6] 116*c0909341SAndroid Build Coastguard Worker %define m13 [esp+calloff+16*7] 117*c0909341SAndroid Build Coastguard Worker %define m14 [esp+calloff+16*8] 118*c0909341SAndroid Build Coastguard Worker %define m15 [esp+calloff+16*9] 119*c0909341SAndroid Build Coastguard Worker %define r10 r4 120*c0909341SAndroid Build Coastguard Worker %define base t0-wiener_shifts 121*c0909341SAndroid Build Coastguard Worker %assign calloff 0 122*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 123*c0909341SAndroid Build Coastguard Worker mov wd, [rstk+stack_offset+20] 124*c0909341SAndroid Build Coastguard Worker mov wm, wd 125*c0909341SAndroid Build Coastguard Worker mov r5, 
[rstk+stack_offset+24] 126*c0909341SAndroid Build Coastguard Worker mov hd, r5 127*c0909341SAndroid Build Coastguard Worker mov r5, [rstk+stack_offset+32] 128*c0909341SAndroid Build Coastguard Worker mov edged, r5 ; edge 129*c0909341SAndroid Build Coastguard Worker %endif 130*c0909341SAndroid Build Coastguard Worker%else 131*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers 132*c0909341SAndroid Build Coastguard Workercglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ 133*c0909341SAndroid Build Coastguard Worker w, h, edge, flt 134*c0909341SAndroid Build Coastguard Worker %define base 135*c0909341SAndroid Build Coastguard Worker%endif 136*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 137*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 138*c0909341SAndroid Build Coastguard Worker%endif 139*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 140*c0909341SAndroid Build Coastguard Worker mov fltq, r6mp 141*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 142*c0909341SAndroid Build Coastguard Worker mov edged, r7m 143*c0909341SAndroid Build Coastguard Worker mov t3d, r8m ; pixel_max 144*c0909341SAndroid Build Coastguard Worker movq m13, [fltq] 145*c0909341SAndroid Build Coastguard Worker movq m15, [fltq+16] 146*c0909341SAndroid Build Coastguard Worker%else 147*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 148*c0909341SAndroid Build Coastguard Worker mov t0, [rstk+stack_offset+28] 149*c0909341SAndroid Build Coastguard Worker mov t1, [rstk+stack_offset+36] ; pixel_max 150*c0909341SAndroid Build Coastguard Worker movq m1, [t0] ; fx 151*c0909341SAndroid Build Coastguard Worker movq m3, [t0+16] ; fy 152*c0909341SAndroid Build Coastguard Worker LEA t0, wiener_shifts 153*c0909341SAndroid Build Coastguard Worker %else 154*c0909341SAndroid Build Coastguard Worker mov fltq, r6m 155*c0909341SAndroid Build Coastguard 
Worker movq m1, [fltq] 156*c0909341SAndroid Build Coastguard Worker movq m3, [fltq+16] 157*c0909341SAndroid Build Coastguard Worker LEA t0, wiener_shifts 158*c0909341SAndroid Build Coastguard Worker mov t1, r8m ; pixel_max 159*c0909341SAndroid Build Coastguard Worker %endif 160*c0909341SAndroid Build Coastguard Worker mov PICmem, t0 161*c0909341SAndroid Build Coastguard Worker%endif 162*c0909341SAndroid Build Coastguard Worker mova m6, [base+wiener_shufA] 163*c0909341SAndroid Build Coastguard Worker mova m7, [base+wiener_shufB] 164*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 165*c0909341SAndroid Build Coastguard Worker lea t4, [wiener_shifts] 166*c0909341SAndroid Build Coastguard Worker add wd, wd 167*c0909341SAndroid Build Coastguard Worker pshufd m12, m13, q0000 ; x0 x1 168*c0909341SAndroid Build Coastguard Worker pshufd m13, m13, q1111 ; x2 x3 169*c0909341SAndroid Build Coastguard Worker pshufd m14, m15, q0000 ; y0 y1 170*c0909341SAndroid Build Coastguard Worker pshufd m15, m15, q1111 ; y2 y3 171*c0909341SAndroid Build Coastguard Worker mova m8, [wiener_shufC] 172*c0909341SAndroid Build Coastguard Worker mova m9, [wiener_shufD] 173*c0909341SAndroid Build Coastguard Worker add lpfq, wq 174*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq+16] 175*c0909341SAndroid Build Coastguard Worker add dstq, wq 176*c0909341SAndroid Build Coastguard Worker neg wq 177*c0909341SAndroid Build Coastguard Worker shr t3d, 11 178*c0909341SAndroid Build Coastguard Worker %define base t4-wiener_shifts 179*c0909341SAndroid Build Coastguard Worker movd m10, [base+wiener_round+t3*4] 180*c0909341SAndroid Build Coastguard Worker movq m11, [base+wiener_shifts+t3*8] 181*c0909341SAndroid Build Coastguard Worker pshufd m10, m10, q0000 182*c0909341SAndroid Build Coastguard Worker pshufd m0, m11, q0000 183*c0909341SAndroid Build Coastguard Worker pshufd m11, m11, q1111 184*c0909341SAndroid Build Coastguard Worker pmullw m12, m0 ; upshift filter coefs to make the 
185*c0909341SAndroid Build Coastguard Worker pmullw m13, m0 ; horizontal downshift constant 186*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 187*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp] 188*c0909341SAndroid Build Coastguard Worker %define base 189*c0909341SAndroid Build Coastguard Worker %define wiener_lshuf7_mem [wiener_lshuf7] 190*c0909341SAndroid Build Coastguard Worker %define pd_m262128_mem [pd_m262128] 191*c0909341SAndroid Build Coastguard Worker%else 192*c0909341SAndroid Build Coastguard Worker add wd, wd 193*c0909341SAndroid Build Coastguard Worker mova m4, [base+wiener_shufC] 194*c0909341SAndroid Build Coastguard Worker mova m5, [base+wiener_shufD] 195*c0909341SAndroid Build Coastguard Worker pshufd m0, m1, q0000 196*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q1111 197*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q0000 198*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q1111 199*c0909341SAndroid Build Coastguard Worker mova m8, m4 200*c0909341SAndroid Build Coastguard Worker mova m9, m5 201*c0909341SAndroid Build Coastguard Worker mova m14, m2 202*c0909341SAndroid Build Coastguard Worker mova m15, m3 203*c0909341SAndroid Build Coastguard Worker shr t1, 11 204*c0909341SAndroid Build Coastguard Worker add lpfq, wq 205*c0909341SAndroid Build Coastguard Worker mova m3, [base+pd_m262128] 206*c0909341SAndroid Build Coastguard Worker movd m4, [base+wiener_round+t1*4] 207*c0909341SAndroid Build Coastguard Worker movq m5, [base+wiener_shifts+t1*8] 208*c0909341SAndroid Build Coastguard Worker lea t1, [esp+extra_stack+wq+16] 209*c0909341SAndroid Build Coastguard Worker add dstq, wq 210*c0909341SAndroid Build Coastguard Worker neg wq 211*c0909341SAndroid Build Coastguard Worker pshufd m4, m4, q0000 212*c0909341SAndroid Build Coastguard Worker pshufd m2, m5, q0000 213*c0909341SAndroid Build Coastguard Worker pshufd m5, m5, q1111 214*c0909341SAndroid Build 
Coastguard Worker mov wm, wq 215*c0909341SAndroid Build Coastguard Worker pmullw m0, m2 216*c0909341SAndroid Build Coastguard Worker pmullw m1, m2 217*c0909341SAndroid Build Coastguard Worker mova m2, [base+wiener_lshuf7] 218*c0909341SAndroid Build Coastguard Worker %define pd_m262128_mem [esp+calloff+16*10] 219*c0909341SAndroid Build Coastguard Worker mova pd_m262128_mem, m3 220*c0909341SAndroid Build Coastguard Worker mova m10, m4 221*c0909341SAndroid Build Coastguard Worker mova m11, m5 222*c0909341SAndroid Build Coastguard Worker mova m12, m0 223*c0909341SAndroid Build Coastguard Worker mova m13, m1 224*c0909341SAndroid Build Coastguard Worker %define wiener_lshuf7_mem [esp+calloff+16*11] 225*c0909341SAndroid Build Coastguard Worker mova wiener_lshuf7_mem, m2 226*c0909341SAndroid Build Coastguard Worker%endif 227*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 228*c0909341SAndroid Build Coastguard Worker jz .no_top 229*c0909341SAndroid Build Coastguard Worker call .h_top 230*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 231*c0909341SAndroid Build Coastguard Worker mov t6, t1 232*c0909341SAndroid Build Coastguard Worker mov t5, t1 233*c0909341SAndroid Build Coastguard Worker add t1, 384*2 234*c0909341SAndroid Build Coastguard Worker call .h_top 235*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 236*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 237*c0909341SAndroid Build Coastguard Worker mov t4, t1 238*c0909341SAndroid Build Coastguard Worker add t1, 384*2 239*c0909341SAndroid Build Coastguard Worker add r10, strideq 240*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 ; below 241*c0909341SAndroid Build Coastguard Worker call .h 242*c0909341SAndroid Build Coastguard Worker mov t3, t1 243*c0909341SAndroid Build Coastguard Worker mov t2, t1 244*c0909341SAndroid Build Coastguard Worker dec hd 245*c0909341SAndroid Build Coastguard Worker jz .v1 246*c0909341SAndroid Build Coastguard Worker add 
lpfq, strideq 247*c0909341SAndroid Build Coastguard Worker add t1, 384*2 248*c0909341SAndroid Build Coastguard Worker call .h 249*c0909341SAndroid Build Coastguard Worker mov t2, t1 250*c0909341SAndroid Build Coastguard Worker dec hd 251*c0909341SAndroid Build Coastguard Worker jz .v2 252*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 253*c0909341SAndroid Build Coastguard Worker add t1, 384*2 254*c0909341SAndroid Build Coastguard Worker call .h 255*c0909341SAndroid Build Coastguard Worker dec hd 256*c0909341SAndroid Build Coastguard Worker jz .v3 257*c0909341SAndroid Build Coastguard Worker.main: 258*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 259*c0909341SAndroid Build Coastguard Worker.main_loop: 260*c0909341SAndroid Build Coastguard Worker call .hv 261*c0909341SAndroid Build Coastguard Worker dec hd 262*c0909341SAndroid Build Coastguard Worker jnz .main_loop 263*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 264*c0909341SAndroid Build Coastguard Worker jz .v3 265*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 266*c0909341SAndroid Build Coastguard Worker call .hv_bottom 267*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 268*c0909341SAndroid Build Coastguard Worker call .hv_bottom 269*c0909341SAndroid Build Coastguard Worker.v1: 270*c0909341SAndroid Build Coastguard Worker call .v 271*c0909341SAndroid Build Coastguard Worker RET 272*c0909341SAndroid Build Coastguard Worker.no_top: 273*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 274*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 275*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 276*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 277*c0909341SAndroid Build Coastguard Worker call .h 278*c0909341SAndroid Build Coastguard Worker mov t6, t1 279*c0909341SAndroid Build Coastguard Worker mov t5, t1 280*c0909341SAndroid Build Coastguard Worker mov t4, t1 281*c0909341SAndroid Build Coastguard 
Worker mov t3, t1 282*c0909341SAndroid Build Coastguard Worker mov t2, t1 283*c0909341SAndroid Build Coastguard Worker dec hd 284*c0909341SAndroid Build Coastguard Worker jz .v1 285*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 286*c0909341SAndroid Build Coastguard Worker add t1, 384*2 287*c0909341SAndroid Build Coastguard Worker call .h 288*c0909341SAndroid Build Coastguard Worker mov t2, t1 289*c0909341SAndroid Build Coastguard Worker dec hd 290*c0909341SAndroid Build Coastguard Worker jz .v2 291*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 292*c0909341SAndroid Build Coastguard Worker add t1, 384*2 293*c0909341SAndroid Build Coastguard Worker call .h 294*c0909341SAndroid Build Coastguard Worker dec hd 295*c0909341SAndroid Build Coastguard Worker jz .v3 296*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 297*c0909341SAndroid Build Coastguard Worker call .hv 298*c0909341SAndroid Build Coastguard Worker dec hd 299*c0909341SAndroid Build Coastguard Worker jz .v3 300*c0909341SAndroid Build Coastguard Worker add t0, 384*8 301*c0909341SAndroid Build Coastguard Worker call .hv 302*c0909341SAndroid Build Coastguard Worker dec hd 303*c0909341SAndroid Build Coastguard Worker jnz .main 304*c0909341SAndroid Build Coastguard Worker.v3: 305*c0909341SAndroid Build Coastguard Worker call .v 306*c0909341SAndroid Build Coastguard Worker movif32 wq, wm 307*c0909341SAndroid Build Coastguard Worker.v2: 308*c0909341SAndroid Build Coastguard Worker call .v 309*c0909341SAndroid Build Coastguard Worker movif32 wq, wm 310*c0909341SAndroid Build Coastguard Worker jmp .v1 311*c0909341SAndroid Build Coastguard Worker.extend_right: 312*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8 313*c0909341SAndroid Build Coastguard Worker%assign calloff 8 314*c0909341SAndroid Build Coastguard Worker movif32 t0, PICmem 315*c0909341SAndroid Build Coastguard Worker pxor m0, m0 316*c0909341SAndroid Build Coastguard Worker movd m1, wd 
317*c0909341SAndroid Build Coastguard Worker mova m2, [base+pb_0to15] 318*c0909341SAndroid Build Coastguard Worker pshufb m1, m0 319*c0909341SAndroid Build Coastguard Worker mova m0, [base+pb_6_7] 320*c0909341SAndroid Build Coastguard Worker psubb m0, m1 321*c0909341SAndroid Build Coastguard Worker pminub m0, m2 322*c0909341SAndroid Build Coastguard Worker pshufb m3, m0 323*c0909341SAndroid Build Coastguard Worker mova m0, [base+pb_m2_m1] 324*c0909341SAndroid Build Coastguard Worker psubb m0, m1 325*c0909341SAndroid Build Coastguard Worker pminub m0, m2 326*c0909341SAndroid Build Coastguard Worker pshufb m4, m0 327*c0909341SAndroid Build Coastguard Worker mova m0, [base+pb_m10_m9] 328*c0909341SAndroid Build Coastguard Worker psubb m0, m1 329*c0909341SAndroid Build Coastguard Worker pminub m0, m2 330*c0909341SAndroid Build Coastguard Worker pshufb m5, m0 331*c0909341SAndroid Build Coastguard Worker movif32 t0, t0m 332*c0909341SAndroid Build Coastguard Worker ret 333*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4 334*c0909341SAndroid Build Coastguard Worker%assign calloff 4 335*c0909341SAndroid Build Coastguard Worker.h: 336*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 337*c0909341SAndroid Build Coastguard Worker movif32 wq, wm 338*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 339*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 340*c0909341SAndroid Build Coastguard Worker movq m3, [leftq] 341*c0909341SAndroid Build Coastguard Worker movhps m3, [lpfq+wq] 342*c0909341SAndroid Build Coastguard Worker add leftq, 8 343*c0909341SAndroid Build Coastguard Worker jmp .h_main 344*c0909341SAndroid Build Coastguard Worker.h_extend_left: 345*c0909341SAndroid Build Coastguard Worker mova m3, [lpfq+wq] ; avoid accessing memory located 346*c0909341SAndroid Build Coastguard Worker pshufb m3, wiener_lshuf7_mem ; before the start of the buffer 347*c0909341SAndroid Build Coastguard Worker jmp .h_main 
348*c0909341SAndroid Build Coastguard Worker.h_top: 349*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 350*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 351*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 352*c0909341SAndroid Build Coastguard Worker.h_loop: 353*c0909341SAndroid Build Coastguard Worker movu m3, [lpfq+wq-8] 354*c0909341SAndroid Build Coastguard Worker.h_main: 355*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+0] 356*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+8] 357*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 358*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 359*c0909341SAndroid Build Coastguard Worker cmp wd, -20 360*c0909341SAndroid Build Coastguard Worker jl .h_have_right 361*c0909341SAndroid Build Coastguard Worker call .extend_right 362*c0909341SAndroid Build Coastguard Worker.h_have_right: 363*c0909341SAndroid Build Coastguard Worker pshufb m0, m3, m6 364*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m7 365*c0909341SAndroid Build Coastguard Worker paddw m0, m1 366*c0909341SAndroid Build Coastguard Worker pshufb m3, m8 367*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m12 368*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m9 369*c0909341SAndroid Build Coastguard Worker paddw m3, m1 370*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m6 371*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 372*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m7 373*c0909341SAndroid Build Coastguard Worker paddw m1, m2 374*c0909341SAndroid Build Coastguard Worker mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) 375*c0909341SAndroid Build Coastguard Worker pshufb m4, m8 376*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m12 377*c0909341SAndroid Build Coastguard Worker pshufb m5, m9 378*c0909341SAndroid Build Coastguard Worker paddw m4, m5 379*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m13 
380*c0909341SAndroid Build Coastguard Worker paddd m0, m2 381*c0909341SAndroid Build Coastguard Worker paddd m1, m2 382*c0909341SAndroid Build Coastguard Worker paddd m0, m3 383*c0909341SAndroid Build Coastguard Worker paddd m1, m4 384*c0909341SAndroid Build Coastguard Worker psrad m0, 4 385*c0909341SAndroid Build Coastguard Worker psrad m1, 4 386*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 387*c0909341SAndroid Build Coastguard Worker psraw m0, 1 388*c0909341SAndroid Build Coastguard Worker mova [t1+wq], m0 389*c0909341SAndroid Build Coastguard Worker add wq, 16 390*c0909341SAndroid Build Coastguard Worker jl .h_loop 391*c0909341SAndroid Build Coastguard Worker movif32 wq, wm 392*c0909341SAndroid Build Coastguard Worker ret 393*c0909341SAndroid Build Coastguard WorkerALIGN function_align 394*c0909341SAndroid Build Coastguard Worker.hv: 395*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 396*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 397*c0909341SAndroid Build Coastguard Worker movif32 t0m, t0 398*c0909341SAndroid Build Coastguard Worker movif32 t1m, t1 399*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 400*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 401*c0909341SAndroid Build Coastguard Worker movq m3, [leftq] 402*c0909341SAndroid Build Coastguard Worker movhps m3, [lpfq+wq] 403*c0909341SAndroid Build Coastguard Worker add leftq, 8 404*c0909341SAndroid Build Coastguard Worker jmp .hv_main 405*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 406*c0909341SAndroid Build Coastguard Worker mova m3, [lpfq+wq] 407*c0909341SAndroid Build Coastguard Worker pshufb m3, wiener_lshuf7_mem 408*c0909341SAndroid Build Coastguard Worker jmp .hv_main 409*c0909341SAndroid Build Coastguard Worker.hv_bottom: 410*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 411*c0909341SAndroid Build Coastguard Worker movif32 t0m, t0 412*c0909341SAndroid Build Coastguard Worker movif32 t1m, t1 
413*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 414*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 415*c0909341SAndroid Build Coastguard Worker.hv_loop: 416*c0909341SAndroid Build Coastguard Worker movu m3, [lpfq+wq-8] 417*c0909341SAndroid Build Coastguard Worker.hv_main: 418*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+0] 419*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+8] 420*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 421*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 422*c0909341SAndroid Build Coastguard Worker cmp wd, -20 423*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 424*c0909341SAndroid Build Coastguard Worker call .extend_right 425*c0909341SAndroid Build Coastguard Worker.hv_have_right: 426*c0909341SAndroid Build Coastguard Worker movif32 t1, t4m 427*c0909341SAndroid Build Coastguard Worker movif32 t0, t2m 428*c0909341SAndroid Build Coastguard Worker pshufb m0, m3, m6 429*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m7 430*c0909341SAndroid Build Coastguard Worker paddw m0, m1 431*c0909341SAndroid Build Coastguard Worker pshufb m3, m8 432*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m12 433*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m9 434*c0909341SAndroid Build Coastguard Worker paddw m3, m1 435*c0909341SAndroid Build Coastguard Worker pshufb m1, m4, m6 436*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m13 437*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m7 438*c0909341SAndroid Build Coastguard Worker paddw m1, m2 439*c0909341SAndroid Build Coastguard Worker mova m2, pd_m262128_mem 440*c0909341SAndroid Build Coastguard Worker pshufb m4, m8 441*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m12 442*c0909341SAndroid Build Coastguard Worker pshufb m5, m9 443*c0909341SAndroid Build Coastguard Worker paddw m4, m5 444*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m13 
445*c0909341SAndroid Build Coastguard Worker paddd m0, m2 446*c0909341SAndroid Build Coastguard Worker paddd m1, m2 447*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 448*c0909341SAndroid Build Coastguard Worker mova m2, [t4+wq] 449*c0909341SAndroid Build Coastguard Worker paddw m2, [t2+wq] 450*c0909341SAndroid Build Coastguard Worker mova m5, [t3+wq] 451*c0909341SAndroid Build Coastguard Worker%else 452*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq] 453*c0909341SAndroid Build Coastguard Worker paddw m2, [t0+wq] 454*c0909341SAndroid Build Coastguard Worker mov t1, t3m 455*c0909341SAndroid Build Coastguard Worker mov t0, t5m 456*c0909341SAndroid Build Coastguard Worker mova m5, [t1+wq] 457*c0909341SAndroid Build Coastguard Worker mov t1, t1m 458*c0909341SAndroid Build Coastguard Worker%endif 459*c0909341SAndroid Build Coastguard Worker paddd m0, m3 460*c0909341SAndroid Build Coastguard Worker paddd m1, m4 461*c0909341SAndroid Build Coastguard Worker psrad m0, 4 462*c0909341SAndroid Build Coastguard Worker psrad m1, 4 463*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 464*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 465*c0909341SAndroid Build Coastguard Worker mova m4, [t5+wq] 466*c0909341SAndroid Build Coastguard Worker paddw m4, [t1+wq] 467*c0909341SAndroid Build Coastguard Worker psraw m0, 1 468*c0909341SAndroid Build Coastguard Worker paddw m3, m0, [t6+wq] 469*c0909341SAndroid Build Coastguard Worker%else 470*c0909341SAndroid Build Coastguard Worker mova m4, [t0+wq] 471*c0909341SAndroid Build Coastguard Worker paddw m4, [t1+wq] 472*c0909341SAndroid Build Coastguard Worker mov t0, t0m 473*c0909341SAndroid Build Coastguard Worker mov t1, t6m 474*c0909341SAndroid Build Coastguard Worker psraw m0, 1 475*c0909341SAndroid Build Coastguard Worker paddw m3, m0, [t1+wq] 476*c0909341SAndroid Build Coastguard Worker%endif 477*c0909341SAndroid Build Coastguard Worker mova [t0+wq], m0 478*c0909341SAndroid Build Coastguard Worker 
punpcklwd m0, m2, m5 479*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m15 480*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m5 481*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m15 482*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m4 483*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m14 484*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 485*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14 486*c0909341SAndroid Build Coastguard Worker paddd m0, m10 487*c0909341SAndroid Build Coastguard Worker paddd m2, m10 488*c0909341SAndroid Build Coastguard Worker paddd m0, m1 489*c0909341SAndroid Build Coastguard Worker paddd m2, m3 490*c0909341SAndroid Build Coastguard Worker psrad m0, 6 491*c0909341SAndroid Build Coastguard Worker psrad m2, 6 492*c0909341SAndroid Build Coastguard Worker packssdw m0, m2 493*c0909341SAndroid Build Coastguard Worker pmulhw m0, m11 494*c0909341SAndroid Build Coastguard Worker pxor m1, m1 495*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m1 496*c0909341SAndroid Build Coastguard Worker mova [dstq+wq], m0 497*c0909341SAndroid Build Coastguard Worker add wq, 16 498*c0909341SAndroid Build Coastguard Worker jl .hv_loop 499*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 500*c0909341SAndroid Build Coastguard Worker mov t6, t5 501*c0909341SAndroid Build Coastguard Worker mov t5, t4 502*c0909341SAndroid Build Coastguard Worker mov t4, t3 503*c0909341SAndroid Build Coastguard Worker mov t3, t2 504*c0909341SAndroid Build Coastguard Worker mov t2, t1 505*c0909341SAndroid Build Coastguard Worker mov t1, t0 506*c0909341SAndroid Build Coastguard Worker mov t0, t6 507*c0909341SAndroid Build Coastguard Worker%else 508*c0909341SAndroid Build Coastguard Worker mov r4, t5m 509*c0909341SAndroid Build Coastguard Worker mov t1, t4m 510*c0909341SAndroid Build Coastguard Worker mov t6m, r4 511*c0909341SAndroid Build Coastguard Worker mov t5m, t1 512*c0909341SAndroid Build Coastguard Worker mov r4, 
t3m 513*c0909341SAndroid Build Coastguard Worker mov t1, t2m 514*c0909341SAndroid Build Coastguard Worker mov t4m, r4 515*c0909341SAndroid Build Coastguard Worker mov t3m, t1 516*c0909341SAndroid Build Coastguard Worker mov r4, t1m 517*c0909341SAndroid Build Coastguard Worker mov t1, t0 518*c0909341SAndroid Build Coastguard Worker mov t2m, r4 519*c0909341SAndroid Build Coastguard Worker mov t0, t6m 520*c0909341SAndroid Build Coastguard Worker mov wq, wm 521*c0909341SAndroid Build Coastguard Worker%endif 522*c0909341SAndroid Build Coastguard Worker add dstq, strideq 523*c0909341SAndroid Build Coastguard Worker ret 524*c0909341SAndroid Build Coastguard Worker.v: 525*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 526*c0909341SAndroid Build Coastguard Worker movif32 t0m, t0 527*c0909341SAndroid Build Coastguard Worker movif32 t1m, t1 528*c0909341SAndroid Build Coastguard Worker.v_loop: 529*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 530*c0909341SAndroid Build Coastguard Worker mova m1, [t4+wq] 531*c0909341SAndroid Build Coastguard Worker paddw m1, [t2+wq] 532*c0909341SAndroid Build Coastguard Worker mova m2, [t3+wq] 533*c0909341SAndroid Build Coastguard Worker mova m4, [t1+wq] 534*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [t6+wq] 535*c0909341SAndroid Build Coastguard Worker paddw m4, [t5+wq] 536*c0909341SAndroid Build Coastguard Worker%else 537*c0909341SAndroid Build Coastguard Worker mov t0, t4m 538*c0909341SAndroid Build Coastguard Worker mov t1, t2m 539*c0909341SAndroid Build Coastguard Worker mova m1, [t0+wq] 540*c0909341SAndroid Build Coastguard Worker paddw m1, [t1+wq] 541*c0909341SAndroid Build Coastguard Worker mov t0, t3m 542*c0909341SAndroid Build Coastguard Worker mov t1, t1m 543*c0909341SAndroid Build Coastguard Worker mova m2, [t0+wq] 544*c0909341SAndroid Build Coastguard Worker mova m4, [t1+wq] 545*c0909341SAndroid Build Coastguard Worker mov t0, t6m 546*c0909341SAndroid Build Coastguard Worker mov t1, t5m 
547*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [t0+wq] 548*c0909341SAndroid Build Coastguard Worker paddw m4, [t1+wq] 549*c0909341SAndroid Build Coastguard Worker%endif 550*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 551*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m15 552*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 553*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m15 554*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m4 555*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m14 556*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 557*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14 558*c0909341SAndroid Build Coastguard Worker paddd m0, m10 559*c0909341SAndroid Build Coastguard Worker paddd m1, m10 560*c0909341SAndroid Build Coastguard Worker paddd m0, m2 561*c0909341SAndroid Build Coastguard Worker paddd m1, m3 562*c0909341SAndroid Build Coastguard Worker psrad m0, 6 563*c0909341SAndroid Build Coastguard Worker psrad m1, 6 564*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 565*c0909341SAndroid Build Coastguard Worker pmulhw m0, m11 566*c0909341SAndroid Build Coastguard Worker pxor m1, m1 567*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m1 568*c0909341SAndroid Build Coastguard Worker mova [dstq+wq], m0 569*c0909341SAndroid Build Coastguard Worker add wq, 16 570*c0909341SAndroid Build Coastguard Worker jl .v_loop 571*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 572*c0909341SAndroid Build Coastguard Worker mov t6, t5 573*c0909341SAndroid Build Coastguard Worker mov t5, t4 574*c0909341SAndroid Build Coastguard Worker mov t4, t3 575*c0909341SAndroid Build Coastguard Worker mov t3, t2 576*c0909341SAndroid Build Coastguard Worker mov t2, t1 577*c0909341SAndroid Build Coastguard Worker%else 578*c0909341SAndroid Build Coastguard Worker mov t0, t5m 579*c0909341SAndroid Build Coastguard Worker mov t1, t4m 580*c0909341SAndroid Build Coastguard Worker mov r4, 
t3m ; operand of the `mov r4,` that ends the preceding line
    mov            t6m, t0
    mov            t5m, t1
    mov            t4m, r4
    mov             r4, t2m
    mov             t1, t1m
    mov             t0, t0m
    mov            t3m, r4
    mov            t2m, t1
%endif
    add           dstq, strideq
    ret

;------------------------------------------------------------------------------
; wiener_filter5_16bpc
;
; Wiener filter with 5 taps per direction (hence the name) on pixels stored as
; 16-bit words (the width is doubled to obtain a byte count).
;
; Arguments, as named by cglobal on x86-64:
;   dst    - rows to filter; source rows are read from here and the result is
;            stored back to the same rows
;   stride - byte distance between rows (added to dstq/lpfq)
;   left   - left-edge pixels, 8 bytes per row; the 4 bytes at offset 4 are
;            placed in front of the row when LR_HAVE_LEFT is set
;   lpf    - two rows above the block (used when LR_HAVE_TOP is set); the rows
;            below are read starting at lpf + 6*stride (LR_HAVE_BOTTOM)
;   w, h   - width in pixels and number of rows
;   edge   - bit mask: 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT, 4 = LR_HAVE_TOP,
;            8 = LR_HAVE_BOTTOM
;   flt    - coefficients: horizontal (fx) at [flt], vertical (fy) at [flt+16]
;   r8m    - pixel_max (argument slot 8, not named in the cglobal list)
;
; Constants kept live for the row kernels (xmm registers on x86-64, stack
; slots aliased by the m8-m15 defines on x86-32):
;   m5/m6/m7 - wiener_shufE / wiener_shufB / wiener_shufD pixel shuffles
;   m8       - rounding bias of the horizontal pass
;   m9       - wiener_round entry, bias of the vertical pass
;   m10      - second dword of the wiener_shifts entry (final pmulhw factor)
;   m11, m12 - horizontal taps x1 and x2/x3, pre-multiplied by the first dword
;              of the wiener_shifts entry
;   m13, m14 - vertical taps y1 and y2/y3
;   m15      - wiener_lshuf5, used when there is no left edge
;
; Horizontally filtered rows live in four stack buffers of 384*2 bytes each.
; t1..t4 point at the newest..oldest buffered row and t0 at the buffer the
; next row is written to. t0 shares its buffer with t4, whose contents are
; read before they are overwritten. Row pointers, dst and lpf are biased by
; the row size in bytes so that a negative counter (wq) counting up to zero
; indexes them.
;------------------------------------------------------------------------------
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign stack_size 12*16+384*8
 %else
  %assign stack_size 11*16+384*8
 %endif
cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
                                                    lpf, w, flt
 %if STACK_ALIGNMENT < 16
  %define lpfm      dword [esp+calloff+4*6]
  %define wm        dword [esp+calloff+4*7]
  %define hd        dword [esp+calloff+16*10+0]
  %define edgeb      byte [esp+calloff+16*10+4]
  %define edged     dword [esp+calloff+16*10+4]
 %else
  %define hd dword r5m
  %define edgeb byte r7m
 %endif
 %define PICmem dword [esp+calloff+4*0]
 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
 %define t1m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define t2 t2m
 %define t3 t3m
 %define t4 t4m
 %define  m8 [esp+calloff+16*2]
 %define  m9 [esp+calloff+16*3]
 %define m10 [esp+calloff+16*4]
 %define m11 [esp+calloff+16*5]
 %define m12 [esp+calloff+16*6]
 %define m13 [esp+calloff+16*7]
 %define m14 [esp+calloff+16*8]
 %define m15 [esp+calloff+16*9]
 %define base t0-wiener_shifts
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    ; Copy w, h and edge from the caller's argument area (addressed through
    ; rstk) into the local slots defined above.
    mov             wd, [rstk+stack_offset+20]
    mov             wm, wd
    mov             r5, [rstk+stack_offset+24]
    mov             hd, r5
    mov             r5, [rstk+stack_offset+32]
    mov          edged, r5 ; edge
 %endif
%else
cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
 %define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov           fltq, r6mp
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    movq           m12, [fltq]
    movq           m14, [fltq+16]
%else
 %if STACK_ALIGNMENT < 16
    mov             t0, [rstk+stack_offset+28]
    mov             t1, [rstk+stack_offset+36] ; pixel_max
    movq            m1, [t0]    ; fx
    movq            m3, [t0+16] ; fy
    LEA             t0, wiener_shifts
 %else
    mov           fltq, r6m
    movq            m1, [fltq]
    movq            m3, [fltq+16]
    LEA             t0, wiener_shifts
    mov             t1, r8m ; pixel_max
 %endif
    ; Keep the table base so .extend_right can reload it after t0 was reused.
    mov         PICmem, t0
%endif
    mova            m5, [base+wiener_shufE]
    mova            m6, [base+wiener_shufB]
    mova            m7, [base+wiener_shufD]
%if ARCH_X86_64
    lea             t4, [wiener_shifts]
    add             wd, wd           ; width in bytes
    punpcklwd      m11, m12, m12
    pshufd         m11, m11, q1111 ; x1
    pshufd         m12, m12, q1111 ; x2 x3
    punpcklwd      m13, m14, m14
    pshufd         m13, m13, q1111 ; y1
    pshufd         m14, m14, q1111 ; y2 y3
    shr            t3d, 11           ; pixel_max >> 11 selects the table entry
    mova            m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    lea             t1, [rsp+wq+16]  ; first row buffer, biased by the row size
    add           dstq, wq
    neg             wq               ; r4 keeps the negative byte width from here on
 %define base t4-wiener_shifts
    movd            m9, [base+wiener_round+t3*4]
    movq           m10, [base+wiener_shifts+t3*8]
    pshufd          m9, m9, q0000
    pshufd          m0, m10, q0000
    pshufd         m10, m10, q1111
    mova           m15, [wiener_lshuf5]
    pmullw         m11, m0
    pmullw         m12, m0
    ; w is renamed to the 11th register; the row kernels reload it from r4.
    DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
 %define base
%else
    add             wd, wd           ; width in bytes
    punpcklwd       m0, m1, m1
    pshufd          m0, m0, q1111 ; x1
    pshufd          m1, m1, q1111 ; x2 x3
    punpcklwd       m2, m3, m3
    pshufd          m2, m2, q1111 ; y1
    pshufd          m3, m3, q1111 ; y2 y3
    mova            m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
    mova           m13, m2
    mova           m14, m3
    mova            m8, m4
    shr             t1, 11           ; pixel_max >> 11 selects the table entry
    add           lpfq, wq
    movd            m2, [base+wiener_round+t1*4]
    movq            m3, [base+wiener_shifts+t1*8]
 %if STACK_ALIGNMENT < 16
    lea             t1, [esp+16*11+wq+16]
 %else
    lea             t1, [esp+16*10+wq+16]
 %endif
    add           dstq, wq
    neg             wq
    pshufd          m2, m2, q0000
    pshufd          m4, m3, q0000
    pshufd          m3, m3, q1111
    mov             wm, wq           ; negative byte width, reloaded by the kernels
    pmullw          m0, m4
    pmullw          m1, m4
    mova            m4, [base+wiener_lshuf5]
    mova            m9, m2
    mova           m10, m3
    mova           m11, m0
    mova           m12, m1
    mova           m15, m4
%endif
    ; ---- row scheduler ----------------------------------------------------
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; Two rows above the block come from lpf and fill the two oldest slots.
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq         ; remaining source rows are read from dst
    mov             t3, t1
    add             t1, 384*2
    add            r10, strideq
    mov           lpfm, r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4           ; new rows overwrite the oldest buffer
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    ; Two rows below the block, at the address saved in lpfm.
    mov           lpfq, lpfm
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
    ; No rows above: the first row stands in for the two missing older rows.
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6        ; move on to the fourth row buffer
    call .hv
    dec             hd
    jnz .main
.v2:
    ; Two output rows are still pending and no further input rows exist.
    call .v
%if ARCH_X86_64
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
%else
    mov             t0, t3m
    mov             r4, t2m
    mov             t1, t1m
    mov            t4m, t0
    mov            t3m, r4
    mov            t2m, t1
    mov             wq, wm           ; r4 was used as scratch above
%endif
    add           dstq, strideq
.v1:
    call .v
    jmp .end
; .extend_right is reached from the row kernels when LR_HAVE_RIGHT is clear
; and wd >= -18. It rebuilds m3/m4 through byte shuffles whose indices are
; derived from the remaining width and clamped against pb_0to15; presumably
; this repeats the last valid pixel into the lanes past the right edge (the
; pb_* tables are defined elsewhere in the file). Clobbers m0-m2, and t0 on
; x86-32.
; It runs two calls deep (function -> row kernel -> .extend_right), so the
; esp-relative defines are told to skip 8 bytes of return addresses.
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movif32         t0, PICmem
    pxor            m1, m1
    movd            m2, wd
    mova            m0, [base+pb_2_3]
    pshufb          m2, m1           ; broadcast the low byte of wd
    mova            m1, [base+pb_m6_m5]
    psubb           m0, m2
    psubb           m1, m2
    mova            m2, [base+pb_0to15]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
; The row kernels that follow run one call deep.
%assign stack_offset stack_offset-4
%assign calloff 4
;------------------------------------------------------------------------------
; Row kernels of wiener_filter5_16bpc. All are entered with `call` and leave
; with a plain `ret`.
;
;   .h / .h_top  horizontal pass of the row at lpfq into the buffer at t1
;   .hv          advances lpfq by one row, runs the horizontal pass into the
;                buffer at t0, then the vertical pass over t0..t4 into dst;
;                rotates the row pointers and advances dstq
;   .hv_bottom   as .hv, but lpfq is not advanced and the left array is unused
;   .v           vertical pass only, with t1 standing in for the missing
;                newest row; pointer rotation is left to the caller
;
; With LR_HAVE_LEFT, .h and .hv take the two pixels left of the row from the
; left array, while .h_top and .hv_bottom read them from memory just before
; the row. Without LR_HAVE_LEFT all of them repeat the first pixel of the row
; through the m15 shuffle.
;
; Each loop iteration produces 8 pixels: m3 holds pixels -2..5 and m4 pixels
; 2..9 relative to the current position. The m5 shuffle pairs the two outer
; taps (multiplied by x1 in m11); the m6/m7 shuffles build (sum of the inner
; neighbours, centre) pairs (multiplied by x2/x3 in m12).
;------------------------------------------------------------------------------
.h:
    movif64         wq, r4
    movif32         wq, wm
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    mova            m4, [lpfq+wq]
    movd            m3, [leftq+4]
    pslldq          m4, 4
    por             m3, m4           ; two left pixels followed by pixels 0..5
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    mova            m3, [lpfq+wq] ; avoid accessing memory located
    pshufb          m3, m15       ; before the start of the buffer
    jmp .h_main
.h_top:
    movif64         wq, r4
    movif32         wq, wm
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+wq-4]
.h_main:
    movu            m4, [lpfq+wq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova       [t1+wq], m0
    add             wq, 16
    jl .h_loop
    movif32         wq, wm
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    movif64         wq, r4
    movif32        t0m, t0           ; x86-32: t0/t1 are reused as scratch below
    movif32        t1m, t1
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    mova            m4, [lpfq+wq]
    movd            m3, [leftq+4]
    pslldq          m4, 4
    por             m3, m4
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m3, [lpfq+wq]
    pshufb          m3, m15
    jmp .hv_main
.hv_bottom:
    movif64         wq, r4
    movif32        t0m, t0
    movif32        t1m, t1
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+wq-4]
.hv_main:
    movu            m4, [lpfq+wq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             wd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    movif32         t1, t1m
    movif32         t0, t3m
    ; Horizontal pass, same arithmetic as .h.
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    ; Vertical pass, interleaved with the tail of the horizontal one:
    ; (t3 + t1, t2) pairs take y2/y3, (new row, t4) pairs take y1.
%if ARCH_X86_64
    mova            m2, [t3+wq]
    paddw           m2, [t1+wq]
    paddd           m1, m3
    mova            m4, [t2+wq]
%else
    mova            m2, [t0+wq]
    mov             t0, t2m
    paddw           m2, [t1+wq]
    mov             t1, t4m
    paddd           m1, m3
    mova            m4, [t0+wq]
    mov             t0, t0m
%endif
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
    ; The oldest row is fetched before the new row is stored: t0 and t4 may
    ; refer to the same buffer.
%if ARCH_X86_64
    mova            m4, [t4+wq]
%else
    mova            m4, [t1+wq]
%endif
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova       [t0+wq], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 6
    psrad           m0, 6
    packssdw        m0, m1
    pmulhw          m0, m10
    pxor            m1, m1
    pmaxsw          m0, m1           ; clamp negative results to zero
    mova     [dstq+wq], m0
    add             wq, 16
    jl .hv_loop
    ; Rotate the row pointers: every row ages by one slot and the buffer of
    ; the row that became the oldest is the next write target.
%if ARCH_X86_64
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
%else
    mov             r4, t3m
    mov             t1, t2m
    mov            t4m, r4
    mov            t3m, t1
    mov             r4, t1m
    mov             t1, t0
    mov            t2m, r4
    mov             t0, t4m
    mov             wq, wm           ; r4 was used as scratch above
%endif
    add           dstq, strideq
    ret
.v:
    movif64         wq, r4
    movif32        t1m, t1
.v_loop:
    ; (t1 + t3, t2) pairs take y2/y3, (t1, t4) pairs take y1.
%if ARCH_X86_64
    mova            m0, [t1+wq]
    paddw           m2, m0, [t3+wq]
    mova            m1, [t2+wq]
    mova            m4, [t4+wq]
%else
    mov             t0, t3m
    mova            m0, [t1+wq]
    mov             t1, t2m
    paddw           m2, m0, [t0+wq]
    mov             t0, t4m
    mova            m1, [t1+wq]
    mova            m4, [t0+wq]
%endif
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 6
    psrad           m0, 6
    packssdw        m0, m1
    pmulhw          m0, m10
    pxor            m1, m1
    pmaxsw          m0, m1           ; clamp negative results to zero
    mova     [dstq+wq], m0
    add             wq, 16
%if ARCH_X86_64
    jl .v_loop
%else
    jge .v_end
    mov             t1, t1m          ; t1 was reused as scratch in the loop body
    jmp .v_loop
.v_end:
%endif
    ret

; GATHERDD dst, src, tmp
; Table lookup for the four dword lanes of src. Lane 0 receives the full dword
; found at table+index. For lanes 1-3 the index is taken from the low word of
; the lane and only the upper word of the dword at table+index is inserted,
; so their low words are left zero. tmp is a general-purpose scratch register.
; The table address is r13 on x86-64 and base+sgr_x_by_x-0xf03 on x86-32.
%macro GATHERDD 3 ; dst, src, tmp
    movd           %3d, %2
 %if ARCH_X86_64
    movd            %1, [r13+%3]
    pextrw         %3d, %2, 2
    pinsrw          %1, [r13+%3+2], 3
    pextrw         %3d, %2, 4
    pinsrw          %1, [r13+%3+2], 5
    pextrw         %3d, %2, 6
    pinsrw          %1, [r13+%3+2], 7
 %else
    movd            %1, [base+sgr_x_by_x-0xf03+%3]
    pextrw          %3, %2, 2
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 3
    pextrw          %3, %2, 4
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 5
    pextrw          %3, %2, 6
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 7
 %endif
%endmacro

; GATHER_X_BY_X dst, src0, src1, tmp32, tmp32_restore
; Gathers through GATHERDD for the indices in src0 and src1, keeps the top
; byte of every gathered dword and packs the eight results into words of dst
; (src0 results first). src0 is overwritten. The scratch register is r14 on
; x86-64; on x86-32 it is tmp32, which is reloaded from tmp32_restore
; afterwards.
%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
 %if ARCH_X86_64
  %define tmp r14
 %else
  %define tmp %4
 %endif
    GATHERDD        %1, %2, tmp
    GATHERDD        %2, %3, tmp
    movif32         %4, %5
    psrld           %1, 24
    psrld           %2, 24
    packssdw        %1, %2
%endmacro

; MAXSD dst, src, tmp [, clear_tmp]
; Signed per-dword maximum built from a compare and mask operations:
; dst = max(dst, src). tmp is clobbered; when the optional fourth argument is
; 1 it is zeroed again afterwards.
%macro MAXSD 3-4 0 ; dst, src, restore_tmp
    pcmpgtd         %3, %1, %2
    pand            %1, %3
    pandn           %3, %2
    por             %1, %3
 %if %4 == 1
    pxor            %3, %3
 %endif
%endmacro

Coastguard Worker%macro MULLD 3 ; dst, src, tmp 1097*c0909341SAndroid Build Coastguard Worker pmulhuw %3, %1, %2 1098*c0909341SAndroid Build Coastguard Worker pmullw %1, %2 1099*c0909341SAndroid Build Coastguard Worker pslld %3, 16 1100*c0909341SAndroid Build Coastguard Worker paddd %1, %3 1101*c0909341SAndroid Build Coastguard Worker%endmacro 1102*c0909341SAndroid Build Coastguard Worker 1103*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1104*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 0, 1, 2, 3, 5 1105*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1106*c0909341SAndroid Build Coastguard Worker %assign extra_stack 5*16 1107*c0909341SAndroid Build Coastguard Worker %else 1108*c0909341SAndroid Build Coastguard Worker %assign extra_stack 3*16 1109*c0909341SAndroid Build Coastguard Worker %endif 1110*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ 1111*c0909341SAndroid Build Coastguard Worker dst, stride, left, lpf, w 1112*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1113*c0909341SAndroid Build Coastguard Worker %define dstm dword [esp+calloff+16*0+4*6] 1114*c0909341SAndroid Build Coastguard Worker %define stridemp dword [esp+calloff+16*0+4*7] 1115*c0909341SAndroid Build Coastguard Worker %define leftm dword [esp+calloff+16*3+4*0] 1116*c0909341SAndroid Build Coastguard Worker %define lpfm dword [esp+calloff+16*3+4*1] 1117*c0909341SAndroid Build Coastguard Worker %define w0m dword [esp+calloff+16*3+4*2] 1118*c0909341SAndroid Build Coastguard Worker %define hd dword [esp+calloff+16*3+4*3] 1119*c0909341SAndroid Build Coastguard Worker %define edgeb byte [esp+calloff+16*3+4*4] 1120*c0909341SAndroid Build Coastguard Worker %define edged dword [esp+calloff+16*3+4*4] 1121*c0909341SAndroid Build Coastguard Worker %define leftmp leftm 1122*c0909341SAndroid Build Coastguard Worker %else 1123*c0909341SAndroid Build Coastguard Worker %define w0m wm 
1124*c0909341SAndroid Build Coastguard Worker %define hd dword r5m 1125*c0909341SAndroid Build Coastguard Worker %define edgeb byte r7m 1126*c0909341SAndroid Build Coastguard Worker %define edged dword r7m 1127*c0909341SAndroid Build Coastguard Worker %endif 1128*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0] 1129*c0909341SAndroid Build Coastguard Worker %define w1m dword [esp+calloff+4*1] 1130*c0909341SAndroid Build Coastguard Worker %define t0m dword [esp+calloff+4*2] 1131*c0909341SAndroid Build Coastguard Worker %define t2m dword [esp+calloff+4*3] 1132*c0909341SAndroid Build Coastguard Worker %define t3m dword [esp+calloff+4*4] 1133*c0909341SAndroid Build Coastguard Worker %define t4m dword [esp+calloff+4*5] 1134*c0909341SAndroid Build Coastguard Worker %define m8 [base+pd_8] 1135*c0909341SAndroid Build Coastguard Worker %define m9 [base+pd_0xfffffff0] 1136*c0909341SAndroid Build Coastguard Worker %define m10 [esp+calloff+16*2] 1137*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0xf00800a4] 1138*c0909341SAndroid Build Coastguard Worker %define m12 [base+sgr_lshuf5] 1139*c0909341SAndroid Build Coastguard Worker %define m13 [base+pd_34816] 1140*c0909341SAndroid Build Coastguard Worker %define m14 [base+pw_1023] 1141*c0909341SAndroid Build Coastguard Worker %define r10 r4 1142*c0909341SAndroid Build Coastguard Worker %define base r6-$$ 1143*c0909341SAndroid Build Coastguard Worker %assign calloff 0 1144*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1145*c0909341SAndroid Build Coastguard Worker mov strideq, [rstk+stack_offset+ 8] 1146*c0909341SAndroid Build Coastguard Worker mov leftq, [rstk+stack_offset+12] 1147*c0909341SAndroid Build Coastguard Worker mov lpfq, [rstk+stack_offset+16] 1148*c0909341SAndroid Build Coastguard Worker mov wd, [rstk+stack_offset+20] 1149*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1150*c0909341SAndroid Build Coastguard Worker mov stridemp, strideq 
1151*c0909341SAndroid Build Coastguard Worker mov leftm, leftq 1152*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+24] 1153*c0909341SAndroid Build Coastguard Worker mov r2, [rstk+stack_offset+32] 1154*c0909341SAndroid Build Coastguard Worker mov lpfm, lpfq 1155*c0909341SAndroid Build Coastguard Worker mov hd, r1 1156*c0909341SAndroid Build Coastguard Worker mov edged, r2 1157*c0909341SAndroid Build Coastguard Worker %endif 1158*c0909341SAndroid Build Coastguard Worker%else 1159*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \ 1160*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1161*c0909341SAndroid Build Coastguard Worker%endif 1162*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1163*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 1164*c0909341SAndroid Build Coastguard Worker%endif 1165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1166*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1167*c0909341SAndroid Build Coastguard Worker lea r13, [sgr_x_by_x-0xf03] 1168*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1169*c0909341SAndroid Build Coastguard Worker add wd, wd 1170*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1171*c0909341SAndroid Build Coastguard Worker movu m10, [paramsq] 1172*c0909341SAndroid Build Coastguard Worker mova m12, [sgr_lshuf5] 1173*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1174*c0909341SAndroid Build Coastguard Worker mova m8, [pd_8] 1175*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq+20] 1176*c0909341SAndroid Build Coastguard Worker mova m9, [pd_0xfffffff0] 1177*c0909341SAndroid Build Coastguard Worker add dstq, wq 1178*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*2+400*12+16] 1179*c0909341SAndroid Build Coastguard Worker mova m11, [pd_0xf00800a4] 1180*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+wq+400*20+16] 
1181*c0909341SAndroid Build Coastguard Worker pshufhw m7, m10, q0000 1182*c0909341SAndroid Build Coastguard Worker pshufb m10, [pw_256] ; s0 1183*c0909341SAndroid Build Coastguard Worker punpckhqdq m7, m7 ; w0 1184*c0909341SAndroid Build Coastguard Worker neg wq 1185*c0909341SAndroid Build Coastguard Worker mova m13, [pd_34816] ; (1 << 11) + (1 << 15) 1186*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1187*c0909341SAndroid Build Coastguard Worker mova m14, [pw_1023] 1188*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1189*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1190*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp] 1191*c0909341SAndroid Build Coastguard Worker%else 1192*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+28] ; params 1193*c0909341SAndroid Build Coastguard Worker LEA r6, $$ 1194*c0909341SAndroid Build Coastguard Worker add wd, wd 1195*c0909341SAndroid Build Coastguard Worker movu m1, [r1] 1196*c0909341SAndroid Build Coastguard Worker add lpfm, wq 1197*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+extra_stack+wq+20] 1198*c0909341SAndroid Build Coastguard Worker add dstq, wq 1199*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+extra_stack+wq*2+400*12+16] 1200*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1201*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+extra_stack+wq+400*20+16] 1202*c0909341SAndroid Build Coastguard Worker mov t3m, t3 1203*c0909341SAndroid Build Coastguard Worker pshufhw m7, m1, q0000 1204*c0909341SAndroid Build Coastguard Worker mov t4m, t4 1205*c0909341SAndroid Build Coastguard Worker pshufb m1, [base+pw_256] ; s0 1206*c0909341SAndroid Build Coastguard Worker punpckhqdq m7, m7 ; w0 1207*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1208*c0909341SAndroid Build Coastguard Worker neg wq 1209*c0909341SAndroid Build Coastguard Worker mova m10, m1 1210*c0909341SAndroid Build Coastguard Worker pxor m6, 
m6 1211*c0909341SAndroid Build Coastguard Worker mov w1m, wd 1212*c0909341SAndroid Build Coastguard Worker sub wd, 4 1213*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1214*c0909341SAndroid Build Coastguard Worker mov w0m, wd 1215*c0909341SAndroid Build Coastguard Worker %define strideq r5 1216*c0909341SAndroid Build Coastguard Worker%endif 1217*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1218*c0909341SAndroid Build Coastguard Worker jz .no_top 1219*c0909341SAndroid Build Coastguard Worker call .h_top 1220*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1221*c0909341SAndroid Build Coastguard Worker movif32 t2m, t1 1222*c0909341SAndroid Build Coastguard Worker mov t2, t1 1223*c0909341SAndroid Build Coastguard Worker call .top_fixup 1224*c0909341SAndroid Build Coastguard Worker add t1, 400*6 1225*c0909341SAndroid Build Coastguard Worker call .h_top 1226*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1227*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1228*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1229*c0909341SAndroid Build Coastguard Worker add r10, strideq 1230*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 ; below 1231*c0909341SAndroid Build Coastguard Worker movif32 t0m, t2 1232*c0909341SAndroid Build Coastguard Worker mov t0, t2 1233*c0909341SAndroid Build Coastguard Worker dec hd 1234*c0909341SAndroid Build Coastguard Worker jz .height1 1235*c0909341SAndroid Build Coastguard Worker or edged, 16 1236*c0909341SAndroid Build Coastguard Worker call .h 1237*c0909341SAndroid Build Coastguard Worker.main: 1238*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1239*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1240*c0909341SAndroid Build Coastguard Worker call .hv 1241*c0909341SAndroid Build Coastguard Worker call .prep_n 1242*c0909341SAndroid Build Coastguard Worker sub hd, 2 1243*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 
1244*c0909341SAndroid Build Coastguard Worker.main_loop: 1245*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1246*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1247*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1248*c0909341SAndroid Build Coastguard Worker test hb, hb 1249*c0909341SAndroid Build Coastguard Worker%else 1250*c0909341SAndroid Build Coastguard Worker mov r4, hd 1251*c0909341SAndroid Build Coastguard Worker test r4, r4 1252*c0909341SAndroid Build Coastguard Worker%endif 1253*c0909341SAndroid Build Coastguard Worker jz .odd_height 1254*c0909341SAndroid Build Coastguard Worker call .h 1255*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1256*c0909341SAndroid Build Coastguard Worker call .hv 1257*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1258*c0909341SAndroid Build Coastguard Worker call .n0 1259*c0909341SAndroid Build Coastguard Worker call .n1 1260*c0909341SAndroid Build Coastguard Worker sub hd, 2 1261*c0909341SAndroid Build Coastguard Worker movif32 t0, t0m 1262*c0909341SAndroid Build Coastguard Worker jge .main_loop 1263*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 1264*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1265*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1266*c0909341SAndroid Build Coastguard Worker call .h_top 1267*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1268*c0909341SAndroid Build Coastguard Worker call .hv_bottom 1269*c0909341SAndroid Build Coastguard Worker.end: 1270*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1271*c0909341SAndroid Build Coastguard Worker call .n0 1272*c0909341SAndroid Build Coastguard Worker call .n1 1273*c0909341SAndroid Build Coastguard Worker.end2: 1274*c0909341SAndroid Build Coastguard Worker RET 1275*c0909341SAndroid Build Coastguard Worker.height1: 1276*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1277*c0909341SAndroid Build Coastguard Worker call 
.hv 1278*c0909341SAndroid Build Coastguard Worker call .prep_n 1279*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1280*c0909341SAndroid Build Coastguard Worker.odd_height: 1281*c0909341SAndroid Build Coastguard Worker call .hv 1282*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1283*c0909341SAndroid Build Coastguard Worker call .n0 1284*c0909341SAndroid Build Coastguard Worker call .n1 1285*c0909341SAndroid Build Coastguard Worker.odd_height_end: 1286*c0909341SAndroid Build Coastguard Worker call .v 1287*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1288*c0909341SAndroid Build Coastguard Worker call .n0 1289*c0909341SAndroid Build Coastguard Worker jmp .end2 1290*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1291*c0909341SAndroid Build Coastguard Worker call .v 1292*c0909341SAndroid Build Coastguard Worker jmp .end 1293*c0909341SAndroid Build Coastguard Worker.no_top: 1294*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1295*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1296*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1297*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 1298*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 1299*c0909341SAndroid Build Coastguard Worker call .h 1300*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*6] 1301*c0909341SAndroid Build Coastguard Worker movif32 t2m, t2 1302*c0909341SAndroid Build Coastguard Worker call .top_fixup 1303*c0909341SAndroid Build Coastguard Worker dec hd 1304*c0909341SAndroid Build Coastguard Worker jz .no_top_height1 1305*c0909341SAndroid Build Coastguard Worker or edged, 16 1306*c0909341SAndroid Build Coastguard Worker mov t0, t1 1307*c0909341SAndroid Build Coastguard Worker mov t1, t2 1308*c0909341SAndroid Build Coastguard Worker movif32 t0m, t0 1309*c0909341SAndroid Build Coastguard Worker jmp .main 1310*c0909341SAndroid Build Coastguard Worker.no_top_height1: 
1311*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 1312*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1313*c0909341SAndroid Build Coastguard Worker call .v 1314*c0909341SAndroid Build Coastguard Worker call .prep_n 1315*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1316*c0909341SAndroid Build Coastguard Worker.extend_right: 1317*c0909341SAndroid Build Coastguard Worker movd m0, wd 1318*c0909341SAndroid Build Coastguard Worker movd m1, [lpfq-2] 1319*c0909341SAndroid Build Coastguard Worker mova m2, [base+pw_256] 1320*c0909341SAndroid Build Coastguard Worker mova m3, [base+pb_m14_m13] 1321*c0909341SAndroid Build Coastguard Worker pshufb m0, m6 1322*c0909341SAndroid Build Coastguard Worker pshufb m1, m2 1323*c0909341SAndroid Build Coastguard Worker psubb m2, m0 1324*c0909341SAndroid Build Coastguard Worker psubb m3, m0 1325*c0909341SAndroid Build Coastguard Worker mova m0, [base+pb_0to15] 1326*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m0 1327*c0909341SAndroid Build Coastguard Worker pcmpgtb m3, m0 1328*c0909341SAndroid Build Coastguard Worker pand m4, m2 1329*c0909341SAndroid Build Coastguard Worker pand m5, m3 1330*c0909341SAndroid Build Coastguard Worker pandn m2, m1 1331*c0909341SAndroid Build Coastguard Worker pandn m3, m1 1332*c0909341SAndroid Build Coastguard Worker por m4, m2 1333*c0909341SAndroid Build Coastguard Worker por m5, m3 1334*c0909341SAndroid Build Coastguard Worker ret 1335*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+4 1336*c0909341SAndroid Build Coastguard Worker%assign calloff 4 1337*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 1338*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1339*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 1340*c0909341SAndroid Build Coastguard Worker%else 1341*c0909341SAndroid Build Coastguard Worker %define leftq r4 1342*c0909341SAndroid Build Coastguard Worker%endif 1343*c0909341SAndroid Build Coastguard 
Worker test edgeb, 1 ; LR_HAVE_LEFT 1344*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1345*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 1346*c0909341SAndroid Build Coastguard Worker movddup m5, [leftq] 1347*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1348*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 1349*c0909341SAndroid Build Coastguard Worker add leftmp, 8 1350*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 10 1351*c0909341SAndroid Build Coastguard Worker jmp .h_main 1352*c0909341SAndroid Build Coastguard Worker.h_extend_left: 1353*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1354*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 1355*c0909341SAndroid Build Coastguard Worker pshufb m4, m12 1356*c0909341SAndroid Build Coastguard Worker jmp .h_main 1357*c0909341SAndroid Build Coastguard Worker.h_top: 1358*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1359*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 1360*c0909341SAndroid Build Coastguard Worker%endif 1361*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1362*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1363*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1364*c0909341SAndroid Build Coastguard Worker.h_loop: 1365*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+wq- 2] 1366*c0909341SAndroid Build Coastguard Worker.h_main: 1367*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+14] 1368*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1369*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 1370*c0909341SAndroid Build Coastguard Worker cmp wd, -20 1371*c0909341SAndroid Build Coastguard Worker jl .h_have_right 1372*c0909341SAndroid Build Coastguard Worker call .extend_right 1373*c0909341SAndroid Build Coastguard Worker.h_have_right: 1374*c0909341SAndroid Build Coastguard Worker palignr m2, m5, m4, 2 
1375*c0909341SAndroid Build Coastguard Worker paddw m0, m4, m2 1376*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 6 1377*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1378*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 1379*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1380*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 1381*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1382*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 1383*c0909341SAndroid Build Coastguard Worker paddw m0, m5 1384*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 1385*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1386*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1387*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m5 1388*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1389*c0909341SAndroid Build Coastguard Worker shufps m4, m5, q2121 1390*c0909341SAndroid Build Coastguard Worker paddw m0, m4 ; sum 1391*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m4, m6 1392*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1393*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m6 1394*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1395*c0909341SAndroid Build Coastguard Worker paddd m2, m3 1396*c0909341SAndroid Build Coastguard Worker test edgeb, 16 ; y > 0 1397*c0909341SAndroid Build Coastguard Worker jz .h_loop_end 1398*c0909341SAndroid Build Coastguard Worker paddw m0, [t1+wq+400*0] 1399*c0909341SAndroid Build Coastguard Worker paddd m1, [t1+wq+400*2] 1400*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+wq+400*4] 1401*c0909341SAndroid Build Coastguard Worker.h_loop_end: 1402*c0909341SAndroid Build Coastguard Worker paddd m1, m5 ; sumsq 1403*c0909341SAndroid Build Coastguard Worker paddd m2, m4 1404*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*0], m0 1405*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*2], m1 
1406*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*4], m2 1407*c0909341SAndroid Build Coastguard Worker add wq, 16 1408*c0909341SAndroid Build Coastguard Worker jl .h_loop 1409*c0909341SAndroid Build Coastguard Worker ret 1410*c0909341SAndroid Build Coastguard Worker.top_fixup: 1411*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1412*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 1413*c0909341SAndroid Build Coastguard Worker%else 1414*c0909341SAndroid Build Coastguard Worker mov wd, w0m 1415*c0909341SAndroid Build Coastguard Worker%endif 1416*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row needs to be doubled 1417*c0909341SAndroid Build Coastguard Worker mova m0, [t1+wq+400*0] 1418*c0909341SAndroid Build Coastguard Worker mova m1, [t1+wq+400*2] 1419*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq+400*4] 1420*c0909341SAndroid Build Coastguard Worker paddw m0, m0 1421*c0909341SAndroid Build Coastguard Worker paddd m1, m1 1422*c0909341SAndroid Build Coastguard Worker paddd m2, m2 1423*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*0], m0 1424*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*2], m1 1425*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*4], m2 1426*c0909341SAndroid Build Coastguard Worker add wq, 16 1427*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 1428*c0909341SAndroid Build Coastguard Worker ret 1429*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1430*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab 1431*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1432*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 1433*c0909341SAndroid Build Coastguard Worker%else 1434*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 1435*c0909341SAndroid Build Coastguard Worker%endif 1436*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1437*c0909341SAndroid Build 
Coastguard Worker jz .hv_extend_left 1438*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 1439*c0909341SAndroid Build Coastguard Worker movddup m5, [leftq] 1440*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1441*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 1442*c0909341SAndroid Build Coastguard Worker add leftmp, 8 1443*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 10 1444*c0909341SAndroid Build Coastguard Worker jmp .hv_main 1445*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 1446*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1447*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 1448*c0909341SAndroid Build Coastguard Worker pshufb m4, m12 1449*c0909341SAndroid Build Coastguard Worker jmp .hv_main 1450*c0909341SAndroid Build Coastguard Worker.hv_bottom: 1451*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1452*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 1453*c0909341SAndroid Build Coastguard Worker%else 1454*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 1455*c0909341SAndroid Build Coastguard Worker%endif 1456*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1457*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 1458*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1459*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1460*c0909341SAndroid Build Coastguard Worker jmp .hv_loop_start 1461*c0909341SAndroid Build Coastguard Worker%endif 1462*c0909341SAndroid Build Coastguard Worker.hv_loop: 1463*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1464*c0909341SAndroid Build Coastguard Worker.hv_loop_start: 1465*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+wq- 2] 1466*c0909341SAndroid Build Coastguard Worker.hv_main: 1467*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+14] 1468*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1469*c0909341SAndroid Build 
Coastguard Worker jnz .hv_have_right 1470*c0909341SAndroid Build Coastguard Worker cmp wd, -20 1471*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 1472*c0909341SAndroid Build Coastguard Worker call .extend_right 1473*c0909341SAndroid Build Coastguard Worker.hv_have_right: 1474*c0909341SAndroid Build Coastguard Worker movif32 t3, hd 1475*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 2 1476*c0909341SAndroid Build Coastguard Worker paddw m0, m4, m3 1477*c0909341SAndroid Build Coastguard Worker palignr m1, m5, m4, 6 1478*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1479*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m1 1480*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1481*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m1 1482*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1483*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 1484*c0909341SAndroid Build Coastguard Worker paddw m0, m5 1485*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4, m5 1486*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1487*c0909341SAndroid Build Coastguard Worker paddd m2, m1 1488*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m4, m5 1489*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1490*c0909341SAndroid Build Coastguard Worker shufps m4, m5, q2121 1491*c0909341SAndroid Build Coastguard Worker paddw m0, m4 ; h sum 1492*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m4, m6 1493*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1494*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m6 1495*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1496*c0909341SAndroid Build Coastguard Worker paddd m3, m1 1497*c0909341SAndroid Build Coastguard Worker paddd m2, m5 ; h sumsq 1498*c0909341SAndroid Build Coastguard Worker paddd m3, m4 1499*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t1+wq+400*0] 1500*c0909341SAndroid Build Coastguard Worker paddd m4, 
m2, [t1+wq+400*2] 1501*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+wq+400*4] 1502*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1503*c0909341SAndroid Build Coastguard Worker test hd, hd 1504*c0909341SAndroid Build Coastguard Worker%else 1505*c0909341SAndroid Build Coastguard Worker test t3, t3 1506*c0909341SAndroid Build Coastguard Worker%endif 1507*c0909341SAndroid Build Coastguard Worker jz .hv_last_row 1508*c0909341SAndroid Build Coastguard Worker.hv_main2: 1509*c0909341SAndroid Build Coastguard Worker paddw m1, [t2+wq+400*0] ; hv sum 1510*c0909341SAndroid Build Coastguard Worker paddd m4, [t2+wq+400*2] ; hv sumsq 1511*c0909341SAndroid Build Coastguard Worker paddd m5, [t2+wq+400*4] 1512*c0909341SAndroid Build Coastguard Worker mova [t0+wq+400*0], m0 1513*c0909341SAndroid Build Coastguard Worker mova [t0+wq+400*2], m2 1514*c0909341SAndroid Build Coastguard Worker mova [t0+wq+400*4], m3 1515*c0909341SAndroid Build Coastguard Worker psrlw m3, m1, 1 1516*c0909341SAndroid Build Coastguard Worker paddd m4, m8 1517*c0909341SAndroid Build Coastguard Worker pavgw m3, m6 ; (b + 2) >> 2 1518*c0909341SAndroid Build Coastguard Worker paddd m5, m8 1519*c0909341SAndroid Build Coastguard Worker pand m4, m9 ; ((a + 8) >> 4) << 4 1520*c0909341SAndroid Build Coastguard Worker pand m5, m9 1521*c0909341SAndroid Build Coastguard Worker psrld m2, m4, 4 1522*c0909341SAndroid Build Coastguard Worker psrld m0, m5, 4 1523*c0909341SAndroid Build Coastguard Worker paddd m2, m4 1524*c0909341SAndroid Build Coastguard Worker psrld m4, 1 1525*c0909341SAndroid Build Coastguard Worker paddd m0, m5 1526*c0909341SAndroid Build Coastguard Worker psrld m5, 1 1527*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a * 25 1528*c0909341SAndroid Build Coastguard Worker paddd m5, m0 1529*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 1530*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1531*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 ; b 
* b 1532*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1533*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 1534*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1535*c0909341SAndroid Build Coastguard Worker MAXSD m4, m2, m6 1536*c0909341SAndroid Build Coastguard Worker MAXSD m5, m3, m6, 1 1537*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1538*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1539*c0909341SAndroid Build Coastguard Worker MULLD m4, m10, m2 ; p * s 1540*c0909341SAndroid Build Coastguard Worker MULLD m5, m10, m2 1541*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 ; b * 164 1542*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m11 1543*c0909341SAndroid Build Coastguard Worker paddusw m4, m11 1544*c0909341SAndroid Build Coastguard Worker paddusw m5, m11 1545*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; min(z, 255) 1546*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 1547*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1548*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m3, m4, m5, t2, t2m 1549*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3, m3 1550*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m3 1551*c0909341SAndroid Build Coastguard Worker MULLD m0, m4, m2 1552*c0909341SAndroid Build Coastguard Worker MULLD m1, m5, m2 1553*c0909341SAndroid Build Coastguard Worker paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 1554*c0909341SAndroid Build Coastguard Worker paddd m1, m13 1555*c0909341SAndroid Build Coastguard Worker mova [t4+wq+4], m3 1556*c0909341SAndroid Build Coastguard Worker psrld m0, 12 ; b 1557*c0909341SAndroid Build Coastguard Worker psrld m1, 12 1558*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+ 8], m0 1559*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+24], m1 1560*c0909341SAndroid Build Coastguard Worker add wq, 16 1561*c0909341SAndroid Build Coastguard Worker jl .hv_loop 1562*c0909341SAndroid 
; ---------------------------------------------------------------------------
; Tail of the preceding hv routine (its loop body lies above this span).
; Pointer shuffle: t0 and t1 trade places, and t2 ends up pointing at the
; same buffer as the new t0 (i.e. the old t1).
; NOTE(review): movif32/movif64 are macros defined elsewhere; presumably they
; emit the mov only on 32-bit / 64-bit builds respectively - confirm.
; ---------------------------------------------------------------------------
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    movif32        t2m, t2
    movif32        t0m, t0
    ret
; Stores the current sums (m1, m4, m5) to the t1 row, then folds m0/m2/m3
; into them before rejoining the main path at .hv_main2 (outside this span).
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+wq+400*0], m1
    paddw            m1, m0
    mova [t1+wq+400*2], m4
    paddd            m4, m2
    mova [t1+wq+400*4], m5
    paddd            m5, m3
    jmp .hv_main2
; ---------------------------------------------------------------------------
; .v - vertical-only box sum followed by the a/b computation.
; Per 16-byte step of wq (the loop runs while wq is negative):
;   reads  word sums at [t1/t2+wq+400*0], dword sums of squares at
;          [t1/t2+wq+400*2] and [t1/t2+wq+400*4]
;   writes packed words to [t4+wq+4] and dwords to [t3+wq*2+8], [t3+wq*2+24]
; The two rows are combined as [t2] + 3*[t1].
; NOTE(review): m6 is used as an all-zero register and m8..m13 as constants;
; their setup is not visible in this span - confirm against the prologue.
; ---------------------------------------------------------------------------
.v: ; vertical boxsum + ab
%if ARCH_X86_64
    lea              wq, [r4-4]
%else
    mov              wd, w0m
%endif
.v_loop:
    mova             m0, [t1+wq+400*0]
    mova             m2, [t1+wq+400*2]
    mova             m3, [t1+wq+400*4]
    paddw            m1, m0, [t2+wq+400*0]
    paddd            m4, m2, [t2+wq+400*2]
    paddd            m5, m3, [t2+wq+400*4]
    ; double the t1 row so that adding it below yields [t2] + 3*[t1]
    paddw            m0, m0
    paddd            m2, m2
    paddd            m3, m3
    paddw            m1, m0             ; hv sum
    paddd            m4, m2             ; hv sumsq
    paddd            m5, m3
    ; psrlw by 1 followed by pavgw with m6 gives the rounded quarter of the sum
    psrlw            m3, m1, 1
    paddd            m4, m8
    pavgw            m3, m6             ; (b + 2) >> 2
    paddd            m5, m8
    pand             m4, m9             ; ((a + 8) >> 4) << 4
    pand             m5, m9
    ; with v = 16*A: v + (v >> 1) + (v >> 4) = 16A + 8A + A = 25*A
    psrld            m2, m4, 4
    psrld            m0, m5, 4
    paddd            m2, m4
    psrld            m4, 1
    paddd            m0, m5
    psrld            m5, 1
    paddd            m4, m2             ; a * 25
    paddd            m5, m0
    ; interleave with m6 to widen words to dwords before squaring
    punpcklwd        m2, m3, m6
    punpckhwd        m3, m6
    pmaddwd          m2, m2             ; b * b
    pmaddwd          m3, m3
    punpcklwd        m0, m1, m6         ; b
    punpckhwd        m1, m6
    ; NOTE(review): MAXSD / MULLD are macros defined elsewhere; from the
    ; surrounding comments they look like a signed-dword max and a dword
    ; multiply with the last operand as scratch - confirm.
    MAXSD            m4, m2, m6
    MAXSD            m5, m3, m6, 1
    psubd            m4, m2             ; p
    psubd            m5, m3
    MULLD            m4, m10, m2        ; p * s
    MULLD            m5, m10, m2
    pmaddwd          m0, m11            ; b * 164
    pmaddwd          m1, m11
    paddusw          m4, m11
    paddusw          m5, m11
    psrld            m4, 20             ; min(z, 255)
    psrld            m5, 20
    ; NOTE(review): GATHER_X_BY_X is defined elsewhere; presumably a table
    ; lookup indexed by m4/m5 with the result packed into m3 - confirm.
    GATHER_X_BY_X    m3, m4, m5, t2, t2m
    ; duplicate each looked-up word into both halves of a dword lane
    punpcklwd        m4, m3, m3
    punpckhwd        m5, m3, m3
    MULLD            m0, m4, m2
    MULLD            m1, m5, m2
    paddd            m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd            m1, m13
    mova     [t4+wq+4], m3
    psrld            m0, 12             ; b
    psrld            m1, 12
    mova  [t3+wq*2+ 8], m0
    mova  [t3+wq*2+24], m1
    add              wq, 16
    jl .v_loop
    ret
; ---------------------------------------------------------------------------
; .prep_n - primes the neighbour buffers for the output passes.
; For three horizontally adjacent entries e0, e1, e2 (words from t4 at byte
; offsets 0/2/4, dwords from t3 at byte offsets 0/4/8) it computes
; 5*e0 + 6*e1 + 5*e2 and stores the result at [t4+wq+400*2] and
; [t3+wq*2+400*4] (+16 for the upper four dwords).
; ---------------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    movif64          wq, r4
    movif32          wd, w1m
.prep_n_loop:
    movu             m0, [t4+wq*1+ 2]
    movu             m3, [t4+wq*1+ 4]
    movu             m1, [t3+wq*2+ 4]
    movu             m4, [t3+wq*2+ 8]
    movu             m2, [t3+wq*2+20]
    movu             m5, [t3+wq*2+24]
    paddw            m3, m0
    paddd            m4, m1
    paddd            m5, m2
    paddw            m3, [t4+wq*1+ 0]   ; m3/m4/m5 = e0 + e1 + e2
    paddd            m4, [t3+wq*2+ 0]
    paddd            m5, [t3+wq*2+16]
    ; (e1 + sum) + 4*sum = 5*e0 + 6*e1 + 5*e2
    paddw            m0, m3
    psllw            m3, 2
    paddd            m1, m4
    pslld            m4, 2
    paddd            m2, m5
    pslld            m5, 2
    paddw            m0, m3             ; a 565
    paddd            m1, m4             ; b 565
    paddd            m2, m5
    mova [t4+wq*1+400*2+ 0], m0
    mova [t3+wq*2+400*4+ 0], m1
    mova [t3+wq*2+400*4+16], m2
    add              wq, 16
    jl .prep_n_loop
    ret
ALIGN function_align
; ---------------------------------------------------------------------------
; .n0 - recomputes the 5/6/5 weighted a and b for the current data, adds the
; values previously saved at [t4+wq+400*2] / [t3+wq*2+400*4], overwrites the
; saved copy with the new ones, then filters the row at dstq in place:
;   dst += pmulhrsw(pack((b - a*src) >> 9), m7), limited to [m6, m14]
; Finally advances dstq by stridemp.
; NOTE(review): m7 is presumably the filter weight and m14 the pixel maximum;
; neither is set in this span - confirm.
; ---------------------------------------------------------------------------
.n0: ; neighbor + output (even rows)
    movif64          wq, r4
    movif32          wd, w1m
.n0_loop:
    movu             m0, [t4+wq*1+ 2]
    movu             m3, [t4+wq*1+ 4]
    movu             m1, [t3+wq*2+ 4]
    movu             m4, [t3+wq*2+ 8]
    movu             m2, [t3+wq*2+20]
    movu             m5, [t3+wq*2+24]
    paddw            m3, m0
    paddd            m4, m1
    paddd            m5, m2
    paddw            m3, [t4+wq*1+ 0]
    paddd            m4, [t3+wq*2+ 0]
    paddd            m5, [t3+wq*2+16]
    paddw            m0, m3
    psllw            m3, 2
    paddd            m1, m4
    pslld            m4, 2
    paddd            m2, m5
    pslld            m5, 2
    paddw            m0, m3             ; a 565
    paddd            m1, m4             ; b 565
    paddd            m2, m5
    ; combine with the saved values first, then replace the saved copy
    paddw            m3, m0, [t4+wq*1+400*2+ 0]
    paddd            m4, m1, [t3+wq*2+400*4+ 0]
    paddd            m5, m2, [t3+wq*2+400*4+16]
    mova [t4+wq*1+400*2+ 0], m0
    mova [t3+wq*2+400*4+ 0], m1
    mova [t3+wq*2+400*4+16], m2
    mova             m0, [dstq+wq]
    punpcklwd        m1, m0, m6         ; src
    punpcklwd        m2, m3, m6         ; a
    pmaddwd          m2, m1             ; a * src
    punpckhwd        m1, m0, m6
    punpckhwd        m3, m6
    pmaddwd          m3, m1
    psubd            m4, m2             ; b - a * src + (1 << 8)
    psubd            m5, m3
    psrad            m4, 9
    psrad            m5, 9
    packssdw         m4, m5
    pmulhrsw         m4, m7
    paddw            m0, m4
    pmaxsw           m0, m6             ; lower clamp
    pminsw           m0, m14            ; upper clamp
    mova      [dstq+wq], m0
    add              wq, 16
    jl .n0_loop
    add            dstq, stridemp
    ret
ALIGN function_align
; ---------------------------------------------------------------------------
; .n1 - output pass that reuses the a/b values saved at [t4+wq+400*2] and
; [t3+wq*2+400*4] without recomputing them; same filtering as .n0 but with a
; shift of 8 instead of 9. Advances dstq by stridemp and, via movif32,
; writes the updated pointer back to dstm.
; ---------------------------------------------------------------------------
.n1: ; neighbor + output (odd rows)
    movif64          wq, r4
    movif32          wd, w1m
.n1_loop:
    mova             m0, [dstq+wq]
    mova             m3, [t4+wq*1+400*2+ 0]
    mova             m4, [t3+wq*2+400*4+ 0]
    mova             m5, [t3+wq*2+400*4+16]
    punpcklwd        m1, m0, m6         ; src
    punpcklwd        m2, m3, m6         ; a
    pmaddwd          m2, m1
    punpckhwd        m1, m0, m6
    punpckhwd        m3, m6
    pmaddwd          m3, m1
    psubd            m4, m2             ; b - a * src + (1 << 7)
    psubd            m5, m3
    psrad            m4, 8
    psrad            m5, 8
    packssdw         m4, m5
    pmulhrsw         m4, m7
    paddw            m0, m4
    pmaxsw           m0, m6             ; lower clamp
    pminsw           m0, m14            ; upper clamp
    mova      [dstq+wq], m0
    add              wq, 16
    jl .n1_loop
    add            dstq, stridemp
    movif32        dstm, dstq
    ret

; ---------------------------------------------------------------------------
; Start of sgr_filter_3x3_16bpc (x86-32 declaration); the function continues
; past this span. extra_stack is 4*16 bytes when STACK_ALIGNMENT < 16 and
; 2*16 otherwise. In the low-alignment case the names below are mapped onto
; consecutive dword slots at esp+calloff+16*2; edgeb and edged are byte and
; dword views of the same slot.
; ---------------------------------------------------------------------------
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 4*16
 %else
  %assign extra_stack 2*16
 %endif
cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
                              dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm     dword [esp+calloff+16*2+4*0]
  %define stridemp dword [esp+calloff+16*2+4*1]
  %define leftm    dword [esp+calloff+16*2+4*2]
  %define lpfm     dword [esp+calloff+16*2+4*3]
  %define w0m      dword [esp+calloff+16*2+4*4]
  %define hd       dword [esp+calloff+16*2+4*5]
  %define edgeb    byte  [esp+calloff+16*2+4*6]
  %define edged    dword [esp+calloff+16*2+4*6]
  %define leftmp leftm
Coastguard Worker %else 1776*c0909341SAndroid Build Coastguard Worker %define w0m wm 1777*c0909341SAndroid Build Coastguard Worker %define hd dword r5m 1778*c0909341SAndroid Build Coastguard Worker %define edgeb byte r7m 1779*c0909341SAndroid Build Coastguard Worker %define edged dword r7m 1780*c0909341SAndroid Build Coastguard Worker %endif 1781*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0] 1782*c0909341SAndroid Build Coastguard Worker %define w1m dword [esp+calloff+4*1] 1783*c0909341SAndroid Build Coastguard Worker %define t3m dword [esp+calloff+4*2] 1784*c0909341SAndroid Build Coastguard Worker %define t4m dword [esp+calloff+4*3] 1785*c0909341SAndroid Build Coastguard Worker %define m8 [base+pd_8] 1786*c0909341SAndroid Build Coastguard Worker %define m9 [esp+calloff+16*1] 1787*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0xf00801c7] 1788*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_34816] 1789*c0909341SAndroid Build Coastguard Worker %define m12 [base+sgr_lshuf3] 1790*c0909341SAndroid Build Coastguard Worker %define m13 [base+pw_1023] 1791*c0909341SAndroid Build Coastguard Worker %define m14 m6 1792*c0909341SAndroid Build Coastguard Worker %define base r6-$$ 1793*c0909341SAndroid Build Coastguard Worker %assign calloff 0 1794*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1795*c0909341SAndroid Build Coastguard Worker mov strideq, [rstk+stack_offset+ 8] 1796*c0909341SAndroid Build Coastguard Worker mov leftq, [rstk+stack_offset+12] 1797*c0909341SAndroid Build Coastguard Worker mov lpfq, [rstk+stack_offset+16] 1798*c0909341SAndroid Build Coastguard Worker mov wd, [rstk+stack_offset+20] 1799*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1800*c0909341SAndroid Build Coastguard Worker mov stridemp, strideq 1801*c0909341SAndroid Build Coastguard Worker mov leftm, leftq 1802*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+24] 1803*c0909341SAndroid Build 
Coastguard Worker mov r2, [rstk+stack_offset+32] 1804*c0909341SAndroid Build Coastguard Worker mov lpfm, lpfq 1805*c0909341SAndroid Build Coastguard Worker mov hd, r1 1806*c0909341SAndroid Build Coastguard Worker mov edged, r2 1807*c0909341SAndroid Build Coastguard Worker %endif 1808*c0909341SAndroid Build Coastguard Worker%else 1809*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \ 1810*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1811*c0909341SAndroid Build Coastguard Worker%endif 1812*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1813*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 1814*c0909341SAndroid Build Coastguard Worker%endif 1815*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1816*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1817*c0909341SAndroid Build Coastguard Worker lea r13, [sgr_x_by_x-0xf03] 1818*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1819*c0909341SAndroid Build Coastguard Worker add wd, wd 1820*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1821*c0909341SAndroid Build Coastguard Worker movq m9, [paramsq+4] 1822*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1823*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq+12] 1824*c0909341SAndroid Build Coastguard Worker mova m8, [pd_8] 1825*c0909341SAndroid Build Coastguard Worker add dstq, wq 1826*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*2+400*12+8] 1827*c0909341SAndroid Build Coastguard Worker mova m10, [pd_0xf00801c7] 1828*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+wq+400*32+8] 1829*c0909341SAndroid Build Coastguard Worker mova m11, [pd_34816] 1830*c0909341SAndroid Build Coastguard Worker pshuflw m7, m9, q3333 1831*c0909341SAndroid Build Coastguard Worker pshufb m9, [pw_256] ; s1 1832*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m7 ; w1 1833*c0909341SAndroid Build Coastguard 
Worker neg wq 1834*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1835*c0909341SAndroid Build Coastguard Worker mova m13, [pw_1023] 1836*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1837*c0909341SAndroid Build Coastguard Worker mova m12, [sgr_lshuf3] 1838*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1839*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp] 1840*c0909341SAndroid Build Coastguard Worker%else 1841*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+28] ; params 1842*c0909341SAndroid Build Coastguard Worker LEA r6, $$ 1843*c0909341SAndroid Build Coastguard Worker add wd, wd 1844*c0909341SAndroid Build Coastguard Worker movq m1, [r1+4] 1845*c0909341SAndroid Build Coastguard Worker add lpfm, wq 1846*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+extra_stack+wq+20] 1847*c0909341SAndroid Build Coastguard Worker add dstq, wq 1848*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+extra_stack+wq*2+400*12+16] 1849*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1850*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+extra_stack+wq+400*32+16] 1851*c0909341SAndroid Build Coastguard Worker mov t3m, t3 1852*c0909341SAndroid Build Coastguard Worker pshuflw m7, m1, q3333 1853*c0909341SAndroid Build Coastguard Worker mov t4m, t4 1854*c0909341SAndroid Build Coastguard Worker pshufb m1, [base+pw_256] ; s1 1855*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m7 ; w1 1856*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1857*c0909341SAndroid Build Coastguard Worker neg wq 1858*c0909341SAndroid Build Coastguard Worker mova m9, m1 1859*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1860*c0909341SAndroid Build Coastguard Worker mov w1m, wd 1861*c0909341SAndroid Build Coastguard Worker sub wd, 4 1862*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1863*c0909341SAndroid Build Coastguard Worker mov w0m, wd 1864*c0909341SAndroid Build 
Coastguard Worker %define strideq r5 1865*c0909341SAndroid Build Coastguard Worker%endif 1866*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1867*c0909341SAndroid Build Coastguard Worker jz .no_top 1868*c0909341SAndroid Build Coastguard Worker call .h_top 1869*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1870*c0909341SAndroid Build Coastguard Worker mov t2, t1 1871*c0909341SAndroid Build Coastguard Worker add t1, 400*6 1872*c0909341SAndroid Build Coastguard Worker call .h_top 1873*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1874*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1875*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1876*c0909341SAndroid Build Coastguard Worker add r10, strideq 1877*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 ; below 1878*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1879*c0909341SAndroid Build Coastguard Worker call .hv0 1880*c0909341SAndroid Build Coastguard Worker.main: 1881*c0909341SAndroid Build Coastguard Worker dec hd 1882*c0909341SAndroid Build Coastguard Worker jz .height1 1883*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1884*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1885*c0909341SAndroid Build Coastguard Worker call .hv1 1886*c0909341SAndroid Build Coastguard Worker call .prep_n 1887*c0909341SAndroid Build Coastguard Worker sub hd, 2 1888*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 1889*c0909341SAndroid Build Coastguard Worker.main_loop: 1890*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1891*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1892*c0909341SAndroid Build Coastguard Worker call .hv0 1893*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1894*c0909341SAndroid Build Coastguard Worker test hb, hb 1895*c0909341SAndroid Build Coastguard Worker%else 1896*c0909341SAndroid Build Coastguard Worker mov r4, hd 1897*c0909341SAndroid 
Build Coastguard Worker test r4, r4 1898*c0909341SAndroid Build Coastguard Worker%endif 1899*c0909341SAndroid Build Coastguard Worker jz .odd_height 1900*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1901*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1902*c0909341SAndroid Build Coastguard Worker call .hv1 1903*c0909341SAndroid Build Coastguard Worker call .n0 1904*c0909341SAndroid Build Coastguard Worker call .n1 1905*c0909341SAndroid Build Coastguard Worker sub hd, 2 1906*c0909341SAndroid Build Coastguard Worker jge .main_loop 1907*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 1908*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1909*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1910*c0909341SAndroid Build Coastguard Worker call .hv0_bottom 1911*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1912*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1913*c0909341SAndroid Build Coastguard Worker call .hv1_bottom 1914*c0909341SAndroid Build Coastguard Worker.end: 1915*c0909341SAndroid Build Coastguard Worker call .n0 1916*c0909341SAndroid Build Coastguard Worker call .n1 1917*c0909341SAndroid Build Coastguard Worker.end2: 1918*c0909341SAndroid Build Coastguard Worker RET 1919*c0909341SAndroid Build Coastguard Worker.height1: 1920*c0909341SAndroid Build Coastguard Worker call .v1 1921*c0909341SAndroid Build Coastguard Worker call .prep_n 1922*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1923*c0909341SAndroid Build Coastguard Worker.odd_height: 1924*c0909341SAndroid Build Coastguard Worker call .v1 1925*c0909341SAndroid Build Coastguard Worker call .n0 1926*c0909341SAndroid Build Coastguard Worker call .n1 1927*c0909341SAndroid Build Coastguard Worker.odd_height_end: 1928*c0909341SAndroid Build Coastguard Worker call .v0 1929*c0909341SAndroid Build Coastguard Worker call .v1 1930*c0909341SAndroid Build Coastguard Worker call .n0 1931*c0909341SAndroid Build 
Coastguard Worker jmp .end2 1932*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1933*c0909341SAndroid Build Coastguard Worker call .v0 1934*c0909341SAndroid Build Coastguard Worker call .v1 1935*c0909341SAndroid Build Coastguard Worker jmp .end 1936*c0909341SAndroid Build Coastguard Worker.no_top: 1937*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1938*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1939*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1940*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 1941*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 1942*c0909341SAndroid Build Coastguard Worker call .h 1943*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1944*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 1945*c0909341SAndroid Build Coastguard Worker%else 1946*c0909341SAndroid Build Coastguard Worker mov wq, w0m 1947*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 1948*c0909341SAndroid Build Coastguard Worker%endif 1949*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*6] 1950*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: 1951*c0909341SAndroid Build Coastguard Worker mova m0, [t1+wq+400*0] 1952*c0909341SAndroid Build Coastguard Worker mova m1, [t1+wq+400*2] 1953*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq+400*4] 1954*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*0], m0 1955*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*2], m1 1956*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*4], m2 1957*c0909341SAndroid Build Coastguard Worker add wq, 16 1958*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 1959*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 1960*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1961*c0909341SAndroid Build Coastguard Worker call .v0 1962*c0909341SAndroid Build Coastguard Worker jmp .main 1963*c0909341SAndroid Build Coastguard 
Worker.extend_right: 1964*c0909341SAndroid Build Coastguard Worker movd m1, wd 1965*c0909341SAndroid Build Coastguard Worker movd m5, [lpfq-2] 1966*c0909341SAndroid Build Coastguard Worker mova m2, [base+pw_256] 1967*c0909341SAndroid Build Coastguard Worker mova m3, [base+pb_0to15] 1968*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 1969*c0909341SAndroid Build Coastguard Worker pshufb m5, m2 1970*c0909341SAndroid Build Coastguard Worker psubb m2, m1 1971*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m3 1972*c0909341SAndroid Build Coastguard Worker pand m4, m2 1973*c0909341SAndroid Build Coastguard Worker pandn m2, m5 1974*c0909341SAndroid Build Coastguard Worker por m4, m2 1975*c0909341SAndroid Build Coastguard Worker ret 1976*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+4 1977*c0909341SAndroid Build Coastguard Worker%assign calloff 4 1978*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 1979*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1980*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 1981*c0909341SAndroid Build Coastguard Worker%else 1982*c0909341SAndroid Build Coastguard Worker %define leftq r4 1983*c0909341SAndroid Build Coastguard Worker%endif 1984*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1985*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1986*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 1987*c0909341SAndroid Build Coastguard Worker movddup m5, [leftq] 1988*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1989*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 1990*c0909341SAndroid Build Coastguard Worker add leftmp, 8 1991*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 12 1992*c0909341SAndroid Build Coastguard Worker jmp .h_main 1993*c0909341SAndroid Build Coastguard Worker.h_extend_left: 1994*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1995*c0909341SAndroid Build Coastguard 
Worker mova m4, [lpfq+wq+4] 1996*c0909341SAndroid Build Coastguard Worker pshufb m4, m12 1997*c0909341SAndroid Build Coastguard Worker jmp .h_main 1998*c0909341SAndroid Build Coastguard Worker.h_top: 1999*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2000*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 2001*c0909341SAndroid Build Coastguard Worker%endif 2002*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2003*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 2004*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2005*c0909341SAndroid Build Coastguard Worker.h_loop: 2006*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+wq+ 0] 2007*c0909341SAndroid Build Coastguard Worker.h_main: 2008*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+16] 2009*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 2010*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 2011*c0909341SAndroid Build Coastguard Worker cmp wd, -18 2012*c0909341SAndroid Build Coastguard Worker jl .h_have_right 2013*c0909341SAndroid Build Coastguard Worker call .extend_right 2014*c0909341SAndroid Build Coastguard Worker.h_have_right: 2015*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 2 2016*c0909341SAndroid Build Coastguard Worker paddw m1, m4, m0 2017*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 2018*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2019*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m0 2020*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2021*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 4 2022*c0909341SAndroid Build Coastguard Worker paddw m1, m5 ; sum 2023*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m6 2024*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 2025*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 2026*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 2027*c0909341SAndroid 
Build Coastguard Worker paddd m2, m4 ; sumsq 2028*c0909341SAndroid Build Coastguard Worker paddd m3, m5 2029*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*0], m1 2030*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*2], m2 2031*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*4], m3 2032*c0909341SAndroid Build Coastguard Worker add wq, 16 2033*c0909341SAndroid Build Coastguard Worker jl .h_loop 2034*c0909341SAndroid Build Coastguard Worker ret 2035*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2036*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 2037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2038*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 2039*c0909341SAndroid Build Coastguard Worker%else 2040*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2041*c0909341SAndroid Build Coastguard Worker%endif 2042*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2043*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 2044*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 2045*c0909341SAndroid Build Coastguard Worker movddup m5, [leftq] 2046*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2047*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 2048*c0909341SAndroid Build Coastguard Worker add leftmp, 8 2049*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 12 2050*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2051*c0909341SAndroid Build Coastguard Worker.hv0_extend_left: 2052*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2053*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 2054*c0909341SAndroid Build Coastguard Worker pshufb m4, m12 2055*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2056*c0909341SAndroid Build Coastguard Worker.hv0_bottom: 2057*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2058*c0909341SAndroid Build Coastguard 
; NOTE(review): this span opens inside a routine whose label and opening
; "%if ARCH_X86_64" lie above it (presumably .hv0, judging by the .hv0_*
; labels below). Blame-view residue (line number, revision, owner) has been
; stripped and one statement per line restored; instructions are unchanged.
    lea             wq, [r4-4]
%else
    mov             hvsrcm, lpfq
%endif
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv0_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp             .hv0_loop_start
%endif
.hv0_loop:
    movif32         lpfq, hvsrcm
.hv0_loop_start:
    movu            m4, [lpfq+wq+ 0]
.hv0_main:
    movu            m5, [lpfq+wq+16]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv0_have_right
    cmp             wd, -18
    jl              .hv0_have_right
    call            .extend_right
.hv0_have_right:
    ; m4 = pixels x..x+7, m0 = the same window shifted by one 16-bit pixel,
    ; m5 (after the palignr by 4) = shifted by two pixels.
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5 ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4 ; sumsq
    paddd           m3, m5
    ; Combine with the sums buffered at t1 and t2, refreshing both buffers:
    ; t1 <- this row's sums, t2 <- this row + old t1.
    paddw           m0, m1, [t1+wq+400*0]
    paddd           m4, m2, [t1+wq+400*2]
    paddd           m5, m3, [t1+wq+400*4]
    mova            [t1+wq+400*0], m1
    mova            [t1+wq+400*2], m2
    mova            [t1+wq+400*4], m3
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m2, m4, [t2+wq+400*2]
    paddd           m3, m5, [t2+wq+400*4]
    mova            [t2+wq+400*0], m0
    mova            [t2+wq+400*2], m4
    mova            [t2+wq+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2 ; p
    psubd           m5, m3
    MULLD           m4, m9, m14 ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20 ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*2+ 8], m0
    mova            [t3+wq*2+24], m1
    add             wq, 16
    jl              .hv0_loop
    ret
ALIGN function_align
; .hv1 / .hv1_bottom: for the source row at lpfq, build 3-pixel horizontal
; sums and sums of squares, add the sums buffered at t2 (which then receives
; this row's sums), and derive the a/b values, stored at t4+400*2 (words) and
; t3+400*4 (dwords).
;   .hv1        with LR_HAVE_LEFT: the first vector is spliced from [leftq]
;               and the row, and leftmp is advanced by 8 bytes.
;   .hv1_bottom with LR_HAVE_LEFT: enters the loop directly and loads the
;               first vector from [lpfq+wq].
;   Without LR_HAVE_LEFT both go through .hv1_extend_left (pshufb with m12).
; wq is a negative byte offset stepped by 16 until it reaches zero. On exit
; the t1 and t2 pointers are exchanged through r10.
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             hvsrcm, lpfq
%endif
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv1_extend_left
    movif32         leftq, leftm
    movddup         m5, [leftq]
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    add             leftmp, 8
    palignr         m4, m5, 12
    jmp             .hv1_main
.hv1_extend_left:
    movif32         wq, w0m
    mova            m4, [lpfq+wq+4]
    pshufb          m4, m12
    jmp             .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             hvsrcm, lpfq
%endif
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv1_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp             .hv1_loop_start
%endif
.hv1_loop:
    movif32         lpfq, hvsrcm
.hv1_loop_start:
    movu            m4, [lpfq+wq+ 0]
.hv1_main:
    movu            m5, [lpfq+wq+16]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv1_have_right
    cmp             wd, -18
    jl              .hv1_have_right
    call            .extend_right
.hv1_have_right:
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5 ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1 ; h sumsq
    paddd           m3, m5
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m4, m2, [t2+wq+400*2]
    paddd           m5, m3, [t2+wq+400*4]
    mova            [t2+wq+400*0], m0
    mova            [t2+wq+400*2], m2
    mova            [t2+wq+400*4], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4 ; (a + 8) >> 4
    psrld           m5, 4
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2 ; p
    psubd           m5, m3
    MULLD           m4, m9, m14 ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20 ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    ; NOTE(review): m6 is re-zeroed on x86-32 only; presumably the macros
    ; above use it as scratch there - confirm against their definitions.
    pxor            m6, m6
%endif
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq*1+400*2 +4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*2+400*4+ 8], m0
    mova            [t3+wq*2+400*4+24], m1
    add             wq, 16
    jl              .hv1_loop
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
; .v0: a/b computation without reading a source row. The sums at t1 are
; doubled, added to those at t2 to form the totals, and the doubled values
; are written back to t2. Results go to t4+400*0 (words) and t3+400*0
; (dwords).
.v0: ; vertical boxsums + ab (even rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq+400*0]
    mova            m4, [t1+wq+400*2]
    mova            m5, [t1+wq+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m2, m4, [t2+wq+400*2]
    paddd           m3, m5, [t2+wq+400*4]
    mova            [t2+wq+400*0], m0
    mova            [t2+wq+400*2], m4
    mova            [t2+wq+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2 ; p
    psubd           m5, m3
    MULLD           m4, m9, m14 ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20 ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq*1+400*0+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*2+400*0+ 8], m0
    mova            [t3+wq*2+400*0+24], m1
    add             wq, 16
    jl              .v0_loop
    ret
; .v1: as .v0 but without the doubling: the sums at t1 are added to those at
; t2 and copied into t2. Results go to t4+400*2 (words) and t3+400*4
; (dwords); t1 and t2 are exchanged through r10 on exit.
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v1_loop:
    mova            m0, [t1+wq+400*0]
    mova            m4, [t1+wq+400*2]
    mova            m5, [t1+wq+400*4]
    paddw           m1, m0, [t2+wq+400*0]
    paddd           m2, m4, [t2+wq+400*2]
    paddd           m3, m5, [t2+wq+400*4]
    mova            [t2+wq+400*0], m0
    mova            [t2+wq+400*2], m4
    mova            [t2+wq+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    MAXSD           m4, m2, m14
    MAXSD           m5, m3, m14
    psubd           m4, m2 ; p
    psubd           m5, m3
    MULLD           m4, m9, m14 ; p * s
    MULLD           m5, m9, m14
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20 ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m14
    MULLD           m1, m5, m14
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq*1+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*2+400*4+ 8], m0
    mova            [t3+wq*2+400*4+24], m1
    add             wq, 16
    jl              .v1_loop
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
; .prep_n: seed the weighted neighbour rows used by .n0/.n1. With l, c, r the
; three horizontally adjacent a (t4, words) or b (t3, dwords) values:
;   "444" = 4 * (l + c + r)
;   "343" = 444 - (l + r) = 3*l + 4*c + 3*r
; From the row at t4+400*0 / t3+400*0 only the 343 sums are kept
; (t4+400*4, t3+400*8). From the row at t4+400*2 / t3+400*4 both are kept:
; 444 at t4+400*6 / t3+400*12 and 343 at t4+400*8 / t3+400*16.
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*1+400*0+ 4]
    movu            m1, [t3+wq*2+400*0+ 8]
    movu            m2, [t3+wq*2+400*0+24]
    movu            m3, [t4+wq*1+400*0+ 2]
    movu            m4, [t3+wq*2+400*0+ 4]
    movu            m5, [t3+wq*2+400*0+20]
    paddw           m0, [t4+wq*1+400*0+ 0]
    paddd           m1, [t3+wq*2+400*0+ 0]
    paddd           m2, [t3+wq*2+400*0+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2 ; a[-1] 444
    pslld           m4, 2 ; b[-1] 444
    pslld           m5, 2
    psubw           m3, m0 ; a[-1] 343
    psubd           m4, m1 ; b[-1] 343
    psubd           m5, m2
    mova            [t4+wq*1+400*4], m3
    mova            [t3+wq*2+400*8+ 0], m4
    mova            [t3+wq*2+400*8+16], m5
    movu            m0, [t4+wq*1+400*2+ 4]
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m2, [t3+wq*2+400*4+24]
    movu            m3, [t4+wq*1+400*2+ 2]
    movu            m4, [t3+wq*2+400*4+ 4]
    movu            m5, [t3+wq*2+400*4+20]
    paddw           m0, [t4+wq*1+400*2+ 0]
    paddd           m1, [t3+wq*2+400*4+ 0]
    paddd           m2, [t3+wq*2+400*4+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2 ; a[ 0] 444
    pslld           m4, 2 ; b[ 0] 444
    pslld           m5, 2
    mova            [t4+wq*1+400* 6], m3
    mova            [t3+wq*2+400*12+ 0], m4
    mova            [t3+wq*2+400*12+16], m5
    psubw           m3, m0 ; a[ 0] 343
    psubd           m4, m1 ; b[ 0] 343
    psubd           m5, m2
    mova            [t4+wq*1+400* 8], m3
    mova            [t3+wq*2+400*16+ 0], m4
    mova            [t3+wq*2+400*16+16], m5
    add             wq, 16
    jl              .prep_n_loop
    ret
ALIGN function_align
; .n0: build the 444/343 sums of the row at t4+400*0 / t3+400*0, add the new
; 343 sum to the values saved at t4+400*4 and t4+400*6 (t3+400*8 and
; t3+400*12 for b), then store the new 343 and 444 sums back into those two
; slots. The pixels at [dstq+wq] are updated in place: the (b - a*src) >> 9
; term is scaled by m7 with pmulhrsw, added to src and clamped to [m6, m13].
; dstq is advanced by stridemp on exit.
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m3, [t4+wq*1+400*0+4]
    movu            m1, [t4+wq*1+400*0+2]
    paddw           m3, [t4+wq*1+400*0+0]
    paddw           m1, m3
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+wq*1+400*4]
    paddw           m3, [t4+wq*1+400*6]
    mova            [t4+wq*1+400*4], m2
    mova            [t4+wq*1+400*6], m1
    movu            m4, [t3+wq*2+400*0+8]
    movu            m1, [t3+wq*2+400*0+4]
    paddd           m4, [t3+wq*2+400*0+0]
    paddd           m1, m4
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+wq*2+400* 8+ 0]
    paddd           m4, [t3+wq*2+400*12+ 0]
    mova            [t3+wq*2+400* 8+ 0], m2
    mova            [t3+wq*2+400*12+ 0], m1
    movu            m5, [t3+wq*2+400*0+24]
    movu            m1, [t3+wq*2+400*0+20]
    paddd           m5, [t3+wq*2+400*0+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*2+400* 8+16]
    paddd           m5, [t3+wq*2+400*12+16]
    mova            [t3+wq*2+400* 8+16], m2
    mova            [t3+wq*2+400*12+16], m1
    mova            m0, [dstq+wq]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2 ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova            [dstq+wq], m0
    add             wq, 16
    jl              .n0_loop
    add             dstq, stridemp
    ret
ALIGN function_align
; .n1: counterpart of .n0 for the row at t4+400*2 / t3+400*4. The new 343 sum
; is added to the values saved at t4+400*6 and t4+400*8 (t3+400*12 and
; t3+400*16 for b); the new 444 sum then replaces the first slot and the new
; 343 sum the second. The output stage matches .n0; on x86-32 the advanced
; dstq is also written back to dstm.
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*1+400*2+4]
    movu            m1, [t4+wq*1+400*2+2]
    paddw           m3, [t4+wq*1+400*2+0]
    paddw           m1, m3
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+wq*1+400*6]
    paddw           m3, [t4+wq*1+400*8]
    mova            [t4+wq*1+400*6], m1
    mova            [t4+wq*1+400*8], m2
    movu            m4, [t3+wq*2+400*4+8]
    movu            m1, [t3+wq*2+400*4+4]
    paddd           m4, [t3+wq*2+400*4+0]
    paddd           m1, m4
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+wq*2+400*12+ 0]
    paddd           m4, [t3+wq*2+400*16+ 0]
    mova            [t3+wq*2+400*12+ 0], m1
    mova            [t3+wq*2+400*16+ 0], m2
    movu            m5, [t3+wq*2+400*4+24]
    movu            m1, [t3+wq*2+400*4+20]
    paddd           m5, [t3+wq*2+400*4+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*2+400*12+16]
    paddd           m5, [t3+wq*2+400*16+16]
    mova            [t3+wq*2+400*12+16], m1
    mova            [t3+wq*2+400*16+16], m2
    mova            m0, [dstq+wq]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2 ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova            [dstq+wq], m0
    add             wq, 16
    jl              .n1_loop
    add             dstq, stridemp
    movif32         dstm, dstq
    ret

; sgr_filter_mix_16bpc: entry and setup (the function body continues past
; this span). On x86-32 the arguments and the constants that do not fit in
; the eight xmm registers are given names for stack slots or [base+...]
; memory operands; on x86-64 they live in registers. In both paths w is
; doubled, lpf and dst are advanced by that amount and wq is negated, so the
; row loops can index with a negative offset that counts up to zero.
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 10*16
 %else
  %assign extra_stack 8*16
 %endif
cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
                              dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm      dword [esp+calloff+16*8+4*0]
  %define stridemp  dword [esp+calloff+16*8+4*1]
  %define leftm     dword [esp+calloff+16*8+4*2]
  %define lpfm      dword [esp+calloff+16*8+4*3]
  %define w0m       dword [esp+calloff+16*8+4*4]
  %define hd        dword [esp+calloff+16*8+4*5]
  %define edgeb     byte [esp+calloff+16*8+4*6]
  %define edged     dword [esp+calloff+16*8+4*6]
  %define leftmp    leftm
 %else
  %define w0m       wm
  %define hd        dword r5m
  %define edgeb     byte r7m
  %define edged     dword r7m
 %endif
 %define hvsrcm     dword [esp+calloff+4*0]
 %define w1m        dword [esp+calloff+4*1]
 %define t3m        dword [esp+calloff+4*2]
 %define t4m        dword [esp+calloff+4*3]
 %xdefine m8        m6
 %define m9         [base+pd_8]
 %define m10        [base+pd_34816]
 %define m11        [base+pd_0xf00801c7]
 %define m12        [base+pd_0xf00800a4]
 %define m13        [esp+calloff+16*4]
 %define m14        [esp+calloff+16*5]
 %define m15        [esp+calloff+16*6]
 %define m6         [esp+calloff+16*7]
 %define base       r6-$$
 %assign calloff    0
 %if STACK_ALIGNMENT < 16
    ; Copy the incoming stack arguments into the local slots defined above.
    mov             strideq, [rstk+stack_offset+ 8]
    mov             leftq, [rstk+stack_offset+12]
    mov             lpfq, [rstk+stack_offset+16]
    mov             wd, [rstk+stack_offset+20]
    mov             dstm, dstq
    mov             stridemp, strideq
    mov             leftm, leftq
    mov             r1, [rstk+stack_offset+24]
    mov             r2, [rstk+stack_offset+32]
    mov             lpfm, lpfq
    mov             hd, r1
    mov             edged, r2
 %endif
%else
cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
                                                     w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov             paramsq, r6mp
    lea             r13, [sgr_x_by_x-0xf03]
    movifnidn       hd, hm
    add             wd, wd
    mov             edged, r7m
    mova            m14, [paramsq]
    add             lpfq, wq
    mova            m9, [pd_8]
    lea             t1, [rsp+wq+44]
    mova            m10, [pd_34816]
    add             dstq, wq
    mova            m11, [pd_0xf00801c7]
    lea             t3, [rsp+wq*2+400*24+40]
    mova            m12, [pd_0xf00800a4]
    lea             t4, [rsp+wq+400*52+40]
    neg             wq
    pshufd          m15, m14, q2222 ; w0 w1
    punpcklwd       m14, m14
    pshufd          m13, m14, q0000 ; s0
    pshufd          m14, m14, q2222 ; s1
    pxor            m6, m6
    psllw           m15, 2
    DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov             r1, [rstk+stack_offset+28] ; params
    LEA             r6, $$
    add             wd, wd
    mova            m2, [r1]
    add             lpfm, wq
    lea             t1, [rsp+extra_stack+wq+52]
    add             dstq, wq
    lea             t3, [rsp+extra_stack+wq*2+400*24+48]
    mov             dstm, dstq
    lea             t4, [rsp+extra_stack+wq+400*52+48]
    mov             t3m, t3
    mov             t4m, t4
    neg             wq
    pshuflw         m0, m2, q0000
    pshuflw         m1, m2, q2222
    pshufhw         m2, m2, q1010
    punpcklqdq      m0, m0 ; s0
    punpcklqdq      m1, m1 ; s1
    punpckhqdq      m2, m2 ; w0 w1
    mov             w1m, wd
    pxor            m3, m3
    psllw           m2, 2
    mova            m13, m0
    mova            m14, m1
    sub             wd, 4
    mova            m15, m2
    mova            m6, m3
    mov             lpfq, lpfm
    mov             w0m, wd
 %define strideq r5
%endif
    test            edgeb, 4 ; LR_HAVE_TOP
    jz              .no_top
    call            .h_top
    add             lpfq, stridemp
    mov             t2, t1
%if ARCH_X86_64
    call            mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
Build Coastguard Worker%else 2689*c0909341SAndroid Build Coastguard Worker mov wq, w0m 2690*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop 2691*c0909341SAndroid Build Coastguard Worker%endif 2692*c0909341SAndroid Build Coastguard Worker add t1, 400*12 2693*c0909341SAndroid Build Coastguard Worker call .h_top 2694*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 2695*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 2696*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 2697*c0909341SAndroid Build Coastguard Worker add r10, strideq 2698*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 ; below 2699*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 2700*c0909341SAndroid Build Coastguard Worker call .hv0 2701*c0909341SAndroid Build Coastguard Worker.main: 2702*c0909341SAndroid Build Coastguard Worker dec hd 2703*c0909341SAndroid Build Coastguard Worker jz .height1 2704*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2705*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2706*c0909341SAndroid Build Coastguard Worker call .hv1 2707*c0909341SAndroid Build Coastguard Worker call .prep_n 2708*c0909341SAndroid Build Coastguard Worker sub hd, 2 2709*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 2710*c0909341SAndroid Build Coastguard Worker.main_loop: 2711*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2712*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2713*c0909341SAndroid Build Coastguard Worker call .hv0 2714*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2715*c0909341SAndroid Build Coastguard Worker test hd, hd 2716*c0909341SAndroid Build Coastguard Worker%else 2717*c0909341SAndroid Build Coastguard Worker mov r4, hd 2718*c0909341SAndroid Build Coastguard Worker test r4, r4 2719*c0909341SAndroid Build Coastguard Worker%endif 2720*c0909341SAndroid Build Coastguard Worker jz 
.odd_height 2721*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2722*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2723*c0909341SAndroid Build Coastguard Worker call .hv1 2724*c0909341SAndroid Build Coastguard Worker call .n0 2725*c0909341SAndroid Build Coastguard Worker call .n1 2726*c0909341SAndroid Build Coastguard Worker sub hd, 2 2727*c0909341SAndroid Build Coastguard Worker jge .main_loop 2728*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 2729*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 2730*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 2731*c0909341SAndroid Build Coastguard Worker call .hv0_bottom 2732*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2733*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2734*c0909341SAndroid Build Coastguard Worker call .hv1_bottom 2735*c0909341SAndroid Build Coastguard Worker.end: 2736*c0909341SAndroid Build Coastguard Worker call .n0 2737*c0909341SAndroid Build Coastguard Worker call .n1 2738*c0909341SAndroid Build Coastguard Worker.end2: 2739*c0909341SAndroid Build Coastguard Worker RET 2740*c0909341SAndroid Build Coastguard Worker.height1: 2741*c0909341SAndroid Build Coastguard Worker call .v1 2742*c0909341SAndroid Build Coastguard Worker call .prep_n 2743*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 2744*c0909341SAndroid Build Coastguard Worker.odd_height: 2745*c0909341SAndroid Build Coastguard Worker call .v1 2746*c0909341SAndroid Build Coastguard Worker call .n0 2747*c0909341SAndroid Build Coastguard Worker call .n1 2748*c0909341SAndroid Build Coastguard Worker.odd_height_end: 2749*c0909341SAndroid Build Coastguard Worker call .v0 2750*c0909341SAndroid Build Coastguard Worker call .v1 2751*c0909341SAndroid Build Coastguard Worker call .n0 2752*c0909341SAndroid Build Coastguard Worker jmp .end2 2753*c0909341SAndroid Build Coastguard Worker.extend_bottom: 2754*c0909341SAndroid Build Coastguard Worker 
call .v0 2755*c0909341SAndroid Build Coastguard Worker call .v1 2756*c0909341SAndroid Build Coastguard Worker jmp .end 2757*c0909341SAndroid Build Coastguard Worker.no_top: 2758*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 2759*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 2760*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 2761*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 2762*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 2763*c0909341SAndroid Build Coastguard Worker call .h 2764*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2765*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 2766*c0909341SAndroid Build Coastguard Worker%else 2767*c0909341SAndroid Build Coastguard Worker mov wq, w0m 2768*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2769*c0909341SAndroid Build Coastguard Worker%endif 2770*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*12] 2771*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: 2772*c0909341SAndroid Build Coastguard Worker mova m0, [t1+wq+400* 0] 2773*c0909341SAndroid Build Coastguard Worker mova m1, [t1+wq+400* 2] 2774*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq+400* 4] 2775*c0909341SAndroid Build Coastguard Worker paddw m0, m0 2776*c0909341SAndroid Build Coastguard Worker mova m3, [t1+wq+400* 6] 2777*c0909341SAndroid Build Coastguard Worker paddd m1, m1 2778*c0909341SAndroid Build Coastguard Worker mova m4, [t1+wq+400* 8] 2779*c0909341SAndroid Build Coastguard Worker paddd m2, m2 2780*c0909341SAndroid Build Coastguard Worker mova m5, [t1+wq+400*10] 2781*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 0], m0 2782*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 2], m1 2783*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 4], m2 2784*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 6], m3 2785*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 8], m4 
2786*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*10], m5 2787*c0909341SAndroid Build Coastguard Worker add wq, 16 2788*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 2789*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 2790*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 2791*c0909341SAndroid Build Coastguard Worker call .v0 2792*c0909341SAndroid Build Coastguard Worker jmp .main 2793*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 2794*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+4 2795*c0909341SAndroid Build Coastguard Worker%assign calloff 4 2796*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2797*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 2798*c0909341SAndroid Build Coastguard Worker%else 2799*c0909341SAndroid Build Coastguard Worker %define leftq r4 2800*c0909341SAndroid Build Coastguard Worker%endif 2801*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2802*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 2803*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 2804*c0909341SAndroid Build Coastguard Worker movddup m5, [leftq] 2805*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2806*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 2807*c0909341SAndroid Build Coastguard Worker add leftmp, 8 2808*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 10 2809*c0909341SAndroid Build Coastguard Worker jmp .h_main 2810*c0909341SAndroid Build Coastguard Worker.h_extend_left: 2811*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2812*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 2813*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+sgr_lshuf5] 2814*c0909341SAndroid Build Coastguard Worker jmp .h_main 2815*c0909341SAndroid Build Coastguard Worker.h_top: 2816*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2817*c0909341SAndroid Build Coastguard Worker lea 
wq, [r4-4] 2818*c0909341SAndroid Build Coastguard Worker%endif 2819*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2820*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 2821*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2822*c0909341SAndroid Build Coastguard Worker.h_loop: 2823*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+wq- 2] 2824*c0909341SAndroid Build Coastguard Worker.h_main: 2825*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+14] 2826*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 2827*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 2828*c0909341SAndroid Build Coastguard Worker cmp wd, -20 2829*c0909341SAndroid Build Coastguard Worker jl .h_have_right 2830*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2831*c0909341SAndroid Build Coastguard Worker pxor m8, m8 2832*c0909341SAndroid Build Coastguard Worker%endif 2833*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right 2834*c0909341SAndroid Build Coastguard Worker.h_have_right: 2835*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 2 2836*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 4 2837*c0909341SAndroid Build Coastguard Worker paddw m1, m3, m0 2838*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m0 2839*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2840*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 2841*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2842*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 6 2843*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; sum3 2844*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m0, m6 2845*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2846*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m6 2847*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 2848*c0909341SAndroid Build Coastguard 
Worker paddd m2, m7 ; sumsq3 2849*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 2850*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m5, m4 2851*c0909341SAndroid Build Coastguard Worker paddw m8, m4, m5 2852*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2853*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m4 2854*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 2855*c0909341SAndroid Build Coastguard Worker paddd m3, m0 2856*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 6], m1 2857*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 8], m2 2858*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*10], m3 2859*c0909341SAndroid Build Coastguard Worker paddw m8, m1 ; sum5 2860*c0909341SAndroid Build Coastguard Worker paddd m7, m2 ; sumsq5 2861*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2862*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 0], m8 2863*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 2], m7 2864*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 4], m5 2865*c0909341SAndroid Build Coastguard Worker add wq, 16 2866*c0909341SAndroid Build Coastguard Worker jl .h_loop 2867*c0909341SAndroid Build Coastguard Worker ret 2868*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2869*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) 2870*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2871*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 2872*c0909341SAndroid Build Coastguard Worker%else 2873*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2874*c0909341SAndroid Build Coastguard Worker%endif 2875*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2876*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 2877*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 2878*c0909341SAndroid Build Coastguard Worker movddup m5, [leftq] 
2879*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2880*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 2881*c0909341SAndroid Build Coastguard Worker add leftmp, 8 2882*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 10 2883*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2884*c0909341SAndroid Build Coastguard Worker.hv0_extend_left: 2885*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2886*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 2887*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+sgr_lshuf5] 2888*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2889*c0909341SAndroid Build Coastguard Worker.hv0_bottom: 2890*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2891*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 2892*c0909341SAndroid Build Coastguard Worker%else 2893*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2894*c0909341SAndroid Build Coastguard Worker%endif 2895*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2896*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 2897*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2898*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2899*c0909341SAndroid Build Coastguard Worker jmp .hv0_loop_start 2900*c0909341SAndroid Build Coastguard Worker%endif 2901*c0909341SAndroid Build Coastguard Worker.hv0_loop: 2902*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2903*c0909341SAndroid Build Coastguard Worker.hv0_loop_start: 2904*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+wq- 2] 2905*c0909341SAndroid Build Coastguard Worker.hv0_main: 2906*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+14] 2907*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 2908*c0909341SAndroid Build Coastguard Worker jnz .hv0_have_right 2909*c0909341SAndroid Build Coastguard Worker cmp wd, -20 2910*c0909341SAndroid Build Coastguard Worker jl 
.hv0_have_right 2911*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2912*c0909341SAndroid Build Coastguard Worker pxor m8, m8 2913*c0909341SAndroid Build Coastguard Worker%endif 2914*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right 2915*c0909341SAndroid Build Coastguard Worker.hv0_have_right: 2916*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 2 2917*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 4 2918*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 2919*c0909341SAndroid Build Coastguard Worker paddw m1, m3, m0 2920*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m0 2921*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2922*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 2923*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2924*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 6 2925*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; h sum3 2926*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m0, m6 2927*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2928*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m6 2929*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 2930*c0909341SAndroid Build Coastguard Worker paddd m2, m7 ; h sumsq3 2931*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 2932*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m5, m4 2933*c0909341SAndroid Build Coastguard Worker paddw m8, m4, m5 2934*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2935*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m4 2936*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 2937*c0909341SAndroid Build Coastguard Worker paddd m3, m0 2938*c0909341SAndroid Build Coastguard Worker paddw m8, m1 ; h sum5 2939*c0909341SAndroid Build Coastguard Worker paddd m7, m2 ; h sumsq5 2940*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2941*c0909341SAndroid 
Build Coastguard Worker mova [t3+wq*2+400*8+ 8], m8 2942*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+400*0+ 8], m7 2943*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+400*0+24], m5 2944*c0909341SAndroid Build Coastguard Worker paddw m8, [t1+wq+400* 0] 2945*c0909341SAndroid Build Coastguard Worker paddd m7, [t1+wq+400* 2] 2946*c0909341SAndroid Build Coastguard Worker paddd m5, [t1+wq+400* 4] 2947*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 0], m8 2948*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 2], m7 2949*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 4], m5 2950*c0909341SAndroid Build Coastguard Worker paddw m0, m1, [t1+wq+400* 6] 2951*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t1+wq+400* 8] 2952*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+wq+400*10] 2953*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 6], m1 2954*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400* 8], m2 2955*c0909341SAndroid Build Coastguard Worker mova [t1+wq+400*10], m3 2956*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+wq+400* 6] 2957*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+wq+400* 8] 2958*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+wq+400*10] 2959*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 6], m0 2960*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 8], m4 2961*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*10], m5 2962*c0909341SAndroid Build Coastguard Worker paddd m2, m9 2963*c0909341SAndroid Build Coastguard Worker paddd m3, m9 2964*c0909341SAndroid Build Coastguard Worker psrld m2, 4 ; (a3 + 8) >> 4 2965*c0909341SAndroid Build Coastguard Worker psrld m3, 4 2966*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2967*c0909341SAndroid Build Coastguard Worker pxor m7, m7 2968*c0909341SAndroid Build Coastguard Worker%else 2969*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 2970*c0909341SAndroid Build 
Coastguard Worker%endif 2971*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 2972*c0909341SAndroid Build Coastguard Worker pslld m5, m3, 3 2973*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; ((a3 + 8) >> 4) * 9 2974*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2975*c0909341SAndroid Build Coastguard Worker psrlw m3, m1, 1 2976*c0909341SAndroid Build Coastguard Worker pavgw m3, m7 ; (b3 + 2) >> 2 2977*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m7 2978*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2979*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m7 2980*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2981*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m7 ; b3 2982*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 2983*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2984*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 2985*c0909341SAndroid Build Coastguard Worker%endif 2986*c0909341SAndroid Build Coastguard Worker MAXSD m4, m2, m7 2987*c0909341SAndroid Build Coastguard Worker MAXSD m5, m3, m7 2988*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p3 2989*c0909341SAndroid Build Coastguard Worker psubd m5, m3 2990*c0909341SAndroid Build Coastguard Worker MULLD m4, m14, m7 ; p3 * s1 2991*c0909341SAndroid Build Coastguard Worker MULLD m5, m14, m7 2992*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 ; b3 * 455 2993*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m11 2994*c0909341SAndroid Build Coastguard Worker paddusw m4, m11 2995*c0909341SAndroid Build Coastguard Worker paddusw m5, m11 2996*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; min(z3, 255) 2997*c0909341SAndroid Build Coastguard Worker psrld m5, 20 2998*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m3, m4, m5, r0, dstm 2999*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3, m3 3000*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m3 3001*c0909341SAndroid 
Build Coastguard Worker MULLD m0, m4, m7 3002*c0909341SAndroid Build Coastguard Worker MULLD m1, m5, m7 3003*c0909341SAndroid Build Coastguard Worker paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3004*c0909341SAndroid Build Coastguard Worker paddd m1, m10 3005*c0909341SAndroid Build Coastguard Worker mova [t4+wq*1+400*2+ 4], m3 3006*c0909341SAndroid Build Coastguard Worker psrld m0, 12 3007*c0909341SAndroid Build Coastguard Worker psrld m1, 12 3008*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+400*4+ 8], m0 3009*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+400*4+24], m1 3010*c0909341SAndroid Build Coastguard Worker add wq, 16 3011*c0909341SAndroid Build Coastguard Worker jl .hv0_loop 3012*c0909341SAndroid Build Coastguard Worker ret 3013*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3014*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 3015*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3016*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 3017*c0909341SAndroid Build Coastguard Worker%else 3018*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 3019*c0909341SAndroid Build Coastguard Worker%endif 3020*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 3021*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 3022*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 3023*c0909341SAndroid Build Coastguard Worker movddup m5, [leftq] 3024*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 3025*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq+4] 3026*c0909341SAndroid Build Coastguard Worker add leftmp, 8 3027*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 10 3028*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 3029*c0909341SAndroid Build Coastguard Worker.hv1_extend_left: 3030*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 3031*c0909341SAndroid Build Coastguard Worker mova m4, 
[lpfq+wq+4] 3032*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+sgr_lshuf5] 3033*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 3034*c0909341SAndroid Build Coastguard Worker.hv1_bottom: 3035*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3036*c0909341SAndroid Build Coastguard Worker lea wq, [r4-4] 3037*c0909341SAndroid Build Coastguard Worker%else 3038*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 3039*c0909341SAndroid Build Coastguard Worker%endif 3040*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 3041*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 3042*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 3043*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3044*c0909341SAndroid Build Coastguard Worker jmp .hv1_loop_start 3045*c0909341SAndroid Build Coastguard Worker%endif 3046*c0909341SAndroid Build Coastguard Worker.hv1_loop: 3047*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 3048*c0909341SAndroid Build Coastguard Worker.hv1_loop_start: 3049*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+wq- 2] 3050*c0909341SAndroid Build Coastguard Worker.hv1_main: 3051*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq+14] 3052*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 3053*c0909341SAndroid Build Coastguard Worker jnz .hv1_have_right 3054*c0909341SAndroid Build Coastguard Worker cmp wd, -20 3055*c0909341SAndroid Build Coastguard Worker jl .hv1_have_right 3056*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3057*c0909341SAndroid Build Coastguard Worker pxor m8, m8 3058*c0909341SAndroid Build Coastguard Worker%endif 3059*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right 3060*c0909341SAndroid Build Coastguard Worker.hv1_have_right: 3061*c0909341SAndroid Build Coastguard Worker palignr m7, m5, m4, 2 3062*c0909341SAndroid Build Coastguard Worker palignr m3, m5, 
m4, 4 3063*c0909341SAndroid Build Coastguard Worker paddw m2, m7, m3 3064*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m7, m3 3065*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 3066*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m3 3067*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 3068*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 6 3069*c0909341SAndroid Build Coastguard Worker paddw m2, m3 ; h sum3 3070*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m6 3071*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 3072*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 3073*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 3074*c0909341SAndroid Build Coastguard Worker paddd m0, m1 ; h sumsq3 3075*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 3076*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m4, m5 3077*c0909341SAndroid Build Coastguard Worker paddw m8, m4, m5 3078*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 3079*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 3080*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 3081*c0909341SAndroid Build Coastguard Worker paddd m7, m3 3082*c0909341SAndroid Build Coastguard Worker paddw m5, m2, [t2+wq+400* 6] 3083*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 6], m2 3084*c0909341SAndroid Build Coastguard Worker paddw m8, m2 ; h sum5 3085*c0909341SAndroid Build Coastguard Worker paddd m2, m0, [t2+wq+400* 8] 3086*c0909341SAndroid Build Coastguard Worker paddd m3, m7, [t2+wq+400*10] 3087*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400* 8], m0 3088*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*10], m7 3089*c0909341SAndroid Build Coastguard Worker paddd m4, m0 ; h sumsq5 3090*c0909341SAndroid Build Coastguard Worker paddd m1, m7 3091*c0909341SAndroid Build Coastguard Worker paddd m2, m9 3092*c0909341SAndroid Build Coastguard Worker paddd m3, m9 3093*c0909341SAndroid Build 
Coastguard Worker psrld m2, 4 ; (a3 + 8) >> 4 3094*c0909341SAndroid Build Coastguard Worker psrld m3, 4 3095*c0909341SAndroid Build Coastguard Worker pslld m0, m2, 3 3096*c0909341SAndroid Build Coastguard Worker pslld m7, m3, 3 3097*c0909341SAndroid Build Coastguard Worker paddd m2, m0 ; ((a3 + 8) >> 4) * 9 3098*c0909341SAndroid Build Coastguard Worker paddd m3, m7 3099*c0909341SAndroid Build Coastguard Worker psrlw m7, m5, 1 3100*c0909341SAndroid Build Coastguard Worker pavgw m7, m6 ; (b3 + 2) >> 2 3101*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m7, m6 3102*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 3103*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m6 3104*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 3105*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3106*c0909341SAndroid Build Coastguard Worker mova [esp+20], m8 3107*c0909341SAndroid Build Coastguard Worker%else 3108*c0909341SAndroid Build Coastguard Worker SWAP m8, m6 3109*c0909341SAndroid Build Coastguard Worker%endif 3110*c0909341SAndroid Build Coastguard Worker MAXSD m2, m0, m8 3111*c0909341SAndroid Build Coastguard Worker MAXSD m3, m7, m8 3112*c0909341SAndroid Build Coastguard Worker pxor m8, m8 3113*c0909341SAndroid Build Coastguard Worker psubd m2, m0 ; p3 3114*c0909341SAndroid Build Coastguard Worker psubd m3, m7 3115*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m5, m8 ; b3 3116*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m8 3117*c0909341SAndroid Build Coastguard Worker MULLD m2, m14, m8 ; p3 * s1 3118*c0909341SAndroid Build Coastguard Worker MULLD m3, m14, m8 3119*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 ; b3 * 455 3120*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m11 3121*c0909341SAndroid Build Coastguard Worker paddusw m2, m11 3122*c0909341SAndroid Build Coastguard Worker paddusw m3, m11 3123*c0909341SAndroid Build Coastguard Worker psrld m2, 20 ; min(z3, 255) 3124*c0909341SAndroid Build Coastguard 
Worker movif32 t3, t3m 3125*c0909341SAndroid Build Coastguard Worker psrld m3, 20 3126*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m8, m2, m3, r0, dstm 3127*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m8, m8 3128*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m8, m8 3129*c0909341SAndroid Build Coastguard Worker MULLD m0, m2, m7 3130*c0909341SAndroid Build Coastguard Worker MULLD m5, m3, m7 3131*c0909341SAndroid Build Coastguard Worker paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3132*c0909341SAndroid Build Coastguard Worker paddd m5, m10 3133*c0909341SAndroid Build Coastguard Worker psrld m0, 12 3134*c0909341SAndroid Build Coastguard Worker psrld m5, 12 3135*c0909341SAndroid Build Coastguard Worker mova [t4+wq*1+400*4+4], m8 3136*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+400*8+ 8], m0 3137*c0909341SAndroid Build Coastguard Worker mova [t3+wq*2+400*8+24], m5 3138*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3139*c0909341SAndroid Build Coastguard Worker mova m8, [esp+20] 3140*c0909341SAndroid Build Coastguard Worker%else 3141*c0909341SAndroid Build Coastguard Worker SWAP m6, m8 3142*c0909341SAndroid Build Coastguard Worker pxor m6, m6 3143*c0909341SAndroid Build Coastguard Worker%endif 3144*c0909341SAndroid Build Coastguard Worker paddw m5, m8, [t2+wq+400*0] 3145*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+wq+400*2] 3146*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t2+wq+400*4] 3147*c0909341SAndroid Build Coastguard Worker paddw m5, [t1+wq+400*0] 3148*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+wq+400*2] 3149*c0909341SAndroid Build Coastguard Worker paddd m3, [t1+wq+400*4] 3150*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*0], m8 3151*c0909341SAndroid Build Coastguard Worker paddd m2, m9 3152*c0909341SAndroid Build Coastguard Worker paddd m3, m9 3153*c0909341SAndroid Build Coastguard Worker psrld m2, 4 ; (a5 + 8) >> 4 3154*c0909341SAndroid Build Coastguard 
Worker psrld m3, 4 3155*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*2], m4 3156*c0909341SAndroid Build Coastguard Worker pslld m8, m2, 4 3157*c0909341SAndroid Build Coastguard Worker mova [t2+wq+400*4], m1 3158*c0909341SAndroid Build Coastguard Worker pslld m4, m3, 4 3159*c0909341SAndroid Build Coastguard Worker paddd m8, m2 3160*c0909341SAndroid Build Coastguard Worker pslld m2, 3 3161*c0909341SAndroid Build Coastguard Worker paddd m4, m3 3162*c0909341SAndroid Build Coastguard Worker pslld m3, 3 3163*c0909341SAndroid Build Coastguard Worker paddd m2, m8 ; ((a5 + 8) >> 4) * 25 3164*c0909341SAndroid Build Coastguard Worker paddd m3, m4 3165*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3166*c0909341SAndroid Build Coastguard Worker pxor m7, m7 3167*c0909341SAndroid Build Coastguard Worker%else 3168*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 3169*c0909341SAndroid Build Coastguard Worker%endif 3170*c0909341SAndroid Build Coastguard Worker psrlw m1, m5, 1 3171*c0909341SAndroid Build Coastguard Worker pavgw m1, m7 ; (b5 + 2) >> 2 3172*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m1, m7 3173*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 3174*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 3175*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 3176*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m5, m7 ; b5 3177*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m7 3178*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3179*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 3180*c0909341SAndroid Build Coastguard Worker%endif 3181*c0909341SAndroid Build Coastguard Worker MAXSD m2, m4, m7 3182*c0909341SAndroid Build Coastguard Worker psubd m2, m4 ; p5 3183*c0909341SAndroid Build Coastguard Worker MAXSD m3, m1, m7 3184*c0909341SAndroid Build Coastguard Worker psubd m3, m1 3185*c0909341SAndroid Build Coastguard Worker MULLD m2, m13, m7 ; p5 * s0 3186*c0909341SAndroid Build Coastguard Worker 
; ---------------------------------------------------------------------------
; Remainder of the .hv1 loop body (the head of that loop lies above this
; point). Finishes the 5x5 term for the current 16-byte step of wq: scales
; p5 and b5, looks x5 up through GATHER_X_BY_X (macro defined elsewhere in
; this file), and stores the x5 words to t4 and the scaled dwords to t3.
; ---------------------------------------------------------------------------
    MULLD           m3, m13, m7
    pmaddwd         m0, m12                 ; b5 * 164
    pmaddwd         m5, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrld           m2, 20                  ; min(z5, 255)
    psrld           m3, 20
    GATHER_X_BY_X   m1, m2, m3, r0, dstm
    punpcklwd       m2, m1, m1              ; duplicate each x5 word so MULLD
    punpckhwd       m3, m1, m1              ; sees it in both halves of a dword
    MULLD           m0, m2, m7
    MULLD           m5, m3, m7
    paddd           m0, m10                 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m5, m10
    mova            [t4+wq*1+400*0+ 4], m1
    psrld           m0, 12
    psrld           m5, 12
    mova            [t3+wq*2+400*0+ 8], m0
    mova            [t3+wq*2+400*0+24], m5
    add             wq, 16
    jl .hv1_loop                            ; wq counts upward; loop while still negative
    mov             r10, t2                 ; swap t1 and t2 (r10 as scratch)
    mov             t2, t1
    mov             t1, r10
    ret

; ---------------------------------------------------------------------------
; .v0 -- vertical boxsums + ab3 (even rows)
;
; Per 16-byte step of wq:
;   1. 3x3 sums: 2 * [t1+400*{6,8,10}] + [t2+400*{6,8,10}]. The +400*6 plane
;      is summed as words (b3), the +400*8/+400*10 planes as dwords (a3).
;      The doubled t1 values are written into the t2 slots.
;   2. p3 = max(9 * ((a3 + 8) >> 4), B*B) - B*B with B = (b3 + 2) >> 2,
;      then scaled by m14 (s1 per the inline comment) and reduced to the
;      table index z3; GATHER_X_BY_X yields x3.
;   3. x3 words go to [t4+400*2+4], x3 * b3 * 455 dwords to [t3+400*4+8/24].
;   4. The undoubled t1 planes +400*{0,2,4} are stashed in t3 slots
;      (400*8+8, 400*0+8, 400*0+24) and then doubled in place in t1.
;
; Reads m9/m10/m11/m14 as constants set up outside this chunk (rounding
; terms, 455 and s1 according to the inline comments -- verify at the
; function prologue). MAXSD/MULLD/GATHER_X_BY_X are macros defined elsewhere.
; NOTE(review): m6 appears to be kept at zero on x86-64; the SWAP pairs
; below borrow it under the name m7 -- confirm against the prologue.
; ---------------------------------------------------------------------------
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq+400* 6]
    mova            m4, [t1+wq+400* 8]
    mova            m5, [t1+wq+400*10]
    paddw           m0, m0                  ; double the t1 row
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq+400* 6]  ; b3 (words)
    paddd           m2, m4, [t2+wq+400* 8]  ; a3 (dwords, first half)
    paddd           m3, m5, [t2+wq+400*10]  ; a3 (dwords, second half)
    mova            [t2+wq+400* 6], m0      ; doubled t1 values replace the t2 slots
    mova            [t2+wq+400* 8], m4
    mova            [t2+wq+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a3 + 8) >> 4
    psrld           m3, 4
%if ARCH_X86_32
    pxor            m7, m7                  ; zero for pavgw/unpack below
%else
    SWAP            m7, m6                  ; assemble-time rename: use m6 under the name m7
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                  ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7                  ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2                  ; square of the line above (zero in the odd words)
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7              ; b3
    punpckhwd       m1, m7
%if ARCH_X86_64
    SWAP            m7, m6                  ; undo the rename; m7 is scratch again
%endif
    MAXSD           m4, m2, m7              ; keeps the subtraction below from going negative
    MAXSD           m5, m3, m7
    psubd           m4, m2                  ; p3
    psubd           m5, m3
    MULLD           m4, m14, m7             ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11                 ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20                  ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10                 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova            [t4+wq*1+400*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+wq+400*0]       ; t1 planes +400*{0,2,4}
    mova            m4, [t1+wq+400*2]
    mova            m5, [t1+wq+400*4]
    mova            [t3+wq*2+400*8+ 8], m3  ; stash undoubled copies in t3 slots
    mova            [t3+wq*2+400*0+ 8], m4  ; (.v1 reads these same slots)
    mova            [t3+wq*2+400*0+24], m5
    paddw           m3, m3                  ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova            [t1+wq+400*0], m3       ; doubled values back into t1
    mova            [t1+wq+400*2], m4
    mova            [t1+wq+400*4], m5
    mova            [t3+wq*2+400*4+ 8], m0
    mova            [t3+wq*2+400*4+24], m1
    add             wq, 16
    jl .v0_loop
    ret

; ---------------------------------------------------------------------------
; .v1 -- vertical boxsums + ab (odd rows)
;
; Per 16-byte step of wq:
;   3x3: [t1+400*{6,8,10}] + [t2+400*{6,8,10}] (no doubling here); the t1
;        values are copied into the t2 slots. Same p3/z3/x3 pipeline as
;        .v0; x3 words go to [t4+400*4+4], scaled dwords to [t3+400*8+8/24].
;   5x5: the t3 slots 400*8+8, 400*0+8, 400*0+24 (the ones .v0 fills) are
;        read back, added to [t2+400*{0,2,4}] and [t1+400*{0,2,4}], and
;        copied into the t2 slots. a5 is scaled by 25, m13/m12 supply s0 and
;        164 per the inline comments; x5 words go to [t4+400*0+4], scaled
;        dwords to [t3+400*0+8/24].
; On exit t1 and t2 are swapped.
; ---------------------------------------------------------------------------
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-4]
%else
    mov             wd, w0m
%endif
.v1_loop:
    mova            m4, [t1+wq+400* 6]
    mova            m5, [t1+wq+400* 8]
    mova            m7, [t1+wq+400*10]
    paddw           m1, m4, [t2+wq+400* 6]  ; b3 (words)
    paddd           m2, m5, [t2+wq+400* 8]  ; a3 (dwords)
    paddd           m3, m7, [t2+wq+400*10]
    mova            [t2+wq+400* 6], m4      ; t1 values replace the t2 slots
    mova            [t2+wq+400* 8], m5
    mova            [t2+wq+400*10], m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a3 + 8) >> 4
    psrld           m3, 4
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6                  ; assemble-time rename: use m6 under the name m7
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                  ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7                  ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7              ; b3
    punpckhwd       m1, m7
%if ARCH_X86_64
    SWAP            m7, m6                  ; undo the rename
%endif
    MAXSD           m4, m2, m7
    MAXSD           m5, m3, m7
    psubd           m4, m2                  ; p3
    psubd           m5, m3
    MULLD           m4, m14, m7             ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11                 ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20                  ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10                 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova            [t4+wq*1+400*4+4], m3
    psrld           m0, 12
    psrld           m8, m1, 12              ; result kept in m8: m1 is reused just below
    mova            m4, [t3+wq*2+400*8+ 8]  ; read back the stashed planes before
    mova            m5, [t3+wq*2+400*0+ 8]  ; these t3 slots are overwritten
    mova            m7, [t3+wq*2+400*0+24]
    paddw           m1, m4, [t2+wq+400*0]
    paddd           m2, m5, [t2+wq+400*2]
    paddd           m3, m7, [t2+wq+400*4]
    paddw           m1, [t1+wq+400*0]       ; b5 (words)
    paddd           m2, [t1+wq+400*2]       ; a5 (dwords)
    paddd           m3, [t1+wq+400*4]
    mova            [t2+wq+400*0], m4
    mova            [t2+wq+400*2], m5
    mova            [t2+wq+400*4], m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a5 + 8) >> 4
    psrld           m3, 4
    mova            [t3+wq*2+400*8+ 8], m0  ; 3x3 results, first half
    pslld           m4, m2, 4
    mova            [t3+wq*2+400*8+24], m8  ; 3x3 results, second half
    pslld           m5, m3, 4
    paddd           m4, m2                  ; * 17
    pslld           m2, 3                   ; * 8
    paddd           m5, m3
    pslld           m3, 3
    paddd           m2, m4                  ; ((a5 + 8) >> 4) * 25
    paddd           m3, m5
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6                  ; assemble-time rename: use m6 under the name m7
%endif
    psrlw           m5, m1, 1
    pavgw           m5, m7                  ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7              ; b5
    punpckhwd       m1, m7
%if ARCH_X86_64
    SWAP            m7, m6                  ; undo the rename
%endif
    MAXSD           m2, m4, m7
    psubd           m2, m4                  ; p5
    MAXSD           m3, m5, m7
    psubd           m3, m5
    MULLD           m2, m13, m7             ; p5 * s0
    MULLD           m3, m13, m7
    pmaddwd         m0, m12                 ; b5 * 164
    pmaddwd         m1, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrld           m2, 20                  ; min(z5, 255)
    psrld           m3, 20
    GATHER_X_BY_X   m4, m2, m3, r0, dstm
    punpcklwd       m2, m4, m4
    punpckhwd       m3, m4, m4
    MULLD           m0, m2, m7
    MULLD           m1, m3, m7
    paddd           m0, m10                 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova            [t4+wq*1+400*0+ 4], m4
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*2+400*0+ 8], m0
    mova            [t3+wq*2+400*0+24], m1
    add             wq, 16
    jl .v1_loop
    mov             r10, t2                 ; swap t1 and t2 (r10 as scratch)
    mov             t2, t1
    mov             t1, r10
    ret

; ---------------------------------------------------------------------------
; .prep_n -- initial neighbor setup
;
; Builds horizontally weighted sums from three adjacent entries (byte
; offsets +0/+2/+4 in the word planes at t4, +0/+4/+8 and +16/+20/+24 in
; the dword planes at t3). With l/c/r the left/centre/right entry:
;   565 = 5*l + 6*c + 5*r   of t4+400*0 / t3+400*0  -> t4+400*6,  t3+400*12
;   343 = 3*l + 4*c + 3*r   of t4+400*2 / t3+400*4  -> t4+400*8,  t3+400*16
;   444 = 4*(l + c + r)     of t4+400*4 / t3+400*8  -> t4+400*10, t3+400*20
;   343                     of t4+400*4 / t3+400*8  -> t4+400*12, t3+400*24
; ---------------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*1+400*0+ 2]          ; c (words)
    movu            m1, [t3+wq*2+400*0+ 4]          ; c (dwords, first half)
    movu            m2, [t3+wq*2+400*0+20]          ; c (dwords, second half)
    movu            m7, [t4+wq*1+400*0+ 4]          ; r
    movu            m8, [t3+wq*2+400*0+ 8]
    paddw           m3, m0, [t4+wq*1+400*0+ 0]      ; c + l
    paddd           m4, m1, [t3+wq*2+400*0+ 0]
    paddd           m5, m2, [t3+wq*2+400*0+16]
    paddw           m3, m7                          ; l + c + r
    paddd           m4, m8
    movu            m7, [t3+wq*2+400*0+24]
    paddw           m0, m3                          ; c + (l + c + r)
    paddd           m1, m4
    psllw           m3, 2                           ; 4 * (l + c + r)
    pslld           m4, 2
    paddd           m5, m7
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3                          ; a5 565
    paddd           m1, m4                          ; b5 565
    paddd           m2, m5
    mova            [t4+wq*1+400* 6+ 0], m0
    mova            [t3+wq*2+400*12+ 0], m1
    mova            [t3+wq*2+400*12+16], m2
    movu            m0, [t4+wq*1+400*2+ 4]          ; r
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m2, [t3+wq*2+400*4+24]
    movu            m3, [t4+wq*1+400*2+ 2]          ; c
    movu            m4, [t3+wq*2+400*4+ 4]
    movu            m5, [t3+wq*2+400*4+20]
    paddw           m0, [t4+wq*1+400*2+ 0]          ; l + r
    paddd           m1, [t3+wq*2+400*4+ 0]
    paddd           m2, [t3+wq*2+400*4+16]
    paddw           m3, m0                          ; l + c + r
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                           ; a3[-1] 444
    pslld           m4, 2                           ; b3[-1] 444
    pslld           m5, 2
    psubw           m3, m0                          ; a3[-1] 343
    psubd           m4, m1                          ; b3[-1] 343
    psubd           m5, m2
    mova            [t4+wq*1+400* 8+ 0], m3
    mova            [t3+wq*2+400*16+ 0], m4
    mova            [t3+wq*2+400*16+16], m5
    movu            m0, [t4+wq*1+400*4+ 4]
    movu            m1, [t3+wq*2+400*8+ 8]
    movu            m2, [t3+wq*2+400*8+24]
    movu            m3, [t4+wq*1+400*4+ 2]
    movu            m4, [t3+wq*2+400*8+ 4]
    movu            m5, [t3+wq*2+400*8+20]
    paddw           m0, [t4+wq*1+400*4+ 0]
    paddd           m1, [t3+wq*2+400*8+ 0]
    paddd           m2, [t3+wq*2+400*8+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                           ; a3[ 0] 444
    pslld           m4, 2                           ; b3[ 0] 444
    pslld           m5, 2
    mova            [t4+wq*1+400*10+ 0], m3
    mova            [t3+wq*2+400*20+ 0], m4
    mova            [t3+wq*2+400*20+16], m5
    psubw           m3, m0                          ; a3[ 0] 343
    psubd           m4, m1                          ; b3[ 0] 343
    psubd           m5, m2
    mova            [t4+wq*1+400*12+ 0], m3
    mova            [t3+wq*2+400*24+ 0], m4
    mova            [t3+wq*2+400*24+16], m5
    add             wq, 16
    jl .prep_n_loop
    ret

; ---------------------------------------------------------------------------
; .n0 -- neighbor + output (even rows)
;
; Per 16-byte step of wq:
;   a5/b5: 565 sum of the t4+0 / t3+0 planes, added to the 565 sums kept at
;          t4+400*6 / t3+400*12; those slots then receive the new sums.
;   a3/b3: new 444 and 343 sums of t4+400*2 / t3+400*4; the filter value is
;          [400*8 | 400*16] + [400*10 | 400*20] + new 343, after which the
;          slots receive the new 343 and new 444 respectively.
;   output: (b - a * src) for both filters is packed as a word pair per
;          dword (5x5 term >> 9 in the low word, 3x3 term << 7 masked into
;          the high word), multiplied-and-added against m15, added to
;          (src << 13) + pd_4096, shifted right by 8, packed with signed
;          saturation, clamped at zero and shifted right by 5.
; Two 16-byte stack slots at rsp+16/rsp+32 (+4 on x86-32) hold the second
; halves of the b5 and b3 sums while the first halves are processed.
; NOTE(review): m15 is presumably the packed filter weights, and m6 is
; presumably zero on entry -- both are set up outside this chunk.
; ---------------------------------------------------------------------------
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m0, [t4+wq*1+ 4]
    movu            m2, [t4+wq*1+ 2]
    paddw           m0, [t4+wq*1+ 0]
    paddw           m0, m2                          ; l + c + r
    paddw           m2, m0
    psllw           m0, 2
    paddw           m0, m2                          ; a5
    movu            m4, [t3+wq*2+ 8]
    movu            m5, [t3+wq*2+24]
    movu            m1, [t3+wq*2+ 4]
    movu            m3, [t3+wq*2+20]
    paddd           m4, [t3+wq*2+ 0]
    paddd           m5, [t3+wq*2+16]
    paddd           m4, m1
    paddd           m5, m3
    paddd           m1, m4
    paddd           m3, m5
    pslld           m4, 2
    pslld           m5, 2
    paddd           m4, m1                          ; b5
    paddd           m5, m3
    movu            m2, [t4+wq*1+400* 6]
    paddw           m2, m0                          ; stored a5 + new a5
    mova            [t4+wq*1+400* 6], m0
    paddd           m0, m4, [t3+wq*2+400*12+ 0]     ; stored b5 + new b5
    paddd           m1, m5, [t3+wq*2+400*12+16]
    mova            [t3+wq*2+400*12+ 0], m4
    mova            [t3+wq*2+400*12+16], m5
    mova            [rsp+16+ARCH_X86_32*4], m1      ; spill second half of b5
    movu            m3, [t4+wq*1+400*2+4]
    movu            m5, [t4+wq*1+400*2+2]
    paddw           m3, [t4+wq*1+400*2+0]
    paddw           m5, m3
    psllw           m5, 2                           ; a3[ 1] 444
    psubw           m4, m5, m3                      ; a3[ 1] 343
    movu            m3, [t4+wq*1+400* 8]
    paddw           m3, [t4+wq*1+400*10]
    paddw           m3, m4                          ; a3 = stored 343 + stored 444 + new 343
    mova            [t4+wq*1+400* 8], m4
    mova            [t4+wq*1+400*10], m5
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m5, [t3+wq*2+400*4+ 4]
    movu            m7, [t3+wq*2+400*4+24]
    movu            m8, [t3+wq*2+400*4+20]
    paddd           m1, [t3+wq*2+400*4+ 0]
    paddd           m7, [t3+wq*2+400*4+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                           ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1                      ; b3[ 1] 343
%if ARCH_X86_32
    mova            [esp+52], m8                    ; keep the 444 value; m8 becomes the 343 value
    psubd           m8, m7
%else
    psubd           m6, m8, m7                      ; m6 borrowed as a temporary for the 343 value
    SWAP            m8, m6                          ; rename: "m8" = 343 value, "m6" = 444 value
%endif
    paddd           m1, m4, [t3+wq*2+400*16+ 0]
    paddd           m7, m8, [t3+wq*2+400*16+16]
    paddd           m1, [t3+wq*2+400*20+ 0]         ; b3 = stored 343 + stored 444 + new 343
    paddd           m7, [t3+wq*2+400*20+16]
    mova            [t3+wq*2+400*16+ 0], m4
    mova            [t3+wq*2+400*16+16], m8
    mova            [t3+wq*2+400*20+ 0], m5
%if ARCH_X86_32
    mova            m8, [esp+52]                    ; reload the 444 value
%else
    SWAP            m8, m6                          ; undo the rename: m8 = 444 value again
    pxor            m6, m6                          ; re-zero the borrowed m6
%endif
    mova            [t3+wq*2+400*20+16], m8
    mova            [rsp+32+ARCH_X86_32*4], m7      ; spill second half of b3
    movu            m5, [dstq+wq]                   ; src is read from the destination row
    punpcklwd       m4, m5, m6                      ; interleave with m6 (zero-extends if m6 == 0)
    punpcklwd       m7, m2, m6
    pmaddwd         m7, m4                          ; a5 * src
    punpcklwd       m8, m3, m6
    pmaddwd         m8, m4                          ; a3 * src
    punpckhwd       m5, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m5
    punpckhwd       m3, m6
    pmaddwd         m3, m5
    pslld           m4, 13                          ; src << 13
    pslld           m5, 13
    psubd           m0, m7                          ; b5 - a5 * src + (1 << 8)
    psubd           m1, m8                          ; b3 - a3 * src + (1 << 8)
    mova            m7, [base+pd_0xffff]
    psrld           m0, 9
    pslld           m1, 7
    pand            m0, m7                          ; 5x5 term in the low word
    pandn           m8, m7, m1                      ; 3x3 term in the high word
    por             m0, m8
    mova            m1, [rsp+16+ARCH_X86_32*4]      ; reload the spilled second halves
    mova            m8, [rsp+32+ARCH_X86_32*4]
    psubd           m1, m2
    psubd           m8, m3
    mova            m2, [base+pd_4096]
    psrld           m1, 9
    pslld           m8, 7
    pand            m1, m7
    pandn           m7, m8
    por             m1, m7
    pmaddwd         m0, m15                         ; low word * m15.lo + high word * m15.hi
    pmaddwd         m1, m15
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6                          ; rename: use m6 under the name m7
%endif
    paddd           m4, m2                          ; (src << 13) + pd_4096
    paddd           m5, m2
    paddd           m0, m4
    paddd           m1, m5
    psrad           m0, 8
    psrad           m1, 8
    packssdw        m0, m1                          ; clip
    pmaxsw          m0, m7                          ; clamp negatives to zero
    psrlw           m0, 5
    mova            [dstq+wq], m0
    add             wq, 16
    jl .n0_loop
    add             dstq, stridemp
    ret
%if ARCH_X86_64
    SWAP            m6, m7                          ; emits no code: restores the m6/m7 naming
%endif                                              ; for whatever is assembled after this point

; ---------------------------------------------------------------------------
; .n1 -- neighbor + output (odd rows)
;
; Per 16-byte step of wq:
;   a3/b3: new 444 and 343 sums of t4+400*4 / t3+400*8; the filter value is
;          new 343 + [400*12 | 400*24] + [400*10 | 400*20], after which
;          400*10/400*20 receive the new 444 and 400*12/400*24 the new 343.
;   a5/b5: taken as stored at t4+400*6 / t3+400*12 (not recomputed here),
;          so the 5x5 term is shifted right by 8 instead of 9.
;   output: same packing and rounding sequence as .n0.
; On x86-32 the advanced dstq is also written back to dstm.
; ---------------------------------------------------------------------------
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*1+400*4+4]
    movu            m5, [t4+wq*1+400*4+2]
    paddw           m3, [t4+wq*1+400*4+0]
    paddw           m5, m3
    psllw           m5, 2                           ; a3[ 1] 444
    psubw           m4, m5, m3                      ; a3[ 1] 343
    paddw           m3, m4, [t4+wq*1+400*12]
    paddw           m3, [t4+wq*1+400*10]            ; a3 = new 343 + stored 343 + stored 444
    mova            [t4+wq*1+400*10], m5
    mova            [t4+wq*1+400*12], m4
    movu            m1, [t3+wq*2+400*8+ 8]
    movu            m5, [t3+wq*2+400*8+ 4]
    movu            m7, [t3+wq*2+400*8+24]
    movu            m8, [t3+wq*2+400*8+20]
    paddd           m1, [t3+wq*2+400*8+ 0]
    paddd           m7, [t3+wq*2+400*8+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                           ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1                      ; b3[ 1] 343
    psubd           m0, m8, m7
    paddd           m1, m4, [t3+wq*2+400*24+ 0]
    paddd           m7, m0, [t3+wq*2+400*24+16]
    paddd           m1, [t3+wq*2+400*20+ 0]         ; b3 = new 343 + stored 343 + stored 444
    paddd           m7, [t3+wq*2+400*20+16]
    mova            [t3+wq*2+400*20+ 0], m5
    mova            [t3+wq*2+400*20+16], m8
    mova            [t3+wq*2+400*24+ 0], m4
    mova            [t3+wq*2+400*24+16], m0
    mova            m5, [dstq+wq]                   ; src is read from the destination row
    mova            m2, [t4+wq*1+400* 6]            ; stored a5
    punpcklwd       m4, m5, m6                      ; interleave with m6 (zero-extends if m6 == 0)
    punpcklwd       m8, m2, m6
    pmaddwd         m8, m4                          ; a5 * src
    punpcklwd       m0, m3, m6
    pmaddwd         m0, m4                          ; a3 * src
    punpckhwd       m5, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m5
    punpckhwd       m3, m6
    pmaddwd         m3, m5
    psubd           m1, m0                          ; b3 - a3 * src + (1 << 8)
    pslld           m4, 13                          ; src << 13
    pslld           m5, 13
    mova            m0, [t3+wq*2+400*12+ 0]         ; stored b5
    psubd           m0, m8                          ; b5 - a5 * src + (1 << 8)
    mova            m8, [t3+wq*2+400*12+16]
    psubd           m8, m2
    psubd           m7, m3
    mova            m2, [base+pd_0xffff]
    pslld           m1, 7
    psrld           m0, 8
    psrld           m8, 8
    pslld           m7, 7
    pand            m0, m2                          ; 5x5 term in the low word
    pandn           m3, m2, m1                      ; 3x3 term in the high word
    por             m0, m3
    pand            m8, m2
    pandn           m2, m7
    por             m2, m8
    mova            m1, [base+pd_4096]
    pmaddwd         m0, m15
    pmaddwd         m2, m15
%if ARCH_X86_64
    SWAP            m7, m6                          ; rename: "m7" below is the register known as m6
%endif
    pxor            m7, m7
    paddd           m4, m1                          ; (src << 13) + pd_4096
    paddd           m5, m1
    paddd           m0, m4
    paddd           m2, m5
    psrad           m0, 8
    psrad           m2, 8
    packssdw        m0, m2                          ; clip
    pmaxsw          m0, m7                          ; clamp negatives to zero
    psrlw           m0, 5
    mova            [dstq+wq], m0
    add             wq, 16
    jl .n1_loop
    add             dstq, stridemp
    movif32         dstm, dstq
    ret