; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; pshufb control applied to the 8 bytes of filter coefficients loaded from
; `flt` in the SSSE3 setup code of wiener_filter7_8bpc.
wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4

; pshufb controls used by the SSSE3 branch of the %%h7 macro (they are loaded
; into / aliased as m8..m11 by the function setup code).
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
; -1 has the high bit set, so pshufb writes a zero byte there: source bytes
; 4..11 are zero-extended to words.
wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1

; pshufb control used by .h_extend_left/.hv_extend_left when LR_HAVE_LEFT is
; not set: source byte 0 is replicated into bytes 0..4 and bytes 1..11 follow.
wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11

; NOTE(review): presumably the matching left-edge shuffles for the
; self-guided (sgr) filters; confirm against their users.
sgr_lshuf3:    db  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
sgr_lshuf5:    db  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

; 24 bytes of 0xff followed by 8 bytes of 0. .extend_right loads two 16-byte
; windows from [pb_right_ext_mask+21+x] (+0 and +8) and uses them with
; pand/pandn/por: pixel bytes under 0xff are kept, the others are replaced by
; the byte at [lpfq-1] replicated across the register.
pb_right_ext_mask: times 24 db 0xff
                   times  8 db 0
pb_1:          times 16 db 1
pw_256:        times  8 dw 256
; Added to the %%h7 result after the arithmetic shift right by 3 (used as m15).
pw_2056:       times  8 dw 2056
; Added to the centre-pixel term (pixel << 7) inside %%h7.
pw_m16380:     times  8 dw -16380
pd_4096:       times  4 dd 4096
pd_34816:      times  4 dd 34816
pd_0xffff:     times  4 dd 0xffff
pd_0xf00800a4: times  4 dd 0xf00800a4
pd_0xf00801c7: times  4 dd 0xf00801c7

cextern sgr_x_by_x

SECTION .text

; movif64 dst, src
; Emits `mov dst, src` only when ARCH_X86_64 is non-zero; expands to nothing
; otherwise.
%macro movif64 2 ; dst, src
 %if ARCH_X86_64
    mov             %1, %2
 %endif
%endmacro

; movif32 dst, src
; Emits `mov dst, src` only when ARCH_X86_32 is non-zero; expands to nothing
; otherwise.
%macro movif32 2 ; dst, src
 %if ARCH_X86_32
    mov             %1, %2
 %endif
%endmacro

%if ARCH_X86_32
 ; `$$` is NASM's "start of the current section" token; PIC_sym() expresses
 ; symbol addresses relative to it.
 %define PIC_base_offset $$
; ---- x86-32 position-independent addressing helpers -------------------------

 ; SETUP_PIC PIC_reg [, save_PIC_reg = 1 [, restore_PIC_reg = 0]]
 ;   %1  register that will hold the PIC base
 ;   %2  if 1, the register's current value is first stored to [esp]
 ;   %3  if 1, XCHG_PIC_REG is expanded right after the base is loaded, i.e.
 ;       the base is parked at [esp+4] and the saved value is reloaded
 ; Resets the assemble-time slot offset pic_reg_stk_off to 4 and (re)binds the
 ; PIC_reg alias to %1.
 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
  %assign pic_reg_stk_off 4
  %xdefine PIC_reg %1
  %if %2 == 1
    mov        [esp], %1
  %endif
    LEA      PIC_reg, PIC_base_offset
  %if %3 == 1
    XCHG_PIC_REG
  %endif
 %endmacro

 ; XCHG_PIC_REG
 ; Stores PIC_reg to [esp+pic_reg_stk_off], flips the offset between 4 and 0,
 ; then loads PIC_reg from the other slot. The offset is flipped with %assign,
 ; so it follows the textual order of expansions, not run-time control flow.
 %macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
 %endmacro

 ; Address of `sym` expressed relative to the base held in PIC_reg.
 %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)

%else
 ; Outside x86-32 no PIC register is used: the exchange is a no-op and symbols
 ; are referenced directly.
 %macro XCHG_PIC_REG 0
 %endmacro

 %define PIC_sym(sym)   (sym)
%endif

; Opening statements of WIENER, carried over unchanged; the cglobal argument
; list follows on the next line of the file.
%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
cglobal
wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ 104*c0909341SAndroid Build Coastguard Worker w, h, edge, flt, x 105*c0909341SAndroid Build Coastguard Worker %define tmpstrideq strideq 106*c0909341SAndroid Build Coastguard Worker %define base 0 107*c0909341SAndroid Build Coastguard Worker mov fltq, r6mp 108*c0909341SAndroid Build Coastguard Worker mov wd, wm 109*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 110*c0909341SAndroid Build Coastguard Worker mov edged, r7m 111*c0909341SAndroid Build Coastguard Worker movq m14, [fltq] 112*c0909341SAndroid Build Coastguard Worker add lpfq, wq 113*c0909341SAndroid Build Coastguard Worker movq m7, [fltq+16] 114*c0909341SAndroid Build Coastguard Worker add dstq, wq 115*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+16] 116*c0909341SAndroid Build Coastguard Worker mova m15, [pw_2056] 117*c0909341SAndroid Build Coastguard Worker neg wq 118*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 119*c0909341SAndroid Build Coastguard Worker pshufb m14, [wiener_init] 120*c0909341SAndroid Build Coastguard Worker mova m8, [wiener_shufA] 121*c0909341SAndroid Build Coastguard Worker pshufd m12, m14, q2222 ; x0 x0 122*c0909341SAndroid Build Coastguard Worker mova m9, [wiener_shufB] 123*c0909341SAndroid Build Coastguard Worker pshufd m13, m14, q3333 ; x1 x2 124*c0909341SAndroid Build Coastguard Worker mova m10, [wiener_shufC] 125*c0909341SAndroid Build Coastguard Worker punpcklqdq m14, m14 ; x3 126*c0909341SAndroid Build Coastguard Worker mova m11, [wiener_shufD] 127*c0909341SAndroid Build Coastguard Worker%else 128*c0909341SAndroid Build Coastguard Worker mova m10, [pw_m16380] 129*c0909341SAndroid Build Coastguard Worker punpcklwd m14, m14 130*c0909341SAndroid Build Coastguard Worker pshufd m11, m14, q0000 ; x0 131*c0909341SAndroid Build Coastguard Worker pshufd m12, m14, q1111 ; x1 132*c0909341SAndroid Build Coastguard Worker pshufd m13, m14, q2222 ; x2 133*c0909341SAndroid Build 
Coastguard Worker pshufd m14, m14, q3333 ; x3 134*c0909341SAndroid Build Coastguard Worker%endif 135*c0909341SAndroid Build Coastguard Worker%else 136*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 0, _, 5 137*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 138*c0909341SAndroid Build Coastguard Worker %define m10 [base+wiener_shufC] 139*c0909341SAndroid Build Coastguard Worker %define m11 [base+wiener_shufD] 140*c0909341SAndroid Build Coastguard Worker %define stk_off 96 141*c0909341SAndroid Build Coastguard Worker%else 142*c0909341SAndroid Build Coastguard Worker %define m10 [base+pw_m16380] 143*c0909341SAndroid Build Coastguard Worker %define m11 [stk+96] 144*c0909341SAndroid Build Coastguard Worker %define stk_off 112 145*c0909341SAndroid Build Coastguard Worker%endif 146*c0909341SAndroid Build Coastguard Workercglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride 147*c0909341SAndroid Build Coastguard Worker %define base r6-pb_right_ext_mask-21 148*c0909341SAndroid Build Coastguard Worker %define stk esp 149*c0909341SAndroid Build Coastguard Worker %define dstq leftq 150*c0909341SAndroid Build Coastguard Worker %define edgeb byte edged 151*c0909341SAndroid Build Coastguard Worker %define edged [stk+ 8] 152*c0909341SAndroid Build Coastguard Worker %define dstmp [stk+12] 153*c0909341SAndroid Build Coastguard Worker %define hd dword [stk+16] 154*c0909341SAndroid Build Coastguard Worker %define wq [stk+20] 155*c0909341SAndroid Build Coastguard Worker %define strideq [stk+24] 156*c0909341SAndroid Build Coastguard Worker %define leftmp [stk+28] 157*c0909341SAndroid Build Coastguard Worker %define t2 [stk+32] 158*c0909341SAndroid Build Coastguard Worker %define t4 [stk+36] 159*c0909341SAndroid Build Coastguard Worker %define t5 [stk+40] 160*c0909341SAndroid Build Coastguard Worker %define t6 [stk+44] 161*c0909341SAndroid Build Coastguard Worker %define m8 [base+wiener_shufA] 162*c0909341SAndroid Build Coastguard Worker 
%define m9 [base+wiener_shufB] 163*c0909341SAndroid Build Coastguard Worker %define m12 [stk+48] 164*c0909341SAndroid Build Coastguard Worker %define m13 [stk+64] 165*c0909341SAndroid Build Coastguard Worker %define m14 [stk+80] 166*c0909341SAndroid Build Coastguard Worker %define m15 [base+pw_2056] 167*c0909341SAndroid Build Coastguard Worker mov r1, r6m ; flt 168*c0909341SAndroid Build Coastguard Worker mov r0, r0m ; dst 169*c0909341SAndroid Build Coastguard Worker mov r4, r4m ; w 170*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 171*c0909341SAndroid Build Coastguard Worker mov r2, r7m ; edge 172*c0909341SAndroid Build Coastguard Worker mov r5, r5m ; h 173*c0909341SAndroid Build Coastguard Worker movq m3, [r1+ 0] 174*c0909341SAndroid Build Coastguard Worker movq m7, [r1+16] 175*c0909341SAndroid Build Coastguard Worker add r0, r4 176*c0909341SAndroid Build Coastguard Worker mov r1, r1m ; stride 177*c0909341SAndroid Build Coastguard Worker add lpfq, r4 178*c0909341SAndroid Build Coastguard Worker mov edged, r2 179*c0909341SAndroid Build Coastguard Worker mov r2, r2m ; left 180*c0909341SAndroid Build Coastguard Worker mov dstmp, r0 181*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+r4*2+stk_off] 182*c0909341SAndroid Build Coastguard Worker mov hd, r5 183*c0909341SAndroid Build Coastguard Worker neg r4 184*c0909341SAndroid Build Coastguard Worker LEA r6, pb_right_ext_mask+21 185*c0909341SAndroid Build Coastguard Worker mov wq, r4 186*c0909341SAndroid Build Coastguard Worker mov strideq, r1 187*c0909341SAndroid Build Coastguard Worker mov leftmp, r2 188*c0909341SAndroid Build Coastguard Worker mov r4, r1 189*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 190*c0909341SAndroid Build Coastguard Worker pshufb m3, [base+wiener_init] 191*c0909341SAndroid Build Coastguard Worker pshufd m1, m3, q2222 192*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q3333 193*c0909341SAndroid Build Coastguard Worker punpcklqdq m3, m3 194*c0909341SAndroid 
Build Coastguard Worker%else 195*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 196*c0909341SAndroid Build Coastguard Worker pshufd m0, m3, q0000 197*c0909341SAndroid Build Coastguard Worker pshufd m1, m3, q1111 198*c0909341SAndroid Build Coastguard Worker pshufd m2, m3, q2222 199*c0909341SAndroid Build Coastguard Worker pshufd m3, m3, q3333 200*c0909341SAndroid Build Coastguard Worker mova m11, m0 201*c0909341SAndroid Build Coastguard Worker%endif 202*c0909341SAndroid Build Coastguard Worker mova m12, m1 203*c0909341SAndroid Build Coastguard Worker mova m13, m2 204*c0909341SAndroid Build Coastguard Worker mova m14, m3 205*c0909341SAndroid Build Coastguard Worker%endif 206*c0909341SAndroid Build Coastguard Worker psllw m7, 5 207*c0909341SAndroid Build Coastguard Worker pshufd m6, m7, q0000 ; y0 y1 208*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q1111 ; y2 y3 209*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 210*c0909341SAndroid Build Coastguard Worker jz .no_top 211*c0909341SAndroid Build Coastguard Worker call .h_top 212*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 213*c0909341SAndroid Build Coastguard Worker mov t6, t1 214*c0909341SAndroid Build Coastguard Worker mov t5, t1 215*c0909341SAndroid Build Coastguard Worker add t1, 384*2 216*c0909341SAndroid Build Coastguard Worker call .h_top 217*c0909341SAndroid Build Coastguard Worker lea t3, [lpfq+tmpstrideq*4] 218*c0909341SAndroid Build Coastguard Worker mov lpfq, dstmp 219*c0909341SAndroid Build Coastguard Worker add t3, tmpstrideq 220*c0909341SAndroid Build Coastguard Worker mov [rsp], t3 ; below 221*c0909341SAndroid Build Coastguard Worker mov t4, t1 222*c0909341SAndroid Build Coastguard Worker add t1, 384*2 223*c0909341SAndroid Build Coastguard Worker call .h 224*c0909341SAndroid Build Coastguard Worker mov t3, t1 225*c0909341SAndroid Build Coastguard Worker mov t2, t1 226*c0909341SAndroid Build Coastguard Worker dec hd 227*c0909341SAndroid Build 
Coastguard Worker jz .v1 228*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 229*c0909341SAndroid Build Coastguard Worker add t1, 384*2 230*c0909341SAndroid Build Coastguard Worker call .h 231*c0909341SAndroid Build Coastguard Worker mov t2, t1 232*c0909341SAndroid Build Coastguard Worker dec hd 233*c0909341SAndroid Build Coastguard Worker jz .v2 234*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 235*c0909341SAndroid Build Coastguard Worker add t1, 384*2 236*c0909341SAndroid Build Coastguard Worker call .h 237*c0909341SAndroid Build Coastguard Worker dec hd 238*c0909341SAndroid Build Coastguard Worker jz .v3 239*c0909341SAndroid Build Coastguard Worker.main: 240*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 241*c0909341SAndroid Build Coastguard Worker.main_loop: 242*c0909341SAndroid Build Coastguard Worker call .hv 243*c0909341SAndroid Build Coastguard Worker dec hd 244*c0909341SAndroid Build Coastguard Worker jnz .main_loop 245*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 246*c0909341SAndroid Build Coastguard Worker jz .v3 247*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 248*c0909341SAndroid Build Coastguard Worker call .hv_bottom 249*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 250*c0909341SAndroid Build Coastguard Worker call .hv_bottom 251*c0909341SAndroid Build Coastguard Worker.v1: 252*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v 253*c0909341SAndroid Build Coastguard Worker RET 254*c0909341SAndroid Build Coastguard Worker.no_top: 255*c0909341SAndroid Build Coastguard Worker lea t3, [lpfq+tmpstrideq*4] 256*c0909341SAndroid Build Coastguard Worker mov lpfq, dstmp 257*c0909341SAndroid Build Coastguard Worker lea t3, [t3+tmpstrideq*2] 258*c0909341SAndroid Build Coastguard Worker mov [rsp], t3 259*c0909341SAndroid Build Coastguard Worker call .h 260*c0909341SAndroid Build Coastguard Worker mov t6, t1 261*c0909341SAndroid 
Build Coastguard Worker mov t5, t1 262*c0909341SAndroid Build Coastguard Worker mov t4, t1 263*c0909341SAndroid Build Coastguard Worker mov t3, t1 264*c0909341SAndroid Build Coastguard Worker mov t2, t1 265*c0909341SAndroid Build Coastguard Worker dec hd 266*c0909341SAndroid Build Coastguard Worker jz .v1 267*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 268*c0909341SAndroid Build Coastguard Worker add t1, 384*2 269*c0909341SAndroid Build Coastguard Worker call .h 270*c0909341SAndroid Build Coastguard Worker mov t2, t1 271*c0909341SAndroid Build Coastguard Worker dec hd 272*c0909341SAndroid Build Coastguard Worker jz .v2 273*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 274*c0909341SAndroid Build Coastguard Worker add t1, 384*2 275*c0909341SAndroid Build Coastguard Worker call .h 276*c0909341SAndroid Build Coastguard Worker dec hd 277*c0909341SAndroid Build Coastguard Worker jz .v3 278*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 279*c0909341SAndroid Build Coastguard Worker call .hv 280*c0909341SAndroid Build Coastguard Worker dec hd 281*c0909341SAndroid Build Coastguard Worker jz .v3 282*c0909341SAndroid Build Coastguard Worker add t0, 384*8 283*c0909341SAndroid Build Coastguard Worker call .hv 284*c0909341SAndroid Build Coastguard Worker dec hd 285*c0909341SAndroid Build Coastguard Worker jnz .main 286*c0909341SAndroid Build Coastguard Worker.v3: 287*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v 288*c0909341SAndroid Build Coastguard Worker.v2: 289*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v 290*c0909341SAndroid Build Coastguard Worker jmp .v1 291*c0909341SAndroid Build Coastguard Worker.extend_right: 292*c0909341SAndroid Build Coastguard Worker movd m2, [lpfq-1] 293*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 294*c0909341SAndroid Build Coastguard Worker push r0 295*c0909341SAndroid Build Coastguard 
Worker lea r0, [pb_right_ext_mask+21] 296*c0909341SAndroid Build Coastguard Worker movu m0, [r0+xq+0] 297*c0909341SAndroid Build Coastguard Worker movu m1, [r0+xq+8] 298*c0909341SAndroid Build Coastguard Worker pop r0 299*c0909341SAndroid Build Coastguard Worker%else 300*c0909341SAndroid Build Coastguard Worker movu m0, [r6+xq+0] 301*c0909341SAndroid Build Coastguard Worker movu m1, [r6+xq+8] 302*c0909341SAndroid Build Coastguard Worker%endif 303*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 304*c0909341SAndroid Build Coastguard Worker pxor m3, m3 305*c0909341SAndroid Build Coastguard Worker pshufb m2, m3 306*c0909341SAndroid Build Coastguard Worker%else 307*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m2 308*c0909341SAndroid Build Coastguard Worker pshuflw m2, m2, q0000 309*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m2 310*c0909341SAndroid Build Coastguard Worker%endif 311*c0909341SAndroid Build Coastguard Worker pand m4, m0 312*c0909341SAndroid Build Coastguard Worker pand m5, m1 313*c0909341SAndroid Build Coastguard Worker pandn m0, m2 314*c0909341SAndroid Build Coastguard Worker pandn m1, m2 315*c0909341SAndroid Build Coastguard Worker por m4, m0 316*c0909341SAndroid Build Coastguard Worker por m5, m1 317*c0909341SAndroid Build Coastguard Worker ret 318*c0909341SAndroid Build Coastguard Worker.h: 319*c0909341SAndroid Build Coastguard Worker %define stk esp+4 ; offset due to call 320*c0909341SAndroid Build Coastguard Worker mov xq, wq 321*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 322*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 323*c0909341SAndroid Build Coastguard Worker movifnidn leftq, leftmp 324*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 325*c0909341SAndroid Build Coastguard Worker movd m5, [leftq] 326*c0909341SAndroid Build Coastguard Worker add leftq, 4 327*c0909341SAndroid Build Coastguard Worker pslldq m4, 4 328*c0909341SAndroid Build Coastguard Worker por m4, 
m5 329*c0909341SAndroid Build Coastguard Worker movifnidn leftmp, leftq 330*c0909341SAndroid Build Coastguard Worker jmp .h_main 331*c0909341SAndroid Build Coastguard Worker.h_extend_left: 332*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 333*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 334*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+wiener_l_shuf] 335*c0909341SAndroid Build Coastguard Worker%else 336*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+xq] 337*c0909341SAndroid Build Coastguard Worker pshufd m4, m5, q2103 338*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m5 339*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m5 340*c0909341SAndroid Build Coastguard Worker movss m4, m5 341*c0909341SAndroid Build Coastguard Worker%endif 342*c0909341SAndroid Build Coastguard Worker jmp .h_main 343*c0909341SAndroid Build Coastguard Worker.h_top: 344*c0909341SAndroid Build Coastguard Worker mov xq, wq 345*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 346*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 347*c0909341SAndroid Build Coastguard Worker.h_loop: 348*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+xq-4] 349*c0909341SAndroid Build Coastguard Worker.h_main: 350*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+xq+4] 351*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 352*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 353*c0909341SAndroid Build Coastguard Worker cmp xd, -18 354*c0909341SAndroid Build Coastguard Worker jl .h_have_right 355*c0909341SAndroid Build Coastguard Worker call .extend_right 356*c0909341SAndroid Build Coastguard Worker.h_have_right: 357*c0909341SAndroid Build Coastguard Worker%macro %%h7 0 358*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 359*c0909341SAndroid Build Coastguard Worker pshufb m0, m4, m8 360*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m12 361*c0909341SAndroid Build 
Coastguard Worker pshufb m1, m5, m8 362*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m12 363*c0909341SAndroid Build Coastguard Worker pshufb m2, m4, m9 364*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m13 365*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m9 366*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m13 367*c0909341SAndroid Build Coastguard Worker paddw m0, m2 368*c0909341SAndroid Build Coastguard Worker pshufb m2, m4, m10 369*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m13 370*c0909341SAndroid Build Coastguard Worker paddw m1, m3 371*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m10 372*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m13 373*c0909341SAndroid Build Coastguard Worker pshufb m4, m11 374*c0909341SAndroid Build Coastguard Worker paddw m0, m2 375*c0909341SAndroid Build Coastguard Worker pmullw m2, m14, m4 376*c0909341SAndroid Build Coastguard Worker pshufb m5, m11 377*c0909341SAndroid Build Coastguard Worker paddw m1, m3 378*c0909341SAndroid Build Coastguard Worker pmullw m3, m14, m5 379*c0909341SAndroid Build Coastguard Worker psllw m4, 7 380*c0909341SAndroid Build Coastguard Worker psllw m5, 7 381*c0909341SAndroid Build Coastguard Worker paddw m0, m2 382*c0909341SAndroid Build Coastguard Worker mova m2, [base+pw_m16380] 383*c0909341SAndroid Build Coastguard Worker paddw m1, m3 384*c0909341SAndroid Build Coastguard Worker paddw m4, m2 385*c0909341SAndroid Build Coastguard Worker paddw m5, m2 386*c0909341SAndroid Build Coastguard Worker paddsw m0, m4 387*c0909341SAndroid Build Coastguard Worker paddsw m1, m5 388*c0909341SAndroid Build Coastguard Worker%else 389*c0909341SAndroid Build Coastguard Worker psrldq m0, m4, 1 390*c0909341SAndroid Build Coastguard Worker pslldq m1, m4, 1 391*c0909341SAndroid Build Coastguard Worker pxor m3, m3 392*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3 393*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m3 394*c0909341SAndroid 
Build Coastguard Worker paddw m0, m1 395*c0909341SAndroid Build Coastguard Worker pmullw m0, m11 396*c0909341SAndroid Build Coastguard Worker psrldq m1, m4, 2 397*c0909341SAndroid Build Coastguard Worker pslldq m2, m4, 2 398*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3 399*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m3 400*c0909341SAndroid Build Coastguard Worker paddw m1, m2 401*c0909341SAndroid Build Coastguard Worker pmullw m1, m12 402*c0909341SAndroid Build Coastguard Worker paddw m0, m1 403*c0909341SAndroid Build Coastguard Worker pshufd m2, m4, q0321 404*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3 405*c0909341SAndroid Build Coastguard Worker pmullw m1, m14, m2 406*c0909341SAndroid Build Coastguard Worker paddw m0, m1 407*c0909341SAndroid Build Coastguard Worker psrldq m1, m4, 3 408*c0909341SAndroid Build Coastguard Worker pslldq m4, 3 409*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3 410*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m3 411*c0909341SAndroid Build Coastguard Worker paddw m1, m4 412*c0909341SAndroid Build Coastguard Worker pmullw m1, m13 413*c0909341SAndroid Build Coastguard Worker paddw m0, m1 414*c0909341SAndroid Build Coastguard Worker psllw m2, 7 415*c0909341SAndroid Build Coastguard Worker paddw m2, m10 416*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 417*c0909341SAndroid Build Coastguard Worker psrldq m1, m5, 1 418*c0909341SAndroid Build Coastguard Worker pslldq m2, m5, 1 419*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3 420*c0909341SAndroid Build Coastguard Worker punpckhbw m2, m3 421*c0909341SAndroid Build Coastguard Worker paddw m1, m2 422*c0909341SAndroid Build Coastguard Worker pmullw m1, m11 423*c0909341SAndroid Build Coastguard Worker psrldq m2, m5, 2 424*c0909341SAndroid Build Coastguard Worker pslldq m4, m5, 2 425*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3 426*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m3 
427*c0909341SAndroid Build Coastguard Worker paddw m2, m4 428*c0909341SAndroid Build Coastguard Worker pmullw m2, m12 429*c0909341SAndroid Build Coastguard Worker paddw m1, m2 430*c0909341SAndroid Build Coastguard Worker pshufd m4, m5, q0321 431*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m3 432*c0909341SAndroid Build Coastguard Worker pmullw m2, m14, m4 433*c0909341SAndroid Build Coastguard Worker paddw m1, m2 434*c0909341SAndroid Build Coastguard Worker psrldq m2, m5, 3 435*c0909341SAndroid Build Coastguard Worker pslldq m5, 3 436*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3 437*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m3 438*c0909341SAndroid Build Coastguard Worker paddw m2, m5 439*c0909341SAndroid Build Coastguard Worker pmullw m2, m13 440*c0909341SAndroid Build Coastguard Worker paddw m1, m2 441*c0909341SAndroid Build Coastguard Worker psllw m4, 7 442*c0909341SAndroid Build Coastguard Worker paddw m4, m10 443*c0909341SAndroid Build Coastguard Worker paddsw m1, m4 444*c0909341SAndroid Build Coastguard Worker%endif 445*c0909341SAndroid Build Coastguard Worker%endmacro 446*c0909341SAndroid Build Coastguard Worker %%h7 447*c0909341SAndroid Build Coastguard Worker psraw m0, 3 448*c0909341SAndroid Build Coastguard Worker psraw m1, 3 449*c0909341SAndroid Build Coastguard Worker paddw m0, m15 450*c0909341SAndroid Build Coastguard Worker paddw m1, m15 451*c0909341SAndroid Build Coastguard Worker mova [t1+xq*2+ 0], m0 452*c0909341SAndroid Build Coastguard Worker mova [t1+xq*2+16], m1 453*c0909341SAndroid Build Coastguard Worker add xq, 16 454*c0909341SAndroid Build Coastguard Worker jl .h_loop 455*c0909341SAndroid Build Coastguard Worker ret 456*c0909341SAndroid Build Coastguard WorkerALIGN function_align 457*c0909341SAndroid Build Coastguard Worker.hv: 458*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 459*c0909341SAndroid Build Coastguard Worker mov xq, wq 460*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; 
LR_HAVE_LEFT 461*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 462*c0909341SAndroid Build Coastguard Worker movifnidn leftq, leftmp 463*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 464*c0909341SAndroid Build Coastguard Worker movd m5, [leftq] 465*c0909341SAndroid Build Coastguard Worker add leftq, 4 466*c0909341SAndroid Build Coastguard Worker pslldq m4, 4 467*c0909341SAndroid Build Coastguard Worker por m4, m5 468*c0909341SAndroid Build Coastguard Worker movifnidn leftmp, leftq 469*c0909341SAndroid Build Coastguard Worker jmp .hv_main 470*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 471*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 472*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 473*c0909341SAndroid Build Coastguard Worker pshufb m4, [base+wiener_l_shuf] 474*c0909341SAndroid Build Coastguard Worker%else 475*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+xq] 476*c0909341SAndroid Build Coastguard Worker pshufd m4, m5, q2103 477*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m5 478*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m5 479*c0909341SAndroid Build Coastguard Worker movss m4, m5 480*c0909341SAndroid Build Coastguard Worker%endif 481*c0909341SAndroid Build Coastguard Worker jmp .hv_main 482*c0909341SAndroid Build Coastguard Worker.hv_bottom: 483*c0909341SAndroid Build Coastguard Worker mov xq, wq 484*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 485*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 486*c0909341SAndroid Build Coastguard Worker.hv_loop: 487*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+xq-4] 488*c0909341SAndroid Build Coastguard Worker.hv_main: 489*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+xq+4] 490*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 491*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 492*c0909341SAndroid Build Coastguard Worker cmp xd, -18 
493*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 494*c0909341SAndroid Build Coastguard Worker call .extend_right 495*c0909341SAndroid Build Coastguard Worker.hv_have_right: 496*c0909341SAndroid Build Coastguard Worker %%h7 497*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 498*c0909341SAndroid Build Coastguard Worker mova m2, [t4+xq*2] 499*c0909341SAndroid Build Coastguard Worker paddw m2, [t2+xq*2] 500*c0909341SAndroid Build Coastguard Worker%else 501*c0909341SAndroid Build Coastguard Worker mov r2, t4 502*c0909341SAndroid Build Coastguard Worker mova m2, [r2+xq*2] 503*c0909341SAndroid Build Coastguard Worker mov r2, t2 504*c0909341SAndroid Build Coastguard Worker paddw m2, [r2+xq*2] 505*c0909341SAndroid Build Coastguard Worker mov r2, t5 506*c0909341SAndroid Build Coastguard Worker%endif 507*c0909341SAndroid Build Coastguard Worker mova m3, [t3+xq*2] 508*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 509*c0909341SAndroid Build Coastguard Worker mova m5, [t5+xq*2] 510*c0909341SAndroid Build Coastguard Worker%else 511*c0909341SAndroid Build Coastguard Worker mova m5, [r2+xq*2] 512*c0909341SAndroid Build Coastguard Worker mov r2, t6 513*c0909341SAndroid Build Coastguard Worker%endif 514*c0909341SAndroid Build Coastguard Worker paddw m5, [t1+xq*2] 515*c0909341SAndroid Build Coastguard Worker psraw m0, 3 516*c0909341SAndroid Build Coastguard Worker psraw m1, 3 517*c0909341SAndroid Build Coastguard Worker paddw m0, m15 518*c0909341SAndroid Build Coastguard Worker paddw m1, m15 519*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 520*c0909341SAndroid Build Coastguard Worker paddw m4, m0, [t6+xq*2] 521*c0909341SAndroid Build Coastguard Worker%else 522*c0909341SAndroid Build Coastguard Worker paddw m4, m0, [r2+xq*2] 523*c0909341SAndroid Build Coastguard Worker mov r2, t4 524*c0909341SAndroid Build Coastguard Worker%endif 525*c0909341SAndroid Build Coastguard Worker mova [t0+xq*2], m0 526*c0909341SAndroid Build Coastguard Worker 
punpcklwd m0, m2, m3 527*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m7 528*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 529*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 530*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 531*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 532*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 533*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6 534*c0909341SAndroid Build Coastguard Worker paddd m0, m3 535*c0909341SAndroid Build Coastguard Worker mova m3, [t3+xq*2+16] 536*c0909341SAndroid Build Coastguard Worker paddd m4, m2 537*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 538*c0909341SAndroid Build Coastguard Worker mova m2, [t4+xq*2+16] 539*c0909341SAndroid Build Coastguard Worker paddw m2, [t2+xq*2+16] 540*c0909341SAndroid Build Coastguard Worker mova m5, [t5+xq*2+16] 541*c0909341SAndroid Build Coastguard Worker%else 542*c0909341SAndroid Build Coastguard Worker mova m2, [r2+xq*2+16] 543*c0909341SAndroid Build Coastguard Worker mov r2, t2 544*c0909341SAndroid Build Coastguard Worker paddw m2, [r2+xq*2+16] 545*c0909341SAndroid Build Coastguard Worker mov r2, t5 546*c0909341SAndroid Build Coastguard Worker mova m5, [r2+xq*2+16] 547*c0909341SAndroid Build Coastguard Worker mov r2, t6 548*c0909341SAndroid Build Coastguard Worker%endif 549*c0909341SAndroid Build Coastguard Worker paddw m5, [t1+xq*2+16] 550*c0909341SAndroid Build Coastguard Worker packuswb m0, m4 551*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 552*c0909341SAndroid Build Coastguard Worker paddw m4, m1, [t6+xq*2+16] 553*c0909341SAndroid Build Coastguard Worker%else 554*c0909341SAndroid Build Coastguard Worker paddw m4, m1, [r2+xq*2+16] 555*c0909341SAndroid Build Coastguard Worker mov dstq, dstmp 556*c0909341SAndroid Build Coastguard Worker%endif 557*c0909341SAndroid Build Coastguard Worker mova [t0+xq*2+16], m1 558*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 
559*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 560*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 561*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 562*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 563*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 564*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 565*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6 566*c0909341SAndroid Build Coastguard Worker paddd m1, m3 567*c0909341SAndroid Build Coastguard Worker paddd m2, m4 568*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 569*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 570*c0909341SAndroid Build Coastguard Worker psrlw m1, 8 571*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 572*c0909341SAndroid Build Coastguard Worker mova [dstq+xq], m0 573*c0909341SAndroid Build Coastguard Worker add xq, 16 574*c0909341SAndroid Build Coastguard Worker jl .hv_loop 575*c0909341SAndroid Build Coastguard Worker add dstq, strideq 576*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 577*c0909341SAndroid Build Coastguard Worker mov t6, t5 578*c0909341SAndroid Build Coastguard Worker mov t5, t4 579*c0909341SAndroid Build Coastguard Worker mov t4, t3 580*c0909341SAndroid Build Coastguard Worker mov t3, t2 581*c0909341SAndroid Build Coastguard Worker mov t2, t1 582*c0909341SAndroid Build Coastguard Worker mov t1, t0 583*c0909341SAndroid Build Coastguard Worker mov t0, t6 584*c0909341SAndroid Build Coastguard Worker%else 585*c0909341SAndroid Build Coastguard Worker mov dstmp, dstq 586*c0909341SAndroid Build Coastguard Worker mov r1, t5 587*c0909341SAndroid Build Coastguard Worker mov r2, t4 588*c0909341SAndroid Build Coastguard Worker mov t6, r1 589*c0909341SAndroid Build Coastguard Worker mov t5, r2 590*c0909341SAndroid Build Coastguard Worker mov t4, t3 591*c0909341SAndroid Build Coastguard Worker mov t3, t2 592*c0909341SAndroid Build Coastguard Worker mov t2, t1 593*c0909341SAndroid 
Build Coastguard Worker mov t1, t0 594*c0909341SAndroid Build Coastguard Worker mov t0, r1 595*c0909341SAndroid Build Coastguard Worker%endif 596*c0909341SAndroid Build Coastguard Worker ret 597*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code 598*c0909341SAndroid Build Coastguard Worker.v: 599*c0909341SAndroid Build Coastguard Worker mov xq, wq 600*c0909341SAndroid Build Coastguard Worker.v_loop: 601*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 602*c0909341SAndroid Build Coastguard Worker mova m1, [t4+xq*2] 603*c0909341SAndroid Build Coastguard Worker paddw m1, [t2+xq*2] 604*c0909341SAndroid Build Coastguard Worker%else 605*c0909341SAndroid Build Coastguard Worker mov r2, t4 606*c0909341SAndroid Build Coastguard Worker mova m1, [r2+xq*2] 607*c0909341SAndroid Build Coastguard Worker mov r2, t2 608*c0909341SAndroid Build Coastguard Worker paddw m1, [r2+xq*2] 609*c0909341SAndroid Build Coastguard Worker mov r2, t6 610*c0909341SAndroid Build Coastguard Worker%endif 611*c0909341SAndroid Build Coastguard Worker mova m2, [t3+xq*2] 612*c0909341SAndroid Build Coastguard Worker mova m4, [t1+xq*2] 613*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 614*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [t6+xq*2] 615*c0909341SAndroid Build Coastguard Worker paddw m4, [t5+xq*2] 616*c0909341SAndroid Build Coastguard Worker%else 617*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [r2+xq*2] 618*c0909341SAndroid Build Coastguard Worker mov r2, t5 619*c0909341SAndroid Build Coastguard Worker paddw m4, [r2+xq*2] 620*c0909341SAndroid Build Coastguard Worker mov r2, t4 621*c0909341SAndroid Build Coastguard Worker%endif 622*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 623*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m7 624*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 625*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 626*c0909341SAndroid Build Coastguard 
Worker punpcklwd m2, m3, m4 627*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m6 628*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 629*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 630*c0909341SAndroid Build Coastguard Worker paddd m0, m2 631*c0909341SAndroid Build Coastguard Worker paddd m1, m3 632*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 633*c0909341SAndroid Build Coastguard Worker mova m2, [t4+xq*2+16] 634*c0909341SAndroid Build Coastguard Worker paddw m2, [t2+xq*2+16] 635*c0909341SAndroid Build Coastguard Worker%else 636*c0909341SAndroid Build Coastguard Worker mova m2, [r2+xq*2+16] 637*c0909341SAndroid Build Coastguard Worker mov r2, t2 638*c0909341SAndroid Build Coastguard Worker paddw m2, [r2+xq*2+16] 639*c0909341SAndroid Build Coastguard Worker mov r2, t6 640*c0909341SAndroid Build Coastguard Worker%endif 641*c0909341SAndroid Build Coastguard Worker mova m3, [t3+xq*2+16] 642*c0909341SAndroid Build Coastguard Worker mova m5, [t1+xq*2+16] 643*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 644*c0909341SAndroid Build Coastguard Worker paddw m4, m5, [t6+xq*2+16] 645*c0909341SAndroid Build Coastguard Worker paddw m5, [t5+xq*2+16] 646*c0909341SAndroid Build Coastguard Worker%else 647*c0909341SAndroid Build Coastguard Worker paddw m4, m5, [r2+xq*2+16] 648*c0909341SAndroid Build Coastguard Worker mov r2, t5 649*c0909341SAndroid Build Coastguard Worker paddw m5, [r2+xq*2+16] 650*c0909341SAndroid Build Coastguard Worker movifnidn dstq, dstmp 651*c0909341SAndroid Build Coastguard Worker%endif 652*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 653*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 654*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 655*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 656*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 657*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 658*c0909341SAndroid Build Coastguard Worker pmaddwd m3, 
m6 659*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 660*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6 661*c0909341SAndroid Build Coastguard Worker paddd m1, m3 662*c0909341SAndroid Build Coastguard Worker paddd m2, m4 663*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 664*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 665*c0909341SAndroid Build Coastguard Worker psrlw m1, 8 666*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 667*c0909341SAndroid Build Coastguard Worker mova [dstq+xq], m0 668*c0909341SAndroid Build Coastguard Worker add xq, 16 669*c0909341SAndroid Build Coastguard Worker jl .v_loop 670*c0909341SAndroid Build Coastguard Worker add dstq, strideq 671*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 672*c0909341SAndroid Build Coastguard Worker mov t6, t5 673*c0909341SAndroid Build Coastguard Worker mov t5, t4 674*c0909341SAndroid Build Coastguard Worker%else 675*c0909341SAndroid Build Coastguard Worker mov dstmp, dstq 676*c0909341SAndroid Build Coastguard Worker mov r1, t5 677*c0909341SAndroid Build Coastguard Worker mov r2, t4 678*c0909341SAndroid Build Coastguard Worker mov t6, r1 679*c0909341SAndroid Build Coastguard Worker mov t5, r2 680*c0909341SAndroid Build Coastguard Worker%endif 681*c0909341SAndroid Build Coastguard Worker mov t4, t3 682*c0909341SAndroid Build Coastguard Worker mov t3, t2 683*c0909341SAndroid Build Coastguard Worker mov t2, t1 684*c0909341SAndroid Build Coastguard Worker ret 685*c0909341SAndroid Build Coastguard Worker%endif 686*c0909341SAndroid Build Coastguard Worker 687*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 688*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ 689*c0909341SAndroid Build Coastguard Worker w, h, edge, flt, x 690*c0909341SAndroid Build Coastguard Worker mov fltq, r6mp 691*c0909341SAndroid Build Coastguard Worker mov wd, wm 692*c0909341SAndroid Build Coastguard Worker 
movifnidn hd, hm 693*c0909341SAndroid Build Coastguard Worker mov edged, r7m 694*c0909341SAndroid Build Coastguard Worker movq m14, [fltq] 695*c0909341SAndroid Build Coastguard Worker add lpfq, wq 696*c0909341SAndroid Build Coastguard Worker movq m7, [fltq+16] 697*c0909341SAndroid Build Coastguard Worker add dstq, wq 698*c0909341SAndroid Build Coastguard Worker mova m8, [pw_m16380] 699*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+16] 700*c0909341SAndroid Build Coastguard Worker mova m15, [pw_2056] 701*c0909341SAndroid Build Coastguard Worker neg wq 702*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 703*c0909341SAndroid Build Coastguard Worker pshufb m14, [wiener_init] 704*c0909341SAndroid Build Coastguard Worker mova m9, [wiener_shufB] 705*c0909341SAndroid Build Coastguard Worker pshufd m13, m14, q3333 ; x1 x2 706*c0909341SAndroid Build Coastguard Worker mova m10, [wiener_shufC] 707*c0909341SAndroid Build Coastguard Worker punpcklqdq m14, m14 ; x3 708*c0909341SAndroid Build Coastguard Worker mova m11, [wiener_shufD] 709*c0909341SAndroid Build Coastguard Worker mova m12, [wiener_l_shuf] 710*c0909341SAndroid Build Coastguard Worker%else 711*c0909341SAndroid Build Coastguard Worker punpcklwd m14, m14 712*c0909341SAndroid Build Coastguard Worker pshufd m11, m14, q1111 ; x1 713*c0909341SAndroid Build Coastguard Worker pshufd m13, m14, q2222 ; x2 714*c0909341SAndroid Build Coastguard Worker pshufd m14, m14, q3333 ; x3 715*c0909341SAndroid Build Coastguard Worker%endif 716*c0909341SAndroid Build Coastguard Worker%else 717*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 718*c0909341SAndroid Build Coastguard Worker %define stk_off 80 719*c0909341SAndroid Build Coastguard Worker%else 720*c0909341SAndroid Build Coastguard Worker %define m11 [stk+80] 721*c0909341SAndroid Build Coastguard Worker %define stk_off 96 722*c0909341SAndroid Build Coastguard Worker%endif 723*c0909341SAndroid Build Coastguard Workercglobal wiener_filter5_8bpc, 0, 7, 
8, -384*8-stk_off, _, x, left, lpf, tmpstride 724*c0909341SAndroid Build Coastguard Worker %define stk esp 725*c0909341SAndroid Build Coastguard Worker %define leftmp [stk+28] 726*c0909341SAndroid Build Coastguard Worker %define m8 [base+pw_m16380] 727*c0909341SAndroid Build Coastguard Worker %define m12 [base+wiener_l_shuf] 728*c0909341SAndroid Build Coastguard Worker %define m14 [stk+48] 729*c0909341SAndroid Build Coastguard Worker mov r1, r6m ; flt 730*c0909341SAndroid Build Coastguard Worker mov r0, r0m ; dst 731*c0909341SAndroid Build Coastguard Worker mov r4, r4m ; w 732*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 733*c0909341SAndroid Build Coastguard Worker mov r2, r7m ; edge 734*c0909341SAndroid Build Coastguard Worker mov r5, r5m ; h 735*c0909341SAndroid Build Coastguard Worker movq m2, [r1+ 0] 736*c0909341SAndroid Build Coastguard Worker movq m7, [r1+16] 737*c0909341SAndroid Build Coastguard Worker add r0, r4 738*c0909341SAndroid Build Coastguard Worker mov r1, r1m ; stride 739*c0909341SAndroid Build Coastguard Worker add lpfq, r4 740*c0909341SAndroid Build Coastguard Worker mov edged, r2 741*c0909341SAndroid Build Coastguard Worker mov r2, r2m ; left 742*c0909341SAndroid Build Coastguard Worker mov dstmp, r0 743*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+r4*2+stk_off] 744*c0909341SAndroid Build Coastguard Worker mov hd, r5 745*c0909341SAndroid Build Coastguard Worker neg r4 746*c0909341SAndroid Build Coastguard Worker LEA r6, pb_right_ext_mask+21 747*c0909341SAndroid Build Coastguard Worker mov wq, r4 748*c0909341SAndroid Build Coastguard Worker mov strideq, r1 749*c0909341SAndroid Build Coastguard Worker mov leftmp, r2 750*c0909341SAndroid Build Coastguard Worker mov r4, r1 751*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 752*c0909341SAndroid Build Coastguard Worker pshufb m2, [base+wiener_init] 753*c0909341SAndroid Build Coastguard Worker pshufd m1, m2, q3333 754*c0909341SAndroid Build Coastguard Worker punpcklqdq 
m2, m2 755*c0909341SAndroid Build Coastguard Worker%else 756*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m2 757*c0909341SAndroid Build Coastguard Worker pshufd m0, m2, q1111 758*c0909341SAndroid Build Coastguard Worker pshufd m1, m2, q2222 759*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q3333 760*c0909341SAndroid Build Coastguard Worker mova m11, m0 761*c0909341SAndroid Build Coastguard Worker%endif 762*c0909341SAndroid Build Coastguard Worker mova m13, m1 763*c0909341SAndroid Build Coastguard Worker mova m14, m2 764*c0909341SAndroid Build Coastguard Worker%endif 765*c0909341SAndroid Build Coastguard Worker psllw m7, 5 766*c0909341SAndroid Build Coastguard Worker pshufd m6, m7, q0000 ; __ y1 767*c0909341SAndroid Build Coastguard Worker pshufd m7, m7, q1111 ; y2 y3 768*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 769*c0909341SAndroid Build Coastguard Worker jz .no_top 770*c0909341SAndroid Build Coastguard Worker call .h_top 771*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 772*c0909341SAndroid Build Coastguard Worker mov t4, t1 773*c0909341SAndroid Build Coastguard Worker add t1, 384*2 774*c0909341SAndroid Build Coastguard Worker call .h_top 775*c0909341SAndroid Build Coastguard Worker lea xq, [lpfq+tmpstrideq*4] 776*c0909341SAndroid Build Coastguard Worker mov lpfq, dstmp 777*c0909341SAndroid Build Coastguard Worker mov t3, t1 778*c0909341SAndroid Build Coastguard Worker add t1, 384*2 779*c0909341SAndroid Build Coastguard Worker add xq, tmpstrideq 780*c0909341SAndroid Build Coastguard Worker mov [rsp], xq ; below 781*c0909341SAndroid Build Coastguard Worker call .h 782*c0909341SAndroid Build Coastguard Worker mov t2, t1 783*c0909341SAndroid Build Coastguard Worker dec hd 784*c0909341SAndroid Build Coastguard Worker jz .v1 785*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 786*c0909341SAndroid Build Coastguard Worker add t1, 384*2 787*c0909341SAndroid Build Coastguard Worker call .h 
788*c0909341SAndroid Build Coastguard Worker dec hd 789*c0909341SAndroid Build Coastguard Worker jz .v2 790*c0909341SAndroid Build Coastguard Worker.main: 791*c0909341SAndroid Build Coastguard Worker mov t0, t4 792*c0909341SAndroid Build Coastguard Worker.main_loop: 793*c0909341SAndroid Build Coastguard Worker call .hv 794*c0909341SAndroid Build Coastguard Worker dec hd 795*c0909341SAndroid Build Coastguard Worker jnz .main_loop 796*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 797*c0909341SAndroid Build Coastguard Worker jz .v2 798*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 799*c0909341SAndroid Build Coastguard Worker call .hv_bottom 800*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 801*c0909341SAndroid Build Coastguard Worker call .hv_bottom 802*c0909341SAndroid Build Coastguard Worker.end: 803*c0909341SAndroid Build Coastguard Worker RET 804*c0909341SAndroid Build Coastguard Worker.no_top: 805*c0909341SAndroid Build Coastguard Worker lea t3, [lpfq+tmpstrideq*4] 806*c0909341SAndroid Build Coastguard Worker mov lpfq, dstmp 807*c0909341SAndroid Build Coastguard Worker lea t3, [t3+tmpstrideq*2] 808*c0909341SAndroid Build Coastguard Worker mov [rsp], t3 809*c0909341SAndroid Build Coastguard Worker call .h 810*c0909341SAndroid Build Coastguard Worker mov t4, t1 811*c0909341SAndroid Build Coastguard Worker mov t3, t1 812*c0909341SAndroid Build Coastguard Worker mov t2, t1 813*c0909341SAndroid Build Coastguard Worker dec hd 814*c0909341SAndroid Build Coastguard Worker jz .v1 815*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 816*c0909341SAndroid Build Coastguard Worker add t1, 384*2 817*c0909341SAndroid Build Coastguard Worker call .h 818*c0909341SAndroid Build Coastguard Worker dec hd 819*c0909341SAndroid Build Coastguard Worker jz .v2 820*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 821*c0909341SAndroid Build Coastguard Worker call .hv 822*c0909341SAndroid Build Coastguard Worker dec hd 
823*c0909341SAndroid Build Coastguard Worker jz .v2 824*c0909341SAndroid Build Coastguard Worker add t0, 384*6 825*c0909341SAndroid Build Coastguard Worker call .hv 826*c0909341SAndroid Build Coastguard Worker dec hd 827*c0909341SAndroid Build Coastguard Worker jnz .main 828*c0909341SAndroid Build Coastguard Worker.v2: 829*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v 830*c0909341SAndroid Build Coastguard Worker add dstq, strideq 831*c0909341SAndroid Build Coastguard Worker mov t4, t3 832*c0909341SAndroid Build Coastguard Worker mov t3, t2 833*c0909341SAndroid Build Coastguard Worker mov t2, t1 834*c0909341SAndroid Build Coastguard Worker movifnidn dstmp, dstq 835*c0909341SAndroid Build Coastguard Worker.v1: 836*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v 837*c0909341SAndroid Build Coastguard Worker jmp .end 838*c0909341SAndroid Build Coastguard Worker.h: 839*c0909341SAndroid Build Coastguard Worker %define stk esp+4 840*c0909341SAndroid Build Coastguard Worker mov xq, wq 841*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 842*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 843*c0909341SAndroid Build Coastguard Worker movifnidn leftq, leftmp 844*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 845*c0909341SAndroid Build Coastguard Worker movd m5, [leftq] 846*c0909341SAndroid Build Coastguard Worker add leftq, 4 847*c0909341SAndroid Build Coastguard Worker pslldq m4, 4 848*c0909341SAndroid Build Coastguard Worker por m4, m5 849*c0909341SAndroid Build Coastguard Worker movifnidn leftmp, leftq 850*c0909341SAndroid Build Coastguard Worker jmp .h_main 851*c0909341SAndroid Build Coastguard Worker.h_extend_left: 852*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 853*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 854*c0909341SAndroid Build Coastguard Worker pshufb m4, m12 855*c0909341SAndroid 
Build Coastguard Worker%else 856*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+xq] 857*c0909341SAndroid Build Coastguard Worker pshufd m4, m5, q2103 858*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m5 859*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m5 860*c0909341SAndroid Build Coastguard Worker movss m4, m5 861*c0909341SAndroid Build Coastguard Worker%endif 862*c0909341SAndroid Build Coastguard Worker jmp .h_main 863*c0909341SAndroid Build Coastguard Worker.h_top: 864*c0909341SAndroid Build Coastguard Worker mov xq, wq 865*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 866*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 867*c0909341SAndroid Build Coastguard Worker.h_loop: 868*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+xq-4] 869*c0909341SAndroid Build Coastguard Worker.h_main: 870*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+xq+4] 871*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 872*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 873*c0909341SAndroid Build Coastguard Worker cmp xd, -17 874*c0909341SAndroid Build Coastguard Worker jl .h_have_right 875*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right 876*c0909341SAndroid Build Coastguard Worker.h_have_right: 877*c0909341SAndroid Build Coastguard Worker%macro %%h5 0 878*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 879*c0909341SAndroid Build Coastguard Worker pshufb m0, m4, m9 880*c0909341SAndroid Build Coastguard Worker pmaddubsw m0, m13 881*c0909341SAndroid Build Coastguard Worker pshufb m1, m5, m9 882*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m13 883*c0909341SAndroid Build Coastguard Worker pshufb m2, m4, m10 884*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m13 885*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m10 886*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m13 
887*c0909341SAndroid Build Coastguard Worker pshufb m4, m11 888*c0909341SAndroid Build Coastguard Worker paddw m0, m2 889*c0909341SAndroid Build Coastguard Worker pmullw m2, m14, m4 890*c0909341SAndroid Build Coastguard Worker pshufb m5, m11 891*c0909341SAndroid Build Coastguard Worker paddw m1, m3 892*c0909341SAndroid Build Coastguard Worker pmullw m3, m14, m5 893*c0909341SAndroid Build Coastguard Worker psllw m4, 7 894*c0909341SAndroid Build Coastguard Worker psllw m5, 7 895*c0909341SAndroid Build Coastguard Worker paddw m4, m8 896*c0909341SAndroid Build Coastguard Worker paddw m5, m8 897*c0909341SAndroid Build Coastguard Worker paddw m0, m2 898*c0909341SAndroid Build Coastguard Worker paddw m1, m3 899*c0909341SAndroid Build Coastguard Worker paddsw m0, m4 900*c0909341SAndroid Build Coastguard Worker paddsw m1, m5 901*c0909341SAndroid Build Coastguard Worker%else 902*c0909341SAndroid Build Coastguard Worker psrldq m0, m4, 2 903*c0909341SAndroid Build Coastguard Worker pslldq m1, m4, 2 904*c0909341SAndroid Build Coastguard Worker pxor m3, m3 905*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m3 906*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m3 907*c0909341SAndroid Build Coastguard Worker paddw m0, m1 908*c0909341SAndroid Build Coastguard Worker pmullw m0, m11 909*c0909341SAndroid Build Coastguard Worker pshufd m2, m4, q0321 910*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3 911*c0909341SAndroid Build Coastguard Worker pmullw m1, m14, m2 912*c0909341SAndroid Build Coastguard Worker paddw m0, m1 913*c0909341SAndroid Build Coastguard Worker psrldq m1, m4, 3 914*c0909341SAndroid Build Coastguard Worker pslldq m4, 3 915*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3 916*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m3 917*c0909341SAndroid Build Coastguard Worker paddw m1, m4 918*c0909341SAndroid Build Coastguard Worker pmullw m1, m13 919*c0909341SAndroid Build Coastguard Worker paddw m0, m1 920*c0909341SAndroid Build 
Coastguard Worker psllw m2, 7 921*c0909341SAndroid Build Coastguard Worker paddw m2, m8 922*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 923*c0909341SAndroid Build Coastguard Worker psrldq m1, m5, 2 924*c0909341SAndroid Build Coastguard Worker pslldq m4, m5, 2 925*c0909341SAndroid Build Coastguard Worker punpcklbw m1, m3 926*c0909341SAndroid Build Coastguard Worker punpckhbw m4, m3 927*c0909341SAndroid Build Coastguard Worker paddw m1, m4 928*c0909341SAndroid Build Coastguard Worker pmullw m1, m11 929*c0909341SAndroid Build Coastguard Worker pshufd m4, m5, q0321 930*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m3 931*c0909341SAndroid Build Coastguard Worker pmullw m2, m14, m4 932*c0909341SAndroid Build Coastguard Worker paddw m1, m2 933*c0909341SAndroid Build Coastguard Worker psrldq m2, m5, 3 934*c0909341SAndroid Build Coastguard Worker pslldq m5, 3 935*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3 936*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m3 937*c0909341SAndroid Build Coastguard Worker paddw m2, m5 938*c0909341SAndroid Build Coastguard Worker pmullw m2, m13 939*c0909341SAndroid Build Coastguard Worker paddw m1, m2 940*c0909341SAndroid Build Coastguard Worker psllw m4, 7 941*c0909341SAndroid Build Coastguard Worker paddw m4, m8 942*c0909341SAndroid Build Coastguard Worker paddsw m1, m4 943*c0909341SAndroid Build Coastguard Worker%endif 944*c0909341SAndroid Build Coastguard Worker%endmacro 945*c0909341SAndroid Build Coastguard Worker %%h5 946*c0909341SAndroid Build Coastguard Worker psraw m0, 3 947*c0909341SAndroid Build Coastguard Worker psraw m1, 3 948*c0909341SAndroid Build Coastguard Worker paddw m0, m15 949*c0909341SAndroid Build Coastguard Worker paddw m1, m15 950*c0909341SAndroid Build Coastguard Worker mova [t1+xq*2+ 0], m0 951*c0909341SAndroid Build Coastguard Worker mova [t1+xq*2+16], m1 952*c0909341SAndroid Build Coastguard Worker add xq, 16 953*c0909341SAndroid Build Coastguard Worker jl .h_loop 
954*c0909341SAndroid Build Coastguard Worker ret 955*c0909341SAndroid Build Coastguard WorkerALIGN function_align 956*c0909341SAndroid Build Coastguard Worker.hv: 957*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 958*c0909341SAndroid Build Coastguard Worker mov xq, wq 959*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 960*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 961*c0909341SAndroid Build Coastguard Worker movifnidn leftq, leftmp 962*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 963*c0909341SAndroid Build Coastguard Worker movd m5, [leftq] 964*c0909341SAndroid Build Coastguard Worker add leftq, 4 965*c0909341SAndroid Build Coastguard Worker pslldq m4, 4 966*c0909341SAndroid Build Coastguard Worker por m4, m5 967*c0909341SAndroid Build Coastguard Worker movifnidn leftmp, leftq 968*c0909341SAndroid Build Coastguard Worker jmp .hv_main 969*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 970*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 971*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+xq] 972*c0909341SAndroid Build Coastguard Worker pshufb m4, m12 973*c0909341SAndroid Build Coastguard Worker%else 974*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+xq] 975*c0909341SAndroid Build Coastguard Worker pshufd m4, m5, q2103 976*c0909341SAndroid Build Coastguard Worker punpcklbw m5, m5 977*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m5 978*c0909341SAndroid Build Coastguard Worker movss m4, m5 979*c0909341SAndroid Build Coastguard Worker%endif 980*c0909341SAndroid Build Coastguard Worker jmp .hv_main 981*c0909341SAndroid Build Coastguard Worker.hv_bottom: 982*c0909341SAndroid Build Coastguard Worker mov xq, wq 983*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 984*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 985*c0909341SAndroid Build Coastguard Worker.hv_loop: 986*c0909341SAndroid Build Coastguard Worker movu m4, 
[lpfq+xq-4] 987*c0909341SAndroid Build Coastguard Worker.hv_main: 988*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+xq+4] 989*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 990*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 991*c0909341SAndroid Build Coastguard Worker cmp xd, -17 992*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 993*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right 994*c0909341SAndroid Build Coastguard Worker.hv_have_right: 995*c0909341SAndroid Build Coastguard Worker %%h5 996*c0909341SAndroid Build Coastguard Worker mova m2, [t3+xq*2] 997*c0909341SAndroid Build Coastguard Worker paddw m2, [t1+xq*2] 998*c0909341SAndroid Build Coastguard Worker psraw m0, 3 999*c0909341SAndroid Build Coastguard Worker psraw m1, 3 1000*c0909341SAndroid Build Coastguard Worker paddw m0, m15 1001*c0909341SAndroid Build Coastguard Worker paddw m1, m15 1002*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1003*c0909341SAndroid Build Coastguard Worker mova m3, [t2+xq*2] 1004*c0909341SAndroid Build Coastguard Worker paddw m4, m0, [t4+xq*2] 1005*c0909341SAndroid Build Coastguard Worker%else 1006*c0909341SAndroid Build Coastguard Worker mov r2, t2 1007*c0909341SAndroid Build Coastguard Worker mova m3, [r2+xq*2] 1008*c0909341SAndroid Build Coastguard Worker mov r2, t4 1009*c0909341SAndroid Build Coastguard Worker paddw m4, m0, [r2+xq*2] 1010*c0909341SAndroid Build Coastguard Worker%endif 1011*c0909341SAndroid Build Coastguard Worker mova [t0+xq*2], m0 1012*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2, m3 1013*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m7 1014*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 1015*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 1016*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m4 1017*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 1018*c0909341SAndroid 
Build Coastguard Worker punpckhwd m4, m4 1019*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6 1020*c0909341SAndroid Build Coastguard Worker paddd m0, m3 1021*c0909341SAndroid Build Coastguard Worker paddd m4, m2 1022*c0909341SAndroid Build Coastguard Worker mova m2, [t3+xq*2+16] 1023*c0909341SAndroid Build Coastguard Worker paddw m2, [t1+xq*2+16] 1024*c0909341SAndroid Build Coastguard Worker packuswb m0, m4 1025*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1026*c0909341SAndroid Build Coastguard Worker mova m3, [t2+xq*2+16] 1027*c0909341SAndroid Build Coastguard Worker paddw m4, m1, [t4+xq*2+16] 1028*c0909341SAndroid Build Coastguard Worker%else 1029*c0909341SAndroid Build Coastguard Worker paddw m4, m1, [r2+xq*2+16] 1030*c0909341SAndroid Build Coastguard Worker mov r2, t2 1031*c0909341SAndroid Build Coastguard Worker mova m3, [r2+xq*2+16] 1032*c0909341SAndroid Build Coastguard Worker mov dstq, dstmp 1033*c0909341SAndroid Build Coastguard Worker%endif 1034*c0909341SAndroid Build Coastguard Worker mova [t0+xq*2+16], m1 1035*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 1036*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 1037*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 1038*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 1039*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m4 1040*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 1041*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m4 1042*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6 1043*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1044*c0909341SAndroid Build Coastguard Worker paddd m2, m4 1045*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 1046*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 1047*c0909341SAndroid Build Coastguard Worker psrlw m1, 8 1048*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1049*c0909341SAndroid Build Coastguard Worker mova [dstq+xq], m0 
1050*c0909341SAndroid Build Coastguard Worker add xq, 16 1051*c0909341SAndroid Build Coastguard Worker jl .hv_loop 1052*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1053*c0909341SAndroid Build Coastguard Worker mov t4, t3 1054*c0909341SAndroid Build Coastguard Worker mov t3, t2 1055*c0909341SAndroid Build Coastguard Worker mov t2, t1 1056*c0909341SAndroid Build Coastguard Worker mov t1, t0 1057*c0909341SAndroid Build Coastguard Worker mov t0, t4 1058*c0909341SAndroid Build Coastguard Worker movifnidn dstmp, dstq 1059*c0909341SAndroid Build Coastguard Worker ret 1060*c0909341SAndroid Build Coastguard Worker%if cpuflag(ssse3) 1061*c0909341SAndroid Build Coastguard Worker.v: 1062*c0909341SAndroid Build Coastguard Worker mov xq, wq 1063*c0909341SAndroid Build Coastguard Worker.v_loop: 1064*c0909341SAndroid Build Coastguard Worker mova m3, [t1+xq*2] 1065*c0909341SAndroid Build Coastguard Worker paddw m1, m3, [t3+xq*2] 1066*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1067*c0909341SAndroid Build Coastguard Worker mova m2, [t2+xq*2] 1068*c0909341SAndroid Build Coastguard Worker paddw m3, [t4+xq*2] 1069*c0909341SAndroid Build Coastguard Worker%else 1070*c0909341SAndroid Build Coastguard Worker mov r2, t2 1071*c0909341SAndroid Build Coastguard Worker mova m2, [r2+xq*2] 1072*c0909341SAndroid Build Coastguard Worker mov r2, t4 1073*c0909341SAndroid Build Coastguard Worker paddw m3, [r2+xq*2] 1074*c0909341SAndroid Build Coastguard Worker%endif 1075*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 1076*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m7 1077*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 1078*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 1079*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 1080*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m6 1081*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m3 1082*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 
1083*c0909341SAndroid Build Coastguard Worker paddd m0, m2 1084*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1085*c0909341SAndroid Build Coastguard Worker mova m4, [t1+xq*2+16] 1086*c0909341SAndroid Build Coastguard Worker paddw m2, m4, [t3+xq*2+16] 1087*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1088*c0909341SAndroid Build Coastguard Worker mova m3, [t2+xq*2+16] 1089*c0909341SAndroid Build Coastguard Worker paddw m4, [t4+xq*2+16] 1090*c0909341SAndroid Build Coastguard Worker%else 1091*c0909341SAndroid Build Coastguard Worker paddw m4, [r2+xq*2+16] 1092*c0909341SAndroid Build Coastguard Worker mov r2, t2 1093*c0909341SAndroid Build Coastguard Worker mova m3, [r2+xq*2+16] 1094*c0909341SAndroid Build Coastguard Worker mov dstq, dstmp 1095*c0909341SAndroid Build Coastguard Worker%endif 1096*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1097*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 1098*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m7 1099*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 1100*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m7 1101*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 1102*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m6 1103*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m4 1104*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m6 1105*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1106*c0909341SAndroid Build Coastguard Worker paddd m2, m4 1107*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 1108*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 1109*c0909341SAndroid Build Coastguard Worker psrlw m1, 8 1110*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1111*c0909341SAndroid Build Coastguard Worker mova [dstq+xq], m0 1112*c0909341SAndroid Build Coastguard Worker add xq, 16 1113*c0909341SAndroid Build Coastguard Worker jl .v_loop 1114*c0909341SAndroid Build Coastguard Worker ret 1115*c0909341SAndroid Build 
%endif
%endmacro

; Expand WIENER once for each instruction-set target selected by INIT_XMM.
INIT_XMM sse2
WIENER

INIT_XMM ssse3
WIENER

;;;;;;;;;;;;;;;;;;;;;;;;;;
;;      self-guided     ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;

; GATHERDD dst, src, tmp
;   Table gather driven by the four dword lanes of src.
;     dst  XMM register receiving the gathered data
;     src  XMM register holding one index per dword lane (only read)
;     tmp  general-purpose register that holds one index at a time (clobbered)
;   The table base is r13 on x86-64 and base+sgr_x_by_x-0xf03 on x86-32.
;   Lane 0 is loaded as a whole dword; the movd load also clears lanes 1-3.
;   Lanes 1-3 only receive the word at table+index+2, placed in their upper
;   word, so their lower word stays zero. Only the upper half of each lane is
;   therefore consistent across all four lanes.
;   Lane 0 uses the full low dword of src as its index while lanes 1-3 use
;   the low word of their lane (pextrw), so indices are assumed to fit in
;   16 bits -- NOTE(review): confirm against the callers.
%macro GATHERDD 3 ; dst, src, tmp
    movd           %3d, %2                           ; tmp = index of lane 0
 %if ARCH_X86_64
    movd            %1, [r13+%3]                     ; lane 0: full dword, lanes 1-3 = 0
    pextrw         %3d, %2, 2                        ; tmp = index of lane 1
    pinsrw          %1, [r13+%3+2], 3                ; -> upper word of lane 1
    pextrw         %3d, %2, 4                        ; tmp = index of lane 2
    pinsrw          %1, [r13+%3+2], 5                ; -> upper word of lane 2
    pextrw         %3d, %2, 6                        ; tmp = index of lane 3
    pinsrw          %1, [r13+%3+2], 7                ; -> upper word of lane 3
 %else
    ; Same sequence as above with the table addressed relative to `base`.
    movd            %1, [base+sgr_x_by_x-0xf03+%3]
    pextrw          %3, %2, 2
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 3
    pextrw          %3, %2, 4
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 5
    pextrw          %3, %2, 6
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 7
 %endif
%endmacro

; GATHER_X_BY_X dst, src0, src1, tmp32, tmp32_restore
;   Looks up one table byte for each of the eight dword indices in src0/src1
;   and returns them as eight words in dst (words 0-3 from src0, words 4-7
;   from src1).
;     dst            XMM register receiving the packed result
;     src0           indices for words 0-3; OVERWRITTEN (it is reused as the
;                    gather destination for src1)
;     src1           indices for words 4-7 (only read)
;     tmp32          scratch GPR used for the gathers on x86-32
;     tmp32_restore  operand tmp32 is reloaded from after the gathers
;   On x86-64 the scratch register is r14 and tmp32 is not used by the gathers.
;   movif32 is defined elsewhere -- presumably it emits the mov only on
;   x86-32 builds; confirm against its definition.
;   `psrld 24` keeps the top byte of every gathered dword, i.e. the byte at
;   table+index+3. With the table base sgr_x_by_x-0xf03 that byte is
;   sgr_x_by_x[index-0xf00].
;   Note: the single-line macro `tmp` stays defined after expansion.
%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
 %if ARCH_X86_64
  %define tmp r14
 %else
  %define tmp %4
 %endif
    GATHERDD        %1, %2, tmp                      ; dst  = gather(src0)
    GATHERDD        %2, %3, tmp                      ; src0 = gather(src1)
    movif32         %4, %5                           ; reload the scratch GPR
    psrld           %1, 24                           ; keep the top byte of each lane
    psrld           %2, 24
    packssdw        %1, %2                           ; 8 dwords -> 8 words in dst
%endmacro

; MULLD dst, src, tmp
;   32-bit low multiply built from 16-bit multiplies.
;   For each dword lane with words (dl, dh) in dst and (sl, sh) in src:
;     dst = (dl*sl + ((dh*sh) << 16)) mod 2^32
;   which is dst*s mod 2^32 when both words of the src lane hold the same
;   unsigned 16-bit multiplier s.
;     dst  XMM register, multiplicand in / product out
;     src  XMM register or memory operand with the multiplier words (only read)
;     tmp  XMM scratch register (clobbered)
%macro MULLD 3 ; dst, src, tmp
    pmulhuw         %3, %1, %2                       ; high halves of the word products
    pmullw          %1, %2                           ; low halves of the word products
    pslld           %3, 16                           ; carry hi(dl*sl) into the upper word
    paddd           %1, %3
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 0, 1, 2, 3, 5
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 5*16
 %else
  %assign extra_stack 3*16
 %endif
cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
Coastguard Worker dst, stride, left, lpf, w 1179*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1180*c0909341SAndroid Build Coastguard Worker %define dstm dword [esp+calloff+16*0+4*6] 1181*c0909341SAndroid Build Coastguard Worker %define stridemp dword [esp+calloff+16*0+4*7] 1182*c0909341SAndroid Build Coastguard Worker %define leftm dword [esp+calloff+16*3+4*0] 1183*c0909341SAndroid Build Coastguard Worker %define lpfm dword [esp+calloff+16*3+4*1] 1184*c0909341SAndroid Build Coastguard Worker %define w0m dword [esp+calloff+16*3+4*2] 1185*c0909341SAndroid Build Coastguard Worker %define hd dword [esp+calloff+16*3+4*3] 1186*c0909341SAndroid Build Coastguard Worker %define edgeb byte [esp+calloff+16*3+4*4] 1187*c0909341SAndroid Build Coastguard Worker %define edged dword [esp+calloff+16*3+4*4] 1188*c0909341SAndroid Build Coastguard Worker %define leftmp leftm 1189*c0909341SAndroid Build Coastguard Worker %else 1190*c0909341SAndroid Build Coastguard Worker %define w0m wm 1191*c0909341SAndroid Build Coastguard Worker %define hd dword r5m 1192*c0909341SAndroid Build Coastguard Worker %define edgeb byte r7m 1193*c0909341SAndroid Build Coastguard Worker %define edged dword r7m 1194*c0909341SAndroid Build Coastguard Worker %endif 1195*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0] 1196*c0909341SAndroid Build Coastguard Worker %define w1m dword [esp+calloff+4*1] 1197*c0909341SAndroid Build Coastguard Worker %define t0m dword [esp+calloff+4*2] 1198*c0909341SAndroid Build Coastguard Worker %define t2m dword [esp+calloff+4*3] 1199*c0909341SAndroid Build Coastguard Worker %define t3m dword [esp+calloff+4*4] 1200*c0909341SAndroid Build Coastguard Worker %define t4m dword [esp+calloff+4*5] 1201*c0909341SAndroid Build Coastguard Worker %define m8 [base+pb_1] 1202*c0909341SAndroid Build Coastguard Worker %define m9 [esp+calloff+16*2] 1203*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0xf00800a4] 
1204*c0909341SAndroid Build Coastguard Worker %define m11 [base+sgr_lshuf5] 1205*c0909341SAndroid Build Coastguard Worker %define m12 [base+pd_34816] 1206*c0909341SAndroid Build Coastguard Worker %define m13 [base+pb_0to15] 1207*c0909341SAndroid Build Coastguard Worker %define r10 r4 1208*c0909341SAndroid Build Coastguard Worker %define base r6-$$ 1209*c0909341SAndroid Build Coastguard Worker %assign calloff 0 1210*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1211*c0909341SAndroid Build Coastguard Worker mov strideq, [rstk+stack_offset+ 8] 1212*c0909341SAndroid Build Coastguard Worker mov leftq, [rstk+stack_offset+12] 1213*c0909341SAndroid Build Coastguard Worker mov lpfq, [rstk+stack_offset+16] 1214*c0909341SAndroid Build Coastguard Worker mov wd, [rstk+stack_offset+20] 1215*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1216*c0909341SAndroid Build Coastguard Worker mov stridemp, strideq 1217*c0909341SAndroid Build Coastguard Worker mov leftm, leftq 1218*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+24] 1219*c0909341SAndroid Build Coastguard Worker mov r2, [rstk+stack_offset+32] 1220*c0909341SAndroid Build Coastguard Worker mov lpfm, lpfq 1221*c0909341SAndroid Build Coastguard Worker mov hd, r1 1222*c0909341SAndroid Build Coastguard Worker mov edged, r2 1223*c0909341SAndroid Build Coastguard Worker %endif 1224*c0909341SAndroid Build Coastguard Worker%else 1225*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 7, 9, 11, 12 1226*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \ 1227*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1228*c0909341SAndroid Build Coastguard Worker%endif 1229*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1230*c0909341SAndroid Build Coastguard Worker mov wd, wm 1231*c0909341SAndroid Build Coastguard Worker%endif 1232*c0909341SAndroid Build Coastguard Worker%if 
ARCH_X86_64 1233*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1234*c0909341SAndroid Build Coastguard Worker lea r13, [sgr_x_by_x-0xf03] 1235*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1236*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1237*c0909341SAndroid Build Coastguard Worker movu m9, [paramsq] 1238*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1239*c0909341SAndroid Build Coastguard Worker mova m8, [pb_1] 1240*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+20] 1241*c0909341SAndroid Build Coastguard Worker mova m10, [pd_0xf00800a4] 1242*c0909341SAndroid Build Coastguard Worker add dstq, wq 1243*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+400*12+16] 1244*c0909341SAndroid Build Coastguard Worker mova m12, [pd_34816] ; (1 << 11) + (1 << 15) 1245*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+wq*2+400*20+16] 1246*c0909341SAndroid Build Coastguard Worker pshufhw m7, m9, q0000 1247*c0909341SAndroid Build Coastguard Worker pshufb m9, [pw_256] ; s0 1248*c0909341SAndroid Build Coastguard Worker punpckhqdq m7, m7 ; w0 1249*c0909341SAndroid Build Coastguard Worker neg wq 1250*c0909341SAndroid Build Coastguard Worker mova m13, [pb_0to15] 1251*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1252*c0909341SAndroid Build Coastguard Worker mova m11, [sgr_lshuf5] 1253*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1254*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1255*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp] 1256*c0909341SAndroid Build Coastguard Worker%else 1257*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+28] ; params 1258*c0909341SAndroid Build Coastguard Worker LEA r6, $$ 1259*c0909341SAndroid Build Coastguard Worker movu m1, [r1] 1260*c0909341SAndroid Build Coastguard Worker add lpfm, wq 1261*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+extra_stack+wq*2+20] 1262*c0909341SAndroid 
Build Coastguard Worker add dstq, wq 1263*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+extra_stack+wq*4+400*12+16] 1264*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1265*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+extra_stack+wq*2+400*20+16] 1266*c0909341SAndroid Build Coastguard Worker mov t3m, t3 1267*c0909341SAndroid Build Coastguard Worker pshufhw m7, m1, q0000 1268*c0909341SAndroid Build Coastguard Worker mov t4m, t4 1269*c0909341SAndroid Build Coastguard Worker pshufb m1, [base+pw_256] ; s0 1270*c0909341SAndroid Build Coastguard Worker punpckhqdq m7, m7 ; w0 1271*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1272*c0909341SAndroid Build Coastguard Worker neg wq 1273*c0909341SAndroid Build Coastguard Worker mova m9, m1 1274*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1275*c0909341SAndroid Build Coastguard Worker mov w1m, wd 1276*c0909341SAndroid Build Coastguard Worker sub wd, 2 1277*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1278*c0909341SAndroid Build Coastguard Worker mov w0m, wd 1279*c0909341SAndroid Build Coastguard Worker %define strideq r5 1280*c0909341SAndroid Build Coastguard Worker%endif 1281*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1282*c0909341SAndroid Build Coastguard Worker jz .no_top 1283*c0909341SAndroid Build Coastguard Worker call .h_top 1284*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1285*c0909341SAndroid Build Coastguard Worker movif32 t2m, t1 1286*c0909341SAndroid Build Coastguard Worker mov t2, t1 1287*c0909341SAndroid Build Coastguard Worker call .top_fixup 1288*c0909341SAndroid Build Coastguard Worker add t1, 400*6 1289*c0909341SAndroid Build Coastguard Worker call .h_top 1290*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1291*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1292*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1293*c0909341SAndroid Build Coastguard Worker add r10, strideq 
1294*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 ; below 1295*c0909341SAndroid Build Coastguard Worker movif32 t0m, t2 1296*c0909341SAndroid Build Coastguard Worker mov t0, t2 1297*c0909341SAndroid Build Coastguard Worker dec hd 1298*c0909341SAndroid Build Coastguard Worker jz .height1 1299*c0909341SAndroid Build Coastguard Worker or edged, 16 1300*c0909341SAndroid Build Coastguard Worker call .h 1301*c0909341SAndroid Build Coastguard Worker.main: 1302*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1303*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1304*c0909341SAndroid Build Coastguard Worker call .hv 1305*c0909341SAndroid Build Coastguard Worker call .prep_n 1306*c0909341SAndroid Build Coastguard Worker sub hd, 2 1307*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 1308*c0909341SAndroid Build Coastguard Worker.main_loop: 1309*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1310*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1311*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1312*c0909341SAndroid Build Coastguard Worker test hb, hb 1313*c0909341SAndroid Build Coastguard Worker%else 1314*c0909341SAndroid Build Coastguard Worker mov r4, hd 1315*c0909341SAndroid Build Coastguard Worker test r4, r4 1316*c0909341SAndroid Build Coastguard Worker%endif 1317*c0909341SAndroid Build Coastguard Worker jz .odd_height 1318*c0909341SAndroid Build Coastguard Worker call .h 1319*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1320*c0909341SAndroid Build Coastguard Worker call .hv 1321*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1322*c0909341SAndroid Build Coastguard Worker call .n0 1323*c0909341SAndroid Build Coastguard Worker call .n1 1324*c0909341SAndroid Build Coastguard Worker sub hd, 2 1325*c0909341SAndroid Build Coastguard Worker movif32 t0, t0m 1326*c0909341SAndroid Build Coastguard Worker jge .main_loop 1327*c0909341SAndroid Build Coastguard Worker test edgeb, 8 
; LR_HAVE_BOTTOM 1328*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1329*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1330*c0909341SAndroid Build Coastguard Worker call .h_top 1331*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1332*c0909341SAndroid Build Coastguard Worker call .hv_bottom 1333*c0909341SAndroid Build Coastguard Worker.end: 1334*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1335*c0909341SAndroid Build Coastguard Worker call .n0 1336*c0909341SAndroid Build Coastguard Worker call .n1 1337*c0909341SAndroid Build Coastguard Worker.end2: 1338*c0909341SAndroid Build Coastguard Worker RET 1339*c0909341SAndroid Build Coastguard Worker.height1: 1340*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1341*c0909341SAndroid Build Coastguard Worker call .hv 1342*c0909341SAndroid Build Coastguard Worker call .prep_n 1343*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1344*c0909341SAndroid Build Coastguard Worker.odd_height: 1345*c0909341SAndroid Build Coastguard Worker call .hv 1346*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1347*c0909341SAndroid Build Coastguard Worker call .n0 1348*c0909341SAndroid Build Coastguard Worker call .n1 1349*c0909341SAndroid Build Coastguard Worker.odd_height_end: 1350*c0909341SAndroid Build Coastguard Worker call .v 1351*c0909341SAndroid Build Coastguard Worker movif32 dstq, dstm 1352*c0909341SAndroid Build Coastguard Worker call .n0 1353*c0909341SAndroid Build Coastguard Worker jmp .end2 1354*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1355*c0909341SAndroid Build Coastguard Worker call .v 1356*c0909341SAndroid Build Coastguard Worker jmp .end 1357*c0909341SAndroid Build Coastguard Worker.no_top: 1358*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1359*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1360*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1361*c0909341SAndroid Build Coastguard 
Worker lea r10, [r10+strideq*2] 1362*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 1363*c0909341SAndroid Build Coastguard Worker call .h 1364*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*6] 1365*c0909341SAndroid Build Coastguard Worker movif32 t2m, t2 1366*c0909341SAndroid Build Coastguard Worker call .top_fixup 1367*c0909341SAndroid Build Coastguard Worker dec hd 1368*c0909341SAndroid Build Coastguard Worker jz .no_top_height1 1369*c0909341SAndroid Build Coastguard Worker or edged, 16 1370*c0909341SAndroid Build Coastguard Worker mov t0, t1 1371*c0909341SAndroid Build Coastguard Worker mov t1, t2 1372*c0909341SAndroid Build Coastguard Worker movif32 t0m, t0 1373*c0909341SAndroid Build Coastguard Worker jmp .main 1374*c0909341SAndroid Build Coastguard Worker.no_top_height1: 1375*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 1376*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1377*c0909341SAndroid Build Coastguard Worker call .v 1378*c0909341SAndroid Build Coastguard Worker call .prep_n 1379*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1380*c0909341SAndroid Build Coastguard Worker.extend_right: 1381*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8 1382*c0909341SAndroid Build Coastguard Worker%assign calloff 8 1383*c0909341SAndroid Build Coastguard Worker movd m1, wd 1384*c0909341SAndroid Build Coastguard Worker movd m3, [lpfq-1] 1385*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 1386*c0909341SAndroid Build Coastguard Worker pshufb m3, m6 1387*c0909341SAndroid Build Coastguard Worker psubb m2, m8, m1 1388*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m13 1389*c0909341SAndroid Build Coastguard Worker pand m5, m2 1390*c0909341SAndroid Build Coastguard Worker pandn m2, m3 1391*c0909341SAndroid Build Coastguard Worker por m5, m2 1392*c0909341SAndroid Build Coastguard Worker ret 1393*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4 
1394*c0909341SAndroid Build Coastguard Worker%assign calloff 4 1395*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 1396*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1397*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 1398*c0909341SAndroid Build Coastguard Worker%else 1399*c0909341SAndroid Build Coastguard Worker %define leftq r4 1400*c0909341SAndroid Build Coastguard Worker%endif 1401*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1402*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1403*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 1404*c0909341SAndroid Build Coastguard Worker movddup m4, [leftq-4] 1405*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1406*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 1407*c0909341SAndroid Build Coastguard Worker add leftmp, 4 1408*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 13 1409*c0909341SAndroid Build Coastguard Worker jmp .h_main 1410*c0909341SAndroid Build Coastguard Worker.h_extend_left: 1411*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1412*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 1413*c0909341SAndroid Build Coastguard Worker pshufb m5, m11 1414*c0909341SAndroid Build Coastguard Worker jmp .h_main 1415*c0909341SAndroid Build Coastguard Worker.h_top: 1416*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1417*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 1418*c0909341SAndroid Build Coastguard Worker%endif 1419*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1420*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1421*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1422*c0909341SAndroid Build Coastguard Worker.h_loop: 1423*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq-1] 1424*c0909341SAndroid Build Coastguard Worker.h_main: 1425*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 
1426*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 1427*c0909341SAndroid Build Coastguard Worker cmp wd, -10 1428*c0909341SAndroid Build Coastguard Worker jl .h_have_right 1429*c0909341SAndroid Build Coastguard Worker call .extend_right 1430*c0909341SAndroid Build Coastguard Worker.h_have_right: 1431*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m6 1432*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 1433*c0909341SAndroid Build Coastguard Worker palignr m2, m5, m4, 2 1434*c0909341SAndroid Build Coastguard Worker paddw m0, m4, m2 1435*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 6 1436*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1437*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 1438*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1439*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 1440*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1441*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 1442*c0909341SAndroid Build Coastguard Worker paddw m0, m5 1443*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 1444*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1445*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1446*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m5 1447*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1448*c0909341SAndroid Build Coastguard Worker shufps m4, m5, q2121 1449*c0909341SAndroid Build Coastguard Worker paddw m0, m4 ; sum 1450*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m4, m6 1451*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1452*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m6 1453*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1454*c0909341SAndroid Build Coastguard Worker paddd m2, m3 1455*c0909341SAndroid Build Coastguard Worker test edgeb, 16 ; y > 0 1456*c0909341SAndroid Build Coastguard Worker jz .h_loop_end 1457*c0909341SAndroid Build Coastguard 
Worker paddw m0, [t1+wq*2+400*0] 1458*c0909341SAndroid Build Coastguard Worker paddd m1, [t1+wq*2+400*2] 1459*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+wq*2+400*4] 1460*c0909341SAndroid Build Coastguard Worker.h_loop_end: 1461*c0909341SAndroid Build Coastguard Worker paddd m1, m5 ; sumsq 1462*c0909341SAndroid Build Coastguard Worker paddd m2, m4 1463*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*0], m0 1464*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*2], m1 1465*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*4], m2 1466*c0909341SAndroid Build Coastguard Worker add wq, 8 1467*c0909341SAndroid Build Coastguard Worker jl .h_loop 1468*c0909341SAndroid Build Coastguard Worker ret 1469*c0909341SAndroid Build Coastguard Worker.top_fixup: 1470*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1471*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 1472*c0909341SAndroid Build Coastguard Worker%else 1473*c0909341SAndroid Build Coastguard Worker mov wd, w0m 1474*c0909341SAndroid Build Coastguard Worker%endif 1475*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row needs to be doubled 1476*c0909341SAndroid Build Coastguard Worker mova m0, [t1+wq*2+400*0] 1477*c0909341SAndroid Build Coastguard Worker mova m1, [t1+wq*2+400*2] 1478*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq*2+400*4] 1479*c0909341SAndroid Build Coastguard Worker paddw m0, m0 1480*c0909341SAndroid Build Coastguard Worker paddd m1, m1 1481*c0909341SAndroid Build Coastguard Worker paddd m2, m2 1482*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*0], m0 1483*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*2], m1 1484*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*4], m2 1485*c0909341SAndroid Build Coastguard Worker add wq, 8 1486*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 1487*c0909341SAndroid Build Coastguard Worker ret 1488*c0909341SAndroid Build Coastguard 
WorkerALIGN function_align 1489*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab 1490*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1491*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 1492*c0909341SAndroid Build Coastguard Worker%else 1493*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 1494*c0909341SAndroid Build Coastguard Worker%endif 1495*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1496*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 1497*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 1498*c0909341SAndroid Build Coastguard Worker movddup m4, [leftq-4] 1499*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1500*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 1501*c0909341SAndroid Build Coastguard Worker add leftmp, 4 1502*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 13 1503*c0909341SAndroid Build Coastguard Worker jmp .hv_main 1504*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 1505*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1506*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 1507*c0909341SAndroid Build Coastguard Worker pshufb m5, m11 1508*c0909341SAndroid Build Coastguard Worker jmp .hv_main 1509*c0909341SAndroid Build Coastguard Worker.hv_bottom: 1510*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1511*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 1512*c0909341SAndroid Build Coastguard Worker%else 1513*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 1514*c0909341SAndroid Build Coastguard Worker%endif 1515*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1516*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 1517*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 1518*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 1519*c0909341SAndroid Build Coastguard Worker jmp .hv_loop_start 
1520*c0909341SAndroid Build Coastguard Worker%endif 1521*c0909341SAndroid Build Coastguard Worker.hv_loop: 1522*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1523*c0909341SAndroid Build Coastguard Worker.hv_loop_start: 1524*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq-1] 1525*c0909341SAndroid Build Coastguard Worker.hv_main: 1526*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1527*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 1528*c0909341SAndroid Build Coastguard Worker cmp wd, -10 1529*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 1530*c0909341SAndroid Build Coastguard Worker call .extend_right 1531*c0909341SAndroid Build Coastguard Worker.hv_have_right: 1532*c0909341SAndroid Build Coastguard Worker movif32 t3, hd 1533*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m6 1534*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 1535*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 2 1536*c0909341SAndroid Build Coastguard Worker paddw m0, m4, m3 1537*c0909341SAndroid Build Coastguard Worker palignr m1, m5, m4, 6 1538*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1539*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m1 1540*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1541*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m1 1542*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1543*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 1544*c0909341SAndroid Build Coastguard Worker paddw m0, m5 1545*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4, m5 1546*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1547*c0909341SAndroid Build Coastguard Worker paddd m2, m1 1548*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m4, m5 1549*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1550*c0909341SAndroid Build Coastguard Worker shufps m4, m5, q2121 1551*c0909341SAndroid Build Coastguard Worker 
paddw m0, m4 ; h sum 1552*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m4, m6 1553*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1554*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m6 1555*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1556*c0909341SAndroid Build Coastguard Worker paddd m3, m1 1557*c0909341SAndroid Build Coastguard Worker paddd m2, m5 ; h sumsq 1558*c0909341SAndroid Build Coastguard Worker paddd m3, m4 1559*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t1+wq*2+400*0] 1560*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t1+wq*2+400*2] 1561*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+wq*2+400*4] 1562*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1563*c0909341SAndroid Build Coastguard Worker test hd, hd 1564*c0909341SAndroid Build Coastguard Worker%else 1565*c0909341SAndroid Build Coastguard Worker test t3, t3 1566*c0909341SAndroid Build Coastguard Worker%endif 1567*c0909341SAndroid Build Coastguard Worker jz .hv_last_row 1568*c0909341SAndroid Build Coastguard Worker.hv_main2: 1569*c0909341SAndroid Build Coastguard Worker paddw m1, [t2+wq*2+400*0] ; hv sum 1570*c0909341SAndroid Build Coastguard Worker paddd m4, [t2+wq*2+400*2] ; hv sumsq 1571*c0909341SAndroid Build Coastguard Worker paddd m5, [t2+wq*2+400*4] 1572*c0909341SAndroid Build Coastguard Worker mova [t0+wq*2+400*0], m0 1573*c0909341SAndroid Build Coastguard Worker pslld m0, m4, 4 1574*c0909341SAndroid Build Coastguard Worker mova [t0+wq*2+400*2], m2 1575*c0909341SAndroid Build Coastguard Worker mova [t0+wq*2+400*4], m3 1576*c0909341SAndroid Build Coastguard Worker pslld m2, m4, 3 1577*c0909341SAndroid Build Coastguard Worker paddd m4, m0 1578*c0909341SAndroid Build Coastguard Worker pslld m0, m5, 4 1579*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a * 25 1580*c0909341SAndroid Build Coastguard Worker pslld m2, m5, 3 1581*c0909341SAndroid Build Coastguard Worker paddd m5, m0 1582*c0909341SAndroid Build 
Coastguard Worker paddd m5, m2 1583*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 1584*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1585*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b * b 1586*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 1587*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1588*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1589*c0909341SAndroid Build Coastguard Worker MULLD m4, m9, m2 ; p * s 1590*c0909341SAndroid Build Coastguard Worker MULLD m5, m9, m2 1591*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m10 ; b * 164 1592*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m10 1593*c0909341SAndroid Build Coastguard Worker paddusw m4, m10 1594*c0909341SAndroid Build Coastguard Worker paddusw m5, m10 1595*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; min(z, 255) 1596*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 1597*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1598*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m3, m4, m5, t2, t2m 1599*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3, m3 1600*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m3 1601*c0909341SAndroid Build Coastguard Worker MULLD m0, m4, m2 1602*c0909341SAndroid Build Coastguard Worker MULLD m1, m5, m2 1603*c0909341SAndroid Build Coastguard Worker paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) 1604*c0909341SAndroid Build Coastguard Worker paddd m1, m12 1605*c0909341SAndroid Build Coastguard Worker mova [t4+wq*2+4], m3 1606*c0909341SAndroid Build Coastguard Worker psrld m0, 12 ; b 1607*c0909341SAndroid Build Coastguard Worker psrld m1, 12 1608*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+ 8], m0 1609*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+24], m1 1610*c0909341SAndroid Build Coastguard Worker add wq, 8 1611*c0909341SAndroid Build Coastguard Worker jl .hv_loop 1612*c0909341SAndroid Build Coastguard Worker mov 
t2, t1 1613*c0909341SAndroid Build Coastguard Worker mov t1, t0 1614*c0909341SAndroid Build Coastguard Worker mov t0, t2 1615*c0909341SAndroid Build Coastguard Worker movif32 t2m, t2 1616*c0909341SAndroid Build Coastguard Worker movif32 t0m, t0 1617*c0909341SAndroid Build Coastguard Worker ret 1618*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights 1619*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*0], m1 1620*c0909341SAndroid Build Coastguard Worker paddw m1, m0 1621*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*2], m4 1622*c0909341SAndroid Build Coastguard Worker paddd m4, m2 1623*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*4], m5 1624*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1625*c0909341SAndroid Build Coastguard Worker jmp .hv_main2 1626*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab 1627*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1628*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 1629*c0909341SAndroid Build Coastguard Worker%else 1630*c0909341SAndroid Build Coastguard Worker mov wd, w0m 1631*c0909341SAndroid Build Coastguard Worker%endif 1632*c0909341SAndroid Build Coastguard Worker.v_loop: 1633*c0909341SAndroid Build Coastguard Worker mova m0, [t1+wq*2+400*0] 1634*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq*2+400*2] 1635*c0909341SAndroid Build Coastguard Worker mova m3, [t1+wq*2+400*4] 1636*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+wq*2+400*0] 1637*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t2+wq*2+400*2] 1638*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t2+wq*2+400*4] 1639*c0909341SAndroid Build Coastguard Worker paddw m0, m0 1640*c0909341SAndroid Build Coastguard Worker paddd m2, m2 1641*c0909341SAndroid Build Coastguard Worker paddd m3, m3 1642*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; hv sum 1643*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; hv 
sumsq 1644*c0909341SAndroid Build Coastguard Worker pslld m0, m4, 4 1645*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1646*c0909341SAndroid Build Coastguard Worker pslld m2, m4, 3 1647*c0909341SAndroid Build Coastguard Worker paddd m4, m0 1648*c0909341SAndroid Build Coastguard Worker pslld m0, m5, 4 1649*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a * 25 1650*c0909341SAndroid Build Coastguard Worker pslld m2, m5, 3 1651*c0909341SAndroid Build Coastguard Worker paddd m5, m0 1652*c0909341SAndroid Build Coastguard Worker paddd m5, m2 1653*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 1654*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1655*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b * b 1656*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 1657*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1658*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1659*c0909341SAndroid Build Coastguard Worker MULLD m4, m9, m2 ; p * s 1660*c0909341SAndroid Build Coastguard Worker MULLD m5, m9, m2 1661*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m10 ; b * 164 1662*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m10 1663*c0909341SAndroid Build Coastguard Worker paddusw m4, m10 1664*c0909341SAndroid Build Coastguard Worker paddusw m5, m10 1665*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; min(z, 255) 1666*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1667*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m3, m4, m5, t2, t2m 1668*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3, m3 1669*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m3 1670*c0909341SAndroid Build Coastguard Worker MULLD m0, m4, m2 1671*c0909341SAndroid Build Coastguard Worker MULLD m1, m5, m2 1672*c0909341SAndroid Build Coastguard Worker paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) 1673*c0909341SAndroid Build Coastguard Worker paddd m1, m12 1674*c0909341SAndroid Build 
Coastguard Worker mova [t4+wq*2+4], m3 1675*c0909341SAndroid Build Coastguard Worker psrld m0, 12 ; b 1676*c0909341SAndroid Build Coastguard Worker psrld m1, 12 1677*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+ 8], m0 1678*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+24], m1 1679*c0909341SAndroid Build Coastguard Worker add wq, 8 1680*c0909341SAndroid Build Coastguard Worker jl .v_loop 1681*c0909341SAndroid Build Coastguard Worker ret 1682*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup 1683*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 1684*c0909341SAndroid Build Coastguard Worker movif32 wd, w1m 1685*c0909341SAndroid Build Coastguard Worker.prep_n_loop: 1686*c0909341SAndroid Build Coastguard Worker movu m0, [t4+wq*2+ 2] 1687*c0909341SAndroid Build Coastguard Worker movu m3, [t4+wq*2+ 4] 1688*c0909341SAndroid Build Coastguard Worker movu m1, [t3+wq*4+ 4] 1689*c0909341SAndroid Build Coastguard Worker movu m4, [t3+wq*4+ 8] 1690*c0909341SAndroid Build Coastguard Worker movu m2, [t3+wq*4+20] 1691*c0909341SAndroid Build Coastguard Worker movu m5, [t3+wq*4+24] 1692*c0909341SAndroid Build Coastguard Worker paddw m3, m0 1693*c0909341SAndroid Build Coastguard Worker paddd m4, m1 1694*c0909341SAndroid Build Coastguard Worker paddd m5, m2 1695*c0909341SAndroid Build Coastguard Worker paddw m3, [t4+wq*2+ 0] 1696*c0909341SAndroid Build Coastguard Worker paddd m4, [t3+wq*4+ 0] 1697*c0909341SAndroid Build Coastguard Worker paddd m5, [t3+wq*4+16] 1698*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1699*c0909341SAndroid Build Coastguard Worker psllw m3, 2 1700*c0909341SAndroid Build Coastguard Worker paddd m1, m4 1701*c0909341SAndroid Build Coastguard Worker pslld m4, 2 1702*c0909341SAndroid Build Coastguard Worker paddd m2, m5 1703*c0909341SAndroid Build Coastguard Worker pslld m5, 2 1704*c0909341SAndroid Build Coastguard Worker paddw m0, m3 ; a 565 1705*c0909341SAndroid Build Coastguard Worker paddd m1, m4 ; b 565 
1706*c0909341SAndroid Build Coastguard Worker paddd m2, m5 1707*c0909341SAndroid Build Coastguard Worker mova [t4+wq*2+400*2+ 0], m0 1708*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*4+ 0], m1 1709*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*4+16], m2 1710*c0909341SAndroid Build Coastguard Worker add wq, 8 1711*c0909341SAndroid Build Coastguard Worker jl .prep_n_loop 1712*c0909341SAndroid Build Coastguard Worker ret 1713*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1714*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows) 1715*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 1716*c0909341SAndroid Build Coastguard Worker movif32 wd, w1m 1717*c0909341SAndroid Build Coastguard Worker.n0_loop: 1718*c0909341SAndroid Build Coastguard Worker movu m0, [t4+wq*2+ 2] 1719*c0909341SAndroid Build Coastguard Worker movu m3, [t4+wq*2+ 4] 1720*c0909341SAndroid Build Coastguard Worker movu m1, [t3+wq*4+ 4] 1721*c0909341SAndroid Build Coastguard Worker movu m4, [t3+wq*4+ 8] 1722*c0909341SAndroid Build Coastguard Worker movu m2, [t3+wq*4+20] 1723*c0909341SAndroid Build Coastguard Worker movu m5, [t3+wq*4+24] 1724*c0909341SAndroid Build Coastguard Worker paddw m3, m0 1725*c0909341SAndroid Build Coastguard Worker paddd m4, m1 1726*c0909341SAndroid Build Coastguard Worker paddd m5, m2 1727*c0909341SAndroid Build Coastguard Worker paddw m3, [t4+wq*2+ 0] 1728*c0909341SAndroid Build Coastguard Worker paddd m4, [t3+wq*4+ 0] 1729*c0909341SAndroid Build Coastguard Worker paddd m5, [t3+wq*4+16] 1730*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1731*c0909341SAndroid Build Coastguard Worker psllw m3, 2 1732*c0909341SAndroid Build Coastguard Worker paddd m1, m4 1733*c0909341SAndroid Build Coastguard Worker pslld m4, 2 1734*c0909341SAndroid Build Coastguard Worker paddd m2, m5 1735*c0909341SAndroid Build Coastguard Worker pslld m5, 2 1736*c0909341SAndroid Build Coastguard Worker paddw m0, m3 ; a 565 
1737*c0909341SAndroid Build Coastguard Worker paddd m1, m4 ; b 565 1738*c0909341SAndroid Build Coastguard Worker paddd m2, m5 1739*c0909341SAndroid Build Coastguard Worker paddw m3, m0, [t4+wq*2+400*2+ 0] 1740*c0909341SAndroid Build Coastguard Worker paddd m4, m1, [t3+wq*4+400*4+ 0] 1741*c0909341SAndroid Build Coastguard Worker paddd m5, m2, [t3+wq*4+400*4+16] 1742*c0909341SAndroid Build Coastguard Worker mova [t4+wq*2+400*2+ 0], m0 1743*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*4+ 0], m1 1744*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*4+16], m2 1745*c0909341SAndroid Build Coastguard Worker movq m0, [dstq+wq] 1746*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m6 1747*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m6 ; src 1748*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 ; a 1749*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1 ; a * src 1750*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m6 1751*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1752*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1 1753*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; b - a * src + (1 << 8) 1754*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1755*c0909341SAndroid Build Coastguard Worker psrad m4, 9 1756*c0909341SAndroid Build Coastguard Worker psrad m5, 9 1757*c0909341SAndroid Build Coastguard Worker packssdw m4, m5 1758*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 1759*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1760*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1761*c0909341SAndroid Build Coastguard Worker movq [dstq+wq], m0 1762*c0909341SAndroid Build Coastguard Worker add wq, 8 1763*c0909341SAndroid Build Coastguard Worker jl .n0_loop 1764*c0909341SAndroid Build Coastguard Worker add dstq, stridemp 1765*c0909341SAndroid Build Coastguard Worker ret 1766*c0909341SAndroid Build Coastguard WorkerALIGN function_align 
1767*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows) 1768*c0909341SAndroid Build Coastguard Worker movif64 wq, r4 1769*c0909341SAndroid Build Coastguard Worker movif32 wd, w1m 1770*c0909341SAndroid Build Coastguard Worker.n1_loop: 1771*c0909341SAndroid Build Coastguard Worker movq m0, [dstq+wq] 1772*c0909341SAndroid Build Coastguard Worker mova m3, [t4+wq*2+400*2+ 0] 1773*c0909341SAndroid Build Coastguard Worker mova m4, [t3+wq*4+400*4+ 0] 1774*c0909341SAndroid Build Coastguard Worker mova m5, [t3+wq*4+400*4+16] 1775*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m6 1776*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m6 ; src 1777*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 ; a 1778*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1 ; a * src 1779*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m6 1780*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1781*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1 1782*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; b - a * src + (1 << 7) 1783*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1784*c0909341SAndroid Build Coastguard Worker psrad m4, 8 1785*c0909341SAndroid Build Coastguard Worker psrad m5, 8 1786*c0909341SAndroid Build Coastguard Worker packssdw m4, m5 1787*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m7 1788*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1789*c0909341SAndroid Build Coastguard Worker packuswb m0, m0 1790*c0909341SAndroid Build Coastguard Worker movq [dstq+wq], m0 1791*c0909341SAndroid Build Coastguard Worker add wq, 8 1792*c0909341SAndroid Build Coastguard Worker jl .n1_loop 1793*c0909341SAndroid Build Coastguard Worker add dstq, stridemp 1794*c0909341SAndroid Build Coastguard Worker movif32 dstm, dstq 1795*c0909341SAndroid Build Coastguard Worker ret 1796*c0909341SAndroid Build Coastguard Worker 1797*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 
1798*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1799*c0909341SAndroid Build Coastguard Worker %assign extra_stack 4*16 1800*c0909341SAndroid Build Coastguard Worker %else 1801*c0909341SAndroid Build Coastguard Worker %assign extra_stack 2*16 1802*c0909341SAndroid Build Coastguard Worker %endif 1803*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \ 1804*c0909341SAndroid Build Coastguard Worker dst, stride, left, lpf, w 1805*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1806*c0909341SAndroid Build Coastguard Worker %define dstm dword [esp+calloff+16*2+4*0] 1807*c0909341SAndroid Build Coastguard Worker %define stridemp dword [esp+calloff+16*2+4*1] 1808*c0909341SAndroid Build Coastguard Worker %define leftm dword [esp+calloff+16*2+4*2] 1809*c0909341SAndroid Build Coastguard Worker %define lpfm dword [esp+calloff+16*2+4*3] 1810*c0909341SAndroid Build Coastguard Worker %define w0m dword [esp+calloff+16*2+4*4] 1811*c0909341SAndroid Build Coastguard Worker %define hd dword [esp+calloff+16*2+4*5] 1812*c0909341SAndroid Build Coastguard Worker %define edgeb byte [esp+calloff+16*2+4*6] 1813*c0909341SAndroid Build Coastguard Worker %define edged dword [esp+calloff+16*2+4*6] 1814*c0909341SAndroid Build Coastguard Worker %define leftmp leftm 1815*c0909341SAndroid Build Coastguard Worker %else 1816*c0909341SAndroid Build Coastguard Worker %define w0m wm 1817*c0909341SAndroid Build Coastguard Worker %define hd dword r5m 1818*c0909341SAndroid Build Coastguard Worker %define edgeb byte r7m 1819*c0909341SAndroid Build Coastguard Worker %define edged dword r7m 1820*c0909341SAndroid Build Coastguard Worker %endif 1821*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0] 1822*c0909341SAndroid Build Coastguard Worker %define w1m dword [esp+calloff+4*1] 1823*c0909341SAndroid Build Coastguard Worker %define t3m dword [esp+calloff+4*2] 1824*c0909341SAndroid Build 
Coastguard Worker %define t4m dword [esp+calloff+4*3] 1825*c0909341SAndroid Build Coastguard Worker %define m8 [base+pb_0to15] 1826*c0909341SAndroid Build Coastguard Worker %define m9 [esp+calloff+16*1] 1827*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0xf00801c7] 1828*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_34816] 1829*c0909341SAndroid Build Coastguard Worker %define m12 m6 1830*c0909341SAndroid Build Coastguard Worker %define m13 [base+sgr_lshuf3] 1831*c0909341SAndroid Build Coastguard Worker %define base r6-$$ 1832*c0909341SAndroid Build Coastguard Worker %assign calloff 0 1833*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 1834*c0909341SAndroid Build Coastguard Worker mov strideq, [rstk+stack_offset+ 8] 1835*c0909341SAndroid Build Coastguard Worker mov leftq, [rstk+stack_offset+12] 1836*c0909341SAndroid Build Coastguard Worker mov lpfq, [rstk+stack_offset+16] 1837*c0909341SAndroid Build Coastguard Worker mov wd, [rstk+stack_offset+20] 1838*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1839*c0909341SAndroid Build Coastguard Worker mov stridemp, strideq 1840*c0909341SAndroid Build Coastguard Worker mov leftm, leftq 1841*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+24] 1842*c0909341SAndroid Build Coastguard Worker mov r2, [rstk+stack_offset+32] 1843*c0909341SAndroid Build Coastguard Worker mov lpfm, lpfq 1844*c0909341SAndroid Build Coastguard Worker mov hd, r1 1845*c0909341SAndroid Build Coastguard Worker mov edged, r2 1846*c0909341SAndroid Build Coastguard Worker %endif 1847*c0909341SAndroid Build Coastguard Worker%else 1848*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \ 1849*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1850*c0909341SAndroid Build Coastguard Worker%endif 1851*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1852*c0909341SAndroid Build 
Coastguard Worker mov wd, wm 1853*c0909341SAndroid Build Coastguard Worker%endif 1854*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1855*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1856*c0909341SAndroid Build Coastguard Worker lea r13, [sgr_x_by_x-0xf03] 1857*c0909341SAndroid Build Coastguard Worker mov hd, hm 1858*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1859*c0909341SAndroid Build Coastguard Worker movq m9, [paramsq+4] 1860*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1861*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+12] 1862*c0909341SAndroid Build Coastguard Worker mova m8, [pb_0to15] 1863*c0909341SAndroid Build Coastguard Worker add dstq, wq 1864*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+400*12+8] 1865*c0909341SAndroid Build Coastguard Worker mova m10, [pd_0xf00801c7] 1866*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+wq*2+400*32+8] 1867*c0909341SAndroid Build Coastguard Worker mova m11, [pd_34816] 1868*c0909341SAndroid Build Coastguard Worker pshuflw m7, m9, q3333 1869*c0909341SAndroid Build Coastguard Worker pshufb m9, [pw_256] ; s1 1870*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m7 ; w1 1871*c0909341SAndroid Build Coastguard Worker neg wq 1872*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1873*c0909341SAndroid Build Coastguard Worker mova m13, [sgr_lshuf3] 1874*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1875*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1876*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp] 1877*c0909341SAndroid Build Coastguard Worker%else 1878*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+28] ; params 1879*c0909341SAndroid Build Coastguard Worker LEA r6, $$ 1880*c0909341SAndroid Build Coastguard Worker movq m1, [r1+4] 1881*c0909341SAndroid Build Coastguard Worker add lpfm, wq 1882*c0909341SAndroid Build Coastguard Worker lea t1, 
[rsp+extra_stack+wq*2+20] 1883*c0909341SAndroid Build Coastguard Worker add dstq, wq 1884*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+extra_stack+wq*4+400*12+16] 1885*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 1886*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+extra_stack+wq*2+400*32+16] 1887*c0909341SAndroid Build Coastguard Worker mov t3m, t3 1888*c0909341SAndroid Build Coastguard Worker pshuflw m7, m1, q3333 1889*c0909341SAndroid Build Coastguard Worker mov t4m, t4 1890*c0909341SAndroid Build Coastguard Worker pshufb m1, [base+pw_256] ; s1 1891*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m7 ; w1 1892*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1893*c0909341SAndroid Build Coastguard Worker neg wq 1894*c0909341SAndroid Build Coastguard Worker mova m9, m1 1895*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1896*c0909341SAndroid Build Coastguard Worker mov w1m, wd 1897*c0909341SAndroid Build Coastguard Worker sub wd, 2 1898*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1899*c0909341SAndroid Build Coastguard Worker mov w0m, wd 1900*c0909341SAndroid Build Coastguard Worker %define strideq r5 1901*c0909341SAndroid Build Coastguard Worker%endif 1902*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1903*c0909341SAndroid Build Coastguard Worker jz .no_top 1904*c0909341SAndroid Build Coastguard Worker call .h_top 1905*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1906*c0909341SAndroid Build Coastguard Worker mov t2, t1 1907*c0909341SAndroid Build Coastguard Worker add t1, 400*6 1908*c0909341SAndroid Build Coastguard Worker call .h_top 1909*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1910*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1911*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1912*c0909341SAndroid Build Coastguard Worker add r10, strideq 1913*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 ; below 
1914*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1915*c0909341SAndroid Build Coastguard Worker call .hv0 1916*c0909341SAndroid Build Coastguard Worker.main: 1917*c0909341SAndroid Build Coastguard Worker dec hd 1918*c0909341SAndroid Build Coastguard Worker jz .height1 1919*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1920*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1921*c0909341SAndroid Build Coastguard Worker call .hv1 1922*c0909341SAndroid Build Coastguard Worker call .prep_n 1923*c0909341SAndroid Build Coastguard Worker sub hd, 2 1924*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 1925*c0909341SAndroid Build Coastguard Worker.main_loop: 1926*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1927*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1928*c0909341SAndroid Build Coastguard Worker call .hv0 1929*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1930*c0909341SAndroid Build Coastguard Worker test hb, hb 1931*c0909341SAndroid Build Coastguard Worker%else 1932*c0909341SAndroid Build Coastguard Worker mov r4, hd 1933*c0909341SAndroid Build Coastguard Worker test r4, r4 1934*c0909341SAndroid Build Coastguard Worker%endif 1935*c0909341SAndroid Build Coastguard Worker jz .odd_height 1936*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 1937*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1938*c0909341SAndroid Build Coastguard Worker call .hv1 1939*c0909341SAndroid Build Coastguard Worker call .n0 1940*c0909341SAndroid Build Coastguard Worker call .n1 1941*c0909341SAndroid Build Coastguard Worker sub hd, 2 1942*c0909341SAndroid Build Coastguard Worker jge .main_loop 1943*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 1944*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1945*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 1946*c0909341SAndroid Build Coastguard Worker call .hv0_bottom 1947*c0909341SAndroid Build 
Coastguard Worker movif32 lpfq, hvsrcm 1948*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 1949*c0909341SAndroid Build Coastguard Worker call .hv1_bottom 1950*c0909341SAndroid Build Coastguard Worker.end: 1951*c0909341SAndroid Build Coastguard Worker call .n0 1952*c0909341SAndroid Build Coastguard Worker call .n1 1953*c0909341SAndroid Build Coastguard Worker.end2: 1954*c0909341SAndroid Build Coastguard Worker RET 1955*c0909341SAndroid Build Coastguard Worker.height1: 1956*c0909341SAndroid Build Coastguard Worker call .v1 1957*c0909341SAndroid Build Coastguard Worker call .prep_n 1958*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1959*c0909341SAndroid Build Coastguard Worker.odd_height: 1960*c0909341SAndroid Build Coastguard Worker call .v1 1961*c0909341SAndroid Build Coastguard Worker call .n0 1962*c0909341SAndroid Build Coastguard Worker call .n1 1963*c0909341SAndroid Build Coastguard Worker.odd_height_end: 1964*c0909341SAndroid Build Coastguard Worker call .v0 1965*c0909341SAndroid Build Coastguard Worker call .v1 1966*c0909341SAndroid Build Coastguard Worker call .n0 1967*c0909341SAndroid Build Coastguard Worker jmp .end2 1968*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1969*c0909341SAndroid Build Coastguard Worker call .v0 1970*c0909341SAndroid Build Coastguard Worker call .v1 1971*c0909341SAndroid Build Coastguard Worker jmp .end 1972*c0909341SAndroid Build Coastguard Worker.no_top: 1973*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 1974*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1975*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1976*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 1977*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 1978*c0909341SAndroid Build Coastguard Worker call .h 1979*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 1980*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 1981*c0909341SAndroid Build 
Coastguard Worker%else 1982*c0909341SAndroid Build Coastguard Worker mov wq, w0m 1983*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 1984*c0909341SAndroid Build Coastguard Worker%endif 1985*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*6] 1986*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: 1987*c0909341SAndroid Build Coastguard Worker mova m0, [t1+wq*2+400*0] 1988*c0909341SAndroid Build Coastguard Worker mova m1, [t1+wq*2+400*2] 1989*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq*2+400*4] 1990*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*0], m0 1991*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*2], m1 1992*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*4], m2 1993*c0909341SAndroid Build Coastguard Worker add wq, 8 1994*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 1995*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 1996*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 1997*c0909341SAndroid Build Coastguard Worker call .v0 1998*c0909341SAndroid Build Coastguard Worker jmp .main 1999*c0909341SAndroid Build Coastguard Worker.extend_right: 2000*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8 2001*c0909341SAndroid Build Coastguard Worker%assign calloff 8 2002*c0909341SAndroid Build Coastguard Worker movd m0, [lpfq-1] 2003*c0909341SAndroid Build Coastguard Worker movd m1, wd 2004*c0909341SAndroid Build Coastguard Worker mova m3, m8 2005*c0909341SAndroid Build Coastguard Worker pshufb m0, m6 2006*c0909341SAndroid Build Coastguard Worker pshufb m1, m6 2007*c0909341SAndroid Build Coastguard Worker mova m2, m6 2008*c0909341SAndroid Build Coastguard Worker psubb m2, m1 2009*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, m3 2010*c0909341SAndroid Build Coastguard Worker pand m5, m2 2011*c0909341SAndroid Build Coastguard Worker pandn m2, m0 2012*c0909341SAndroid Build Coastguard Worker por m5, m2 2013*c0909341SAndroid Build Coastguard 
Worker ret 2014*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4 2015*c0909341SAndroid Build Coastguard Worker%assign calloff 4 2016*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 2017*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2018*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2019*c0909341SAndroid Build Coastguard Worker%else 2020*c0909341SAndroid Build Coastguard Worker %define leftq r4 2021*c0909341SAndroid Build Coastguard Worker%endif 2022*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2023*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 2024*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 2025*c0909341SAndroid Build Coastguard Worker movddup m4, [leftq-4] 2026*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2027*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2028*c0909341SAndroid Build Coastguard Worker add leftmp, 4 2029*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 14 2030*c0909341SAndroid Build Coastguard Worker jmp .h_main 2031*c0909341SAndroid Build Coastguard Worker.h_extend_left: 2032*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2033*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2034*c0909341SAndroid Build Coastguard Worker pshufb m5, m13 2035*c0909341SAndroid Build Coastguard Worker jmp .h_main 2036*c0909341SAndroid Build Coastguard Worker.h_top: 2037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2038*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2039*c0909341SAndroid Build Coastguard Worker%endif 2040*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2041*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 2042*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2043*c0909341SAndroid Build Coastguard Worker.h_loop: 2044*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq] 2045*c0909341SAndroid Build Coastguard 
Worker.h_main: 2046*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 2047*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 2048*c0909341SAndroid Build Coastguard Worker cmp wd, -9 2049*c0909341SAndroid Build Coastguard Worker jl .h_have_right 2050*c0909341SAndroid Build Coastguard Worker call .extend_right 2051*c0909341SAndroid Build Coastguard Worker.h_have_right: 2052*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m6 2053*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 2054*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 2 2055*c0909341SAndroid Build Coastguard Worker paddw m1, m4, m0 2056*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 2057*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2058*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m0 2059*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2060*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 4 2061*c0909341SAndroid Build Coastguard Worker paddw m1, m5 ; sum 2062*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m6 2063*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 2064*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 2065*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 2066*c0909341SAndroid Build Coastguard Worker paddd m2, m4 ; sumsq 2067*c0909341SAndroid Build Coastguard Worker paddd m3, m5 2068*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*0], m1 2069*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*2], m2 2070*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*4], m3 2071*c0909341SAndroid Build Coastguard Worker add wq, 8 2072*c0909341SAndroid Build Coastguard Worker jl .h_loop 2073*c0909341SAndroid Build Coastguard Worker ret 2074*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2075*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 2076*c0909341SAndroid Build 
Coastguard Worker%if ARCH_X86_64 2077*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2078*c0909341SAndroid Build Coastguard Worker%else 2079*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2080*c0909341SAndroid Build Coastguard Worker%endif 2081*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2082*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 2083*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 2084*c0909341SAndroid Build Coastguard Worker movddup m4, [leftq-4] 2085*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2086*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2087*c0909341SAndroid Build Coastguard Worker add leftmp, 4 2088*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 14 2089*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2090*c0909341SAndroid Build Coastguard Worker.hv0_extend_left: 2091*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2092*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2093*c0909341SAndroid Build Coastguard Worker pshufb m5, m13 2094*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2095*c0909341SAndroid Build Coastguard Worker.hv0_bottom: 2096*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2097*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2098*c0909341SAndroid Build Coastguard Worker%else 2099*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2100*c0909341SAndroid Build Coastguard Worker%endif 2101*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2102*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 2103*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2104*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2105*c0909341SAndroid Build Coastguard Worker jmp .hv0_loop_start 2106*c0909341SAndroid Build Coastguard Worker%endif 2107*c0909341SAndroid Build Coastguard Worker.hv0_loop: 2108*c0909341SAndroid Build Coastguard Worker 
movif32 lpfq, hvsrcm 2109*c0909341SAndroid Build Coastguard Worker.hv0_loop_start: 2110*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq] 2111*c0909341SAndroid Build Coastguard Worker.hv0_main: 2112*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 2113*c0909341SAndroid Build Coastguard Worker jnz .hv0_have_right 2114*c0909341SAndroid Build Coastguard Worker cmp wd, -9 2115*c0909341SAndroid Build Coastguard Worker jl .hv0_have_right 2116*c0909341SAndroid Build Coastguard Worker call .extend_right 2117*c0909341SAndroid Build Coastguard Worker.hv0_have_right: 2118*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m6 2119*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m6 2120*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 2 2121*c0909341SAndroid Build Coastguard Worker paddw m1, m4, m0 2122*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 2123*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2124*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m0 2125*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2126*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 4 2127*c0909341SAndroid Build Coastguard Worker paddw m1, m5 ; sum 2128*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m6 2129*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 2130*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 2131*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 2132*c0909341SAndroid Build Coastguard Worker paddd m2, m4 ; sumsq 2133*c0909341SAndroid Build Coastguard Worker paddd m3, m5 2134*c0909341SAndroid Build Coastguard Worker paddw m0, m1, [t1+wq*2+400*0] 2135*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t1+wq*2+400*2] 2136*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+wq*2+400*4] 2137*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*0], m1 2138*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*2], m2 
2139*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*4], m3 2140*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+wq*2+400*0] 2141*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+wq*2+400*2] 2142*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+wq*2+400*4] 2143*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*0], m0 2144*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*2], m4 2145*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*4], m5 2146*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 2147*c0909341SAndroid Build Coastguard Worker pslld m5, m3, 3 2148*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a * 9 2149*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2150*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 2151*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b * b 2152*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 2153*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 2154*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 2155*c0909341SAndroid Build Coastguard Worker psubd m5, m3 2156*c0909341SAndroid Build Coastguard Worker MULLD m4, m9, m12 ; p * s 2157*c0909341SAndroid Build Coastguard Worker MULLD m5, m9, m12 2158*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m10 ; b * 455 2159*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m10 2160*c0909341SAndroid Build Coastguard Worker paddusw m4, m10 2161*c0909341SAndroid Build Coastguard Worker paddusw m5, m10 2162*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; min(z, 255) 2163*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 2164*c0909341SAndroid Build Coastguard Worker psrld m5, 20 2165*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m3, m4, m5, r0, dstm 2166*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3, m3 2167*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m3 2168*c0909341SAndroid Build Coastguard 
; ---------------------------------------------------------------------------
; Tail of the .hv0 inner loop (its label and loop head lie above this span).
; Scales b by the looked-up x, stores x to the t4 row and the scaled b to the
; t3 row, then advances wq by 8 until it becomes non-negative.
; ---------------------------------------------------------------------------
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6                  ; re-zero m6 (x86-32 only)
%endif
    paddd           m0, m11                 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*4+ 8], m0
    mova            [t3+wq*4+24], m1
    add             wq, 8
    jl              .hv0_loop
    ret

; ---------------------------------------------------------------------------
; .hv1 / .hv1_bottom
;   Horizontal 3-tap box sums of one source row, combined with the sums kept
;   in the t2 row, followed by the a/b computation for an odd row.
;   In:    lpfq  = source row pointer (saved to hvsrcm on x86-32)
;          leftq/leftm = left-edge buffer, consumed only via the .hv1 entry
;          edgeb = edge flags (bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT)
;          r4 (x86-64) / w0m (x86-32) supply the negative column counter
;          t1, t2 = box-sum rows; t3, t4 = b / a output rows
;   Out:   x written to [t4+wq*2+400*2+4], scaled b to [t3+wq*4+400*4+8/24];
;          t1 and t2 are exchanged before returning.
;   Notes: m6 is used as the zero operand when unpacking
;          (assumed to be 0 on entry -- TODO confirm against the caller).
;          m9..m13 hold constants set up outside this span.
; ---------------------------------------------------------------------------
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             hvsrcm, lpfq
%endif
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz              .hv1_extend_left
    ; Left pixels available: splice them from the left buffer in front of
    ; the first vector of the row, then step the left pointer by 4 bytes.
    movif32         leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add             leftmp, 4
    palignr         m5, m4, 14
    jmp             .hv1_main
.hv1_extend_left:
    ; No left pixels: rearrange the first vector with the shuffle in m13
    ; (presumably a left-edge replication mask -- set up outside this span).
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, m13
    jmp             .hv1_main
.hv1_bottom:
    ; Alternate entry: with LR_HAVE_LEFT set the row is read straight from
    ; [lpfq+wq] instead of going through the left buffer.
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             hvsrcm, lpfq
%endif
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz              .hv1_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp             .hv1_loop_start
%endif
.hv1_loop:
    movif32         lpfq, hvsrcm            ; x86-32: reload the saved source pointer
.hv1_loop_start:
    movu            m5, [lpfq+wq]
.hv1_main:
    test            edgeb, 2                ; LR_HAVE_RIGHT
    jnz             .hv1_have_right
    cmp             wd, -9                  ; right padding only once wd >= -9
    jl              .hv1_have_right
    call            .extend_right
.hv1_have_right:
    ; Widen bytes to words, then add three horizontally adjacent words
    ; (offsets 0, +2 and +4 bytes) and their squares.
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5                  ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1                  ; h sumsq
    paddd           m3, m5
    ; Vertical step: add the sums held in the t2 row, then overwrite that
    ; row with this row's horizontal sums.
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m4, m2, [t2+wq*2+400*2]
    paddd           m5, m3, [t2+wq*2+400*4]
    mova            [t2+wq*2+400*0], m0
    mova            [t2+wq*2+400*2], m2
    mova            [t2+wq*2+400*4], m3
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2                  ; a * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m6              ; b
    pmaddwd         m2, m0, m0              ; b * b
    punpckhwd       m1, m6
    pmaddwd         m3, m1, m1
    psubd           m4, m2                  ; p
    psubd           m5, m3
    MULLD           m4, m9, m12             ; p * s
    MULLD           m5, m9, m12
    pmaddwd         m0, m10                 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20                  ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6                  ; re-zero m6 (x86-32 only)
%endif
    paddd           m0, m11                 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq*2+400*2 +4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*4+400*4+ 8], m0
    mova            [t3+wq*4+400*4+24], m1
    add             wq, 8
    jl              .hv1_loop
    mov             r10, t2                 ; exchange t1 and t2
    mov             t2, t1
    mov             t1, r10
    ret

; ---------------------------------------------------------------------------
; .v0
;   Vertical-only variant for even rows: no source row is read. The sums in
;   the t1 row are doubled, added to the t2 row to form a/b, and the doubled
;   values are written back to the t2 row.
;   Out:   x written to [t4+wq*2+4], scaled b to [t3+wq*4+8/24].
; ---------------------------------------------------------------------------
.v0: ; vertical boxsums + ab (even rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq*2+400*0]
    mova            m4, [t1+wq*2+400*2]
    mova            m5, [t1+wq*2+400*4]
    paddw           m0, m0                  ; t1 row counted twice
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m2, m4, [t2+wq*2+400*2]
    paddd           m3, m5, [t2+wq*2+400*4]
    mova            [t2+wq*2+400*0], m0
    mova            [t2+wq*2+400*2], m4
    mova            [t2+wq*2+400*4], m5
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                  ; a * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m6              ; b
    pmaddwd         m2, m0, m0              ; b * b
    punpckhwd       m1, m6
    pmaddwd         m3, m1, m1
    psubd           m4, m2                  ; p
    psubd           m5, m3
    MULLD           m4, m9, m12             ; p * s
    MULLD           m5, m9, m12
    pmaddwd         m0, m10                 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20                  ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6                  ; re-zero m6 (x86-32 only)
%endif
    paddd           m0, m11                 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*4+ 8], m0
    mova            [t3+wq*4+24], m1
    add             wq, 8
    jl              .v0_loop
    ret

; ---------------------------------------------------------------------------
; .v1
;   Vertical-only variant for odd rows: the t1 row is added to the t2 row to
;   form a/b and is copied (undoubled) into the t2 row. t1 and t2 are
;   exchanged before returning.
;   Out:   x written to [t4+wq*2+400*2+4], scaled b to [t3+wq*4+400*4+8/24].
; ---------------------------------------------------------------------------
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v1_loop:
    mova            m0, [t1+wq*2+400*0]
    mova            m4, [t1+wq*2+400*2]
    mova            m5, [t1+wq*2+400*4]
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m2, m4, [t2+wq*2+400*2]
    paddd           m3, m5, [t2+wq*2+400*4]
    mova            [t2+wq*2+400*0], m0
    mova            [t2+wq*2+400*2], m4
    mova            [t2+wq*2+400*4], m5
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                  ; a * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m6              ; b
    pmaddwd         m2, m0, m0              ; b * b
    punpckhwd       m1, m6
    pmaddwd         m3, m1, m1
    psubd           m4, m2                  ; p
    psubd           m5, m3
    MULLD           m4, m9, m12             ; p * s
    MULLD           m5, m9, m12
    pmaddwd         m0, m10                 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20                  ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6                  ; re-zero m6 (x86-32 only)
%endif
    paddd           m0, m11                 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova            [t4+wq*2+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            [t3+wq*4+400*4+ 8], m0
    mova            [t3+wq*4+400*4+24], m1
    add             wq, 8
    jl              .v1_loop
    mov             r10, t2                 ; exchange t1 and t2
    mov             t2, t1
    mov             t1, r10
    ret

; ---------------------------------------------------------------------------
; .prep_n
;   Seeds the neighbour rows used by .n0/.n1. For each of the two a/b rows
;   (t4+400*0 / t3+400*0 and t4+400*2 / t3+400*4) it forms, per column,
;       444 = 4 * (x[0] + x[1] + x[2])
;       343 = 444 - (x[0] + x[2])
;   and stores:
;       first row  343 -> t4+400*4, t3+400*8
;       second row 444 -> t4+400*6, t3+400*12
;       second row 343 -> t4+400*8, t3+400*16
;   Column counter comes from r4 (x86-64) or w1m (x86-32).
; ---------------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*2+400*0+ 4]
    movu            m1, [t3+wq*4+400*0+ 8]
    movu            m2, [t3+wq*4+400*0+24]
    movu            m3, [t4+wq*2+400*0+ 2]
    movu            m4, [t3+wq*4+400*0+ 4]
    movu            m5, [t3+wq*4+400*0+20]
    paddw           m0, [t4+wq*2+400*0+ 0]
    paddd           m1, [t3+wq*4+400*0+ 0]
    paddd           m2, [t3+wq*4+400*0+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                   ; a[-1] 444
    pslld           m4, 2                   ; b[-1] 444
    pslld           m5, 2
    psubw           m3, m0                  ; a[-1] 343
    psubd           m4, m1                  ; b[-1] 343
    psubd           m5, m2
    mova            [t4+wq*2+400*4], m3
    mova            [t3+wq*4+400*8+ 0], m4
    mova            [t3+wq*4+400*8+16], m5
    movu            m0, [t4+wq*2+400*2+ 4]
    movu            m1, [t3+wq*4+400*4+ 8]
    movu            m2, [t3+wq*4+400*4+24]
    movu            m3, [t4+wq*2+400*2+ 2]
    movu            m4, [t3+wq*4+400*4+ 4]
    movu            m5, [t3+wq*4+400*4+20]
    paddw           m0, [t4+wq*2+400*2+ 0]
    paddd           m1, [t3+wq*4+400*4+ 0]
    paddd           m2, [t3+wq*4+400*4+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                   ; a[ 0] 444
    pslld           m4, 2                   ; b[ 0] 444
    pslld           m5, 2
    mova            [t4+wq*2+400* 6], m3
    mova            [t3+wq*4+400*12+ 0], m4
    mova            [t3+wq*4+400*12+16], m5
    psubw           m3, m0                  ; a[ 0] 343
    psubd           m4, m1                  ; b[ 0] 343
    psubd           m5, m2
    mova            [t4+wq*2+400* 8], m3
    mova            [t3+wq*4+400*16+ 0], m4
    mova            [t3+wq*4+400*16+16], m5
    add             wq, 8
    jl              .prep_n_loop
    ret

; ---------------------------------------------------------------------------
; .n0
;   Neighbour weighting and output for an even row, 8 pixels per iteration.
;   Builds the 444/343 sums for the row at t4+400*0 / t3+400*0, adds the
;   stored sums at t4+400*4 and t4+400*6 (t3+400*8 and t3+400*12), and
;   replaces those stored sums with the new 343 and 444 values.
;   The pixel at [dstq+wq] is then updated in place:
;       dst += pmulhrsw((b - a * src) >> 9, m7), packed with unsigned saturation
;   Finally dstq is advanced by stridemp.
; ---------------------------------------------------------------------------
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m3, [t4+wq*2+400*0+4]
    movu            m1, [t4+wq*2+400*0+2]
    paddw           m3, [t4+wq*2+400*0+0]
    paddw           m1, m3
    psllw           m1, 2                   ; a[ 1] 444
    psubw           m2, m1, m3              ; a[ 1] 343
    paddw           m3, m2, [t4+wq*2+400*4]
    paddw           m3, [t4+wq*2+400*6]
    mova            [t4+wq*2+400*4], m2
    mova            [t4+wq*2+400*6], m1
    movu            m4, [t3+wq*4+400*0+8]
    movu            m1, [t3+wq*4+400*0+4]
    paddd           m4, [t3+wq*4+400*0+0]
    paddd           m1, m4
    pslld           m1, 2                   ; b[ 1] 444
    psubd           m2, m1, m4              ; b[ 1] 343
    paddd           m4, m2, [t3+wq*4+400* 8+ 0]
    paddd           m4, [t3+wq*4+400*12+ 0]
    mova            [t3+wq*4+400* 8+ 0], m2
    mova            [t3+wq*4+400*12+ 0], m1
    movu            m5, [t3+wq*4+400*0+24]
    movu            m1, [t3+wq*4+400*0+20]
    paddd           m5, [t3+wq*4+400*0+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*4+400* 8+16]
    paddd           m5, [t3+wq*4+400*12+16]
    mova            [t3+wq*4+400* 8+16], m2
    mova            [t3+wq*4+400*12+16], m1
    movq            m0, [dstq+wq]
    punpcklbw       m0, m6
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1                  ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2                  ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    packuswb        m0, m0
    movq            [dstq+wq], m0
    add             wq, 8
    jl              .n0_loop
    add             dstq, stridemp
    ret

; ---------------------------------------------------------------------------
; .n1
;   Neighbour weighting and output for an odd row. Same arithmetic as .n0,
;   but it reads the row at t4+400*2 / t3+400*4, adds the stored sums at
;   t4+400*6 and t4+400*8 (t3+400*12 and t3+400*16), and replaces them with
;   the new 444 and 343 values respectively.
;   After the loop dstq is advanced by stridemp and, on x86-32, written back
;   to dstm.
; ---------------------------------------------------------------------------
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*2+400*2+4]
    movu            m1, [t4+wq*2+400*2+2]
    paddw           m3, [t4+wq*2+400*2+0]
    paddw           m1, m3
    psllw           m1, 2                   ; a[ 1] 444
    psubw           m2, m1, m3              ; a[ 1] 343
    paddw           m3, m2, [t4+wq*2+400*6]
    paddw           m3, [t4+wq*2+400*8]
    mova            [t4+wq*2+400*6], m1
    mova            [t4+wq*2+400*8], m2
    movu            m4, [t3+wq*4+400*4+8]
    movu            m1, [t3+wq*4+400*4+4]
    paddd           m4, [t3+wq*4+400*4+0]
    paddd           m1, m4
    pslld           m1, 2                   ; b[ 1] 444
    psubd           m2, m1, m4              ; b[ 1] 343
    paddd           m4, m2, [t3+wq*4+400*12+ 0]
    paddd           m4, [t3+wq*4+400*16+ 0]
    mova            [t3+wq*4+400*12+ 0], m1
    mova            [t3+wq*4+400*16+ 0], m2
    movu            m5, [t3+wq*4+400*4+24]
    movu            m1, [t3+wq*4+400*4+20]
    paddd           m5, [t3+wq*4+400*4+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*4+400*12+16]
    paddd           m5, [t3+wq*4+400*16+16]
    mova            [t3+wq*4+400*12+16], m1
    mova            [t3+wq*4+400*16+16], m2
    movq            m0, [dstq+wq]
    punpcklbw       m0, m6
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1                  ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2                  ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    packuswb        m0, m0
    movq            [dstq+wq], m0
    add             wq, 8
    jl              .n1_loop
    add             dstq, stridemp
    movif32         dstm, dstq              ; x86-32: persist the advanced dst pointer
    ret

; ---------------------------------------------------------------------------
; Start of the sgr_filter_mix_8bpc declaration and its x86-32 stack-slot
; aliases. The remaining %define lines and the function body continue below
; this span.
; ---------------------------------------------------------------------------
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 10*16
 %else
  %assign extra_stack 8*16
 %endif
cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
                             dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm     dword [esp+calloff+16*8+4*0]
  %define stridemp dword [esp+calloff+16*8+4*1]
  %define leftm    dword [esp+calloff+16*8+4*2]
  %define lpfm     dword [esp+calloff+16*8+4*3]
  %define w0m      dword [esp+calloff+16*8+4*4]
  %define hd       dword [esp+calloff+16*8+4*5]
  %define edgeb    byte  [esp+calloff+16*8+4*6]
  %define edged    dword [esp+calloff+16*8+4*6]
  %define leftmp   leftm
 %else
  %define w0m      wm
  %define hd       dword r5m
  %define edgeb    byte  r7m
%define edged dword r7m 2588*c0909341SAndroid Build Coastguard Worker %endif 2589*c0909341SAndroid Build Coastguard Worker %define hvsrcm dword [esp+calloff+4*0] 2590*c0909341SAndroid Build Coastguard Worker %define w1m dword [esp+calloff+4*1] 2591*c0909341SAndroid Build Coastguard Worker %define t3m dword [esp+calloff+4*2] 2592*c0909341SAndroid Build Coastguard Worker %define t4m dword [esp+calloff+4*3] 2593*c0909341SAndroid Build Coastguard Worker %xdefine m8 m6 2594*c0909341SAndroid Build Coastguard Worker %define m9 [base+pd_0xffff] 2595*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_34816] 2596*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0xf00801c7] 2597*c0909341SAndroid Build Coastguard Worker %define m12 [base+pd_0xf00800a4] 2598*c0909341SAndroid Build Coastguard Worker %define m13 [esp+calloff+16*4] 2599*c0909341SAndroid Build Coastguard Worker %define m14 [esp+calloff+16*5] 2600*c0909341SAndroid Build Coastguard Worker %define m15 [esp+calloff+16*6] 2601*c0909341SAndroid Build Coastguard Worker %define m6 [esp+calloff+16*7] 2602*c0909341SAndroid Build Coastguard Worker %define base r6-$$ 2603*c0909341SAndroid Build Coastguard Worker %assign calloff 0 2604*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < 16 2605*c0909341SAndroid Build Coastguard Worker mov strideq, [rstk+stack_offset+ 8] 2606*c0909341SAndroid Build Coastguard Worker mov leftq, [rstk+stack_offset+12] 2607*c0909341SAndroid Build Coastguard Worker mov lpfq, [rstk+stack_offset+16] 2608*c0909341SAndroid Build Coastguard Worker mov wd, [rstk+stack_offset+20] 2609*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 2610*c0909341SAndroid Build Coastguard Worker mov stridemp, strideq 2611*c0909341SAndroid Build Coastguard Worker mov leftm, leftq 2612*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+24] 2613*c0909341SAndroid Build Coastguard Worker mov r2, [rstk+stack_offset+32] 2614*c0909341SAndroid Build Coastguard Worker mov 
lpfm, lpfq 2615*c0909341SAndroid Build Coastguard Worker mov hd, r1 2616*c0909341SAndroid Build Coastguard Worker mov edged, r2 2617*c0909341SAndroid Build Coastguard Worker %endif 2618*c0909341SAndroid Build Coastguard Worker%else 2619*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ 2620*c0909341SAndroid Build Coastguard Worker w, h, edge, params 2621*c0909341SAndroid Build Coastguard Worker%endif 2622*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 2623*c0909341SAndroid Build Coastguard Worker mov wd, wm 2624*c0909341SAndroid Build Coastguard Worker%endif 2625*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2626*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 2627*c0909341SAndroid Build Coastguard Worker lea r13, [sgr_x_by_x-0xf03] 2628*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 2629*c0909341SAndroid Build Coastguard Worker mov edged, r7m 2630*c0909341SAndroid Build Coastguard Worker mova m15, [paramsq] 2631*c0909341SAndroid Build Coastguard Worker add lpfq, wq 2632*c0909341SAndroid Build Coastguard Worker mova m9, [pd_0xffff] 2633*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+44] 2634*c0909341SAndroid Build Coastguard Worker mova m10, [pd_34816] 2635*c0909341SAndroid Build Coastguard Worker add dstq, wq 2636*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+400*24+40] 2637*c0909341SAndroid Build Coastguard Worker mova m11, [pd_0xf00801c7] 2638*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+wq*2+400*52+40] 2639*c0909341SAndroid Build Coastguard Worker mova m12, [base+pd_0xf00800a4] 2640*c0909341SAndroid Build Coastguard Worker neg wq 2641*c0909341SAndroid Build Coastguard Worker pshuflw m13, m15, q0000 2642*c0909341SAndroid Build Coastguard Worker pshuflw m14, m15, q2222 2643*c0909341SAndroid Build Coastguard Worker pshufhw m15, m15, q1010 2644*c0909341SAndroid Build Coastguard Worker punpcklqdq m13, 
m13 ; s0 2645*c0909341SAndroid Build Coastguard Worker punpcklqdq m14, m14 ; s1 2646*c0909341SAndroid Build Coastguard Worker punpckhqdq m15, m15 ; w0 w1 2647*c0909341SAndroid Build Coastguard Worker pxor m6, m6 2648*c0909341SAndroid Build Coastguard Worker psllw m15, 2 2649*c0909341SAndroid Build Coastguard Worker DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 2650*c0909341SAndroid Build Coastguard Worker %define lpfm [rsp] 2651*c0909341SAndroid Build Coastguard Worker%else 2652*c0909341SAndroid Build Coastguard Worker mov r1, [rstk+stack_offset+28] ; params 2653*c0909341SAndroid Build Coastguard Worker LEA r6, $$ 2654*c0909341SAndroid Build Coastguard Worker mova m2, [r1] 2655*c0909341SAndroid Build Coastguard Worker add lpfm, wq 2656*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+extra_stack+wq*2+52] 2657*c0909341SAndroid Build Coastguard Worker add dstq, wq 2658*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+extra_stack+wq*4+400*24+48] 2659*c0909341SAndroid Build Coastguard Worker mov dstm, dstq 2660*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+extra_stack+wq*2+400*52+48] 2661*c0909341SAndroid Build Coastguard Worker mov t3m, t3 2662*c0909341SAndroid Build Coastguard Worker mov t4m, t4 2663*c0909341SAndroid Build Coastguard Worker neg wq 2664*c0909341SAndroid Build Coastguard Worker pshuflw m0, m2, q0000 2665*c0909341SAndroid Build Coastguard Worker pshuflw m1, m2, q2222 2666*c0909341SAndroid Build Coastguard Worker pshufhw m2, m2, q1010 2667*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m0 ; s0 2668*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m1 ; s1 2669*c0909341SAndroid Build Coastguard Worker punpckhqdq m2, m2 ; w0 w1 2670*c0909341SAndroid Build Coastguard Worker mov w1m, wd 2671*c0909341SAndroid Build Coastguard Worker pxor m3, m3 2672*c0909341SAndroid Build Coastguard Worker psllw m2, 2 2673*c0909341SAndroid Build Coastguard Worker mova m13, m0 2674*c0909341SAndroid Build Coastguard Worker mova m14, 
m1 2675*c0909341SAndroid Build Coastguard Worker sub wd, 2 2676*c0909341SAndroid Build Coastguard Worker mova m15, m2 2677*c0909341SAndroid Build Coastguard Worker mova m6, m3 2678*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 2679*c0909341SAndroid Build Coastguard Worker mov w0m, wd 2680*c0909341SAndroid Build Coastguard Worker %define strideq r5 2681*c0909341SAndroid Build Coastguard Worker%endif 2682*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 2683*c0909341SAndroid Build Coastguard Worker jz .no_top 2684*c0909341SAndroid Build Coastguard Worker call .h_top 2685*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2686*c0909341SAndroid Build Coastguard Worker mov t2, t1 2687*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2688*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup 2689*c0909341SAndroid Build Coastguard Worker%else 2690*c0909341SAndroid Build Coastguard Worker mov wq, w0m 2691*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop 2692*c0909341SAndroid Build Coastguard Worker%endif 2693*c0909341SAndroid Build Coastguard Worker add t1, 400*12 2694*c0909341SAndroid Build Coastguard Worker call .h_top 2695*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 2696*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 2697*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 2698*c0909341SAndroid Build Coastguard Worker add r10, strideq 2699*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 ; below 2700*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 2701*c0909341SAndroid Build Coastguard Worker call .hv0 2702*c0909341SAndroid Build Coastguard Worker.main: 2703*c0909341SAndroid Build Coastguard Worker dec hd 2704*c0909341SAndroid Build Coastguard Worker jz .height1 2705*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 
2706*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2707*c0909341SAndroid Build Coastguard Worker call .hv1 2708*c0909341SAndroid Build Coastguard Worker call .prep_n 2709*c0909341SAndroid Build Coastguard Worker sub hd, 2 2710*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 2711*c0909341SAndroid Build Coastguard Worker.main_loop: 2712*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2713*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2714*c0909341SAndroid Build Coastguard Worker call .hv0 2715*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2716*c0909341SAndroid Build Coastguard Worker test hd, hd 2717*c0909341SAndroid Build Coastguard Worker%else 2718*c0909341SAndroid Build Coastguard Worker mov r4, hd 2719*c0909341SAndroid Build Coastguard Worker test r4, r4 2720*c0909341SAndroid Build Coastguard Worker%endif 2721*c0909341SAndroid Build Coastguard Worker jz .odd_height 2722*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2723*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2724*c0909341SAndroid Build Coastguard Worker call .hv1 2725*c0909341SAndroid Build Coastguard Worker call .n0 2726*c0909341SAndroid Build Coastguard Worker call .n1 2727*c0909341SAndroid Build Coastguard Worker sub hd, 2 2728*c0909341SAndroid Build Coastguard Worker jge .main_loop 2729*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 2730*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 2731*c0909341SAndroid Build Coastguard Worker mov lpfq, lpfm 2732*c0909341SAndroid Build Coastguard Worker call .hv0_bottom 2733*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2734*c0909341SAndroid Build Coastguard Worker add lpfq, stridemp 2735*c0909341SAndroid Build Coastguard Worker call .hv1_bottom 2736*c0909341SAndroid Build Coastguard Worker.end: 2737*c0909341SAndroid Build Coastguard Worker call .n0 2738*c0909341SAndroid Build Coastguard Worker call .n1 
2739*c0909341SAndroid Build Coastguard Worker.end2: 2740*c0909341SAndroid Build Coastguard Worker RET 2741*c0909341SAndroid Build Coastguard Worker.height1: 2742*c0909341SAndroid Build Coastguard Worker call .v1 2743*c0909341SAndroid Build Coastguard Worker call .prep_n 2744*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 2745*c0909341SAndroid Build Coastguard Worker.odd_height: 2746*c0909341SAndroid Build Coastguard Worker call .v1 2747*c0909341SAndroid Build Coastguard Worker call .n0 2748*c0909341SAndroid Build Coastguard Worker call .n1 2749*c0909341SAndroid Build Coastguard Worker.odd_height_end: 2750*c0909341SAndroid Build Coastguard Worker call .v0 2751*c0909341SAndroid Build Coastguard Worker call .v1 2752*c0909341SAndroid Build Coastguard Worker call .n0 2753*c0909341SAndroid Build Coastguard Worker jmp .end2 2754*c0909341SAndroid Build Coastguard Worker.extend_bottom: 2755*c0909341SAndroid Build Coastguard Worker call .v0 2756*c0909341SAndroid Build Coastguard Worker call .v1 2757*c0909341SAndroid Build Coastguard Worker jmp .end 2758*c0909341SAndroid Build Coastguard Worker.no_top: 2759*c0909341SAndroid Build Coastguard Worker movif32 strideq, stridemp 2760*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 2761*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 2762*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 2763*c0909341SAndroid Build Coastguard Worker mov lpfm, r10 2764*c0909341SAndroid Build Coastguard Worker call .h 2765*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2766*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2767*c0909341SAndroid Build Coastguard Worker%else 2768*c0909341SAndroid Build Coastguard Worker mov wq, w0m 2769*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2770*c0909341SAndroid Build Coastguard Worker%endif 2771*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*12] 2772*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: 
2773*c0909341SAndroid Build Coastguard Worker mova m0, [t1+wq*2+400* 0] 2774*c0909341SAndroid Build Coastguard Worker mova m1, [t1+wq*2+400* 2] 2775*c0909341SAndroid Build Coastguard Worker mova m2, [t1+wq*2+400* 4] 2776*c0909341SAndroid Build Coastguard Worker paddw m0, m0 2777*c0909341SAndroid Build Coastguard Worker mova m3, [t1+wq*2+400* 6] 2778*c0909341SAndroid Build Coastguard Worker paddd m1, m1 2779*c0909341SAndroid Build Coastguard Worker mova m4, [t1+wq*2+400* 8] 2780*c0909341SAndroid Build Coastguard Worker paddd m2, m2 2781*c0909341SAndroid Build Coastguard Worker mova m5, [t1+wq*2+400*10] 2782*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 0], m0 2783*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 2], m1 2784*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 4], m2 2785*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 6], m3 2786*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 8], m4 2787*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*10], m5 2788*c0909341SAndroid Build Coastguard Worker add wq, 8 2789*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 2790*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 2791*c0909341SAndroid Build Coastguard Worker movif32 t4, t4m 2792*c0909341SAndroid Build Coastguard Worker call .v0 2793*c0909341SAndroid Build Coastguard Worker jmp .main 2794*c0909341SAndroid Build Coastguard Worker.extend_right: 2795*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+8 2796*c0909341SAndroid Build Coastguard Worker%assign calloff 8 2797*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2798*c0909341SAndroid Build Coastguard Worker SWAP m8, m6 2799*c0909341SAndroid Build Coastguard Worker%endif 2800*c0909341SAndroid Build Coastguard Worker movd m1, wd 2801*c0909341SAndroid Build Coastguard Worker movd m3, [lpfq-1] 2802*c0909341SAndroid Build Coastguard Worker pshufb m1, m8 2803*c0909341SAndroid Build Coastguard Worker 
pshufb m3, m8 2804*c0909341SAndroid Build Coastguard Worker psubb m2, [base+pb_1], m1 2805*c0909341SAndroid Build Coastguard Worker pcmpgtb m2, [base+pb_0to15] 2806*c0909341SAndroid Build Coastguard Worker pand m5, m2 2807*c0909341SAndroid Build Coastguard Worker pandn m2, m3 2808*c0909341SAndroid Build Coastguard Worker por m5, m2 2809*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2810*c0909341SAndroid Build Coastguard Worker SWAP m6, m8 2811*c0909341SAndroid Build Coastguard Worker%endif 2812*c0909341SAndroid Build Coastguard Worker ret 2813*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset-4 2814*c0909341SAndroid Build Coastguard Worker%assign calloff 4 2815*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 2816*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2817*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2818*c0909341SAndroid Build Coastguard Worker%else 2819*c0909341SAndroid Build Coastguard Worker %define leftq r4 2820*c0909341SAndroid Build Coastguard Worker%endif 2821*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2822*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 2823*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 2824*c0909341SAndroid Build Coastguard Worker movddup m4, [leftq-4] 2825*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2826*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2827*c0909341SAndroid Build Coastguard Worker add leftmp, 4 2828*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 13 2829*c0909341SAndroid Build Coastguard Worker jmp .h_main 2830*c0909341SAndroid Build Coastguard Worker.h_extend_left: 2831*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2832*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2833*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+sgr_lshuf5] 2834*c0909341SAndroid Build Coastguard Worker jmp .h_main 2835*c0909341SAndroid Build 
Coastguard Worker.h_top: 2836*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2837*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2838*c0909341SAndroid Build Coastguard Worker%endif 2839*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2840*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 2841*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2842*c0909341SAndroid Build Coastguard Worker.h_loop: 2843*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq-1] 2844*c0909341SAndroid Build Coastguard Worker.h_main: 2845*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 2846*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2847*c0909341SAndroid Build Coastguard Worker pxor m8, m8 2848*c0909341SAndroid Build Coastguard Worker%else 2849*c0909341SAndroid Build Coastguard Worker SWAP m8, m6 2850*c0909341SAndroid Build Coastguard Worker%endif 2851*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 2852*c0909341SAndroid Build Coastguard Worker cmp wd, -10 2853*c0909341SAndroid Build Coastguard Worker jl .h_have_right 2854*c0909341SAndroid Build Coastguard Worker call .extend_right 2855*c0909341SAndroid Build Coastguard Worker.h_have_right: 2856*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m8 2857*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m8 2858*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 2 2859*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 4 2860*c0909341SAndroid Build Coastguard Worker paddw m1, m3, m0 2861*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m0 2862*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2863*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 2864*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2865*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 6 2866*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; sum3 2867*c0909341SAndroid Build Coastguard 
Worker punpcklwd m7, m0, m8 2868*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2869*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m8 2870*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 2871*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2872*c0909341SAndroid Build Coastguard Worker SWAP m6, m8 2873*c0909341SAndroid Build Coastguard Worker%endif 2874*c0909341SAndroid Build Coastguard Worker paddd m2, m7 ; sumsq3 2875*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 2876*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m5, m4 2877*c0909341SAndroid Build Coastguard Worker paddw m8, m4, m5 2878*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2879*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m4 2880*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 2881*c0909341SAndroid Build Coastguard Worker paddd m3, m0 2882*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 6], m1 2883*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 8], m2 2884*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*10], m3 2885*c0909341SAndroid Build Coastguard Worker paddw m8, m1 ; sum5 2886*c0909341SAndroid Build Coastguard Worker paddd m7, m2 ; sumsq5 2887*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2888*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 0], m8 2889*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 2], m7 2890*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 4], m5 2891*c0909341SAndroid Build Coastguard Worker add wq, 8 2892*c0909341SAndroid Build Coastguard Worker jl .h_loop 2893*c0909341SAndroid Build Coastguard Worker ret 2894*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2895*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) 2896*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2897*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2898*c0909341SAndroid 
Build Coastguard Worker%else 2899*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2900*c0909341SAndroid Build Coastguard Worker%endif 2901*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2902*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 2903*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 2904*c0909341SAndroid Build Coastguard Worker movddup m4, [leftq-4] 2905*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2906*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2907*c0909341SAndroid Build Coastguard Worker add leftmp, 4 2908*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 13 2909*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2910*c0909341SAndroid Build Coastguard Worker.hv0_extend_left: 2911*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2912*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 2913*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+sgr_lshuf5] 2914*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 2915*c0909341SAndroid Build Coastguard Worker.hv0_bottom: 2916*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2917*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 2918*c0909341SAndroid Build Coastguard Worker%else 2919*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 2920*c0909341SAndroid Build Coastguard Worker%endif 2921*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 2922*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 2923*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 2924*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2925*c0909341SAndroid Build Coastguard Worker jmp .hv0_loop_start 2926*c0909341SAndroid Build Coastguard Worker%endif 2927*c0909341SAndroid Build Coastguard Worker.hv0_loop: 2928*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 2929*c0909341SAndroid Build Coastguard Worker.hv0_loop_start: 2930*c0909341SAndroid 
Build Coastguard Worker movu m5, [lpfq+wq-1] 2931*c0909341SAndroid Build Coastguard Worker.hv0_main: 2932*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 2933*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2934*c0909341SAndroid Build Coastguard Worker pxor m8, m8 2935*c0909341SAndroid Build Coastguard Worker%else 2936*c0909341SAndroid Build Coastguard Worker SWAP m8, m6 2937*c0909341SAndroid Build Coastguard Worker%endif 2938*c0909341SAndroid Build Coastguard Worker jnz .hv0_have_right 2939*c0909341SAndroid Build Coastguard Worker cmp wd, -10 2940*c0909341SAndroid Build Coastguard Worker jl .hv0_have_right 2941*c0909341SAndroid Build Coastguard Worker call .extend_right 2942*c0909341SAndroid Build Coastguard Worker.hv0_have_right: 2943*c0909341SAndroid Build Coastguard Worker punpcklbw m4, m5, m8 2944*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m8 2945*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 2 2946*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 4 2947*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 2948*c0909341SAndroid Build Coastguard Worker paddw m1, m3, m0 2949*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m0 2950*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 2951*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 2952*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 2953*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 6 2954*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; h sum3 2955*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m0, m8 2956*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2957*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m8 2958*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 2959*c0909341SAndroid Build Coastguard Worker SWAP m6, m8 2960*c0909341SAndroid Build Coastguard Worker%endif 2961*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 2962*c0909341SAndroid 
Build Coastguard Worker paddd m2, m7 ; h sumsq3 2963*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 2964*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m5, m4 2965*c0909341SAndroid Build Coastguard Worker paddw m8, m4, m5 2966*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 2967*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m4 2968*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 2969*c0909341SAndroid Build Coastguard Worker paddd m3, m0 2970*c0909341SAndroid Build Coastguard Worker paddw m8, m1 ; h sum5 2971*c0909341SAndroid Build Coastguard Worker paddd m7, m2 ; h sumsq5 2972*c0909341SAndroid Build Coastguard Worker paddd m5, m3 2973*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*8+ 8], m8 2974*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*0+ 8], m7 2975*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*0+24], m5 2976*c0909341SAndroid Build Coastguard Worker paddw m8, [t1+wq*2+400* 0] 2977*c0909341SAndroid Build Coastguard Worker paddd m7, [t1+wq*2+400* 2] 2978*c0909341SAndroid Build Coastguard Worker paddd m5, [t1+wq*2+400* 4] 2979*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 0], m8 2980*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 2], m7 2981*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 4], m5 2982*c0909341SAndroid Build Coastguard Worker paddw m0, m1, [t1+wq*2+400* 6] 2983*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t1+wq*2+400* 8] 2984*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+wq*2+400*10] 2985*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 6], m1 2986*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400* 8], m2 2987*c0909341SAndroid Build Coastguard Worker mova [t1+wq*2+400*10], m3 2988*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+wq*2+400* 6] 2989*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+wq*2+400* 8] 2990*c0909341SAndroid Build Coastguard Worker paddd m3, 
m5, [t2+wq*2+400*10] 2991*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 6], m0 2992*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 8], m4 2993*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*10], m5 2994*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 2995*c0909341SAndroid Build Coastguard Worker pxor m7, m7 2996*c0909341SAndroid Build Coastguard Worker%else 2997*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 2998*c0909341SAndroid Build Coastguard Worker%endif 2999*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 3000*c0909341SAndroid Build Coastguard Worker pslld m5, m3, 3 3001*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a3 * 9 3002*c0909341SAndroid Build Coastguard Worker paddd m5, m3 3003*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m7 ; b3 3004*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 3005*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 3006*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 3007*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3008*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 3009*c0909341SAndroid Build Coastguard Worker%endif 3010*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p3 3011*c0909341SAndroid Build Coastguard Worker psubd m5, m3 3012*c0909341SAndroid Build Coastguard Worker MULLD m4, m14, m7 ; p3 * s1 3013*c0909341SAndroid Build Coastguard Worker MULLD m5, m14, m7 3014*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 ; b3 * 455 3015*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m11 3016*c0909341SAndroid Build Coastguard Worker paddusw m4, m11 3017*c0909341SAndroid Build Coastguard Worker paddusw m5, m11 3018*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; min(z3, 255) 3019*c0909341SAndroid Build Coastguard Worker psrld m5, 20 3020*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m3, m4, m5, r0, dstm 3021*c0909341SAndroid Build Coastguard Worker punpcklwd m4, 
m3, m3 3022*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m3 3023*c0909341SAndroid Build Coastguard Worker MULLD m0, m4, m7 3024*c0909341SAndroid Build Coastguard Worker MULLD m1, m5, m7 3025*c0909341SAndroid Build Coastguard Worker paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3026*c0909341SAndroid Build Coastguard Worker paddd m1, m10 3027*c0909341SAndroid Build Coastguard Worker mova [t4+wq*2+400*2+ 4], m3 3028*c0909341SAndroid Build Coastguard Worker psrld m0, 12 3029*c0909341SAndroid Build Coastguard Worker psrld m1, 12 3030*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*4+ 8], m0 3031*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*4+24], m1 3032*c0909341SAndroid Build Coastguard Worker add wq, 8 3033*c0909341SAndroid Build Coastguard Worker jl .hv0_loop 3034*c0909341SAndroid Build Coastguard Worker ret 3035*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3036*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 3037*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3038*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 3039*c0909341SAndroid Build Coastguard Worker%else 3040*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 3041*c0909341SAndroid Build Coastguard Worker%endif 3042*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 3043*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 3044*c0909341SAndroid Build Coastguard Worker movif32 leftq, leftm 3045*c0909341SAndroid Build Coastguard Worker movddup m4, [leftq-4] 3046*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 3047*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 3048*c0909341SAndroid Build Coastguard Worker add leftmp, 4 3049*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 13 3050*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 3051*c0909341SAndroid Build Coastguard Worker.hv1_extend_left: 
3052*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 3053*c0909341SAndroid Build Coastguard Worker mova m5, [lpfq+wq+2] 3054*c0909341SAndroid Build Coastguard Worker pshufb m5, [base+sgr_lshuf5] 3055*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 3056*c0909341SAndroid Build Coastguard Worker.hv1_bottom: 3057*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3058*c0909341SAndroid Build Coastguard Worker lea wq, [r4-2] 3059*c0909341SAndroid Build Coastguard Worker%else 3060*c0909341SAndroid Build Coastguard Worker mov hvsrcm, lpfq 3061*c0909341SAndroid Build Coastguard Worker%endif 3062*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 3063*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 3064*c0909341SAndroid Build Coastguard Worker movif32 wq, w0m 3065*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3066*c0909341SAndroid Build Coastguard Worker jmp .hv1_loop_start 3067*c0909341SAndroid Build Coastguard Worker%endif 3068*c0909341SAndroid Build Coastguard Worker.hv1_loop: 3069*c0909341SAndroid Build Coastguard Worker movif32 lpfq, hvsrcm 3070*c0909341SAndroid Build Coastguard Worker.hv1_loop_start: 3071*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+wq-1] 3072*c0909341SAndroid Build Coastguard Worker.hv1_main: 3073*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 3074*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3075*c0909341SAndroid Build Coastguard Worker pxor m8, m8 3076*c0909341SAndroid Build Coastguard Worker%else 3077*c0909341SAndroid Build Coastguard Worker SWAP m8, m6 3078*c0909341SAndroid Build Coastguard Worker%endif 3079*c0909341SAndroid Build Coastguard Worker jnz .hv1_have_right 3080*c0909341SAndroid Build Coastguard Worker cmp wd, -10 3081*c0909341SAndroid Build Coastguard Worker jl .hv1_have_right 3082*c0909341SAndroid Build Coastguard Worker call .extend_right 3083*c0909341SAndroid Build Coastguard Worker.hv1_have_right: 3084*c0909341SAndroid Build 
Coastguard Worker punpcklbw m4, m5, m8 3085*c0909341SAndroid Build Coastguard Worker punpckhbw m5, m8 3086*c0909341SAndroid Build Coastguard Worker palignr m7, m5, m4, 2 3087*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 4 3088*c0909341SAndroid Build Coastguard Worker paddw m2, m7, m3 3089*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m7, m3 3090*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 3091*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m3 3092*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m7 3093*c0909341SAndroid Build Coastguard Worker palignr m3, m5, m4, 6 3094*c0909341SAndroid Build Coastguard Worker paddw m2, m3 ; h sum3 3095*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m8 3096*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 3097*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m8 3098*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3099*c0909341SAndroid Build Coastguard Worker SWAP m6, m8 3100*c0909341SAndroid Build Coastguard Worker%endif 3101*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 3102*c0909341SAndroid Build Coastguard Worker paddd m0, m1 ; h sumsq3 3103*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 8 3104*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m4, m5 3105*c0909341SAndroid Build Coastguard Worker paddw m8, m4, m5 3106*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 3107*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 3108*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 3109*c0909341SAndroid Build Coastguard Worker paddd m7, m3 3110*c0909341SAndroid Build Coastguard Worker paddw m5, m2, [t2+wq*2+400* 6] 3111*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 6], m2 3112*c0909341SAndroid Build Coastguard Worker paddw m8, m2 ; h sum5 3113*c0909341SAndroid Build Coastguard Worker paddd m2, m0, [t2+wq*2+400* 8] 3114*c0909341SAndroid Build Coastguard Worker paddd m3, m7, [t2+wq*2+400*10] 
3115*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400* 8], m0 3116*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*10], m7 3117*c0909341SAndroid Build Coastguard Worker paddd m4, m0 ; h sumsq5 3118*c0909341SAndroid Build Coastguard Worker paddd m1, m7 3119*c0909341SAndroid Build Coastguard Worker pslld m0, m2, 3 3120*c0909341SAndroid Build Coastguard Worker pslld m7, m3, 3 3121*c0909341SAndroid Build Coastguard Worker paddd m2, m0 ; a3 * 9 3122*c0909341SAndroid Build Coastguard Worker paddd m3, m7 3123*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3124*c0909341SAndroid Build Coastguard Worker mova [esp+20], m8 3125*c0909341SAndroid Build Coastguard Worker pxor m8, m8 3126*c0909341SAndroid Build Coastguard Worker%else 3127*c0909341SAndroid Build Coastguard Worker SWAP m8, m6 3128*c0909341SAndroid Build Coastguard Worker%endif 3129*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m5, m8 ; b3 3130*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m0, m0 3131*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m8 3132*c0909341SAndroid Build Coastguard Worker pmaddwd m8, m5, m5 3133*c0909341SAndroid Build Coastguard Worker psubd m2, m7 ; p3 3134*c0909341SAndroid Build Coastguard Worker psubd m3, m8 3135*c0909341SAndroid Build Coastguard Worker MULLD m2, m14, m8 ; p3 * s1 3136*c0909341SAndroid Build Coastguard Worker MULLD m3, m14, m8 3137*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 ; b3 * 455 3138*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m11 3139*c0909341SAndroid Build Coastguard Worker paddusw m2, m11 3140*c0909341SAndroid Build Coastguard Worker paddusw m3, m11 3141*c0909341SAndroid Build Coastguard Worker psrld m2, 20 ; min(z3, 255) 3142*c0909341SAndroid Build Coastguard Worker movif32 t3, t3m 3143*c0909341SAndroid Build Coastguard Worker psrld m3, 20 3144*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m8, m2, m3, r0, dstm 3145*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m8, m8 
3146*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m8, m8 3147*c0909341SAndroid Build Coastguard Worker MULLD m0, m2, m7 3148*c0909341SAndroid Build Coastguard Worker MULLD m5, m3, m7 3149*c0909341SAndroid Build Coastguard Worker paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3150*c0909341SAndroid Build Coastguard Worker paddd m5, m10 3151*c0909341SAndroid Build Coastguard Worker psrld m0, 12 3152*c0909341SAndroid Build Coastguard Worker psrld m5, 12 3153*c0909341SAndroid Build Coastguard Worker mova [t4+wq*2+400*4+ 4], m8 3154*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*8+ 8], m0 3155*c0909341SAndroid Build Coastguard Worker mova [t3+wq*4+400*8+24], m5 3156*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3157*c0909341SAndroid Build Coastguard Worker mova m8, [esp+20] 3158*c0909341SAndroid Build Coastguard Worker%else 3159*c0909341SAndroid Build Coastguard Worker SWAP m6, m8 3160*c0909341SAndroid Build Coastguard Worker pxor m6, m6 3161*c0909341SAndroid Build Coastguard Worker%endif 3162*c0909341SAndroid Build Coastguard Worker paddw m5, m8, [t2+wq*2+400*0] 3163*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+wq*2+400*2] 3164*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t2+wq*2+400*4] 3165*c0909341SAndroid Build Coastguard Worker paddw m5, [t1+wq*2+400*0] 3166*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+wq*2+400*2] 3167*c0909341SAndroid Build Coastguard Worker paddd m3, [t1+wq*2+400*4] 3168*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*0], m8 3169*c0909341SAndroid Build Coastguard Worker pslld m0, m2, 4 3170*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*2], m4 3171*c0909341SAndroid Build Coastguard Worker pslld m8, m3, 4 3172*c0909341SAndroid Build Coastguard Worker mova [t2+wq*2+400*4], m1 3173*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 3174*c0909341SAndroid Build Coastguard Worker paddd m2, m0 3175*c0909341SAndroid Build Coastguard Worker pslld m7, m3, 
3 3176*c0909341SAndroid Build Coastguard Worker paddd m3, m8 3177*c0909341SAndroid Build Coastguard Worker paddd m2, m4 ; a5 * 25 3178*c0909341SAndroid Build Coastguard Worker paddd m3, m7 3179*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 3180*c0909341SAndroid Build Coastguard Worker pxor m7, m7 3181*c0909341SAndroid Build Coastguard Worker%else 3182*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 3183*c0909341SAndroid Build Coastguard Worker%endif 3184*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m5, m7 ; b5 3185*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m0, m0 3186*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m7 3187*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m5, m5 3188*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 3189*c0909341SAndroid Build Coastguard Worker SWAP m7, m6 3190*c0909341SAndroid Build Coastguard Worker%endif 3191*c0909341SAndroid Build Coastguard Worker psubd m2, m4 ; p5 3192*c0909341SAndroid Build Coastguard Worker psubd m3, m1 3193*c0909341SAndroid Build Coastguard Worker MULLD m2, m13, m7 ; p5 * s0 3194*c0909341SAndroid Build Coastguard Worker MULLD m3, m13, m7 3195*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m12 ; b5 * 164 3196*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m12 3197*c0909341SAndroid Build Coastguard Worker paddusw m2, m12 3198*c0909341SAndroid Build Coastguard Worker paddusw m3, m12 3199*c0909341SAndroid Build Coastguard Worker psrld m2, 20 ; min(z5, 255) 3200*c0909341SAndroid Build Coastguard Worker psrld m3, 20 3201*c0909341SAndroid Build Coastguard Worker GATHER_X_BY_X m1, m2, m3, r0, dstm 3202*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m1, m1 3203*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m1, m1 3204*c0909341SAndroid Build Coastguard Worker MULLD m0, m2, m7 3205*c0909341SAndroid Build Coastguard Worker MULLD m5, m3, m7 3206*c0909341SAndroid Build Coastguard Worker paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 
; ---- tail of .hv1_loop (the loop body starts before this point) ----
    paddd           m5, m10
    mova [t4+wq*2+4], m1
    psrld           m0, 12
    psrld           m5, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m5
    add             wq, 8
    jl .hv1_loop
    mov            r10, t2             ; exchange the t1 and t2 pointers via r10
    mov             t2, t1
    mov             t1, r10
    ret

;-----------------------------------------------------------------------
; .v0 - vertical boxsums + ab3 (even rows)
;
; wq starts at r4-2 (x86-64) or w0m (x86-32) and is advanced by 8 per
; iteration; the loop runs while wq is still negative.
; Per iteration:
;   * the three vectors at t1+400*6/8/10 are doubled, added to the matching
;     t2 vectors to form the 3x3 sums, and the doubled values replace the
;     t2 vectors;
;   * p3 = a3 * 9 - b3 * b3 is scaled by m14, looked up via GATHER_X_BY_X
;     and the results go to t4+400*2 (x3) and t3+400*4 (x3 * b3 product);
;   * the three vectors at t1+400*0/2/4 are copied into t3 (+400*8+8,
;     +400*0+8, +400*0+24) and then doubled in place in t1.
; MULLD and GATHER_X_BY_X are macros defined outside this section.
;-----------------------------------------------------------------------
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq*2+400* 6]
    mova            m4, [t1+wq*2+400* 8]
    mova            m5, [t1+wq*2+400*10]
    paddw           m0, m0             ; double the t1 row
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq*2+400* 6]
    paddd           m2, m4, [t2+wq*2+400* 8]
    paddd           m3, m5, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m0
    mova [t2+wq*2+400* 8], m4
    mova [t2+wq*2+400*10], m5
    ; m7 is needed as an all-zero vector for the unpacks below: x86-32 clears
    ; it explicitly, x86-64 renames m6 to m7 at assembly time (m6 is
    ; presumably kept zero by the surrounding code - confirm against callers).
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a3 * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m7         ; b3
    pmaddwd         m2, m0, m0
    punpckhwd       m1, m7
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p3
    psubd           m5, m3
%if ARCH_X86_64
    SWAP            m7, m6             ; undo the rename
%endif
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*2+400*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+wq*2+400*0]
    mova            m4, [t1+wq*2+400*2]
    mova            m5, [t1+wq*2+400*4]
    mova [t3+wq*4+400*8+ 8], m3        ; park the undoubled t1 row in t3
    mova [t3+wq*4+400*0+ 8], m4
    mova [t3+wq*4+400*0+24], m5
    paddw           m3, m3             ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+wq*2+400*0], m3
    mova [t1+wq*2+400*2], m4
    mova [t1+wq*2+400*4], m5
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add             wq, 8
    jl .v0_loop
    ret

;-----------------------------------------------------------------------
; .v1 - vertical boxsums + ab (odd rows)
;
; Same loop bounds as .v0. Per iteration:
;   * 3x3: t1+400*6/8/10 plus the matching t2 vectors form the sums; the t1
;     values replace the t2 vectors; x3 goes to t4+400*4 and the x3 * b3
;     products to t3+400*8;
;   * 5x5: the vectors read back from t3 (+400*8+8, +400*0+8, +400*0+24 -
;     the slots .v0 writes) are summed with t2 and t1 (+400*0/2/4) and
;     replace the t2 vectors; x5 goes to t4+0 and the x5 * b5 products to
;     t3+0.
; On exit the t1 and t2 pointers are exchanged via r10.
;-----------------------------------------------------------------------
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v1_loop:
    mova            m4, [t1+wq*2+400* 6]
    mova            m5, [t1+wq*2+400* 8]
    mova            m7, [t1+wq*2+400*10]
    paddw           m1, m4, [t2+wq*2+400* 6]
    paddd           m2, m5, [t2+wq*2+400* 8]
    paddd           m3, m7, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m4
    mova [t2+wq*2+400* 8], m5
    mova [t2+wq*2+400*10], m7
%if ARCH_X86_32
    pxor            m7, m7             ; zero vector for the unpacks
%else
    SWAP            m7, m6             ; x86-64: borrow m6 under the name m7
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a3 * 9  (a3 * 8 + a3)
    paddd           m5, m3
    punpcklwd       m0, m1, m7         ; b3
    pmaddwd         m2, m0, m0
    punpckhwd       m1, m7
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p3
    psubd           m5, m3
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*2+400*4+4], m3
    psrld           m0, 12
    psrld           m8, m1, 12         ; high half kept in m8; m1 is reused below
    ; 5x5 part: fetch the row parked in t3 before its slots are overwritten
    mova            m4, [t3+wq*4+400*8+ 8]
    mova            m5, [t3+wq*4+400*0+ 8]
    mova            m7, [t3+wq*4+400*0+24]
    paddw           m1, m4, [t2+wq*2+400*0]
    paddd           m2, m5, [t2+wq*2+400*2]
    paddd           m3, m7, [t2+wq*2+400*4]
    paddw           m1, [t1+wq*2+400*0]
    paddd           m2, [t1+wq*2+400*2]
    paddd           m3, [t1+wq*2+400*4]
    mova [t2+wq*2+400*0], m4
    mova [t2+wq*2+400*2], m5
    mova [t2+wq*2+400*4], m7
    pslld           m4, m2, 4
    mova [t3+wq*4+400*8+ 8], m0
    pslld           m5, m3, 4
    mova [t3+wq*4+400*8+24], m8
    pslld           m7, m2, 3
    paddd           m2, m4
    pslld           m8, m3, 3
    paddd           m3, m5
    paddd           m2, m7             ; a5 * 25  (a5 + a5 * 16 + a5 * 8)
    paddd           m3, m8
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    punpcklwd       m0, m1, m7         ; b5
    pmaddwd         m4, m0, m0
    punpckhwd       m1, m7
    pmaddwd         m5, m1, m1
    psubd           m2, m4             ; p5
    psubd           m3, m5
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MULLD           m2, m13, m7        ; p5 * s0
    MULLD           m3, m13, m7
    pmaddwd         m0, m12            ; b5 * 164
    pmaddwd         m1, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrld           m2, 20             ; min(z5, 255)
    psrld           m3, 20
    GATHER_X_BY_X   m4, m2, m3, r0, dstm
    punpcklwd       m2, m4, m4
    punpckhwd       m3, m4, m4
    MULLD           m0, m2, m7
    MULLD           m1, m3, m7
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*2+4], m4
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m1
    add             wq, 8
    jl .v1_loop
    mov            r10, t2             ; exchange the t1 and t2 pointers via r10
    mov             t2, t1
    mov             t1, r10
    ret

;-----------------------------------------------------------------------
; .prep_n - initial neighbor setup
;
; wq starts at r4 (x86-64) or w1m (x86-32), step 8, loop while negative.
; Builds horizontally weighted sums of three adjacent elements (left,
; centre, right) and stores them for the .n0/.n1 passes:
;   t4+400*0 / t3+400*0  -> weights 5,6,5 -> t4+400* 6 / t3+400*12
;   t4+400*2 / t3+400*4  -> weights 3,4,3 -> t4+400* 8 / t3+400*16
;   t4+400*4 / t3+400*8  -> weights 4,4,4 -> t4+400*10 / t3+400*20
;                           weights 3,4,3 -> t4+400*12 / t3+400*24
; t4 holds 16-bit elements (paddw), t3 holds 32-bit elements (paddd), hence
; two t3 vectors per t4 vector.
;-----------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*2+400*0+ 2]
    movu            m1, [t3+wq*4+400*0+ 4]
    movu            m2, [t3+wq*4+400*0+20]
    movu            m7, [t4+wq*2+400*0+ 4]
    movu            m8, [t3+wq*4+400*0+ 8]
    paddw           m3, m0, [t4+wq*2+400*0+ 0]
    paddd           m4, m1, [t3+wq*4+400*0+ 0]
    paddd           m5, m2, [t3+wq*4+400*0+16]
    paddw           m3, m7             ; l + c + r
    paddd           m4, m8
    movu            m7, [t3+wq*4+400*0+24]
    paddw           m0, m3             ; l + 2c + r
    paddd           m1, m4
    psllw           m3, 2              ; 4 * (l + c + r)
    pslld           m4, 2
    paddd           m5, m7
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a5 565
    paddd           m1, m4             ; b5 565
    paddd           m2, m5
    mova [t4+wq*2+400* 6+ 0], m0
    mova [t3+wq*4+400*12+ 0], m1
    mova [t3+wq*4+400*12+16], m2
    movu            m0, [t4+wq*2+400*2+ 4]
    movu            m1, [t3+wq*4+400*4+ 8]
    movu            m2, [t3+wq*4+400*4+24]
    movu            m3, [t4+wq*2+400*2+ 2]
    movu            m4, [t3+wq*4+400*4+ 4]
    movu            m5, [t3+wq*4+400*4+20]
    paddw           m0, [t4+wq*2+400*2+ 0]     ; l + r
    paddd           m1, [t3+wq*4+400*4+ 0]
    paddd           m2, [t3+wq*4+400*4+16]
    paddw           m3, m0             ; l + c + r
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2              ; a3[-1] 444
    pslld           m4, 2              ; b3[-1] 444
    pslld           m5, 2
    psubw           m3, m0             ; a3[-1] 343
    psubd           m4, m1             ; b3[-1] 343
    psubd           m5, m2
    mova [t4+wq*2+400* 8+ 0], m3
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m5
    movu            m0, [t4+wq*2+400*4+ 4]
    movu            m1, [t3+wq*4+400*8+ 8]
    movu            m2, [t3+wq*4+400*8+24]
    movu            m3, [t4+wq*2+400*4+ 2]
    movu            m4, [t3+wq*4+400*8+ 4]
    movu            m5, [t3+wq*4+400*8+20]
    paddw           m0, [t4+wq*2+400*4+ 0]
    paddd           m1, [t3+wq*4+400*8+ 0]
    paddd           m2, [t3+wq*4+400*8+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2              ; a3[ 0] 444
    pslld           m4, 2              ; b3[ 0] 444
    pslld           m5, 2
    mova [t4+wq*2+400*10+ 0], m3
    mova [t3+wq*4+400*20+ 0], m4
    mova [t3+wq*4+400*20+16], m5
    psubw           m3, m0             ; a3[ 0] 343
    psubd           m4, m1             ; b3[ 0] 343
    psubd           m5, m2
    mova [t4+wq*2+400*12+ 0], m3
    mova [t3+wq*4+400*24+ 0], m4
    mova [t3+wq*4+400*24+16], m5
    add             wq, 8
    jl .prep_n_loop
    ret

;-----------------------------------------------------------------------
; .n0 - neighbor + output (even rows)
;
; wq starts at r4 (x86-64) or w1m (x86-32), step 8, loop while negative;
; 8 pixels at [dstq+wq] are read, filtered and written back per iteration,
; then dstq is advanced by stridemp.
;   * 5x5: the 565 sums of the current t4+0 / t3+0 row are added to the
;     previous 565 sums stored at t4+400*6 / t3+400*12, which are then
;     replaced by the new ones.
;   * 3x3: new 444/343 sums are built from t4+400*2 / t3+400*4 and added to
;     the stored values at t4+400*8,+400*10 / t3+400*16,+400*20; the new 343
;     sums go to the first slot and the new 444 sums to the second.
;   * The 5x5 result (>> 9) and the 3x3 result (<< 7) are merged per dword
;     with the m9 mask and combined by pmaddwd with m15 (m9/m15 are set up
;     outside this section - presumably a 16-bit mask and the filter
;     weights; confirm).
; High-half b sums are spilled to [rsp+16+ARCH_X86_32*4] and
; [rsp+32+ARCH_X86_32*4].
;-----------------------------------------------------------------------
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m0, [t4+wq*2+ 4]
    movu            m2, [t4+wq*2+ 2]
    paddw           m0, [t4+wq*2+ 0]
    paddw           m0, m2
    paddw           m2, m0
    psllw           m0, 2
    paddw           m0, m2             ; a5
    movu            m4, [t3+wq*4+ 8]
    movu            m5, [t3+wq*4+24]
    movu            m1, [t3+wq*4+ 4]
    movu            m3, [t3+wq*4+20]
    paddd           m4, [t3+wq*4+ 0]
    paddd           m5, [t3+wq*4+16]
    paddd           m4, m1
    paddd           m5, m3
    paddd           m1, m4
    paddd           m3, m5
    pslld           m4, 2
    pslld           m5, 2
    paddd           m4, m1             ; b5
    paddd           m5, m3
    movu            m2, [t4+wq*2+400* 6]
    paddw           m2, m0             ; previous + current a5
    mova [t4+wq*2+400* 6], m0
    paddd           m0, m4, [t3+wq*4+400*12+ 0]
    paddd           m1, m5, [t3+wq*4+400*12+16]
    mova [t3+wq*4+400*12+ 0], m4
    mova [t3+wq*4+400*12+16], m5
    mova [rsp+16+ARCH_X86_32*4], m1    ; spill high-half b5 sum
    movu            m3, [t4+wq*2+400*2+4]
    movu            m5, [t4+wq*2+400*2+2]
    paddw           m3, [t4+wq*2+400*2+0]
    paddw           m5, m3
    psllw           m5, 2              ; a3[ 1] 444
    psubw           m4, m5, m3         ; a3[ 1] 343
    movu            m3, [t4+wq*2+400* 8]
    paddw           m3, [t4+wq*2+400*10]
    paddw           m3, m4
    mova [t4+wq*2+400* 8], m4
    mova [t4+wq*2+400*10], m5
    movu            m1, [t3+wq*4+400*4+ 8]
    movu            m5, [t3+wq*4+400*4+ 4]
    movu            m7, [t3+wq*4+400*4+24]
    movu            m8, [t3+wq*4+400*4+20]
    paddd           m1, [t3+wq*4+400*4+ 0]
    paddd           m7, [t3+wq*4+400*4+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2              ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1         ; b3[ 1] 343
    ; Both the 444 and the 343 high halves are needed below: x86-32 parks
    ; the 444 value on the stack, x86-64 borrows m6 and renames.
%if ARCH_X86_32
    mova [esp+52], m8
    psubd           m8, m7
%else
    psubd           m6, m8, m7
    SWAP            m8, m6
%endif
    paddd           m1, m4, [t3+wq*4+400*16+ 0]
    paddd           m7, m8, [t3+wq*4+400*16+16]
    paddd           m1, [t3+wq*4+400*20+ 0]
    paddd           m7, [t3+wq*4+400*20+16]
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m8
    mova [t3+wq*4+400*20+ 0], m5
%if ARCH_X86_32
    mova            m8, [esp+52]
%else
    SWAP            m8, m6
    pxor            m6, m6             ; m6 was used as scratch; zero it again
%endif
    mova [t3+wq*4+400*20+16], m8
    mova [rsp+32+ARCH_X86_32*4], m7    ; spill high-half b3 sum
    movq            m4, [dstq+wq]
    punpcklbw       m4, m6             ; 8 source pixels widened to words
    punpcklwd       m5, m4, m6
    punpcklwd       m7, m2, m6
    pmaddwd         m7, m5             ; a5 * src
    punpcklwd       m8, m3, m6
    pmaddwd         m8, m5             ; a3 * src
    punpckhwd       m5, m4, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m5
    punpckhwd       m3, m6
    pmaddwd         m3, m5
    psubd           m0, m7             ; b5 - a5 * src + (1 << 8) - (src << 13)
    psubd           m1, m8             ; b3 - a3 * src + (1 << 8) - (src << 13)
    psrld           m0, 9
    pslld           m1, 7
    pand            m0, m9
    pandn           m8, m9, m1
    por             m0, m8             ; 5x5 and 3x3 terms merged per dword
    mova            m1, [rsp+16+ARCH_X86_32*4]
    psubd           m1, m2
    mova            m2, [rsp+32+ARCH_X86_32*4]
    psubd           m2, m3
    mova            m3, [base+pd_4096]
    psrld           m1, 9
    pslld           m2, 7
    pand            m1, m9
    pandn           m5, m9, m2
    por             m1, m5
    pmaddwd         m0, m15
    pmaddwd         m1, m15
    paddd           m0, m3             ; + 4096 before the >> 13 below
    paddd           m1, m3
    psrad           m0, 13
    psrad           m1, 13
    packssdw        m0, m1
    paddw           m0, m4             ; add the source pixels back
    packuswb        m0, m0
    movq [dstq+wq], m0
    add             wq, 8
    jl .n0_loop
    add           dstq, stridemp
    ret

;-----------------------------------------------------------------------
; .n1 - neighbor + output (odd rows)
;
; Same loop bounds and output path as .n0, with these differences visible
; in the code:
;   * 3x3: new 444/343 sums come from t4+400*4 / t3+400*8 and are combined
;     with the stored values at t4+400*10,+400*12 / t3+400*20,+400*24; the
;     new 444 sums go to the first slot and the new 343 sums to the second.
;   * 5x5: the stored sums at t4+400*6 / t3+400*12 are used as-is (no new
;     row is added), and the result is shifted right by 8 instead of 9.
;   * No stack spills are needed.
; After the loop dstq is advanced by stridemp and, on x86-32, written back
; to dstm.
;-----------------------------------------------------------------------
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*2+400*4+4]
    movu            m5, [t4+wq*2+400*4+2]
    paddw           m3, [t4+wq*2+400*4+0]
    paddw           m5, m3
    psllw           m5, 2              ; a3[ 1] 444
    psubw           m4, m5, m3         ; a3[ 1] 343
    paddw           m3, m4, [t4+wq*2+400*12]
    paddw           m3, [t4+wq*2+400*10]
    mova [t4+wq*2+400*10], m5
    mova [t4+wq*2+400*12], m4
    movu            m1, [t3+wq*4+400*8+ 8]
    movu            m5, [t3+wq*4+400*8+ 4]
    movu            m7, [t3+wq*4+400*8+24]
    movu            m8, [t3+wq*4+400*8+20]
    paddd           m1, [t3+wq*4+400*8+ 0]
    paddd           m7, [t3+wq*4+400*8+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2              ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1         ; b3[ 1] 343
    psubd           m0, m8, m7
    paddd           m1, m4, [t3+wq*4+400*24+ 0]
    paddd           m7, m0, [t3+wq*4+400*24+16]
    paddd           m1, [t3+wq*4+400*20+ 0]
    paddd           m7, [t3+wq*4+400*20+16]
    mova [t3+wq*4+400*20+ 0], m5
    mova [t3+wq*4+400*20+16], m8
    mova [t3+wq*4+400*24+ 0], m4
    mova [t3+wq*4+400*24+16], m0
    movq            m5, [dstq+wq]
    mova            m2, [t4+wq*2+400* 6]
    punpcklbw       m5, m6             ; 8 source pixels widened to words
    punpcklwd       m4, m5, m6
    punpcklwd       m8, m2, m6
    pmaddwd         m8, m4             ; a5 * src
    punpcklwd       m0, m3, m6
    pmaddwd         m0, m4             ; a3 * src
    punpckhwd       m4, m5, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m4
    punpckhwd       m3, m6
    pmaddwd         m3, m4
    psubd           m1, m0             ; b3 - a3 * src + (1 << 8) - (src << 13)
    mova            m0, [t3+wq*4+400*12+ 0]
    psubd           m0, m8             ; b5 - a5 * src + (1 << 8) - (src << 13)
    mova            m4, [t3+wq*4+400*12+16]
    psubd           m4, m2
    psubd           m7, m3
    pslld           m1, 7
    psrld           m0, 8
    psrld           m4, 8
    pslld           m7, 7
    pandn           m3, m9, m1
    pand            m0, m9
    por             m0, m3             ; 5x5 and 3x3 terms merged per dword
    pand            m4, m9
    pandn           m2, m9, m7
    por             m2, m4
    mova            m1, [base+pd_4096]
    pmaddwd         m0, m15
    pmaddwd         m2, m15
    paddd           m0, m1             ; + 4096 before the >> 13 below
    paddd           m2, m1
    psrad           m0, 13
    psrad           m2, 13
    packssdw        m0, m2
    paddw           m0, m5             ; add the source pixels back
    packuswb        m0, m0
    movq [dstq+wq], m0
    add             wq, 8
    jl .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret