; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; pshufb control for left-edge extension: the low lane replicates byte 4 into
; bytes 0-3, the high lane is the identity permutation.
wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
; Byte-pair gather patterns fed to pmaddubsw by the horizontal passes below.
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_r_ext:     times 16 db 1
               times 16 db 9
; -1 entries have bit 7 set, so pshufb writes zero there: each selected source
; byte is zero-extended to a word. The Wiener code loads this table at +6,
; which selects source bytes 4..11.
sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
               db  9, -1, 10, -1, 11, -1, 12, -1

pb_m5:         times 4 db -5
pb_3:          times 4 db 3
pw_5_6:        dw 5, 6
pw_164_24:     dw 164, 24
pw_455_24:     dw 455, 24
pw_256:        times 2 dw 256
pw_2056:       times 2 dw 2056
pw_m16380:     times 2 dw -16380
pd_25:         dd 25
pd_34816:      dd 34816
pd_m4096:      dd -4096
pf_256:        dd 256.0

cextern pb_0to63

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

;------------------------------------------------------------------------------
; wiener_filter7_8bpc (AVX2)
;
; Named arguments (x86inc cglobal): dst, stride, left, lpf, w, h, edge, flt.
; The prologue loads the first four; flt, h, edge and w are fetched explicitly
; from r6mp / hm / r7m / wm below.
; NOTE(review): the stack size is written as -384*12-16; see x86inc.asm for how
; cglobal treats a negative stack size.
;
; State set up here and live across all local subroutines:
;   m6-m8   wiener_shufA / wiener_shufB / wiener_shufC
;   m9      sgr_shuf+6 (zero-extends source bytes 4..11 of each lane to words)
;   m10     pw_m16380
;   m11     byte  at [flt+ 0], broadcast                     (x0 x0)
;   m12     dword at [flt+ 2], broadcast then packed to bytes (x1 x2)
;   m13     word  at [flt+ 6], broadcast                     (x3)
;   m14     dword at [flt+16], broadcast, words << 5         (y0 y1)
;   m15     dword at [flt+20], broadcast, words << 5         (y2 y3)
;   lpfq, dstq  advanced by w so rows are indexed with a negative offset
;   wq      negated width; r10 runs from wq towards 0 in steps of 32
;   t0-t6   row pointers into the stack scratch area, rows 384*2 bytes apart
;   [rsp]   saved source pointer for the rows below the block
;
; edge bits tested: 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT, 4 = LR_HAVE_TOP,
; 8 = LR_HAVE_BOTTOM.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt
    mov             fltq, r6mp
    movifnidn       hd, hm
    mov             edged, r7m
    mov             wd, wm
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastb    m11, [fltq+ 0] ; x0 x0
    vbroadcasti128  m7, [wiener_shufB]
    vpbroadcastd    m12, [fltq+ 2]
    vbroadcasti128  m8, [wiener_shufC]
    packsswb        m12, m12 ; x1 x2
    vpbroadcastw    m13, [fltq+ 6] ; x3
    vbroadcasti128  m9, [sgr_shuf+6]
    add             lpfq, wq
    vpbroadcastd    m10, [pw_m16380]
    vpbroadcastd    m14, [fltq+16] ; y0 y1
    add             dstq, wq
    vpbroadcastd    m15, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq*2+16]
    psllw           m14, 5
    neg             wq
    psllw           m15, 5
    test            edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; Two rows are filtered from lpf; t6 and t5 both point at the first one.
    call .h_top
    add             lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq ; from here on the source rows are read from dst
    mov             t4, t1
    add             t1, 384*2
    add             r10, strideq
    mov             [rsp], r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add             lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add             lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov             lpfq, [rsp] ; pointer saved during setup
    call .hv_bottom
    add             lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    ; No rows above: the first filtered row is shared by t6..t2.
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add             lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add             lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
; Reached when LR_HAVE_RIGHT is clear and r10d has passed the caller's
; threshold (-34 here, -33 in wiener_filter5_8bpc). Builds per-byte shuffle
; indices from the low byte of r10 and pb_3 / pb_m5, clamps them against
; pb_0to63 with pminub, and permutes m4 / m5 in place. Clobbers m0-m3.
; NOTE(review): presumably this replicates the last valid pixel - confirm.
.extend_right:
    movd            xm2, r10d
    vpbroadcastd    m0, [pb_3]
    vpbroadcastd    m1, [pb_m5]
    vpbroadcastb    m2, xm2
    mova            m3, [pb_0to63]
    psubb           m0, m2
    psubb           m1, m2
    pminub          m0, m3
    pminub          m1, m3
    pshufb          m4, m0
    pshufb          m5, m1
    ret
; Horizontal pass over one row at lpfq; 16-bit results are stored to [t1].
; With LR_HAVE_LEFT the first dword comes from leftq, which is then advanced.
.h:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd            xm4, [leftq]
    vpblendd        m4, [lpfq+r10-4], 0xfe ; dword 0 from left, 1-7 from the row
    add             leftq, 4
    jmp .h_main
.h_extend_left:
    ; Equivalent of a load from lpfq+r10-4 with the four leading bytes
    ; replaced by the first pixel of the row.
    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    palignr         m4, m5, 12
    pshufb          m4, [wiener_l_shuf]
    jmp .h_main
; Same as .h, except that leftq is neither read nor advanced.
.h_top:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10-4]
.h_main:
    movu            m5, [lpfq+r10+4]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m4, m6
    pmaddubsw       m0, m11
    pshufb          m1, m5, m6
    pmaddubsw       m1, m11
    pshufb          m2, m4, m7
    pmaddubsw       m2, m12
    pshufb          m3, m5, m7
    pmaddubsw       m3, m12
    paddw           m0, m2
    pshufb          m2, m4, m8
    pmaddubsw       m2, m12
    paddw           m1, m3
    pshufb          m3, m5, m8
    pmaddubsw       m3, m12
    pshufb          m4, m9
    paddw           m0, m2
    pmullw          m2, m4, m13
    pshufb          m5, m9
    paddw           m1, m3
    pmullw          m3, m5, m13
    psllw           m4, 7
    psllw           m5, 7
    paddw           m4, m10
    paddw           m5, m10
    paddw           m0, m2
    vpbroadcastd    m2, [pw_2056]
    paddw           m1, m3
    paddsw          m0, m4
    paddsw          m1, m5
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m2
    paddw           m1, m2
    mova            [t1+r10*2+ 0], m0
    mova            [t1+r10*2+32], m1
    add             r10, 32
    jl .h_loop
    ret
ALIGN function_align
; Advances lpfq by one row, runs the horizontal pass on it (result stored to
; [t0]), combines it with rows t1..t6 vertically, writes 32 output bytes per
; iteration to dst, then rotates the row pointers and advances dst.
.hv:
    add             lpfq, strideq
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd            xm4, [leftq]
    vpblendd        m4, [lpfq+r10-4], 0xfe
    add             leftq, 4
    jmp .hv_main
.hv_extend_left:
    movu            m4, [lpfq+r10-4]
    pshufb          m4, [wiener_l_shuf]
    jmp .hv_main
; Entry for the rows below the block: lpfq is not advanced and leftq is not
; used.
.hv_bottom:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10-4]
.hv_main:
    movu            m5, [lpfq+r10+4]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m4, m6
    pmaddubsw       m0, m11
    pshufb          m1, m5, m6
    pmaddubsw       m1, m11
    pshufb          m2, m4, m7
    pmaddubsw       m2, m12
    pshufb          m3, m5, m7
    pmaddubsw       m3, m12
    paddw           m0, m2
    pshufb          m2, m4, m8
    pmaddubsw       m2, m12
    paddw           m1, m3
    pshufb          m3, m5, m8
    pmaddubsw       m3, m12
    pshufb          m4, m9
    paddw           m0, m2
    pmullw          m2, m4, m13
    pshufb          m5, m9
    paddw           m1, m3
    pmullw          m3, m5, m13
    psllw           m4, 7
    psllw           m5, 7
    paddw           m4, m10
    paddw           m5, m10
    paddw           m0, m2
    paddw           m1, m3
    ; Vertical taps, first 16 columns: (t4+t2, t3) pair with m15 (y2 y3),
    ; (new row + t6, t5+t1) pair with m14 (y0 y1).
    mova            m2, [t4+r10*2]
    paddw           m2, [t2+r10*2]
    mova            m3, [t3+r10*2]
    paddsw          m0, m4
    vpbroadcastd    m4, [pw_2056]
    paddsw          m1, m5
    mova            m5, [t5+r10*2]
    paddw           m5, [t1+r10*2]
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m4
    paddw           m1, m4
    paddw           m4, m0, [t6+r10*2] ; t6 is read before the store to t0
    mova            [t0+r10*2], m0
    punpcklwd       m0, m2, m3
    pmaddwd         m0, m15
    punpckhwd       m2, m3
    pmaddwd         m2, m15
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m14
    punpckhwd       m4, m5
    pmaddwd         m4, m14
    paddd           m0, m3
    paddd           m4, m2
    ; Same computation for the next 16 columns (+32 bytes).
    mova            m2, [t4+r10*2+32]
    paddw           m2, [t2+r10*2+32]
    mova            m3, [t3+r10*2+32]
    mova            m5, [t5+r10*2+32]
    paddw           m5, [t1+r10*2+32]
    packuswb        m0, m4
    paddw           m4, m1, [t6+r10*2+32]
    mova            [t0+r10*2+32], m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m15
    punpckhwd       m2, m3
    pmaddwd         m2, m15
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m14
    punpckhwd       m4, m5
    pmaddwd         m4, m14
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova            [dstq+r10], m0
    add             r10, 32
    jl .hv_loop
    ; Rotate the ring. t0 receives the value just written to t6, so the next
    ; call stores its new row over the oldest one.
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add             dstq, strideq
    ret
; Vertical-only pass: no new source row is read. t1 is used twice (t1+t6 and
; t1+t5) where .hv would use the freshly filtered row. Overwrites m6-m9 as
; scratch. Rotates t6..t2 (t1 is kept) and advances dst.
.v:
    mov             r10, wq
.v_loop:
    mova            m2, [t4+r10*2+ 0]
    paddw           m2, [t2+r10*2+ 0]
    mova            m4, [t3+r10*2+ 0]
    mova            m6, [t1+r10*2+ 0]
    paddw           m8, m6, [t6+r10*2+ 0]
    paddw           m6, [t5+r10*2+ 0]
    mova            m3, [t4+r10*2+32]
    paddw           m3, [t2+r10*2+32]
    mova            m5, [t3+r10*2+32]
    mova            m7, [t1+r10*2+32]
    paddw           m9, m7, [t6+r10*2+32]
    paddw           m7, [t5+r10*2+32]
    punpcklwd       m0, m2, m4
    pmaddwd         m0, m15
    punpckhwd       m2, m4
    pmaddwd         m2, m15
    punpcklwd       m4, m8, m6
    pmaddwd         m4, m14
    punpckhwd       m6, m8, m6
    pmaddwd         m6, m14
    punpcklwd       m1, m3, m5
    pmaddwd         m1, m15
    punpckhwd       m3, m5
    pmaddwd         m3, m15
    punpcklwd       m5, m9, m7
    pmaddwd         m5, m14
    punpckhwd       m7, m9, m7
    pmaddwd         m7, m14
    paddd           m0, m4
    paddd           m2, m6
    paddd           m1, m5
    paddd           m3, m7
    packuswb        m0, m2
    packuswb        m1, m3
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova            [dstq+r10], m0
    add             r10, 32
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add             dstq, strideq
    ret

;------------------------------------------------------------------------------
; wiener_filter5_8bpc (AVX2)
;
; Same argument names as wiener_filter7_8bpc. Differences visible in the setup:
; no x0 coefficient and no wiener_shufA (m6/m7 = wiener_shufB/C, m8 =
; sgr_shuf+6), pw_m16380 / pw_2056 / wiener_l_shuf are kept in m9 / m10 / m11,
; and only the row pointers t0-t4 are used. .extend_right is shared with
; wiener_filter7_8bpc and is called through its mangled name.
;------------------------------------------------------------------------------
cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                  w, h, edge, flt
    mov             fltq, r6mp
    movifnidn       hd, hm
    mov             edged, r7m
    mov             wd, wm
    vbroadcasti128  m6, [wiener_shufB]
    vpbroadcastd    m12, [fltq+ 2]
    vbroadcasti128  m7, [wiener_shufC]
    packsswb        m12, m12 ; x1 x2
    vpbroadcastw    m13, [fltq+ 6] ; x3
    vbroadcasti128  m8, [sgr_shuf+6]
    add             lpfq, wq
    vpbroadcastd    m9, [pw_m16380]
    vpbroadcastd    m10, [pw_2056]
    mova            m11, [wiener_l_shuf]
    vpbroadcastd    m14, [fltq+16] ; __ y1
    add             dstq, wq
    vpbroadcastd    m15, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq*2+16]
    psllw           m14, 5
    neg             wq
    psllw           m15, 5
    test            edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add             lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    add             r10, strideq
    mov             [rsp], r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add             lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov             lpfq, [rsp]
    call .hv_bottom
    add             lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add             lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add             dstq, strideq
.v1:
    call .v
    jmp .end
.h:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd            xm4, [leftq]
    vpblendd        m4, [lpfq+r10-4], 0xfe
    add             leftq, 4
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    palignr         m4, m5, 12
    pshufb          m4, m11
    jmp .h_main
.h_top:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10-4]
.h_main:
    movu            m5, [lpfq+r10+4]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -33
    jl .h_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.h_have_right:
    pshufb          m0, m4, m6
    pmaddubsw       m0, m12
    pshufb          m1, m5, m6
    pmaddubsw       m1, m12
    pshufb          m2, m4, m7
    pmaddubsw       m2, m12
    pshufb          m3, m5, m7
    pmaddubsw       m3, m12
    pshufb          m4, m8
    paddw           m0, m2
    pmullw          m2, m4, m13
    pshufb          m5, m8
    paddw           m1, m3
    pmullw          m3, m5, m13
    psllw           m4, 7
    psllw           m5, 7
    paddw           m4, m9
    paddw           m5, m9
    paddw           m0, m2
    paddw           m1, m3
    paddsw          m0, m4
    paddsw          m1, m5
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m10
    paddw           m1, m10
    mova            [t1+r10*2+ 0], m0
    mova            [t1+r10*2+32], m1
    add             r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add             lpfq, strideq
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd            xm4, [leftq]
    vpblendd        m4, [lpfq+r10-4], 0xfe
    add             leftq, 4
    jmp .hv_main
.hv_extend_left:
    movu            m4, [lpfq+r10-4]
    pshufb          m4, m11
    jmp .hv_main
.hv_bottom:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10-4]
.hv_main:
    movu            m5, [lpfq+r10+4]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             r10d, -33
    jl .hv_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.hv_have_right:
    pshufb          m0, m4, m6
    pmaddubsw       m0, m12
Build Coastguard Worker pshufb m1, m5, m6 593*c0909341SAndroid Build Coastguard Worker pmaddubsw m1, m12 594*c0909341SAndroid Build Coastguard Worker pshufb m2, m4, m7 595*c0909341SAndroid Build Coastguard Worker pmaddubsw m2, m12 596*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m7 597*c0909341SAndroid Build Coastguard Worker pmaddubsw m3, m12 598*c0909341SAndroid Build Coastguard Worker pshufb m4, m8 599*c0909341SAndroid Build Coastguard Worker paddw m0, m2 600*c0909341SAndroid Build Coastguard Worker pmullw m2, m4, m13 601*c0909341SAndroid Build Coastguard Worker pshufb m5, m8 602*c0909341SAndroid Build Coastguard Worker paddw m1, m3 603*c0909341SAndroid Build Coastguard Worker pmullw m3, m5, m13 604*c0909341SAndroid Build Coastguard Worker psllw m4, 7 605*c0909341SAndroid Build Coastguard Worker psllw m5, 7 606*c0909341SAndroid Build Coastguard Worker paddw m4, m9 607*c0909341SAndroid Build Coastguard Worker paddw m5, m9 608*c0909341SAndroid Build Coastguard Worker paddw m0, m2 609*c0909341SAndroid Build Coastguard Worker paddw m1, m3 610*c0909341SAndroid Build Coastguard Worker mova m2, [t3+r10*2] 611*c0909341SAndroid Build Coastguard Worker paddw m2, [t1+r10*2] 612*c0909341SAndroid Build Coastguard Worker mova m3, [t2+r10*2] 613*c0909341SAndroid Build Coastguard Worker paddsw m0, m4 614*c0909341SAndroid Build Coastguard Worker paddsw m1, m5 615*c0909341SAndroid Build Coastguard Worker psraw m0, 3 616*c0909341SAndroid Build Coastguard Worker psraw m1, 3 617*c0909341SAndroid Build Coastguard Worker paddw m0, m10 618*c0909341SAndroid Build Coastguard Worker paddw m1, m10 619*c0909341SAndroid Build Coastguard Worker paddw m4, m0, [t4+r10*2] 620*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2], m0 621*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2, m3 622*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m15 623*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 624*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m15 
625*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m4 626*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14 627*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m4 628*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m14 629*c0909341SAndroid Build Coastguard Worker paddd m0, m3 630*c0909341SAndroid Build Coastguard Worker paddd m4, m2 631*c0909341SAndroid Build Coastguard Worker mova m2, [t3+r10*2+32] 632*c0909341SAndroid Build Coastguard Worker paddw m2, [t1+r10*2+32] 633*c0909341SAndroid Build Coastguard Worker mova m3, [t2+r10*2+32] 634*c0909341SAndroid Build Coastguard Worker packuswb m0, m4 635*c0909341SAndroid Build Coastguard Worker paddw m4, m1, [t4+r10*2+32] 636*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+32], m1 637*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 638*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m15 639*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 640*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m15 641*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m4 642*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14 643*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m4 644*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m14 645*c0909341SAndroid Build Coastguard Worker paddd m1, m3 646*c0909341SAndroid Build Coastguard Worker paddd m2, m4 647*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 648*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 649*c0909341SAndroid Build Coastguard Worker psrlw m1, 8 650*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 651*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], m0 652*c0909341SAndroid Build Coastguard Worker add r10, 32 653*c0909341SAndroid Build Coastguard Worker jl .hv_loop 654*c0909341SAndroid Build Coastguard Worker mov t4, t3 655*c0909341SAndroid Build Coastguard Worker mov t3, t2 656*c0909341SAndroid Build Coastguard Worker mov t2, t1 657*c0909341SAndroid Build 
Coastguard Worker mov t1, t0 658*c0909341SAndroid Build Coastguard Worker mov t0, t4 659*c0909341SAndroid Build Coastguard Worker add dstq, strideq 660*c0909341SAndroid Build Coastguard Worker ret 661*c0909341SAndroid Build Coastguard Worker.v: 662*c0909341SAndroid Build Coastguard Worker mov r10, wq 663*c0909341SAndroid Build Coastguard Worker psrld m13, m14, 16 ; y1 __ 664*c0909341SAndroid Build Coastguard Worker.v_loop: 665*c0909341SAndroid Build Coastguard Worker mova m6, [t1+r10*2+ 0] 666*c0909341SAndroid Build Coastguard Worker paddw m2, m6, [t3+r10*2+ 0] 667*c0909341SAndroid Build Coastguard Worker mova m4, [t2+r10*2+ 0] 668*c0909341SAndroid Build Coastguard Worker mova m7, [t1+r10*2+32] 669*c0909341SAndroid Build Coastguard Worker paddw m3, m7, [t3+r10*2+32] 670*c0909341SAndroid Build Coastguard Worker mova m5, [t2+r10*2+32] 671*c0909341SAndroid Build Coastguard Worker paddw m6, [t4+r10*2+ 0] 672*c0909341SAndroid Build Coastguard Worker paddw m7, [t4+r10*2+32] 673*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m2, m4 674*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m15 675*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m4 676*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m15 677*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m3, m5 678*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m15 679*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m5 680*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m15 681*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m7, m6 682*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m5, m14 683*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m6 684*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m7, m14 685*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m13 686*c0909341SAndroid Build Coastguard Worker pmaddwd m7, m13 687*c0909341SAndroid Build Coastguard Worker paddd m0, m4 688*c0909341SAndroid Build Coastguard Worker paddd m2, m6 689*c0909341SAndroid 
Build Coastguard Worker paddd m1, m5 690*c0909341SAndroid Build Coastguard Worker paddd m3, m7 691*c0909341SAndroid Build Coastguard Worker packuswb m0, m2 692*c0909341SAndroid Build Coastguard Worker packuswb m1, m3 693*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 694*c0909341SAndroid Build Coastguard Worker psrlw m1, 8 695*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 696*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], m0 697*c0909341SAndroid Build Coastguard Worker add r10, 32 698*c0909341SAndroid Build Coastguard Worker jl .v_loop 699*c0909341SAndroid Build Coastguard Worker ret 700*c0909341SAndroid Build Coastguard Worker 701*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_8bpc, 4, 12, 16, 400*24+16, dst, stride, left, lpf, \ 702*c0909341SAndroid Build Coastguard Worker w, h, edge, params 703*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 704*c0909341SAndroid Build Coastguard Worker mov wd, wm 705*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 706*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [sgr_shuf+0] 707*c0909341SAndroid Build Coastguard Worker mov edged, r7m 708*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [sgr_shuf+8] 709*c0909341SAndroid Build Coastguard Worker add lpfq, wq 710*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [sgr_shuf+2] 711*c0909341SAndroid Build Coastguard Worker add dstq, wq 712*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m11, [sgr_shuf+6] 713*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+16+400*12] 714*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, [paramsq+8] ; w0 715*c0909341SAndroid Build Coastguard Worker pxor m6, m6 716*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [paramsq+0] ; s0 717*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+20] 718*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pw_164_24] 719*c0909341SAndroid Build Coastguard Worker neg wq 
720*c0909341SAndroid Build Coastguard Worker vbroadcastss m14, [pf_256] 721*c0909341SAndroid Build Coastguard Worker psllw m7, 4 722*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [pd_m4096] 723*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 724*c0909341SAndroid Build Coastguard Worker jz .no_top 725*c0909341SAndroid Build Coastguard Worker call .h_top 726*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 727*c0909341SAndroid Build Coastguard Worker mov t2, t1 728*c0909341SAndroid Build Coastguard Worker call .top_fixup 729*c0909341SAndroid Build Coastguard Worker add t1, 400*6 730*c0909341SAndroid Build Coastguard Worker call .h_top 731*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 732*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 733*c0909341SAndroid Build Coastguard Worker add r10, strideq 734*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 735*c0909341SAndroid Build Coastguard Worker mov t0, t2 736*c0909341SAndroid Build Coastguard Worker dec hd 737*c0909341SAndroid Build Coastguard Worker jz .height1 738*c0909341SAndroid Build Coastguard Worker or edged, 16 739*c0909341SAndroid Build Coastguard Worker call .h 740*c0909341SAndroid Build Coastguard Worker.main: 741*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 742*c0909341SAndroid Build Coastguard Worker call .hv 743*c0909341SAndroid Build Coastguard Worker call .prep_n 744*c0909341SAndroid Build Coastguard Worker sub hd, 2 745*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 746*c0909341SAndroid Build Coastguard Worker.main_loop: 747*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 748*c0909341SAndroid Build Coastguard Worker test hd, hd 749*c0909341SAndroid Build Coastguard Worker jz .odd_height 750*c0909341SAndroid Build Coastguard Worker call .h 751*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 752*c0909341SAndroid Build Coastguard Worker call .hv 753*c0909341SAndroid Build 
Coastguard Worker call .n0 754*c0909341SAndroid Build Coastguard Worker call .n1 755*c0909341SAndroid Build Coastguard Worker sub hd, 2 756*c0909341SAndroid Build Coastguard Worker jge .main_loop 757*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 758*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 759*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 760*c0909341SAndroid Build Coastguard Worker call .h_top 761*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 762*c0909341SAndroid Build Coastguard Worker call .hv_bottom 763*c0909341SAndroid Build Coastguard Worker.end: 764*c0909341SAndroid Build Coastguard Worker call .n0 765*c0909341SAndroid Build Coastguard Worker call .n1 766*c0909341SAndroid Build Coastguard Worker.end2: 767*c0909341SAndroid Build Coastguard Worker RET 768*c0909341SAndroid Build Coastguard Worker.height1: 769*c0909341SAndroid Build Coastguard Worker call .hv 770*c0909341SAndroid Build Coastguard Worker call .prep_n 771*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 772*c0909341SAndroid Build Coastguard Worker.odd_height: 773*c0909341SAndroid Build Coastguard Worker call .hv 774*c0909341SAndroid Build Coastguard Worker call .n0 775*c0909341SAndroid Build Coastguard Worker call .n1 776*c0909341SAndroid Build Coastguard Worker.odd_height_end: 777*c0909341SAndroid Build Coastguard Worker call .v 778*c0909341SAndroid Build Coastguard Worker call .n0 779*c0909341SAndroid Build Coastguard Worker jmp .end2 780*c0909341SAndroid Build Coastguard Worker.extend_bottom: 781*c0909341SAndroid Build Coastguard Worker call .v 782*c0909341SAndroid Build Coastguard Worker jmp .end 783*c0909341SAndroid Build Coastguard Worker.no_top: 784*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 785*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 786*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 787*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 
788*c0909341SAndroid Build Coastguard Worker call .h 789*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*6] 790*c0909341SAndroid Build Coastguard Worker call .top_fixup 791*c0909341SAndroid Build Coastguard Worker dec hd 792*c0909341SAndroid Build Coastguard Worker jz .no_top_height1 793*c0909341SAndroid Build Coastguard Worker or edged, 16 794*c0909341SAndroid Build Coastguard Worker mov t0, t1 795*c0909341SAndroid Build Coastguard Worker mov t1, t2 796*c0909341SAndroid Build Coastguard Worker jmp .main 797*c0909341SAndroid Build Coastguard Worker.no_top_height1: 798*c0909341SAndroid Build Coastguard Worker call .v 799*c0909341SAndroid Build Coastguard Worker call .prep_n 800*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 801*c0909341SAndroid Build Coastguard Worker.extend_right: 802*c0909341SAndroid Build Coastguard Worker movd xm2, r10d 803*c0909341SAndroid Build Coastguard Worker mova m0, [sgr_r_ext] 804*c0909341SAndroid Build Coastguard Worker vpbroadcastb m2, xm2 805*c0909341SAndroid Build Coastguard Worker psubb m0, m2 806*c0909341SAndroid Build Coastguard Worker pminub m0, [pb_0to63] 807*c0909341SAndroid Build Coastguard Worker pshufb m5, m0 808*c0909341SAndroid Build Coastguard Worker ret 809*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 810*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 811*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 812*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 813*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [leftq] 814*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 815*c0909341SAndroid Build Coastguard Worker palignr xm5, xm0, 12 816*c0909341SAndroid Build Coastguard Worker add leftq, 4 817*c0909341SAndroid Build Coastguard Worker jmp .h_main 818*c0909341SAndroid Build Coastguard Worker.h_extend_left: 819*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 820*c0909341SAndroid Build Coastguard Worker pshufb 
xm5, [sgr_l_shuf] 821*c0909341SAndroid Build Coastguard Worker jmp .h_main 822*c0909341SAndroid Build Coastguard Worker.h_top: 823*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 824*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 825*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 826*c0909341SAndroid Build Coastguard Worker.h_loop: 827*c0909341SAndroid Build Coastguard Worker movu xm5, [lpfq+r10-2] 828*c0909341SAndroid Build Coastguard Worker.h_main: 829*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+r10+6], 1 830*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 831*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 832*c0909341SAndroid Build Coastguard Worker cmp r10d, -18 833*c0909341SAndroid Build Coastguard Worker jl .h_have_right 834*c0909341SAndroid Build Coastguard Worker call .extend_right 835*c0909341SAndroid Build Coastguard Worker.h_have_right: 836*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m8 837*c0909341SAndroid Build Coastguard Worker pmullw m4, m3, m3 838*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m9 839*c0909341SAndroid Build Coastguard Worker paddw m0, m3, m2 840*c0909341SAndroid Build Coastguard Worker shufps m3, m2, q2121 841*c0909341SAndroid Build Coastguard Worker paddw m0, m3 842*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m3 843*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 844*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 845*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 846*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m6 847*c0909341SAndroid Build Coastguard Worker paddd m1, m3 848*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m6 849*c0909341SAndroid Build Coastguard Worker paddd m2, m4 850*c0909341SAndroid Build Coastguard Worker pshufb m4, m5, m10 851*c0909341SAndroid Build Coastguard Worker paddw m0, m4 852*c0909341SAndroid Build Coastguard Worker pshufb 
m5, m11 853*c0909341SAndroid Build Coastguard Worker paddw m0, m5 ; sum 854*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 855*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 856*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 857*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 858*c0909341SAndroid Build Coastguard Worker test edgeb, 16 ; y > 0 859*c0909341SAndroid Build Coastguard Worker jz .h_loop_end 860*c0909341SAndroid Build Coastguard Worker paddw m0, [t1+r10*2+400*0] 861*c0909341SAndroid Build Coastguard Worker paddd m1, [t1+r10*2+400*2] 862*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+r10*2+400*4] 863*c0909341SAndroid Build Coastguard Worker.h_loop_end: 864*c0909341SAndroid Build Coastguard Worker paddd m1, m3 ; sumsq 865*c0909341SAndroid Build Coastguard Worker paddd m2, m4 866*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*0], m0 867*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*2], m1 868*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*4], m2 869*c0909341SAndroid Build Coastguard Worker add r10, 16 870*c0909341SAndroid Build Coastguard Worker jl .h_loop 871*c0909341SAndroid Build Coastguard Worker ret 872*c0909341SAndroid Build Coastguard Worker.top_fixup: 873*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 874*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row needs to be doubled 875*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10*2+400*0] 876*c0909341SAndroid Build Coastguard Worker mova m1, [t1+r10*2+400*2] 877*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10*2+400*4] 878*c0909341SAndroid Build Coastguard Worker paddw m0, m0 879*c0909341SAndroid Build Coastguard Worker paddd m1, m1 880*c0909341SAndroid Build Coastguard Worker paddd m2, m2 881*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400*0], m0 882*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400*2], m1 883*c0909341SAndroid 
Build Coastguard Worker mova [t2+r10*2+400*4], m2 884*c0909341SAndroid Build Coastguard Worker add r10, 16 885*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 886*c0909341SAndroid Build Coastguard Worker ret 887*c0909341SAndroid Build Coastguard WorkerALIGN function_align 888*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab 889*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 890*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 891*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 892*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [leftq] 893*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 894*c0909341SAndroid Build Coastguard Worker palignr xm5, xm0, 12 895*c0909341SAndroid Build Coastguard Worker add leftq, 4 896*c0909341SAndroid Build Coastguard Worker jmp .hv_main 897*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 898*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 899*c0909341SAndroid Build Coastguard Worker pshufb xm5, [sgr_l_shuf] 900*c0909341SAndroid Build Coastguard Worker jmp .hv_main 901*c0909341SAndroid Build Coastguard Worker.hv_bottom: 902*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 903*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 904*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 905*c0909341SAndroid Build Coastguard Worker.hv_loop: 906*c0909341SAndroid Build Coastguard Worker movu xm5, [lpfq+r10-2] 907*c0909341SAndroid Build Coastguard Worker.hv_main: 908*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+r10+6], 1 909*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 910*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 911*c0909341SAndroid Build Coastguard Worker cmp r10d, -18 912*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 913*c0909341SAndroid Build Coastguard Worker call .extend_right 
914*c0909341SAndroid Build Coastguard Worker.hv_have_right: 915*c0909341SAndroid Build Coastguard Worker pshufb m1, m5, m8 916*c0909341SAndroid Build Coastguard Worker pmullw m4, m1, m1 917*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m9 918*c0909341SAndroid Build Coastguard Worker paddw m0, m1, m3 919*c0909341SAndroid Build Coastguard Worker shufps m1, m3, q2121 920*c0909341SAndroid Build Coastguard Worker paddw m0, m1 921*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m1 922*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 923*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m1 924*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 925*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m4, m6 926*c0909341SAndroid Build Coastguard Worker paddd m2, m1 927*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m6 928*c0909341SAndroid Build Coastguard Worker paddd m3, m4 929*c0909341SAndroid Build Coastguard Worker pshufb m1, m5, m10 930*c0909341SAndroid Build Coastguard Worker paddw m0, m1 931*c0909341SAndroid Build Coastguard Worker pshufb m5, m11 932*c0909341SAndroid Build Coastguard Worker paddw m0, m5 ; h sum 933*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m1 934*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 935*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m1 936*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 937*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t1+r10*2+400*0] 938*c0909341SAndroid Build Coastguard Worker paddd m2, m4 ; h sumsq 939*c0909341SAndroid Build Coastguard Worker paddd m3, m5 940*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t1+r10*2+400*2] 941*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+r10*2+400*4] 942*c0909341SAndroid Build Coastguard Worker test hd, hd 943*c0909341SAndroid Build Coastguard Worker jz .hv_last_row 944*c0909341SAndroid Build Coastguard Worker.hv_main2: 945*c0909341SAndroid Build Coastguard Worker 
paddw m1, [t2+r10*2+400*0] ; hv sum 946*c0909341SAndroid Build Coastguard Worker paddd m4, [t2+r10*2+400*2] ; hv sumsq 947*c0909341SAndroid Build Coastguard Worker paddd m5, [t2+r10*2+400*4] 948*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+400*0], m0 949*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+400*2], m2 950*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+400*4], m3 951*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pd_25] 952*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 953*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 954*c0909341SAndroid Build Coastguard Worker pmulld m4, m2 ; a * 25 955*c0909341SAndroid Build Coastguard Worker pmulld m5, m2 956*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b * b 957*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 958*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 959*c0909341SAndroid Build Coastguard Worker psubd m5, m3 960*c0909341SAndroid Build Coastguard Worker pmulld m4, m12 ; p * s 961*c0909341SAndroid Build Coastguard Worker pmulld m5, m12 962*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m13 ; b * 164 963*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m13 964*c0909341SAndroid Build Coastguard Worker paddw m4, m13 965*c0909341SAndroid Build Coastguard Worker paddw m5, m13 966*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; z + 1 967*c0909341SAndroid Build Coastguard Worker psrld m5, 20 968*c0909341SAndroid Build Coastguard Worker cvtdq2ps m4, m4 969*c0909341SAndroid Build Coastguard Worker cvtdq2ps m5, m5 970*c0909341SAndroid Build Coastguard Worker rcpps m2, m4 ; 1 / (z + 1) 971*c0909341SAndroid Build Coastguard Worker rcpps m3, m5 972*c0909341SAndroid Build Coastguard Worker pcmpgtd m4, m14, m4 973*c0909341SAndroid Build Coastguard Worker pcmpgtd m5, m14, m5 974*c0909341SAndroid Build Coastguard Worker mulps m2, m14 ; 256 / (z + 1) 975*c0909341SAndroid Build Coastguard Worker mulps 
m3, m14 976*c0909341SAndroid Build Coastguard Worker psrld m4, 24 ; z < 255 ? 255 : 0 977*c0909341SAndroid Build Coastguard Worker psrld m5, 24 978*c0909341SAndroid Build Coastguard Worker cvtps2dq m2, m2 979*c0909341SAndroid Build Coastguard Worker cvtps2dq m3, m3 980*c0909341SAndroid Build Coastguard Worker pminsw m2, m4 ; x 981*c0909341SAndroid Build Coastguard Worker pminsw m3, m5 982*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [pd_34816] 983*c0909341SAndroid Build Coastguard Worker pmulld m0, m2 984*c0909341SAndroid Build Coastguard Worker pmulld m1, m3 985*c0909341SAndroid Build Coastguard Worker paddd m0, m4 ; x * b * 164 + (1 << 11) + (1 << 15) 986*c0909341SAndroid Build Coastguard Worker paddd m1, m4 987*c0909341SAndroid Build Coastguard Worker pand m0, m15 988*c0909341SAndroid Build Coastguard Worker pand m1, m15 989*c0909341SAndroid Build Coastguard Worker por m0, m2 ; a | (b << 12) 990*c0909341SAndroid Build Coastguard Worker por m1, m3 991*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 8], xm0 ; The neighbor calculations requires 992*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. 993*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but 994*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way. 
995*c0909341SAndroid Build Coastguard Worker add r10, 16 996*c0909341SAndroid Build Coastguard Worker jl .hv_loop 997*c0909341SAndroid Build Coastguard Worker mov t2, t1 998*c0909341SAndroid Build Coastguard Worker mov t1, t0 999*c0909341SAndroid Build Coastguard Worker mov t0, t2 1000*c0909341SAndroid Build Coastguard Worker ret 1001*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights 1002*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*0], m1 1003*c0909341SAndroid Build Coastguard Worker paddw m1, m0 1004*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*2], m4 1005*c0909341SAndroid Build Coastguard Worker paddd m4, m2 1006*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*4], m5 1007*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1008*c0909341SAndroid Build Coastguard Worker jmp .hv_main2 1009*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab 1010*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1011*c0909341SAndroid Build Coastguard Worker.v_loop: 1012*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10*2+400*0] 1013*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10*2+400*2] 1014*c0909341SAndroid Build Coastguard Worker mova m3, [t1+r10*2+400*4] 1015*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+r10*2+400*0] 1016*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t2+r10*2+400*2] 1017*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t2+r10*2+400*4] 1018*c0909341SAndroid Build Coastguard Worker paddw m0, m0 1019*c0909341SAndroid Build Coastguard Worker paddd m2, m2 1020*c0909341SAndroid Build Coastguard Worker paddd m3, m3 1021*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; hv sum 1022*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; hv sumsq 1023*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1024*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [pd_25] 1025*c0909341SAndroid Build Coastguard 
Worker punpcklwd m0, m1, m6 ; b 1026*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1027*c0909341SAndroid Build Coastguard Worker pmulld m4, m2 ; a * 25 1028*c0909341SAndroid Build Coastguard Worker pmulld m5, m2 1029*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b * b 1030*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 1031*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1032*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1033*c0909341SAndroid Build Coastguard Worker pmulld m4, m12 ; p * s 1034*c0909341SAndroid Build Coastguard Worker pmulld m5, m12 1035*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m13 ; b * 164 1036*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m13 1037*c0909341SAndroid Build Coastguard Worker paddw m4, m13 1038*c0909341SAndroid Build Coastguard Worker paddw m5, m13 1039*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; z + 1 1040*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1041*c0909341SAndroid Build Coastguard Worker cvtdq2ps m4, m4 1042*c0909341SAndroid Build Coastguard Worker cvtdq2ps m5, m5 1043*c0909341SAndroid Build Coastguard Worker rcpps m2, m4 ; 1 / (z + 1) 1044*c0909341SAndroid Build Coastguard Worker rcpps m3, m5 1045*c0909341SAndroid Build Coastguard Worker pcmpgtd m4, m14, m4 1046*c0909341SAndroid Build Coastguard Worker pcmpgtd m5, m14, m5 1047*c0909341SAndroid Build Coastguard Worker mulps m2, m14 ; 256 / (z + 1) 1048*c0909341SAndroid Build Coastguard Worker mulps m3, m14 1049*c0909341SAndroid Build Coastguard Worker psrld m4, 24 ; z < 255 ? 
255 : 0 1050*c0909341SAndroid Build Coastguard Worker psrld m5, 24 1051*c0909341SAndroid Build Coastguard Worker cvtps2dq m2, m2 1052*c0909341SAndroid Build Coastguard Worker cvtps2dq m3, m3 1053*c0909341SAndroid Build Coastguard Worker pminsw m2, m4 ; x 1054*c0909341SAndroid Build Coastguard Worker pminsw m3, m5 1055*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [pd_34816] 1056*c0909341SAndroid Build Coastguard Worker pmulld m0, m2 1057*c0909341SAndroid Build Coastguard Worker pmulld m1, m3 1058*c0909341SAndroid Build Coastguard Worker paddd m0, m4 ; x * b * 164 + (1 << 11) + (1 << 15) 1059*c0909341SAndroid Build Coastguard Worker paddd m1, m4 1060*c0909341SAndroid Build Coastguard Worker pand m0, m15 1061*c0909341SAndroid Build Coastguard Worker pand m1, m15 1062*c0909341SAndroid Build Coastguard Worker por m0, m2 ; a | (b << 12) 1063*c0909341SAndroid Build Coastguard Worker por m1, m3 1064*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 8], xm0 1065*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+40], m0, 1 1066*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+24], xm1 1067*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+56], m1, 1 1068*c0909341SAndroid Build Coastguard Worker add r10, 16 1069*c0909341SAndroid Build Coastguard Worker jl .v_loop 1070*c0909341SAndroid Build Coastguard Worker ret 1071*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup 1072*c0909341SAndroid Build Coastguard Worker mov r10, wq 1073*c0909341SAndroid Build Coastguard Worker.prep_n_loop: 1074*c0909341SAndroid Build Coastguard Worker movu m0, [t3+r10*4+ 4] 1075*c0909341SAndroid Build Coastguard Worker movu m1, [t3+r10*4+36] 1076*c0909341SAndroid Build Coastguard Worker paddd m2, m0, [t3+r10*4+ 0] 1077*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t3+r10*4+32] 1078*c0909341SAndroid Build Coastguard Worker paddd m2, [t3+r10*4+ 8] 1079*c0909341SAndroid Build Coastguard Worker paddd m3, [t3+r10*4+40] 
1080*c0909341SAndroid Build Coastguard Worker paddd m0, m2 1081*c0909341SAndroid Build Coastguard Worker pslld m2, 2 1082*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1083*c0909341SAndroid Build Coastguard Worker pslld m3, 2 1084*c0909341SAndroid Build Coastguard Worker paddd m2, m0 ; ab 565 1085*c0909341SAndroid Build Coastguard Worker paddd m3, m1 1086*c0909341SAndroid Build Coastguard Worker pandn m0, m15, m2 ; a 1087*c0909341SAndroid Build Coastguard Worker psrld m2, 12 ; b 1088*c0909341SAndroid Build Coastguard Worker pandn m1, m15, m3 1089*c0909341SAndroid Build Coastguard Worker psrld m3, 12 1090*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+ 0], m0 1091*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*8+ 0], m2 1092*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+32], m1 1093*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*8+32], m3 1094*c0909341SAndroid Build Coastguard Worker add r10, 16 1095*c0909341SAndroid Build Coastguard Worker jl .prep_n_loop 1096*c0909341SAndroid Build Coastguard Worker ret 1097*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1098*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows) 1099*c0909341SAndroid Build Coastguard Worker mov r10, wq 1100*c0909341SAndroid Build Coastguard Worker.n0_loop: 1101*c0909341SAndroid Build Coastguard Worker movu m0, [t3+r10*4+ 4] 1102*c0909341SAndroid Build Coastguard Worker movu m1, [t3+r10*4+36] 1103*c0909341SAndroid Build Coastguard Worker paddd m2, m0, [t3+r10*4+ 0] 1104*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t3+r10*4+32] 1105*c0909341SAndroid Build Coastguard Worker paddd m2, [t3+r10*4+ 8] 1106*c0909341SAndroid Build Coastguard Worker paddd m3, [t3+r10*4+40] 1107*c0909341SAndroid Build Coastguard Worker paddd m0, m2 1108*c0909341SAndroid Build Coastguard Worker pslld m2, 2 1109*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1110*c0909341SAndroid Build Coastguard Worker pslld 
m3, 2 1111*c0909341SAndroid Build Coastguard Worker paddd m2, m0 1112*c0909341SAndroid Build Coastguard Worker paddd m3, m1 1113*c0909341SAndroid Build Coastguard Worker pandn m0, m15, m2 1114*c0909341SAndroid Build Coastguard Worker psrld m2, 12 1115*c0909341SAndroid Build Coastguard Worker pandn m1, m15, m3 1116*c0909341SAndroid Build Coastguard Worker psrld m3, 12 1117*c0909341SAndroid Build Coastguard Worker paddd m4, m0, [t3+r10*4+400*4+ 0] ; a 1118*c0909341SAndroid Build Coastguard Worker paddd m5, m1, [t3+r10*4+400*4+32] 1119*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+ 0], m0 1120*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+32], m1 1121*c0909341SAndroid Build Coastguard Worker paddd m0, m2, [t3+r10*4+400*8+ 0] ; b 1122*c0909341SAndroid Build Coastguard Worker paddd m1, m3, [t3+r10*4+400*8+32] 1123*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*8+ 0], m2 1124*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*8+32], m3 1125*c0909341SAndroid Build Coastguard Worker pmovzxbd m2, [dstq+r10+0] 1126*c0909341SAndroid Build Coastguard Worker pmovzxbd m3, [dstq+r10+8] 1127*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m2 ; a * src 1128*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m3 1129*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 1130*c0909341SAndroid Build Coastguard Worker psubd m0, m4 ; b - a * src + (1 << 8) 1131*c0909341SAndroid Build Coastguard Worker psubd m1, m5 1132*c0909341SAndroid Build Coastguard Worker psrad m0, 9 1133*c0909341SAndroid Build Coastguard Worker psrad m1, 9 1134*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 1135*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 1136*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1137*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 1138*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 1139*c0909341SAndroid Build Coastguard Worker pshufd xm0, xm0, q3120 
1140*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], xm0 1141*c0909341SAndroid Build Coastguard Worker add r10, 16 1142*c0909341SAndroid Build Coastguard Worker jl .n0_loop 1143*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1144*c0909341SAndroid Build Coastguard Worker ret 1145*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1146*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows) 1147*c0909341SAndroid Build Coastguard Worker mov r10, wq 1148*c0909341SAndroid Build Coastguard Worker.n1_loop: 1149*c0909341SAndroid Build Coastguard Worker pmovzxbd m2, [dstq+r10+0] 1150*c0909341SAndroid Build Coastguard Worker pmovzxbd m3, [dstq+r10+8] 1151*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src 1152*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m3, [t3+r10*4+400*4+32] 1153*c0909341SAndroid Build Coastguard Worker mova m0, [t3+r10*4+400*8+ 0] ; b 1154*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*4+400*8+32] 1155*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 1156*c0909341SAndroid Build Coastguard Worker psubd m0, m4 ; b - a * src + (1 << 7) 1157*c0909341SAndroid Build Coastguard Worker psubd m1, m5 1158*c0909341SAndroid Build Coastguard Worker psrad m0, 8 1159*c0909341SAndroid Build Coastguard Worker psrad m1, 8 1160*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 1161*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 1162*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1163*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 1164*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 1165*c0909341SAndroid Build Coastguard Worker pshufd xm0, xm0, q3120 1166*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], xm0 1167*c0909341SAndroid Build Coastguard Worker add r10, 16 1168*c0909341SAndroid Build Coastguard Worker jl .n1_loop 1169*c0909341SAndroid Build Coastguard Worker add dstq, strideq 
1170*c0909341SAndroid Build Coastguard Worker ret 1171*c0909341SAndroid Build Coastguard Worker 1172*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_8bpc, 4, 14, 16, -400*28-16, dst, stride, left, lpf, \ 1173*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1174*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1175*c0909341SAndroid Build Coastguard Worker mov wd, wm 1176*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1177*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [sgr_shuf+2] 1178*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1179*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [sgr_shuf+4] 1180*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1181*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [sgr_shuf+6] 1182*c0909341SAndroid Build Coastguard Worker add dstq, wq 1183*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, [paramsq+10] ; w1 1184*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+16+400*12] 1185*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [paramsq+ 4] ; s1 1186*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1187*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [pw_455_24] 1188*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+20] 1189*c0909341SAndroid Build Coastguard Worker vbroadcastss m13, [pf_256] 1190*c0909341SAndroid Build Coastguard Worker neg wq 1191*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) 1192*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1193*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [pd_m4096] 1194*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1195*c0909341SAndroid Build Coastguard Worker jz .no_top 1196*c0909341SAndroid Build Coastguard Worker call .h_top 1197*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1198*c0909341SAndroid Build Coastguard Worker mov t2, t1 
1199*c0909341SAndroid Build Coastguard Worker add t1, 400*6 1200*c0909341SAndroid Build Coastguard Worker call .h_top 1201*c0909341SAndroid Build Coastguard Worker lea t4, [lpfq+strideq*4] 1202*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1203*c0909341SAndroid Build Coastguard Worker add t4, strideq 1204*c0909341SAndroid Build Coastguard Worker mov [rsp], t4 ; below 1205*c0909341SAndroid Build Coastguard Worker mov t0, t2 1206*c0909341SAndroid Build Coastguard Worker call .hv 1207*c0909341SAndroid Build Coastguard Worker.main: 1208*c0909341SAndroid Build Coastguard Worker mov t5, t3 1209*c0909341SAndroid Build Coastguard Worker add t3, 400*4 1210*c0909341SAndroid Build Coastguard Worker dec hd 1211*c0909341SAndroid Build Coastguard Worker jz .height1 1212*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1213*c0909341SAndroid Build Coastguard Worker call .hv 1214*c0909341SAndroid Build Coastguard Worker call .prep_n 1215*c0909341SAndroid Build Coastguard Worker dec hd 1216*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1217*c0909341SAndroid Build Coastguard Worker.main_loop: 1218*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1219*c0909341SAndroid Build Coastguard Worker call .hv 1220*c0909341SAndroid Build Coastguard Worker call .n 1221*c0909341SAndroid Build Coastguard Worker dec hd 1222*c0909341SAndroid Build Coastguard Worker jnz .main_loop 1223*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 1224*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1225*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 1226*c0909341SAndroid Build Coastguard Worker call .hv_bottom 1227*c0909341SAndroid Build Coastguard Worker call .n 1228*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1229*c0909341SAndroid Build Coastguard Worker call .hv_bottom 1230*c0909341SAndroid Build Coastguard Worker.end: 1231*c0909341SAndroid Build Coastguard Worker call .n 1232*c0909341SAndroid Build Coastguard 
Worker RET 1233*c0909341SAndroid Build Coastguard Worker.height1: 1234*c0909341SAndroid Build Coastguard Worker call .v 1235*c0909341SAndroid Build Coastguard Worker call .prep_n 1236*c0909341SAndroid Build Coastguard Worker mov t2, t1 1237*c0909341SAndroid Build Coastguard Worker call .v 1238*c0909341SAndroid Build Coastguard Worker jmp .end 1239*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1240*c0909341SAndroid Build Coastguard Worker call .v 1241*c0909341SAndroid Build Coastguard Worker call .n 1242*c0909341SAndroid Build Coastguard Worker mov t2, t1 1243*c0909341SAndroid Build Coastguard Worker call .v 1244*c0909341SAndroid Build Coastguard Worker jmp .end 1245*c0909341SAndroid Build Coastguard Worker.no_top: 1246*c0909341SAndroid Build Coastguard Worker lea t4, [lpfq+strideq*4] 1247*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1248*c0909341SAndroid Build Coastguard Worker lea t4, [t4+strideq*2] 1249*c0909341SAndroid Build Coastguard Worker mov [rsp], t4 1250*c0909341SAndroid Build Coastguard Worker call .h 1251*c0909341SAndroid Build Coastguard Worker lea t0, [t1+400*6] 1252*c0909341SAndroid Build Coastguard Worker mov t2, t1 1253*c0909341SAndroid Build Coastguard Worker call .v 1254*c0909341SAndroid Build Coastguard Worker jmp .main 1255*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 1256*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1257*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1258*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1259*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [leftq] 1260*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1261*c0909341SAndroid Build Coastguard Worker palignr xm5, xm0, 12 1262*c0909341SAndroid Build Coastguard Worker add leftq, 4 1263*c0909341SAndroid Build Coastguard Worker jmp .h_main 1264*c0909341SAndroid Build Coastguard Worker.h_extend_left: 1265*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 
1266*c0909341SAndroid Build Coastguard Worker pshufb xm5, [sgr_l_shuf] 1267*c0909341SAndroid Build Coastguard Worker jmp .h_main 1268*c0909341SAndroid Build Coastguard Worker.h_top: 1269*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1270*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1271*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1272*c0909341SAndroid Build Coastguard Worker.h_loop: 1273*c0909341SAndroid Build Coastguard Worker movu xm5, [lpfq+r10-2] 1274*c0909341SAndroid Build Coastguard Worker.h_main: 1275*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+r10+6], 1 1276*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1277*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 1278*c0909341SAndroid Build Coastguard Worker cmp r10d, -17 1279*c0909341SAndroid Build Coastguard Worker jl .h_have_right 1280*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1281*c0909341SAndroid Build Coastguard Worker.h_have_right: 1282*c0909341SAndroid Build Coastguard Worker pshufb m0, m5, m8 1283*c0909341SAndroid Build Coastguard Worker pmullw m2, m0, m0 1284*c0909341SAndroid Build Coastguard Worker pshufb m4, m5, m9 1285*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1286*c0909341SAndroid Build Coastguard Worker pshufb m5, m10 1287*c0909341SAndroid Build Coastguard Worker paddw m0, m5 ; sum 1288*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4, m5 1289*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1290*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 1291*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1292*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m6 1293*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m6 1294*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*0], m0 1295*c0909341SAndroid Build Coastguard Worker paddd m1, m3 ; sumsq 1296*c0909341SAndroid 
Build Coastguard Worker paddd m2, m4 1297*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*2], m1 1298*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*4], m2 1299*c0909341SAndroid Build Coastguard Worker add r10, 16 1300*c0909341SAndroid Build Coastguard Worker jl .h_loop 1301*c0909341SAndroid Build Coastguard Worker ret 1302*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1303*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab 1304*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1305*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1306*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 1307*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [leftq] 1308*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1309*c0909341SAndroid Build Coastguard Worker palignr xm5, xm0, 12 1310*c0909341SAndroid Build Coastguard Worker add leftq, 4 1311*c0909341SAndroid Build Coastguard Worker jmp .hv_main 1312*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 1313*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1314*c0909341SAndroid Build Coastguard Worker pshufb xm5, [sgr_l_shuf] 1315*c0909341SAndroid Build Coastguard Worker jmp .hv_main 1316*c0909341SAndroid Build Coastguard Worker.hv_bottom: 1317*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1318*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1319*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 1320*c0909341SAndroid Build Coastguard Worker.hv_loop: 1321*c0909341SAndroid Build Coastguard Worker movu xm5, [lpfq+r10-2] 1322*c0909341SAndroid Build Coastguard Worker.hv_main: 1323*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+r10+6], 1 1324*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1325*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 1326*c0909341SAndroid Build Coastguard Worker cmp 
r10d, -17 1327*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 1328*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1329*c0909341SAndroid Build Coastguard Worker.hv_have_right: 1330*c0909341SAndroid Build Coastguard Worker pshufb m0, m5, m8 1331*c0909341SAndroid Build Coastguard Worker pmullw m3, m0, m0 1332*c0909341SAndroid Build Coastguard Worker pshufb m1, m5, m9 1333*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1334*c0909341SAndroid Build Coastguard Worker pshufb m5, m10 1335*c0909341SAndroid Build Coastguard Worker paddw m0, m5 ; h sum 1336*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m1 1337*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1338*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m1 1339*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1340*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+r10*2+400*0] 1341*c0909341SAndroid Build Coastguard Worker paddw m1, [t1+r10*2+400*0] ; hv sum 1342*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 1343*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1344*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; h sumsq 1345*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1346*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+r10*2+400*2] 1347*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+r10*2+400*4] 1348*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+r10*2+400*2] ; hv sumsq 1349*c0909341SAndroid Build Coastguard Worker paddd m3, [t1+r10*2+400*4] 1350*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+400*0], m0 1351*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 1352*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1353*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+400*2], m4 1354*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 1355*c0909341SAndroid Build Coastguard Worker 
mova [t0+r10*2+400*4], m5 1356*c0909341SAndroid Build Coastguard Worker pslld m5, m3, 3 1357*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a * 9 1358*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b * b 1359*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1360*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 1361*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1362*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1363*c0909341SAndroid Build Coastguard Worker pmulld m4, m11 ; p * s 1364*c0909341SAndroid Build Coastguard Worker pmulld m5, m11 1365*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m12 ; b * 455 1366*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m12 1367*c0909341SAndroid Build Coastguard Worker paddw m4, m12 1368*c0909341SAndroid Build Coastguard Worker paddw m5, m12 1369*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; z + 1 1370*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1371*c0909341SAndroid Build Coastguard Worker cvtdq2ps m4, m4 1372*c0909341SAndroid Build Coastguard Worker cvtdq2ps m5, m5 1373*c0909341SAndroid Build Coastguard Worker rcpps m2, m4 ; 1 / (z + 1) 1374*c0909341SAndroid Build Coastguard Worker rcpps m3, m5 1375*c0909341SAndroid Build Coastguard Worker pcmpgtd m4, m13, m4 1376*c0909341SAndroid Build Coastguard Worker pcmpgtd m5, m13, m5 1377*c0909341SAndroid Build Coastguard Worker mulps m2, m13 ; 256 / (z + 1) 1378*c0909341SAndroid Build Coastguard Worker mulps m3, m13 1379*c0909341SAndroid Build Coastguard Worker psrld m4, 24 ; z < 255 ? 
255 : 0 1380*c0909341SAndroid Build Coastguard Worker psrld m5, 24 1381*c0909341SAndroid Build Coastguard Worker cvtps2dq m2, m2 1382*c0909341SAndroid Build Coastguard Worker cvtps2dq m3, m3 1383*c0909341SAndroid Build Coastguard Worker pminsw m2, m4 ; x 1384*c0909341SAndroid Build Coastguard Worker pminsw m3, m5 1385*c0909341SAndroid Build Coastguard Worker pmulld m0, m2 1386*c0909341SAndroid Build Coastguard Worker pmulld m1, m3 1387*c0909341SAndroid Build Coastguard Worker paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) 1388*c0909341SAndroid Build Coastguard Worker paddd m1, m14 1389*c0909341SAndroid Build Coastguard Worker pand m0, m15 1390*c0909341SAndroid Build Coastguard Worker pand m1, m15 1391*c0909341SAndroid Build Coastguard Worker por m0, m2 ; a | (b << 12) 1392*c0909341SAndroid Build Coastguard Worker por m1, m3 1393*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 8], xm0 1394*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+40], m0, 1 1395*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+24], xm1 1396*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+56], m1, 1 1397*c0909341SAndroid Build Coastguard Worker add r10, 16 1398*c0909341SAndroid Build Coastguard Worker jl .hv_loop 1399*c0909341SAndroid Build Coastguard Worker mov t2, t1 1400*c0909341SAndroid Build Coastguard Worker mov t1, t0 1401*c0909341SAndroid Build Coastguard Worker mov t0, t2 1402*c0909341SAndroid Build Coastguard Worker ret 1403*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab 1404*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1405*c0909341SAndroid Build Coastguard Worker.v_loop: 1406*c0909341SAndroid Build Coastguard Worker mova m1, [t1+r10*2+400*0] 1407*c0909341SAndroid Build Coastguard Worker paddw m1, m1 1408*c0909341SAndroid Build Coastguard Worker paddw m1, [t2+r10*2+400*0] ; hv sum 1409*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10*2+400*2] 1410*c0909341SAndroid Build Coastguard Worker 
mova m3, [t1+r10*2+400*4] 1411*c0909341SAndroid Build Coastguard Worker paddd m2, m2 1412*c0909341SAndroid Build Coastguard Worker paddd m3, m3 1413*c0909341SAndroid Build Coastguard Worker paddd m2, [t2+r10*2+400*2] ; hv sumsq 1414*c0909341SAndroid Build Coastguard Worker paddd m3, [t2+r10*2+400*4] 1415*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 1416*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1417*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 1418*c0909341SAndroid Build Coastguard Worker pslld m5, m3, 3 1419*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a * 9 1420*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b * b 1421*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1422*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 1423*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1424*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1425*c0909341SAndroid Build Coastguard Worker pmulld m4, m11 ; p * s 1426*c0909341SAndroid Build Coastguard Worker pmulld m5, m11 1427*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m12 ; b * 455 1428*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m12 1429*c0909341SAndroid Build Coastguard Worker paddw m4, m12 1430*c0909341SAndroid Build Coastguard Worker paddw m5, m12 1431*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; z + 1 1432*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1433*c0909341SAndroid Build Coastguard Worker cvtdq2ps m4, m4 1434*c0909341SAndroid Build Coastguard Worker cvtdq2ps m5, m5 1435*c0909341SAndroid Build Coastguard Worker rcpps m2, m4 ; 1 / (z + 1) 1436*c0909341SAndroid Build Coastguard Worker rcpps m3, m5 1437*c0909341SAndroid Build Coastguard Worker pcmpgtd m4, m13, m4 1438*c0909341SAndroid Build Coastguard Worker pcmpgtd m5, m13, m5 1439*c0909341SAndroid Build Coastguard Worker mulps m2, m13 ; 256 / (z + 1) 1440*c0909341SAndroid Build Coastguard Worker mulps m3, m13 
1441*c0909341SAndroid Build Coastguard Worker psrld m4, 24 ; z < 255 ? 255 : 0 1442*c0909341SAndroid Build Coastguard Worker psrld m5, 24 1443*c0909341SAndroid Build Coastguard Worker cvtps2dq m2, m2 1444*c0909341SAndroid Build Coastguard Worker cvtps2dq m3, m3 1445*c0909341SAndroid Build Coastguard Worker pminsw m2, m4 ; x 1446*c0909341SAndroid Build Coastguard Worker pminsw m3, m5 1447*c0909341SAndroid Build Coastguard Worker pmulld m0, m2 1448*c0909341SAndroid Build Coastguard Worker pmulld m1, m3 1449*c0909341SAndroid Build Coastguard Worker paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) 1450*c0909341SAndroid Build Coastguard Worker paddd m1, m14 1451*c0909341SAndroid Build Coastguard Worker pand m0, m15 1452*c0909341SAndroid Build Coastguard Worker pand m1, m15 1453*c0909341SAndroid Build Coastguard Worker por m0, m2 ; a | (b << 12) 1454*c0909341SAndroid Build Coastguard Worker por m1, m3 1455*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 8], xm0 1456*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+40], m0, 1 1457*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+24], xm1 1458*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+56], m1, 1 1459*c0909341SAndroid Build Coastguard Worker add r10, 16 1460*c0909341SAndroid Build Coastguard Worker jl .v_loop 1461*c0909341SAndroid Build Coastguard Worker ret 1462*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup 1463*c0909341SAndroid Build Coastguard Worker mov r10, wq 1464*c0909341SAndroid Build Coastguard Worker mov t4, t3 1465*c0909341SAndroid Build Coastguard Worker add t3, 400*4 1466*c0909341SAndroid Build Coastguard Worker.prep_n_loop: 1467*c0909341SAndroid Build Coastguard Worker mova m2, [t5+r10*4+0] 1468*c0909341SAndroid Build Coastguard Worker mova m3, [t4+r10*4+0] 1469*c0909341SAndroid Build Coastguard Worker paddd m2, [t5+r10*4+8] 1470*c0909341SAndroid Build Coastguard Worker paddd m3, [t4+r10*4+8] 1471*c0909341SAndroid Build 
Coastguard Worker paddd m0, m2, [t5+r10*4+4] 1472*c0909341SAndroid Build Coastguard Worker paddd m1, m3, [t4+r10*4+4] 1473*c0909341SAndroid Build Coastguard Worker pslld m0, 2 1474*c0909341SAndroid Build Coastguard Worker paddd m1, m1 ; ab[ 0] 222 1475*c0909341SAndroid Build Coastguard Worker psubd m0, m2 ; ab[-1] 343 1476*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4], m1 1477*c0909341SAndroid Build Coastguard Worker paddd m1, m1 1478*c0909341SAndroid Build Coastguard Worker mova [t5+r10*4], m0 1479*c0909341SAndroid Build Coastguard Worker psubd m1, m3 ; ab[ 0] 343 1480*c0909341SAndroid Build Coastguard Worker mova [t4+r10*4], m1 1481*c0909341SAndroid Build Coastguard Worker add r10, 8 1482*c0909341SAndroid Build Coastguard Worker jl .prep_n_loop 1483*c0909341SAndroid Build Coastguard Worker ret 1484*c0909341SAndroid Build Coastguard Worker; a+b are packed together in a single dword, but we can't do the 1485*c0909341SAndroid Build Coastguard Worker; full neighbor calculations before splitting them since we don't 1486*c0909341SAndroid Build Coastguard Worker; have sufficient precision. The solution is to do the calculations 1487*c0909341SAndroid Build Coastguard Worker; in two equal halves and split a and b before doing the final sum. 
1488*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1489*c0909341SAndroid Build Coastguard Worker.n: ; neighbor + output 1490*c0909341SAndroid Build Coastguard Worker mov r10, wq 1491*c0909341SAndroid Build Coastguard Worker.n_loop: 1492*c0909341SAndroid Build Coastguard Worker mova m4, [t3+r10*4+ 0] 1493*c0909341SAndroid Build Coastguard Worker paddd m4, [t3+r10*4+ 8] 1494*c0909341SAndroid Build Coastguard Worker paddd m5, m4, [t3+r10*4+ 4] 1495*c0909341SAndroid Build Coastguard Worker paddd m5, m5 ; ab[+1] 222 1496*c0909341SAndroid Build Coastguard Worker mova m2, [t3+r10*4+400*4+ 0] 1497*c0909341SAndroid Build Coastguard Worker paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 1498*c0909341SAndroid Build Coastguard Worker mova m3, [t3+r10*4+400*4+32] 1499*c0909341SAndroid Build Coastguard Worker paddd m1, m3, [t5+r10*4+32] 1500*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+ 0], m5 1501*c0909341SAndroid Build Coastguard Worker paddd m5, m5 1502*c0909341SAndroid Build Coastguard Worker psubd m5, m4 ; ab[+1] 343 1503*c0909341SAndroid Build Coastguard Worker mova [t5+r10*4+ 0], m5 1504*c0909341SAndroid Build Coastguard Worker paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 1505*c0909341SAndroid Build Coastguard Worker mova m4, [t3+r10*4+32] 1506*c0909341SAndroid Build Coastguard Worker paddd m4, [t3+r10*4+40] 1507*c0909341SAndroid Build Coastguard Worker paddd m5, m4, [t3+r10*4+36] 1508*c0909341SAndroid Build Coastguard Worker paddd m5, m5 1509*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+32], m5 1510*c0909341SAndroid Build Coastguard Worker paddd m5, m5 1511*c0909341SAndroid Build Coastguard Worker psubd m5, m4 1512*c0909341SAndroid Build Coastguard Worker mova [t5+r10*4+32], m5 1513*c0909341SAndroid Build Coastguard Worker pandn m4, m15, m0 1514*c0909341SAndroid Build Coastguard Worker psrld m0, 12 1515*c0909341SAndroid Build Coastguard Worker paddd m3, m5 1516*c0909341SAndroid Build Coastguard Worker pandn m5, m15, m2 
1517*c0909341SAndroid Build Coastguard Worker psrld m2, 12 1518*c0909341SAndroid Build Coastguard Worker paddd m4, m5 ; a 1519*c0909341SAndroid Build Coastguard Worker pandn m5, m15, m1 1520*c0909341SAndroid Build Coastguard Worker psrld m1, 12 1521*c0909341SAndroid Build Coastguard Worker paddd m0, m2 ; b + (1 << 8) 1522*c0909341SAndroid Build Coastguard Worker pandn m2, m15, m3 1523*c0909341SAndroid Build Coastguard Worker psrld m3, 12 1524*c0909341SAndroid Build Coastguard Worker paddd m5, m2 1525*c0909341SAndroid Build Coastguard Worker pmovzxbd m2, [dstq+r10+0] 1526*c0909341SAndroid Build Coastguard Worker paddd m1, m3 1527*c0909341SAndroid Build Coastguard Worker pmovzxbd m3, [dstq+r10+8] 1528*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m2 ; a * src 1529*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m3 1530*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 1531*c0909341SAndroid Build Coastguard Worker psubd m0, m4 ; b - a * src + (1 << 8) 1532*c0909341SAndroid Build Coastguard Worker psubd m1, m5 1533*c0909341SAndroid Build Coastguard Worker psrad m0, 9 1534*c0909341SAndroid Build Coastguard Worker psrad m1, 9 1535*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 1536*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7 1537*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1538*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 1539*c0909341SAndroid Build Coastguard Worker packuswb xm0, xm1 1540*c0909341SAndroid Build Coastguard Worker pshufd xm0, xm0, q3120 1541*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], xm0 1542*c0909341SAndroid Build Coastguard Worker add r10, 16 1543*c0909341SAndroid Build Coastguard Worker jl .n_loop 1544*c0909341SAndroid Build Coastguard Worker mov r10, t5 1545*c0909341SAndroid Build Coastguard Worker mov t5, t4 1546*c0909341SAndroid Build Coastguard Worker mov t4, r10 1547*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1548*c0909341SAndroid Build 
Coastguard Worker ret 1549*c0909341SAndroid Build Coastguard Worker 1550*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_8bpc, 4, 12, 16, 400*56+8, dst, stride, left, lpf, \ 1551*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1552*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1553*c0909341SAndroid Build Coastguard Worker mov wd, wm 1554*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1555*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1556*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m9, [sgr_shuf+0] 1557*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m10, [sgr_shuf+8] 1558*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1559*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m11, [sgr_shuf+2] 1560*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m12, [sgr_shuf+6] 1561*c0909341SAndroid Build Coastguard Worker add dstq, wq 1562*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [paramsq+8] ; w0 w1 1563*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+400*24+8] 1564*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [paramsq+0] ; s0 1565*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1566*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [paramsq+4] ; s1 1567*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+12] 1568*c0909341SAndroid Build Coastguard Worker neg wq 1569*c0909341SAndroid Build Coastguard Worker psllw m15, 2 ; to reuse existing pd_m4096 register for rounding 1570*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1571*c0909341SAndroid Build Coastguard Worker jz .no_top 1572*c0909341SAndroid Build Coastguard Worker call .h_top 1573*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1574*c0909341SAndroid Build Coastguard Worker mov t2, t1 1575*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup 1576*c0909341SAndroid Build 
Coastguard Worker add t1, 400*12 1577*c0909341SAndroid Build Coastguard Worker call .h_top 1578*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1579*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1580*c0909341SAndroid Build Coastguard Worker add r10, strideq 1581*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 1582*c0909341SAndroid Build Coastguard Worker call .hv0 1583*c0909341SAndroid Build Coastguard Worker.main: 1584*c0909341SAndroid Build Coastguard Worker dec hd 1585*c0909341SAndroid Build Coastguard Worker jz .height1 1586*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1587*c0909341SAndroid Build Coastguard Worker call .hv1 1588*c0909341SAndroid Build Coastguard Worker call .prep_n 1589*c0909341SAndroid Build Coastguard Worker sub hd, 2 1590*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 1591*c0909341SAndroid Build Coastguard Worker.main_loop: 1592*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1593*c0909341SAndroid Build Coastguard Worker call .hv0 1594*c0909341SAndroid Build Coastguard Worker test hd, hd 1595*c0909341SAndroid Build Coastguard Worker jz .odd_height 1596*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1597*c0909341SAndroid Build Coastguard Worker call .hv1 1598*c0909341SAndroid Build Coastguard Worker call .n0 1599*c0909341SAndroid Build Coastguard Worker call .n1 1600*c0909341SAndroid Build Coastguard Worker sub hd, 2 1601*c0909341SAndroid Build Coastguard Worker jge .main_loop 1602*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 1603*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1604*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 1605*c0909341SAndroid Build Coastguard Worker call .hv0_bottom 1606*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1607*c0909341SAndroid Build Coastguard Worker call .hv1_bottom 1608*c0909341SAndroid Build Coastguard Worker.end: 1609*c0909341SAndroid Build Coastguard 
Worker call .n0 1610*c0909341SAndroid Build Coastguard Worker call .n1 1611*c0909341SAndroid Build Coastguard Worker.end2: 1612*c0909341SAndroid Build Coastguard Worker RET 1613*c0909341SAndroid Build Coastguard Worker.height1: 1614*c0909341SAndroid Build Coastguard Worker call .v1 1615*c0909341SAndroid Build Coastguard Worker call .prep_n 1616*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1617*c0909341SAndroid Build Coastguard Worker.odd_height: 1618*c0909341SAndroid Build Coastguard Worker call .v1 1619*c0909341SAndroid Build Coastguard Worker call .n0 1620*c0909341SAndroid Build Coastguard Worker call .n1 1621*c0909341SAndroid Build Coastguard Worker.odd_height_end: 1622*c0909341SAndroid Build Coastguard Worker call .v0 1623*c0909341SAndroid Build Coastguard Worker call .v1 1624*c0909341SAndroid Build Coastguard Worker call .n0 1625*c0909341SAndroid Build Coastguard Worker jmp .end2 1626*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1627*c0909341SAndroid Build Coastguard Worker call .v0 1628*c0909341SAndroid Build Coastguard Worker call .v1 1629*c0909341SAndroid Build Coastguard Worker jmp .end 1630*c0909341SAndroid Build Coastguard Worker.no_top: 1631*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1632*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1633*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 1634*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 1635*c0909341SAndroid Build Coastguard Worker call .h 1636*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*12] 1637*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1638*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: 1639*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10*2+400* 0] 1640*c0909341SAndroid Build Coastguard Worker mova m1, [t1+r10*2+400* 2] 1641*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10*2+400* 4] 1642*c0909341SAndroid Build Coastguard Worker paddw m0, m0 
1643*c0909341SAndroid Build Coastguard Worker mova m3, [t1+r10*2+400* 6] 1644*c0909341SAndroid Build Coastguard Worker paddd m1, m1 1645*c0909341SAndroid Build Coastguard Worker mova m4, [t1+r10*2+400* 8] 1646*c0909341SAndroid Build Coastguard Worker paddd m2, m2 1647*c0909341SAndroid Build Coastguard Worker mova m5, [t1+r10*2+400*10] 1648*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 0], m0 1649*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 2], m1 1650*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 4], m2 1651*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 6], m3 1652*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 8], m4 1653*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400*10], m5 1654*c0909341SAndroid Build Coastguard Worker add r10, 16 1655*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 1656*c0909341SAndroid Build Coastguard Worker call .v0 1657*c0909341SAndroid Build Coastguard Worker jmp .main 1658*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsums 1659*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1660*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1661*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1662*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [leftq] 1663*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1664*c0909341SAndroid Build Coastguard Worker palignr xm5, xm0, 12 1665*c0909341SAndroid Build Coastguard Worker add leftq, 4 1666*c0909341SAndroid Build Coastguard Worker jmp .h_main 1667*c0909341SAndroid Build Coastguard Worker.h_extend_left: 1668*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1669*c0909341SAndroid Build Coastguard Worker pshufb xm5, [sgr_l_shuf] 1670*c0909341SAndroid Build Coastguard Worker jmp .h_main 1671*c0909341SAndroid Build Coastguard Worker.h_top: 1672*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1673*c0909341SAndroid 
Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1674*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1675*c0909341SAndroid Build Coastguard Worker.h_loop: 1676*c0909341SAndroid Build Coastguard Worker movu xm5, [lpfq+r10-2] 1677*c0909341SAndroid Build Coastguard Worker.h_main: 1678*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+r10+6], 1 1679*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1680*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 1681*c0909341SAndroid Build Coastguard Worker cmp r10d, -18 1682*c0909341SAndroid Build Coastguard Worker jl .h_have_right 1683*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1684*c0909341SAndroid Build Coastguard Worker.h_have_right: 1685*c0909341SAndroid Build Coastguard Worker pshufb m6, m5, m9 1686*c0909341SAndroid Build Coastguard Worker pshufb m4, m5, m10 1687*c0909341SAndroid Build Coastguard Worker paddw m8, m6, m4 1688*c0909341SAndroid Build Coastguard Worker shufps m0, m6, m4, q2121 1689*c0909341SAndroid Build Coastguard Worker pmullw m3, m0, m0 1690*c0909341SAndroid Build Coastguard Worker pshufb m2, m5, m11 1691*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1692*c0909341SAndroid Build Coastguard Worker pshufb m5, m12 1693*c0909341SAndroid Build Coastguard Worker paddw m0, m5 ; sum3 1694*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m5 1695*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1696*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m5 1697*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1698*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m4 1699*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1700*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m4 1701*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m6 1702*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3, m7 1703*c0909341SAndroid Build Coastguard 
Worker paddd m1, m4 ; sumsq3 1704*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m7 1705*c0909341SAndroid Build Coastguard Worker paddd m2, m3 1706*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 6], m0 1707*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 8], m1 1708*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*10], m2 1709*c0909341SAndroid Build Coastguard Worker paddw m8, m0 ; sum5 1710*c0909341SAndroid Build Coastguard Worker paddd m5, m1 ; sumsq5 1711*c0909341SAndroid Build Coastguard Worker paddd m6, m2 1712*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 0], m8 1713*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 2], m5 1714*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 4], m6 1715*c0909341SAndroid Build Coastguard Worker add r10, 16 1716*c0909341SAndroid Build Coastguard Worker jl .h_loop 1717*c0909341SAndroid Build Coastguard Worker ret 1718*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1719*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) 1720*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1721*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1722*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 1723*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [leftq] 1724*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1725*c0909341SAndroid Build Coastguard Worker palignr xm5, xm0, 12 1726*c0909341SAndroid Build Coastguard Worker add leftq, 4 1727*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 1728*c0909341SAndroid Build Coastguard Worker.hv0_extend_left: 1729*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1730*c0909341SAndroid Build Coastguard Worker pshufb xm5, [sgr_l_shuf] 1731*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 1732*c0909341SAndroid Build Coastguard Worker.hv0_bottom: 1733*c0909341SAndroid Build 
Coastguard Worker lea r10, [wq-2] 1734*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1735*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 1736*c0909341SAndroid Build Coastguard Worker.hv0_loop: 1737*c0909341SAndroid Build Coastguard Worker movu xm5, [lpfq+r10-2] 1738*c0909341SAndroid Build Coastguard Worker.hv0_main: 1739*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+r10+6], 1 1740*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1741*c0909341SAndroid Build Coastguard Worker jnz .hv0_have_right 1742*c0909341SAndroid Build Coastguard Worker cmp r10d, -18 1743*c0909341SAndroid Build Coastguard Worker jl .hv0_have_right 1744*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1745*c0909341SAndroid Build Coastguard Worker.hv0_have_right: 1746*c0909341SAndroid Build Coastguard Worker pshufb m6, m5, m9 1747*c0909341SAndroid Build Coastguard Worker pshufb m4, m5, m10 1748*c0909341SAndroid Build Coastguard Worker paddw m8, m6, m4 1749*c0909341SAndroid Build Coastguard Worker shufps m1, m6, m4, q2121 1750*c0909341SAndroid Build Coastguard Worker pmullw m0, m1, m1 1751*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m11 1752*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1753*c0909341SAndroid Build Coastguard Worker pshufb m5, m12 1754*c0909341SAndroid Build Coastguard Worker paddw m1, m5 ; sum3 1755*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m5 1756*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1757*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m5 1758*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1759*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m6, m4 1760*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1761*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m4 1762*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m6 1763*c0909341SAndroid Build Coastguard 
Worker punpcklwd m4, m0, m7 1764*c0909341SAndroid Build Coastguard Worker paddd m2, m4 ; sumsq3 1765*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m7 1766*c0909341SAndroid Build Coastguard Worker paddd m3, m0 1767*c0909341SAndroid Build Coastguard Worker paddw m8, m1 ; sum5 1768*c0909341SAndroid Build Coastguard Worker paddd m5, m2 ; sumsq5 1769*c0909341SAndroid Build Coastguard Worker paddd m6, m3 1770*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row 1771*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd 1772*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*0+40], m6 1773*c0909341SAndroid Build Coastguard Worker paddw m8, [t1+r10*2+400* 0] 1774*c0909341SAndroid Build Coastguard Worker paddd m5, [t1+r10*2+400* 2] 1775*c0909341SAndroid Build Coastguard Worker paddd m6, [t1+r10*2+400* 4] 1776*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 0], m8 1777*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 2], m5 1778*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 4], m6 1779*c0909341SAndroid Build Coastguard Worker paddw m0, m1, [t1+r10*2+400* 6] 1780*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t1+r10*2+400* 8] 1781*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+r10*2+400*10] 1782*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 6], m1 1783*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400* 8], m2 1784*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+400*10], m3 1785*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+r10*2+400* 6] 1786*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+r10*2+400* 8] 1787*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+r10*2+400*10] 1788*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 6], m0 1789*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 8], m4 1790*c0909341SAndroid 
Build Coastguard Worker mova [t2+r10*2+400*10], m5 1791*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pw_455_24] 1792*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m7 ; b3 1793*c0909341SAndroid Build Coastguard Worker vbroadcastss m6, [pf_256] 1794*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 1795*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 1796*c0909341SAndroid Build Coastguard Worker pslld m5, m3, 3 1797*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; a3 * 9 1798*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b3 * b3 1799*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1800*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 1801*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p3 1802*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1803*c0909341SAndroid Build Coastguard Worker pmulld m4, m14 ; p3 * s1 1804*c0909341SAndroid Build Coastguard Worker pmulld m5, m14 1805*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m8 ; b3 * 455 1806*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m8 1807*c0909341SAndroid Build Coastguard Worker paddw m4, m8 1808*c0909341SAndroid Build Coastguard Worker paddw m5, m8 1809*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pd_34816] 1810*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; z3 + 1 1811*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1812*c0909341SAndroid Build Coastguard Worker cvtdq2ps m4, m4 1813*c0909341SAndroid Build Coastguard Worker cvtdq2ps m5, m5 1814*c0909341SAndroid Build Coastguard Worker rcpps m2, m4 ; 1 / (z3 + 1) 1815*c0909341SAndroid Build Coastguard Worker rcpps m3, m5 1816*c0909341SAndroid Build Coastguard Worker pcmpgtd m4, m6, m4 1817*c0909341SAndroid Build Coastguard Worker pcmpgtd m5, m6, m5 1818*c0909341SAndroid Build Coastguard Worker mulps m2, m6 ; 256 / (z3 + 1) 1819*c0909341SAndroid Build Coastguard Worker mulps m3, m6 1820*c0909341SAndroid Build Coastguard Worker
vpbroadcastd m6, [pd_m4096] 1821*c0909341SAndroid Build Coastguard Worker psrld m4, 24 ; z3 < 255 ? 255 : 0 1822*c0909341SAndroid Build Coastguard Worker psrld m5, 24 1823*c0909341SAndroid Build Coastguard Worker cvtps2dq m2, m2 1824*c0909341SAndroid Build Coastguard Worker cvtps2dq m3, m3 1825*c0909341SAndroid Build Coastguard Worker pminsw m2, m4 ; x3 1826*c0909341SAndroid Build Coastguard Worker pminsw m3, m5 1827*c0909341SAndroid Build Coastguard Worker pmulld m0, m2 1828*c0909341SAndroid Build Coastguard Worker pmulld m1, m3 1829*c0909341SAndroid Build Coastguard Worker paddd m0, m8 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 1830*c0909341SAndroid Build Coastguard Worker paddd m1, m8 1831*c0909341SAndroid Build Coastguard Worker pand m0, m6 1832*c0909341SAndroid Build Coastguard Worker pand m1, m6 1833*c0909341SAndroid Build Coastguard Worker por m0, m2 ; a3 | (b3 << 12) 1834*c0909341SAndroid Build Coastguard Worker por m1, m3 1835*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+ 8], xm0 1836*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+400*4+40], m0, 1 1837*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*4+24], xm1 1838*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+400*4+56], m1, 1 1839*c0909341SAndroid Build Coastguard Worker add r10, 16 1840*c0909341SAndroid Build Coastguard Worker jl .hv0_loop 1841*c0909341SAndroid Build Coastguard Worker ret 1842*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1843*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 1844*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1845*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1846*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 1847*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [leftq] 1848*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1849*c0909341SAndroid Build Coastguard Worker 
palignr xm5, xm0, 12 1850*c0909341SAndroid Build Coastguard Worker add leftq, 4 1851*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 1852*c0909341SAndroid Build Coastguard Worker.hv1_extend_left: 1853*c0909341SAndroid Build Coastguard Worker mova xm5, [lpfq+wq] 1854*c0909341SAndroid Build Coastguard Worker pshufb xm5, [sgr_l_shuf] 1855*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 1856*c0909341SAndroid Build Coastguard Worker.hv1_bottom: 1857*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 1858*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1859*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 1860*c0909341SAndroid Build Coastguard Worker.hv1_loop: 1861*c0909341SAndroid Build Coastguard Worker movu xm5, [lpfq+r10-2] 1862*c0909341SAndroid Build Coastguard Worker.hv1_main: 1863*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+r10+6], 1 1864*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1865*c0909341SAndroid Build Coastguard Worker jnz .hv1_have_right 1866*c0909341SAndroid Build Coastguard Worker cmp r10d, -18 1867*c0909341SAndroid Build Coastguard Worker jl .hv1_have_right 1868*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1869*c0909341SAndroid Build Coastguard Worker.hv1_have_right: 1870*c0909341SAndroid Build Coastguard Worker pshufb m6, m5, m9 1871*c0909341SAndroid Build Coastguard Worker pshufb m3, m5, m10 1872*c0909341SAndroid Build Coastguard Worker paddw m8, m6, m3 1873*c0909341SAndroid Build Coastguard Worker shufps m2, m6, m3, q2121 1874*c0909341SAndroid Build Coastguard Worker pmullw m1, m2, m2 1875*c0909341SAndroid Build Coastguard Worker pshufb m0, m5, m11 1876*c0909341SAndroid Build Coastguard Worker paddw m2, m0 1877*c0909341SAndroid Build Coastguard Worker pshufb m5, m12 1878*c0909341SAndroid Build Coastguard Worker paddw m2, m5 ; sum3 1879*c0909341SAndroid Build Coastguard Worker punpcklwd 
m4, m5, m0 1880*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1881*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m0 1882*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1883*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m6, m3 1884*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 1885*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m3 1886*c0909341SAndroid Build Coastguard Worker pmaddwd m6, m6 1887*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m1, m7 1888*c0909341SAndroid Build Coastguard Worker paddd m4, m3 ; sumsq3 1889*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 1890*c0909341SAndroid Build Coastguard Worker paddd m5, m1 1891*c0909341SAndroid Build Coastguard Worker paddw m1, m2, [t2+r10*2+400* 6] 1892*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 6], m2 1893*c0909341SAndroid Build Coastguard Worker paddw m8, m2 ; sum5 1894*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+r10*2+400* 8] 1895*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+r10*2+400*10] 1896*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400* 8], m4 1897*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400*10], m5 1898*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [pw_455_24] 1899*c0909341SAndroid Build Coastguard Worker paddd m4, m0 ; sumsq5 1900*c0909341SAndroid Build Coastguard Worker paddd m5, m6 1901*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m7 ; b3 1902*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 1903*c0909341SAndroid Build Coastguard Worker pslld m6, m2, 3 1904*c0909341SAndroid Build Coastguard Worker pslld m7, m3, 3 1905*c0909341SAndroid Build Coastguard Worker paddd m6, m2 ; a3 * 9 1906*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, m0 ; b3 * b3 1907*c0909341SAndroid Build Coastguard Worker paddd m7, m3 1908*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, m1 1909*c0909341SAndroid Build Coastguard Worker 
psubd m6, m2 ; p3 1910*c0909341SAndroid Build Coastguard Worker psubd m7, m3 1911*c0909341SAndroid Build Coastguard Worker pmulld m6, m14 ; p3 * s1 1912*c0909341SAndroid Build Coastguard Worker pmulld m7, m14 1913*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m9 ; b3 * 455 1914*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m9 1915*c0909341SAndroid Build Coastguard Worker paddw m6, m9 1916*c0909341SAndroid Build Coastguard Worker paddw m7, m9 1917*c0909341SAndroid Build Coastguard Worker vbroadcastss m9, [pf_256] 1918*c0909341SAndroid Build Coastguard Worker psrld m6, 20 ; z3 + 1 1919*c0909341SAndroid Build Coastguard Worker psrld m7, 20 1920*c0909341SAndroid Build Coastguard Worker cvtdq2ps m6, m6 1921*c0909341SAndroid Build Coastguard Worker cvtdq2ps m7, m7 1922*c0909341SAndroid Build Coastguard Worker rcpps m2, m6 ; 1 / (z3 + 1) 1923*c0909341SAndroid Build Coastguard Worker rcpps m3, m7 1924*c0909341SAndroid Build Coastguard Worker pcmpgtd m6, m9, m6 1925*c0909341SAndroid Build Coastguard Worker pcmpgtd m7, m9, m7 1926*c0909341SAndroid Build Coastguard Worker mulps m2, m9 ; 256 / (z3 + 1) 1927*c0909341SAndroid Build Coastguard Worker mulps m3, m9 1928*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [pd_34816] 1929*c0909341SAndroid Build Coastguard Worker psrld m6, 24 ; z3 < 255 ? 
255 : 0 1930*c0909341SAndroid Build Coastguard Worker psrld m7, 24 1931*c0909341SAndroid Build Coastguard Worker cvtps2dq m2, m2 1932*c0909341SAndroid Build Coastguard Worker cvtps2dq m3, m3 1933*c0909341SAndroid Build Coastguard Worker pminsw m2, m6 ; x3 1934*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [pd_m4096] 1935*c0909341SAndroid Build Coastguard Worker pminsw m3, m7 1936*c0909341SAndroid Build Coastguard Worker pmulld m0, m2 1937*c0909341SAndroid Build Coastguard Worker pmulld m1, m3 1938*c0909341SAndroid Build Coastguard Worker paddd m0, m9 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 1939*c0909341SAndroid Build Coastguard Worker paddd m1, m9 1940*c0909341SAndroid Build Coastguard Worker pand m0, m6 1941*c0909341SAndroid Build Coastguard Worker pand m7, m6, m1 1942*c0909341SAndroid Build Coastguard Worker por m0, m2 ; a3 | (b3 << 12) 1943*c0909341SAndroid Build Coastguard Worker por m7, m3 1944*c0909341SAndroid Build Coastguard Worker paddw m1, m8, [t2+r10*2+400*0] 1945*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+r10*2+400*2] 1946*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+r10*2+400*4] 1947*c0909341SAndroid Build Coastguard Worker paddw m1, [t1+r10*2+400*0] 1948*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+r10*2+400*2] 1949*c0909341SAndroid Build Coastguard Worker paddd m3, [t1+r10*2+400*4] 1950*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400*0], m8 1951*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400*2], m4 1952*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+400*4], m5 1953*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*8+ 8], xm0 1954*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+400*8+40], m0, 1 1955*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+400*8+24], xm7 1956*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+400*8+56], m7, 1 1957*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [pd_25] 
1958*c0909341SAndroid Build Coastguard Worker pxor m7, m7 1959*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pw_164_24] 1960*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m7 ; b5 1961*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 1962*c0909341SAndroid Build Coastguard Worker pmulld m2, m4 ; a5 * 25 1963*c0909341SAndroid Build Coastguard Worker pmulld m3, m4 1964*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m0, m0 ; b5 * b5 1965*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m1, m1 1966*c0909341SAndroid Build Coastguard Worker psubd m2, m4 ; p5 1967*c0909341SAndroid Build Coastguard Worker psubd m3, m5 1968*c0909341SAndroid Build Coastguard Worker pmulld m2, m13 ; p5 * s0 1969*c0909341SAndroid Build Coastguard Worker pmulld m3, m13 1970*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m8 ; b5 * 164 1971*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m8 1972*c0909341SAndroid Build Coastguard Worker paddw m2, m8 1973*c0909341SAndroid Build Coastguard Worker paddw m3, m8 1974*c0909341SAndroid Build Coastguard Worker vbroadcastss m8, [pf_256] 1975*c0909341SAndroid Build Coastguard Worker psrld m2, 20 ; z5 + 1 1976*c0909341SAndroid Build Coastguard Worker psrld m3, 20 1977*c0909341SAndroid Build Coastguard Worker cvtdq2ps m2, m2 1978*c0909341SAndroid Build Coastguard Worker cvtdq2ps m3, m3 1979*c0909341SAndroid Build Coastguard Worker rcpps m4, m2 ; 1 / (z5 + 1) 1980*c0909341SAndroid Build Coastguard Worker rcpps m5, m3 1981*c0909341SAndroid Build Coastguard Worker pcmpgtd m2, m8, m2 1982*c0909341SAndroid Build Coastguard Worker pcmpgtd m3, m8, m3 1983*c0909341SAndroid Build Coastguard Worker mulps m4, m8 ; 256 / (z5 + 1) 1984*c0909341SAndroid Build Coastguard Worker mulps m5, m8 1985*c0909341SAndroid Build Coastguard Worker psrld m2, 24 ; z5 < 255 ? 
;   255 : 0   <- end of the "z5 < 255 ? 255 : 0" comment begun on the previous line
;
; ---------------------------------------------------------------------------
; From this point the text is restored to one instruction per line (blame-view
; line numbers / annotations removed).  The instruction stream, operands and
; the original end-of-line comments are unchanged.
;
; Registers that the routines below read but never write (they are set up
; outside this view):
;   m7      second operand of the punpck{l,h}wd that widen the 16-bit "b"
;           sums to 32 bits - presumably all-zero, TODO confirm
;   m13     multiplier applied to p5 ("s0" in the existing comments)
;   m14     multiplier applied to p3 ("s1" in the existing comments)
;   m15     pmaddwd multiplier applied to the blended 5x5 / 3x3 terms in
;           .n0 / .n1 - presumably the two filter weights, TODO confirm
;   t1, t2  row buffers indexed as [tN + r10*2 + 400*k]
;   t3      32-bit buffer indexed as [t3 + r10*4 + 400*k]
;   wq      loop start; every loop runs while r10 is negative, so wq is
;           presumably a negative column index counting up to 0
;
; m6 is both scratch and a constant: .v0 leaves [pd_34816] in it, .v1 leaves
; [pd_m4096] in it, and .prep_n / .n0 / .n1 read it without loading it (as
; the pandn mask and as the value subtracted before the final psrad 13).
; NOTE(review): this implies those three are only reached with m6 holding
; [pd_m4096] - confirm against the call sites.
; ---------------------------------------------------------------------------

; --- remainder of .hv1_loop (the routine starts above this point) ---
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2                   ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    paddd           m0, m9                   ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m9
    vbroadcasti128  m9, [sgr_shuf]           ; m9 was the rounding addend above; reload it
    pand            m0, m6
    pand            m1, m6
    por             m0, m4                   ; a5 | (b5 << 12)
    por             m1, m5
    ; punpck{l,h}wd work per 128-bit lane, so m0/m1 hold elements 0-3|8-11 and
    ; 4-7|12-15; the four 16-byte stores put them back in linear order.
    mova            [t3+r10*4+400*0+ 8], xm0
    vextracti128    [t3+r10*4+400*0+40], m0, 1
    mova            [t3+r10*4+400*0+24], xm1
    vextracti128    [t3+r10*4+400*0+56], m1, 1
    add             r10, 16
    jl .hv1_loop
    mov             r10, t2                  ; swap t1 <-> t2
    mov             t2, t1
    mov             t1, r10
    ret

; .v0 - vertical pass for even rows, 3x3 part only.
;   For each group of 16 columns:
;     * doubles the three sums at [t1 + 400*{6,8,10}], adds the matching
;       [t2] entries (results in m1..m3) and stores the doubled values to [t2];
;     * from those totals computes p3 = a3*9 - b3*b3, scales it by m14 and
;       derives x3 as described below;
;     * stores  (x3 * b3 * 455 + 34816) & [pd_m4096]  |  x3  to
;       [t3 + 400*4 + 8 ...];
;     * copies [t1 + 400*{0,2,4}] to [t3 + 400*8+8], [t3 + 400*0+8] and
;       [t3 + 400*0+40] (read back by .v1) and doubles them in place in t1.
;   x computation (same sequence is used in .v1 for both box sizes):
;     z+1 is converted to float, rcpps gives an approximate 1/(z+1), which is
;     multiplied by 256.0 and converted back with cvtps2dq.  pcmpgtd compares
;     the bit pattern of 256.0 with that of float(z+1) as integers (valid for
;     non-negative floats), psrld 24 turns the mask into 255 or 0, and pminsw
;     therefore clamps x to 255 when z < 255 and forces it to 0 otherwise.
;   Clobbers: r10, m0-m5, m8; m6 = [pd_34816] on return.
.v0: ; vertical boxsums + ab3 (even rows)
    lea             r10, [wq-2]
    vpbroadcastd    m6, [pd_34816]           ; (1 << 11) + (1 << 15)
.v0_loop:
    mova            m0, [t1+r10*2+400* 6]
    mova            m4, [t1+r10*2+400* 8]
    mova            m5, [t1+r10*2+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10*2+400* 6]
    paddd           m2, m4, [t2+r10*2+400* 8]
    paddd           m3, m5, [t2+r10*2+400*10]
    mova            [t2+r10*2+400* 6], m0
    mova            [t2+r10*2+400* 8], m4
    mova            [t2+r10*2+400*10], m5
    vpbroadcastd    m8, [pw_455_24]
    punpcklwd       m0, m1, m7               ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                   ; a3 * 9
    pmaddwd         m2, m0, m0               ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2                   ; p3
    psubd           m5, m3
    pmulld          m4, m14                  ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m8                   ; b3 * 455
    pmaddwd         m1, m8
    paddw           m4, m8                   ; word-wise add of the same constant pair
    paddw           m5, m8
    vbroadcastss    m8, [pf_256]
    psrld           m4, 20                   ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4                   ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m8, m4
    pcmpgtd         m5, m8, m5
    mulps           m2, m8                   ; 256 / (z3 + 1)
    mulps           m3, m8
    vpbroadcastd    m8, [pd_m4096]
    psrld           m4, 24                   ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4                   ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m6                   ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    pand            m0, m8
    pand            m1, m8
    por             m0, m2                   ; a3 | (b3 << 12)
    por             m1, m3
    mova            m2, [t1+r10*2+400*0]
    mova            m3, [t1+r10*2+400*2]
    mova            m4, [t1+r10*2+400*4]
    mova            [t3+r10*4+400*8+ 8], m2  ; stash the undoubled values for .v1
    mova            [t3+r10*4+400*0+ 8], m3
    mova            [t3+r10*4+400*0+40], m4
    paddw           m2, m2                   ; cc5
    paddd           m3, m3
    paddd           m4, m4
    mova            [t1+r10*2+400*0], m2
    mova            [t1+r10*2+400*2], m3
    mova            [t1+r10*2+400*4], m4
    ; lane-interleaved results -> linear order (see the note in .hv1 above)
    mova            [t3+r10*4+400*4+ 8], xm0
    vextracti128    [t3+r10*4+400*4+40], m0, 1
    mova            [t3+r10*4+400*4+24], xm1
    vextracti128    [t3+r10*4+400*4+56], m1, 1
    add             r10, 16
    jl .v0_loop
    ret

; .v1 - vertical pass for odd rows, 3x3 and 5x5 parts.
;   For each group of 16 columns:
;     * 3x3: adds [t1 + 400*{6,8,10}] to the matching [t2] entries, stores the
;       t1 values to [t2], and runs the same p3 / x3 sequence as .v0.  The
;       second half of the result is kept in m8 (pand m8, m1 / por m8, m3)
;       because m1 is reused for the 5x5 sums; both halves are stored to
;       [t3 + 400*8 + 8 ...] after the stashed values there have been read.
;     * 5x5: reads the three vectors stashed at [t3 + 400*8+8], [t3 + 400*0+8]
;       and [t3 + 400*0+40], adds [t2 + 400*{0,2,4}] and [t1 + 400*{0,2,4}],
;       stores the stashed values to [t2], then computes
;       p5 = a5*25 - b5*b5, scales it by m13, derives x5 and stores
;       (x5 * b5 * 164 + 34816) & [pd_m4096]  |  x5  to [t3 + 400*0 + 8 ...].
;   Finally swaps t1 and t2.
;   Clobbers: r10, m0-m6, m8; m6 = [pd_m4096] on return.
.v1: ; vertical boxsums + ab (odd rows)
    lea             r10, [wq-2]
.v1_loop:
    mova            m4, [t1+r10*2+400* 6]
    mova            m5, [t1+r10*2+400* 8]
    mova            m6, [t1+r10*2+400*10]
    paddw           m1, m4, [t2+r10*2+400* 6]
    paddd           m2, m5, [t2+r10*2+400* 8]
    paddd           m3, m6, [t2+r10*2+400*10]
    mova            [t2+r10*2+400* 6], m4
    mova            [t2+r10*2+400* 8], m5
    mova            [t2+r10*2+400*10], m6
    vpbroadcastd    m8, [pw_455_24]
    punpcklwd       m0, m1, m7               ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                   ; a3 * 9
    pmaddwd         m2, m0, m0               ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2                   ; p3
    psubd           m5, m3
    pmulld          m4, m14                  ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m8                   ; b3 * 455
    pmaddwd         m1, m8
    paddw           m4, m8
    paddw           m5, m8
    vbroadcastss    m8, [pf_256]
    psrld           m4, 20                   ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4                   ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m8, m4
    pcmpgtd         m5, m8, m5
    mulps           m2, m8                   ; 256 / (z3 + 1)
    mulps           m3, m8
    vpbroadcastd    m8, [pd_m4096]
    psrld           m4, 24                   ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4                   ; x3
    vpbroadcastd    m4, [pd_34816]           ; m4 is free once x3 (low half) is formed
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m4                   ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m4
    pand            m0, m8
    pand            m8, m1                   ; result kept in m8: m1 is reused below
    por             m0, m2                   ; a3 | (b3 << 12)
    por             m8, m3
    mova            m4, [t3+r10*4+400*8+ 8]  ; values stashed by .v0
    mova            m5, [t3+r10*4+400*0+ 8]
    mova            m6, [t3+r10*4+400*0+40]
    paddw           m1, m4, [t2+r10*2+400*0]
    paddd           m2, m5, [t2+r10*2+400*2]
    paddd           m3, m6, [t2+r10*2+400*4]
    paddw           m1, [t1+r10*2+400*0]
    paddd           m2, [t1+r10*2+400*2]
    paddd           m3, [t1+r10*2+400*4]
    mova            [t2+r10*2+400*0], m4
    mova            [t2+r10*2+400*2], m5
    mova            [t2+r10*2+400*4], m6
    vpbroadcastd    m4, [pd_25]
    ; the stash at 400*8 has been consumed, so the 3x3 result can overwrite it
    mova            [t3+r10*4+400*8+ 8], xm0
    vextracti128    [t3+r10*4+400*8+40], m0, 1
    mova            [t3+r10*4+400*8+24], xm8
    vextracti128    [t3+r10*4+400*8+56], m8, 1
    vpbroadcastd    m8, [pw_164_24]
    punpcklwd       m0, m1, m7               ; b5
    vbroadcastss    m6, [pf_256]
    punpckhwd       m1, m7
    pmulld          m2, m4                   ; a5 * 25
    pmulld          m3, m4
    pmaddwd         m4, m0, m0               ; b5 * b5
    pmaddwd         m5, m1, m1
    psubd           m2, m4                   ; p5
    psubd           m3, m5
    pmulld          m2, m13                  ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m8                   ; b5 * 164
    pmaddwd         m1, m8
    paddw           m2, m8
    paddw           m3, m8
    vpbroadcastd    m8, [pd_34816]
    psrld           m2, 20                   ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2                   ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m6, m2
    pcmpgtd         m3, m6, m3
    mulps           m4, m6                   ; 256 / (z5 + 1)
    mulps           m5, m6
    vpbroadcastd    m6, [pd_m4096]           ; also the value m6 holds on return
    psrld           m2, 24                   ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2                   ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    paddd           m0, m8                   ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m8
    pand            m0, m6
    pand            m1, m6
    por             m0, m4                   ; a5 | (b5 << 12)
    por             m1, m5
    mova            [t3+r10*4+400*0+ 8], xm0
    vextracti128    [t3+r10*4+400*0+40], m0, 1
    mova            [t3+r10*4+400*0+24], xm1
    vextracti128    [t3+r10*4+400*0+56], m1, 1
    add             r10, 16
    jl .v1_loop
    mov             r10, t2                  ; swap t1 <-> t2
    mov             t2, t1
    mov             t1, r10
    ret

; .prep_n - primes the horizontal-neighbour state consumed by .n0 / .n1.
;   Works on 8 dwords per iteration.  With l/c/r the entries at byte offsets
;   +0/+4/+8 and s = l + c + r:
;     row [t3 + 400*0]:  5*s + c          (weights 5,6,5) -> split with m6 into
;                        low 12 bits  -> [t3 + 400*12]
;                        value >> 12  -> [t3 + 400*16]
;     row [t3 + 400*4]:  4*s - (l + r)    (weights 3,4,3) -> [t3 + 400*24]
;     row [t3 + 400*8]:  2*s              (weights 2,2,2) -> [t3 + 400*20]
;                        4*s - (l + r)    (weights 3,4,3) -> [t3 + 400*28]
;   Reads m6 as the pandn mask (not loaded here).
;   Clobbers: r10, m0-m5.
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    movu            m0, [t3+r10*4+400*0+4]
    paddd           m1, m0, [t3+r10*4+400*0+0]
    mova            m4, [t3+r10*4+400*4+0]
    paddd           m1, [t3+r10*4+400*0+8]
    mova            m5, [t3+r10*4+400*8+0]
    paddd           m4, [t3+r10*4+400*4+8]
    paddd           m5, [t3+r10*4+400*8+8]
    paddd           m2, m4, [t3+r10*4+400*4+4]
    paddd           m3, m5, [t3+r10*4+400*8+4]
    paddd           m0, m1
    pslld           m1, 2
    pslld           m2, 2
    paddd           m1, m0                   ; ab5 565
    paddd           m3, m3                   ; ab3[ 0] 222
    psubd           m2, m4                   ; ab3[-1] 343
    mova            [t3+r10*4+400*20], m3
    pandn           m0, m6, m1               ; a5 565
    mova            [t3+r10*4+400*24], m2
    psrld           m1, 12                   ; b5 565
    mova            [t3+r10*4+400*12], m0
    paddd           m3, m3
    mova            [t3+r10*4+400*16], m1
    psubd           m3, m5                   ; ab3[ 0] 343
    mova            [t3+r10*4+400*28], m3
    add             r10, 8
    jl .prep_n_loop
    ret

; .n0 - horizontal neighbour sums and pixel output for even rows.
;   Per 8 pixels:
;     * 5x5: forms the 5,6,5 sum of row [t3 + 400*0], splits it with m6 into
;       its low-12-bit and >>12 parts, adds the parts saved at
;       [t3 + 400*12] / [t3 + 400*16] and replaces those with the new parts;
;     * 3x3: forms the 2,2,2 and 3,4,3 sums of row [t3 + 400*4]; combines the
;       saved [t3 + 400*20] with the saved [t3 + 400*24] and with the new 343
;       sum, then replaces both saved slots;
;     * loads 8 bytes from [dstq + r10] zero-extended to dwords, multiplies
;       them with the two "a" terms, subtracts from the "b" terms, merges the
;       5x5 term (>> 9, even words) and 3x3 term (<< 7, odd words) with
;       pblendw, applies pmaddwd with m15, subtracts m6, shifts right by 13,
;       adds the source pixel, packs with saturation to bytes and stores
;       8 bytes back to [dstq + r10].
;   Advances dstq by strideq before returning.
;   NOTE(review): the comments here call the multiplied terms "a5"/"a3" while
;   .n1 calls the equivalent terms "-a5"/"-a3"; the sign convention is decided
;   where the values are produced - confirm which wording is intended.
;   Clobbers: r10, m0-m5.
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov             r10, wq
.n0_loop:
    movu            m0, [t3+r10*4+4]
    paddd           m4, m0, [t3+r10*4+0]
    paddd           m4, [t3+r10*4+8]
    paddd           m0, m4
    pslld           m4, 2
    paddd           m4, m0
    pandn           m0, m6, m4
    psrld           m4, 12
    paddd           m2, m0, [t3+r10*4+400*12] ; a5
    mova            [t3+r10*4+400*12], m0
    paddd           m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
    mova            [t3+r10*4+400*16], m4
    mova            m3, [t3+r10*4+400*4+0]
    paddd           m3, [t3+r10*4+400*4+8]
    paddd           m5, m3, [t3+r10*4+400*4+4]
    paddd           m5, m5                   ; ab3[ 1] 222
    mova            m4, [t3+r10*4+400*20]
    paddd           m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
    mova            [t3+r10*4+400*20], m5
    paddd           m5, m5
    psubd           m5, m3                   ; ab3[ 1] 343
    mova            [t3+r10*4+400*24], m5
    paddd           m4, m5                   ; ab3[ 0] 222 + ab3[ 1] 343
    pandn           m3, m6, m1
    psrld           m1, 12
    pandn           m5, m6, m4
    psrld           m4, 12
    paddd           m3, m5                   ; a3
    paddd           m1, m4                   ; b3 + (1 << 8)
    pmovzxbd        m4, [dstq+r10]
    pmaddwd         m2, m4                   ; a5 * src
    pmaddwd         m3, m4                   ; a3 * src
    psubd           m0, m2                   ; b5 - a5 * src + (1 << 8)
    psubd           m1, m3                   ; b3 - a3 * src + (1 << 8)
    psrld           m0, 9
    pslld           m1, 7
    pblendw         m0, m1, 0xaa             ; odd words from m1, even words from m0
    pmaddwd         m0, m15
    psubd           m0, m6
    psrad           m0, 13
    paddd           m0, m4
    vextracti128    xm1, m0, 1
    packssdw        xm0, xm1
    packuswb        xm0, xm0
    movq            [dstq+r10], xm0
    add             r10, 8
    jl .n0_loop
    add             dstq, strideq
    ret

; .n1 - horizontal neighbour sums and pixel output for odd rows.
;   Same output stage as .n0 with two differences visible in the code:
;     * the 3x3 sums come from row [t3 + 400*8] and are combined with the
;       saved slots [t3 + 400*20] and [t3 + 400*28] (both then replaced);
;     * the 5x5 terms are taken unchanged from [t3 + 400*12] / [t3 + 400*16]
;       (nothing is recomputed or stored for them) and the 5x5 term is
;       shifted right by 8 instead of 9.
;   Advances dstq by strideq before returning.
;   Clobbers: r10, m0-m5.
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov             r10, wq
.n1_loop:
    mova            m3, [t3+r10*4+400*8+0]
    paddd           m3, [t3+r10*4+400*8+8]
    paddd           m5, m3, [t3+r10*4+400*8+4]
    paddd           m5, m5                   ; ab3[ 1] 222
    mova            m4, [t3+r10*4+400*20]
    paddd           m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
    mova            [t3+r10*4+400*20], m5
    paddd           m5, m5
    psubd           m5, m3                   ; ab3[ 1] 343
    mova            [t3+r10*4+400*28], m5
    paddd           m4, m5                   ; ab3[ 0] 222 + ab3[ 1] 343
    pandn           m3, m6, m1
    psrld           m1, 12
    pandn           m5, m6, m4
    psrld           m4, 12
    paddd           m3, m5                   ; -a3
    paddd           m1, m4                   ; b3 + (1 << 8)
    pmovzxbd        m4, [dstq+r10]
    pmaddwd         m2, m4, [t3+r10*4+400*12] ; -a5 * src
    mova            m0, [t3+r10*4+400*16]    ; b5 + (1 << 7)
    pmaddwd         m3, m4                   ; -a3 * src
    psubd           m0, m2                   ; a5 * src + b5 + (1 << 7)
    psubd           m1, m3                   ; a3 * src + b3 + (1 << 8)
    psrld           m0, 8
    pslld           m1, 7
    pblendw         m0, m1, 0xaa             ; odd words from m1, even words from m0
    pmaddwd         m0, m15
    psubd           m0, m6
    psrad           m0, 13
    paddd           m0, m4
    vextracti128    xm1, m0, 1
    packssdw        xm0, xm1
    packuswb        xm0, xm0
    movq            [dstq+r10], xm0
    add             r10, 8
    jl .n1_loop
    add             dstq, strideq
    ret

%endif ; ARCH_X86_64