; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; 16-byte pshufb masks. sgr_lshuf5 doubles as pw_256 (see the %define below).
sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9

; 32-byte masks used on the "no left edge" paths of the Wiener filters.
; Low 16 bytes: one 16-bit lane repeated in front of the remaining lanes.
; High 16 bytes: identity.
wiener_lshuf5: db  4,  5,  4,  5,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db  8,  9,  8,  9,  8,  9,  8,  9,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

; 16-byte pshufb masks that gather the 16-bit pixel pairs fed to pmaddwd.
; They are broadcast to both 128-bit lanes with vbroadcasti128.
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15

; Two dword-sized entries per table, selected with [table + t3*4] where
; t3 = pixel_max >> 11 (presumably 0 for 10-bit and 1 for 12-bit input;
; confirm against the caller).
wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

; 4-byte patterns loaded with vpbroadcastd. The pb_* values are the base
; shuffle indices used by the .extend_right helpers.
pb_m10_m9:     times 2 db -10, -9
pb_m6_m5:      times 2 db  -6, -5
pb_m2_m1:      times 2 db  -2, -1
pb_2_3:        times 2 db   2,  3
pb_6_7:        times 2 db   6,  7
pw_1023:       times 2 dw 1023
pw_164_24:     dw 164, 24
pw_455_24:     dw 455, 24
pd_8:          dd 8
pd_25:         dd 25
pd_4096:       dd 4096
pd_34816:      dd 34816
pd_m262128:    dd -262128                   ; (1 << 4) - (1 << 18)
pf_256:        dd 256.0

; Bytes 0, 1 read as a little-endian word are 0x0100 = 256, so the start of
; sgr_lshuf5 can be reused as a packed-word 256 constant.
%define pw_256 sgr_lshuf5

cextern pb_0to63

SECTION .text
; t0-t6 are x86inc temporaries mapped to r8, r7, r9, r11, r12, r13, r14.
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers

INIT_YMM avx2
;-----------------------------------------------------------------------
; wiener_filter7_16bpc (AVX2, x86inc calling-convention abstraction)
;
; Separable Wiener filter on rows of 16-bit pixels. The horizontal pass uses
; the coefficient pairs x0x1 / x2x3, the vertical pass y0y1 / y2y3.
;
; Named arguments: dst, stride, left, lpf, w, h, edge, flt
;   flt        taken from argument slot 6; x coefs read at +0 / +4,
;              y coefs at +16 / +20
;   edge       taken from slot 7; bits tested here:
;              1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT,
;              4 = LR_HAVE_TOP,  8 = LR_HAVE_BOTTOM
;   pixel_max  taken from slot 8; only (pixel_max >> 11) is used, as the index
;              into wiener_hshift / wiener_round / wiener_vshift
;
; Register roles once setup is complete:
;   wq         = -(2*w), the negated row width in bytes; r10 walks from wq up
;                to 0 in 32-byte steps inside every row loop
;   lpfq/dstq  point at the END of the current row (both advanced by 2*w)
;   t1..t6     rows of horizontally filtered data kept in the stack scratch
;              area, one slot every 384*2 bytes; t0 is the slot that .hv
;              writes next
;   [rsp]      saved pointer to the rows used once h reaches 0 ("below")
;   m6-m9      wiener_shufA..D      m10  rounding constant for the vertical pass
;   m11        vertical scale (pmulhuw multiplier)
;   m12, m13   x0x1 / x2x3, pre-multiplied by wiener_hshift
;   m14, m15   y0y1 / y2y3
;
; The stack scratch size is given as -384*12-16; see x86inc for the meaning
; of a negative stack size.
;-----------------------------------------------------------------------
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
; t4 temporarily holds the address of wiener_hshift so the neighbouring
; tables can be addressed relative to it.
%define base t4-wiener_hshift
    mov             fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov             edged, r7m
    mov             t3d, r8m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd    m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd                  ; width in bytes (2 per pixel)
    vpbroadcastd    m13, [fltq+ 4] ; x2 x3
    shr             t3d, 11                 ; table index derived from pixel_max
    vpbroadcastd    m14, [fltq+16] ; y0 y1
    add             lpfq, wq
    vpbroadcastd    m15, [fltq+20] ; y2 y3
    add             dstq, wq
    vbroadcasti128  m8, [wiener_shufC]
    lea             t1, [rsp+wq+16]         ; end of the first row slot
    vbroadcasti128  m9, [wiener_shufD]
    neg             wq                      ; rows are indexed with negative offsets
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd    m10, [base+wiener_round+t3*4]
    vpbroadcastd    m11, [base+wiener_vshift+t3*4]
    pmullw          m12, m0 ; upshift filter coefs to make the
    pmullw          m13, m0 ; horizontal downshift constant
    test            edgeb, 4 ; LR_HAVE_TOP
    jz              .no_top
    ; Two rows from lpf are filtered first; t6/t5 share the first one.
    call            .h_top
    add             lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call            .h_top
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq              ; from here on rows are read from dst
    mov             t4, t1
    add             t1, 384*2
    add             r10, strideq
    mov             [rsp], r10 ; below
    call            .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz              .v1
    add             lpfq, strideq
    add             t1, 384*2
    call            .h
    mov             t2, t1
    dec             hd
    jz              .v2
    add             lpfq, strideq
    add             t1, 384*2
    call            .h
    dec             hd
    jz              .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call            .hv
    dec             hd
    jnz             .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz              .v3
    mov             lpfq, [rsp]             ; switch to the saved "below" rows
    call            .hv_bottom
    add             lpfq, strideq
    call            .hv_bottom
.v1:
    call            .v
    RET
.no_top:
    ; Without top rows every ring pointer starts on the first filtered row.
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call            .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz              .v1
    add             lpfq, strideq
    add             t1, 384*2
    call            .h
    mov             t2, t1
    dec             hd
    jz              .v2
    add             lpfq, strideq
    add             t1, 384*2
    call            .h
    dec             hd
    jz              .v3
    lea             t0, [t1+384*2]
    call            .hv
    dec             hd
    jz              .v3
    add             t0, 384*8
    call            .hv
    dec             hd
    jnz             .main
.v3:
    call            .v
.v2:
    call            .v
    jmp             .v1

; .extend_right: rewrite m3/m4/m5 through pshufb with per-byte indices
; min(base - low byte of r10, pb_0to63), where base is pb_6_7 / pb_m2_m1 /
; pb_m10_m9 respectively. Presumably this repeats the last valid pixel into
; lanes past the row end; that depends on pb_0to63, which is defined elsewhere.
; Clobbers m0-m2.
.extend_right:
    movd            xm1, r10d
    vpbroadcastd    m0, [pb_6_7]
    mova            m2, [pb_0to63]
    vpbroadcastb    m1, xm1
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    vpbroadcastd    m0, [pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    vpbroadcastd    m0, [pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    ret

; .h: horizontally filter the row ending at lpfq into the slot ending at t1.
; With LR_HAVE_LEFT the first 8 bytes come from [leftq], which is then
; advanced by 8. .h_top is the same without the leftq handling.
.h:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
    movq            xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc  ; keep dwords 0-1 from leftq
    add             leftq, 8
    jmp             .h_main
.h_extend_left:
    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    shufpd          m3, m4, 0x05
    pshufb          m3, [wiener_lshuf7]
    jmp             .h_main2
.h_top:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -36               ; below -36 no tap of this block
    jl              .h_have_right           ; reaches the row end
    call            .extend_right
.h_have_right:
    ; Shuffled pixel vectors are summed, then multiplied by the coefficient
    ; pairs in m12 / m13 for the low (m0) and high (m1) output halves.
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova            [t1+r10], m0
    add             r10, 32
    jl              .h_loop
    ret
ALIGN function_align
; .hv: advance lpfq by one row and horizontally filter it into the slot ending
; at t0. Then combine the new row with t1..t6 vertically:
;   (new+t6)*y0 + (t5+t1)*y1 + (t4+t2)*y2 + t3*y3
; The result is stored to dst and the ring pointers rotate by one row.
; .hv_bottom is the same without the lpfq advance and the leftq handling.
.hv:
    add             lpfq, strideq
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
    movq            xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add             leftq, 8
    jmp             .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-8]
    pshufb          m3, [wiener_lshuf7]
    jmp             .hv_main
.hv_bottom:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
    movu            m5, [lpfq+r10+8]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -36
    jl              .hv_have_right
    call            .extend_right
.hv_have_right:
    ; Horizontal pass: same arithmetic as .h_have_right, with the loads of the
    ; older rows interleaved.
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128]
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10]
    mova            [t0+r10], m0            ; keep the new row for later calls
    ; Vertical pass.
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 5
    psrad           m2, 5
    packusdw        m0, m2
    pmulhuw         m0, m11
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .hv_loop
    ; Rotate the ring: every pointer moves one row older; the slot t5 pointed
    ; to on entry becomes the next write target t0.
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add             dstq, strideq
    ret

; .v: vertical pass only. The newest row t1 is used twice, standing in for the
; row that is not available:
;   (t1+t6)*y0 + (t1+t5)*y1 + (t4+t2)*y2 + t3*y3
; Rotates t6..t2 (t1 is left as is) and advances dstq by one row.
.v:
    mov             r10, wq
.v_loop:
    mova            m1, [t4+r10]
    paddw           m1, [t2+r10]
    mova            m2, [t3+r10]
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add             dstq, strideq
    ret

cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
%define base t4-wiener_hshift
    mov             fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov             edged, r7m
    mov             t3d, r8m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
393*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, [fltq+ 2] ; x1 394*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [wiener_shufB] 395*c0909341SAndroid Build Coastguard Worker lea t4, [wiener_hshift] 396*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [wiener_shufD] 397*c0909341SAndroid Build Coastguard Worker add wd, wd 398*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [fltq+ 4] ; x2 x3 399*c0909341SAndroid Build Coastguard Worker shr t3d, 11 400*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) 401*c0909341SAndroid Build Coastguard Worker add lpfq, wq 402*c0909341SAndroid Build Coastguard Worker vpbroadcastw m13, [fltq+18] ; y1 403*c0909341SAndroid Build Coastguard Worker add dstq, wq 404*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [fltq+20] ; y2 y3 405*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq+16] 406*c0909341SAndroid Build Coastguard Worker neg wq 407*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+wiener_hshift+t3*4] 408*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+wiener_round+t3*4] 409*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+wiener_vshift+t3*4] 410*c0909341SAndroid Build Coastguard Worker mova m15, [wiener_lshuf5] 411*c0909341SAndroid Build Coastguard Worker pmullw m11, m0 412*c0909341SAndroid Build Coastguard Worker pmullw m12, m0 413*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 414*c0909341SAndroid Build Coastguard Worker jz .no_top 415*c0909341SAndroid Build Coastguard Worker call .h_top 416*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 417*c0909341SAndroid Build Coastguard Worker mov t4, t1 418*c0909341SAndroid Build Coastguard Worker add t1, 384*2 419*c0909341SAndroid Build Coastguard Worker call .h_top 420*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 421*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 
422*c0909341SAndroid Build Coastguard Worker mov t3, t1 423*c0909341SAndroid Build Coastguard Worker add t1, 384*2 424*c0909341SAndroid Build Coastguard Worker add r10, strideq 425*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 426*c0909341SAndroid Build Coastguard Worker call .h 427*c0909341SAndroid Build Coastguard Worker mov t2, t1 428*c0909341SAndroid Build Coastguard Worker dec hd 429*c0909341SAndroid Build Coastguard Worker jz .v1 430*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 431*c0909341SAndroid Build Coastguard Worker add t1, 384*2 432*c0909341SAndroid Build Coastguard Worker call .h 433*c0909341SAndroid Build Coastguard Worker dec hd 434*c0909341SAndroid Build Coastguard Worker jz .v2 435*c0909341SAndroid Build Coastguard Worker.main: 436*c0909341SAndroid Build Coastguard Worker mov t0, t4 437*c0909341SAndroid Build Coastguard Worker.main_loop: 438*c0909341SAndroid Build Coastguard Worker call .hv 439*c0909341SAndroid Build Coastguard Worker dec hd 440*c0909341SAndroid Build Coastguard Worker jnz .main_loop 441*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 442*c0909341SAndroid Build Coastguard Worker jz .v2 443*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 444*c0909341SAndroid Build Coastguard Worker call .hv_bottom 445*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 446*c0909341SAndroid Build Coastguard Worker call .hv_bottom 447*c0909341SAndroid Build Coastguard Worker.end: 448*c0909341SAndroid Build Coastguard Worker RET 449*c0909341SAndroid Build Coastguard Worker.no_top: 450*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 451*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 452*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 453*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 454*c0909341SAndroid Build Coastguard Worker call .h 455*c0909341SAndroid Build Coastguard Worker mov t4, t1 456*c0909341SAndroid Build Coastguard 
; -----------------------------------------------------------------------------
; Tail of a row-priming path followed by the local helper routines it calls.
; The enclosing function's header and constant setup lie above this excerpt, so
; the roles below are limited to what the visible instructions show:
;   t0-t4    row pointers; the code steps them in multiples of 384*2 bytes
;   r10      byte offset within a row; starts at wq and is advanced by 32
;            for as long as it stays negative
;   m5-m7    pshufb controls for the horizontal taps; m15 is the pshufb
;            control used on the left-extension paths
;   m11-m14  pmaddwd multipliers
;   m8, m9   dword terms added before the right shifts
;   m10      pmulhuw multiplier applied to the packed output
; -----------------------------------------------------------------------------
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz              .v1
    add             lpfq, strideq
    add             t1, 384*2
    call            .h
    dec             hd
    jz              .v2
    lea             t0, [t1+384*2]
    call            .hv
    dec             hd
    jz              .v2
    add             t0, 384*6
    call            .hv
    dec             hd
    jnz             .main
.v2:
    ; emit one row from the buffered rows, then rotate t4 <- t3 <- t2 <- t1
    call            .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add             dstq, strideq
.v1:
    call            .v
    jmp             .end

; .extend_right
;   In:  r10 (only its low byte is used), m3, m4
;   Out: m3 and m4 permuted; m0-m2 are clobbered
;   Forms two byte-index vectors, pb_2_3 - r10b and pb_m6_m5 - r10b, caps each
;   with an unsigned minimum against the table at pb_0to63 and uses them as
;   pshufb controls. Presumably this repeats the last valid sample in lanes
;   that lie past the right edge - confirm against the constant tables.
.extend_right:
    movd            xm2, r10d
    vpbroadcastd    m0, [pb_2_3]
    vpbroadcastd    m1, [pb_m6_m5]
    vpbroadcastb    m2, xm2
    psubb           m0, m2
    psubb           m1, m2
    mova            m2, [pb_0to63]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret

; .h / .h_top - horizontal pass over the row at lpfq; output goes to [t1+r10].
;   .h      with LR_HAVE_LEFT, takes the leftmost 4 bytes from [leftq+4] and
;           then advances leftq by 8
;   .h_top  never touches leftq; with LR_HAVE_LEFT it reads the bytes in front
;           of the row directly through .h_loop
;   Without LR_HAVE_LEFT both entries go through .h_extend_left.
.h:
    mov             r10, wq
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .h_extend_left
    movd            xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe      ; dword 0 from leftq, the rest from the row
    add             leftq, 8
    jmp             .h_main
.h_extend_left:
    ; both loads start at the row itself, which avoids accessing memory
    ; located before the start of the buffer
    vbroadcasti128  m4, [lpfq+r10]
    mova            m3, [lpfq+r10]
    palignr         m3, m4, 12
    pshufb          m3, m15
    jmp             .h_main
.h_top:
    mov             r10, wq
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-4]
.h_main:
    movu            m4, [lpfq+r10+4]
    test            edgeb, 2                    ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -34                   ; only blocks with r10d >= -34 get patched
    jl              .h_have_right
    call            .extend_right
.h_have_right:
    ; for x in (m3, m4):
    ;   pmaddwd(shuf(x, m5), m11) + pmaddwd(shuf(x, m6) + shuf(x, m7), m12) + m8
    ; shifted right by 4, packed to words with signed saturation, then
    ; shifted right by one more bit
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova            [t1+r10], m0
    add             r10, 32
    jl              .h_loop
    ret

ALIGN function_align
; .hv / .hv_bottom - the horizontal pass of .h on a new row (stored to
; [t0+r10]) fused with the vertical pass that writes one output row to dstq.
;   .hv         advances lpfq by one stride first and honours leftq
;   .hv_bottom  does neither
; Vertical step per dword:
;   pmaddwd(interleave([t3] + [t1], [t2]), m14)
;     + pmaddwd(interleave(new row, [t4]), m13) + m9
; shifted right by 5, packed with unsigned saturation, then pmulhuw by m10.
; On exit the row pointers are rotated and dstq is advanced by one stride.
.hv:
    add             lpfq, strideq
    mov             r10, wq
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .hv_extend_left
    movd            xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add             leftq, 8
    jmp             .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-4]
    pshufb          m3, m15
    jmp             .hv_main
.hv_bottom:
    mov             r10, wq
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-4]
.hv_main:
    movu            m4, [lpfq+r10+4]
    test            edgeb, 2                    ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -34
    jl              .hv_have_right
    call            .extend_right
.hv_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    mova            m2, [t3+r10]
    paddw           m2, [t1+r10]
    paddd           m1, m3
    mova            m4, [t2+r10]
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
    mova            m4, [t4+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova            [t0+r10], m0                ; keep the new horizontal row
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .hv_loop
    ; rotate: t4 <- t3 <- t2 <- t1 <- t0; t0 then aliases the new t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    add             dstq, strideq
    ret

; .v - vertical pass only. No new row is filtered: the row at t1 is used both
; in the [t1] + [t3] sum and as the partner of [t4]. The caller rotates the
; row pointers and advances dstq.
.v:
    mov             r10, wq
.v_loop:
    mova            m0, [t1+r10]
    paddw           m2, m0, [t3+r10]
    mova            m1, [t2+r10]
    mova            m4, [t4+r10]
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .v_loop
    ret

; The statement below is completed by the operand list on the next source line.
cglobal
sgr_filter_5x5_16bpc, 4, 14, 16, 400*24+16, dst, stride, left, lpf, \
                      w, h, edge, params
; -----------------------------------------------------------------------------
; sgr_filter_5x5_16bpc(dst, stride, left, lpf, w, h, edge, params)
; (the operand list above belongs to the cglobal mnemonic that ends the
; previous source line)
;
; Per x86inc's cglobal convention the numeric operands request 4 arguments
; preloaded into registers, 14 general-purpose registers, 16 vector registers
; and 400*24+16 bytes of stack.
;
; Register state after the setup below:
;   wq    -(2*w): w is doubled, added to lpfq/dstq, then negated, so rows are
;         indexed with negative byte offsets from their end
;   r13   address of pb_m10_m9; "base" turns it into a base for other tables
;   m6    zero
;   m7    word [params+8] (w0) << 4
;   m8    pd_8        m9   pd_25       xm10  sgr_lshuf5
;   m11   dword [params+0] (s0)        m12   pw_164_24
;   m13   pf_256      m14  pd_34816    m15   pw_1023
;   t1, t3, t4  pointers into the stack scratch area
;   [rsp] pointer to the rows used on the LR_HAVE_BOTTOM path
; edge bits tested here: 1 LEFT, 2 RIGHT, 4 TOP, 8 BOTTOM; bit 16 is set
; locally ("y > 0") and makes .h accumulate onto the sums already stored in t1.
; -----------------------------------------------------------------------------
%define base r13-pb_m10_m9
    movifnidn       wd, wm
    mov             paramsq, r6mp
    lea             r13, [pb_m10_m9]
    movifnidn       hd, hm
    mov             edged, r7m
    vpbroadcastw    m7, [paramsq+8]             ; w0
    add             wd, wd
    vpbroadcastd    m8, [base+pd_8]
    add             lpfq, wq
    vpbroadcastd    m9, [base+pd_25]
    add             dstq, wq
    mova            xm10, [base+sgr_lshuf5]
    lea             t3, [rsp+wq*2+400*12+16]
    vpbroadcastd    m11, [paramsq+0]            ; s0
    lea             t4, [rsp+wq+400*20+16]
    vpbroadcastd    m12, [base+pw_164_24]
    lea             t1, [rsp+wq+20]
    vbroadcastss    m13, [base+pf_256]
    neg             wq
    vpbroadcastd    m14, [base+pd_34816]        ; (1 << 11) + (1 << 15)
    pxor            m6, m6
    vpbroadcastd    m15, [base+pw_1023]
    psllw           m7, 4
    test            edgeb, 4                    ; LR_HAVE_TOP
    jz              .no_top
    ; top rows available: sum the first lpf row, double it in place
    ; (t2 == t1 for .top_fixup), then sum the second lpf row into t1+400*6
    call            .h_top
    add             lpfq, strideq
    mov             t2, t1
    call            .top_fixup
    add             t1, 400*6
    call            .h_top
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    add             r10, strideq
    mov             [rsp], r10                  ; below
    mov             t0, t2
    dec             hd
    jz              .height1
    or              edged, 16
    call            .h
.main:
    add             lpfq, strideq
    call            .hv
    call            .prep_n
    sub             hd, 2
    jl              .extend_bottom
.main_loop:
    ; two source rows per iteration: .h on the first, .hv on the second,
    ; then the .n0 / .n1 output passes
    add             lpfq, strideq
    test            hd, hd
    jz              .odd_height
    call            .h
    add             lpfq, strideq
    call            .hv
    call            .n0
    call            .n1
    sub             hd, 2
    jge             .main_loop
    test            edgeb, 8                    ; LR_HAVE_BOTTOM
    jz              .extend_bottom
    mov             lpfq, [rsp]                 ; switch to the saved "below" rows
    call            .h_top
    add             lpfq, strideq
    call            .hv_bottom
.end:
    call            .n0
    call            .n1
.end2:
    RET
.height1:
    call            .hv
    call            .prep_n
    jmp             .odd_height_end
.odd_height:
    call            .hv
    call            .n0
    call            .n1
.odd_height_end:
    call            .v
    call            .n0
    jmp             .end2
.extend_bottom:
    call            .v
    jmp             .end
.no_top:
    ; no top rows: the first picture row is summed into t1 and a doubled
    ; copy is written to t1+400*6 by .top_fixup
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call            .h
    lea             t2, [t1+400*6]
    call            .top_fixup
    dec             hd
    jz              .no_top_height1
    or              edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp             .main
.no_top_height1:
    call            .v
    call            .prep_n
    jmp             .odd_height_end

; .extend_right
;   In:  r10, m4, m5; r13 is the table base set up at entry
;   Out: bytes of m4/m5 selected by the masks at [r13+r10] and [r13+r10+16]
;        are replaced by the 16-bit sample at [lpfq-2]; m0-m2 are clobbered
;   lpfq was advanced by the row's byte width at entry, so [lpfq-2] is
;   presumably the last sample of the current row - confirm.
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+0]
    movu            m2, [r13+r10+16]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret

; .h / .h_top - horizontal box sums of one row at lpfq.
;   For every sample the five neighbours at offsets 0..4 are added (words)
;   and their squares are added (dwords, split into a low and a high half).
;   Results are stored at [t1+r10+400*0], +400*2 and +400*4; when edge bit 16
;   is set they are first added to the values already stored there.
;   .h consumes 8 bytes from leftq when LR_HAVE_LEFT is set; .h_top does not.
.h:
    lea             r10, [wq-4]
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .h_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp             .h_main
.h_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, xm10                   ; sgr_lshuf5 control
    vinserti128     m4, [lpfq+wq+10], 1
    jmp             .h_main
.h_top:
    lea             r10, [wq-4]
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10-2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2                    ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -36
    jl              .h_have_right
    call            .extend_right
.h_have_right:
    palignr         m2, m5, m4, 2               ; row shifted by 1 sample
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6               ; shifted by 3 samples
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd          m5, m4, m5, 0x05            ; shifted by 4 samples
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121               ; shifted by 2 samples
    paddw           m0, m4                      ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test            edgeb, 16                   ; y > 0
    jz              .h_loop_end
    paddw           m0, [t1+r10+400*0]
    paddd           m1, [t1+r10+400*2]
    paddd           m2, [t1+r10+400*4]
.h_loop_end:
    paddd           m1, m5                      ; sumsq
    paddd           m2, m4
    mova            [t1+r10+400*0], m0
    mova            [t1+r10+400*2], m1
    mova            [t1+r10+400*4], m2
    add             r10, 32
    jl              .h_loop
    ret

; .top_fixup - reads the three sum planes at t1, doubles them and writes the
; result to the same planes at t2 (the sums of the first row need to be
; doubled). t2 may equal t1.
.top_fixup:
    lea             r10, [wq-4]
.top_fixup_loop:
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m1
    mova            [t2+r10+400*4], m2
    add             r10, 32
    jl              .top_fixup_loop
    ret

ALIGN function_align
; -----------------------------------------------------------------------------
; .hv / .hv_bottom - horizontal boxsum + vertical boxsum + ab
;   Computes the 5-sample horizontal sums of the row at lpfq exactly as .h
;   does, adds the sums stored at t1 and t2 to get the vertical totals, saves
;   the new horizontal sums at t0 and derives the per-sample a/b terms:
;     x (packed words)  -> [t4+r10+4]
;     b (dwords)        -> [t3+r10*2+8 ...]
;   .hv consumes 8 bytes from leftq when LR_HAVE_LEFT is set; .hv_bottom does
;   not. When hd is 0 the loop detours through .hv_last_row.
;   On exit t1 holds the old t0, and t0 and t2 both hold the old t1.
;   Uses m6 = 0 and the constants m8, m9, m11-m14 loaded by the function entry.
; -----------------------------------------------------------------------------
.hv:
    lea             r10, [wq-4]
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .hv_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp             .hv_main
.hv_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, xm10
    vinserti128     m4, [lpfq+wq+10], 1
    jmp             .hv_main
.hv_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1                    ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10-2]
.hv_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2                    ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -36
    jl              .hv_have_right
    call            .extend_right
.hv_have_right:
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4                      ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5                      ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    test            hd, hd
    jz              .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+400*0]          ; hv sum
    paddd           m4, [t2+r10+400*2]          ; hv sumsq
    paddd           m5, [t2+r10+400*4]
    mova            [t0+r10+400*0], m0
    mova            [t0+r10+400*2], m2
    mova            [t0+r10+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6                      ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4                       ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9                      ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2                      ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6                  ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2                      ; p
    psubd           m5, m3
    pmulld          m4, m11                     ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12                     ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20                      ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4                      ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13                     ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24                      ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4                      ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14                     ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova            [t4+r10+4], m2
    psrld           m0, 12                      ; b
    psrld           m1, 12
    mova            [t3+r10*2+8], xm0
    vextracti128    [t3+r10*2+40], m0, 1
    mova            [t3+r10*2+24], xm1
    vextracti128    [t3+r10*2+56], m1, 1
    add             r10, 32
    jl              .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row:
    ; esoteric edge case for odd heights: the combined (t1 + current) sums
    ; are written back to t1, and the current row's sums are added once more
    ; before rejoining the common path
    mova            [t1+r10+400*0], m1
    paddw           m1, m0
    mova            [t1+r10+400*2], m4
    paddd           m4, m2
    mova            [t1+r10+400*4], m5
    paddd           m5, m3
    jmp             .hv_main2

; .v - vertical boxsum + ab
;   No new row is read: the totals are [t2] + 3 * [t1] for each of the three
;   sum planes. The a/b derivation and the stores to t4 / t3 are the same
;   sequence as in .hv.
.v:
    lea             r10, [wq-4]
.v_loop:
    mova            m0, [t1+r10+400*0]
    mova            m2, [t1+r10+400*2]
    mova            m3, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0                      ; hv sum
    paddd           m4, m2                      ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6                      ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4                       ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9                      ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2                      ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6                  ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2                      ; p
    psubd           m5, m3
    pmulld          m4, m11                     ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12                     ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20                      ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4                      ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13                     ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24                      ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4                      ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14                     ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova            [t4+r10+4], m2
    psrld           m0, 12                      ; b
    psrld           m1, 12
    mova            [t3+r10*2+8], xm0
    vextracti128    [t3+r10*2+40], m0, 1
    mova            [t3+r10*2+24], xm1
    vextracti128    [t3+r10*2+56], m1, 1
    add             r10, 32
    jl              .v_loop
    ret

; .prep_n - initial neighbor setup
;   For the word plane at t4 and the dword plane at t3, combines three
;   horizontally adjacent entries as 5*left + 6*centre + 5*right:
;   (centre + s) + 4*s with s = left + centre + right. The results are
;   stored at [t4+r10+400*2] and [t3+r10*2+400*4].
.prep_n:
    mov             r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+2]
    movu            m1, [t3+r10*2+4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+0]
    paddd           m4, m1, [t3+r10*2+0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+4]
    paddd           m4, [t3+r10*2+8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3                      ; a 565
    paddd           m1, m4                      ; b 565
    paddd           m2, m5
    mova            [t4+r10*1+400*2+0], m0
    mova            [t3+r10*2+400*4+0], m1
    mova            [t3+r10*2+400*4+32], m2
    add             r10, 32
    jl              .prep_n_loop
    ret

ALIGN function_align
; .n0 - neighbor + output (even rows)
;   Only the opening instructions of this routine fall inside this block; they
;   are carried through unchanged and the routine continues on the following
;   source line.
.n0:
    mov             r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+2]
    movu            m1, [t3+r10*2+4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+0]
    paddd           m4, m1, [t3+r10*2+0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+4]
    paddd           m4, [t3+r10*2+8]
    paddd           m5, [t3+r10*2+40]
1087*c0909341SAndroid Build Coastguard Worker paddw m0, m3 1088*c0909341SAndroid Build Coastguard Worker psllw m3, 2 1089*c0909341SAndroid Build Coastguard Worker paddd m1, m4 1090*c0909341SAndroid Build Coastguard Worker pslld m4, 2 1091*c0909341SAndroid Build Coastguard Worker paddd m2, m5 1092*c0909341SAndroid Build Coastguard Worker pslld m5, 2 1093*c0909341SAndroid Build Coastguard Worker paddw m0, m3 ; a 565 1094*c0909341SAndroid Build Coastguard Worker paddd m1, m4 ; b 565 1095*c0909341SAndroid Build Coastguard Worker paddd m2, m5 1096*c0909341SAndroid Build Coastguard Worker paddw m3, m0, [t4+r10*1+400*2+ 0] 1097*c0909341SAndroid Build Coastguard Worker paddd m4, m1, [t3+r10*2+400*4+ 0] 1098*c0909341SAndroid Build Coastguard Worker paddd m5, m2, [t3+r10*2+400*4+32] 1099*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+400*2+ 0], m0 1100*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*4+ 0], m1 1101*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*4+32], m2 1102*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+r10] 1103*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m6 ; src 1104*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 ; a 1105*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1 ; a * src 1106*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m6 1107*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1108*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1 1109*c0909341SAndroid Build Coastguard Worker vinserti128 m1, m4, xm5, 1 1110*c0909341SAndroid Build Coastguard Worker vperm2i128 m4, m5, 0x31 1111*c0909341SAndroid Build Coastguard Worker psubd m1, m2 ; b - a * src + (1 << 8) 1112*c0909341SAndroid Build Coastguard Worker psubd m4, m3 1113*c0909341SAndroid Build Coastguard Worker psrad m1, 9 1114*c0909341SAndroid Build Coastguard Worker psrad m4, 9 1115*c0909341SAndroid Build Coastguard Worker packssdw m1, m4 1116*c0909341SAndroid Build Coastguard Worker pmulhrsw 
m1, m7 1117*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1118*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m6 1119*c0909341SAndroid Build Coastguard Worker pminsw m0, m15 1120*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], m0 1121*c0909341SAndroid Build Coastguard Worker add r10, 32 1122*c0909341SAndroid Build Coastguard Worker jl .n0_loop 1123*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1124*c0909341SAndroid Build Coastguard Worker ret 1125*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1126*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows) 1127*c0909341SAndroid Build Coastguard Worker mov r10, wq 1128*c0909341SAndroid Build Coastguard Worker.n1_loop: 1129*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+r10] 1130*c0909341SAndroid Build Coastguard Worker mova m3, [t4+r10*1+400*2+ 0] 1131*c0909341SAndroid Build Coastguard Worker mova m4, [t3+r10*2+400*4+ 0] 1132*c0909341SAndroid Build Coastguard Worker mova m5, [t3+r10*2+400*4+32] 1133*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m6 ; src 1134*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 ; a 1135*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1 1136*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m6 1137*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1138*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1 1139*c0909341SAndroid Build Coastguard Worker vinserti128 m1, m4, xm5, 1 1140*c0909341SAndroid Build Coastguard Worker vperm2i128 m4, m5, 0x31 1141*c0909341SAndroid Build Coastguard Worker psubd m1, m2 ; b - a * src + (1 << 7) 1142*c0909341SAndroid Build Coastguard Worker psubd m4, m3 1143*c0909341SAndroid Build Coastguard Worker psrad m1, 8 1144*c0909341SAndroid Build Coastguard Worker psrad m4, 8 1145*c0909341SAndroid Build Coastguard Worker packssdw m1, m4 1146*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 1147*c0909341SAndroid Build Coastguard 
Worker paddw m0, m1 1148*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m6 1149*c0909341SAndroid Build Coastguard Worker pminsw m0, m15 1150*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], m0 1151*c0909341SAndroid Build Coastguard Worker add r10, 32 1152*c0909341SAndroid Build Coastguard Worker jl .n1_loop 1153*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1154*c0909341SAndroid Build Coastguard Worker ret 1155*c0909341SAndroid Build Coastguard Worker 1156*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_3x3_16bpc, 4, 14, 15, 400*42+8, dst, stride, left, lpf, \ 1157*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1158*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 1159*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1160*c0909341SAndroid Build Coastguard Worker lea r13, [pb_m10_m9] 1161*c0909341SAndroid Build Coastguard Worker add wd, wd 1162*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1163*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1164*c0909341SAndroid Build Coastguard Worker vpbroadcastw m7, [paramsq+10] ; w1 1165*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1166*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+pd_8] 1167*c0909341SAndroid Build Coastguard Worker add dstq, wq 1168*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [paramsq+ 4] ; s1 1169*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*2+400*12+8] 1170*c0909341SAndroid Build Coastguard Worker mova xm10, [base+sgr_lshuf3] 1171*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+wq+400*32+8] 1172*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [base+pw_455_24] 1173*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq+12] 1174*c0909341SAndroid Build Coastguard Worker vbroadcastss m12, [base+pf_256] 1175*c0909341SAndroid Build Coastguard Worker neg wq 1176*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+pd_34816] 
1177*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1178*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [base+pw_1023] 1179*c0909341SAndroid Build Coastguard Worker psllw m7, 4 1180*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1181*c0909341SAndroid Build Coastguard Worker jz .no_top 1182*c0909341SAndroid Build Coastguard Worker call .h_top 1183*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1184*c0909341SAndroid Build Coastguard Worker mov t2, t1 1185*c0909341SAndroid Build Coastguard Worker add t1, 400*6 1186*c0909341SAndroid Build Coastguard Worker call .h_top 1187*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1188*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1189*c0909341SAndroid Build Coastguard Worker add r10, strideq 1190*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 1191*c0909341SAndroid Build Coastguard Worker call .hv0 1192*c0909341SAndroid Build Coastguard Worker.main: 1193*c0909341SAndroid Build Coastguard Worker dec hd 1194*c0909341SAndroid Build Coastguard Worker jz .height1 1195*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1196*c0909341SAndroid Build Coastguard Worker call .hv1 1197*c0909341SAndroid Build Coastguard Worker call .prep_n 1198*c0909341SAndroid Build Coastguard Worker sub hd, 2 1199*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 1200*c0909341SAndroid Build Coastguard Worker.main_loop: 1201*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1202*c0909341SAndroid Build Coastguard Worker call .hv0 1203*c0909341SAndroid Build Coastguard Worker test hd, hd 1204*c0909341SAndroid Build Coastguard Worker jz .odd_height 1205*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1206*c0909341SAndroid Build Coastguard Worker call .hv1 1207*c0909341SAndroid Build Coastguard Worker call .n0 1208*c0909341SAndroid Build Coastguard Worker call .n1 1209*c0909341SAndroid Build Coastguard Worker sub hd, 2 
1210*c0909341SAndroid Build Coastguard Worker jge .main_loop 1211*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 1212*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1213*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 1214*c0909341SAndroid Build Coastguard Worker call .hv0_bottom 1215*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1216*c0909341SAndroid Build Coastguard Worker call .hv1_bottom 1217*c0909341SAndroid Build Coastguard Worker.end: 1218*c0909341SAndroid Build Coastguard Worker call .n0 1219*c0909341SAndroid Build Coastguard Worker call .n1 1220*c0909341SAndroid Build Coastguard Worker.end2: 1221*c0909341SAndroid Build Coastguard Worker RET 1222*c0909341SAndroid Build Coastguard Worker.height1: 1223*c0909341SAndroid Build Coastguard Worker call .v1 1224*c0909341SAndroid Build Coastguard Worker call .prep_n 1225*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1226*c0909341SAndroid Build Coastguard Worker.odd_height: 1227*c0909341SAndroid Build Coastguard Worker call .v1 1228*c0909341SAndroid Build Coastguard Worker call .n0 1229*c0909341SAndroid Build Coastguard Worker call .n1 1230*c0909341SAndroid Build Coastguard Worker.odd_height_end: 1231*c0909341SAndroid Build Coastguard Worker call .v0 1232*c0909341SAndroid Build Coastguard Worker call .v1 1233*c0909341SAndroid Build Coastguard Worker call .n0 1234*c0909341SAndroid Build Coastguard Worker jmp .end2 1235*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1236*c0909341SAndroid Build Coastguard Worker call .v0 1237*c0909341SAndroid Build Coastguard Worker call .v1 1238*c0909341SAndroid Build Coastguard Worker jmp .end 1239*c0909341SAndroid Build Coastguard Worker.no_top: 1240*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1241*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1242*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 1243*c0909341SAndroid Build Coastguard Worker mov [rsp], 
r10 1244*c0909341SAndroid Build Coastguard Worker call .h 1245*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1246*c0909341SAndroid Build Coastguard Worker lea t2, [t1+400*6] 1247*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: 1248*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10+400*0] 1249*c0909341SAndroid Build Coastguard Worker mova m1, [t1+r10+400*2] 1250*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10+400*4] 1251*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*0], m0 1252*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*2], m1 1253*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*4], m2 1254*c0909341SAndroid Build Coastguard Worker add r10, 32 1255*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 1256*c0909341SAndroid Build Coastguard Worker call .v0 1257*c0909341SAndroid Build Coastguard Worker jmp .main 1258*c0909341SAndroid Build Coastguard Worker.extend_right: 1259*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [lpfq-2] 1260*c0909341SAndroid Build Coastguard Worker movu m1, [r13+r10+ 2] 1261*c0909341SAndroid Build Coastguard Worker movu m2, [r13+r10+18] 1262*c0909341SAndroid Build Coastguard Worker vpblendvb m4, m0, m1 1263*c0909341SAndroid Build Coastguard Worker vpblendvb m5, m0, m2 1264*c0909341SAndroid Build Coastguard Worker ret 1265*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 1266*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1267*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1268*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1269*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm5, [leftq] 1270*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+wq], 1 1271*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq] 1272*c0909341SAndroid Build Coastguard Worker add leftq, 8 1273*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 12 1274*c0909341SAndroid Build Coastguard 
Worker jmp .h_main 1275*c0909341SAndroid Build Coastguard Worker.h_extend_left: 1276*c0909341SAndroid Build Coastguard Worker mova xm4, [lpfq+wq] 1277*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm10 1278*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [lpfq+wq+12], 1 1279*c0909341SAndroid Build Coastguard Worker jmp .h_main 1280*c0909341SAndroid Build Coastguard Worker.h_top: 1281*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1282*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1283*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1284*c0909341SAndroid Build Coastguard Worker.h_loop: 1285*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+r10+ 0] 1286*c0909341SAndroid Build Coastguard Worker.h_main: 1287*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+r10+16] 1288*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1289*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 1290*c0909341SAndroid Build Coastguard Worker cmp r10d, -34 1291*c0909341SAndroid Build Coastguard Worker jl .h_have_right 1292*c0909341SAndroid Build Coastguard Worker call .extend_right 1293*c0909341SAndroid Build Coastguard Worker.h_have_right: 1294*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 2 1295*c0909341SAndroid Build Coastguard Worker paddw m1, m4, m0 1296*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 1297*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1298*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m0 1299*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1300*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 4 1301*c0909341SAndroid Build Coastguard Worker paddw m1, m5 ; sum 1302*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m6 1303*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1304*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 1305*c0909341SAndroid Build Coastguard Worker pmaddwd m5, 
m5 1306*c0909341SAndroid Build Coastguard Worker paddd m2, m4 ; sumsq 1307*c0909341SAndroid Build Coastguard Worker paddd m3, m5 1308*c0909341SAndroid Build Coastguard Worker mova [t1+r10+400*0], m1 1309*c0909341SAndroid Build Coastguard Worker mova [t1+r10+400*2], m2 1310*c0909341SAndroid Build Coastguard Worker mova [t1+r10+400*4], m3 1311*c0909341SAndroid Build Coastguard Worker add r10, 32 1312*c0909341SAndroid Build Coastguard Worker jl .h_loop 1313*c0909341SAndroid Build Coastguard Worker ret 1314*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1315*c0909341SAndroid Build Coastguard Worker.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 1316*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1317*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1318*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 1319*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm5, [leftq] 1320*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+wq], 1 1321*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq] 1322*c0909341SAndroid Build Coastguard Worker add leftq, 8 1323*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 12 1324*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 1325*c0909341SAndroid Build Coastguard Worker.hv0_extend_left: 1326*c0909341SAndroid Build Coastguard Worker mova xm4, [lpfq+wq] 1327*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm10 1328*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [lpfq+wq+12], 1 1329*c0909341SAndroid Build Coastguard Worker jmp .hv0_main 1330*c0909341SAndroid Build Coastguard Worker.hv0_bottom: 1331*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1332*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1333*c0909341SAndroid Build Coastguard Worker jz .hv0_extend_left 1334*c0909341SAndroid Build Coastguard Worker.hv0_loop: 1335*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+r10+ 0] 
1336*c0909341SAndroid Build Coastguard Worker.hv0_main: 1337*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+r10+16] 1338*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1339*c0909341SAndroid Build Coastguard Worker jnz .hv0_have_right 1340*c0909341SAndroid Build Coastguard Worker cmp r10d, -34 1341*c0909341SAndroid Build Coastguard Worker jl .hv0_have_right 1342*c0909341SAndroid Build Coastguard Worker call .extend_right 1343*c0909341SAndroid Build Coastguard Worker.hv0_have_right: 1344*c0909341SAndroid Build Coastguard Worker palignr m0, m5, m4, 2 1345*c0909341SAndroid Build Coastguard Worker paddw m1, m4, m0 1346*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m0 1347*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1348*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m0 1349*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1350*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 4 1351*c0909341SAndroid Build Coastguard Worker paddw m1, m5 ; sum 1352*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m6 1353*c0909341SAndroid Build Coastguard Worker pmaddwd m4, m4 1354*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 1355*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1356*c0909341SAndroid Build Coastguard Worker paddd m2, m4 ; sumsq 1357*c0909341SAndroid Build Coastguard Worker paddd m3, m5 1358*c0909341SAndroid Build Coastguard Worker paddw m0, m1, [t1+r10+400*0] 1359*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t1+r10+400*2] 1360*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t1+r10+400*4] 1361*c0909341SAndroid Build Coastguard Worker mova [t1+r10+400*0], m1 1362*c0909341SAndroid Build Coastguard Worker mova [t1+r10+400*2], m2 1363*c0909341SAndroid Build Coastguard Worker mova [t1+r10+400*4], m3 1364*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+r10+400*0] 1365*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+r10+400*2] 
1366*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+r10+400*4] 1367*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*0], m0 1368*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*2], m4 1369*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*4], m5 1370*c0909341SAndroid Build Coastguard Worker paddd m2, m8 1371*c0909341SAndroid Build Coastguard Worker paddd m3, m8 1372*c0909341SAndroid Build Coastguard Worker psrld m2, 4 ; (a + 8) >> 4 1373*c0909341SAndroid Build Coastguard Worker psrld m3, 4 1374*c0909341SAndroid Build Coastguard Worker pslld m4, m2, 3 1375*c0909341SAndroid Build Coastguard Worker pslld m5, m3, 3 1376*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; ((a + 8) >> 4) * 9 1377*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1378*c0909341SAndroid Build Coastguard Worker psrlw m3, m1, 1 1379*c0909341SAndroid Build Coastguard Worker pavgw m3, m6 ; (b + 2) >> 2 1380*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 1381*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1382*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1383*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1384*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 1385*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1386*c0909341SAndroid Build Coastguard Worker pmaxud m4, m2 1387*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1388*c0909341SAndroid Build Coastguard Worker pmaxud m5, m3 1389*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1390*c0909341SAndroid Build Coastguard Worker pmulld m4, m9 ; p * s 1391*c0909341SAndroid Build Coastguard Worker pmulld m5, m9 1392*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 ; b * 455 1393*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m11 1394*c0909341SAndroid Build Coastguard Worker paddw m4, m11 1395*c0909341SAndroid Build Coastguard Worker paddw m5, m11 1396*c0909341SAndroid Build Coastguard Worker psrld m4, 
20 ; z + 1 1397*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1398*c0909341SAndroid Build Coastguard Worker cvtdq2ps m4, m4 1399*c0909341SAndroid Build Coastguard Worker cvtdq2ps m5, m5 1400*c0909341SAndroid Build Coastguard Worker rcpps m2, m4 ; 1 / (z + 1) 1401*c0909341SAndroid Build Coastguard Worker rcpps m3, m5 1402*c0909341SAndroid Build Coastguard Worker pcmpgtd m4, m12, m4 1403*c0909341SAndroid Build Coastguard Worker pcmpgtd m5, m12, m5 1404*c0909341SAndroid Build Coastguard Worker mulps m2, m12 ; 256 / (z + 1) 1405*c0909341SAndroid Build Coastguard Worker mulps m3, m12 1406*c0909341SAndroid Build Coastguard Worker psrld m4, 24 ; z < 255 ? 255 : 0 1407*c0909341SAndroid Build Coastguard Worker psrld m5, 24 1408*c0909341SAndroid Build Coastguard Worker cvtps2dq m2, m2 1409*c0909341SAndroid Build Coastguard Worker cvtps2dq m3, m3 1410*c0909341SAndroid Build Coastguard Worker pminsw m2, m4 ; x 1411*c0909341SAndroid Build Coastguard Worker pminsw m3, m5 1412*c0909341SAndroid Build Coastguard Worker pmulld m0, m2 1413*c0909341SAndroid Build Coastguard Worker pmulld m1, m3 1414*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 1415*c0909341SAndroid Build Coastguard Worker paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) 1416*c0909341SAndroid Build Coastguard Worker paddd m1, m13 1417*c0909341SAndroid Build Coastguard Worker psrld m0, 12 1418*c0909341SAndroid Build Coastguard Worker psrld m1, 12 1419*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+400*0+ 4], m2 1420*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*0+ 8], xm0 1421*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+400*0+40], m0, 1 1422*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*0+24], xm1 1423*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+400*0+56], m1, 1 1424*c0909341SAndroid Build Coastguard Worker add r10, 32 1425*c0909341SAndroid Build Coastguard Worker jl .hv0_loop 1426*c0909341SAndroid Build Coastguard Worker 
ret 1427*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1428*c0909341SAndroid Build Coastguard Worker.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 1429*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1430*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1431*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 1432*c0909341SAndroid Build Coastguard Worker vpbroadcastq xm5, [leftq] 1433*c0909341SAndroid Build Coastguard Worker vinserti128 m5, [lpfq+wq], 1 1434*c0909341SAndroid Build Coastguard Worker mova m4, [lpfq+wq] 1435*c0909341SAndroid Build Coastguard Worker add leftq, 8 1436*c0909341SAndroid Build Coastguard Worker palignr m4, m5, 12 1437*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 1438*c0909341SAndroid Build Coastguard Worker.hv1_extend_left: 1439*c0909341SAndroid Build Coastguard Worker mova xm4, [lpfq+wq] 1440*c0909341SAndroid Build Coastguard Worker pshufb xm4, xm10 1441*c0909341SAndroid Build Coastguard Worker vinserti128 m4, [lpfq+wq+12], 1 1442*c0909341SAndroid Build Coastguard Worker jmp .hv1_main 1443*c0909341SAndroid Build Coastguard Worker.hv1_bottom: 1444*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1445*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1446*c0909341SAndroid Build Coastguard Worker jz .hv1_extend_left 1447*c0909341SAndroid Build Coastguard Worker.hv1_loop: 1448*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+r10+ 0] 1449*c0909341SAndroid Build Coastguard Worker.hv1_main: 1450*c0909341SAndroid Build Coastguard Worker movu m5, [lpfq+r10+16] 1451*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1452*c0909341SAndroid Build Coastguard Worker jnz .hv1_have_right 1453*c0909341SAndroid Build Coastguard Worker cmp r10d, -34 1454*c0909341SAndroid Build Coastguard Worker jl .hv1_have_right 1455*c0909341SAndroid Build Coastguard Worker call .extend_right 1456*c0909341SAndroid Build Coastguard 
Worker.hv1_have_right: 1457*c0909341SAndroid Build Coastguard Worker palignr m1, m5, m4, 2 1458*c0909341SAndroid Build Coastguard Worker paddw m0, m4, m1 1459*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m4, m1 1460*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1461*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4, m1 1462*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1463*c0909341SAndroid Build Coastguard Worker palignr m5, m4, 4 1464*c0909341SAndroid Build Coastguard Worker paddw m0, m5 ; h sum 1465*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m5, m6 1466*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 1467*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 1468*c0909341SAndroid Build Coastguard Worker pmaddwd m5, m5 1469*c0909341SAndroid Build Coastguard Worker paddd m2, m1 ; h sumsq 1470*c0909341SAndroid Build Coastguard Worker paddd m3, m5 1471*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+r10+400*0] 1472*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t2+r10+400*2] 1473*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t2+r10+400*4] 1474*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*0], m0 1475*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*2], m2 1476*c0909341SAndroid Build Coastguard Worker mova [t2+r10+400*4], m3 1477*c0909341SAndroid Build Coastguard Worker paddd m4, m8 1478*c0909341SAndroid Build Coastguard Worker paddd m5, m8 1479*c0909341SAndroid Build Coastguard Worker psrld m4, 4 ; (a + 8) >> 4 1480*c0909341SAndroid Build Coastguard Worker psrld m5, 4 1481*c0909341SAndroid Build Coastguard Worker pslld m2, m4, 3 1482*c0909341SAndroid Build Coastguard Worker pslld m3, m5, 3 1483*c0909341SAndroid Build Coastguard Worker paddd m4, m2 ; ((a + 8) >> 4) * 9 1484*c0909341SAndroid Build Coastguard Worker paddd m5, m3 1485*c0909341SAndroid Build Coastguard Worker psrlw m3, m1, 1 1486*c0909341SAndroid Build Coastguard Worker pavgw m3, m6 ; (b + 2) 
>> 2 1487*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 1488*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1489*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1490*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1491*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m6 ; b 1492*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 1493*c0909341SAndroid Build Coastguard Worker pmaxud m4, m2 1494*c0909341SAndroid Build Coastguard Worker psubd m4, m2 ; p 1495*c0909341SAndroid Build Coastguard Worker pmaxud m5, m3 1496*c0909341SAndroid Build Coastguard Worker psubd m5, m3 1497*c0909341SAndroid Build Coastguard Worker pmulld m4, m9 ; p * s 1498*c0909341SAndroid Build Coastguard Worker pmulld m5, m9 1499*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m11 ; b * 455 1500*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m11 1501*c0909341SAndroid Build Coastguard Worker paddw m4, m11 1502*c0909341SAndroid Build Coastguard Worker paddw m5, m11 1503*c0909341SAndroid Build Coastguard Worker psrld m4, 20 ; z + 1 1504*c0909341SAndroid Build Coastguard Worker psrld m5, 20 1505*c0909341SAndroid Build Coastguard Worker cvtdq2ps m4, m4 1506*c0909341SAndroid Build Coastguard Worker cvtdq2ps m5, m5 1507*c0909341SAndroid Build Coastguard Worker rcpps m2, m4 ; 1 / (z + 1) 1508*c0909341SAndroid Build Coastguard Worker rcpps m3, m5 1509*c0909341SAndroid Build Coastguard Worker pcmpgtd m4, m12, m4 1510*c0909341SAndroid Build Coastguard Worker pcmpgtd m5, m12, m5 1511*c0909341SAndroid Build Coastguard Worker mulps m2, m12 ; 256 / (z + 1) 1512*c0909341SAndroid Build Coastguard Worker mulps m3, m12 1513*c0909341SAndroid Build Coastguard Worker psrld m4, 24 ; z < 255 ? 
; 255 : 0   (tail of the comment that begins at the end of the preceding line;
;            presumably "z < 255 ? 255 : 0", as at the same step in .v0/.v1 below)
;
; NOTE: this region was recovered from a listing in which every statement was
; prefixed with viewer line-number residue and many statements were collapsed
; onto one physical line. The residue has been removed and one statement per
; line restored; instructions, operands and original comments are unchanged.
;
; ---- remainder of .hv1_loop (the routine starts above this excerpt) ----
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*2 +4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl .hv1_loop
    ; swap the t1/t2 row pointers, using r10 as scratch
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret

;-----------------------------------------------------------------------
; .v0 - vertical pass for an even row without a new input row.
;   Loads the three row buffers at t1 (+400*0 words, +400*2/+400*4 dwords),
;   doubles them, adds the matching rows at t2, and writes the doubled t1
;   values back to t2. From the totals it derives x (stored as words at
;   t4+400*0) and the scaled b product (stored as dwords at t3+400*0).
;   r10 starts at wq-4 and advances by 32 until it becomes non-negative.
;   Per the inline comments: m8 = rounding term 8, m9 = s, m11 supplies the
;   455 multiplier, m12 = 256 (also the compare bound), m13 = final rounding.
;   m6 is used as the zero operand for unpacking - presumably zeroed by the
;   function setup, which is outside this excerpt (TODO confirm).
;   Clobbers m0-m5 and r10.
;-----------------------------------------------------------------------
.v0: ; vertical boxsums + ab (even rows)
    lea             r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m4
    mova            [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2 ; max() before the subtract keeps p from going below 0
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11 ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12 ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*0+ 4], m2
    ; packssdw interleaved the 128-bit lanes, so the dword results are
    ; written out lane by lane to keep them in pixel order
    mova            [t3+r10*2+400*0+ 8], xm0
    vextracti128    [t3+r10*2+400*0+40], m0, 1
    mova            [t3+r10*2+400*0+24], xm1
    vextracti128    [t3+r10*2+400*0+56], m1, 1
    add             r10, 32
    jl .v0_loop
    ret

;-----------------------------------------------------------------------
; .v1 - vertical pass for an odd row without a new input row.
;   Same computation as .v0, except the t1 rows are not doubled, the
;   results go to t4+400*2 / t3+400*4, and t1/t2 are swapped on exit.
;   Clobbers m0-m5 and r10.
;-----------------------------------------------------------------------
.v1: ; vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m4
    mova            [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11 ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20 ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12 ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24 ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4 ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*2+ 4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl .v1_loop
    ; swap the t1/t2 row pointers, using r10 as scratch
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret

;-----------------------------------------------------------------------
; .prep_n - initial neighbor setup.
;   For the row at t4+400*0 / t3+400*0 and the row at t4+400*2 / t3+400*4
;   it forms the horizontally weighted sums that the inline comments call
;   "444" (4 * (left + centre + right)) and "343" (444 minus left+right).
;   Stored: row -1 "343" at t4+400*4 / t3+400*8, row 0 "444" at
;   t4+400*6 / t3+400*12, row 0 "343" at t4+400*8 / t3+400*16.
;   r10 runs from wq upward in steps of 16. Clobbers m0-m3 and r10.
;-----------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    mova            xm0, [t4+r10*1+400*0+0]
    paddw           xm0, [t4+r10*1+400*0+4]
    paddw           xm2, xm0, [t4+r10*1+400*0+2]
    mova            m1, [t3+r10*2+400*0+0]
    paddd           m1, [t3+r10*2+400*0+8]
    paddd           m3, m1, [t3+r10*2+400*0+4]
    psllw           xm2, 2 ; a[-1] 444
    pslld           m3, 2 ; b[-1] 444
    psubw           xm2, xm0 ; a[-1] 343
    psubd           m3, m1 ; b[-1] 343
    mova            [t4+r10*1+400* 4], xm2
    mova            [t3+r10*2+400* 8], m3
    mova            xm0, [t4+r10*1+400*2+0]
    paddw           xm0, [t4+r10*1+400*2+4]
    paddw           xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw           xm2, 2 ; a[ 0] 444
    pslld           m3, 2 ; b[ 0] 444
    mova            [t4+r10*1+400* 6], xm2
    mova            [t3+r10*2+400*12], m3
    psubw           xm2, xm0 ; a[ 0] 343
    psubd           m3, m1 ; b[ 0] 343
    mova            [t4+r10*1+400* 8], xm2
    mova            [t3+r10*2+400*16], m3
    add             r10, 16
    jl .prep_n_loop
    ret

;-----------------------------------------------------------------------
; .n0 - neighbor weighting + output for an even row.
;   Builds the "444"/"343" sums for the newest row (t4+400*0 / t3+400*0),
;   adds the saved sums at t4+400*4, t4+400*6 (t3+400*8, t3+400*12), and
;   overwrites those two slots with the new row's 343 and 444 values.
;   It then combines the result with the pixels at dstq, scales by m7
;   (pmulhrsw), adds it to the pixels, clamps to [m6, m14] and stores back.
;   m6 is presumably zero and m14 the maximum pixel value - both are set
;   outside this excerpt (TODO confirm). Advances dstq by strideq.
;   Clobbers m0-m5 and r10.
;-----------------------------------------------------------------------
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov             r10, wq
.n0_loop:
    mova            m3, [t4+r10*1+400*0+0]
    paddw           m3, [t4+r10*1+400*0+4]
    paddw           m1, m3, [t4+r10*1+400*0+2]
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*4]
    paddw           m3, [t4+r10*1+400*6]
    mova            [t4+r10*1+400*4], m2
    mova            [t4+r10*1+400*6], m1
    mova            m4, [t3+r10*2+400*0+0]
    paddd           m4, [t3+r10*2+400*0+8]
    paddd           m1, m4, [t3+r10*2+400*0+4]
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
    paddd           m4, [t3+r10*2+400*12+ 0]
    mova            [t3+r10*2+400* 8+ 0], m2
    mova            [t3+r10*2+400*12+ 0], m1
    mova            m5, [t3+r10*2+400*0+32]
    paddd           m5, [t3+r10*2+400*0+40]
    paddd           m1, m5, [t3+r10*2+400*0+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400* 8+32]
    paddd           m5, [t3+r10*2+400*12+32]
    mova            [t3+r10*2+400* 8+32], m2
    mova            [t3+r10*2+400*12+32], m1
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    ; regroup the two b vectors to match the per-lane layout of the
    ; punpckl/punpckh products above
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2 ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova            [dstq+r10], m0
    add             r10, 32
    jl .n0_loop
    add             dstq, strideq
    ret

;-----------------------------------------------------------------------
; .n1 - neighbor weighting + output for an odd row.
;   Mirror of .n0 for the row at t4+400*2 / t3+400*4: it adds the saved
;   sums at t4+400*6, t4+400*8 (t3+400*12, t3+400*16) and refreshes those
;   slots with the new 444 and 343 values respectively.
;   Advances dstq by strideq. Clobbers m0-m5 and r10.
;-----------------------------------------------------------------------
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov             r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+400*2+0]
    paddw           m3, [t4+r10*1+400*2+4]
    paddw           m1, m3, [t4+r10*1+400*2+2]
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*6]
    paddw           m3, [t4+r10*1+400*8]
    mova            [t4+r10*1+400*6], m1
    mova            [t4+r10*1+400*8], m2
    mova            m4, [t3+r10*2+400*4+0]
    paddd           m4, [t3+r10*2+400*4+8]
    paddd           m1, m4, [t3+r10*2+400*4+4]
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400*12+ 0]
    paddd           m4, [t3+r10*2+400*16+ 0]
    mova            [t3+r10*2+400*12+ 0], m1
    mova            [t3+r10*2+400*16+ 0], m2
    mova            m5, [t3+r10*2+400*4+32]
    paddd           m5, [t3+r10*2+400*4+40]
    paddd           m1, m5, [t3+r10*2+400*4+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400*12+32]
    paddd           m5, [t3+r10*2+400*16+32]
    mova            [t3+r10*2+400*12+32], m1
    mova            [t3+r10*2+400*16+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2 ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova            [dstq+r10], m0
    add             r10, 32
    jl .n1_loop
    add             dstq, strideq
    ret

;-----------------------------------------------------------------------
; sgr_filter_mix_16bpc(dst, stride, left, lpf, w, h, edge, params)
;   Entry point of the "mix" self-guided filter for 16 bpc.
;   Declared with 400*66+8 bytes of stack; [rsp] holds the pointer to the
;   rows below the block, t1/t3/t4 point into the rest of that area.
;   Setup visible here:
;     wq   = width in bytes (w doubled), then negated so that loops can
;            count a negative index up towards zero
;     dstq / lpfq are advanced by the byte width to pair with that index
;     m15 = w0 w1 (shifted left by 2), m13 = s0, m14 = s1,
;     m9 = pd_8, m10 = pd_34816, m11 = pf_256, m12 = pw_455_24, m7 = 0
;   edge bits tested: 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT,
;                     4 = LR_HAVE_TOP,  8 = LR_HAVE_BOTTOM
;   The local labels called below (.hv1, .prep_n, .n0, .n1, .v0, .v1, ...)
;   belong to this function; apart from .h and the start of .hv0 their
;   definitions lie beyond this excerpt.
;-----------------------------------------------------------------------

cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov             paramsq, r6mp
    lea             r13, [pb_m10_m9]
    add             wd, wd
    movifnidn       hd, hm
    mov             edged, r7m
    add             lpfq, wq
    vpbroadcastd    m15, [paramsq+8] ; w0 w1
    add             dstq, wq
    vpbroadcastd    m13, [paramsq+0] ; s0
    lea             t3, [rsp+wq*2+400*24+8]
    vpbroadcastd    m14, [paramsq+4] ; s1
    lea             t4, [rsp+wq+400*52+8]
    vpbroadcastd    m9, [base+pd_8]
    lea             t1, [rsp+wq+12]
    vpbroadcastd    m10, [base+pd_34816]
    neg             wq
    vbroadcastss    m11, [base+pf_256]
    pxor            m7, m7
    vpbroadcastd    m12, [base+pw_455_24]
    psllw           m15, 2
    test            edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; top rows available: run the horizontal pass on both lpf rows
    call .h_top
    add             lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add             t1, 400*12
    call .h_top
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    add             r10, strideq
    mov             [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add             lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    ; two rows per iteration: even row via .hv0, odd row via .hv1
    add             lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add             lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov             lpfq, [rsp] ; pointer saved during setup ("below")
    call .hv0_bottom
    add             lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    ; no rows below: vertical-only passes reuse the rows already summed
    call .v0
    call .v1
    jmp .end
.no_top:
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call .h
    lea             r10, [wq-4]
    lea             t2, [t1+400*12]
.top_fixup_loop:
    ; copy the six row buffers from t1 to t2, doubling the three at
    ; +400*0/2/4 and copying those at +400*6/8/10 unchanged
    mova            m0, [t1+r10+400* 0]
    mova            m1, [t1+r10+400* 2]
    mova            m2, [t1+r10+400* 4]
    paddw           m0, m0
    mova            m3, [t1+r10+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+400*10]
    mova            [t2+r10+400* 0], m0
    mova            [t2+r10+400* 2], m1
    mova            [t2+r10+400* 4], m2
    mova            [t2+r10+400* 6], m3
    mova            [t2+r10+400* 8], m4
    mova            [t2+r10+400*10], m5
    add             r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main

;-----------------------------------------------------------------------
; .h / .h_top - horizontal box sums for one input row at lpfq.
;   .h     : entry for rows that may take their left pixels from leftq
;            (leftq is advanced by 8 when LR_HAVE_LEFT is set)
;   .h_top : entry that reads the left pixels from lpfq itself
;   Without LR_HAVE_LEFT both entries pad the left edge via sgr_lshuf5.
;   Without LR_HAVE_RIGHT the shared 5x5 .extend_right helper is called
;   once r10 is no longer below -36.
;   Writes to t1: sum5 at +400*0, sumsq5 at +400*2/+400*4,
;                 sum3 at +400*6, sumsq3 at +400*8/+400*10.
;   Clobbers m0-m6, m8 and r10.
;-----------------------------------------------------------------------
.h: ; horizontal boxsum
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -36
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0 ; sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6 ; sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    mova            [t1+r10+400* 6], m1
    mova            [t1+r10+400* 8], m2
    mova            [t1+r10+400*10], m3
    paddw           m8, m1 ; sum5
    paddd           m5, m2 ; sumsq5
    paddd           m6, m3
    mova            [t1+r10+400* 0], m8
    mova            [t1+r10+400* 2], m5
    mova            [t1+r10+400* 4], m6
    add             r10, 32
    jl .h_loop
    ret

;-----------------------------------------------------------------------
; .hv0 / .hv0_bottom - horizontal + vertical box sums and the 3x3 a/b
;   step for an even row. Entry handling (left/right edges) mirrors
;   .h / .h_top above. Only the first part of this routine is visible in
;   this excerpt; it continues past the last line shown here.
;-----------------------------------------------------------------------
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv0_main
.hv0_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10- 2]
.hv0_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp             r10d, -36
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0 ; h sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6 ; h sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    paddw           m8, m1 ; h sum5
    paddd           m5, m2 ; h sumsq5
    paddd           m6, m3
    mova            [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
    mova            [t3+r10*2+400*0+ 8], m5 ; in case height is odd
    mova            [t3+r10*2+400*0+40], m6
    ; accumulate the 5-tap row sums into the t1 buffers
    paddw           m8, [t1+r10+400* 0]
    paddd           m5, [t1+r10+400* 2]
    paddd           m6, [t1+r10+400* 4]
    mova            [t1+r10+400* 0], m8
    mova            [t1+r10+400* 2], m5
    mova            [t1+r10+400* 4], m6
    ; 3-tap sums: new row + row at t1 + row at t2; the buffers are then
    ; refreshed with the new row (t1) and the two-row partial sum (t2)
    paddw           m0, m1, [t1+r10+400* 6]
    paddd           m4, m2, [t1+r10+400* 8]
    paddd           m5, m3, [t1+r10+400*10]
    mova            [t1+r10+400* 6], m1
    mova            [t1+r10+400* 8], m2
    mova            [t1+r10+400*10], m3
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova            [t2+r10+400* 6], m0
    mova            [t2+r10+400* 8], m4
    mova            [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7 ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2 ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20 ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4 ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11 ; 256
; ... / (z3 + 1)        (end of the "256 / (z3 + 1)" note on the preceding mulps)
; ---- closing part of the .hv0 loop; its beginning precedes this span ----
    mulps           m3, m11
    psrld           m4, 24                  ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4                  ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10                 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*2+ 4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl .hv0_loop
    ret

;-----------------------------------------------------------------------
; .hv1 / .hv1_bottom -- odd rows:
;   horizontal box sums of the current row + vertical accumulation,
;   then the a/b terms of both the 3x3 and the 5x5 box.
;
; Entry points
;   .hv1         with LR_HAVE_LEFT the first chunk takes its left pixels
;                from [leftq] (leftq is advanced by 8); without it the
;                start of the row is replicated through sgr_lshuf5.
;   .hv1_bottom  same loop, but the left pixels are read from the row
;                itself ([lpfq+r10-2]) when LR_HAVE_LEFT is set.
; Loop counter
;   r10 starts at wq-4 and advances by 32 until it is no longer negative.
; Stores
;   t2+400*0/2/4    5x5 sum / sumsq (lo, hi) of the current row
;   t2+400*6/8/10   3x3 sum / sumsq (lo, hi) of the current row
;   t4+400*4, t3+400*8   packed x3 and scaled b3 products
;   t4+400*0, t3+400*0   packed x5 and scaled b5 products
; On exit t1 and t2 are exchanged (r10 is the temporary).
; NOTE(review): m7 is used as the zero operand of the unpacks and is
; re-zeroed below after serving as scratch, so it is presumably zero on
; entry; m9-m14 hold constants loaded outside this span -- confirm
; against the function prologue.
;-----------------------------------------------------------------------
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv1_main
.hv1_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10- 2]
.hv1_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2                ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp             r10d, -36               ; only the final chunk(s) need right padding
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    ; --- horizontal sums: taps 1..3 form the 3-wide sum, taps 0 and 4 extend it to 5
    palignr         m6, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m6, m3
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    palignr         m3, m5, m4, 6
    paddw           m2, m3                  ; h sum3
    punpcklwd       m1, m3, m7
    pmaddwd         m1, m1
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    paddd           m0, m1                  ; h sumsq3 (low half)
    shufpd          m1, m4, m5, 0x05
    punpckhwd       m5, m4, m1
    paddw           m8, m4, m1
    pmaddwd         m5, m5
    punpcklwd       m4, m1
    pmaddwd         m4, m4
    paddd           m6, m3                  ; h sumsq3 (high half)
    ; --- vertical accumulation for the 3x3 box; t2 receives this row's sums
    paddw           m1, m2, [t2+r10+400* 6]
    mova            [t2+r10+400* 6], m2
    paddw           m8, m2                  ; h sum5
    paddd           m2, m0, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova            [t2+r10+400* 8], m0
    mova            [t2+r10+400*10], m6
    paddd           m4, m0                  ; h sumsq5 (low half)
    paddd           m5, m6                  ; h sumsq5 (high half)
    ; --- 3x3: p3 = max(a3 * 9 - b3 * b3, 0) on the rounded-down terms
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m6, m3, 3
    paddd           m2, m0                  ; ((a3 + 8) >> 4) * 9
    paddd           m3, m6
    psrlw           m6, m1, 1
    pavgw           m6, m7                  ; (b3 + 2) >> 2
    punpcklwd       m0, m6, m7
    pmaddwd         m0, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m6
    pmaxud          m2, m0                  ; max + sub clamps the difference at 0
    psubd           m2, m0                  ; p3
    pmaxud          m3, m6
    psubd           m3, m6
    punpcklwd       m0, m1, m7              ; b3
    punpckhwd       m1, m7
    pmulld          m2, m14                 ; p3 * s1
    pmulld          m3, m14
    pmaddwd         m0, m12                 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m2, m12
    paddw           m3, m12
    psrld           m2, 20                  ; z3 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m6, m2                  ; 1 / (z3 + 1)
    rcpps           m7, m3                  ; m7 becomes scratch from here on
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m6, m11                 ; 256 / (z3 + 1)
    mulps           m7, m11
    psrld           m2, 24                  ; z3 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m6, m6
    cvtps2dq        m7, m7
    pminsw          m6, m2                  ; x3
    pminsw          m7, m3
    pmulld          m0, m6                  ; must precede the pack: m6 still holds dwords
    packssdw        m6, m7
    pmulld          m7, m1
    paddd           m0, m10                 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m7, m10
    psrld           m0, 12
    psrld           m7, 12
    ; --- vertical accumulation for the 5x5 box
    paddw           m1, m8, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova            [t2+r10+400*0], m8
    mova            [t2+r10+400*2], m4
    mova            [t2+r10+400*4], m5
    mova            [t4+r10*1+400*4 +4], m6
    mova            [t3+r10*2+400*8+ 8], xm0
    vextracti128    [t3+r10*2+400*8+40], m0, 1
    mova            [t3+r10*2+400*8+24], xm7
    vextracti128    [t3+r10*2+400*8+56], m7, 1
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    pxor            m7, m7                  ; restore the zero register
    ; --- 5x5: p5 = max(a5 * 25 - b5 * b5, 0)
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4                  ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7                  ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7              ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4                  ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13                 ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6                  ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20                  ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2                  ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11                 ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24                  ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2                  ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10                 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*0+ 4], m4
    mova            [t3+r10*2+400*0+ 8], xm0
    vextracti128    [t3+r10*2+400*0+40], m0, 1
    mova            [t3+r10*2+400*0+24], xm1
    vextracti128    [t3+r10*2+400*0+56], m1, 1
    add             r10, 32
    jl .hv1_loop
    mov             r10, t2                 ; exchange t1 <-> t2
    mov             t2, t1
    mov             t1, r10
    ret
; ---- opening of .v0; its loop body follows this span ----
.v0: ; vertical boxsums + ab3 (even rows)
    lea             r10, [wq-4]
;-----------------------------------------------------------------------
; .v0 loop body (the ".v0:" label and "lea r10, [wq-4]" sit immediately
; before this span).
;
; Even rows, vertical pass only: no pixel row is read. The 3x3 row sums
; held at t1+400*6/8/10 are doubled, added to the sums in t2 and written
; back to t2; the 3x3 a/b terms are then derived exactly as in the
; horizontal variants.
; Stores
;   t2+400*6/8/10        doubled 3x3 row sums
;   t3+400*8+8, t3+400*0+8/+40   untouched copy of the 5x5 row sums from t1
;   t1+400*0/2/4         the same 5x5 row sums, doubled
;   t4+400*2, t3+400*4   packed x3 and scaled b3 products
; r10 advances by 32 until it is no longer negative.
; NOTE(review): relies on m7 == 0 and on the constants in m9-m12/m14
; being set up by the caller -- confirm against the function prologue.
;-----------------------------------------------------------------------
.v0_loop:
    mova            m0, [t1+r10+400* 6]
    mova            m4, [t1+r10+400* 8]
    mova            m5, [t1+r10+400*10]
    paddw           m0, m0                  ; row sums counted twice
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova            [t2+r10+400* 6], m0
    mova            [t2+r10+400* 8], m4
    mova            [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                  ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7                  ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7              ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2                  ; max + sub clamps the difference at 0
    psubd           m4, m2                  ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14                 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12                 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20                  ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4                  ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11                 ; 256 / (z3 + 1)
    mulps           m3, m11
    psrld           m4, 24                  ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4                  ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10                 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    ; keep an undoubled copy of the 5x5 row sums in t3, then double them in t1
    mova            m3, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    mova            [t3+r10*2+400*8+ 8], m3
    mova            [t3+r10*2+400*0+ 8], m4
    mova            [t3+r10*2+400*0+40], m5
    paddw           m3, m3                  ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova            [t1+r10+400*0], m3
    mova            [t1+r10+400*2], m4
    mova            [t1+r10+400*4], m5
    mova            [t4+r10*1+400*2+ 4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl .v0_loop
    ret
; ---- opening of .v1; the remainder of its loop body follows this span ----
; Odd rows, vertical pass only: the 3x3 row sums at t1+400*6/8/10 are added
; to those in t2 and then copied into t2.
.v1: ; vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
.v1_loop:
    mova            m4, [t1+r10+400* 6]
    mova            m5, [t1+r10+400* 8]
    mova            m6, [t1+r10+400*10]
    paddw           m1, m4, [t2+r10+400* 6]
    paddd           m2, m5, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova            [t2+r10+400* 6], m4
    mova            [t2+r10+400* 8], m5
    mova            [t2+r10+400*10], m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2                  ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7                  ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7              ; b3
;-----------------------------------------------------------------------
; Remainder of the .v1 loop body (odd rows, vertical pass only). The
; ".v1:" / ".v1_loop:" labels and the first part of the 3x3 stage come
; immediately before this span; the closing "ret" comes right after it.
;
; Stores
;   t4+400*4, t3+400*8   packed x3 and scaled b3 products
;   t2+400*0/2/4         5x5 row sums taken from the copy kept in t3
;   t4+400*0, t3+400*0   packed x5 and scaled b5 products
; After the loop t1 and t2 are exchanged (r10 is the temporary).
; NOTE(review): relies on m7 == 0 and on the constants in m9-m14 being
; set up by the caller -- confirm against the function prologue.
;-----------------------------------------------------------------------
    punpckhwd       m1, m7
    pmaxud          m4, m2                  ; max + sub clamps the difference at 0
    psubd           m4, m2                  ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14                 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12                 ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20                  ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4                  ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11                 ; 256 / (z3 + 1)
    mulps           m3, m11
    psrld           m4, 24                  ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4                  ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10                 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m8, m1, 12              ; high half parked in m8: m1 is reused below
    mova            [t4+r10*1+400*4+4], m2
    ; fetch the 5x5 row-sum copy from t3 BEFORE t3+400*8 is overwritten with b3
    mova            m4, [t3+r10*2+400*8+ 8]
    mova            m5, [t3+r10*2+400*0+ 8]
    mova            m6, [t3+r10*2+400*0+40]
    paddw           m1, m4, [t2+r10+400*0]
    paddd           m2, m5, [t2+r10+400*2]
    paddd           m3, m6, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova            [t2+r10+400*0], m4
    mova            [t2+r10+400*2], m5
    mova            [t2+r10+400*4], m6
    mova            [t3+r10*2+400*8+ 8], xm0
    vextracti128    [t3+r10*2+400*8+40], m0, 1
    mova            [t3+r10*2+400*8+24], xm8
    vextracti128    [t3+r10*2+400*8+56], m8, 1
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    ; --- 5x5: p5 = max(a5 * 25 - b5 * b5, 0)
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4                   ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4                  ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7                  ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7              ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4                  ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13                 ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6                  ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20                  ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2                  ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11                 ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24                  ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2                  ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10                 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*0+ 4], m4
    mova            [t3+r10*2+400*0+ 8], xm0
    vextracti128    [t3+r10*2+400*0+40], m0, 1
    mova            [t3+r10*2+400*0+24], xm1
    vextracti128    [t3+r10*2+400*0+56], m1, 1
    add             r10, 32
    jl .v1_loop
    mov             r10, t2                 ; exchange t1 <-> t2
    mov             t2, t1
    mov             t1, r10
; ---- return closing the routine that precedes .prep_n ----
    ret

;-----------------------------------------------------------------------
; .prep_n -- initial neighbor setup.
;
; Builds horizontally weighted sums of three adjacent entries (L, M, R)
; from the x planes in t4 (words, xmm-wide) and the b planes in t3
; (dwords, ymm-wide):
;   565 : 5*L + 6*M + 5*R   = M + (L+M+R) + 4*(L+M+R)
;   444 : 4*(L+M+R)
;   343 : 4*(L+M+R) - (L+R) = 3*L + 4*M + 3*R
; Reads
;   t4+400*0 / t3+400*0   5x5 terms
;   t4+400*2 / t3+400*4   3x3 terms, row -1
;   t4+400*4 / t3+400*8   3x3 terms, row  0
; Stores
;   t4+400* 6 / t3+400*12   a5 / b5 565
;   t4+400* 8 / t3+400*16   a3 / b3 [-1] 343
;   t4+400*10 / t3+400*20   a3 / b3 [ 0] 444
;   t4+400*12 / t3+400*24   a3 / b3 [ 0] 343
; r10 starts at wq and advances by 16 until it is no longer negative.
;-----------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    movu            xm0, [t4+r10*1+400*0+2] ; M
    paddw           xm2, xm0, [t4+r10*1+400*0+0]
    paddw           xm2, [t4+r10*1+400*0+4] ; L + M + R
    movu            m1, [t3+r10*2+400*0+4]
    paddd           m3, m1, [t3+r10*2+400*0+0]
    paddd           m3, [t3+r10*2+400*0+8]
    paddw           xm0, xm2
    paddd           m1, m3
    psllw           xm2, 2
    pslld           m3, 2
    paddw           xm0, xm2                ; a5 565
    paddd           m1, m3                  ; b5 565
    mova            [t4+r10*1+400* 6], xm0
    mova            [t3+r10*2+400*12], m1
    mova            xm0, [t4+r10*1+400*2+0]
    paddw           xm0, [t4+r10*1+400*2+4] ; L + R
    paddw           xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw           xm2, 2                  ; a3[-1] 444
    pslld           m3, 2                   ; b3[-1] 444
    psubw           xm2, xm0                ; a3[-1] 343
    psubd           m3, m1                  ; b3[-1] 343
    mova            [t4+r10*1+400* 8], xm2
    mova            [t3+r10*2+400*16], m3
    mova            xm0, [t4+r10*1+400*4+0]
    paddw           xm0, [t4+r10*1+400*4+4] ; L + R
    paddw           xm2, xm0, [t4+r10*1+400*4+2]
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m3, m1, [t3+r10*2+400*8+4]
    psllw           xm2, 2                  ; a3[ 0] 444
    pslld           m3, 2                   ; b3[ 0] 444
    mova            [t4+r10*1+400*10], xm2
    mova            [t3+r10*2+400*20], m3
    psubw           xm2, xm0                ; a3[ 0] 343
    psubd           m3, m1                  ; b3[ 0] 343
    mova            [t4+r10*1+400*12], xm2
    mova            [t3+r10*2+400*24], m3
    add             r10, 16
    jl .prep_n_loop
    ret
; ---- opening of .n0; the remainder of its loop body follows this span ----
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov             r10, wq
    vpbroadcastd    m6, [base+pd_4096]
.n0_loop:
    movu            xm2, [t4+r10*1+2]
    paddw           xm0, xm2, [t4+r10*1+0]
    paddw           xm0, [t4+r10*1+4]
    paddw           xm2, xm0
    psllw           xm0, 2
Coastguard Worker paddw xm0, xm2 ; a5 2590*c0909341SAndroid Build Coastguard Worker movu m1, [t3+r10*2+4] 2591*c0909341SAndroid Build Coastguard Worker paddd m4, m1, [t3+r10*2+0] 2592*c0909341SAndroid Build Coastguard Worker paddd m4, [t3+r10*2+8] 2593*c0909341SAndroid Build Coastguard Worker paddd m1, m4 2594*c0909341SAndroid Build Coastguard Worker pslld m4, 2 2595*c0909341SAndroid Build Coastguard Worker paddd m4, m1 ; b5 2596*c0909341SAndroid Build Coastguard Worker paddw xm2, xm0, [t4+r10*1+400* 6] 2597*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+400* 6], xm0 2598*c0909341SAndroid Build Coastguard Worker paddd m0, m4, [t3+r10*2+400*12] 2599*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*12], m4 2600*c0909341SAndroid Build Coastguard Worker mova xm3, [t4+r10*1+400*2+0] 2601*c0909341SAndroid Build Coastguard Worker paddw xm3, [t4+r10*1+400*2+4] 2602*c0909341SAndroid Build Coastguard Worker paddw xm5, xm3, [t4+r10*1+400*2+2] 2603*c0909341SAndroid Build Coastguard Worker psllw xm5, 2 ; a3[ 1] 444 2604*c0909341SAndroid Build Coastguard Worker psubw xm4, xm5, xm3 ; a3[ 1] 343 2605*c0909341SAndroid Build Coastguard Worker paddw xm3, xm4, [t4+r10*1+400* 8] 2606*c0909341SAndroid Build Coastguard Worker paddw xm3, [t4+r10*1+400*10] 2607*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+400* 8], xm4 2608*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+400*10], xm5 2609*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*2+400*4+0] 2610*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+400*4+8] 2611*c0909341SAndroid Build Coastguard Worker paddd m5, m1, [t3+r10*2+400*4+4] 2612*c0909341SAndroid Build Coastguard Worker pslld m5, 2 ; b3[ 1] 444 2613*c0909341SAndroid Build Coastguard Worker psubd m4, m5, m1 ; b3[ 1] 343 2614*c0909341SAndroid Build Coastguard Worker paddd m1, m4, [t3+r10*2+400*16] 2615*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+400*20] 2616*c0909341SAndroid Build Coastguard Worker mova 
[t3+r10*2+400*16], m4 2617*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*20], m5 2618*c0909341SAndroid Build Coastguard Worker pmovzxwd m4, [dstq+r10] 2619*c0909341SAndroid Build Coastguard Worker pmovzxwd m2, xm2 ; a5 2620*c0909341SAndroid Build Coastguard Worker pmovzxwd m3, xm3 ; a3 2621*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m4 ; a5 * src 2622*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m4 ; a3 * src 2623*c0909341SAndroid Build Coastguard Worker pslld m4, 13 2624*c0909341SAndroid Build Coastguard Worker psubd m0, m2 ; b5 - a5 * src + (1 << 8) 2625*c0909341SAndroid Build Coastguard Worker psubd m1, m3 ; b3 - a3 * src + (1 << 8) 2626*c0909341SAndroid Build Coastguard Worker psrld m0, 9 2627*c0909341SAndroid Build Coastguard Worker pslld m1, 7 2628*c0909341SAndroid Build Coastguard Worker pblendw m0, m1, 0xaa 2629*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m15 2630*c0909341SAndroid Build Coastguard Worker paddd m4, m6 2631*c0909341SAndroid Build Coastguard Worker paddd m0, m4 2632*c0909341SAndroid Build Coastguard Worker psrad m0, 7 2633*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 2634*c0909341SAndroid Build Coastguard Worker packusdw xm0, xm1 ; clip 2635*c0909341SAndroid Build Coastguard Worker psrlw xm0, 6 2636*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], xm0 2637*c0909341SAndroid Build Coastguard Worker add r10, 16 2638*c0909341SAndroid Build Coastguard Worker jl .n0_loop 2639*c0909341SAndroid Build Coastguard Worker add dstq, strideq 2640*c0909341SAndroid Build Coastguard Worker ret 2641*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2642*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows) 2643*c0909341SAndroid Build Coastguard Worker mov r10, wq 2644*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [base+pd_4096] 2645*c0909341SAndroid Build Coastguard Worker.n1_loop: 2646*c0909341SAndroid Build Coastguard Worker mova xm3, 
[t4+r10*1+400*4+0] 2647*c0909341SAndroid Build Coastguard Worker paddw xm3, [t4+r10*1+400*4+4] 2648*c0909341SAndroid Build Coastguard Worker paddw xm5, xm3, [t4+r10*1+400*4+2] 2649*c0909341SAndroid Build Coastguard Worker psllw xm5, 2 ; a3[ 1] 444 2650*c0909341SAndroid Build Coastguard Worker psubw xm4, xm5, xm3 ; a3[ 1] 343 2651*c0909341SAndroid Build Coastguard Worker paddw xm3, xm4, [t4+r10*1+400*12] 2652*c0909341SAndroid Build Coastguard Worker paddw xm3, [t4+r10*1+400*10] 2653*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+400*10], xm5 2654*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+400*12], xm4 2655*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*2+400*8+0] 2656*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+400*8+8] 2657*c0909341SAndroid Build Coastguard Worker paddd m5, m1, [t3+r10*2+400*8+4] 2658*c0909341SAndroid Build Coastguard Worker pslld m5, 2 ; b3[ 1] 444 2659*c0909341SAndroid Build Coastguard Worker psubd m4, m5, m1 ; b3[ 1] 343 2660*c0909341SAndroid Build Coastguard Worker paddd m1, m4, [t3+r10*2+400*24] 2661*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+400*20] 2662*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*20], m5 2663*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+400*24], m4 2664*c0909341SAndroid Build Coastguard Worker pmovzxwd m4, [dstq+r10] 2665*c0909341SAndroid Build Coastguard Worker pmovzxwd m2, [t4+r10*1+400* 6] 2666*c0909341SAndroid Build Coastguard Worker pmovzxwd m3, xm3 2667*c0909341SAndroid Build Coastguard Worker mova m0, [t3+r10*2+400*12] 2668*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m4 ; a5 * src 2669*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m4 ; a3 * src 2670*c0909341SAndroid Build Coastguard Worker pslld m4, 13 2671*c0909341SAndroid Build Coastguard Worker psubd m0, m2 ; b5 - a5 * src + (1 << 8) 2672*c0909341SAndroid Build Coastguard Worker psubd m1, m3 ; b3 - a3 * src + (1 << 8) 2673*c0909341SAndroid Build Coastguard 
Worker psrld m0, 8 2674*c0909341SAndroid Build Coastguard Worker pslld m1, 7 2675*c0909341SAndroid Build Coastguard Worker pblendw m0, m1, 0xaa 2676*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m15 2677*c0909341SAndroid Build Coastguard Worker paddd m4, m6 2678*c0909341SAndroid Build Coastguard Worker paddd m0, m4 2679*c0909341SAndroid Build Coastguard Worker psrad m0, 7 2680*c0909341SAndroid Build Coastguard Worker vextracti128 xm1, m0, 1 2681*c0909341SAndroid Build Coastguard Worker packusdw xm0, xm1 ; clip 2682*c0909341SAndroid Build Coastguard Worker psrlw xm0, 6 2683*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], xm0 2684*c0909341SAndroid Build Coastguard Worker add r10, 16 2685*c0909341SAndroid Build Coastguard Worker jl .n1_loop 2686*c0909341SAndroid Build Coastguard Worker add dstq, strideq 2687*c0909341SAndroid Build Coastguard Worker ret 2688*c0909341SAndroid Build Coastguard Worker 2689*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64 2690