; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 16

; 16-byte pshufb control tables. The Wiener filters broadcast them to every
; 128-bit lane (vbroadcasti128) and use them to gather the 16-bit pixel pairs
; fed to vpdpwssd. An index of -1 makes pshufb write a zero byte.
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
; Select mask for right-edge extension: 72 bytes of all-ones followed by 8
; zero bytes. The filters read it at a row-relative offset as the third
; vpternlogd operand.
r_ext_mask:    times 72 db -1
               times  8 db  0
; Two dword-sized entries, selected with (pixel_max >> 11) * 4.
wiener_hshift: dw 4, 4, 1, 1
; Two dword-sized entries each, selected with (pixel_max >> 11) * 4.
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

pw_164_455:    dw 164, 455
pw_1023:       times 2 dw 1023
pw_61448:      times 2 dw 61448
pd_m262128:    dd -262128 ; (1 << 4) - (1 << 18)
pd_m34816:     dd -34816
pd_m25:        dd -25
pd_m9:         dd -9
pd_8:          dd 8
pd_2147483648: dd 2147483648

cextern sgr_x_by_x

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers

;-----------------------------------------------------------------------------
; wiener_filter7_16bpc(dst, stride, left, lpf, w, h, edge, flt, pixel_max)
;
; Separable, symmetric 7-tap Wiener filter on 16-bit pixels.
;
;   flt       horizontal coefficients at byte offsets 0/4 (x0 x1, x2 x3) and
;             vertical coefficients at byte offsets 16/20 (y0 y1, y2 y3)
;   edge      bit flags: 1 = LR_HAVE_LEFT, 2 = LR_HAVE_RIGHT,
;                        4 = LR_HAVE_TOP,  8 = LR_HAVE_BOTTOM
;   pixel_max read from r8m; (pixel_max >> 11) picks the entry used from
;             wiener_hshift / wiener_round / wiener_vshift
;
; Stack layout: [rsp] holds the pointer to the rows below the block, followed
; (from rsp+16) by six rows of 384*2 bytes of horizontally filtered data.
; t0-t6 point into those rows and are rotated after every output row.
;
; Register roles once setup is done:
;   m6-m9   pshufb controls (wiener_shufA/B/C/D)
;   m12,m13 horizontal coefficients, pre-multiplied by wiener_hshift
;   m14,m15 vertical coefficients
;   m16     horizontal accumulator bias (pd_m262128)
;   m10     vertical accumulator bias (wiener_round)
;   m11     final pmulhuw scale (wiener_vshift)
;   k1      0xfe: masked qword loads that leave qword 0 untouched
;   wq      negative row width in bytes; lpfq/dstq point at the row end, so
;           r10 runs from wq up to 0 in steps of 64
;-----------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
; t4 temporarily holds the address of wiener_hshift so that the other tables
; can be addressed relative to it; it becomes a ring buffer pointer later on.
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd        ; width in bytes (2 bytes per pixel)
    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    shr            t3d, 11        ; table index derived from pixel_max
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           lpfq, wq
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    add           dstq, wq
    vbroadcasti128  m8, [wiener_shufC]
    lea             t1, [rsp+wq+16]
    vbroadcasti128  m9, [wiener_shufD]
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    mov           r10d, 0xfe
    vpbroadcastd   m10, [base+wiener_round+t3*4]
    kmovb           k1, r10d
    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
    pmullw         m12, m0 ; upshift filter coefs to make the
    vpbroadcastd   m16, [pd_m262128]
    pmullw         m13, m0 ; horizontal downshift constant
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; Two rows above the block are available: filter them from lpf first.
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    ; Two rows below the block are available: fetch them via the saved pointer.
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    ; No rows above: the first row of the block stands in for t2..t6.
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
    ; Tail: each .v call emits one more output row without reading new input.
.v3:
    call .v
.v2:
    call .v
    jmp .v1

; .h / .h_top: horizontally filter the row at lpfq into the row buffer at t1.
; .h takes the 8 bytes left of the row from [leftq] (and advances leftq by 8)
; when LR_HAVE_LEFT is set; .h_top reads them from the row itself.
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq           xm3, [leftq]
    vmovdqu64   m3{k1}, [lpfq+r10-8] ; qwords 1-7 from the row, qword 0 kept
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    ; No left neighbours: replicate the first pixel of the row into qword 0.
    mova            m4, [lpfq+r10+0]
    vpbroadcastw   xm3, xm4
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    jmp .h_main2
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -68
    jl .h_have_right
    ; Near the row end without right neighbours: replace everything past the
    ; last pixel ([lpfq-2]) with copies of it. r0 is borrowed as the mask
    ; pointer and restored afterwards.
    push            r0
    lea             r0, [r_ext_mask+66]
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b
    vpternlogd      m4, m0, [r0+r10+ 8], 0xe4
    vpternlogd      m5, m0, [r0+r10+16], 0xe4
    pop             r0
.h_have_right:
    pshufb          m2, m3, m6
    pshufb          m1, m4, m7
    paddw           m2, m1
    pshufb          m3, m8
    mova            m0, m16
    vpdpwssd        m0, m2, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    vpdpwssd        m0, m3, m13
    pshufb          m2, m5, m7
    paddw           m2, m1
    mova            m1, m16
    pshufb          m4, m8
    vpdpwssd        m1, m2, m12
    pshufb          m5, m9
    paddw           m4, m5
    vpdpwssd        m1, m4, m13
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 64
    jl .h_loop
    ret
ALIGN function_align
; .hv: step lpfq to the next row, horizontally filter it into the row buffer
; at t0 and, in the same pass, combine it with rows t1..t6 vertically and
; store the result to dst. .hv_bottom does the same for the row lpfq already
; points at, reading the left pixels from that row instead of [leftq].
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq           xm3, [leftq]
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m4, [lpfq+r10+0]
    vpbroadcastw   xm3, xm4
    vmovdqu64   m3{k1}, [lpfq+r10-8]
    jmp .hv_main2
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
.hv_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -68
    jl .hv_have_right
    ; Same right-edge replication as in .h.
    push            r0
    lea             r0, [r_ext_mask+66]
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m3, m0, [r0+r10+ 0], 0xe4
    vpternlogd      m4, m0, [r0+r10+ 8], 0xe4
    vpternlogd      m5, m0, [r0+r10+16], 0xe4
    pop             r0
.hv_have_right:
    pshufb          m2, m3, m6
    pshufb          m1, m4, m7
    paddw           m2, m1
    pshufb          m3, m8
    mova            m0, m16
    vpdpwssd        m0, m2, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    vpdpwssd        m0, m3, m13
    pshufb          m2, m5, m7
    paddw           m2, m1
    pshufb          m4, m8
    mova            m1, m16
    vpdpwssd        m1, m2, m12
    pshufb          m5, m9
    paddw           m4, m5
    vpdpwssd        m1, m4, m13
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10] ; read t6 before the store through t0
    mova      [t0+r10], m0
    punpcklwd       m1, m2, m5
    mova            m0, m10
    vpdpwssd        m0, m1, m15
    punpckhwd       m2, m5
    mova            m1, m10
    vpdpwssd        m1, m2, m15
    punpcklwd       m2, m3, m4
    vpdpwssd        m0, m2, m14
    punpckhwd       m3, m4
    vpdpwssd        m1, m3, m14
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 64
    jl .hv_loop
    ; Rotate the ring: every row moves one slot and t0 takes the value just
    ; moved into t6, so the next row is written over the oldest one.
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, strideq
    ret
; .v: vertical pass only. Row t1 is used twice (paired with t6 and with t5) in
; place of a new input row. Rotates t2..t6 but leaves t1 alone.
.v:
    mov            r10, wq
.v_loop:
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m3, [t3+r10]
    punpcklwd       m1, m2, m3
    mova            m0, m10
    vpdpwssd        m0, m1, m15
    punpckhwd       m2, m3
    mova            m1, m10
    vpdpwssd        m1, m2, m15
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m2, m3, m4
    vpdpwssd        m0, m2, m14
    punpckhwd       m3, m4
    vpdpwssd        m1, m3, m14
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 64
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
    ret

cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
Coastguard Worker%define base r13-r_ext_mask-70 364*c0909341SAndroid Build Coastguard Worker mov fltq, r6mp 365*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 366*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 367*c0909341SAndroid Build Coastguard Worker mov edged, r7m 368*c0909341SAndroid Build Coastguard Worker mov t3d, r8m ; pixel_max 369*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [wiener_shufE] 370*c0909341SAndroid Build Coastguard Worker vpbroadcastw m11, [fltq+ 2] ; x1 371*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [wiener_shufB] 372*c0909341SAndroid Build Coastguard Worker lea r13, [r_ext_mask+70] 373*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [wiener_shufD] 374*c0909341SAndroid Build Coastguard Worker add wd, wd 375*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [fltq+ 4] ; x2 x3 376*c0909341SAndroid Build Coastguard Worker shr t3d, 11 377*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) 378*c0909341SAndroid Build Coastguard Worker add lpfq, wq 379*c0909341SAndroid Build Coastguard Worker vpbroadcastw m13, [fltq+18] ; y1 380*c0909341SAndroid Build Coastguard Worker add dstq, wq 381*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [fltq+20] ; y2 y3 382*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq+16] 383*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [base+wiener_hshift+t3*4] 384*c0909341SAndroid Build Coastguard Worker neg wq 385*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+wiener_round+t3*4] 386*c0909341SAndroid Build Coastguard Worker mov r10d, 0xfffe 387*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+wiener_vshift+t3*4] 388*c0909341SAndroid Build Coastguard Worker kmovw k1, r10d 389*c0909341SAndroid Build Coastguard Worker pmullw m11, m0 390*c0909341SAndroid Build Coastguard Worker pmullw m12, m0 391*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; 
LR_HAVE_TOP 392*c0909341SAndroid Build Coastguard Worker jz .no_top 393*c0909341SAndroid Build Coastguard Worker call .h_top 394*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 395*c0909341SAndroid Build Coastguard Worker mov t4, t1 396*c0909341SAndroid Build Coastguard Worker add t1, 384*2 397*c0909341SAndroid Build Coastguard Worker call .h_top 398*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 399*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 400*c0909341SAndroid Build Coastguard Worker mov t3, t1 401*c0909341SAndroid Build Coastguard Worker add t1, 384*2 402*c0909341SAndroid Build Coastguard Worker add r10, strideq 403*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 404*c0909341SAndroid Build Coastguard Worker call .h 405*c0909341SAndroid Build Coastguard Worker mov t2, t1 406*c0909341SAndroid Build Coastguard Worker dec hd 407*c0909341SAndroid Build Coastguard Worker jz .v1 408*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 409*c0909341SAndroid Build Coastguard Worker add t1, 384*2 410*c0909341SAndroid Build Coastguard Worker call .h 411*c0909341SAndroid Build Coastguard Worker dec hd 412*c0909341SAndroid Build Coastguard Worker jz .v2 413*c0909341SAndroid Build Coastguard Worker.main: 414*c0909341SAndroid Build Coastguard Worker mov t0, t4 415*c0909341SAndroid Build Coastguard Worker.main_loop: 416*c0909341SAndroid Build Coastguard Worker call .hv 417*c0909341SAndroid Build Coastguard Worker dec hd 418*c0909341SAndroid Build Coastguard Worker jnz .main_loop 419*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 420*c0909341SAndroid Build Coastguard Worker jz .v2 421*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 422*c0909341SAndroid Build Coastguard Worker call .hv_bottom 423*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 424*c0909341SAndroid Build Coastguard Worker call .hv_bottom 425*c0909341SAndroid Build Coastguard Worker.end: 
426*c0909341SAndroid Build Coastguard Worker RET 427*c0909341SAndroid Build Coastguard Worker.no_top: 428*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 429*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 430*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 431*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 432*c0909341SAndroid Build Coastguard Worker call .h 433*c0909341SAndroid Build Coastguard Worker mov t4, t1 434*c0909341SAndroid Build Coastguard Worker mov t3, t1 435*c0909341SAndroid Build Coastguard Worker mov t2, t1 436*c0909341SAndroid Build Coastguard Worker dec hd 437*c0909341SAndroid Build Coastguard Worker jz .v1 438*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 439*c0909341SAndroid Build Coastguard Worker add t1, 384*2 440*c0909341SAndroid Build Coastguard Worker call .h 441*c0909341SAndroid Build Coastguard Worker dec hd 442*c0909341SAndroid Build Coastguard Worker jz .v2 443*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 444*c0909341SAndroid Build Coastguard Worker call .hv 445*c0909341SAndroid Build Coastguard Worker dec hd 446*c0909341SAndroid Build Coastguard Worker jz .v2 447*c0909341SAndroid Build Coastguard Worker add t0, 384*6 448*c0909341SAndroid Build Coastguard Worker call .hv 449*c0909341SAndroid Build Coastguard Worker dec hd 450*c0909341SAndroid Build Coastguard Worker jnz .main 451*c0909341SAndroid Build Coastguard Worker.v2: 452*c0909341SAndroid Build Coastguard Worker call .v 453*c0909341SAndroid Build Coastguard Worker mov t4, t3 454*c0909341SAndroid Build Coastguard Worker mov t3, t2 455*c0909341SAndroid Build Coastguard Worker mov t2, t1 456*c0909341SAndroid Build Coastguard Worker add dstq, strideq 457*c0909341SAndroid Build Coastguard Worker.v1: 458*c0909341SAndroid Build Coastguard Worker call .v 459*c0909341SAndroid Build Coastguard Worker jmp .end 460*c0909341SAndroid Build Coastguard Worker.h: 461*c0909341SAndroid Build Coastguard Worker mov r10, 
wq 462*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 463*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 464*c0909341SAndroid Build Coastguard Worker movd xm3, [leftq+4] 465*c0909341SAndroid Build Coastguard Worker vmovdqu32 m3{k1}, [lpfq+r10-4] 466*c0909341SAndroid Build Coastguard Worker add leftq, 8 467*c0909341SAndroid Build Coastguard Worker jmp .h_main 468*c0909341SAndroid Build Coastguard Worker.h_extend_left: 469*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm3, [lpfq+r10] 470*c0909341SAndroid Build Coastguard Worker vmovdqu32 m3{k1}, [lpfq+r10-4] 471*c0909341SAndroid Build Coastguard Worker jmp .h_main 472*c0909341SAndroid Build Coastguard Worker.h_top: 473*c0909341SAndroid Build Coastguard Worker mov r10, wq 474*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 475*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 476*c0909341SAndroid Build Coastguard Worker.h_loop: 477*c0909341SAndroid Build Coastguard Worker movu m3, [lpfq+r10-4] 478*c0909341SAndroid Build Coastguard Worker.h_main: 479*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+r10+4] 480*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 481*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 482*c0909341SAndroid Build Coastguard Worker cmp r10d, -66 483*c0909341SAndroid Build Coastguard Worker jl .h_have_right 484*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [lpfq-2] 485*c0909341SAndroid Build Coastguard Worker vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? 
a : b 486*c0909341SAndroid Build Coastguard Worker vpternlogd m4, m0, [r13+r10+8], 0xe4 487*c0909341SAndroid Build Coastguard Worker.h_have_right: 488*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m5 489*c0909341SAndroid Build Coastguard Worker mova m0, m8 490*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m1, m11 491*c0909341SAndroid Build Coastguard Worker pshufb m2, m4, m5 492*c0909341SAndroid Build Coastguard Worker mova m1, m8 493*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m2, m11 494*c0909341SAndroid Build Coastguard Worker pshufb m2, m3, m6 495*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 496*c0909341SAndroid Build Coastguard Worker paddw m2, m3 497*c0909341SAndroid Build Coastguard Worker pshufb m3, m4, m6 498*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m2, m12 499*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 500*c0909341SAndroid Build Coastguard Worker paddw m3, m4 501*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m3, m12 502*c0909341SAndroid Build Coastguard Worker psrad m0, 4 503*c0909341SAndroid Build Coastguard Worker psrad m1, 4 504*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 505*c0909341SAndroid Build Coastguard Worker psraw m0, 1 506*c0909341SAndroid Build Coastguard Worker mova [t1+r10], m0 507*c0909341SAndroid Build Coastguard Worker add r10, 64 508*c0909341SAndroid Build Coastguard Worker jl .h_loop 509*c0909341SAndroid Build Coastguard Worker ret 510*c0909341SAndroid Build Coastguard WorkerALIGN function_align 511*c0909341SAndroid Build Coastguard Worker.hv: 512*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 513*c0909341SAndroid Build Coastguard Worker mov r10, wq 514*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 515*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 516*c0909341SAndroid Build Coastguard Worker movd xm3, [leftq+4] 517*c0909341SAndroid Build Coastguard Worker vmovdqu32 m3{k1}, [lpfq+r10-4] 
518*c0909341SAndroid Build Coastguard Worker add leftq, 8 519*c0909341SAndroid Build Coastguard Worker jmp .hv_main 520*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 521*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm3, [lpfq+r10] 522*c0909341SAndroid Build Coastguard Worker vmovdqu32 m3{k1}, [lpfq+r10-4] 523*c0909341SAndroid Build Coastguard Worker jmp .hv_main 524*c0909341SAndroid Build Coastguard Worker.hv_bottom: 525*c0909341SAndroid Build Coastguard Worker mov r10, wq 526*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 527*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 528*c0909341SAndroid Build Coastguard Worker.hv_loop: 529*c0909341SAndroid Build Coastguard Worker movu m3, [lpfq+r10-4] 530*c0909341SAndroid Build Coastguard Worker.hv_main: 531*c0909341SAndroid Build Coastguard Worker movu m4, [lpfq+r10+4] 532*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 533*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 534*c0909341SAndroid Build Coastguard Worker cmp r10d, -66 535*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 536*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [lpfq-2] 537*c0909341SAndroid Build Coastguard Worker vpternlogd m3, m0, [r13+r10+0], 0xe4 538*c0909341SAndroid Build Coastguard Worker vpternlogd m4, m0, [r13+r10+8], 0xe4 539*c0909341SAndroid Build Coastguard Worker.hv_have_right: 540*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m5 541*c0909341SAndroid Build Coastguard Worker mova m0, m8 542*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m1, m11 543*c0909341SAndroid Build Coastguard Worker pshufb m2, m4, m5 544*c0909341SAndroid Build Coastguard Worker mova m1, m8 545*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m2, m11 546*c0909341SAndroid Build Coastguard Worker pshufb m2, m3, m6 547*c0909341SAndroid Build Coastguard Worker pshufb m3, m7 548*c0909341SAndroid Build Coastguard Worker paddw m2, m3 
549*c0909341SAndroid Build Coastguard Worker pshufb m3, m4, m6 550*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m2, m12 551*c0909341SAndroid Build Coastguard Worker pshufb m4, m7 552*c0909341SAndroid Build Coastguard Worker paddw m4, m3 553*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m12 554*c0909341SAndroid Build Coastguard Worker mova m2, [t3+r10] 555*c0909341SAndroid Build Coastguard Worker paddw m2, [t1+r10] 556*c0909341SAndroid Build Coastguard Worker mova m3, [t2+r10] 557*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m2, m3 558*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 559*c0909341SAndroid Build Coastguard Worker mova m3, m9 560*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m2, m14 561*c0909341SAndroid Build Coastguard Worker mova m2, m9 562*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m4, m14 563*c0909341SAndroid Build Coastguard Worker mova m4, [t4+r10] 564*c0909341SAndroid Build Coastguard Worker psrad m0, 4 565*c0909341SAndroid Build Coastguard Worker psrad m1, 4 566*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 567*c0909341SAndroid Build Coastguard Worker psraw m0, 1 568*c0909341SAndroid Build Coastguard Worker mova [t0+r10], m0 569*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m4 570*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m1, m13 571*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m4 572*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m0, m13 573*c0909341SAndroid Build Coastguard Worker psrad m2, 5 574*c0909341SAndroid Build Coastguard Worker psrad m3, 5 575*c0909341SAndroid Build Coastguard Worker packusdw m2, m3 576*c0909341SAndroid Build Coastguard Worker pmulhuw m2, m10 577*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], m2 578*c0909341SAndroid Build Coastguard Worker add r10, 64 579*c0909341SAndroid Build Coastguard Worker jl .hv_loop 580*c0909341SAndroid Build Coastguard Worker mov t4, t3 581*c0909341SAndroid Build 
Coastguard Worker mov t3, t2 582*c0909341SAndroid Build Coastguard Worker mov t2, t1 583*c0909341SAndroid Build Coastguard Worker mov t1, t0 584*c0909341SAndroid Build Coastguard Worker mov t0, t4 585*c0909341SAndroid Build Coastguard Worker add dstq, strideq 586*c0909341SAndroid Build Coastguard Worker ret 587*c0909341SAndroid Build Coastguard Worker.v: 588*c0909341SAndroid Build Coastguard Worker mov r10, wq 589*c0909341SAndroid Build Coastguard Worker.v_loop: 590*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10] 591*c0909341SAndroid Build Coastguard Worker paddw m2, m0, [t3+r10] 592*c0909341SAndroid Build Coastguard Worker mova m1, [t2+r10] 593*c0909341SAndroid Build Coastguard Worker mova m4, [t4+r10] 594*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m2, m1 595*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m14 596*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m1 597*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m14 598*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m4 599*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m13 600*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m4 601*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m13 602*c0909341SAndroid Build Coastguard Worker paddd m3, m9 603*c0909341SAndroid Build Coastguard Worker paddd m2, m9 604*c0909341SAndroid Build Coastguard Worker paddd m1, m3 605*c0909341SAndroid Build Coastguard Worker paddd m0, m2 606*c0909341SAndroid Build Coastguard Worker psrad m1, 5 607*c0909341SAndroid Build Coastguard Worker psrad m0, 5 608*c0909341SAndroid Build Coastguard Worker packusdw m0, m1 609*c0909341SAndroid Build Coastguard Worker pmulhuw m0, m10 610*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], m0 611*c0909341SAndroid Build Coastguard Worker add r10, 64 612*c0909341SAndroid Build Coastguard Worker jl .v_loop 613*c0909341SAndroid Build Coastguard Worker ret 614*c0909341SAndroid Build Coastguard Worker 615*c0909341SAndroid Build 
;-----------------------------------------------------------------------
; sgr_filter_5x5_16bpc(dst, stride, left, lpf, w, h, edge, params)
;
; 5x5 box-sum based filter on 16-bit pixels (AVX-512). The first four
; arguments arrive in registers; w and h are loaded with movifnidn, params
; and edge are read from r6mp / r7m.
;
; edge bits tested in this function:
;    1 = LR_HAVE_LEFT     2 = LR_HAVE_RIGHT
;    4 = LR_HAVE_TOP      8 = LR_HAVE_BOTTOM
;   16 = set internally once the first row has been summed ("y > 0");
;        .h then adds to the row already stored at t1 instead of
;        overwriting it.
;
; State established by the prologue:
;   wq       negated width in bytes. lpfq and dstq are advanced by the byte
;            width first, so every loop runs r10 from a negative offset up
;            towards zero.
;   r13      r_ext_mask+72; also the anchor of the `base` constant pointer.
;   m6       zero                     m7       w0 << 4
;   m8       pd_8                     m9       pd_m25
;   m10      -s0 (broadcast)          m11      pw_164_455
;   m12      pw_61448                 m13      pd_m34816
;   m14      pw_1023                  m18-m21  sgr_x_by_x[0..255]
;   k1       0xfffffff8, used as a vmovdqu16 merge mask: words 0-2 of the
;            destination are kept, the remaining words are loaded.
;   k2       0x3333..., used as a vpalignr byte mask: only the low word of
;            each dword is written.
;   [rsp]    row pointer fetched again when LR_HAVE_BOTTOM is set ("below").
;   t0-t4    row buffer pointers (aliases defined outside this view).
;
; Row buffer layout used through t0/t1/t2:
;   +416*0   word sums
;   +416*2   dword sums of squares, punpcklwd half
;   +416*4   dword sums of squares, punpckhwd half
;-----------------------------------------------------------------------
cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
%define base r13-r_ext_mask-72
    movifnidn       wd, wm
    mov             paramsq, r6mp
    lea             r13, [r_ext_mask+72]
    mov             edged, r7m
    movifnidn       hd, hm
    pxor            m6, m6
    vpbroadcastw    m7, [paramsq+8]                 ; w0
    add             wd, wd                          ; width in bytes
    vpbroadcastd    m8, [base+pd_8]
    add             lpfq, wq
    vpbroadcastd    m9, [base+pd_m25]
    add             dstq, wq
    vpsubd          m10, m6, [paramsq+0] {1to16}    ; -s0
    lea             t3, [rsp+wq*2+416*12+8]
    vpbroadcastd    m11, [base+pw_164_455]
    lea             t4, [rsp+wq+416*20+8]
    vpbroadcastd    m12, [base+pw_61448]            ; (15 << 12) + (1 << 3)
    lea             t1, [rsp+wq+12]
    vpbroadcastd    m13, [base+pd_m34816]           ; -((1 << 11) + (1 << 15))
    neg             wq
    vpbroadcastd    m14, [base+pw_1023]
    psllw           m7, 4
    mova            m18, [sgr_x_by_x+64*0]
    mov             r10d, 0xfffffff8
    mova            m19, [sgr_x_by_x+64*1]
    kmovd           k1, r10d
    mova            m20, [sgr_x_by_x+64*2]
    mov             r10, 0x3333333333333333
    mova            m21, [sgr_x_by_x+64*3]
    kmovq           k2, r10
    test            edgeb, 4                        ; LR_HAVE_TOP
    jz              .no_top

    ; Top rows available: sum the two rows at lpf, doubling the first one.
    call            .h_top
    add             lpfq, strideq
    mov             t2, t1
    call            .top_fixup
    add             t1, 416*6
    call            .h_top
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq                      ; continue with the rows at dst
    add             r10, strideq
    mov             [rsp], r10                      ; below
    mov             t0, t2
    dec             hd
    jz              .height1
    or              edged, 16                       ; from here on .h accumulates
    call            .h
.main:
    add             lpfq, strideq
    call            .hv
    call            .prep_n
    sub             hd, 2
    jl              .extend_bottom
.main_loop:                                         ; two rows per iteration
    add             lpfq, strideq
    test            hd, hd
    jz              .odd_height
    call            .h
    add             lpfq, strideq
    call            .hv
    call            .n0
    call            .n1
    sub             hd, 2
    jge             .main_loop
    test            edgeb, 8                        ; LR_HAVE_BOTTOM
    jz              .extend_bottom
    mov             lpfq, [rsp]                     ; switch to the saved "below" rows
    call            .h_top
    add             lpfq, strideq
    call            .hv_bottom
.end:
    call            .n0
    call            .n1
.end2:
    RET
.height1:
    call            .hv
    call            .prep_n
    jmp             .odd_height_end
.odd_height:
    call            .hv
    call            .n0
    call            .n1
.odd_height_end:
    call            .v
    call            .n0
    jmp             .end2
.extend_bottom:
    call            .v
    jmp             .end
.no_top:
    ; No top rows: start directly at dst and reuse the first row (doubled).
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call            .h
    lea             t2, [t1+416*6]
    call            .top_fixup
    dec             hd
    jz              .no_top_height1
    or              edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp             .main
.no_top_height1:
    call            .v
    call            .prep_n
    jmp             .odd_height_end

;-----------------------------------------------------------------------
; .h / .h_top: horizontal boxsum of the row at lpfq into the buffer at t1.
;   .h      takes the pixels left of the row from `left` (and advances
;           leftq by 8) when LR_HAVE_LEFT is set.
;   .h_top  reads those pixels from the lpf row itself.
; Without LR_HAVE_LEFT both entries replicate the first pixel of the row.
; Each iteration sums five consecutive words (byte offsets 0/2/4/6/8 from
; m16) and their squares for 32 output positions.
;-----------------------------------------------------------------------
.h: ; horizontal boxsum
    lea             r10, [wq-4]
    test            edgeb, 1                        ; LR_HAVE_LEFT
    jz              .h_extend_left
    movq            xm16, [leftq+2]
    vmovdqu16       m16{k1}, [lpfq+wq-6]            ; words 0-2 stay from left
    add             leftq, 8
    jmp             .h_main
.h_extend_left:
    vpbroadcastw    xm16, [lpfq+wq]                 ; replicate first pixel
    vmovdqu16       m16{k1}, [lpfq+wq-6]
    jmp             .h_main
.h_top:
    lea             r10, [wq-4]
    test            edgeb, 1                        ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m16, [lpfq+r10- 2]
.h_main:
    movu            m17, [lpfq+r10+14]
    test            edgeb, 2                        ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -68
    jl              .h_have_right
    vpbroadcastw    m0, [lpfq-2]                    ; last pixel of the row
    vpternlogd      m16, m0, [r13+r10+ 0], 0xe4     ; c ? a : b
    vpternlogd      m17, m0, [r13+r10+16], 0xe4
.h_have_right:
    palignr         m2, m17, m16, 2
    paddw           m0, m16, m2
    palignr         m3, m17, m16, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd          m17, m16, m17, 0x55
    paddw           m0, m17
    punpcklwd       m3, m16, m17
    vpdpwssd        m1, m3, m3
    punpckhwd       m3, m16, m17
    vpdpwssd        m2, m3, m3
    shufps          m16, m17, q2121
    paddw           m0, m16                         ; sum
    test            edgeb, 16                       ; y > 0
    jz              .h_loop_end
    paddw           m0, [t1+r10+416*0]              ; add the row already at t1
    paddd           m1, [t1+r10+416*2]
    paddd           m2, [t1+r10+416*4]
.h_loop_end:
    punpcklwd       m17, m16, m6
    vpdpwssd        m1, m17, m17                    ; sumsq
    punpckhwd       m16, m6
    vpdpwssd        m2, m16, m16
    mova            [t1+r10+416*0], m0
    mova            [t1+r10+416*2], m1
    mova            [t1+r10+416*4], m2
    add             r10, 64
    jl              .h_loop
    ret

; .top_fixup: write twice the sums stored at t1 into the buffer at t2.
.top_fixup:
    lea             r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10+416*0]
    mova            m1, [t1+r10+416*2]
    mova            m2, [t1+r10+416*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova            [t2+r10+416*0], m0
    mova            [t2+r10+416*2], m1
    mova            [t2+r10+416*4], m2
    add             r10, 64
    jl              .top_fixup_loop
    ret

;-----------------------------------------------------------------------
; .hv / .hv_bottom: horizontal sums of the row at lpfq, added to the rows
; stored at t1 and t2, followed by the a/b computation.
;   - the horizontal sums of the new row are stored at t0
;   - the packed x values are stored as words at [t4+r10+4]
;   - the b values are stored as dwords at [t3+r10*2+8...]
;   - on exit t1 takes the old t0, and t0/t2 take the old t1
; .hv_bottom differs from .hv the same way .h_top differs from .h.
;-----------------------------------------------------------------------
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea             r10, [wq-4]
    test            edgeb, 1                        ; LR_HAVE_LEFT
    jz              .hv_extend_left
    movq            xm16, [leftq+2]
    vmovdqu16       m16{k1}, [lpfq+wq-6]
    add             leftq, 8
    jmp             .hv_main
.hv_extend_left:
    vpbroadcastw    xm16, [lpfq+wq]
    vmovdqu16       m16{k1}, [lpfq+wq-6]
    jmp             .hv_main
.hv_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1                        ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            m16, [lpfq+r10- 2]
.hv_main:
    movu            m17, [lpfq+r10+14]
    test            edgeb, 2                        ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -68
    jl              .hv_have_right
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd      m17, m0, [r13+r10+16], 0xe4
.hv_have_right:
    palignr         m3, m17, m16, 2
    paddw           m0, m16, m3
    palignr         m1, m17, m16, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m17, m16, m17, 0x55
    paddw           m0, m17
    punpcklwd       m1, m16, m17
    vpdpwssd        m2, m1, m1
    punpckhwd       m1, m16, m17
    vpdpwssd        m3, m1, m1
    shufps          m16, m17, q2121
    paddw           m0, m16                         ; h sum
    punpcklwd       m17, m16, m6
    vpdpwssd        m2, m17, m17                    ; h sumsq
    punpckhwd       m16, m6
    vpdpwssd        m3, m16, m16
    paddw           m1, m0, [t1+r10+416*0]
    paddd           m16, m2, [t1+r10+416*2]
    paddd           m17, m3, [t1+r10+416*4]
    test            hd, hd
    jz              .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+416*0]              ; hv sum
    paddd           m16, [t2+r10+416*2]             ; hv sumsq
    paddd           m17, [t2+r10+416*4]
    mova            [t0+r10+416*0], m0              ; keep this row's h sums
    mova            [t0+r10+416*2], m2
    mova            [t0+r10+416*4], m3
    psrlw           m3, m1, 1
    paddd           m16, m8
    pavgw           m3, m6                          ; (b + 2) >> 2
    paddd           m17, m8
    psrld           m16, 4                          ; (a + 8) >> 4
    psrld           m17, 4
    pmulld          m16, m9                         ; -a * 25
    pmulld          m17, m9
    punpcklwd       m2, m3, m6
    vpdpwssd        m16, m2, m2                     ; -p
    punpckhwd       m3, m6
    vpdpwssd        m17, m3, m3
    punpcklwd       m0, m1, m6                      ; b
    punpckhwd       m1, m6
    pmulld          m16, m10                        ; p * s
    pmulld          m17, m10
    pmaddwd         m0, m11                         ; b * 164
    pmaddwd         m1, m11
    vpalignr        m17{k2}, m16, m16, 2            ; low words <- high words of m16
    mova            m16, m20
    pmaxsw          m17, m6
    paddusw         m17, m12
    psraw           m17, 4                          ; min(z, 255) - 256
    vpermt2b        m16, m17, m21                   ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b        m17, m18, m19                   ; sgr_x_by_x[  0..127]
    vmovdqu8        m17{k3}, m16                    ; x
    pandn           m16, m13, m17
    psrld           m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    packssdw        m16, m17
    psubd           m0, m13                         ; x * b * 164 + (1 << 11) + (1 << 15)
    psubd           m1, m13
    mova            [t4+r10+4], m16
    psrld           m16, m0, 12                     ; b
    psrld           m17, m1, 12
    mova            [t3+r10*2+  8], xm16            ; interleave the 128-bit lanes
    mova            [t3+r10*2+ 24], xm17
    vextracti128    [t3+r10*2+ 40], ym16, 1
    vextracti128    [t3+r10*2+ 56], ym17, 1
    vextracti32x4   [t3+r10*2+ 72], m16, 2
    vextracti32x4   [t3+r10*2+ 88], m17, 2
    vextracti32x4   [t3+r10*2+104], m16, 3
    vextracti32x4   [t3+r10*2+120], m17, 3
    add             r10, 64
    jl              .hv_loop
    mov             t2, t1                          ; rotate the row buffers
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    ; Store (h + t1) back to t1 and count the new row a second time.
    mova            [t1+r10+416*0], m1
    paddw           m1, m0
    mova            [t1+r10+416*2], m16
    paddd           m16, m2
    mova            [t1+r10+416*4], m17
    paddd           m17, m3
    jmp             .hv_main2

;-----------------------------------------------------------------------
; .v: same a/b computation as .hv, but without reading a new row: the
; vertical totals are formed as [t2] + 3 * [t1].
;-----------------------------------------------------------------------
.v: ; vertical boxsum + ab
    lea             r10, [wq-4]
.v_loop:
    mova            m2, [t1+r10+416*2]
    mova            m3, [t1+r10+416*4]
    mova            m0, [t1+r10+416*0]
    paddd           m16, m2, [t2+r10+416*2]
    paddd           m17, m3, [t2+r10+416*4]
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m2
    paddd           m3, m3
    paddd           m16, m2                         ; hv sumsq
    paddd           m17, m3
    paddd           m16, m8
    paddd           m17, m8
    psrld           m16, 4                          ; (a + 8) >> 4
    psrld           m17, 4
    pmulld          m16, m9                         ; -a * 25
    pmulld          m17, m9
    paddw           m0, m0
    paddw           m1, m0                          ; hv sum
    psrlw           m3, m1, 1
    pavgw           m3, m6                          ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    vpdpwssd        m16, m2, m2                     ; -p
    punpckhwd       m3, m6
    vpdpwssd        m17, m3, m3
    punpcklwd       m0, m1, m6                      ; b
    punpckhwd       m1, m6
    pmulld          m16, m10                        ; p * s
    pmulld          m17, m10
    pmaddwd         m0, m11                         ; b * 164
    pmaddwd         m1, m11
    vpalignr        m17{k2}, m16, m16, 2
    mova            m16, m20
    pmaxsw          m17, m6
    paddusw         m17, m12
    psraw           m17, 4                          ; min(z, 255) - 256
    vpermt2b        m16, m17, m21                   ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b        m17, m18, m19                   ; sgr_x_by_x[  0..127]
    vmovdqu8        m17{k3}, m16                    ; x
    pandn           m16, m13, m17
    psrld           m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    packssdw        m16, m17
    psubd           m0, m13                         ; x * b * 164 + (1 << 11) + (1 << 15)
    psubd           m1, m13
    mova            [t4+r10+4], m16
    psrld           m16, m0, 12                     ; b
    psrld           m17, m1, 12
    mova            [t3+r10*2+  8], xm16
    mova            [t3+r10*2+ 24], xm17
    vextracti128    [t3+r10*2+ 40], ym16, 1
    vextracti128    [t3+r10*2+ 56], ym17, 1
    vextracti32x4   [t3+r10*2+ 72], m16, 2
    vextracti32x4   [t3+r10*2+ 88], m17, 2
    vextracti32x4   [t3+r10*2+104], m16, 3
    vextracti32x4   [t3+r10*2+120], m17, 3
    add             r10, 64
    jl              .v_loop
    ret

;-----------------------------------------------------------------------
; .prep_n: weight each a/b value with its left and right neighbours as
; 5*left + 6*centre + 5*right ("565") and store the result at
; [t4+416*2] (words) and [t3+416*4] (dwords).
;-----------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]               ; centre
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+68]
    paddw           m3, m0, [t4+r10*1+ 0]           ; + left
    paddd           m16, m1, [t3+r10*2+ 0]
    paddd           m17, m2, [t3+r10*2+64]
    paddw           m3, [t4+r10*1+ 4]               ; + right
    paddd           m16, [t3+r10*2+ 8]
    paddd           m17, [t3+r10*2+72]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m16
    pslld           m16, 2
    paddd           m2, m17
    pslld           m17, 2
    paddw           m0, m3                          ; a 565
    paddd           m1, m16                         ; b 565
    paddd           m2, m17
    mova            [t4+r10*1+416*2+ 0], m0
    mova            [t3+r10*2+416*4+ 0], m1
    mova            [t3+r10*2+416*4+64], m2
    add             r10, 64
    jl              .prep_n_loop
    ret
; .n0
; Computes the 5/6/5-weighted neighbor sums of the current a/b rows (same
; arithmetic as .prep_n), adds the sums saved earlier at t4+416*2 / t3+416*4,
; saves the new sums in their place, and then filters one row of dst:
;   dst = clamp(dst + pmulhrsw((b - a * dst) >> 9, m7), m6, m14)
; NOTE(review): m6, m7 and m14 are set by the enclosing function's prologue,
; which is not visible here - presumably zero, the filter weight and the
; pixel maximum; confirm against that prologue.
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov             r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+68]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m16, m1, [t3+r10*2+ 0]
    paddd           m17, m2, [t3+r10*2+64]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m16, [t3+r10*2+ 8]
    paddd           m17, [t3+r10*2+72]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m16
    pslld           m16, 2
    paddd           m2, m17
    pslld           m17, 2
    paddw           m0, m3 ; a 565
    paddd           m1, m16 ; b 565
    paddd           m2, m17
    ; combine with the previously stored row, then replace the stored row
    paddw           m3, m0, [t4+r10*1+416*2+ 0]
    paddd           m16, m1, [t3+r10*2+416*4+ 0]
    paddd           m17, m2, [t3+r10*2+416*4+64]
    mova            [t4+r10*1+416*2+ 0], m0
    mova            [t3+r10*2+416*4+ 0], m1
    mova            [t3+r10*2+416*4+64], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6 ; src
    punpcklwd       m2, m3, m6 ; a
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    ; regroup the two b vectors to match the unpacklo/unpackhi lane order
    vshufi32x4      m1, m16, m17, q2020
    vshufi32x4      m16, m17, q3131
    psubd           m1, m2 ; b - a * src + (1 << 8)
    psubd           m16, m3
    psrad           m1, 9
    psrad           m16, 9
    packssdw        m1, m16
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova            [dstq+r10], m0
    add             r10, 64
    jl .n0_loop
    add             dstq, strideq
    ret
; .n1
; Filters one row of dst using only the sums stored at t4+416*2 / t3+416*4
; (nothing is recomputed or written back); the difference is shifted right
; by 8 instead of 9. dstq is advanced by one stride on return.
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov             r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+416*2+ 0]
    mova            m16, [t3+r10*2+416*4+ 0]
    mova            m17, [t3+r10*2+416*4+64]
    punpcklwd       m1, m0, m6 ; src
    punpcklwd       m2, m3, m6 ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vshufi32x4      m1, m16, m17, q2020
    vshufi32x4      m16, m17, q3131
    psubd           m1, m2 ; b - a * src + (1 << 7)
    psubd           m16, m3
    psrad           m1, 8
    psrad           m16, 8
    packssdw        m1, m16
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova            [dstq+r10], m0
    add             r10, 64
    jl .n1_loop
    add             dstq, strideq
    ret

; sgr_filter_3x3_16bpc: function entry, declared through the x86inc cglobal
; macro with the named arguments dst, stride, left, lpf, w, h, edge, params.
; The instructions below start the prologue: w, params, edge and h are
; fetched, r13 is pointed 72 bytes into r_ext_mask, and m6 is cleared to zero.
cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \
                                                    w, h, edge, params
    movifnidn       wd, wm
    mov             paramsq, r6mp
    lea             r13, [r_ext_mask+72]
    mov             edged, r7m
    movifnidn       hd, hm
    pxor            m6, m6
; Remainder of the sgr_filter_3x3_16bpc prologue, followed by its row
; scheduling logic.
; Register setup performed here:
;   m7       = word at paramsq+10 (w1), broadcast and shifted left by 4
;   m8, m9   = pd_8, pd_m9
;   m10      = m6 - dword at paramsq+4 (commented -s1; m6 is zeroed just
;              before this point)
;   m11..m14 = pw_164_455, pw_61448, pd_m34816, pw_1023
;   m18..m21 = four consecutive 64-byte rows of sgr_x_by_x
;   k1       = 0xfffffffc, k2 = 0x3333333333333333 (merge masks)
;   wq       = doubled, added to lpfq and dstq, then negated, so the row
;              loops index with a negative offset that counts up to zero
;   t1/t3/t4 = stack buffers, addressed relative to rsp + (doubled) w
    vpbroadcastw    m7, [paramsq+10] ; w1
    add             wd, wd
    vpbroadcastd    m8, [base+pd_8]
    add             lpfq, wq
    vpbroadcastd    m9, [base+pd_m9]
    add             dstq, wq
    vpsubd          m10, m6, [paramsq+4] {1to16} ; -s1
    lea             t3, [rsp+wq*2+416*12+8]
    vpbroadcastd    m11, [base+pw_164_455]
    lea             t4, [rsp+wq+416*32+8]
    vpbroadcastd    m12, [base+pw_61448]
    lea             t1, [rsp+wq+12]
    vpbroadcastd    m13, [base+pd_m34816]
    neg             wq
    vpbroadcastd    m14, [base+pw_1023]
    psllw           m7, 4
    mova            m18, [sgr_x_by_x+64*0]
    mov             r10d, 0xfffffffc
    mova            m19, [sgr_x_by_x+64*1]
    kmovd           k1, r10d
    mova            m20, [sgr_x_by_x+64*2]
    mov             r10, 0x3333333333333333
    mova            m21, [sgr_x_by_x+64*3]
    kmovq           k2, r10
    test            edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; top edge available: box-sum two rows from lpf into t1 and t1+416*6
    call .h_top
    add             lpfq, strideq
    mov             t2, t1
    add             t1, 416*6
    call .h_top
    ; [rsp] = lpfq + 5*stride, kept for the bottom-edge rows; lpfq then
    ; switches to the dst rows
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    add             r10, strideq
    mov             [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add             lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    ; two input rows (.hv0, .hv1) and two output rows (.n0, .n1) per pass
    add             lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add             lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    ; bottom edge available: continue with the rows at the saved pointer
    mov             lpfq, [rsp]
    call .hv0_bottom
    add             lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    ; reached when the initial dec hd yields zero
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    ; no further input rows are read: only the vertical passes are run
    call .v0
    call .v1
    jmp .end
.no_top:
    ; [rsp] = lpfq + 6*stride; lpfq switches to the dst rows
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call .h
    lea             r10, [wq-4]
    lea             t2, [t1+416*6]
.top_fixup_loop:
    ; copy the row just produced by .h from the t1 buffers into t2
    mova            m0, [t1+r10+416*0]
    mova            m1, [t1+r10+416*2]
    mova            m2, [t1+r10+416*4]
    mova            [t2+r10+416*0], m0
; Remaining stores of .top_fixup_loop (the copy of the t1 buffers at
; 416*2 and 416*4 into t2), its loop control, and the jump into .main.
    mova            [t2+r10+416*2], m1
    mova            [t2+r10+416*4], m2
    add             r10, 64
    jl .top_fixup_loop
    call .v0
    jmp .main
; .h / .h_top
; Horizontal 3-tap box sums of one row read through lpfq. Per 64-byte step
; the word sums go to t1+416*0 and the dword sums of squares to t1+416*2
; and t1+416*4. Entry via .h takes the two words left of the row from
; leftq+4 when LR_HAVE_LEFT is set; entry via .h_top reads them from the
; row itself. Without LR_HAVE_LEFT both entries replicate the first word.
.h: ; horizontal boxsum
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    ; words 0-1 from the left buffer, the rest merged in under k1
    movd            xm16, [leftq+4]
    vmovdqu16       m16{k1}, [lpfq+wq-4]
    add             leftq, 8
    jmp .h_main
.h_extend_left:
    vpbroadcastw    xm16, [lpfq+wq]
    vmovdqu16       m16{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m16, [lpfq+r10+ 0]
.h_main:
    movu            m17, [lpfq+r10+16]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -66
    jl .h_have_right
    ; right-edge padding: 0xe4 selects the loaded word where the mask read
    ; through r13 is set, and the broadcast of the word at lpfq-2 elsewhere
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd      m17, m0, [r13+r10+16], 0xe4
.h_have_right:
    palignr         m0, m17, m16, 2
    paddw           m1, m16, m0
    punpcklwd       m2, m16, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m16, m0
    pmaddwd         m3, m3
    palignr         m17, m16, 4
    paddw           m1, m17 ; sum
    punpcklwd       m16, m17, m6
    vpdpwssd        m2, m16, m16 ; sumsq
    punpckhwd       m17, m6
    vpdpwssd        m3, m17, m17
    mova            [t1+r10+416*0], m1
    mova            [t1+r10+416*2], m2
    mova            [t1+r10+416*4], m3
    add             r10, 64
    jl .h_loop
    ret
; .hv0: entry and left-edge handling, identical in form to .h.
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movd            xm16, [leftq+4]
    vmovdqu16       m16{k1}, [lpfq+wq-4]
    add             leftq, 8
; Continuation of .hv0 (its entry label and LR_HAVE_LEFT load precede this
; point). .hv0_bottom is a second entry that does not read from leftq.
; Per 64-byte step:
;   1. horizontal 3-tap sum and sum of squares of the row at lpfq
;   2. add the row stored at t1 and store the new row there, then add the
;      row stored at t2 and store the two-row sum there
;   3. derive the a/b coefficients from the totals (see inline comments)
;      and store them at t4+416*0+4 (words) and t3+416*0+8 (dwords)
    jmp .hv0_main
.hv0_extend_left:
    vpbroadcastw    xm16, [lpfq+wq]
    vmovdqu16       m16{k1}, [lpfq+wq-4]
    jmp .hv0_main
.hv0_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m16, [lpfq+r10+ 0]
.hv0_main:
    movu            m17, [lpfq+r10+16]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp             r10d, -66
    jl .hv0_have_right
    ; right-edge padding: keep loaded words where the r13 mask is set,
    ; otherwise use the broadcast of the word at lpfq-2
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd      m17, m0, [r13+r10+16], 0xe4
.hv0_have_right:
    palignr         m0, m17, m16, 2
    paddw           m1, m16, m0
    punpcklwd       m2, m16, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m16, m0
    pmaddwd         m3, m3
    palignr         m17, m16, 4
    paddw           m1, m17 ; sum
    punpcklwd       m16, m17, m6
    vpdpwssd        m2, m16, m16 ; sumsq
    punpckhwd       m17, m6
    vpdpwssd        m3, m17, m17
    ; vertical accumulation through the t1 and t2 row buffers
    paddw           m0, m1, [t1+r10+416*0]
    paddd           m16, m2, [t1+r10+416*2]
    paddd           m17, m3, [t1+r10+416*4]
    mova            [t1+r10+416*0], m1
    mova            [t1+r10+416*2], m2
    mova            [t1+r10+416*4], m3
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m16, [t2+r10+416*2]
    paddd           m3, m17, [t2+r10+416*4]
    mova            [t2+r10+416*0], m0
    mova            [t2+r10+416*2], m16
    mova            [t2+r10+416*4], m17
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9 ; -((a + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw           m17, m1, 1
    pavgw           m17, m6 ; (b + 2) >> 2
    punpcklwd       m16, m17, m6
    vpdpwssd        m2, m16, m16 ; -p
    punpckhwd       m17, m6
    vpdpwssd        m3, m17, m17
    punpcklwd       m16, m6, m1 ; b
    punpckhwd       m17, m6, m1
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m10 ; p * s
    pmulld          m3, m10
    pmaddwd         m16, m11 ; b * 455
    pmaddwd         m17, m11
    vpalignr        m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m12
    psraw           m3, 4 ; min(z, 255) - 256
    ; 256-entry byte lookup done as two 128-entry permutes; k3 (the sign
    ; bits of the index bytes) picks the upper-half result
    vpermt2b        m2, m3, m21 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8        m3{k3}, m2 ; x
    pandn           m2, m13, m3
    psrld           m3, 16
    pmulld          m16, m2
    pmulld          m17, m3
    packssdw        m2, m3
    psubd           m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd           m17, m13
    mova            [t4+r10*1+416*0+4], m2
    psrld           m16, 12
    psrld           m17, 12
    ; interleave the 128-bit lanes of m16/m17 on store
    mova            [t3+r10*2+416*0+  8], xm16
    mova            [t3+r10*2+416*0+ 24], xm17
    vextracti128    [t3+r10*2+416*0+ 40], ym16, 1
    vextracti128    [t3+r10*2+416*0+ 56], ym17, 1
    vextracti32x4   [t3+r10*2+416*0+ 72], m16, 2
    vextracti32x4   [t3+r10*2+416*0+ 88], m17, 2
    vextracti32x4   [t3+r10*2+416*0+104], m16, 3
    vextracti32x4   [t3+r10*2+416*0+120], m17, 3
    add             r10, 64
    jl .hv0_loop
    ret
; .hv1 / .hv1_bottom: entry and edge handling, same form as .hv0.
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movd            xm16, [leftq+4]
    vmovdqu16       m16{k1}, [lpfq+wq-4]
    add             leftq, 8
    jmp .hv1_main
.hv1_extend_left:
    vpbroadcastw    xm16, [lpfq+wq]
    vmovdqu16       m16{k1}, [lpfq+wq-4]
    jmp .hv1_main
.hv1_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m16, [lpfq+r10+ 0]
.hv1_main:
    movu            m17, [lpfq+r10+16]
; Continuation of .hv1 (entry labels and the loads of m16/m17 precede this
; point). Per 64-byte step: horizontal 3-tap sum and sum of squares of the
; current row, added to the row stored at t2 (which is then overwritten by
; the current row), followed by the a/b derivation. Results are stored at
; t4+416*2+4 (words) and t3+416*4+8 (dwords). t1 and t2 are swapped before
; returning.
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp             r10d, -66
    jl .hv1_have_right
    ; right-edge padding: keep loaded words where the r13 mask is set,
    ; otherwise use the broadcast of the word at lpfq-2
    vpbroadcastw    m0, [lpfq-2]
    vpternlogd      m16, m0, [r13+r10+ 0], 0xe4
    vpternlogd      m17, m0, [r13+r10+16], 0xe4
.hv1_have_right:
    palignr         m1, m17, m16, 2
    paddw           m0, m16, m1
    punpcklwd       m2, m16, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m16, m1
    pmaddwd         m3, m3
    palignr         m17, m16, 4
    paddw           m0, m17 ; h sum
    punpcklwd       m1, m17, m6
    vpdpwssd        m2, m1, m1 ; h sumsq
    punpckhwd       m17, m6
    vpdpwssd        m3, m17, m17
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m16, m2, [t2+r10+416*2]
    paddd           m17, m3, [t2+r10+416*4]
    mova            [t2+r10+416*0], m0
    mova            [t2+r10+416*2], m2
    mova            [t2+r10+416*4], m3
    paddd           m16, m8
    paddd           m17, m8
    psrld           m16, 4 ; (a + 8) >> 4
    psrld           m17, 4
    pmulld          m16, m9 ; -((a + 8) >> 4) * 9
    pmulld          m17, m9
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    vpdpwssd        m16, m2, m2 ; -p
    punpckhwd       m3, m6
    vpdpwssd        m17, m3, m3
    punpcklwd       m0, m6, m1 ; b
    punpckhwd       m1, m6, m1
    pminsd          m16, m6
    pminsd          m17, m6
    pmulld          m16, m10 ; p * s
    pmulld          m17, m10
    pmaddwd         m0, m11 ; b * 455
    pmaddwd         m1, m11
    vpalignr        m17{k2}, m16, m16, 2
    mova            m16, m20
    paddusw         m17, m12
    psraw           m17, 4 ; min(z, 255) - 256
    vpermt2b        m16, m17, m21 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b        m17, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8        m17{k3}, m16 ; x
    pandn           m16, m13, m17
    psrld           m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    packssdw        m16, m17
    psubd           m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd           m1, m13
    mova            [t4+r10*1+416*2+4], m16
    psrld           m16, m0, 12
    psrld           m17, m1, 12
    mova            [t3+r10*2+416*4+  8], xm16
    mova            [t3+r10*2+416*4+ 24], xm17
    vextracti128    [t3+r10*2+416*4+ 40], ym16, 1
    vextracti128    [t3+r10*2+416*4+ 56], ym17, 1
    vextracti32x4   [t3+r10*2+416*4+ 72], m16, 2
    vextracti32x4   [t3+r10*2+416*4+ 88], m17, 2
    vextracti32x4   [t3+r10*2+416*4+104], m16, 3
    vextracti32x4   [t3+r10*2+416*4+120], m17, 3
    add             r10, 64
    jl .hv1_loop
    ; swap the two row-buffer pointers
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
; .v0
; Vertical-only counterpart of .hv0: no row is read from lpfq. The row
; stored at t1 is doubled, added to the row stored at t2, and the doubled
; row is written back to t2 before the a/b derivation.
.v0: ; vertical boxsums + ab (even rows)
    lea             r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+416*0]
    mova            m16, [t1+r10+416*2]
    mova            m17, [t1+r10+416*4]
    paddw           m0, m0
    paddd           m16, m16
    paddd           m17, m17
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m16, [t2+r10+416*2]
    paddd           m3, m17, [t2+r10+416*4]
    mova            [t2+r10+416*0], m0
    mova            [t2+r10+416*2], m16
    mova            [t2+r10+416*4], m17
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9 ; -((a + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw           m17, m1, 1
    pavgw           m17, m6 ; (b + 2) >> 2
    punpcklwd       m16, m17, m6
    vpdpwssd        m2, m16, m16 ; -p
    punpckhwd       m17, m6
    vpdpwssd        m3, m17, m17
    punpcklwd       m16, m6, m1 ; b
    punpckhwd       m17, m6, m1
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m10 ; p * s
    pmulld          m3, m10
    pmaddwd         m16, m11 ; b * 455
    pmaddwd         m17, m11
    vpalignr        m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m12
; Continuation of the .v0 loop body (table lookup, final scaling and stores
; to t4+416*0+4 / t3+416*0+8), followed by .v1 and the visible start of
; .prep_n.
    psraw           m3, 4 ; min(z, 255) - 256
    vpermt2b        m2, m3, m21 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8        m3{k3}, m2 ; x
    pandn           m2, m13, m3
    psrld           m3, 16
    pmulld          m16, m2
    pmulld          m17, m3
    packssdw        m2, m3
    psubd           m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd           m17, m13
    mova            [t4+r10*1+416*0+4], m2
    psrld           m16, 12
    psrld           m17, 12
    mova            [t3+r10*2+416*0+  8], xm16
    mova            [t3+r10*2+416*0+ 24], xm17
    vextracti128    [t3+r10*2+416*0+ 40], ym16, 1
    vextracti128    [t3+r10*2+416*0+ 56], ym17, 1
    vextracti32x4   [t3+r10*2+416*0+ 72], m16, 2
    vextracti32x4   [t3+r10*2+416*0+ 88], m17, 2
    vextracti32x4   [t3+r10*2+416*0+104], m16, 3
    vextracti32x4   [t3+r10*2+416*0+120], m17, 3
    add             r10, 64
    jl .v0_loop
    ret
; .v1
; Vertical-only counterpart of .hv1: the row stored at t1 is added to the
; row stored at t2 and copied into t2, then the a/b derivation runs and the
; results go to t4+416*2+4 / t3+416*4+8. t1 and t2 are swapped on return.
.v1: ; vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+416*0]
    mova            m16, [t1+r10+416*2]
    mova            m17, [t1+r10+416*4]
    paddw           m1, m0, [t2+r10+416*0]
    paddd           m2, m16, [t2+r10+416*2]
    paddd           m3, m17, [t2+r10+416*4]
    mova            [t2+r10+416*0], m0
    mova            [t2+r10+416*2], m16
    mova            [t2+r10+416*4], m17
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m9 ; -((a + 8) >> 4) * 9
    pmulld          m3, m9
    psrlw           m17, m1, 1
    pavgw           m17, m6 ; (b + 2) >> 2
    punpcklwd       m16, m17, m6
    vpdpwssd        m2, m16, m16 ; -p
    punpckhwd       m17, m6
    vpdpwssd        m3, m17, m17
    punpcklwd       m16, m6, m1 ; b
    punpckhwd       m17, m6, m1
    pminsd          m2, m6
    pminsd          m3, m6
    pmulld          m2, m10 ; p * s
    pmulld          m3, m10
    pmaddwd         m16, m11 ; b * 455
    pmaddwd         m17, m11
    vpalignr        m3{k2}, m2, m2, 2
    mova            m2, m20
    paddusw         m3, m12
    psraw           m3, 4 ; min(z, 255) - 256
    vpermt2b        m2, m3, m21 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m3
    vpermi2b        m3, m18, m19 ; sgr_x_by_x[ 0..127]
    vmovdqu8        m3{k3}, m2 ; x
    pandn           m2, m13, m3
    psrld           m3, 16
    pmulld          m16, m2
    pmulld          m17, m3
    packssdw        m2, m3
    psubd           m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    psubd           m17, m13
    mova            [t4+r10*1+416*2+4], m2
    psrld           m16, 12
    psrld           m17, 12
    mova            [t3+r10*2+416*4+  8], xm16
    mova            [t3+r10*2+416*4+ 24], xm17
    vextracti128    [t3+r10*2+416*4+ 40], ym16, 1
    vextracti128    [t3+r10*2+416*4+ 56], ym17, 1
    vextracti32x4   [t3+r10*2+416*4+ 72], m16, 2
    vextracti32x4   [t3+r10*2+416*4+ 88], m17, 2
    vextracti32x4   [t3+r10*2+416*4+104], m16, 3
    vextracti32x4   [t3+r10*2+416*4+120], m17, 3
    add             r10, 64
    jl .v1_loop
    ; swap the two row-buffer pointers
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
; .prep_n: only the start of this routine is visible here. For each of the
; two stored a/b rows it forms 4*(left + center + right) - (left + right),
; i.e. 3/4/3 weights; the instructions below cover the first row and the
; beginning of the second.
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    mova            ym16, [t4+r10*1+416*0+0]
    paddw           ym16, [t4+r10*1+416*0+4]
    paddw           ym17, ym16, [t4+r10*1+416*0+2]
    mova            m0, [t3+r10*2+416*0+0]
    paddd           m0, [t3+r10*2+416*0+8]
    paddd           m1, m0, [t3+r10*2+416*0+4]
    psllw           ym17, 2 ; a[-1] 444
    pslld           m1, 2 ; b[-1] 444
    psubw           ym17, ym16 ; a[-1] 343
    psubd           m1, m0 ; b[-1] 343
    vmovdqa32       [t4+r10*1+416* 4], ym17
    vmovdqa32       [t3+r10*2+416* 8], m1
    mova            ym16, [t4+r10*1+416*2+0]
    paddw           ym16, [t4+r10*1+416*2+4]
    paddw           ym17, ym16, [t4+r10*1+416*2+2]
    mova            m0, [t3+r10*2+416*4+0]
    paddd           m0, [t3+r10*2+416*4+8]
    paddd           m1, m0, [t3+r10*2+416*4+4]
    psllw           ym17, 2 ; a[ 0] 444
1600*c0909341SAndroid Build Coastguard Worker pslld m1, 2 ; b[ 0] 444 1601*c0909341SAndroid Build Coastguard Worker vmovdqa32 [t4+r10*1+416* 6], ym17 1602*c0909341SAndroid Build Coastguard Worker vmovdqa32 [t3+r10*2+416*12], m1 1603*c0909341SAndroid Build Coastguard Worker psubw ym17, ym16 ; a[ 0] 343 1604*c0909341SAndroid Build Coastguard Worker psubd m1, m0 ; b[ 0] 343 1605*c0909341SAndroid Build Coastguard Worker vmovdqa32 [t4+r10*1+416* 8], ym17 1606*c0909341SAndroid Build Coastguard Worker vmovdqa32 [t3+r10*2+416*16], m1 1607*c0909341SAndroid Build Coastguard Worker add r10, 32 1608*c0909341SAndroid Build Coastguard Worker jl .prep_n_loop 1609*c0909341SAndroid Build Coastguard Worker ret 1610*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1611*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows) 1612*c0909341SAndroid Build Coastguard Worker mov r10, wq 1613*c0909341SAndroid Build Coastguard Worker.n0_loop: 1614*c0909341SAndroid Build Coastguard Worker mova m3, [t4+r10*1+416*0+0] 1615*c0909341SAndroid Build Coastguard Worker paddw m3, [t4+r10*1+416*0+4] 1616*c0909341SAndroid Build Coastguard Worker paddw m1, m3, [t4+r10*1+416*0+2] 1617*c0909341SAndroid Build Coastguard Worker psllw m1, 2 ; a[ 1] 444 1618*c0909341SAndroid Build Coastguard Worker psubw m2, m1, m3 ; a[ 1] 343 1619*c0909341SAndroid Build Coastguard Worker paddw m3, m2, [t4+r10*1+416*4] 1620*c0909341SAndroid Build Coastguard Worker paddw m3, [t4+r10*1+416*6] 1621*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*4], m2 1622*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*6], m1 1623*c0909341SAndroid Build Coastguard Worker mova m16, [t3+r10*2+416*0+0] 1624*c0909341SAndroid Build Coastguard Worker paddd m16, [t3+r10*2+416*0+8] 1625*c0909341SAndroid Build Coastguard Worker paddd m1, m16, [t3+r10*2+416*0+4] 1626*c0909341SAndroid Build Coastguard Worker pslld m1, 2 ; b[ 1] 444 1627*c0909341SAndroid Build Coastguard Worker psubd m2, m1, m16 ; b[ 
1] 343 1628*c0909341SAndroid Build Coastguard Worker paddd m16, m2, [t3+r10*2+416* 8+ 0] 1629*c0909341SAndroid Build Coastguard Worker paddd m16, [t3+r10*2+416*12+ 0] 1630*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416* 8+ 0], m2 1631*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*12+ 0], m1 1632*c0909341SAndroid Build Coastguard Worker mova m17, [t3+r10*2+416*0+64] 1633*c0909341SAndroid Build Coastguard Worker paddd m17, [t3+r10*2+416*0+72] 1634*c0909341SAndroid Build Coastguard Worker paddd m1, m17, [t3+r10*2+416*0+68] 1635*c0909341SAndroid Build Coastguard Worker pslld m1, 2 1636*c0909341SAndroid Build Coastguard Worker psubd m2, m1, m17 1637*c0909341SAndroid Build Coastguard Worker paddd m17, m2, [t3+r10*2+416* 8+64] 1638*c0909341SAndroid Build Coastguard Worker paddd m17, [t3+r10*2+416*12+64] 1639*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416* 8+64], m2 1640*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*12+64], m1 1641*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+r10] 1642*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m6 1643*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 1644*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1 ; a * src 1645*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m6 1646*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1647*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1 1648*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m16, m17, q2020 1649*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m17, q3131 1650*c0909341SAndroid Build Coastguard Worker psubd m1, m2 ; b - a * src + (1 << 8) 1651*c0909341SAndroid Build Coastguard Worker psubd m16, m3 1652*c0909341SAndroid Build Coastguard Worker psrad m1, 9 1653*c0909341SAndroid Build Coastguard Worker psrad m16, 9 1654*c0909341SAndroid Build Coastguard Worker packssdw m1, m16 1655*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 
1656*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1657*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m6 1658*c0909341SAndroid Build Coastguard Worker pminsw m0, m14 1659*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], m0 1660*c0909341SAndroid Build Coastguard Worker add r10, 64 1661*c0909341SAndroid Build Coastguard Worker jl .n0_loop 1662*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1663*c0909341SAndroid Build Coastguard Worker ret 1664*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1665*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows) 1666*c0909341SAndroid Build Coastguard Worker mov r10, wq 1667*c0909341SAndroid Build Coastguard Worker.n1_loop: 1668*c0909341SAndroid Build Coastguard Worker mova m3, [t4+r10*1+416*2+0] 1669*c0909341SAndroid Build Coastguard Worker paddw m3, [t4+r10*1+416*2+4] 1670*c0909341SAndroid Build Coastguard Worker paddw m1, m3, [t4+r10*1+416*2+2] 1671*c0909341SAndroid Build Coastguard Worker psllw m1, 2 ; a[ 1] 444 1672*c0909341SAndroid Build Coastguard Worker psubw m2, m1, m3 ; a[ 1] 343 1673*c0909341SAndroid Build Coastguard Worker paddw m3, m2, [t4+r10*1+416*6] 1674*c0909341SAndroid Build Coastguard Worker paddw m3, [t4+r10*1+416*8] 1675*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*6], m1 1676*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*8], m2 1677*c0909341SAndroid Build Coastguard Worker mova m16, [t3+r10*2+416*4+0] 1678*c0909341SAndroid Build Coastguard Worker paddd m16, [t3+r10*2+416*4+8] 1679*c0909341SAndroid Build Coastguard Worker paddd m1, m16, [t3+r10*2+416*4+4] 1680*c0909341SAndroid Build Coastguard Worker pslld m1, 2 ; b[ 1] 444 1681*c0909341SAndroid Build Coastguard Worker psubd m2, m1, m16 ; b[ 1] 343 1682*c0909341SAndroid Build Coastguard Worker paddd m16, m2, [t3+r10*2+416*12+ 0] 1683*c0909341SAndroid Build Coastguard Worker paddd m16, [t3+r10*2+416*16+ 0] 1684*c0909341SAndroid Build Coastguard Worker mova 
[t3+r10*2+416*12+ 0], m1 1685*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*16+ 0], m2 1686*c0909341SAndroid Build Coastguard Worker mova m17, [t3+r10*2+416*4+64] 1687*c0909341SAndroid Build Coastguard Worker paddd m17, [t3+r10*2+416*4+72] 1688*c0909341SAndroid Build Coastguard Worker paddd m1, m17, [t3+r10*2+416*4+68] 1689*c0909341SAndroid Build Coastguard Worker pslld m1, 2 1690*c0909341SAndroid Build Coastguard Worker psubd m2, m1, m17 1691*c0909341SAndroid Build Coastguard Worker paddd m17, m2, [t3+r10*2+416*12+64] 1692*c0909341SAndroid Build Coastguard Worker paddd m17, [t3+r10*2+416*16+64] 1693*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*12+64], m1 1694*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*16+64], m2 1695*c0909341SAndroid Build Coastguard Worker mova m0, [dstq+r10] 1696*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m0, m6 1697*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 1698*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m1 ; a * src 1699*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m6 1700*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 1701*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1 1702*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m16, m17, q2020 1703*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m17, q3131 1704*c0909341SAndroid Build Coastguard Worker psubd m1, m2 ; b - a * src + (1 << 8) 1705*c0909341SAndroid Build Coastguard Worker psubd m16, m3 1706*c0909341SAndroid Build Coastguard Worker psrad m1, 9 1707*c0909341SAndroid Build Coastguard Worker psrad m16, 9 1708*c0909341SAndroid Build Coastguard Worker packssdw m1, m16 1709*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m7 1710*c0909341SAndroid Build Coastguard Worker paddw m0, m1 1711*c0909341SAndroid Build Coastguard Worker pmaxsw m0, m6 1712*c0909341SAndroid Build Coastguard Worker pminsw m0, m14 1713*c0909341SAndroid Build Coastguard Worker mova 
[dstq+r10], m0 1714*c0909341SAndroid Build Coastguard Worker add r10, 64 1715*c0909341SAndroid Build Coastguard Worker jl .n1_loop 1716*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1717*c0909341SAndroid Build Coastguard Worker ret 1718*c0909341SAndroid Build Coastguard Worker 1719*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \ 1720*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1721*c0909341SAndroid Build Coastguard Worker movifnidn wd, wm 1722*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1723*c0909341SAndroid Build Coastguard Worker lea r13, [r_ext_mask+72] 1724*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1725*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1726*c0909341SAndroid Build Coastguard Worker vpbroadcastd m7, [paramsq+8] ; w0 w1 1727*c0909341SAndroid Build Coastguard Worker pxor m6, m6 1728*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [base+pd_8] 1729*c0909341SAndroid Build Coastguard Worker add wd, wd 1730*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [base+pd_m9] 1731*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1732*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [base+pd_m25] 1733*c0909341SAndroid Build Coastguard Worker add dstq, wq 1734*c0909341SAndroid Build Coastguard Worker vpsubd m11, m6, [paramsq+0] {1to16} ; -s0 1735*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*2+416*24+8] 1736*c0909341SAndroid Build Coastguard Worker vpsubd m12, m6, [paramsq+4] {1to16} ; -s1 1737*c0909341SAndroid Build Coastguard Worker lea t4, [rsp+wq+416*52+8] 1738*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [base+pw_164_455] 1739*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq+12] 1740*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [base+pw_61448] 1741*c0909341SAndroid Build Coastguard Worker neg wq 1742*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd m15, [base+pd_m34816] 1743*c0909341SAndroid Build Coastguard Worker psllw m7, 2 1744*c0909341SAndroid Build Coastguard Worker vpbroadcastd m22, [base+pd_2147483648] 1745*c0909341SAndroid Build Coastguard Worker mov r10d, 0xfffffff8 1746*c0909341SAndroid Build Coastguard Worker mova m18, [sgr_x_by_x+64*0] 1747*c0909341SAndroid Build Coastguard Worker kmovd k1, r10d 1748*c0909341SAndroid Build Coastguard Worker mova m19, [sgr_x_by_x+64*1] 1749*c0909341SAndroid Build Coastguard Worker mov r10, 0x3333333333333333 1750*c0909341SAndroid Build Coastguard Worker mova m20, [sgr_x_by_x+64*2] 1751*c0909341SAndroid Build Coastguard Worker kmovq k2, r10 1752*c0909341SAndroid Build Coastguard Worker mova m21, [sgr_x_by_x+64*3] 1753*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 1754*c0909341SAndroid Build Coastguard Worker jz .no_top 1755*c0909341SAndroid Build Coastguard Worker call .h_top 1756*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1757*c0909341SAndroid Build Coastguard Worker mov t2, t1 1758*c0909341SAndroid Build Coastguard Worker call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup 1759*c0909341SAndroid Build Coastguard Worker add t1, 416*12 1760*c0909341SAndroid Build Coastguard Worker call .h_top 1761*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1762*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1763*c0909341SAndroid Build Coastguard Worker add r10, strideq 1764*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 1765*c0909341SAndroid Build Coastguard Worker call .hv0 1766*c0909341SAndroid Build Coastguard Worker.main: 1767*c0909341SAndroid Build Coastguard Worker dec hd 1768*c0909341SAndroid Build Coastguard Worker jz .height1 1769*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1770*c0909341SAndroid Build Coastguard Worker call .hv1 1771*c0909341SAndroid Build Coastguard Worker call .prep_n 1772*c0909341SAndroid Build Coastguard Worker sub 
hd, 2 1773*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 1774*c0909341SAndroid Build Coastguard Worker.main_loop: 1775*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1776*c0909341SAndroid Build Coastguard Worker call .hv0 1777*c0909341SAndroid Build Coastguard Worker test hd, hd 1778*c0909341SAndroid Build Coastguard Worker jz .odd_height 1779*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1780*c0909341SAndroid Build Coastguard Worker call .hv1 1781*c0909341SAndroid Build Coastguard Worker call .n0 1782*c0909341SAndroid Build Coastguard Worker call .n1 1783*c0909341SAndroid Build Coastguard Worker sub hd, 2 1784*c0909341SAndroid Build Coastguard Worker jge .main_loop 1785*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 1786*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 1787*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 1788*c0909341SAndroid Build Coastguard Worker call .hv0_bottom 1789*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 1790*c0909341SAndroid Build Coastguard Worker call .hv1_bottom 1791*c0909341SAndroid Build Coastguard Worker.end: 1792*c0909341SAndroid Build Coastguard Worker call .n0 1793*c0909341SAndroid Build Coastguard Worker call .n1 1794*c0909341SAndroid Build Coastguard Worker.end2: 1795*c0909341SAndroid Build Coastguard Worker RET 1796*c0909341SAndroid Build Coastguard Worker.height1: 1797*c0909341SAndroid Build Coastguard Worker call .v1 1798*c0909341SAndroid Build Coastguard Worker call .prep_n 1799*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 1800*c0909341SAndroid Build Coastguard Worker.odd_height: 1801*c0909341SAndroid Build Coastguard Worker call .v1 1802*c0909341SAndroid Build Coastguard Worker call .n0 1803*c0909341SAndroid Build Coastguard Worker call .n1 1804*c0909341SAndroid Build Coastguard Worker.odd_height_end: 1805*c0909341SAndroid Build Coastguard Worker call .v0 1806*c0909341SAndroid Build Coastguard Worker call .v1 
1807*c0909341SAndroid Build Coastguard Worker call .n0 1808*c0909341SAndroid Build Coastguard Worker jmp .end2 1809*c0909341SAndroid Build Coastguard Worker.extend_bottom: 1810*c0909341SAndroid Build Coastguard Worker call .v0 1811*c0909341SAndroid Build Coastguard Worker call .v1 1812*c0909341SAndroid Build Coastguard Worker jmp .end 1813*c0909341SAndroid Build Coastguard Worker.no_top: 1814*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 1815*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 1816*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 1817*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 1818*c0909341SAndroid Build Coastguard Worker call .h 1819*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1820*c0909341SAndroid Build Coastguard Worker lea t2, [t1+416*12] 1821*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: 1822*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10+416* 0] 1823*c0909341SAndroid Build Coastguard Worker mova m1, [t1+r10+416* 2] 1824*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10+416* 4] 1825*c0909341SAndroid Build Coastguard Worker paddw m0, m0 1826*c0909341SAndroid Build Coastguard Worker mova m3, [t1+r10+416* 6] 1827*c0909341SAndroid Build Coastguard Worker paddd m1, m1 1828*c0909341SAndroid Build Coastguard Worker mova m4, [t1+r10+416* 8] 1829*c0909341SAndroid Build Coastguard Worker paddd m2, m2 1830*c0909341SAndroid Build Coastguard Worker mova m5, [t1+r10+416*10] 1831*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 0], m0 1832*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 2], m1 1833*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 4], m2 1834*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 6], m3 1835*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 8], m4 1836*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*10], m5 1837*c0909341SAndroid Build Coastguard Worker add r10, 64 
1838*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 1839*c0909341SAndroid Build Coastguard Worker call .v0 1840*c0909341SAndroid Build Coastguard Worker jmp .main 1841*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 1842*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1843*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1844*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1845*c0909341SAndroid Build Coastguard Worker movq xm16, [leftq+2] 1846*c0909341SAndroid Build Coastguard Worker vmovdqu16 m16{k1}, [lpfq+wq-6] 1847*c0909341SAndroid Build Coastguard Worker add leftq, 8 1848*c0909341SAndroid Build Coastguard Worker jmp .h_main 1849*c0909341SAndroid Build Coastguard Worker.h_extend_left: 1850*c0909341SAndroid Build Coastguard Worker vpbroadcastw xm16, [lpfq+wq] 1851*c0909341SAndroid Build Coastguard Worker vmovdqu16 m16{k1}, [lpfq+wq-6] 1852*c0909341SAndroid Build Coastguard Worker jmp .h_main 1853*c0909341SAndroid Build Coastguard Worker.h_top: 1854*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 1855*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 1856*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 1857*c0909341SAndroid Build Coastguard Worker.h_loop: 1858*c0909341SAndroid Build Coastguard Worker movu m16, [lpfq+r10- 2] 1859*c0909341SAndroid Build Coastguard Worker.h_main: 1860*c0909341SAndroid Build Coastguard Worker movu m17, [lpfq+r10+14] 1861*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 1862*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 1863*c0909341SAndroid Build Coastguard Worker cmp r10d, -68 1864*c0909341SAndroid Build Coastguard Worker jl .h_have_right 1865*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [lpfq-2] 1866*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, [r13+r10+ 0], 0xe4 1867*c0909341SAndroid Build Coastguard Worker vpternlogd m17, m0, [r13+r10+16], 0xe4 
1868*c0909341SAndroid Build Coastguard Worker.h_have_right: 1869*c0909341SAndroid Build Coastguard Worker palignr m3, m17, m16, 2 1870*c0909341SAndroid Build Coastguard Worker palignr m0, m17, m16, 4 1871*c0909341SAndroid Build Coastguard Worker paddw m1, m3, m0 1872*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m0 1873*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m2 1874*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0 1875*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m3 1876*c0909341SAndroid Build Coastguard Worker palignr m0, m17, m16, 6 1877*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; sum3 1878*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m0, m6 1879*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m4, m4 ; sumsq3 1880*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m6 1881*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m0, m0 1882*c0909341SAndroid Build Coastguard Worker shufpd m4, m16, m17, 0x55 1883*c0909341SAndroid Build Coastguard Worker punpcklwd m17, m4, m16 1884*c0909341SAndroid Build Coastguard Worker paddw m0, m16, m4 1885*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m16 1886*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416* 6], m1 1887*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416* 8], m2 1888*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416*10], m3 1889*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; sum5 1890*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m17, m17 ; sumsq5 1891*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m4, m4 1892*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416* 0], m1 1893*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416* 2], m2 1894*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416* 4], m3 1895*c0909341SAndroid Build Coastguard Worker add r10, 64 1896*c0909341SAndroid Build Coastguard Worker jl .h_loop 1897*c0909341SAndroid Build Coastguard Worker ret 
; .hv0 / .hv0_bottom: horizontal boxsum + vertical boxsum + ab3 for even
; rows, 64 bytes per step.
; Entry .hv0 takes the left-edge pixels from leftq when LR_HAVE_LEFT is set
; and advances leftq by 8; entry .hv0_bottom reads them straight from the
; row. Without LR_HAVE_LEFT both entries broadcast the word at [lpfq+wq].
; Per step:
;   - the new horizontal sum3/sumsq3 replace the rows at t1+416*6/8/10 and
;     their sums with the previous t1 contents replace t2+416*6/8/10
;   - the new horizontal sum5/sumsq5 are saved at t3+416*8+8 and
;     t3+416*0+8/+72 and accumulated into t1+416*0/2/4
;   - the packed x3 values go to t4+416*2+4; the dword results go to
;     t3+416*4+8 with the 128-bit lanes of m16 and m17 interleaved
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
    lea           r10, [wq-4]
    test          edgeb, 1                 ; LR_HAVE_LEFT
    jz            .hv0_extend_left
    movq          xm16, [leftq+2]
    vmovdqu16     m16{k1}, [lpfq+wq-6]
    add           leftq, 8
    jmp           .hv0_main
.hv0_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16     m16{k1}, [lpfq+wq-6]
    jmp           .hv0_main
.hv0_bottom:
    lea           r10, [wq-4]
    test          edgeb, 1                 ; LR_HAVE_LEFT
    jz            .hv0_extend_left
.hv0_loop:
    movu          m16, [lpfq+r10- 2]
.hv0_main:
    movu          m17, [lpfq+r10+14]
    test          edgeb, 2                 ; LR_HAVE_RIGHT
    jnz           .hv0_have_right
    cmp           r10d, -68                ; only patch once r10 >= -68
    jl            .hv0_have_right
    vpbroadcastw  m0, [lpfq-2]
    vpternlogd    m16, m0, [r13+r10+ 0], 0xe4 ; 0xe4: mask bit ? keep dst : take m0
    vpternlogd    m17, m0, [r13+r10+16], 0xe4
.hv0_have_right:
    palignr       m3, m17, m16, 2
    palignr       m0, m17, m16, 4
    paddw         m1, m3, m0
    punpcklwd     m2, m3, m0
    pmaddwd       m2, m2
    punpckhwd     m3, m0
    pmaddwd       m3, m3
    palignr       m0, m17, m16, 6
    paddw         m1, m0                   ; h sum3
    punpcklwd     m4, m0, m6
    vpdpwssd      m2, m4, m4               ; h sumsq3
    punpckhwd     m0, m6
    vpdpwssd      m3, m0, m0
    shufpd        m17, m16, m17, 0x55
    paddw         m4, m1, [t1+r10+416* 6]
    paddd         m5, m2, [t1+r10+416* 8]
    mova          [t1+r10+416* 6], m1
    mova          [t1+r10+416* 8], m2
    paddw         m1, m16
    paddw         m1, m17                  ; h sum5
    punpcklwd     m0, m17, m16
    vpdpwssd      m2, m0, m0               ; h sumsq5
    paddd         m0, m3, [t1+r10+416*10]
    mova          [t1+r10+416*10], m3
    punpckhwd     m17, m16
    vpdpwssd      m3, m17, m17
    mova          [t3+r10*2+416*8+ 8], m1  ; we need a clean copy of the last row
    mova          [t3+r10*2+416*0+ 8], m2  ; in case height is odd
    mova          [t3+r10*2+416*0+72], m3
    paddw         m1, [t1+r10+416* 0]
    paddd         m2, [t1+r10+416* 2]
    paddd         m3, [t1+r10+416* 4]
    mova          [t1+r10+416* 0], m1
    mova          [t1+r10+416* 2], m2
    mova          [t1+r10+416* 4], m3
    paddw         m17, m4, [t2+r10+416* 6]
    paddd         m2, m5, [t2+r10+416* 8]
    paddd         m3, m0, [t2+r10+416*10]
    mova          [t2+r10+416* 6], m4
    mova          [t2+r10+416* 8], m5
    mova          [t2+r10+416*10], m0
    paddd         m2, m8
    paddd         m3, m8
    psrld         m2, 4                    ; (a3 + 8) >> 4
    psrld         m3, 4
    pmulld        m2, m9                   ; -((a3 + 8) >> 4) * 9
    pmulld        m3, m9
    psrlw         m5, m17, 1
    pavgw         m5, m6                   ; (b3 + 2) >> 2
    punpcklwd     m4, m5, m6
    vpdpwssd      m2, m4, m4               ; -p3
    punpckhwd     m5, m6
    vpdpwssd      m3, m5, m5
    punpcklwd     m16, m6, m17             ; b3
    punpckhwd     m17, m6, m17
    pminsd        m2, m6
    pminsd        m3, m6
    pmulld        m2, m12                  ; p3 * s1
    pmulld        m3, m12
    pmaddwd       m16, m13                 ; b3 * 455
    pmaddwd       m17, m13
    vpalignr      m3{k2}, m2, m2, 2
    mova          m2, m20
    paddusw       m3, m14
    psraw         m3, 4                    ; min(z3, 255) - 256
    vpermt2b      m2, m3, m21              ; sgr_x_by_x[128..255]
    vpmovb2m      k3, m3
    vpermi2b      m3, m18, m19             ; sgr_x_by_x[  0..127]
    vmovdqu8      m3{k3}, m2               ; x3
    pandn         m2, m15, m3
    psrld         m3, 16
    pmulld        m16, m2
    pmulld        m17, m3
    packssdw      m2, m3
    mova          [t4+r10*1+416*2+4], m2
    psubd         m16, m15                 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    psubd         m17, m15
    psrld         m16, 12
    psrld         m17, 12
    mova          [t3+r10*2+416*4+  8], xm16
    mova          [t3+r10*2+416*4+ 24], xm17
    vextracti128  [t3+r10*2+416*4+ 40], ym16, 1
    vextracti128  [t3+r10*2+416*4+ 56], ym17, 1
    vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
    vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
    vextracti32x4 [t3+r10*2+416*4+104], m16, 3
    vextracti32x4 [t3+r10*2+416*4+120], m17, 3
    add           r10, 64
    jl            .hv0_loop
    ret

; .hv1 / .hv1_bottom: odd-row counterpart. The two entry points and the
; left-edge handling follow the same pattern as .hv0 / .hv0_bottom; the
; right-edge check started here continues on the following lines.
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea           r10, [wq-4]
    test          edgeb, 1                 ; LR_HAVE_LEFT
    jz            .hv1_extend_left
    movq          xm16, [leftq+2]
    vmovdqu16     m16{k1}, [lpfq+wq-6]
    add           leftq, 8
    jmp           .hv1_main
.hv1_extend_left:
    vpbroadcastw  xm16, [lpfq+wq]
    vmovdqu16     m16{k1}, [lpfq+wq-6]
    jmp           .hv1_main
.hv1_bottom:
    lea           r10, [wq-4]
    test          edgeb, 1                 ; LR_HAVE_LEFT
    jz            .hv1_extend_left
.hv1_loop:
    movu          m16, [lpfq+r10- 2]
.hv1_main:
    movu          m17, [lpfq+r10+14]
    test          edgeb, 2                 ; LR_HAVE_RIGHT
    jnz           .hv1_have_right
Worker cmp r10d, -68 2041*c0909341SAndroid Build Coastguard Worker jl .hv1_have_right 2042*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, [lpfq-2] 2043*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, [r13+r10+ 0], 0xe4 2044*c0909341SAndroid Build Coastguard Worker vpternlogd m17, m0, [r13+r10+16], 0xe4 2045*c0909341SAndroid Build Coastguard Worker.hv1_have_right: 2046*c0909341SAndroid Build Coastguard Worker palignr m1, m17, m16, 2 2047*c0909341SAndroid Build Coastguard Worker palignr m3, m17, m16, 4 2048*c0909341SAndroid Build Coastguard Worker paddw m2, m1, m3 2049*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m3 2050*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m0 2051*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m3 2052*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m1 2053*c0909341SAndroid Build Coastguard Worker palignr m3, m17, m16, 6 2054*c0909341SAndroid Build Coastguard Worker paddw m2, m3 ; h sum3 2055*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m3, m6 2056*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m5, m5 ; h sumsq3 2057*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 2058*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m3, m3 2059*c0909341SAndroid Build Coastguard Worker shufpd m3, m16, m17, 0x55 2060*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m16, m3 2061*c0909341SAndroid Build Coastguard Worker paddw m4, m16, m3 2062*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m3 2063*c0909341SAndroid Build Coastguard Worker paddw m17, m2, [t2+r10+416* 6] 2064*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 6], m2 2065*c0909341SAndroid Build Coastguard Worker paddw m4, m2 ; h sum5 2066*c0909341SAndroid Build Coastguard Worker paddd m2, m0, [t2+r10+416* 8] 2067*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t2+r10+416*10] 2068*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 8], m0 2069*c0909341SAndroid Build 
Coastguard Worker mova [t2+r10+416*10], m1 2070*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m5, m5 ; h sumsq5 2071*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m16, m16 2072*c0909341SAndroid Build Coastguard Worker paddd m2, m8 2073*c0909341SAndroid Build Coastguard Worker paddd m3, m8 2074*c0909341SAndroid Build Coastguard Worker psrld m2, 4 ; (a3 + 8) >> 4 2075*c0909341SAndroid Build Coastguard Worker psrld m3, 4 2076*c0909341SAndroid Build Coastguard Worker pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 2077*c0909341SAndroid Build Coastguard Worker pmulld m3, m9 2078*c0909341SAndroid Build Coastguard Worker psrlw m16, m17, 1 2079*c0909341SAndroid Build Coastguard Worker pavgw m16, m6 ; (b3 + 2) >> 2 2080*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m16, m6 2081*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m5, m5 ; -p3 2082*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m6 2083*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m16, m16 2084*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m6, m17 ; b3 2085*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m6, m17 2086*c0909341SAndroid Build Coastguard Worker pminsd m2, m6 2087*c0909341SAndroid Build Coastguard Worker pminsd m3, m6 2088*c0909341SAndroid Build Coastguard Worker pmulld m2, m12 ; p3 * s1 2089*c0909341SAndroid Build Coastguard Worker pmulld m3, m12 2090*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m13 ; b3 * 455 2091*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m13 2092*c0909341SAndroid Build Coastguard Worker vpalignr m3{k2}, m2, m2, 2 2093*c0909341SAndroid Build Coastguard Worker mova m2, m20 2094*c0909341SAndroid Build Coastguard Worker paddusw m3, m14 2095*c0909341SAndroid Build Coastguard Worker psraw m3, 4 ; min(z3, 255) - 256 2096*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 2097*c0909341SAndroid Build Coastguard Worker vpmovb2m k3, m3 2098*c0909341SAndroid Build Coastguard Worker 
vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 2099*c0909341SAndroid Build Coastguard Worker vmovdqu8 m3{k3}, m2 ; x3 2100*c0909341SAndroid Build Coastguard Worker pandn m2, m15, m3 2101*c0909341SAndroid Build Coastguard Worker psrld m3, 16 2102*c0909341SAndroid Build Coastguard Worker pmulld m16, m2 2103*c0909341SAndroid Build Coastguard Worker pmulld m17, m3 2104*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 2105*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*4+4], m2 2106*c0909341SAndroid Build Coastguard Worker psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2107*c0909341SAndroid Build Coastguard Worker psubd m17, m15 2108*c0909341SAndroid Build Coastguard Worker psrld m16, 12 2109*c0909341SAndroid Build Coastguard Worker psrld m17, 12 2110*c0909341SAndroid Build Coastguard Worker paddw m5, m4, [t2+r10+416*0] 2111*c0909341SAndroid Build Coastguard Worker paddd m2, m0, [t2+r10+416*2] 2112*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t2+r10+416*4] 2113*c0909341SAndroid Build Coastguard Worker paddw m5, [t1+r10+416*0] 2114*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+r10+416*2] 2115*c0909341SAndroid Build Coastguard Worker paddd m3, [t1+r10+416*4] 2116*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*0], m4 2117*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*2], m0 2118*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*4], m1 2119*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*8+ 8], xm16 2120*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*8+ 24], xm17 2121*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 2122*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 2123*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 2124*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 2125*c0909341SAndroid Build Coastguard Worker vextracti32x4 
[t3+r10*2+416*8+104], m16, 3 2126*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*8+120], m17, 3 2127*c0909341SAndroid Build Coastguard Worker paddd m2, m8 2128*c0909341SAndroid Build Coastguard Worker paddd m3, m8 2129*c0909341SAndroid Build Coastguard Worker psrld m2, 4 ; (a5 + 8) >> 4 2130*c0909341SAndroid Build Coastguard Worker psrld m3, 4 2131*c0909341SAndroid Build Coastguard Worker pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 2132*c0909341SAndroid Build Coastguard Worker pmulld m3, m10 2133*c0909341SAndroid Build Coastguard Worker psrlw m17, m5, 1 2134*c0909341SAndroid Build Coastguard Worker pavgw m17, m6 ; (b5 + 2) >> 2 2135*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m17, m6 2136*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m16, m16 ; -p5 2137*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m6 2138*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m17, m17 2139*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m5, m6 ; b5 2140*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m5, m6 2141*c0909341SAndroid Build Coastguard Worker pmulld m2, m11 ; p5 * s0 2142*c0909341SAndroid Build Coastguard Worker pmulld m3, m11 2143*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m13 ; b5 * 164 2144*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m13 2145*c0909341SAndroid Build Coastguard Worker vpalignr m3{k2}, m2, m2, 2 2146*c0909341SAndroid Build Coastguard Worker mova m2, m20 2147*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m6 2148*c0909341SAndroid Build Coastguard Worker paddusw m3, m14 2149*c0909341SAndroid Build Coastguard Worker psraw m3, 4 ; min(z5, 255) - 256 2150*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 2151*c0909341SAndroid Build Coastguard Worker vpmovb2m k3, m3 2152*c0909341SAndroid Build Coastguard Worker vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 2153*c0909341SAndroid Build Coastguard Worker vmovdqu8 m3{k3}, m2 ; x5 
2154*c0909341SAndroid Build Coastguard Worker pandn m2, m15, m3 2155*c0909341SAndroid Build Coastguard Worker psrld m3, 16 2156*c0909341SAndroid Build Coastguard Worker pmulld m16, m2 2157*c0909341SAndroid Build Coastguard Worker pmulld m17, m3 2158*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 2159*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*0+4], m2 2160*c0909341SAndroid Build Coastguard Worker psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 2161*c0909341SAndroid Build Coastguard Worker psubd m17, m15 2162*c0909341SAndroid Build Coastguard Worker psrld m16, 12 2163*c0909341SAndroid Build Coastguard Worker psrld m17, 12 2164*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*0+ 8], xm16 2165*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*0+ 24], xm17 2166*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 2167*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 2168*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 2169*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 2170*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+104], m16, 3 2171*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+120], m17, 3 2172*c0909341SAndroid Build Coastguard Worker add r10, 64 2173*c0909341SAndroid Build Coastguard Worker jl .hv1_loop 2174*c0909341SAndroid Build Coastguard Worker mov r10, t2 2175*c0909341SAndroid Build Coastguard Worker mov t2, t1 2176*c0909341SAndroid Build Coastguard Worker mov t1, r10 2177*c0909341SAndroid Build Coastguard Worker ret 2178*c0909341SAndroid Build Coastguard Worker.v0: ; vertical boxsums + ab3 (even rows) 2179*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 2180*c0909341SAndroid Build Coastguard Worker.v0_loop: 2181*c0909341SAndroid Build Coastguard Worker mova m16, [t1+r10+416* 6] 2182*c0909341SAndroid Build 
Coastguard Worker mova m2, [t1+r10+416* 8] 2183*c0909341SAndroid Build Coastguard Worker mova m3, [t1+r10+416*10] 2184*c0909341SAndroid Build Coastguard Worker paddw m16, m16 2185*c0909341SAndroid Build Coastguard Worker paddd m2, m2 2186*c0909341SAndroid Build Coastguard Worker paddd m3, m3 2187*c0909341SAndroid Build Coastguard Worker paddw m17, m16, [t2+r10+416* 6] 2188*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t2+r10+416* 8] 2189*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t2+r10+416*10] 2190*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 6], m16 2191*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 8], m2 2192*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*10], m3 2193*c0909341SAndroid Build Coastguard Worker paddd m4, m8 2194*c0909341SAndroid Build Coastguard Worker paddd m5, m8 2195*c0909341SAndroid Build Coastguard Worker psrld m4, 4 ; (a3 + 8) >> 4 2196*c0909341SAndroid Build Coastguard Worker psrld m5, 4 2197*c0909341SAndroid Build Coastguard Worker pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 2198*c0909341SAndroid Build Coastguard Worker pmulld m5, m9 2199*c0909341SAndroid Build Coastguard Worker psrlw m3, m17, 1 2200*c0909341SAndroid Build Coastguard Worker pavgw m3, m6 ; (b3 + 2) >> 2 2201*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 2202*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m2, m2 ; -p3 2203*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 2204*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m3, m3 2205*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m6, m17 ; b3 2206*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m6, m17 2207*c0909341SAndroid Build Coastguard Worker pminsd m4, m6 2208*c0909341SAndroid Build Coastguard Worker pminsd m5, m6 2209*c0909341SAndroid Build Coastguard Worker pmulld m4, m12 ; p3 * s1 2210*c0909341SAndroid Build Coastguard Worker pmulld m5, m12 2211*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m13 ; b3 
* 455 2212*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m13 2213*c0909341SAndroid Build Coastguard Worker vpalignr m5{k2}, m4, m4, 2 2214*c0909341SAndroid Build Coastguard Worker mova m4, m20 2215*c0909341SAndroid Build Coastguard Worker paddusw m5, m14 2216*c0909341SAndroid Build Coastguard Worker psraw m5, 4 ; min(z3, 255) - 256 2217*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] 2218*c0909341SAndroid Build Coastguard Worker vpmovb2m k3, m5 2219*c0909341SAndroid Build Coastguard Worker vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] 2220*c0909341SAndroid Build Coastguard Worker vmovdqu8 m5{k3}, m4 ; x3 2221*c0909341SAndroid Build Coastguard Worker pandn m4, m15, m5 2222*c0909341SAndroid Build Coastguard Worker psrld m5, 16 2223*c0909341SAndroid Build Coastguard Worker pmulld m16, m4 2224*c0909341SAndroid Build Coastguard Worker pmulld m17, m5 2225*c0909341SAndroid Build Coastguard Worker packssdw m4, m5 2226*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*2+4], m4 2227*c0909341SAndroid Build Coastguard Worker psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2228*c0909341SAndroid Build Coastguard Worker psubd m17, m15 2229*c0909341SAndroid Build Coastguard Worker psrld m16, 12 2230*c0909341SAndroid Build Coastguard Worker psrld m17, 12 2231*c0909341SAndroid Build Coastguard Worker mova m3, [t1+r10+416*0] 2232*c0909341SAndroid Build Coastguard Worker mova m4, [t1+r10+416*2] 2233*c0909341SAndroid Build Coastguard Worker mova m5, [t1+r10+416*4] 2234*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*8+ 8], m3 2235*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*0+ 8], m4 2236*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*0+72], m5 2237*c0909341SAndroid Build Coastguard Worker paddw m3, m3 ; cc5 2238*c0909341SAndroid Build Coastguard Worker paddd m4, m4 2239*c0909341SAndroid Build Coastguard Worker paddd m5, m5 2240*c0909341SAndroid Build Coastguard Worker mova 
[t1+r10+416*0], m3 2241*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416*2], m4 2242*c0909341SAndroid Build Coastguard Worker mova [t1+r10+416*4], m5 2243*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*4+ 8], xm16 2244*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*4+ 24], xm17 2245*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 2246*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 2247*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 2248*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 2249*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*4+104], m16, 3 2250*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*4+120], m17, 3 2251*c0909341SAndroid Build Coastguard Worker add r10, 64 2252*c0909341SAndroid Build Coastguard Worker jl .v0_loop 2253*c0909341SAndroid Build Coastguard Worker ret 2254*c0909341SAndroid Build Coastguard Worker.v1: ; vertical boxsums + ab (odd rows) 2255*c0909341SAndroid Build Coastguard Worker lea r10, [wq-4] 2256*c0909341SAndroid Build Coastguard Worker.v1_loop: 2257*c0909341SAndroid Build Coastguard Worker mova m16, [t1+r10+416* 6] 2258*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10+416* 8] 2259*c0909341SAndroid Build Coastguard Worker mova m3, [t1+r10+416*10] 2260*c0909341SAndroid Build Coastguard Worker paddw m17, m16, [t2+r10+416* 6] 2261*c0909341SAndroid Build Coastguard Worker paddd m4, m2, [t2+r10+416* 8] 2262*c0909341SAndroid Build Coastguard Worker paddd m5, m3, [t2+r10+416*10] 2263*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 6], m16 2264*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416* 8], m2 2265*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*10], m3 2266*c0909341SAndroid Build Coastguard Worker paddd m4, m8 2267*c0909341SAndroid Build Coastguard Worker paddd m5, m8 
2268*c0909341SAndroid Build Coastguard Worker psrld m4, 4 ; (a3 + 8) >> 4 2269*c0909341SAndroid Build Coastguard Worker psrld m5, 4 2270*c0909341SAndroid Build Coastguard Worker pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 2271*c0909341SAndroid Build Coastguard Worker pmulld m5, m9 2272*c0909341SAndroid Build Coastguard Worker psrlw m3, m17, 1 2273*c0909341SAndroid Build Coastguard Worker pavgw m3, m6 ; (b3 + 2) >> 2 2274*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m6 2275*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m2, m2 ; -p3 2276*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m6 2277*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m3, m3 2278*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m6, m17 ; b3 2279*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m6, m17 2280*c0909341SAndroid Build Coastguard Worker pminsd m4, m6 2281*c0909341SAndroid Build Coastguard Worker pminsd m5, m6 2282*c0909341SAndroid Build Coastguard Worker pmulld m4, m12 ; p3 * s1 2283*c0909341SAndroid Build Coastguard Worker pmulld m5, m12 2284*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m13 ; b3 * 455 2285*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m13 2286*c0909341SAndroid Build Coastguard Worker vpalignr m5{k2}, m4, m4, 2 2287*c0909341SAndroid Build Coastguard Worker mova m4, m20 2288*c0909341SAndroid Build Coastguard Worker paddusw m5, m14 2289*c0909341SAndroid Build Coastguard Worker psraw m5, 4 ; min(z3, 255) - 256 2290*c0909341SAndroid Build Coastguard Worker vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] 2291*c0909341SAndroid Build Coastguard Worker vpmovb2m k3, m5 2292*c0909341SAndroid Build Coastguard Worker vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] 2293*c0909341SAndroid Build Coastguard Worker vmovdqu8 m5{k3}, m4 ; x3 2294*c0909341SAndroid Build Coastguard Worker pandn m4, m15, m5 2295*c0909341SAndroid Build Coastguard Worker psrld m5, 16 2296*c0909341SAndroid Build Coastguard Worker pmulld m16, m4 
2297*c0909341SAndroid Build Coastguard Worker pmulld m17, m5 2298*c0909341SAndroid Build Coastguard Worker packssdw m4, m5 2299*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*4+4], m4 2300*c0909341SAndroid Build Coastguard Worker psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2301*c0909341SAndroid Build Coastguard Worker psubd m17, m15 2302*c0909341SAndroid Build Coastguard Worker psrld m16, 12 2303*c0909341SAndroid Build Coastguard Worker psrld m17, 12 2304*c0909341SAndroid Build Coastguard Worker mova m0, [t3+r10*2+416*8+ 8] 2305*c0909341SAndroid Build Coastguard Worker mova m4, [t3+r10*2+416*0+ 8] 2306*c0909341SAndroid Build Coastguard Worker mova m5, [t3+r10*2+416*0+72] 2307*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t2+r10+416*0] 2308*c0909341SAndroid Build Coastguard Worker paddd m2, m4, [t2+r10+416*2] 2309*c0909341SAndroid Build Coastguard Worker paddd m3, m5, [t2+r10+416*4] 2310*c0909341SAndroid Build Coastguard Worker paddw m1, [t1+r10+416*0] 2311*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+r10+416*2] 2312*c0909341SAndroid Build Coastguard Worker paddd m3, [t1+r10+416*4] 2313*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*0], m0 2314*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*2], m4 2315*c0909341SAndroid Build Coastguard Worker mova [t2+r10+416*4], m5 2316*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*8+ 8], xm16 2317*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*8+ 24], xm17 2318*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 2319*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 2320*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 2321*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 2322*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*8+104], m16, 3 2323*c0909341SAndroid Build Coastguard Worker 
vextracti32x4 [t3+r10*2+416*8+120], m17, 3 2324*c0909341SAndroid Build Coastguard Worker paddd m2, m8 2325*c0909341SAndroid Build Coastguard Worker paddd m3, m8 2326*c0909341SAndroid Build Coastguard Worker psrld m2, 4 ; (a5 + 8) >> 4 2327*c0909341SAndroid Build Coastguard Worker psrld m3, 4 2328*c0909341SAndroid Build Coastguard Worker pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 2329*c0909341SAndroid Build Coastguard Worker pmulld m3, m10 2330*c0909341SAndroid Build Coastguard Worker psrlw m5, m1, 1 2331*c0909341SAndroid Build Coastguard Worker pavgw m5, m6 ; (b5 + 2) >> 2 2332*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5, m6 2333*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m4, m4 ; -p5 2334*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6 2335*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m5, m5 2336*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m1, m6 ; b5 2337*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m1, m6 2338*c0909341SAndroid Build Coastguard Worker pmulld m2, m11 ; p5 * s0 2339*c0909341SAndroid Build Coastguard Worker pmulld m3, m11 2340*c0909341SAndroid Build Coastguard Worker pmaddwd m16, m13 ; b5 * 164 2341*c0909341SAndroid Build Coastguard Worker pmaddwd m17, m13 2342*c0909341SAndroid Build Coastguard Worker vpalignr m3{k2}, m2, m2, 2 2343*c0909341SAndroid Build Coastguard Worker mova m2, m20 2344*c0909341SAndroid Build Coastguard Worker pmaxsw m3, m6 2345*c0909341SAndroid Build Coastguard Worker paddusw m3, m14 2346*c0909341SAndroid Build Coastguard Worker psraw m3, 4 ; min(z5, 255) - 256 2347*c0909341SAndroid Build Coastguard Worker vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] 2348*c0909341SAndroid Build Coastguard Worker vpmovb2m k3, m3 2349*c0909341SAndroid Build Coastguard Worker vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] 2350*c0909341SAndroid Build Coastguard Worker vmovdqu8 m3{k3}, m2 ; x5 2351*c0909341SAndroid Build Coastguard Worker pandn m2, m15, m3 2352*c0909341SAndroid Build 
Coastguard Worker psrld m3, 16 2353*c0909341SAndroid Build Coastguard Worker pmulld m16, m2 2354*c0909341SAndroid Build Coastguard Worker pmulld m17, m3 2355*c0909341SAndroid Build Coastguard Worker packssdw m2, m3 2356*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*0+4], m2 2357*c0909341SAndroid Build Coastguard Worker psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 2358*c0909341SAndroid Build Coastguard Worker psubd m17, m15 2359*c0909341SAndroid Build Coastguard Worker psrld m16, 12 2360*c0909341SAndroid Build Coastguard Worker psrld m17, 12 2361*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*0+ 8], xm16 2362*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*0+ 24], xm17 2363*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 2364*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 2365*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 2366*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 2367*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+104], m16, 3 2368*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*2+416*0+120], m17, 3 2369*c0909341SAndroid Build Coastguard Worker add r10, 64 2370*c0909341SAndroid Build Coastguard Worker jl .v1_loop 2371*c0909341SAndroid Build Coastguard Worker mov r10, t2 2372*c0909341SAndroid Build Coastguard Worker mov t2, t1 2373*c0909341SAndroid Build Coastguard Worker mov t1, r10 2374*c0909341SAndroid Build Coastguard Worker ret 2375*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup 2376*c0909341SAndroid Build Coastguard Worker mov r10, wq 2377*c0909341SAndroid Build Coastguard Worker.prep_n_loop: 2378*c0909341SAndroid Build Coastguard Worker movu ym0, [t4+r10*1+416*0+2] 2379*c0909341SAndroid Build Coastguard Worker paddw ym2, ym0, [t4+r10*1+416*0+0] 2380*c0909341SAndroid Build Coastguard Worker 
paddw ym2, [t4+r10*1+416*0+4] 2381*c0909341SAndroid Build Coastguard Worker movu m1, [t3+r10*2+416*0+4] 2382*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t3+r10*2+416*0+0] 2383*c0909341SAndroid Build Coastguard Worker paddd m3, [t3+r10*2+416*0+8] 2384*c0909341SAndroid Build Coastguard Worker paddw ym0, ym2 2385*c0909341SAndroid Build Coastguard Worker paddd m1, m3 2386*c0909341SAndroid Build Coastguard Worker psllw ym2, 2 2387*c0909341SAndroid Build Coastguard Worker pslld m3, 2 2388*c0909341SAndroid Build Coastguard Worker paddw ym0, ym2 ; a5 565 2389*c0909341SAndroid Build Coastguard Worker paddd m1, m3 ; b5 565 2390*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416* 6], ym0 2391*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*12], m1 2392*c0909341SAndroid Build Coastguard Worker mova ym0, [t4+r10*1+416*2+0] 2393*c0909341SAndroid Build Coastguard Worker paddw ym0, [t4+r10*1+416*2+4] 2394*c0909341SAndroid Build Coastguard Worker paddw ym2, ym0, [t4+r10*1+416*2+2] 2395*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*2+416*4+0] 2396*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+416*4+8] 2397*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t3+r10*2+416*4+4] 2398*c0909341SAndroid Build Coastguard Worker psllw ym2, 2 ; a3[-1] 444 2399*c0909341SAndroid Build Coastguard Worker pslld m3, 2 ; b3[-1] 444 2400*c0909341SAndroid Build Coastguard Worker psubw ym2, ym0 ; a3[-1] 343 2401*c0909341SAndroid Build Coastguard Worker psubd m3, m1 ; b3[-1] 343 2402*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416* 8], ym2 2403*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*16], m3 2404*c0909341SAndroid Build Coastguard Worker mova ym0, [t4+r10*1+416*4+0] 2405*c0909341SAndroid Build Coastguard Worker paddw ym0, [t4+r10*1+416*4+4] 2406*c0909341SAndroid Build Coastguard Worker paddw ym2, ym0, [t4+r10*1+416*4+2] 2407*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*2+416*8+0] 
2408*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+416*8+8] 2409*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t3+r10*2+416*8+4] 2410*c0909341SAndroid Build Coastguard Worker psllw ym2, 2 ; a3[ 0] 444 2411*c0909341SAndroid Build Coastguard Worker pslld m3, 2 ; b3[ 0] 444 2412*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*10], ym2 2413*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*20], m3 2414*c0909341SAndroid Build Coastguard Worker psubw ym2, ym0 ; a3[ 0] 343 2415*c0909341SAndroid Build Coastguard Worker psubd m3, m1 ; b3[ 0] 343 2416*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*12], ym2 2417*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*24], m3 2418*c0909341SAndroid Build Coastguard Worker add r10, 32 2419*c0909341SAndroid Build Coastguard Worker jl .prep_n_loop 2420*c0909341SAndroid Build Coastguard Worker ret 2421*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2422*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows) 2423*c0909341SAndroid Build Coastguard Worker mov r10, wq 2424*c0909341SAndroid Build Coastguard Worker.n0_loop: 2425*c0909341SAndroid Build Coastguard Worker movu ym2, [t4+r10*1+2] 2426*c0909341SAndroid Build Coastguard Worker paddw ym0, ym2, [t4+r10*1+0] 2427*c0909341SAndroid Build Coastguard Worker paddw ym0, [t4+r10*1+4] 2428*c0909341SAndroid Build Coastguard Worker paddw ym2, ym0 2429*c0909341SAndroid Build Coastguard Worker psllw ym0, 2 2430*c0909341SAndroid Build Coastguard Worker paddw ym0, ym2 ; a5 2431*c0909341SAndroid Build Coastguard Worker movu m1, [t3+r10*2+4] 2432*c0909341SAndroid Build Coastguard Worker paddd m4, m1, [t3+r10*2+0] 2433*c0909341SAndroid Build Coastguard Worker paddd m4, [t3+r10*2+8] 2434*c0909341SAndroid Build Coastguard Worker paddd m1, m4 2435*c0909341SAndroid Build Coastguard Worker pslld m4, 2 2436*c0909341SAndroid Build Coastguard Worker paddd m4, m1 ; b5 2437*c0909341SAndroid Build Coastguard 
Worker paddw ym2, ym0, [t4+r10*1+416* 6] 2438*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416* 6], ym0 2439*c0909341SAndroid Build Coastguard Worker paddd m0, m4, [t3+r10*2+416*12] 2440*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*12], m4 2441*c0909341SAndroid Build Coastguard Worker mova ym3, [t4+r10*1+416*2+0] 2442*c0909341SAndroid Build Coastguard Worker paddw ym3, [t4+r10*1+416*2+4] 2443*c0909341SAndroid Build Coastguard Worker paddw ym5, ym3, [t4+r10*1+416*2+2] 2444*c0909341SAndroid Build Coastguard Worker psllw ym5, 2 ; a3[ 1] 444 2445*c0909341SAndroid Build Coastguard Worker psubw ym4, ym5, ym3 ; a3[ 1] 343 2446*c0909341SAndroid Build Coastguard Worker paddw ym3, ym4, [t4+r10*1+416* 8] 2447*c0909341SAndroid Build Coastguard Worker paddw ym3, [t4+r10*1+416*10] 2448*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416* 8], ym4 2449*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*10], ym5 2450*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*2+416*4+0] 2451*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+416*4+8] 2452*c0909341SAndroid Build Coastguard Worker paddd m5, m1, [t3+r10*2+416*4+4] 2453*c0909341SAndroid Build Coastguard Worker pslld m5, 2 ; b3[ 1] 444 2454*c0909341SAndroid Build Coastguard Worker psubd m4, m5, m1 ; b3[ 1] 343 2455*c0909341SAndroid Build Coastguard Worker paddd m1, m4, [t3+r10*2+416*16] 2456*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*2+416*20] 2457*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*16], m4 2458*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*20], m5 2459*c0909341SAndroid Build Coastguard Worker pmovzxwd m4, [dstq+r10] 2460*c0909341SAndroid Build Coastguard Worker pmovzxwd m2, ym2 ; a5 2461*c0909341SAndroid Build Coastguard Worker pmovzxwd m3, ym3 ; a3 2462*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m4 ; a5 * src 2463*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m4 ; a3 * src 2464*c0909341SAndroid 
Build Coastguard Worker vpshldd m4, m22, 13 2465*c0909341SAndroid Build Coastguard Worker psubd m0, m2 ; b5 - a5 * src + (1 << 8) 2466*c0909341SAndroid Build Coastguard Worker psubd m1, m3 ; b3 - a3 * src + (1 << 8) 2467*c0909341SAndroid Build Coastguard Worker psrld m0, 9 2468*c0909341SAndroid Build Coastguard Worker pslld m1, 7 2469*c0909341SAndroid Build Coastguard Worker vpblendmb m0{k2}, m1, m0 2470*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m0, m7 2471*c0909341SAndroid Build Coastguard Worker psrad m4, 7 2472*c0909341SAndroid Build Coastguard Worker pmaxsd m4, m6 2473*c0909341SAndroid Build Coastguard Worker vpmovusdw ym16, m4 ; clip 2474*c0909341SAndroid Build Coastguard Worker psrlw ym16, 6 2475*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], ym16 2476*c0909341SAndroid Build Coastguard Worker add r10, 32 2477*c0909341SAndroid Build Coastguard Worker jl .n0_loop 2478*c0909341SAndroid Build Coastguard Worker add dstq, strideq 2479*c0909341SAndroid Build Coastguard Worker ret 2480*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2481*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows) 2482*c0909341SAndroid Build Coastguard Worker mov r10, wq 2483*c0909341SAndroid Build Coastguard Worker.n1_loop: 2484*c0909341SAndroid Build Coastguard Worker mova ym3, [t4+r10*1+416*4+0] 2485*c0909341SAndroid Build Coastguard Worker paddw ym3, [t4+r10*1+416*4+4] 2486*c0909341SAndroid Build Coastguard Worker paddw ym5, ym3, [t4+r10*1+416*4+2] 2487*c0909341SAndroid Build Coastguard Worker psllw ym5, 2 ; a3[ 1] 444 2488*c0909341SAndroid Build Coastguard Worker psubw ym4, ym5, ym3 ; a3[ 1] 343 2489*c0909341SAndroid Build Coastguard Worker paddw ym3, ym4, [t4+r10*1+416*12] 2490*c0909341SAndroid Build Coastguard Worker paddw ym3, [t4+r10*1+416*10] 2491*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*10], ym5 2492*c0909341SAndroid Build Coastguard Worker mova [t4+r10*1+416*12], ym4 2493*c0909341SAndroid Build 
Coastguard Worker mova m0, [t3+r10*2+416*8+0] 2494*c0909341SAndroid Build Coastguard Worker paddd m0, [t3+r10*2+416*8+8] 2495*c0909341SAndroid Build Coastguard Worker paddd m5, m0, [t3+r10*2+416*8+4] 2496*c0909341SAndroid Build Coastguard Worker pslld m5, 2 ; b3[ 1] 444 2497*c0909341SAndroid Build Coastguard Worker psubd m4, m5, m0 ; b3[ 1] 343 2498*c0909341SAndroid Build Coastguard Worker paddd m0, m4, [t3+r10*2+416*24] 2499*c0909341SAndroid Build Coastguard Worker paddd m0, [t3+r10*2+416*20] 2500*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*20], m5 2501*c0909341SAndroid Build Coastguard Worker mova [t3+r10*2+416*24], m4 2502*c0909341SAndroid Build Coastguard Worker pmovzxwd m4, [dstq+r10] 2503*c0909341SAndroid Build Coastguard Worker pmovzxwd m2, [t4+r10*1+416* 6] 2504*c0909341SAndroid Build Coastguard Worker pmovzxwd m3, ym3 2505*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*2+416*12] 2506*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m4 ; a5 * src 2507*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m4 ; a3 * src 2508*c0909341SAndroid Build Coastguard Worker vpshldd m4, m22, 13 2509*c0909341SAndroid Build Coastguard Worker psubd m1, m2 ; b5 - a5 * src + (1 << 8) 2510*c0909341SAndroid Build Coastguard Worker psubd m0, m3 ; b3 - a3 * src + (1 << 8) 2511*c0909341SAndroid Build Coastguard Worker pslld m0, 7 2512*c0909341SAndroid Build Coastguard Worker vpalignr m0{k2}, m1, m1, 1 2513*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m0, m7 2514*c0909341SAndroid Build Coastguard Worker psrad m4, 7 2515*c0909341SAndroid Build Coastguard Worker pmaxsd m4, m6 2516*c0909341SAndroid Build Coastguard Worker vpmovusdw ym16, m4 ; clip 2517*c0909341SAndroid Build Coastguard Worker psrlw ym16, 6 2518*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], ym16 2519*c0909341SAndroid Build Coastguard Worker add r10, 32 2520*c0909341SAndroid Build Coastguard Worker jl .n1_loop 2521*c0909341SAndroid Build Coastguard Worker add dstq, 
strideq 2522*c0909341SAndroid Build Coastguard Worker ret 2523*c0909341SAndroid Build Coastguard Worker 2524*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64 2525