; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25*c0909341SAndroid Build Coastguard Worker 26*c0909341SAndroid Build Coastguard Worker%include "config.asm" 27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm" 28*c0909341SAndroid Build Coastguard Worker 29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 30*c0909341SAndroid Build Coastguard Worker 31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 32 32*c0909341SAndroid Build Coastguard Worker 33*c0909341SAndroid Build Coastguard Workerwiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12 34*c0909341SAndroid Build Coastguard Workerwiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13 35*c0909341SAndroid Build Coastguard Workerwiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 36*c0909341SAndroid Build Coastguard Workerwiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 37*c0909341SAndroid Build Coastguard Workerwiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47 38*c0909341SAndroid Build Coastguard Worker db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63 39*c0909341SAndroid Build Coastguard Workersgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, 
-1, 8 40*c0909341SAndroid Build Coastguard Worker db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128 41*c0909341SAndroid Build Coastguard Workersgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 42*c0909341SAndroid Build Coastguard Workerr_ext_mask: times 68 db -1 43*c0909341SAndroid Build Coastguard Worker times 4 db 0 44*c0909341SAndroid Build Coastguard Workerwiener_x_shuf: db 0, 2, -1, 0 45*c0909341SAndroid Build Coastguard Workerwiener_x_add: db 0, 1,127, 0 46*c0909341SAndroid Build Coastguard Worker 47*c0909341SAndroid Build Coastguard Workerpw_61448: times 2 dw 61448 48*c0909341SAndroid Build Coastguard Workerpw_164_455: dw 164, 455 49*c0909341SAndroid Build Coastguard Workerpd_m16380: dd -16380 50*c0909341SAndroid Build Coastguard Workerpd_m4096: dd -4096 51*c0909341SAndroid Build Coastguard Workerpd_m25 dd -25 52*c0909341SAndroid Build Coastguard Workerpd_m9: dd -9 53*c0909341SAndroid Build Coastguard Workerpd_34816: dd 34816 54*c0909341SAndroid Build Coastguard Workerpd_8421376: dd 8421376 55*c0909341SAndroid Build Coastguard Worker 56*c0909341SAndroid Build Coastguard Workercextern sgr_x_by_x 57*c0909341SAndroid Build Coastguard Worker 58*c0909341SAndroid Build Coastguard WorkerSECTION .text 59*c0909341SAndroid Build Coastguard Worker 60*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers 61*c0909341SAndroid Build Coastguard Worker 62*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl 63*c0909341SAndroid Build Coastguard Workercglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \ 64*c0909341SAndroid Build Coastguard Worker w, h, edge, flt 65*c0909341SAndroid Build Coastguard Worker mov fltq, r6mp 66*c0909341SAndroid Build Coastguard Worker mov wd, wm 67*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 68*c0909341SAndroid Build Coastguard Worker mov edged, r7m 69*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, 
[wiener_shufA] 70*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [wiener_shufB] 71*c0909341SAndroid Build Coastguard Worker mov r10d, 0xfffe 72*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [wiener_shufC] 73*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m9, [wiener_shufD] 74*c0909341SAndroid Build Coastguard Worker kmovw k1, r10d 75*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [wiener_x_shuf] 76*c0909341SAndroid Build Coastguard Worker vpbroadcastd m1, [wiener_x_add] 77*c0909341SAndroid Build Coastguard Worker mov r10, 0xaaaaaaaaaaaaaaaa 78*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [fltq+ 0] 79*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [fltq+ 4] 80*c0909341SAndroid Build Coastguard Worker kmovq k2, r10 81*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_m16380] 82*c0909341SAndroid Build Coastguard Worker packsswb m11, m11 ; x0 x1 x0 x1 83*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [fltq+16] 84*c0909341SAndroid Build Coastguard Worker pshufb m12, m0 85*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [fltq+20] 86*c0909341SAndroid Build Coastguard Worker paddb m12, m1 ; x2 x3+1 x2 127 87*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pd_8421376] 88*c0909341SAndroid Build Coastguard Worker psllw m14, 5 ; y0 y1 89*c0909341SAndroid Build Coastguard Worker psllw m15, 5 ; y2 y3 90*c0909341SAndroid Build Coastguard Worker cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32 91*c0909341SAndroid Build Coastguard Worker jle .w32 ; pixels, so we need a special case for small widths 92*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+16] 93*c0909341SAndroid Build Coastguard Worker add lpfq, wq 94*c0909341SAndroid Build Coastguard Worker add dstq, wq 95*c0909341SAndroid Build Coastguard Worker neg wq 96*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 97*c0909341SAndroid Build Coastguard Worker jz 
.no_top 98*c0909341SAndroid Build Coastguard Worker call .h_top 99*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 100*c0909341SAndroid Build Coastguard Worker mov t6, t1 101*c0909341SAndroid Build Coastguard Worker mov t5, t1 102*c0909341SAndroid Build Coastguard Worker add t1, 384*2 103*c0909341SAndroid Build Coastguard Worker call .h_top 104*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 105*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 106*c0909341SAndroid Build Coastguard Worker mov t4, t1 107*c0909341SAndroid Build Coastguard Worker add t1, 384*2 108*c0909341SAndroid Build Coastguard Worker add r10, strideq 109*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 110*c0909341SAndroid Build Coastguard Worker call .h 111*c0909341SAndroid Build Coastguard Worker mov t3, t1 112*c0909341SAndroid Build Coastguard Worker mov t2, t1 113*c0909341SAndroid Build Coastguard Worker dec hd 114*c0909341SAndroid Build Coastguard Worker jz .v1 115*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 116*c0909341SAndroid Build Coastguard Worker add t1, 384*2 117*c0909341SAndroid Build Coastguard Worker call .h 118*c0909341SAndroid Build Coastguard Worker mov t2, t1 119*c0909341SAndroid Build Coastguard Worker dec hd 120*c0909341SAndroid Build Coastguard Worker jz .v2 121*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 122*c0909341SAndroid Build Coastguard Worker add t1, 384*2 123*c0909341SAndroid Build Coastguard Worker call .h 124*c0909341SAndroid Build Coastguard Worker dec hd 125*c0909341SAndroid Build Coastguard Worker jz .v3 126*c0909341SAndroid Build Coastguard Worker.main: 127*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 128*c0909341SAndroid Build Coastguard Worker.main_loop: 129*c0909341SAndroid Build Coastguard Worker call .hv 130*c0909341SAndroid Build Coastguard Worker dec hd 131*c0909341SAndroid Build Coastguard Worker jnz .main_loop 132*c0909341SAndroid Build Coastguard Worker test 
edgeb, 8 ; LR_HAVE_BOTTOM 133*c0909341SAndroid Build Coastguard Worker jz .v3 134*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 135*c0909341SAndroid Build Coastguard Worker call .hv_bottom 136*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 137*c0909341SAndroid Build Coastguard Worker call .hv_bottom 138*c0909341SAndroid Build Coastguard Worker.v1: 139*c0909341SAndroid Build Coastguard Worker call .v 140*c0909341SAndroid Build Coastguard Worker RET 141*c0909341SAndroid Build Coastguard Worker.no_top: 142*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 143*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 144*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 145*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 146*c0909341SAndroid Build Coastguard Worker call .h 147*c0909341SAndroid Build Coastguard Worker mov t6, t1 148*c0909341SAndroid Build Coastguard Worker mov t5, t1 149*c0909341SAndroid Build Coastguard Worker mov t4, t1 150*c0909341SAndroid Build Coastguard Worker mov t3, t1 151*c0909341SAndroid Build Coastguard Worker mov t2, t1 152*c0909341SAndroid Build Coastguard Worker dec hd 153*c0909341SAndroid Build Coastguard Worker jz .v1 154*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 155*c0909341SAndroid Build Coastguard Worker add t1, 384*2 156*c0909341SAndroid Build Coastguard Worker call .h 157*c0909341SAndroid Build Coastguard Worker mov t2, t1 158*c0909341SAndroid Build Coastguard Worker dec hd 159*c0909341SAndroid Build Coastguard Worker jz .v2 160*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 161*c0909341SAndroid Build Coastguard Worker add t1, 384*2 162*c0909341SAndroid Build Coastguard Worker call .h 163*c0909341SAndroid Build Coastguard Worker dec hd 164*c0909341SAndroid Build Coastguard Worker jz .v3 165*c0909341SAndroid Build Coastguard Worker lea t0, [t1+384*2] 166*c0909341SAndroid Build Coastguard Worker call .hv 167*c0909341SAndroid Build Coastguard 
Worker dec hd 168*c0909341SAndroid Build Coastguard Worker jz .v3 169*c0909341SAndroid Build Coastguard Worker add t0, 384*8 170*c0909341SAndroid Build Coastguard Worker call .hv 171*c0909341SAndroid Build Coastguard Worker dec hd 172*c0909341SAndroid Build Coastguard Worker jnz .main 173*c0909341SAndroid Build Coastguard Worker.v3: 174*c0909341SAndroid Build Coastguard Worker call .v 175*c0909341SAndroid Build Coastguard Worker.v2: 176*c0909341SAndroid Build Coastguard Worker call .v 177*c0909341SAndroid Build Coastguard Worker jmp .v1 178*c0909341SAndroid Build Coastguard Worker.h: 179*c0909341SAndroid Build Coastguard Worker mov r10, wq 180*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 181*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 182*c0909341SAndroid Build Coastguard Worker movd xm16, [leftq] 183*c0909341SAndroid Build Coastguard Worker vmovdqu32 m16{k1}, [lpfq+r10-4] 184*c0909341SAndroid Build Coastguard Worker add leftq, 4 185*c0909341SAndroid Build Coastguard Worker jmp .h_main 186*c0909341SAndroid Build Coastguard Worker.h_extend_left: 187*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception 188*c0909341SAndroid Build Coastguard Worker vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory 189*c0909341SAndroid Build Coastguard Worker jmp .h_main 190*c0909341SAndroid Build Coastguard Worker.h_top: 191*c0909341SAndroid Build Coastguard Worker mov r10, wq 192*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 193*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 194*c0909341SAndroid Build Coastguard Worker.h_loop: 195*c0909341SAndroid Build Coastguard Worker movu m16, [lpfq+r10-4] 196*c0909341SAndroid Build Coastguard Worker.h_main: 197*c0909341SAndroid Build Coastguard Worker movu m17, [lpfq+r10+4] 198*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 199*c0909341SAndroid Build Coastguard 
Worker jnz .h_have_right 200*c0909341SAndroid Build Coastguard Worker cmp r10d, -66 201*c0909341SAndroid Build Coastguard Worker jl .h_have_right 202*c0909341SAndroid Build Coastguard Worker push r0 203*c0909341SAndroid Build Coastguard Worker lea r0, [r_ext_mask+65] 204*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, [lpfq-1] 205*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b 206*c0909341SAndroid Build Coastguard Worker vpternlogd m17, m0, [r0+r10+8], 0xe4 207*c0909341SAndroid Build Coastguard Worker pop r0 208*c0909341SAndroid Build Coastguard Worker.h_have_right: 209*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m6 210*c0909341SAndroid Build Coastguard Worker mova m0, m10 211*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m4, m11 212*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m7 213*c0909341SAndroid Build Coastguard Worker mova m2, m10 214*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m4, m11 215*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m6 216*c0909341SAndroid Build Coastguard Worker mova m1, m10 217*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m11 218*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m7 219*c0909341SAndroid Build Coastguard Worker mova m3, m10 220*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m4, m11 221*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m8 222*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m4, m12 223*c0909341SAndroid Build Coastguard Worker pshufb m16, m9 224*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m16, m12 225*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m8 226*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m12 227*c0909341SAndroid Build Coastguard Worker pshufb m17, m9 228*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m17, m12 229*c0909341SAndroid Build Coastguard Worker packssdw m0, m2 230*c0909341SAndroid Build Coastguard Worker 
packssdw m1, m3 231*c0909341SAndroid Build Coastguard Worker psraw m0, 3 232*c0909341SAndroid Build Coastguard Worker psraw m1, 3 233*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+ 0], m0 234*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+64], m1 235*c0909341SAndroid Build Coastguard Worker add r10, 64 236*c0909341SAndroid Build Coastguard Worker jl .h_loop 237*c0909341SAndroid Build Coastguard Worker ret 238*c0909341SAndroid Build Coastguard WorkerALIGN function_align 239*c0909341SAndroid Build Coastguard Worker.hv: 240*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 241*c0909341SAndroid Build Coastguard Worker mov r10, wq 242*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 243*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 244*c0909341SAndroid Build Coastguard Worker movd xm16, [leftq] 245*c0909341SAndroid Build Coastguard Worker vmovdqu32 m16{k1}, [lpfq+r10-4] 246*c0909341SAndroid Build Coastguard Worker add leftq, 4 247*c0909341SAndroid Build Coastguard Worker jmp .hv_main 248*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 249*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm16, [lpfq+r10] 250*c0909341SAndroid Build Coastguard Worker vmovdqu32 m16{k1}, [lpfq+r10-4] 251*c0909341SAndroid Build Coastguard Worker jmp .hv_main 252*c0909341SAndroid Build Coastguard Worker.hv_bottom: 253*c0909341SAndroid Build Coastguard Worker mov r10, wq 254*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 255*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 256*c0909341SAndroid Build Coastguard Worker.hv_loop: 257*c0909341SAndroid Build Coastguard Worker movu m16, [lpfq+r10-4] 258*c0909341SAndroid Build Coastguard Worker.hv_main: 259*c0909341SAndroid Build Coastguard Worker movu m17, [lpfq+r10+4] 260*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 261*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 262*c0909341SAndroid Build Coastguard 
Worker cmp r10d, -66 263*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 264*c0909341SAndroid Build Coastguard Worker push r0 265*c0909341SAndroid Build Coastguard Worker lea r0, [r_ext_mask+65] 266*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, [lpfq-1] 267*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b 268*c0909341SAndroid Build Coastguard Worker vpternlogd m17, m0, [r0+r10+8], 0xe4 269*c0909341SAndroid Build Coastguard Worker pop r0 270*c0909341SAndroid Build Coastguard Worker.hv_have_right: 271*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m6 272*c0909341SAndroid Build Coastguard Worker mova m0, m10 273*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m4, m11 274*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m7 275*c0909341SAndroid Build Coastguard Worker mova m2, m10 276*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m4, m11 277*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m6 278*c0909341SAndroid Build Coastguard Worker mova m1, m10 279*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m11 280*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m7 281*c0909341SAndroid Build Coastguard Worker mova m3, m10 282*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m4, m11 283*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m8 284*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m4, m12 285*c0909341SAndroid Build Coastguard Worker pshufb m16, m9 286*c0909341SAndroid Build Coastguard Worker vpdpbusd m2, m16, m12 287*c0909341SAndroid Build Coastguard Worker pshufb m4, m17, m8 288*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m4, m12 289*c0909341SAndroid Build Coastguard Worker pshufb m17, m9 290*c0909341SAndroid Build Coastguard Worker vpdpbusd m3, m17, m12 291*c0909341SAndroid Build Coastguard Worker packssdw m0, m2 292*c0909341SAndroid Build Coastguard Worker packssdw m1, m3 293*c0909341SAndroid Build Coastguard Worker 
psraw m0, 3 294*c0909341SAndroid Build Coastguard Worker psraw m1, 3 295*c0909341SAndroid Build Coastguard Worker mova m16, [t4+r10*2] 296*c0909341SAndroid Build Coastguard Worker paddw m16, [t2+r10*2] 297*c0909341SAndroid Build Coastguard Worker mova m3, [t3+r10*2] 298*c0909341SAndroid Build Coastguard Worker mova m17, [t4+r10*2+64] 299*c0909341SAndroid Build Coastguard Worker paddw m17, [t2+r10*2+64] 300*c0909341SAndroid Build Coastguard Worker mova m5, [t3+r10*2+64] 301*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m16, m3 302*c0909341SAndroid Build Coastguard Worker mova m2, m13 303*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m4, m15 304*c0909341SAndroid Build Coastguard Worker punpcklwd m18, m17, m5 305*c0909341SAndroid Build Coastguard Worker mova m4, m13 306*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m18, m15 307*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m3 308*c0909341SAndroid Build Coastguard Worker mova m3, m13 309*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m16, m15 310*c0909341SAndroid Build Coastguard Worker punpckhwd m17, m5 311*c0909341SAndroid Build Coastguard Worker mova m5, m13 312*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m17, m15 313*c0909341SAndroid Build Coastguard Worker mova m17, [t5+r10*2] 314*c0909341SAndroid Build Coastguard Worker paddw m17, [t1+r10*2] 315*c0909341SAndroid Build Coastguard Worker paddw m16, m0, [t6+r10*2] 316*c0909341SAndroid Build Coastguard Worker mova m19, [t5+r10*2+64] 317*c0909341SAndroid Build Coastguard Worker paddw m19, [t1+r10*2+64] 318*c0909341SAndroid Build Coastguard Worker paddw m18, m1, [t6+r10*2+64] 319*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+ 0], m0 320*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+64], m1 321*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m16, m17 322*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m0, m14 323*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m18, m19 
324*c0909341SAndroid Build Coastguard Worker vpdpwssd m4, m1, m14 325*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m17 326*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m16, m14 327*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m19 328*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m18, m14 329*c0909341SAndroid Build Coastguard Worker packuswb m2, m4 330*c0909341SAndroid Build Coastguard Worker psrlw m2, 8 331*c0909341SAndroid Build Coastguard Worker vpackuswb m2{k2}, m3, m5 332*c0909341SAndroid Build Coastguard Worker movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap 333*c0909341SAndroid Build Coastguard Worker add r10, 64 ; function is used for chroma as well, and in some 334*c0909341SAndroid Build Coastguard Worker jl .hv_loop ; esoteric edge cases chroma dst pointers may only 335*c0909341SAndroid Build Coastguard Worker mov t6, t5 ; have a 32-byte alignment despite having a width 336*c0909341SAndroid Build Coastguard Worker mov t5, t4 ; larger than 32, so use an unaligned store here. 
337*c0909341SAndroid Build Coastguard Worker mov t4, t3 338*c0909341SAndroid Build Coastguard Worker mov t3, t2 339*c0909341SAndroid Build Coastguard Worker mov t2, t1 340*c0909341SAndroid Build Coastguard Worker mov t1, t0 341*c0909341SAndroid Build Coastguard Worker mov t0, t6 342*c0909341SAndroid Build Coastguard Worker add dstq, strideq 343*c0909341SAndroid Build Coastguard Worker ret 344*c0909341SAndroid Build Coastguard Worker.v: 345*c0909341SAndroid Build Coastguard Worker mov r10, wq 346*c0909341SAndroid Build Coastguard Worker.v_loop: 347*c0909341SAndroid Build Coastguard Worker mova m4, [t4+r10*2+ 0] 348*c0909341SAndroid Build Coastguard Worker paddw m4, [t2+r10*2+ 0] 349*c0909341SAndroid Build Coastguard Worker mova m1, [t3+r10*2+ 0] 350*c0909341SAndroid Build Coastguard Worker mova m5, [t4+r10*2+64] 351*c0909341SAndroid Build Coastguard Worker paddw m5, [t2+r10*2+64] 352*c0909341SAndroid Build Coastguard Worker mova m3, [t3+r10*2+64] 353*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m4, m1 354*c0909341SAndroid Build Coastguard Worker mova m0, m13 355*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m6, m15 356*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m5, m3 357*c0909341SAndroid Build Coastguard Worker mova m2, m13 358*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m6, m15 359*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m1 360*c0909341SAndroid Build Coastguard Worker mova m1, m13 361*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m15 362*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3 363*c0909341SAndroid Build Coastguard Worker mova m3, m13 364*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m5, m15 365*c0909341SAndroid Build Coastguard Worker mova m5, [t1+r10*2+ 0] 366*c0909341SAndroid Build Coastguard Worker paddw m4, m5, [t6+r10*2+ 0] 367*c0909341SAndroid Build Coastguard Worker paddw m5, [t5+r10*2+ 0] 368*c0909341SAndroid Build Coastguard Worker mova m7, [t1+r10*2+64] 
369*c0909341SAndroid Build Coastguard Worker paddw m6, m7, [t6+r10*2+64] 370*c0909341SAndroid Build Coastguard Worker paddw m7, [t5+r10*2+64] 371*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m4, m5 372*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m8, m14 373*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m6, m7 374*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m8, m14 375*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m5 376*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m14 377*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m7 378*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m6, m14 379*c0909341SAndroid Build Coastguard Worker packuswb m0, m2 380*c0909341SAndroid Build Coastguard Worker psrlw m0, 8 381*c0909341SAndroid Build Coastguard Worker vpackuswb m0{k2}, m1, m3 382*c0909341SAndroid Build Coastguard Worker movu [dstq+r10], m0 383*c0909341SAndroid Build Coastguard Worker add r10, 64 384*c0909341SAndroid Build Coastguard Worker jl .v_loop 385*c0909341SAndroid Build Coastguard Worker mov t6, t5 386*c0909341SAndroid Build Coastguard Worker mov t5, t4 387*c0909341SAndroid Build Coastguard Worker mov t4, t3 388*c0909341SAndroid Build Coastguard Worker mov t3, t2 389*c0909341SAndroid Build Coastguard Worker mov t2, t1 390*c0909341SAndroid Build Coastguard Worker add dstq, strideq 391*c0909341SAndroid Build Coastguard Worker ret 392*c0909341SAndroid Build Coastguard Worker.w32: 393*c0909341SAndroid Build Coastguard Worker lea r10, [r_ext_mask+73] 394*c0909341SAndroid Build Coastguard Worker mova ym18, [wiener_perm32] 395*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+16] 396*c0909341SAndroid Build Coastguard Worker sub r10, wq 397*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 398*c0909341SAndroid Build Coastguard Worker jz .w32_no_top 399*c0909341SAndroid Build Coastguard Worker call .w32_h_top 400*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 
401*c0909341SAndroid Build Coastguard Worker mov t6, t1 402*c0909341SAndroid Build Coastguard Worker mov t5, t1 403*c0909341SAndroid Build Coastguard Worker add t1, 32*2 404*c0909341SAndroid Build Coastguard Worker call .w32_h_top 405*c0909341SAndroid Build Coastguard Worker lea r9, [lpfq+strideq*4] 406*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 407*c0909341SAndroid Build Coastguard Worker mov t4, t1 408*c0909341SAndroid Build Coastguard Worker add t1, 32*2 409*c0909341SAndroid Build Coastguard Worker add r9, strideq 410*c0909341SAndroid Build Coastguard Worker mov [rsp], r9 ; below 411*c0909341SAndroid Build Coastguard Worker call .w32_h 412*c0909341SAndroid Build Coastguard Worker mov t3, t1 413*c0909341SAndroid Build Coastguard Worker mov t2, t1 414*c0909341SAndroid Build Coastguard Worker dec hd 415*c0909341SAndroid Build Coastguard Worker jz .w32_v1 416*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 417*c0909341SAndroid Build Coastguard Worker add t1, 32*2 418*c0909341SAndroid Build Coastguard Worker call .w32_h 419*c0909341SAndroid Build Coastguard Worker mov t2, t1 420*c0909341SAndroid Build Coastguard Worker dec hd 421*c0909341SAndroid Build Coastguard Worker jz .w32_v2 422*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 423*c0909341SAndroid Build Coastguard Worker add t1, 32*2 424*c0909341SAndroid Build Coastguard Worker call .w32_h 425*c0909341SAndroid Build Coastguard Worker dec hd 426*c0909341SAndroid Build Coastguard Worker jz .w32_v3 427*c0909341SAndroid Build Coastguard Worker.w32_main: 428*c0909341SAndroid Build Coastguard Worker lea t0, [t1+32*2] 429*c0909341SAndroid Build Coastguard Worker.w32_main_loop: 430*c0909341SAndroid Build Coastguard Worker call .w32_hv 431*c0909341SAndroid Build Coastguard Worker dec hd 432*c0909341SAndroid Build Coastguard Worker jnz .w32_main_loop 433*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 434*c0909341SAndroid Build Coastguard Worker jz .w32_v3 
435*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 436*c0909341SAndroid Build Coastguard Worker call .w32_hv_bottom 437*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 438*c0909341SAndroid Build Coastguard Worker call .w32_hv_bottom 439*c0909341SAndroid Build Coastguard Worker.w32_v1: 440*c0909341SAndroid Build Coastguard Worker call .w32_v 441*c0909341SAndroid Build Coastguard Worker RET 442*c0909341SAndroid Build Coastguard Worker.w32_no_top: 443*c0909341SAndroid Build Coastguard Worker lea r9, [lpfq+strideq*4] 444*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 445*c0909341SAndroid Build Coastguard Worker lea r9, [r9+strideq*2] 446*c0909341SAndroid Build Coastguard Worker mov [rsp], r9 447*c0909341SAndroid Build Coastguard Worker call .w32_h 448*c0909341SAndroid Build Coastguard Worker mov t6, t1 449*c0909341SAndroid Build Coastguard Worker mov t5, t1 450*c0909341SAndroid Build Coastguard Worker mov t4, t1 451*c0909341SAndroid Build Coastguard Worker mov t3, t1 452*c0909341SAndroid Build Coastguard Worker mov t2, t1 453*c0909341SAndroid Build Coastguard Worker dec hd 454*c0909341SAndroid Build Coastguard Worker jz .w32_v1 455*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 456*c0909341SAndroid Build Coastguard Worker add t1, 32*2 457*c0909341SAndroid Build Coastguard Worker call .w32_h 458*c0909341SAndroid Build Coastguard Worker mov t2, t1 459*c0909341SAndroid Build Coastguard Worker dec hd 460*c0909341SAndroid Build Coastguard Worker jz .w32_v2 461*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 462*c0909341SAndroid Build Coastguard Worker add t1, 32*2 463*c0909341SAndroid Build Coastguard Worker call .w32_h 464*c0909341SAndroid Build Coastguard Worker dec hd 465*c0909341SAndroid Build Coastguard Worker jz .w32_v3 466*c0909341SAndroid Build Coastguard Worker lea t0, [t1+32*2] 467*c0909341SAndroid Build Coastguard Worker call .w32_hv 468*c0909341SAndroid Build Coastguard Worker dec hd 469*c0909341SAndroid Build 
Coastguard Worker jz .w32_v3 470*c0909341SAndroid Build Coastguard Worker add t0, 32*8 471*c0909341SAndroid Build Coastguard Worker call .w32_hv 472*c0909341SAndroid Build Coastguard Worker dec hd 473*c0909341SAndroid Build Coastguard Worker jnz .w32_main 474*c0909341SAndroid Build Coastguard Worker.w32_v3: 475*c0909341SAndroid Build Coastguard Worker call .w32_v 476*c0909341SAndroid Build Coastguard Worker.w32_v2: 477*c0909341SAndroid Build Coastguard Worker call .w32_v 478*c0909341SAndroid Build Coastguard Worker jmp .w32_v1 479*c0909341SAndroid Build Coastguard Worker.w32_h: 480*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 481*c0909341SAndroid Build Coastguard Worker jz .w32_h_extend_left 482*c0909341SAndroid Build Coastguard Worker movd xm16, [leftq] 483*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym16{k1}, [lpfq-4] 484*c0909341SAndroid Build Coastguard Worker add leftq, 4 485*c0909341SAndroid Build Coastguard Worker jmp .w32_h_main 486*c0909341SAndroid Build Coastguard Worker.w32_h_extend_left: 487*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception 488*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory 489*c0909341SAndroid Build Coastguard Worker jmp .w32_h_main 490*c0909341SAndroid Build Coastguard Worker.w32_h_top: 491*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 492*c0909341SAndroid Build Coastguard Worker jz .w32_h_extend_left 493*c0909341SAndroid Build Coastguard Worker movu ym16, [lpfq-4] 494*c0909341SAndroid Build Coastguard Worker.w32_h_main: 495*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, [lpfq+4], 1 496*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 497*c0909341SAndroid Build Coastguard Worker jnz .w32_h_have_right 498*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, [lpfq+wq-1] 499*c0909341SAndroid Build Coastguard Worker 
movu ym17, [r10-8] 500*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, [r10+0], 1 501*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, m17, 0xe4 ; c ? a : b 502*c0909341SAndroid Build Coastguard Worker.w32_h_have_right: 503*c0909341SAndroid Build Coastguard Worker pshufb m2, m16, m6 504*c0909341SAndroid Build Coastguard Worker mova m0, m10 505*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m2, m11 506*c0909341SAndroid Build Coastguard Worker pshufb m2, m16, m7 507*c0909341SAndroid Build Coastguard Worker mova m1, m10 508*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m2, m11 509*c0909341SAndroid Build Coastguard Worker pshufb m2, m16, m8 510*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m2, m12 511*c0909341SAndroid Build Coastguard Worker pshufb m16, m9 512*c0909341SAndroid Build Coastguard Worker vpdpbusd m1, m16, m12 513*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 514*c0909341SAndroid Build Coastguard Worker psraw m0, 3 515*c0909341SAndroid Build Coastguard Worker mova [t1], m0 516*c0909341SAndroid Build Coastguard Worker ret 517*c0909341SAndroid Build Coastguard Worker.w32_hv: 518*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 519*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 520*c0909341SAndroid Build Coastguard Worker jz .w32_hv_extend_left 521*c0909341SAndroid Build Coastguard Worker movd xm16, [leftq] 522*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym16{k1}, [lpfq-4] 523*c0909341SAndroid Build Coastguard Worker add leftq, 4 524*c0909341SAndroid Build Coastguard Worker jmp .w32_hv_main 525*c0909341SAndroid Build Coastguard Worker.w32_hv_extend_left: 526*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm16, [lpfq] 527*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym16{k1}, [lpfq-4] 528*c0909341SAndroid Build Coastguard Worker jmp .w32_hv_main 529*c0909341SAndroid Build Coastguard Worker.w32_hv_bottom: 530*c0909341SAndroid Build Coastguard Worker test 
edgeb, 1 ; LR_HAVE_LEFT 531*c0909341SAndroid Build Coastguard Worker jz .w32_hv_extend_left 532*c0909341SAndroid Build Coastguard Worker movu ym16, [lpfq-4] 533*c0909341SAndroid Build Coastguard Worker.w32_hv_main: 534*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, [lpfq+4], 1 535*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 536*c0909341SAndroid Build Coastguard Worker jnz .w32_hv_have_right 537*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, [lpfq+wq-1] 538*c0909341SAndroid Build Coastguard Worker movu ym17, [r10-8] 539*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, [r10+0], 1 540*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, m17, 0xe4 541*c0909341SAndroid Build Coastguard Worker.w32_hv_have_right: 542*c0909341SAndroid Build Coastguard Worker mova m3, [t4] 543*c0909341SAndroid Build Coastguard Worker paddw m3, [t2] 544*c0909341SAndroid Build Coastguard Worker mova m2, [t3] 545*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m6 546*c0909341SAndroid Build Coastguard Worker mova m0, m10 547*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m4, m11 548*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m7 549*c0909341SAndroid Build Coastguard Worker mova m5, m10 550*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m4, m11 551*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m3, m2 552*c0909341SAndroid Build Coastguard Worker mova m1, m13 553*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m4, m15 554*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m2 555*c0909341SAndroid Build Coastguard Worker mova m2, m13 556*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m3, m15 557*c0909341SAndroid Build Coastguard Worker pshufb m4, m16, m8 558*c0909341SAndroid Build Coastguard Worker vpdpbusd m0, m4, m12 559*c0909341SAndroid Build Coastguard Worker pshufb m16, m9 560*c0909341SAndroid Build Coastguard Worker vpdpbusd m5, m16, m12 561*c0909341SAndroid Build 
Coastguard Worker packssdw m0, m5 562*c0909341SAndroid Build Coastguard Worker psraw m0, 3 563*c0909341SAndroid Build Coastguard Worker mova m4, [t5] 564*c0909341SAndroid Build Coastguard Worker paddw m4, [t1] 565*c0909341SAndroid Build Coastguard Worker paddw m3, m0, [t6] 566*c0909341SAndroid Build Coastguard Worker mova [t0], m0 567*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m3, m4 568*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m0, m14 569*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 570*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m3, m14 571*c0909341SAndroid Build Coastguard Worker packuswb m1, m2 572*c0909341SAndroid Build Coastguard Worker vpermb m16, m18, m1 573*c0909341SAndroid Build Coastguard Worker mova [dstq], ym16 574*c0909341SAndroid Build Coastguard Worker mov t6, t5 575*c0909341SAndroid Build Coastguard Worker mov t5, t4 576*c0909341SAndroid Build Coastguard Worker mov t4, t3 577*c0909341SAndroid Build Coastguard Worker mov t3, t2 578*c0909341SAndroid Build Coastguard Worker mov t2, t1 579*c0909341SAndroid Build Coastguard Worker mov t1, t0 580*c0909341SAndroid Build Coastguard Worker mov t0, t6 581*c0909341SAndroid Build Coastguard Worker add dstq, strideq 582*c0909341SAndroid Build Coastguard Worker ret 583*c0909341SAndroid Build Coastguard Worker.w32_v: 584*c0909341SAndroid Build Coastguard Worker mova m2, [t4] 585*c0909341SAndroid Build Coastguard Worker paddw m2, [t2] 586*c0909341SAndroid Build Coastguard Worker mova m1, [t3] 587*c0909341SAndroid Build Coastguard Worker mova m4, [t1] 588*c0909341SAndroid Build Coastguard Worker paddw m3, m4, [t6] 589*c0909341SAndroid Build Coastguard Worker paddw m4, [t5] 590*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m2, m1 591*c0909341SAndroid Build Coastguard Worker mova m0, m13 592*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m5, m15 593*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m1 594*c0909341SAndroid Build Coastguard Worker 
mova m1, m13 595*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m2, m15 596*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m4 597*c0909341SAndroid Build Coastguard Worker vpdpwssd m0, m2, m14 598*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 599*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m3, m14 600*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 601*c0909341SAndroid Build Coastguard Worker vpermb m16, m18, m0 602*c0909341SAndroid Build Coastguard Worker mova [dstq], ym16 603*c0909341SAndroid Build Coastguard Worker mov t6, t5 604*c0909341SAndroid Build Coastguard Worker mov t5, t4 605*c0909341SAndroid Build Coastguard Worker mov t4, t3 606*c0909341SAndroid Build Coastguard Worker mov t3, t2 607*c0909341SAndroid Build Coastguard Worker mov t2, t1 608*c0909341SAndroid Build Coastguard Worker add dstq, strideq 609*c0909341SAndroid Build Coastguard Worker ret 610*c0909341SAndroid Build Coastguard Worker 611*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \ 612*c0909341SAndroid Build Coastguard Worker w, h, edge, params 613*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 614*c0909341SAndroid Build Coastguard Worker mov wd, wm 615*c0909341SAndroid Build Coastguard Worker mov hd, hm 616*c0909341SAndroid Build Coastguard Worker mov edged, r7m 617*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [sgr_shuf+1] 618*c0909341SAndroid Build Coastguard Worker add lpfq, wq 619*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m6, [sgr_shuf+9] 620*c0909341SAndroid Build Coastguard Worker add dstq, wq 621*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m7, [sgr_shuf+3] 622*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+16+416*12] 623*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [sgr_shuf+7] 624*c0909341SAndroid Build Coastguard Worker pxor m4, m4 625*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd m9, [pd_m25] 626*c0909341SAndroid Build Coastguard Worker vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 627*c0909341SAndroid Build Coastguard Worker vpbroadcastw m15, [paramsq+8] ; w0 628*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+20] 629*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pw_164_455] 630*c0909341SAndroid Build Coastguard Worker neg wq 631*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) 632*c0909341SAndroid Build Coastguard Worker mov r10d, 0xfe 633*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pd_m4096] 634*c0909341SAndroid Build Coastguard Worker kmovb k1, r10d 635*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) 636*c0909341SAndroid Build Coastguard Worker mov r10, 0x3333333333333333 637*c0909341SAndroid Build Coastguard Worker mova m18, [sgr_x_by_x+64*0] 638*c0909341SAndroid Build Coastguard Worker kmovq k2, r10 639*c0909341SAndroid Build Coastguard Worker mova m19, [sgr_x_by_x+64*1] 640*c0909341SAndroid Build Coastguard Worker lea r12, [r_ext_mask+75] 641*c0909341SAndroid Build Coastguard Worker mova m20, [sgr_x_by_x+64*2] 642*c0909341SAndroid Build Coastguard Worker psllw m15, 4 643*c0909341SAndroid Build Coastguard Worker mova m21, [sgr_x_by_x+64*3] 644*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 645*c0909341SAndroid Build Coastguard Worker mova ym22, [sgr_shuf] 646*c0909341SAndroid Build Coastguard Worker add r10, strideq 647*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 648*c0909341SAndroid Build Coastguard Worker test edgeb, 4 ; LR_HAVE_TOP 649*c0909341SAndroid Build Coastguard Worker jz .no_top 650*c0909341SAndroid Build Coastguard Worker call .h_top 651*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 652*c0909341SAndroid Build Coastguard Worker mov t2, t1 653*c0909341SAndroid Build Coastguard Worker call .top_fixup 654*c0909341SAndroid Build 
Coastguard Worker add t1, 416*6 655*c0909341SAndroid Build Coastguard Worker call .h_top 656*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 657*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 658*c0909341SAndroid Build Coastguard Worker add r10, strideq 659*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 ; below 660*c0909341SAndroid Build Coastguard Worker mov t0, t2 661*c0909341SAndroid Build Coastguard Worker dec hd 662*c0909341SAndroid Build Coastguard Worker jz .height1 663*c0909341SAndroid Build Coastguard Worker or edged, 16 664*c0909341SAndroid Build Coastguard Worker call .h 665*c0909341SAndroid Build Coastguard Worker.main: 666*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 667*c0909341SAndroid Build Coastguard Worker call .hv 668*c0909341SAndroid Build Coastguard Worker call .prep_n 669*c0909341SAndroid Build Coastguard Worker sub hd, 2 670*c0909341SAndroid Build Coastguard Worker jl .extend_bottom 671*c0909341SAndroid Build Coastguard Worker.main_loop: 672*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 673*c0909341SAndroid Build Coastguard Worker test hd, hd 674*c0909341SAndroid Build Coastguard Worker jz .odd_height 675*c0909341SAndroid Build Coastguard Worker call .h 676*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 677*c0909341SAndroid Build Coastguard Worker call .hv 678*c0909341SAndroid Build Coastguard Worker call .n0 679*c0909341SAndroid Build Coastguard Worker call .n1 680*c0909341SAndroid Build Coastguard Worker sub hd, 2 681*c0909341SAndroid Build Coastguard Worker jge .main_loop 682*c0909341SAndroid Build Coastguard Worker test edgeb, 8 ; LR_HAVE_BOTTOM 683*c0909341SAndroid Build Coastguard Worker jz .extend_bottom 684*c0909341SAndroid Build Coastguard Worker mov lpfq, [rsp] 685*c0909341SAndroid Build Coastguard Worker call .h_top 686*c0909341SAndroid Build Coastguard Worker add lpfq, strideq 687*c0909341SAndroid Build Coastguard Worker call .hv_bottom 
688*c0909341SAndroid Build Coastguard Worker.end: 689*c0909341SAndroid Build Coastguard Worker call .n0 690*c0909341SAndroid Build Coastguard Worker call .n1 691*c0909341SAndroid Build Coastguard Worker.end2: 692*c0909341SAndroid Build Coastguard Worker RET 693*c0909341SAndroid Build Coastguard Worker.height1: 694*c0909341SAndroid Build Coastguard Worker call .hv 695*c0909341SAndroid Build Coastguard Worker call .prep_n 696*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 697*c0909341SAndroid Build Coastguard Worker.odd_height: 698*c0909341SAndroid Build Coastguard Worker call .hv 699*c0909341SAndroid Build Coastguard Worker call .n0 700*c0909341SAndroid Build Coastguard Worker call .n1 701*c0909341SAndroid Build Coastguard Worker.odd_height_end: 702*c0909341SAndroid Build Coastguard Worker call .v 703*c0909341SAndroid Build Coastguard Worker call .n0 704*c0909341SAndroid Build Coastguard Worker jmp .end2 705*c0909341SAndroid Build Coastguard Worker.extend_bottom: 706*c0909341SAndroid Build Coastguard Worker call .v 707*c0909341SAndroid Build Coastguard Worker jmp .end 708*c0909341SAndroid Build Coastguard Worker.no_top: 709*c0909341SAndroid Build Coastguard Worker lea r10, [lpfq+strideq*4] 710*c0909341SAndroid Build Coastguard Worker mov lpfq, dstq 711*c0909341SAndroid Build Coastguard Worker lea r10, [r10+strideq*2] 712*c0909341SAndroid Build Coastguard Worker mov [rsp], r10 713*c0909341SAndroid Build Coastguard Worker call .h 714*c0909341SAndroid Build Coastguard Worker lea t2, [t1+416*6] 715*c0909341SAndroid Build Coastguard Worker call .top_fixup 716*c0909341SAndroid Build Coastguard Worker dec hd 717*c0909341SAndroid Build Coastguard Worker jz .no_top_height1 718*c0909341SAndroid Build Coastguard Worker or edged, 16 719*c0909341SAndroid Build Coastguard Worker mov t0, t1 720*c0909341SAndroid Build Coastguard Worker mov t1, t2 721*c0909341SAndroid Build Coastguard Worker jmp .main 722*c0909341SAndroid Build Coastguard Worker.no_top_height1: 
723*c0909341SAndroid Build Coastguard Worker call .v 724*c0909341SAndroid Build Coastguard Worker call .prep_n 725*c0909341SAndroid Build Coastguard Worker jmp .odd_height_end 726*c0909341SAndroid Build Coastguard Worker.h: ; horizontal boxsum 727*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 728*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 729*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 730*c0909341SAndroid Build Coastguard Worker movd xm17, [leftq] 731*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym17{k1}, [lpfq+wq-4] 732*c0909341SAndroid Build Coastguard Worker add leftq, 4 733*c0909341SAndroid Build Coastguard Worker jmp .h_main 734*c0909341SAndroid Build Coastguard Worker.h_extend_left: 735*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm17, [lpfq+wq] 736*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym17{k1}, [lpfq+wq-4] 737*c0909341SAndroid Build Coastguard Worker jmp .h_main 738*c0909341SAndroid Build Coastguard Worker.h_top: 739*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 740*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 741*c0909341SAndroid Build Coastguard Worker jz .h_extend_left 742*c0909341SAndroid Build Coastguard Worker.h_loop: 743*c0909341SAndroid Build Coastguard Worker movu ym17, [lpfq+r10-2] 744*c0909341SAndroid Build Coastguard Worker.h_main: 745*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, [lpfq+r10+6], 1 746*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 747*c0909341SAndroid Build Coastguard Worker jnz .h_have_right 748*c0909341SAndroid Build Coastguard Worker cmp r10d, -34 749*c0909341SAndroid Build Coastguard Worker jl .h_have_right 750*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, [lpfq-1] 751*c0909341SAndroid Build Coastguard Worker movu ym16, [r12+r10-8] 752*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, [r12+r10+0], 1 753*c0909341SAndroid Build Coastguard Worker 
vpternlogd m17, m0, m16, 0xe4 754*c0909341SAndroid Build Coastguard Worker.h_have_right: 755*c0909341SAndroid Build Coastguard Worker pshufb m3, m17, m5 756*c0909341SAndroid Build Coastguard Worker pmullw m2, m3, m3 757*c0909341SAndroid Build Coastguard Worker pshufb m1, m17, m6 758*c0909341SAndroid Build Coastguard Worker paddw m0, m3, m1 759*c0909341SAndroid Build Coastguard Worker shufps m3, m1, q2121 760*c0909341SAndroid Build Coastguard Worker paddw m0, m3 761*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m3, m1 762*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m1 763*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m2, m4 764*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m16, m16 765*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m4 766*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m3, m3 767*c0909341SAndroid Build Coastguard Worker pshufb m16, m17, m7 768*c0909341SAndroid Build Coastguard Worker paddw m0, m16 769*c0909341SAndroid Build Coastguard Worker pshufb m17, m8 770*c0909341SAndroid Build Coastguard Worker paddw m0, m17 ; sum 771*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m16, m17 772*c0909341SAndroid Build Coastguard Worker vpdpwssd m1, m3, m3 ; sumsq 773*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m17 774*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m16, m16 775*c0909341SAndroid Build Coastguard Worker test edgeb, 16 ; y > 0 776*c0909341SAndroid Build Coastguard Worker jz .h_loop_end 777*c0909341SAndroid Build Coastguard Worker paddw m0, [t1+r10*2+416*0] 778*c0909341SAndroid Build Coastguard Worker paddd m1, [t1+r10*2+416*2] 779*c0909341SAndroid Build Coastguard Worker paddd m2, [t1+r10*2+416*4] 780*c0909341SAndroid Build Coastguard Worker.h_loop_end: 781*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+416*0], m0 782*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+416*2], m1 783*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+416*4], m2 
784*c0909341SAndroid Build Coastguard Worker add r10, 32 785*c0909341SAndroid Build Coastguard Worker jl .h_loop 786*c0909341SAndroid Build Coastguard Worker ret 787*c0909341SAndroid Build Coastguard Worker.top_fixup: 788*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 789*c0909341SAndroid Build Coastguard Worker.top_fixup_loop: ; the sums of the first row needs to be doubled 790*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10*2+416*0] 791*c0909341SAndroid Build Coastguard Worker mova m1, [t1+r10*2+416*2] 792*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10*2+416*4] 793*c0909341SAndroid Build Coastguard Worker paddw m0, m0 794*c0909341SAndroid Build Coastguard Worker paddd m1, m1 795*c0909341SAndroid Build Coastguard Worker paddd m2, m2 796*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+416*0], m0 797*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+416*2], m1 798*c0909341SAndroid Build Coastguard Worker mova [t2+r10*2+416*4], m2 799*c0909341SAndroid Build Coastguard Worker add r10, 32 800*c0909341SAndroid Build Coastguard Worker jl .top_fixup_loop 801*c0909341SAndroid Build Coastguard Worker ret 802*c0909341SAndroid Build Coastguard WorkerALIGN function_align 803*c0909341SAndroid Build Coastguard Worker.hv: ; horizontal boxsum + vertical boxsum + ab 804*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 805*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 806*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 807*c0909341SAndroid Build Coastguard Worker movd xm17, [leftq] 808*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym17{k1}, [lpfq+wq-4] 809*c0909341SAndroid Build Coastguard Worker add leftq, 4 810*c0909341SAndroid Build Coastguard Worker jmp .hv_main 811*c0909341SAndroid Build Coastguard Worker.hv_extend_left: 812*c0909341SAndroid Build Coastguard Worker vpbroadcastb xm17, [lpfq+wq] 813*c0909341SAndroid Build Coastguard Worker vmovdqu32 ym17{k1}, [lpfq+wq-4] 
814*c0909341SAndroid Build Coastguard Worker jmp .hv_main 815*c0909341SAndroid Build Coastguard Worker.hv_bottom: 816*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 817*c0909341SAndroid Build Coastguard Worker test edgeb, 1 ; LR_HAVE_LEFT 818*c0909341SAndroid Build Coastguard Worker jz .hv_extend_left 819*c0909341SAndroid Build Coastguard Worker.hv_loop: 820*c0909341SAndroid Build Coastguard Worker movu ym17, [lpfq+r10-2] 821*c0909341SAndroid Build Coastguard Worker.hv_main: 822*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, [lpfq+r10+6], 1 823*c0909341SAndroid Build Coastguard Worker test edgeb, 2 ; LR_HAVE_RIGHT 824*c0909341SAndroid Build Coastguard Worker jnz .hv_have_right 825*c0909341SAndroid Build Coastguard Worker cmp r10d, -34 826*c0909341SAndroid Build Coastguard Worker jl .hv_have_right 827*c0909341SAndroid Build Coastguard Worker vpbroadcastb m0, [lpfq-1] 828*c0909341SAndroid Build Coastguard Worker movu ym16, [r12+r10-8] 829*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, [r12+r10+0], 1 830*c0909341SAndroid Build Coastguard Worker vpternlogd m17, m0, m16, 0xe4 831*c0909341SAndroid Build Coastguard Worker.hv_have_right: 832*c0909341SAndroid Build Coastguard Worker pshufb m1, m17, m5 833*c0909341SAndroid Build Coastguard Worker pmullw m3, m1, m1 834*c0909341SAndroid Build Coastguard Worker pshufb m2, m17, m6 835*c0909341SAndroid Build Coastguard Worker paddw m0, m1, m2 836*c0909341SAndroid Build Coastguard Worker shufps m1, m2, q2121 837*c0909341SAndroid Build Coastguard Worker paddw m0, m1 838*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m1, m2 839*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 840*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m4 841*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m16, m16 842*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 843*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m1, m1 844*c0909341SAndroid Build Coastguard Worker pshufb 
m16, m17, m7 845*c0909341SAndroid Build Coastguard Worker paddw m0, m16 846*c0909341SAndroid Build Coastguard Worker pshufb m17, m8 847*c0909341SAndroid Build Coastguard Worker paddw m0, m17 ; h sum 848*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m16, m17 849*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m1, m1 ; h sumsq 850*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m17 851*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m16, m16 852*c0909341SAndroid Build Coastguard Worker paddw m1, m0, [t1+r10*2+416*0] 853*c0909341SAndroid Build Coastguard Worker paddd m16, m2, [t1+r10*2+416*2] 854*c0909341SAndroid Build Coastguard Worker paddd m17, m3, [t1+r10*2+416*4] 855*c0909341SAndroid Build Coastguard Worker test hd, hd 856*c0909341SAndroid Build Coastguard Worker jz .hv_last_row 857*c0909341SAndroid Build Coastguard Worker.hv_main2: 858*c0909341SAndroid Build Coastguard Worker paddd m16, [t2+r10*2+416*2] ; hv sumsq 859*c0909341SAndroid Build Coastguard Worker paddd m17, [t2+r10*2+416*4] 860*c0909341SAndroid Build Coastguard Worker paddw m1, [t2+r10*2+416*0] ; hv sum 861*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+416*2], m2 862*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+416*4], m3 863*c0909341SAndroid Build Coastguard Worker mova [t0+r10*2+416*0], m0 864*c0909341SAndroid Build Coastguard Worker pmulld m16, m9 ; -a * 25 865*c0909341SAndroid Build Coastguard Worker pmulld m17, m9 866*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m4 ; b 867*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m0, m0 ; -p 868*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m4 869*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m1, m1 870*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m10 ; b * 164 871*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m10 872*c0909341SAndroid Build Coastguard Worker pmulld m16, m11 ; p * s 873*c0909341SAndroid Build Coastguard Worker pmulld m17, m11 
874*c0909341SAndroid Build Coastguard Worker vpalignr m17{k2}, m16, m16, 2 875*c0909341SAndroid Build Coastguard Worker mova m16, m20 876*c0909341SAndroid Build Coastguard Worker paddusw m17, m12 877*c0909341SAndroid Build Coastguard Worker psraw m17, 4 ; min(z, 255) - 256 878*c0909341SAndroid Build Coastguard Worker vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] 879*c0909341SAndroid Build Coastguard Worker vpmovb2m k3, m17 880*c0909341SAndroid Build Coastguard Worker vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] 881*c0909341SAndroid Build Coastguard Worker vmovdqu8 m17{k3}, m16 ; x 882*c0909341SAndroid Build Coastguard Worker pandn m16, m13, m17 883*c0909341SAndroid Build Coastguard Worker psrld m17, 16 884*c0909341SAndroid Build Coastguard Worker pmulld m0, m16 885*c0909341SAndroid Build Coastguard Worker pmulld m1, m17 886*c0909341SAndroid Build Coastguard Worker paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) 887*c0909341SAndroid Build Coastguard Worker paddd m1, m14 888*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) 889*c0909341SAndroid Build Coastguard Worker vpternlogd m17, m1, m13, 0xd8 890*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 8], m16 ; The neighbor calculations requires 891*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b. 892*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but 893*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 72], m17 ; that gets us most of the way. 
894*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+ 72], ym16, 1 895*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*4+104], m16, 3 896*c0909341SAndroid Build Coastguard Worker add r10, 32 897*c0909341SAndroid Build Coastguard Worker jl .hv_loop 898*c0909341SAndroid Build Coastguard Worker mov t2, t1 899*c0909341SAndroid Build Coastguard Worker mov t1, t0 900*c0909341SAndroid Build Coastguard Worker mov t0, t2 901*c0909341SAndroid Build Coastguard Worker ret 902*c0909341SAndroid Build Coastguard Worker.hv_last_row: ; esoteric edge case for odd heights 903*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+416*0], m1 904*c0909341SAndroid Build Coastguard Worker paddw m1, m0 905*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+416*2], m16 906*c0909341SAndroid Build Coastguard Worker paddd m16, m2 907*c0909341SAndroid Build Coastguard Worker mova [t1+r10*2+416*4], m17 908*c0909341SAndroid Build Coastguard Worker paddd m17, m3 909*c0909341SAndroid Build Coastguard Worker jmp .hv_main2 910*c0909341SAndroid Build Coastguard Worker.v: ; vertical boxsum + ab 911*c0909341SAndroid Build Coastguard Worker lea r10, [wq-2] 912*c0909341SAndroid Build Coastguard Worker.v_loop: 913*c0909341SAndroid Build Coastguard Worker mova m2, [t1+r10*2+416*2] 914*c0909341SAndroid Build Coastguard Worker paddd m16, m2, [t2+r10*2+416*2] 915*c0909341SAndroid Build Coastguard Worker mova m3, [t1+r10*2+416*4] 916*c0909341SAndroid Build Coastguard Worker paddd m17, m3, [t2+r10*2+416*4] 917*c0909341SAndroid Build Coastguard Worker paddd m2, m2 918*c0909341SAndroid Build Coastguard Worker paddd m3, m3 919*c0909341SAndroid Build Coastguard Worker paddd m16, m2 ; hv sumsq 920*c0909341SAndroid Build Coastguard Worker paddd m17, m3 921*c0909341SAndroid Build Coastguard Worker pmulld m16, m9 ; -a * 25 922*c0909341SAndroid Build Coastguard Worker pmulld m17, m9 923*c0909341SAndroid Build Coastguard Worker mova m0, [t1+r10*2+416*0] 924*c0909341SAndroid Build 
Coastguard Worker paddw m1, m0, [t2+r10*2+416*0] 925*c0909341SAndroid Build Coastguard Worker paddw m0, m0 926*c0909341SAndroid Build Coastguard Worker paddw m1, m0 ; hv sum 927*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m4 ; b 928*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m0, m0 ; -p 929*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m4 930*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m1, m1 931*c0909341SAndroid Build Coastguard Worker pmaddwd m0, m10 ; b * 164 932*c0909341SAndroid Build Coastguard Worker pmaddwd m1, m10 933*c0909341SAndroid Build Coastguard Worker pmulld m16, m11 ; p * s 934*c0909341SAndroid Build Coastguard Worker pmulld m17, m11 935*c0909341SAndroid Build Coastguard Worker vpalignr m17{k2}, m16, m16, 2 936*c0909341SAndroid Build Coastguard Worker mova m16, m20 937*c0909341SAndroid Build Coastguard Worker paddusw m17, m12 938*c0909341SAndroid Build Coastguard Worker psraw m17, 4 ; min(z, 255) - 256 939*c0909341SAndroid Build Coastguard Worker vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] 940*c0909341SAndroid Build Coastguard Worker vpmovb2m k3, m17 941*c0909341SAndroid Build Coastguard Worker vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] 942*c0909341SAndroid Build Coastguard Worker vmovdqu8 m17{k3}, m16 ; x 943*c0909341SAndroid Build Coastguard Worker pandn m16, m13, m17 944*c0909341SAndroid Build Coastguard Worker psrld m17, 16 945*c0909341SAndroid Build Coastguard Worker pmulld m0, m16 946*c0909341SAndroid Build Coastguard Worker pmulld m1, m17 947*c0909341SAndroid Build Coastguard Worker paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) 948*c0909341SAndroid Build Coastguard Worker paddd m1, m14 949*c0909341SAndroid Build Coastguard Worker vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) 950*c0909341SAndroid Build Coastguard Worker vpternlogd m17, m1, m13, 0xd8 951*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 8], m16 952*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 24], xm17 
953*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*4+ 56], m17, 2 954*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+ 72], m17 955*c0909341SAndroid Build Coastguard Worker vextracti128 [t3+r10*4+ 72], ym16, 1 956*c0909341SAndroid Build Coastguard Worker vextracti32x4 [t3+r10*4+104], m16, 3 957*c0909341SAndroid Build Coastguard Worker add r10, 32 958*c0909341SAndroid Build Coastguard Worker jl .v_loop 959*c0909341SAndroid Build Coastguard Worker ret 960*c0909341SAndroid Build Coastguard Worker.prep_n: ; initial neighbor setup 961*c0909341SAndroid Build Coastguard Worker mov r10, wq 962*c0909341SAndroid Build Coastguard Worker.prep_n_loop: 963*c0909341SAndroid Build Coastguard Worker movu m0, [t3+r10*4+ 4] 964*c0909341SAndroid Build Coastguard Worker movu m1, [t3+r10*4+68] 965*c0909341SAndroid Build Coastguard Worker paddd m2, m0, [t3+r10*4+ 0] 966*c0909341SAndroid Build Coastguard Worker paddd m3, m1, [t3+r10*4+64] 967*c0909341SAndroid Build Coastguard Worker paddd m2, [t3+r10*4+ 8] 968*c0909341SAndroid Build Coastguard Worker paddd m3, [t3+r10*4+72] 969*c0909341SAndroid Build Coastguard Worker paddd m0, m2 970*c0909341SAndroid Build Coastguard Worker pslld m2, 2 971*c0909341SAndroid Build Coastguard Worker paddd m1, m3 972*c0909341SAndroid Build Coastguard Worker pslld m3, 2 973*c0909341SAndroid Build Coastguard Worker paddd m2, m0 ; ab 565 974*c0909341SAndroid Build Coastguard Worker paddd m3, m1 975*c0909341SAndroid Build Coastguard Worker pandn m0, m13, m2 ; a 976*c0909341SAndroid Build Coastguard Worker psrld m2, 12 ; b 977*c0909341SAndroid Build Coastguard Worker pandn m1, m13, m3 978*c0909341SAndroid Build Coastguard Worker psrld m3, 12 979*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+416*4+ 0], m0 980*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+416*8+ 0], m2 981*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+416*4+64], m1 982*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+416*8+64], m3 
983*c0909341SAndroid Build Coastguard Worker add r10, 32 984*c0909341SAndroid Build Coastguard Worker jl .prep_n_loop 985*c0909341SAndroid Build Coastguard Worker ret 986*c0909341SAndroid Build Coastguard WorkerALIGN function_align 987*c0909341SAndroid Build Coastguard Worker.n0: ; neighbor + output (even rows) 988*c0909341SAndroid Build Coastguard Worker mov r10, wq 989*c0909341SAndroid Build Coastguard Worker.n0_loop: 990*c0909341SAndroid Build Coastguard Worker movu m16, [t3+r10*4+ 4] 991*c0909341SAndroid Build Coastguard Worker movu m17, [t3+r10*4+68] 992*c0909341SAndroid Build Coastguard Worker paddd m0, m16, [t3+r10*4+ 0] 993*c0909341SAndroid Build Coastguard Worker paddd m1, m17, [t3+r10*4+64] 994*c0909341SAndroid Build Coastguard Worker paddd m0, [t3+r10*4+ 8] 995*c0909341SAndroid Build Coastguard Worker paddd m1, [t3+r10*4+72] 996*c0909341SAndroid Build Coastguard Worker paddd m16, m0 997*c0909341SAndroid Build Coastguard Worker pslld m0, 2 998*c0909341SAndroid Build Coastguard Worker paddd m17, m1 999*c0909341SAndroid Build Coastguard Worker pslld m1, 2 1000*c0909341SAndroid Build Coastguard Worker paddd m0, m16 1001*c0909341SAndroid Build Coastguard Worker paddd m1, m17 1002*c0909341SAndroid Build Coastguard Worker pandn m16, m13, m0 1003*c0909341SAndroid Build Coastguard Worker psrld m0, 12 1004*c0909341SAndroid Build Coastguard Worker pandn m17, m13, m1 1005*c0909341SAndroid Build Coastguard Worker psrld m1, 12 1006*c0909341SAndroid Build Coastguard Worker paddd m2, m16, [t3+r10*4+416*4+ 0] ; a 1007*c0909341SAndroid Build Coastguard Worker paddd m3, m17, [t3+r10*4+416*4+64] 1008*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+416*4+ 0], m16 1009*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+416*4+64], m17 1010*c0909341SAndroid Build Coastguard Worker paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8) 1011*c0909341SAndroid Build Coastguard Worker paddd m17, m1, [t3+r10*4+416*8+64] 1012*c0909341SAndroid Build Coastguard Worker mova 
[t3+r10*4+416*8+ 0], m0 1013*c0909341SAndroid Build Coastguard Worker mova [t3+r10*4+416*8+64], m1 1014*c0909341SAndroid Build Coastguard Worker pmovzxbd m0, [dstq+r10+ 0] 1015*c0909341SAndroid Build Coastguard Worker pmovzxbd m1, [dstq+r10+16] 1016*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0 ; a * src 1017*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1 1018*c0909341SAndroid Build Coastguard Worker packssdw m0, m1 1019*c0909341SAndroid Build Coastguard Worker psubd m16, m2 ; b - a * src + (1 << 8) 1020*c0909341SAndroid Build Coastguard Worker psubd m17, m3 1021*c0909341SAndroid Build Coastguard Worker psrad m16, 9 1022*c0909341SAndroid Build Coastguard Worker psrad m17, 9 1023*c0909341SAndroid Build Coastguard Worker packssdw m16, m17 1024*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15 1025*c0909341SAndroid Build Coastguard Worker paddw m16, m0 1026*c0909341SAndroid Build Coastguard Worker packuswb m16, m16 1027*c0909341SAndroid Build Coastguard Worker vpermd m16, m22, m16 1028*c0909341SAndroid Build Coastguard Worker mova [dstq+r10], ym16 1029*c0909341SAndroid Build Coastguard Worker add r10, 32 1030*c0909341SAndroid Build Coastguard Worker jl .n0_loop 1031*c0909341SAndroid Build Coastguard Worker add dstq, strideq 1032*c0909341SAndroid Build Coastguard Worker ret 1033*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1034*c0909341SAndroid Build Coastguard Worker.n1: ; neighbor + output (odd rows) 1035*c0909341SAndroid Build Coastguard Worker mov r10, wq 1036*c0909341SAndroid Build Coastguard Worker.n1_loop: 1037*c0909341SAndroid Build Coastguard Worker pmovzxbd m0, [dstq+r10+ 0] 1038*c0909341SAndroid Build Coastguard Worker pmovzxbd m1, [dstq+r10+16] 1039*c0909341SAndroid Build Coastguard Worker pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src 1040*c0909341SAndroid Build Coastguard Worker pmaddwd m3, m1, [t3+r10*4+416*4+64] 1041*c0909341SAndroid Build Coastguard Worker mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7) 
; ---- end of the preceding routine's .n1_loop -------------------------
; These statements share a physical line with the start of
; sgr_filter_3x3_8bpc in the captured text; they are carried over
; unchanged, one statement per line.
    mova            m17, [t3+r10*4+416*8+64]
    packssdw        m0, m1
    psubd           m16, m2                 ; b - a * src + (1 << 7)
    psubd           m17, m3
    psrad           m16, 8
    psrad           m17, 8
    packssdw        m16, m17
    pmulhrsw        m16, m15
    paddw           m16, m0
    packuswb        m16, m16
    vpermd          m16, m22, m16
    mova            [dstq+r10], ym16
    add             r10, 32
    jl .n1_loop
    add             dstq, strideq
    ret

;-----------------------------------------------------------------------
; sgr_filter_3x3_8bpc
;
; Arguments, in the order named on the cglobal line:
;   dst, stride, left, lpf, w, h, edge, params
; cglobal is asked to load the first 4; w, h, edge and params are
; fetched explicitly below from wm / hm / r7m / r6mp.
;
; Values that stay live for the whole function:
;   m4        zero
;   m5/m6/m7  pshufb controls taken from sgr_shuf+3 / +5 / +7
;   m8        broadcast pd_m9              (multiplier for "-a * 9")
;   ym9       first 32 bytes of sgr_shuf, vpermd index used in .n
;   m10       broadcast pw_164_455
;   m11       0 - dword [params+4]         (-s1)
;   m12       broadcast pw_61448
;   m13       broadcast pd_m4096; used with pandn / psrld 12 to split
;             the packed "a | (b << 12)" dwords
;   m14       broadcast pd_34816
;   m15       word [params+10] << 4        (w1)
;   m18-m21   the four 64-byte quarters of sgr_x_by_x
;   k1        0xfe  (dword lanes 1-7 of a ymm)
;   k2        0x3333333333333333
;   r14       r_ext_mask+75, base address for the right-edge masks
;   [rsp]     row pointer reloaded into lpfq on the LR_HAVE_BOTTOM path
;
; lpfq and dstq are advanced by w and w is then negated, so every inner
; loop walks r10 from a negative offset up towards zero (add / jl).
;
; NOTE(review): t0-t5 and the layout of the stack scratch area are
; defined outside this part of the file - confirm against those
; definitions before changing any of the 416*N offsets.
;-----------------------------------------------------------------------
cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
                                                    w, h, edge, params
    mov             paramsq, r6mp
    mov             wd, wm
    movifnidn       hd, hm
    mov             edged, r7m
    vbroadcasti32x4 m5, [sgr_shuf+3]
    add             lpfq, wq                ; lpfq -> end of row
    vbroadcasti32x4 m6, [sgr_shuf+5]
    add             dstq, wq                ; dstq -> end of row
    vbroadcasti32x4 m7, [sgr_shuf+7]
    pxor            m4, m4
    vpbroadcastd    m8, [pd_m9]
    vpsubd          m11, m4, [paramsq+4] {1to16} ; -s1
    vpbroadcastw    m15, [paramsq+10]       ; w1
    lea             t1, [rsp+wq*2+20]       ; computed with w still positive
    vpbroadcastd    m10, [pw_164_455]
    lea             t3, [rsp+wq*4+16+416*12]
    vpbroadcastd    m12, [pw_61448]         ; (15 << 12) + (1 << 3)
    neg             wq                      ; from here on wq = -w
    vpbroadcastd    m13, [pd_m4096]
    mov             r10d, 0xfe
    vpbroadcastd    m14, [pd_34816]         ; (1 << 11) + (1 << 15)
    kmovb           k1, r10d
    mova            m18, [sgr_x_by_x+64*0]
    mov             r10, 0x3333333333333333
    mova            m19, [sgr_x_by_x+64*1]
    kmovq           k2, r10
    mova            m20, [sgr_x_by_x+64*2]
    psllw           m15, 4
    mova            m21, [sgr_x_by_x+64*3]
    lea             r14, [r_ext_mask+75]
    mova            ym9, [sgr_shuf]
    test            edgeb, 4                ; LR_HAVE_TOP
    jz .no_top
    call .h_top                             ; first row read through lpf
    add             lpfq, strideq
    mov             t2, t1
    add             t1, 416*6
    call .h_top                             ; second row read through lpf
    lea             t4, [lpfq+strideq*4]
    mov             lpfq, dstq              ; continue with the rows at dst
    add             t4, strideq
    mov             [rsp], t4               ; below
    mov             t0, t2
    call .hv
.main:
    mov             t5, t3                  ; t5 = ab row produced just before .main
    add             t3, 416*4
    dec             hd
    jz .height1                             ; h was 1
    add             lpfq, strideq
    call .hv
    call .prep_n
    dec             hd
    jz .extend_bottom                       ; h was 2
.main_loop:
    add             lpfq, strideq
    call .hv
    call .n
    dec             hd
    jnz .main_loop
    test            edgeb, 8                ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov             lpfq, [rsp]             ; switch to the rows saved as "below"
    call .hv_bottom
    call .n
    add             lpfq, strideq
    call .hv_bottom
.end:
    call .n
    RET
.height1:
    call .v
    call .prep_n
    mov             t2, t1
    call .v
    jmp .end
.extend_bottom:
    ; No further input rows: .v builds the vertical sums from rows that
    ; are already buffered (newest row counted twice, see .v_loop).
    call .v
    call .n
    mov             t2, t1
    call .v
    jmp .end
.no_top:
    lea             t4, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             t4, [t4+strideq*2]
    mov             [rsp], t4               ; same role as the "below" slot above
    call .h
    lea             t0, [t1+416*6]
    mov             t2, t1
    call .v
    jmp .main
.h: ; horizontal boxsum
    lea             r10, [wq-2]
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz .h_extend_left
    movd            xm17, [leftq]           ; 4 bytes from the left[] array
    vmovdqu32       ym17{k1}, [lpfq+wq-4]   ; dwords 1-7: first 28 row bytes
    add             leftq, 4                ; advance left[] for the next row
    jmp .h_main
.h_extend_left:
    vpbroadcastb    xm17, [lpfq+wq]         ; replicate the first pixel of the row
    vmovdqu32       ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    ; Entry used for rows read through lpf: with LR_HAVE_LEFT set the
    ; pixels left of the row are read straight from memory in .h_loop.
    lea             r10, [wq-2]
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8    m17, [lpfq+r10+6], 1
    test            edgeb, 2                ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             r10d, -33
    jl .h_have_right                        ; only groups with r10 >= -33 are padded
    vpbroadcastb    m0, [lpfq-1]            ; last pixel of the row (lpfq = row end)
    movu            ym16, [r14+r10-8]
    vinserti32x8    m16, [r14+r10+0], 1
    vpternlogd      m17, m0, m16, 0xe4      ; bitwise: m17 = m16 ? m17 : m0
.h_have_right:
    pshufb          m0, m17, m5
    pmullw          m2, m0, m0
    pshufb          m16, m17, m6
    paddw           m0, m16
    pshufb          m17, m7
    paddw           m0, m17                 ; sum
    punpcklwd       m3, m16, m17
    punpcklwd       m1, m2, m4
    vpdpwssd        m1, m3, m3              ; sumsq
    punpckhwd       m16, m17
    punpckhwd       m2, m4
    vpdpwssd        m2, m16, m16
    mova            [t1+r10*2+416*0], m0
    mova            [t1+r10*2+416*2], m1
    mova            [t1+r10*2+416*4], m2
    add             r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea             r10, [wq-2]
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd            xm17, [leftq]           ; 4 bytes from the left[] array
    vmovdqu32       ym17{k1}, [lpfq+wq-4]   ; dwords 1-7: first 28 row bytes
    add             leftq, 4
    jmp .hv_main
.hv_extend_left:
    vpbroadcastb    xm17, [lpfq+wq]         ; replicate the first pixel of the row
    vmovdqu32       ym17{k1}, [lpfq+wq-4]
    jmp .hv_main
.hv_bottom:
    ; Entry used for the rows reloaded from [rsp]; left pixels are read
    ; from memory in .hv_loop when LR_HAVE_LEFT is set.
    lea             r10, [wq-2]
    test            edgeb, 1                ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            ym17, [lpfq+r10-2]
.hv_main:
    vinserti32x8    m17, [lpfq+r10+6], 1
    test            edgeb, 2                ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             r10d, -33
    jl .hv_have_right                       ; only groups with r10 >= -33 are padded
    vpbroadcastb    m0, [lpfq-1]            ; last pixel of the row
    movu            ym16, [r14+r10-8]
    vinserti32x8    m16, [r14+r10+0], 1
    vpternlogd      m17, m0, m16, 0xe4      ; bitwise: m17 = m16 ? m17 : m0
.hv_have_right:
    pshufb          m0, m17, m5
    pmullw          m3, m0, m0
    pshufb          m1, m17, m6
    paddw           m0, m1
    pshufb          m17, m7
    paddw           m0, m17                 ; h sum
    punpcklwd       m16, m17, m1
    punpcklwd       m2, m3, m4
    vpdpwssd        m2, m16, m16            ; h sumsq
    punpckhwd       m17, m1
    punpckhwd       m3, m4
    vpdpwssd        m3, m17, m17
    paddw           m1, m0, [t2+r10*2+416*0]
    paddw           m1, [t1+r10*2+416*0]    ; hv sum
    paddd           m16, m2, [t2+r10*2+416*2]
    paddd           m17, m3, [t2+r10*2+416*4]
    paddd           m16, [t1+r10*2+416*2]   ; hv sumsq
    paddd           m17, [t1+r10*2+416*4]
    mova            [t0+r10*2+416*0], m0    ; keep this row's horizontal sums
    mova            [t0+r10*2+416*2], m2
    mova            [t0+r10*2+416*4], m3
    pmulld          m16, m8                 ; -a * 9
    pmulld          m17, m8
    punpcklwd       m0, m4, m1              ; b
    vpdpwssd        m16, m0, m0             ; -p
    punpckhwd       m1, m4, m1
    vpdpwssd        m17, m1, m1
    pmaddwd         m0, m10                 ; b * 455
    pmaddwd         m1, m10
    pmulld          m16, m11                ; p * s
    pmulld          m17, m11
    vpalignr        m17{k2}, m16, m16, 2
    mova            m16, m20
    paddusw         m17, m12
    psraw           m17, 4                  ; min(z, 255) - 256
    vpermt2b        m16, m17, m21           ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b        m17, m18, m19           ; sgr_x_by_x[  0..127]
    vmovdqu8        m17{k3}, m16            ; x
    pandn           m16, m13, m17
    psrld           m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    paddd           m0, m14                 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    vpternlogd      m16, m0, m13, 0xd8      ; a | (b << 12)
    vpternlogd      m17, m1, m13, 0xd8
    ; The stores below overlap on purpose: the later 128-bit extracts
    ; overwrite parts of the earlier full-width stores.
    mova            [t3+r10*4+  8], m16
    mova            [t3+r10*4+ 24], xm17
    vextracti32x4   [t3+r10*4+ 56], m17, 2
    mova            [t3+r10*4+ 72], m17
    vextracti128    [t3+r10*4+ 72], ym16, 1
    vextracti32x4   [t3+r10*4+104], m16, 3
    add             r10, 32
    jl .hv_loop
    ; Rotate the row buffers: t1 <- row just written (old t0),
    ; t2 and t0 <- old t1.
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.v: ; vertical boxsum + ab
    lea             r10, [wq-2]
.v_loop:
    mova            m16, [t1+r10*2+416*2]
    mova            m17, [t1+r10*2+416*4]
    paddd           m16, m16                ; row at t1 counted twice
    paddd           m17, m17
    paddd           m16, [t2+r10*2+416*2]   ; hv sumsq
    paddd           m17, [t2+r10*2+416*4]
    pmulld          m16, m8                 ; -a * 9
    pmulld          m17, m8
    mova            m1, [t1+r10*2+416*0]
    paddw           m1, m1
    paddw           m1, [t2+r10*2+416*0]    ; hv sum
    punpcklwd       m0, m4, m1              ; b
    vpdpwssd        m16, m0, m0             ; -p
    punpckhwd       m1, m4, m1
    vpdpwssd        m17, m1, m1
    pmaddwd         m0, m10                 ; b * 455
    pmaddwd         m1, m10
    pmulld          m16, m11                ; p * s
    pmulld          m17, m11
    vpalignr        m17{k2}, m16, m16, 2
    mova            m16, m20
    paddusw         m17, m12
    psraw           m17, 4                  ; min(z, 255) - 256
    vpermt2b        m16, m17, m21           ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b        m17, m18, m19           ; sgr_x_by_x[  0..127]
    vmovdqu8        m17{k3}, m16            ; x
    pandn           m16, m13, m17
    psrld           m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    paddd           m0, m14                 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    vpternlogd      m16, m0, m13, 0xd8      ; a | (b << 12)
    vpternlogd      m17, m1, m13, 0xd8
    ; Same overlapping store sequence as at the end of .hv_loop.
    mova            [t3+r10*4+  8], m16
    mova            [t3+r10*4+ 24], xm17
    vextracti32x4   [t3+r10*4+ 56], m17, 2
    mova            [t3+r10*4+ 72], m17
    vextracti128    [t3+r10*4+ 72], ym16, 1
    vextracti32x4   [t3+r10*4+104], m16, 3
    add             r10, 32
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov             r10, wq
    mov             t4, t3
    add             t3, 416*4
.prep_n_loop:                               ; 16 dwords per iteration
    mova            m2, [t5+r10*4+0]
    mova            m3, [t4+r10*4+0]
    paddd           m2, [t5+r10*4+8]
    paddd           m3, [t4+r10*4+8]
    paddd           m0, m2, [t5+r10*4+4]
    paddd           m1, m3, [t4+r10*4+4]
    pslld           m0, 2
    paddd           m1, m1                  ; ab[ 0] 222
    psubd           m0, m2                  ; ab[-1] 343
    mova            [t3+r10*4+416*4], m1
    paddd           m1, m1
    mova            [t5+r10*4], m0
    psubd           m1, m3                  ; ab[ 0] 343
    mova            [t4+r10*4], m1
    add             r10, 16
    jl .prep_n_loop
    ret
; a+b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
ALIGN function_align
.n: ; neighbor + output
    mov             r10, wq
.n_loop:                                    ; 32 output pixels per iteration
    mova            m16, [t3+r10*4+ 0]
    paddd           m16, [t3+r10*4+ 8]
    paddd           m17, m16, [t3+r10*4+ 4]
    paddd           m17, m17                ; ab[+1] 222
    mova            m2, [t3+r10*4+416*4+ 0]
    paddd           m0, m2, [t5+r10*4+ 0]   ; ab[ 0] 222 + ab[-1] 343
    mova            m3, [t3+r10*4+416*4+64]
    paddd           m1, m3, [t5+r10*4+64]
    mova            [t3+r10*4+416*4+ 0], m17
    paddd           m17, m17
    psubd           m17, m16                ; ab[+1] 343
    mova            [t5+r10*4+ 0], m17
    paddd           m2, m17                 ; ab[ 0] 222 + ab[+1] 343
    mova            m16, [t3+r10*4+64]
    paddd           m16, [t3+r10*4+72]
    paddd           m17, m16, [t3+r10*4+68]
    paddd           m17, m17
    mova            [t3+r10*4+416*4+64], m17
    paddd           m17, m17
    psubd           m17, m16
    mova            [t5+r10*4+64], m17
    pandn           m16, m13, m0
    psrld           m0, 12
    paddd           m3, m17
    pandn           m17, m13, m2
    psrld           m2, 12
    paddd           m16, m17                ; a
    pandn           m17, m13, m1
    psrld           m1, 12
    paddd           m0, m2                  ; b + (1 << 8)
    pandn           m2, m13, m3
    psrld           m3, 12
    paddd           m17, m2
    pmovzxbd        m2, [dstq+r10+ 0]
    paddd           m1, m3
    pmovzxbd        m3, [dstq+r10+16]
    pmaddwd         m16, m2                 ; a * src
    pmaddwd         m17, m3
    packssdw        m2, m3
    psubd           m0, m16                 ; b - a * src + (1 << 8)
    psubd           m1, m17
    psrad           m0, 9
    psrad           m1, 9
    packssdw        m0, m1
    pmulhrsw        m0, m15
    paddw           m0, m2
    packuswb        m0, m0
    vpermd          m16, m9, m0             ; reorder dwords with the ym9 index
    mova            [dstq+r10], ym16
    add             r10, 32
    jl .n_loop
    mov             r10, t5                 ; swap t4 and t5 (r10 as scratch)
    mov             t5, t4
    mov             t4, r10
    add             dstq, strideq
    ret
Build Coastguard Worker 1423*c0909341SAndroid Build Coastguard Workercglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \ 1424*c0909341SAndroid Build Coastguard Worker w, h, edge, params 1425*c0909341SAndroid Build Coastguard Worker mov paramsq, r6mp 1426*c0909341SAndroid Build Coastguard Worker mov wd, wm 1427*c0909341SAndroid Build Coastguard Worker movifnidn hd, hm 1428*c0909341SAndroid Build Coastguard Worker mov edged, r7m 1429*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m5, [sgr_shuf+1] 1430*c0909341SAndroid Build Coastguard Worker add lpfq, wq 1431*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m6, [sgr_shuf+9] 1432*c0909341SAndroid Build Coastguard Worker add dstq, wq 1433*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m7, [sgr_shuf+3] 1434*c0909341SAndroid Build Coastguard Worker lea t3, [rsp+wq*4+416*24+8] 1435*c0909341SAndroid Build Coastguard Worker vbroadcasti128 m8, [sgr_shuf+7] 1436*c0909341SAndroid Build Coastguard Worker pxor m4, m4 1437*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [pd_m9] 1438*c0909341SAndroid Build Coastguard Worker vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 1439*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [pw_61448] 1440*c0909341SAndroid Build Coastguard Worker vpsubd m12, m4, [paramsq+4] {1to16} ; -s1 1441*c0909341SAndroid Build Coastguard Worker vpbroadcastd m26, [paramsq+8] ; w0 w1 1442*c0909341SAndroid Build Coastguard Worker lea t1, [rsp+wq*2+12] 1443*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [pd_m25] 1444*c0909341SAndroid Build Coastguard Worker neg wq 1445*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [pw_164_455] 1446*c0909341SAndroid Build Coastguard Worker mov r10d, 0xfe 1447*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [pd_34816] 1448*c0909341SAndroid Build Coastguard Worker kmovb k1, r10d 1449*c0909341SAndroid Build Coastguard Worker mova m20, [sgr_x_by_x+64*0] 1450*c0909341SAndroid Build 
; ---- setup tail + row driver ------------------------------------------------
; Registers initialised in this part:
;   k2      = 0x3333333333333333; used below as a byte-granular merge mask for
;             vpalignr, i.e. it selects the low word of every dword
;   m21-m23 = sgr_x_by_x+64*1 .. +64*3 (m20 is set before this point,
;             presumably to the +64*0 chunk -- TODO confirm)
;   r12     = r_ext_mask+75, base for the right-edge select masks
;   m24     = broadcast of pd_m4096, m25 = broadcast of sgr_shuf+28
;   m26     = its previous value with every word shifted left by 5
;   xm27    = sgr_mix_perm
    mov            r10, 0x3333333333333333
    mova           m21, [sgr_x_by_x+64*1]
    kmovq           k2, r10
    mova           m22, [sgr_x_by_x+64*2]
    lea            r12, [r_ext_mask+75]
    mova           m23, [sgr_x_by_x+64*3]
    vpbroadcastd   m24, [pd_m4096]
    vpbroadcastd   m25, [sgr_shuf+28] ; 0x8000____
    psllw          m26, 5
    mova          xm27, [sgr_mix_perm]
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; Top rows available: run the horizontal pass on two consecutive lpf rows.
    ; The first result is post-processed by the 5x5 filter's .top_fixup
    ; (defined in another function), the second goes to t1+416*12.
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
    add             t1, 416*12
    call .h_top
    ; [rsp] = lpfq + 5*stride; reloaded into lpfq before the *_bottom passes.
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    ; Two rows per iteration: even row (.hv0), odd row (.hv1), then the two
    ; output passes (.n0, .n1).
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    ; hd reached zero right after the first .hv0.
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    ; No further input rows: use the vertical-only passes instead of .hv0/.hv1.
    call .v0
    call .v1
    jmp .end
.no_top:
    ; [rsp] = lpfq + 6*stride (same slot as in the LR_HAVE_TOP path).
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea             t2, [t1+416*12]
    lea            r10, [wq-2]
.top_fixup_loop:
    ; Copy the six t1 planes to t2: planes 0/2/4 are doubled, planes 6/8/10
    ; are copied unchanged.
    mova            m0, [t1+r10*2+416* 0]
    mova            m1, [t1+r10*2+416* 2]
    mova            m2, [t1+r10*2+416* 4]
    paddw           m0, m0
    mova            m3, [t1+r10*2+416* 6]
    paddd           m1, m1
    mova           m16, [t1+r10*2+416* 8]
    paddd           m2, m2
    mova           m17, [t1+r10*2+416*10]
    mova [t2+r10*2+416* 0], m0
    mova [t2+r10*2+416* 2], m1
    mova [t2+r10*2+416* 4], m2
    mova [t2+r10*2+416* 6], m3
    mova [t2+r10*2+416* 8], m16
    mova [t2+r10*2+416*10], m17
    add            r10, 32
; Closing branch of .top_fixup_loop (r10 is advanced by 32 just before this),
; then continue with the vertical pass and enter the main row loop.
    jl .top_fixup_loop
    call .v0
    jmp .main
; .h / .h_top: horizontal box sums for one row at lpfq.
;   Stores: sum3 -> t1 plane 6, sumsq3 -> planes 8/10,
;           sum5 -> t1 plane 0, sumsq5 -> planes 2/4.
;   .h     takes the leftmost pixels from leftq when LR_HAVE_LEFT is set
;          (and advances leftq by 4);
;   .h_top never reads leftq: with LR_HAVE_LEFT it loads straight from lpfq.
;   Without LR_HAVE_LEFT both broadcast the byte at [lpfq+wq] instead.
;   m4 is used as the zero operand of the unpacks and m5-m8 as pshufb
;   controls; all are set before this point -- TODO confirm their contents.
;   Clobbers r10, m0-m3, m16-m19.
.h: ; horizontal boxsums
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd          xm17, [leftq]
    vmovdqu32     ym17{k1}, [lpfq+wq-4]
    add          leftq, 4
    jmp .h_main
.h_extend_left:
    vpbroadcastb  xm17, [lpfq+wq]
    vmovdqu32     ym17{k1}, [lpfq+wq-4]
    jmp .h_main
.h_top:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu          ym17, [lpfq+r10-2]
.h_main:
    vinserti32x8   m17, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    ; No right edge and close to the end of the row: keep m17 where the
    ; r_ext_mask bits are set, otherwise take the byte broadcast from [lpfq-1].
    vpbroadcastb    m0, [lpfq-1]
    movu          ym16, [r12+r10-8]
    vinserti32x8   m16, [r12+r10+0], 1
    vpternlogd     m17, m0, m16, 0xe4
.h_have_right:
    pshufb          m3, m17, m5
    pshufb         m18, m17, m6
    shufps          m0, m3, m18, q2121
    pmullw          m2, m0, m0
    pshufb         m19, m17, m7
    paddw           m0, m19
    pshufb         m17, m8
    paddw           m0, m17 ; sum3
    punpcklwd      m16, m19, m17
    punpcklwd       m1, m2, m4
    vpdpwssd        m1, m16, m16 ; sumsq3
    punpckhwd      m19, m17
    punpckhwd       m2, m4
    vpdpwssd        m2, m19, m19
    mova [t1+r10*2+416* 6], m0
    mova [t1+r10*2+416* 8], m1
    mova [t1+r10*2+416*10], m2
    ; Extend the 3-tap sums with the two outer taps (m3, m18) to get the
    ; 5-tap sums.
    punpcklwd      m19, m3, m18
    paddw           m0, m3
    vpdpwssd        m1, m19, m19 ; sumsq5
    punpckhwd       m3, m18
    paddw           m0, m18 ; sum5
    vpdpwssd        m2, m3, m3
    mova [t1+r10*2+416* 0], m0
    mova [t1+r10*2+416* 2], m1
    mova [t1+r10*2+416* 4], m2
    add            r10, 32
    jl .h_loop
; Return from the .h / .h_top horizontal pass.
    ret
ALIGN function_align
; .hv0 / .hv0_bottom: even-row pass.
;   Computes this row's horizontal sum3/sumsq3 and sum5/sumsq5, folds them into
;   the running sums kept at t1/t2, and writes the packed 3x3 coefficients
;   "a3 | (b3 << 12)" to t3+416*4.
;   The un-accumulated sum5/sumsq5 of this row are also saved at t3+416*8 and
;   t3+416*0 (see the "clean copy" note below).
;   .hv0_bottom is the entry that never reads leftq.
;   Edge handling mirrors .h: leftq / broadcast on the left, r_ext_mask select
;   on the right.
;   Clobbers r10, k3, m0-m3, m16-m19.
.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movd          xm17, [leftq]
    vmovdqu32     ym17{k1}, [lpfq+wq-4]
    add          leftq, 4
    jmp .hv0_main
.hv0_extend_left:
    vpbroadcastb  xm17, [lpfq+wq]
    vmovdqu32     ym17{k1}, [lpfq+wq-4]
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu          ym17, [lpfq+r10-2]
.hv0_main:
    vinserti32x8   m17, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -34
    jl .hv0_have_right
    ; Keep m17 where the r_ext_mask bits are set, else use the byte from
    ; [lpfq-1].
    vpbroadcastb    m0, [lpfq-1]
    movu          ym16, [r12+r10-8]
    vinserti32x8   m16, [r12+r10+0], 1
    vpternlogd     m17, m0, m16, 0xe4
.hv0_have_right:
    pshufb         m18, m17, m5
    pshufb         m19, m17, m6
    shufps          m1, m18, m19, q2121
    pmullw          m3, m1, m1
    pshufb          m0, m17, m7
    paddw           m1, m0
    pshufb         m17, m8
    paddw           m1, m17 ; sum3
    punpcklwd      m16, m0, m17
    punpcklwd       m2, m3, m4
    vpdpwssd        m2, m16, m16 ; sumsq3
    punpckhwd       m0, m17
    punpckhwd       m3, m4
    vpdpwssd        m3, m0, m0
    ; m0/m16/m17 = this row + previous t1 row (3x3 planes); t1 then takes
    ; this row's values.
    paddw           m0, m1, [t1+r10*2+416* 6]
    paddd          m16, m2, [t1+r10*2+416* 8]
    paddd          m17, m3, [t1+r10*2+416*10]
    mova [t1+r10*2+416* 6], m1
    mova [t1+r10*2+416* 8], m2
    mova [t1+r10*2+416*10], m3
    paddw           m1, m18
    paddw           m1, m19 ; sum5
    mova [t3+r10*4+416*8+ 8], m1
    paddw           m1, [t1+r10*2+416* 0]
    mova [t1+r10*2+416* 0], m1
    punpcklwd       m1, m18, m19
    vpdpwssd        m2, m1, m1 ; sumsq5
    punpckhwd      m18, m19
    vpdpwssd        m3, m18, m18
    mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row
    mova [t3+r10*4+416*0+72], m3 ; in case height is odd
    paddd           m2, [t1+r10*2+416* 2]
    paddd           m3, [t1+r10*2+416* 4]
    mova [t1+r10*2+416* 2], m2
    mova [t1+r10*2+416* 4], m3
    ; Three-row totals for the 3x3 box: add the t2 planes, then store the
    ; two-row partial sums back to t2.
    paddw           m1, m0, [t2+r10*2+416* 6]
    paddd           m2, m16, [t2+r10*2+416* 8]
    paddd           m3, m17, [t2+r10*2+416*10]
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416* 8], m16
    mova [t2+r10*2+416*10], m17
    pmulld         m16, m2, m9 ; -a3 * 9
    pmulld         m17, m3, m9
    punpcklwd       m0, m4, m1 ; b3
    vpdpwssd       m16, m0, m0 ; -p3
    punpckhwd       m1, m4, m1
    vpdpwssd       m17, m1, m1
    pmulld         m16, m12 ; p3 * s1
    pmulld         m17, m12
    pmaddwd         m0, m13 ; b3 * 455
    pmaddwd         m1, m13
    ; k2 selects the low word of each dword: the upper word of every m16
    ; dword is merged into the low word of the matching m17 dword.
    vpalignr       m17{k2}, m16, m16, 2
    mova           m16, m22
    paddusw        m17, m14
    psraw          m17, 4 ; min(z3, 255) - 256
    ; 256-entry byte lookup split over m20..m23; k3 (top bit of each index
    ; byte) chooses the upper-half result.
    vpermt2b       m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b       m17, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8       m17{k3}, m16 ; x3
    pandn          m16, m24, m17
    psrld          m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    paddd           m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m15
    ; 0xd8 = bitwise select: take m0/m1 where m24 has a 1 bit, else keep
    ; m16/m17.
    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd     m17, m1, m24, 0xd8
    ; Interleave the 128-bit lanes of m16/m17 while storing; the full-width
    ; store of m17 is partly overwritten by the two extracts that follow it.
    mova [t3+r10*4+416*4+ 8], m16
    mova [t3+r10*4+416*4+ 24], xm17
    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
    mova [t3+r10*4+416*4+ 72], m17
    vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
    add            r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
; Continuation of .hv1 (odd-row pass); the flags tested here come from the
; immediately preceding `test edgeb, 1 ; LR_HAVE_LEFT`.
;   Computes this row's horizontal sums, combines them with the running sums
;   at t1/t2, and writes both packed coefficient sets:
;     "a3 | (b3 << 12)" -> t3+416*8,  "a5 | (b5 << 12)" -> t3+416*0.
;   On exit t1 and t2 are swapped.
;   .hv1_bottom is the entry that never reads leftq.
;   Clobbers r10, k3, k4, m0-m3, m16-m19.
    jz .hv1_extend_left
    movd          xm17, [leftq]
    vmovdqu32     ym17{k1}, [lpfq+wq-4]
    add          leftq, 4
    jmp .hv1_main
.hv1_extend_left:
    vpbroadcastb  xm17, [lpfq+wq]
    vmovdqu32     ym17{k1}, [lpfq+wq-4]
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu          ym17, [lpfq+r10-2]
.hv1_main:
    vinserti32x8   m17, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -34
    jl .hv1_have_right
    ; Keep m17 where the r_ext_mask bits are set, else use the byte from
    ; [lpfq-1].
    vpbroadcastb    m0, [lpfq-1]
    movu          ym16, [r12+r10-8]
    vinserti32x8   m16, [r12+r10+0], 1
    vpternlogd     m17, m0, m16, 0xe4
.hv1_have_right:
    pshufb          m3, m17, m5
    pshufb         m19, m17, m6
    shufps          m2, m3, m19, q2121
    pmullw          m1, m2, m2
    pshufb         m18, m17, m7
    paddw           m2, m18
    pshufb         m17, m8
    paddw           m2, m17 ; sum3
    punpcklwd      m16, m17, m18
    punpcklwd       m0, m1, m4
    vpdpwssd        m0, m16, m16 ; sumsq3
    punpckhwd      m17, m18
    punpckhwd       m1, m4
    vpdpwssd        m1, m17, m17
    paddd          m16, m0, [t2+r10*2+416* 8]
    paddd          m17, m1, [t2+r10*2+416*10]
    mova [t2+r10*2+416* 8], m0
    mova [t2+r10*2+416*10], m1
    punpcklwd      m18, m3, m19
    vpdpwssd        m0, m18, m18 ; sumsq5
    punpckhwd      m18, m3, m19
    vpdpwssd        m1, m18, m18
    paddw           m3, m19
    pmulld         m16, m9 ; -a3 * 9
    pmulld         m17, m9
    paddd          m18, m0, [t2+r10*2+416*2]
    paddd          m19, m1, [t2+r10*2+416*4]
    paddd          m18, [t1+r10*2+416*2]
    paddd          m19, [t1+r10*2+416*4]
    mova [t2+r10*2+416*2], m0
    mova [t2+r10*2+416*4], m1
    pmulld         m18, m10 ; -a5 * 25
    pmulld         m19, m10
    paddw           m1, m2, [t2+r10*2+416* 6]
    mova [t2+r10*2+416* 6], m2
    paddw           m2, m3 ; sum5
    paddw           m3, m2, [t2+r10*2+416*0]
    paddw           m3, [t1+r10*2+416*0]
    mova [t2+r10*2+416*0], m2
    ; b3 goes in the high word of each dword and b5 in the low word, so the
    ; single m13 multiplier below applies a different factor to each (see the
    ; "* 455" / "* 164" notes).
    punpcklwd       m0, m4, m1 ; b3
    vpdpwssd       m16, m0, m0 ; -p3
    punpckhwd       m1, m4, m1
    vpdpwssd       m17, m1, m1
    punpcklwd       m2, m3, m4 ; b5
    vpdpwssd       m18, m2, m2 ; -p5
    punpckhwd       m3, m4
    vpdpwssd       m19, m3, m3
    pmulld         m16, m12 ; p3 * s1
    pmulld         m17, m12
    pmulld         m18, m11 ; p5 * s0
    pmulld         m19, m11
    pmaddwd         m0, m13 ; b3 * 455
    pmaddwd         m1, m13
    pmaddwd         m2, m13 ; b5 * 164
    pmaddwd         m3, m13
    ; k2 selects the low word of each dword: pack the upper words of
    ; m16/m18 into the low words of m17/m19.
    vpalignr       m17{k2}, m16, m16, 2
    vpalignr       m19{k2}, m18, m18, 2
    paddusw        m17, m14
    mova           m16, m22
    psraw          m17, 4 ; min(z3, 255) - 256
    ; 256-entry byte lookups split over m20..m23; k3/k4 (top bit of each
    ; index byte) choose the upper-half result.
    vpermt2b       m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b       m17, m20, m21 ; sgr_x_by_x[ 0..127]
    paddusw        m19, m14
    mova           m18, m22
    psraw          m19, 4 ; min(z5, 255) - 256
    vpermt2b       m18, m19, m23 ; sgr_x_by_x[128..255]
    vpmovb2m        k4, m19
    vpermi2b       m19, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8       m17{k3}, m16 ; x3
    vmovdqu8       m19{k4}, m18 ; x5
    pandn          m16, m24, m17
    psrld          m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    pandn          m18, m24, m19
    psrld          m19, 16
    pmulld          m2, m18
    pmulld          m3, m19
    paddd           m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m15
    ; 0xd8 = bitwise select: source where m24 has a 1 bit, else destination.
    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd     m17, m1, m24, 0xd8
    mova [t3+r10*4+416*8+ 8], m16
    mova [t3+r10*4+416*8+ 24], xm17
    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
    paddd           m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m3, m15
    mova [t3+r10*4+416*8+ 72], m17
    vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
    vpternlogd     m19, m3, m24, 0xd8
    mova [t3+r10*4+416*0+ 8], m18
    mova [t3+r10*4+416*0+ 24], xm19
    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
    mova [t3+r10*4+416*0+ 72], m19
    vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
    add            r10, 32
    jl .hv1_loop
    ; Swap the two row-sum buffers for the next row pair.
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
; .v0: even-row pass without new input: the t1 row is counted twice
; (doubled) in place of a freshly filtered row.
.v0: ; vertical boxsums + ab3 (even rows)
    lea            r10, [wq-2]
.v0_loop:
    mova            m2, [t1+r10*2+416* 8]
    mova            m3, [t1+r10*2+416*10]
    paddd           m2, m2
    paddd           m3, m3
    paddd          m16, m2, [t2+r10*2+416* 8]
    paddd          m17, m3, [t2+r10*2+416*10]
    mova            m0, [t1+r10*2+416* 6]
    paddw           m0, m0
; Continuation of .v0_loop: m0 = doubled t1 sum3, m2/m3 = doubled t1 sumsq3,
; m16/m17 = those sumsq3 values plus the t2 planes.
    paddw           m1, m0, [t2+r10*2+416* 6]
    pmulld         m16, m9 ; -a3 * 9
    pmulld         m17, m9
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416* 8], m2
    mova [t2+r10*2+416*10], m3
    mova            m2, [t1+r10*2+416*0]
    mova            m3, [t1+r10*2+416*2]
    mova           m18, [t1+r10*2+416*4]
    punpcklwd       m0, m4, m1 ; b3
    vpdpwssd       m16, m0, m0 ; -p3
    punpckhwd       m1, m4, m1
    vpdpwssd       m17, m1, m1
    pmulld         m16, m12 ; p3 * s1
    pmulld         m17, m12
    pmaddwd         m0, m13 ; b3 * 455
    pmaddwd         m1, m13
    ; Save the undoubled 5x5 row sums in the t3 slots that .v1 reads back.
    mova [t3+r10*4+416*8+ 8], m2
    mova [t3+r10*4+416*0+ 8], m3
    mova [t3+r10*4+416*0+72], m18
    ; k2 selects the low word of each dword: pack the upper words of m16
    ; into the low words of m17.
    vpalignr       m17{k2}, m16, m16, 2
    mova           m16, m22
    paddusw        m17, m14
    psraw          m17, 4 ; min(z3, 255) - 256
    vpermt2b       m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b       m17, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8       m17{k3}, m16 ; x3
    pandn          m16, m24, m17
    psrld          m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    paddw           m2, m2 ; cc5
    paddd           m3, m3
    paddd          m18, m18
    mova [t1+r10*2+416*0], m2
    mova [t1+r10*2+416*2], m3
    mova [t1+r10*2+416*4], m18
    paddd           m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m15
    ; 0xd8 = bitwise select: source where m24 has a 1 bit, else destination.
    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd     m17, m1, m24, 0xd8
    mova [t3+r10*4+416*4+ 8], m16
    mova [t3+r10*4+416*4+ 24], xm17
    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
    mova [t3+r10*4+416*4+ 72], m17
    vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
    add            r10, 32
    jl .v0_loop
    ret
; .v1: odd-row pass without new input.
;   Uses the t1 row sums plus the 5x5 row sums saved at t3+416*0 / t3+416*8
;   in place of a freshly filtered row, then produces both packed coefficient
;   sets: "a3 | (b3 << 12)" -> t3+416*8, "a5 | (b5 << 12)" -> t3+416*0.
;   Clobbers r10, k3, k4, m0-m3, m16-m19.
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-2]
.v1_loop:
    mova            m0, [t1+r10*2+416* 8]
    paddd          m16, m0, [t2+r10*2+416* 8]
    mova            m1, [t1+r10*2+416*10]
    paddd          m17, m1, [t2+r10*2+416*10]
    mova            m2, [t3+r10*4+416*0+ 8]
    paddd          m18, m2, [t2+r10*2+416* 2]
    mova            m3, [t3+r10*4+416*0+72]
    paddd          m19, m3, [t2+r10*2+416* 4]
    paddd          m18, [t1+r10*2+416* 2]
    paddd          m19, [t1+r10*2+416* 4]
    mova [t2+r10*2+416* 8], m0
    mova [t2+r10*2+416*10], m1
    mova [t2+r10*2+416* 2], m2
    mova [t2+r10*2+416* 4], m3
    pmulld         m16, m9 ; -a3 * 9
    pmulld         m17, m9
    pmulld         m18, m10 ; -a5 * 25
    pmulld         m19, m10
    mova            m0, [t1+r10*2+416* 6]
    paddw           m1, m0, [t2+r10*2+416* 6]
    mova            m2, [t3+r10*4+416*8+ 8]
    paddw           m3, m2, [t2+r10*2+416*0]
    paddw           m3, [t1+r10*2+416*0]
    mova [t2+r10*2+416* 6], m0
    mova [t2+r10*2+416*0], m2
    punpcklwd       m0, m4, m1 ; b3
    vpdpwssd       m16, m0, m0 ; -p3
    punpckhwd       m1, m4, m1
    vpdpwssd       m17, m1, m1
    punpcklwd       m2, m3, m4 ; b5
    vpdpwssd       m18, m2, m2 ; -p5
    punpckhwd       m3, m4
    vpdpwssd       m19, m3, m3
    pmulld         m16, m12 ; p3 * s1
    pmulld         m17, m12
    pmulld         m18, m11 ; p5 * s0
    pmulld         m19, m11
    pmaddwd         m0, m13 ; b3 * 455
    pmaddwd         m1, m13
    pmaddwd         m2, m13 ; b5 * 164
    pmaddwd         m3, m13
    vpalignr       m17{k2}, m16, m16, 2
    vpalignr       m19{k2}, m18, m18, 2
    paddusw        m17, m14
    mova           m16, m22
    psraw          m17, 4 ; min(z3, 255) - 256
    vpermt2b       m16, m17, m23 ; sgr_x_by_x[128..255]
    vpmovb2m        k3, m17
    vpermi2b       m17, m20, m21 ; sgr_x_by_x[ 0..127]
    paddusw        m19, m14
    mova           m18, m22
    psraw          m19, 4 ; min(z5, 255) - 256
    vpermt2b       m18, m19, m23 ; sgr_x_by_x[128..255]
    vpmovb2m        k4, m19
    vpermi2b       m19, m20, m21 ; sgr_x_by_x[ 0..127]
    vmovdqu8       m17{k3}, m16 ; x3
    vmovdqu8       m19{k4}, m18 ; x5
    pandn          m16, m24, m17
    psrld          m17, 16
    pmulld          m0, m16
    pmulld          m1, m17
    pandn          m18, m24, m19
    psrld          m19, m19, 16
    pmulld          m2, m18
    pmulld          m3, m19
    paddd           m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m15
    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
    vpternlogd     m17, m1, m24, 0xd8
    mova [t3+r10*4+416*8+ 8], m16
    mova [t3+r10*4+416*8+ 24], xm17
    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
    paddd           m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m3, m15
    mova [t3+r10*4+416*8+ 72], m17
    vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
    vpternlogd     m19, m3, m24, 0xd8
    mova [t3+r10*4+416*0+ 8], m18
    mova [t3+r10*4+416*0+ 24], xm19
    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
    mova [t3+r10*4+416*0+ 72], m19
; ---- end of .v1_loop (the loop head lies above this point) ----
; Last two stores of the iteration, the loop back-edge, then t1 and t2 are
; exchanged (r10 is reused as the temporary once the loop has finished).
    vextracti128  [t3+r10*4+416*0+ 72], ym18, 1
    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret

; .prep_n: initial neighbor setup.
; Walks r10 from wq upwards in steps of 16 dwords while it stays negative
; (wq is therefore expected to be negative on entry -- it is set up outside
; this excerpt). For every dword the left/centre/right neighbours (byte
; offsets 0/4/8) of three t3 rows are combined:
;   row 416*0 : 5*l + 6*c + 5*r, split with m24 into a5 -> 416*12, b5 -> 416*16
;   row 416*4 : 3*l + 4*c + 3*r                         -> 416*24
;   row 416*8 : 2*l + 2*c + 2*r -> 416*20, 3*l + 4*c + 3*r -> 416*28
; m24 is loaded outside this excerpt; presumably it masks the upper (b) bits
; of the packed "a | (b << 12)" dwords -- confirm against the function head.
; Clobbers: r10, m0-m3, m16, m17, flags.
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t3+r10*4+416*0+4]     ; centre
    paddd           m1, m0, [t3+r10*4+416*0+0] ; centre + left
    mova           m16, [t3+r10*4+416*4+0]
    paddd           m1, [t3+r10*4+416*0+8]     ; l + c + r
    mova           m17, [t3+r10*4+416*8+0]
    paddd          m16, [t3+r10*4+416*4+8]     ; l + r
    paddd          m17, [t3+r10*4+416*8+8]     ; l + r
    paddd           m2, m16, [t3+r10*4+416*4+4] ; l + c + r
    paddd           m3, m17, [t3+r10*4+416*8+4] ; l + c + r
    paddd           m0, m1                     ; c + (l + c + r)
    pslld           m1, 2
    pslld           m2, 2
    paddd           m1, m0                     ; ab5 565
    paddd           m3, m3                     ; ab3[ 0] 222
    psubd           m2, m16                    ; ab3[-1] 343
    mova [t3+r10*4+416*20], m3
    pandn           m0, m24, m1                ; a5 565
    mova [t3+r10*4+416*24], m2
    psrld           m1, 12                     ; b5 565
    mova [t3+r10*4+416*12], m0
    paddd           m3, m3
    mova [t3+r10*4+416*16], m1
    psubd           m3, m17                    ; ab3[ 0] 343
    mova [t3+r10*4+416*28], m3
    add            r10, 16
    jl .prep_n_loop
    ret

; .n0: neighbor + output (even rows).
; Per group of 16 pixels:
;   * builds the 5,6,5 sum of row 416*0, splits it into a5/b5, adds the halves
;     saved at 416*12/416*16 and overwrites those slots with the new halves;
;   * builds the 2,2,2 and 3,4,3 sums of row 416*4, pairs them with the values
;     saved at 416*20/416*24 and overwrites those slots with the new sums;
;   * multiplies the a terms by the 16 pixels read from [dstq+r10], subtracts
;     from the b terms, merges both results under k2, accumulates the dot
;     product with m26 on top of the shifted pixel, and stores 16 bytes back
;     to [dstq+r10].
; On exit dstq has been advanced by strideq.
; m24-m27 and k2 are set up outside this excerpt.
; Clobbers: r10, m0-m3, m16, m17, flags.
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m2, [t3+r10*4+4]           ; centre
    paddd           m3, m2, [t3+r10*4+0]
    paddd           m3, [t3+r10*4+8]           ; l + c + r
    mova            m1, [t3+r10*4+416*4+0]
    paddd           m2, m3
    pslld           m3, 2
    paddd           m1, [t3+r10*4+416*4+8]     ; l + r
    paddd           m3, m2                     ; 5*l + 6*c + 5*r
    pandn           m2, m24, m3
    psrld           m3, 12
    paddd           m0, m2, [t3+r10*4+416*12]  ; a5
    paddd          m16, m3, [t3+r10*4+416*16]  ; b5 + (1 << 8)
    mova [t3+r10*4+416*12], m2
    mova [t3+r10*4+416*16], m3
    paddd           m2, m1, [t3+r10*4+416*4+4]
    paddd           m2, m2                     ; ab3[ 1] 222
    mova            m3, [t3+r10*4+416*20]
    paddd          m17, m3, [t3+r10*4+416*24]  ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+416*20], m2
    paddd           m2, m2
    psubd           m2, m1                     ; ab3[ 1] 343
    mova [t3+r10*4+416*24], m2
    paddd           m2, m3                     ; ab3[ 0] 222 + ab3[ 1] 343
    pandn           m1, m24, m17
    psrld          m17, 12
    pandn           m3, m24, m2
    psrld           m2, 12
    paddd           m1, m3                     ; a3
    pmovzxbd        m3, [dstq+r10]
    paddd          m17, m2                     ; b3 + (1 << 8)
    pmaddwd         m0, m3                     ; a5 * src
    pmaddwd         m1, m3                     ; a3 * src
    vpshldd         m3, m25, 16                ; (dst << 16) + (1 << 15)
    psubd          m16, m0                     ; b5 - a5 * src + (1 << 8)
    psubd          m17, m1                     ; b3 - a3 * src + (1 << 8)
    psrld          m16, 9
    pslld          m17, 7
    vmovdqu8   m17{k2}, m16                    ; merge the b5 term into the k2-selected bytes
    vpdpwssd        m3, m17, m26
    ; NOTE(review): .n1 packs with m3 as the second source, this routine with
    ; m2. The second source only fills the upper 8 bytes of each 128-bit lane;
    ; presumably the m27 permutation never selects them -- confirm.
    packuswb        m3, m2
    vpermb         m16, m27, m3
    mova    [dstq+r10], xm16
    add            r10, 16
    jl .n0_loop
    add           dstq, strideq
    ret

; .n1: neighbor + output (odd rows).
; Same output stage as .n0, but the 3x3 sums come from row 416*8 and are
; paired with / saved to 416*20 and 416*28, while the a5/b5 terms are read
; unchanged from 416*12/416*16 (no new 5,6,5 sum is built and those slots are
; not written). The b5 term is moved into place with a masked one-byte
; palignr instead of the psrld 9 + byte merge used in .n0.
; On exit dstq has been advanced by strideq.
; m24-m27 and k2 are set up outside this excerpt.
; Clobbers: r10, m0-m3, m16, m17, flags.
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m1, [t3+r10*4+416*8+0]
    paddd           m1, [t3+r10*4+416*8+8]     ; l + r
    paddd           m2, m1, [t3+r10*4+416*8+4] ; l + c + r
    paddd           m2, m2                     ; ab3[ 1] 222
    mova            m0, [t3+r10*4+416*20]
    paddd          m17, m0, [t3+r10*4+416*28]  ; ab3[ 0] 222 + ab3[-1] 343
    pmovzxbd        m3, [dstq+r10]
    mova [t3+r10*4+416*20], m2
    paddd           m2, m2
    psubd           m2, m1                     ; ab3[ 1] 343
    mova [t3+r10*4+416*28], m2
    paddd           m0, m2                     ; ab3[ 0] 222 + ab3[ 1] 343
    pandn           m1, m24, m17
    psrld          m17, 12
    pandn           m2, m24, m0
    psrld           m0, 12
    paddd           m1, m2                     ; a3
    paddd          m17, m0                     ; b3 + (1 << 8)
    mova           m16, [t3+r10*4+416*16]      ; b5 + (1 << 7)
    pmaddwd         m1, m3                     ; a3 * src
    pmaddwd         m0, m3, [t3+r10*4+416*12]  ; a5 * src
    vpshldd         m3, m25, 16                ; (dst << 16) + (1 << 15)
    psubd          m17, m1                     ; b3 - a3 * src + (1 << 8)
    psubd          m16, m0                     ; b5 - a5 * src + (1 << 7)
    pslld          m17, 7
    palignr    m17{k2}, m16, m16, 1            ; k2-selected bytes <- m16 rotated right by one byte per lane
    vpdpwssd        m3, m17, m26
    packuswb        m3, m3
    vpermb         m16, m27, m3
    mova    [dstq+r10], xm16
    add            r10, 16
    jl .n1_loop
    add           dstq, strideq
    ret

%endif ; ARCH_X86_64