;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Blends a 16x16 block of src into dst, in place. For every byte:
;     dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
; where 16 and 8 are the word constants stored at tMFQE and tMFQE_round.
;
; Rows are loaded and stored with movdqa, so every src row and every dst
; row must be 16-byte aligned.
; NOTE(review): presumably 0 <= src_weight <= 16; outside that range the
; 16-bit products can wrap. Confirm against the callers.
globalsym(vp8_filter_by_weight16x16_sse2)
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6                              ; xmm6 is used below as the zero register
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = tMFQE - src_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count (rows)
    pxor        xmm6, xmm6                  ; zero, for byte -> word unpacking

.combine:
    movdqa      xmm2, [rax]                 ; 16 src bytes (aligned load)
    movdqa      xmm4, [rdx]                 ; 16 dst bytes (aligned load)
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6                  ; low 8 bytes  -> words
    punpckhbw   xmm3, xmm6                  ; high 8 bytes -> words
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3                  ; words -> 16 bytes
    movdqa      [rdx], xmm2                 ; write the blended row back to dst
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; 8x8 variant of the weighted blend. For every byte:
;     dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
; Each row is 8 bytes, moved with movq, so no 16-byte alignment of src or
; dst is required. Only xmm0-xmm4 are used, hence no SAVE_XMM here.
globalsym(vp8_filter_by_weight8x8_sse2)
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = tMFQE - src_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count (rows)
    pxor        xmm4, xmm4                  ; zero, for unpacking and packing

.combine:
    movq        xmm2, [rax]                 ; 8 src bytes
    movq        xmm3, [rdx]                 ; 8 dst bytes
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4                  ; result bytes land in the low qword
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2       | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
;
; Over a 16x16 block, computes and stores:
;     *sad      = (SAD(src1, src2) + 128) >> 8
;     *variance = (SUM(src2^2) - (SUM(src2)^2 >> 8) + 128) >> 8
; Rows are read with unaligned loads, so src1 and src2 may have any
; alignment.
globalsym(vp8_variance_and_sad_16x16_sse2)
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so the rows are
    ; fetched with movdqu. (movdqa faults on a misaligned address.)
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1                  ; one partial sum per 8-byte half
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0                  ; squares, pairwise summed to dwords
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8                     ; bring the high-half sum down
    paddusw     xmm0, xmm3                  ; low word = total SAD
    paddd       xmm0, [GLOBAL(t128)]        ; (sad + 128) >> 8
    psrld       xmm0, 8

    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8                     ; low dword = SUM(src2)^2 >> 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; low dword = SUM(src2^2)

    psubd       xmm1, xmm0                  ; SUM(src2^2) - (SUM(src2)^2 >> 8)

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)

    movd        [rax], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA

; Fixed-point precision of the MFQE blend weights: weights are expressed
; in units of 1 / (1 << MFQE_PRECISION).
%define MFQE_PRECISION 4

; Rounding term for the ">> 8" normalisations: the value 128 in the lowest
; dword of a 16-byte constant, every other dword zero.
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif

; Full weight, one copy per word lane (eight words of 16).
align 16
tMFQE:
    times 8 dw (1 << MFQE_PRECISION)

; Rounding term for the ">> MFQE_PRECISION" shift (eight words of 8).
align 16
tMFQE_round:
    times 8 dw (1 << (MFQE_PRECISION - 1))