;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;   Function prologue shared by the routines in this file. Binds the
;   symbolic names input / output / pitch to registers for the active ABI:
;
;     ABI              input   output  pitch
;     32-bit           rsi     rdi     rax     (loaded from arg(0..2))
;     Win64            rcx     rdx     r8      (argument registers)
;     other 64-bit     rdi     rsi     rdx     (argument registers)
;
;   pitch is declared as `int` by the callers' C prototypes. It is used as
;   a 64-bit address component, so every path sign-extends it first.
;   GET_GOT, arg() and SAVE_XMM come from x86_abi_support.asm.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
    ; NOTE(review): rcx is not referenced by either routine in this file,
    ; so this lea looks vestigial -- confirm before removing.
    lea         rcx, [rsi + rax*2]
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    ; The 8x4 routine writes xmm6, which is callee-saved on Win64.
    SAVE_XMM 7, u
    ; int argument: upper 32 bits of r8 are not guaranteed by the ABI.
    movsxd      r8, r8d
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    ; int argument: upper 32 bits of rdx are not guaranteed by the ABI.
    movsxd      rdx, edx
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;   Matching epilogue: clears the input / output / pitch aliases, undoes
;   whatever STACK_FRAME_CREATE pushed or saved for the active ABI, and
;   returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

SECTION .text

;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;-----------------------------------------------------------------------
; Two-pass 4x4 transform on 16-bit samples (a forward DCT, going by the
; routine name).
;   input  : four rows of four words, read at input + n*pitch, n = 0..3
;            (pitch is applied as a byte offset)
;   output : 16 words stored with movdqa, so output must be 16-byte aligned
;   uses   : xmm0-xmm5
; Lane comments in this routine list words from most- to least-significant.
;-----------------------------------------------------------------------
globalsym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    ; pair up columns 0/3 and 1/2 of every row so one paddw/psubw
    ; produces a1, b1, c1, d1 for all four rows
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    ; first pass outputs: pmaddwd forms each sum/difference as a dword
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; 23 22 21 20 03 02 01 00
    ;
    ; 33 32 31 30 13 12 11 10
    ;
    ; transpose back to row order for the second pass
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 04eh
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;all-ones where word == 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper (c1) half,
                                                     ;low half = (d1 != 0)

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;void
;     vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;-----------------------------------------------------------------------
; Applies the two-pass 4x4 transform to two horizontally adjacent 4x4
; blocks at once (eight 16-bit samples per row, four rows).
;   input  : rows read with movdqa at input + n*pitch, n = 0..3, so every
;            row address must be 16-byte aligned (pitch is a byte offset)
;   output : 64 bytes stored with movdqa (16-byte aligned); bytes 0..31
;            hold the left block's 16 coefficients, bytes 32..63 the right
;   uses   : xmm0-xmm6
; Lane comments in this routine list words from least- to most-significant.
;-----------------------------------------------------------------------
globalsym(vp8_short_fdct8x4_sse2)
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; register -> column of each 4x4 block
        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,       3
        psllw       xmm4,       3

        psllw       xmm0,       3
        psllw       xmm1,       3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; first pass done
        ; transpose for the second stage
        movdqa      xmm4,       xmm0        ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2        ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1        ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1        ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3        ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3        ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0        ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2        ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2        ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4        ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5        ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5        ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1        ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4        ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4        ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0        ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2        ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2        ; 10 11 12 13 14 15 16 17

        ; register -> row
        ; xmm0 0
        ; xmm4 1
        ; xmm1 2
        ; xmm3 3

        movdqa      xmm5,       xmm0
        movdqa      xmm2,       xmm1        ; keep row 2 for the c1 subtraction

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; all-ones where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert and keep bit 0:
                                                                    ; each word = (d1 != 0)

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; regroup so each 4x4 block's coefficients are contiguous
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1        ; left  block: op[0..3]  op[4..7]
        punpckhqdq  xmm4,       xmm1        ; right block: op[0..3]  op[4..7]

        punpcklqdq  xmm2,       xmm3        ; left  block: op[8..11] op[12..15]
        punpckhqdq  xmm5,       xmm3        ; right block: op[8..11] op[12..15]

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

; 16-byte aligned constant vectors used as memory operands above.
SECTION_RODATA
align 16
_5352_2217:                     ; pmaddwd weights for (d1, c1) word pairs
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 16
_2217_neg5352:                  ; pmaddwd weights for (d1, c1) word pairs
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
align 16
_mult_add:                      ; pmaddwd by all ones: sum of each word pair
    times 8 dw 1
align 16
_cmp_mask:                      ; bit 0 kept in the low four words only
    times 4 dw 1
    times 4 dw 0
align 16
_cmp_mask8x4:                   ; bit 0 kept in all eight words
    times 8 dw 1
align 16
_mult_sub:                      ; pmaddwd by (1, -1): difference of each word pair
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
align 16
_7:                             ; dword rounding term before >>4
    times 4 dd 7
align 16
_7w:                            ; word rounding term before >>4
    times 8 dw 7
align 16
_14500:                         ; dword rounding terms before >>12
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:                         ; dword rounding terms before >>16
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000