;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only reconstruction of two 4x4 blocks that sit side by side in an
; 8x4 pixel area.
;
;   qcoeff      - word 0 is the DC of block 0, word 16 (byte offset 32) is
;                 the DC of block 1.  The first 4 bytes at each of those two
;                 offsets are overwritten with zero before returning.
;   dequant     - word 0 is the factor applied to both DC values.
;   dst         - four rows of 8 bytes, updated in place.
;   dst_stride  - byte distance between consecutive rows (signed int).
;
; Per block: dc = (qcoeff_dc * dequant[0] + fours) >> 3 using 16-bit
; arithmetic, added to every pixel of that block's 4x4 half and saturated
; to 0..255 by packuswb.  `fours` is a constant defined elsewhere in this
; file (presumably eight words of 4 - confirm at its definition).

SECTION .text

globalsym(vp8_idct_dequant_0_2x_sse2)
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    mov         rax,            arg(0)              ; qcoeff
    mov         rdx,            arg(1)              ; dequant

    ; xmm0 stays zero: used to clear coeffs and as unpack/pack filler
    pxor        xmm0,           xmm0

    ; xmm1: words 0-1 <- qcoeff[0..1], word 4 <- qcoeff[16]
    movd        xmm1,           [rax]
    pinsrw      xmm1,           [rax+32],       4

    ; xmm2: words 0-1 <- dequant[0..1], word 4 <- dequant[0]
    movd        xmm2,           [rdx]
    pinsrw      xmm2,           [rdx],          4

    ; word 0 = block 0 DC, word 4 = block 1 DC (word 1 is discarded below)
    pmullw      xmm1,           xmm2

    ; clear coeffs (4 bytes at the start of each block)
    movd        [rax],          xmm0
    movd        [rax+32],       xmm0

    mov         rax,            arg(2)              ; dst
    movsxd      rdx,            dword ptr arg(3)    ; dst_stride
    lea         rcx,            [rdx + rdx*2]       ; dst_stride * 3

    ; broadcast word 0 over the low four words, word 4 over the high four
    pshuflw     xmm1,           xmm1,           00000000b
    pshufhw     xmm1,           xmm1,           00000000b

    ; round and downshift
    paddw       xmm1,           [GLOBAL(fours)]
    psraw       xmm1,           3

    ; load the four predictor rows
    movq        xmm2,           [rax]
    movq        xmm3,           [rax+rdx]
    movq        xmm4,           [rax+2*rdx]
    movq        xmm5,           [rax+rcx]

    ; widen bytes to words
    punpcklbw   xmm2,           xmm0
    punpcklbw   xmm3,           xmm0
    punpcklbw   xmm4,           xmm0
    punpcklbw   xmm5,           xmm0

    ; add the per-block DC to every pixel
    paddw       xmm2,           xmm1
    paddw       xmm3,           xmm1
    paddw       xmm4,           xmm1
    paddw       xmm5,           xmm1

    ; narrow back to bytes with unsigned saturation
    packuswb    xmm2,           xmm0
    packuswb    xmm3,           xmm0
    packuswb    xmm4,           xmm0
    packuswb    xmm5,           xmm0

    ; store the rows back out
    movq        [rax],          xmm2
    movq        [rax+rdx],      xmm3
    movq        [rax+2*rdx],    xmm4
    movq        [rax+rcx],      xmm5

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Dequantize two 4x4 coefficient blocks, run both through the two-pass
; 4x4 inverse transform in parallel, and add the result to an 8x4 pixel
; area in place.
;
;   qcoeff      - 32 shorts: block 0 at byte offset 0, block 1 at byte
;                 offset 32.  Accessed with movdqa, so the pointer must be
;                 16-byte aligned.  All 64 bytes are zeroed by this routine.
;   dequant     - 16 shorts, used as an aligned memory operand of pmullw;
;                 the same factors are applied to both blocks.
;   dst         - four rows of 8 bytes, read, updated and written back.
;                 Following the layout comments in the transposes, block 0
;                 lands in bytes 0-3 of each row and block 1 in bytes 4-7.
;   dst_stride  - byte distance between consecutive rows (signed int).
;
; Rounding: `fours` is added before the final arithmetic shift right by 3;
; results are saturated to 0..255 by packuswb.  `fours`, `x_s1sqr2` and
; `x_c1sqr2less1` are constants defined elsewhere in this file.
globalsym(vp8_idct_dequant_full_2x_sse2)
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi                                 ; rdi holds dst below
    ; end prolog

    mov         rax,            arg(0)              ; qcoeff
    mov         rdx,            arg(1)              ; dequant
    mov         rdi,            arg(2)              ; dst

    ; Zero out xmm7, used here to clear the coefficients
    pxor        xmm7,           xmm7

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0,           [rax]               ; block 0, coeffs 0..7
    movdqa      xmm2,           [rax+16]            ; block 0, coeffs 8..15
    movdqa      xmm1,           [rax+32]            ; block 1, coeffs 0..7
    movdqa      xmm3,           [rax+48]            ; block 1, coeffs 8..15

    ; Clear out coeffs
    movdqa      [rax],          xmm7
    movdqa      [rax+16],       xmm7
    movdqa      [rax+32],       xmm7
    movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0,           [rdx]
    pmullw      xmm2,           [rdx+16]
    pmullw      xmm1,           [rdx]
    pmullw      xmm3,           [rdx+16]
    movsxd      rdx,            dword ptr arg(3)    ; dst_stride (rdx no longer points at dequant)

    ; repack so block 0 row x and block 1 row x are together
    ; (low four words: block 0, high four words: block 1)
    movdqa      xmm4,           xmm0
    punpckldq   xmm0,           xmm1
    punpckhdq   xmm4,           xmm1

    pshufd      xmm0,           xmm0,           11011000b   ; row 0 of both blocks
    pshufd      xmm1,           xmm4,           11011000b   ; row 1 of both blocks

    movdqa      xmm4,           xmm2
    punpckldq   xmm2,           xmm3
    punpckhdq   xmm4,           xmm3

    pshufd      xmm2,           xmm2,           11011000b   ; row 2 of both blocks
    pshufd      xmm3,           xmm4,           11011000b   ; row 3 of both blocks

    ; first pass
    psubw       xmm0,           xmm2                ; b1 = 0-2
    paddw       xmm2,           xmm2                ;

    movdqa      xmm5,           xmm1
    paddw       xmm2,           xmm0                ; a1 = 0+2

    pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    lea         rcx,            [rdx + rdx*2]       ; dst_stride * 3
    paddw       xmm5,           xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,           xmm3
    pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,           xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,           xmm5                ; c1 = ip3 term - ip1 term

    movdqa      xmm5,           xmm1
    movdqa      xmm4,           xmm3

    pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,           xmm1                ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    paddw       xmm3,           xmm4                ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,           xmm5                ; d1
    movdqa      xmm6,           xmm2                ; a1

    movdqa      xmm4,           xmm0                ; b1
    paddw       xmm2,           xmm3                ; a1 + d1 -> elements 000..003

    paddw       xmm4,           xmm7                ; b1 + c1 -> elements 008..011
    psubw       xmm0,           xmm7                ; b1 - c1 -> elements 004..007

    psubw       xmm6,           xmm3                ; a1 - d1 -> elements 012..015

    ; transpose for the second pass
    ; (layout comments list words high -> low; 0xx = block 0, 1xx = block 1)
    movdqa      xmm7,           xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,           xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,           xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,           xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,           xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,           xmm6                ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,           xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,           xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,           xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,           xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,           xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,           xmm5                ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,           xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,           xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,           xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,           xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,           xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,           xmm6                ; 115 111 015 011 107 103 007 003

    ; regroup dwords so each register is block 0 (low half) | block 1 (high half)
    pshufd      xmm0,           xmm2,           11011000b
    pshufd      xmm2,           xmm1,           11011000b

    pshufd      xmm1,           xmm5,           11011000b
    pshufd      xmm3,           xmm7,           11011000b

    ; second pass
    psubw       xmm0,           xmm2                ; b1 = 0-2
    paddw       xmm2,           xmm2

    movdqa      xmm5,           xmm1
    paddw       xmm2,           xmm0                ; a1 = 0+2

    pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    paddw       xmm5,           xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,           xmm3
    pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,           xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,           xmm5                ; c1 = ip3 term - ip1 term

    movdqa      xmm5,           xmm1
    movdqa      xmm4,           xmm3

    pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,           xmm1                ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    paddw       xmm3,           xmm4                ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,           xmm5                ; d1

    ; add the rounding term to a1 and b1 so all four outputs carry it
    paddw       xmm0,           [GLOBAL(fours)]

    paddw       xmm2,           [GLOBAL(fours)]
    movdqa      xmm6,           xmm2                ; a1

    movdqa      xmm4,           xmm0                ; b1
    paddw       xmm2,           xmm3                ; a1 + d1 -> elements 000..003

    paddw       xmm4,           xmm7                ; b1 + c1 -> elements 008..011
    psubw       xmm0,           xmm7                ; b1 - c1 -> elements 004..007

    psubw       xmm6,           xmm3                ; a1 - d1 -> elements 012..015
    psraw       xmm2,           3

    psraw       xmm0,           3
    psraw       xmm4,           3

    psraw       xmm6,           3

    ; transpose to save
    movdqa      xmm7,           xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,           xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,           xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,           xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,           xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,           xmm6                ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,           xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,           xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,           xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,           xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,           xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,           xmm5                ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,           xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,           xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,           xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,           xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,           xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,           xmm6                ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,           xmm2,           11011000b
    pshufd      xmm2,           xmm1,           11011000b

    pshufd      xmm1,           xmm5,           11011000b
    pshufd      xmm3,           xmm7,           11011000b

    ; xmm7 was used as scratch above; zero it again for unpacking/packing
    pxor        xmm7,           xmm7

    ; Load up predict blocks and add them to the residual rows
    movq        xmm4,           [rdi]
    movq        xmm5,           [rdi+rdx]

    punpcklbw   xmm4,           xmm7
    punpcklbw   xmm5,           xmm7

    paddw       xmm0,           xmm4
    paddw       xmm1,           xmm5

    movq        xmm4,           [rdi+2*rdx]
    movq        xmm5,           [rdi+rcx]

    punpcklbw   xmm4,           xmm7
    punpcklbw   xmm5,           xmm7

    paddw       xmm2,           xmm4
    paddw       xmm3,           xmm5

    ; pack up before storing
    packuswb    xmm0,           xmm7
    packuswb    xmm1,           xmm7
    packuswb    xmm2,           xmm7
    packuswb    xmm3,           xmm7

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1
    movq        [rdi + rdx*2],  xmm2
    movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only reconstruction of two side-by-side 4x4 blocks where the DC
; values are supplied separately through `dc`.
;
;   qcoeff, dequant - not read by this routine.
;   dst             - four rows of 8 bytes, updated in place.
;   dst_stride      - byte distance between consecutive rows (signed int).
;   dc              - two shorts: dc[0] is added to bytes 0-3 of each row,
;                     dc[1] to bytes 4-7.  No dequant multiply is applied
;                     here (presumably the values arrive already
;                     dequantized - confirm against the caller).
;
; Each dc value has `fours` added and is shifted right by 3 before being
; added to the pixels; results are saturated to 0..255 by packuswb.
globalsym(vp8_idct_dequant_dc_0_2x_sse2)
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
    mov         rdi,            arg(2)              ; dst
    mov         rdx,            arg(4)              ; dc

    ; Zero out xmm5, for use unpacking
    pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4,           [rdx]

    movsxd      rdx,            dword ptr arg(3)    ; dst_stride (rdx no longer points at dc)
    lea         rcx,            [rdx + rdx*2]       ; dst_stride * 3

    ; Load up predict blocks
    movq        xmm0,           [rdi]
    movq        xmm1,           [rdi+rdx*1]
    movq        xmm2,           [rdi+rdx*2]
    movq        xmm3,           [rdi+rcx]

    ; Duplicate and expand dc across:
    ; dc[0] fills the low four words, dc[1] the high four words
    punpcklwd   xmm4,           xmm4
    punpckldq   xmm4,           xmm4

    ; Round and downshift
    paddw       xmm4,           [GLOBAL(fours)]
    psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0,           xmm5
    punpcklbw   xmm1,           xmm5
    punpcklbw   xmm2,           xmm5
    punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
    paddw       xmm0,           xmm4
    paddw       xmm1,           xmm4
    paddw       xmm2,           xmm4
    paddw       xmm3,           xmm4

    ; pack up before storing
    packuswb    xmm0,           xmm5
    packuswb    xmm1,           xmm5
    packuswb    xmm2,           xmm5
    packuswb    xmm3,           xmm5

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1
    movq        [rdi + rdx*2],  xmm2
    movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
Build Coastguard Worker; short *dequant - 1 435*fb1b10abSAndroid Build Coastguard Worker; unsigned char *dst - 2 436*fb1b10abSAndroid Build Coastguard Worker; int dst_stride - 3 437*fb1b10abSAndroid Build Coastguard Worker; short *dc - 4 438*fb1b10abSAndroid Build Coastguard Worker; ) 439*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_idct_dequant_dc_full_2x_sse2) 440*fb1b10abSAndroid Build Coastguard Workersym(vp8_idct_dequant_dc_full_2x_sse2): 441*fb1b10abSAndroid Build Coastguard Worker push rbp 442*fb1b10abSAndroid Build Coastguard Worker mov rbp, rsp 443*fb1b10abSAndroid Build Coastguard Worker SHADOW_ARGS_TO_STACK 5 444*fb1b10abSAndroid Build Coastguard Worker SAVE_XMM 7 445*fb1b10abSAndroid Build Coastguard Worker GET_GOT rbx 446*fb1b10abSAndroid Build Coastguard Worker push rdi 447*fb1b10abSAndroid Build Coastguard Worker ; end prolog 448*fb1b10abSAndroid Build Coastguard Worker 449*fb1b10abSAndroid Build Coastguard Worker ; special case when 2 blocks have 0 or 1 coeffs 450*fb1b10abSAndroid Build Coastguard Worker ; dc is set as first coeff, so no need to load qcoeff 451*fb1b10abSAndroid Build Coastguard Worker mov rax, arg(0) ; qcoeff 452*fb1b10abSAndroid Build Coastguard Worker mov rdx, arg(1) ; dequant 453*fb1b10abSAndroid Build Coastguard Worker 454*fb1b10abSAndroid Build Coastguard Worker mov rdi, arg(2) ; dst 455*fb1b10abSAndroid Build Coastguard Worker 456*fb1b10abSAndroid Build Coastguard Worker ; Zero out xmm7, for use unpacking 457*fb1b10abSAndroid Build Coastguard Worker pxor xmm7, xmm7 458*fb1b10abSAndroid Build Coastguard Worker 459*fb1b10abSAndroid Build Coastguard Worker 460*fb1b10abSAndroid Build Coastguard Worker ; note the transpose of xmm1 and xmm2, necessary for shuffle 461*fb1b10abSAndroid Build Coastguard Worker ; to spit out sensicle data 462*fb1b10abSAndroid Build Coastguard Worker movdqa xmm0, [rax] 463*fb1b10abSAndroid Build Coastguard Worker movdqa xmm2, [rax+16] 464*fb1b10abSAndroid Build Coastguard Worker movdqa xmm1, 
[rax+32] 465*fb1b10abSAndroid Build Coastguard Worker movdqa xmm3, [rax+48] 466*fb1b10abSAndroid Build Coastguard Worker 467*fb1b10abSAndroid Build Coastguard Worker ; Clear out coeffs 468*fb1b10abSAndroid Build Coastguard Worker movdqa [rax], xmm7 469*fb1b10abSAndroid Build Coastguard Worker movdqa [rax+16], xmm7 470*fb1b10abSAndroid Build Coastguard Worker movdqa [rax+32], xmm7 471*fb1b10abSAndroid Build Coastguard Worker movdqa [rax+48], xmm7 472*fb1b10abSAndroid Build Coastguard Worker 473*fb1b10abSAndroid Build Coastguard Worker ; dequantize qcoeff buffer 474*fb1b10abSAndroid Build Coastguard Worker pmullw xmm0, [rdx] 475*fb1b10abSAndroid Build Coastguard Worker pmullw xmm2, [rdx+16] 476*fb1b10abSAndroid Build Coastguard Worker pmullw xmm1, [rdx] 477*fb1b10abSAndroid Build Coastguard Worker pmullw xmm3, [rdx+16] 478*fb1b10abSAndroid Build Coastguard Worker 479*fb1b10abSAndroid Build Coastguard Worker ; DC component 480*fb1b10abSAndroid Build Coastguard Worker mov rdx, arg(4) 481*fb1b10abSAndroid Build Coastguard Worker 482*fb1b10abSAndroid Build Coastguard Worker ; repack so block 0 row x and block 1 row x are together 483*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, xmm0 484*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm0, xmm1 485*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm4, xmm1 486*fb1b10abSAndroid Build Coastguard Worker 487*fb1b10abSAndroid Build Coastguard Worker pshufd xmm0, xmm0, 11011000b 488*fb1b10abSAndroid Build Coastguard Worker pshufd xmm1, xmm4, 11011000b 489*fb1b10abSAndroid Build Coastguard Worker 490*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, xmm2 491*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm2, xmm3 492*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm4, xmm3 493*fb1b10abSAndroid Build Coastguard Worker 494*fb1b10abSAndroid Build Coastguard Worker pshufd xmm2, xmm2, 11011000b 495*fb1b10abSAndroid Build Coastguard Worker pshufd xmm3, xmm4, 11011000b 496*fb1b10abSAndroid Build 
Coastguard Worker 497*fb1b10abSAndroid Build Coastguard Worker ; insert DC component 498*fb1b10abSAndroid Build Coastguard Worker pinsrw xmm0, [rdx], 0 499*fb1b10abSAndroid Build Coastguard Worker pinsrw xmm0, [rdx+2], 4 500*fb1b10abSAndroid Build Coastguard Worker 501*fb1b10abSAndroid Build Coastguard Worker ; first pass 502*fb1b10abSAndroid Build Coastguard Worker psubw xmm0, xmm2 ; b1 = 0-2 503*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, xmm2 ; 504*fb1b10abSAndroid Build Coastguard Worker 505*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm1 506*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, xmm0 ; a1 = 0+2 507*fb1b10abSAndroid Build Coastguard Worker 508*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm5, [GLOBAL(x_s1sqr2)] 509*fb1b10abSAndroid Build Coastguard Worker paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) 510*fb1b10abSAndroid Build Coastguard Worker 511*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, xmm3 512*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] 513*fb1b10abSAndroid Build Coastguard Worker 514*fb1b10abSAndroid Build Coastguard Worker paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) 515*fb1b10abSAndroid Build Coastguard Worker psubw xmm7, xmm5 ; c1 516*fb1b10abSAndroid Build Coastguard Worker 517*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm1 518*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, xmm3 519*fb1b10abSAndroid Build Coastguard Worker 520*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] 521*fb1b10abSAndroid Build Coastguard Worker paddw xmm5, xmm1 522*fb1b10abSAndroid Build Coastguard Worker 523*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm3, [GLOBAL(x_s1sqr2)] 524*fb1b10abSAndroid Build Coastguard Worker paddw xmm3, xmm4 525*fb1b10abSAndroid Build Coastguard Worker 526*fb1b10abSAndroid Build Coastguard Worker paddw xmm3, xmm5 ; d1 527*fb1b10abSAndroid Build Coastguard Worker movdqa xmm6, xmm2 ; a1 528*fb1b10abSAndroid Build 
Coastguard Worker 529*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, xmm0 ; b1 530*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, xmm3 ;0 531*fb1b10abSAndroid Build Coastguard Worker 532*fb1b10abSAndroid Build Coastguard Worker paddw xmm4, xmm7 ;1 533*fb1b10abSAndroid Build Coastguard Worker psubw xmm0, xmm7 ;2 534*fb1b10abSAndroid Build Coastguard Worker 535*fb1b10abSAndroid Build Coastguard Worker psubw xmm6, xmm3 ;3 536*fb1b10abSAndroid Build Coastguard Worker 537*fb1b10abSAndroid Build Coastguard Worker ; transpose for the second pass 538*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 539*fb1b10abSAndroid Build Coastguard Worker punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 540*fb1b10abSAndroid Build Coastguard Worker punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 541*fb1b10abSAndroid Build Coastguard Worker 542*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 543*fb1b10abSAndroid Build Coastguard Worker punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 544*fb1b10abSAndroid Build Coastguard Worker punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 545*fb1b10abSAndroid Build Coastguard Worker 546*fb1b10abSAndroid Build Coastguard Worker 547*fb1b10abSAndroid Build Coastguard Worker movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 548*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 549*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 550*fb1b10abSAndroid Build Coastguard Worker 551*fb1b10abSAndroid Build Coastguard Worker movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 552*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 553*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 554*fb1b10abSAndroid Build Coastguard Worker 555*fb1b10abSAndroid Build 
Coastguard Worker 556*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 557*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 558*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 559*fb1b10abSAndroid Build Coastguard Worker 560*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 561*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 562*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 563*fb1b10abSAndroid Build Coastguard Worker 564*fb1b10abSAndroid Build Coastguard Worker pshufd xmm0, xmm2, 11011000b 565*fb1b10abSAndroid Build Coastguard Worker pshufd xmm2, xmm1, 11011000b 566*fb1b10abSAndroid Build Coastguard Worker 567*fb1b10abSAndroid Build Coastguard Worker pshufd xmm1, xmm5, 11011000b 568*fb1b10abSAndroid Build Coastguard Worker pshufd xmm3, xmm7, 11011000b 569*fb1b10abSAndroid Build Coastguard Worker 570*fb1b10abSAndroid Build Coastguard Worker ; second pass 571*fb1b10abSAndroid Build Coastguard Worker psubw xmm0, xmm2 ; b1 = 0-2 572*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, xmm2 573*fb1b10abSAndroid Build Coastguard Worker 574*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm1 575*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, xmm0 ; a1 = 0+2 576*fb1b10abSAndroid Build Coastguard Worker 577*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm5, [GLOBAL(x_s1sqr2)] 578*fb1b10abSAndroid Build Coastguard Worker paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) 579*fb1b10abSAndroid Build Coastguard Worker 580*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, xmm3 581*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] 582*fb1b10abSAndroid Build Coastguard Worker 583*fb1b10abSAndroid Build Coastguard Worker paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) 
584*fb1b10abSAndroid Build Coastguard Worker psubw xmm7, xmm5 ; c1 585*fb1b10abSAndroid Build Coastguard Worker 586*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm1 587*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, xmm3 588*fb1b10abSAndroid Build Coastguard Worker 589*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] 590*fb1b10abSAndroid Build Coastguard Worker paddw xmm5, xmm1 591*fb1b10abSAndroid Build Coastguard Worker 592*fb1b10abSAndroid Build Coastguard Worker pmulhw xmm3, [GLOBAL(x_s1sqr2)] 593*fb1b10abSAndroid Build Coastguard Worker paddw xmm3, xmm4 594*fb1b10abSAndroid Build Coastguard Worker 595*fb1b10abSAndroid Build Coastguard Worker paddw xmm3, xmm5 ; d1 596*fb1b10abSAndroid Build Coastguard Worker paddw xmm0, [GLOBAL(fours)] 597*fb1b10abSAndroid Build Coastguard Worker 598*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, [GLOBAL(fours)] 599*fb1b10abSAndroid Build Coastguard Worker movdqa xmm6, xmm2 ; a1 600*fb1b10abSAndroid Build Coastguard Worker 601*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, xmm0 ; b1 602*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, xmm3 ;0 603*fb1b10abSAndroid Build Coastguard Worker 604*fb1b10abSAndroid Build Coastguard Worker paddw xmm4, xmm7 ;1 605*fb1b10abSAndroid Build Coastguard Worker psubw xmm0, xmm7 ;2 606*fb1b10abSAndroid Build Coastguard Worker 607*fb1b10abSAndroid Build Coastguard Worker psubw xmm6, xmm3 ;3 608*fb1b10abSAndroid Build Coastguard Worker psraw xmm2, 3 609*fb1b10abSAndroid Build Coastguard Worker 610*fb1b10abSAndroid Build Coastguard Worker psraw xmm0, 3 611*fb1b10abSAndroid Build Coastguard Worker psraw xmm4, 3 612*fb1b10abSAndroid Build Coastguard Worker 613*fb1b10abSAndroid Build Coastguard Worker psraw xmm6, 3 614*fb1b10abSAndroid Build Coastguard Worker 615*fb1b10abSAndroid Build Coastguard Worker ; transpose to save 616*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 617*fb1b10abSAndroid 
Build Coastguard Worker punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 618*fb1b10abSAndroid Build Coastguard Worker punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 619*fb1b10abSAndroid Build Coastguard Worker 620*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 621*fb1b10abSAndroid Build Coastguard Worker punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 622*fb1b10abSAndroid Build Coastguard Worker punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 623*fb1b10abSAndroid Build Coastguard Worker 624*fb1b10abSAndroid Build Coastguard Worker 625*fb1b10abSAndroid Build Coastguard Worker movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 626*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 627*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 628*fb1b10abSAndroid Build Coastguard Worker 629*fb1b10abSAndroid Build Coastguard Worker movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 630*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 631*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 632*fb1b10abSAndroid Build Coastguard Worker 633*fb1b10abSAndroid Build Coastguard Worker 634*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 635*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 636*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 637*fb1b10abSAndroid Build Coastguard Worker 638*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 639*fb1b10abSAndroid Build Coastguard Worker punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 640*fb1b10abSAndroid Build Coastguard Worker punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 641*fb1b10abSAndroid 
Build Coastguard Worker 642*fb1b10abSAndroid Build Coastguard Worker pshufd xmm0, xmm2, 11011000b 643*fb1b10abSAndroid Build Coastguard Worker pshufd xmm2, xmm1, 11011000b 644*fb1b10abSAndroid Build Coastguard Worker 645*fb1b10abSAndroid Build Coastguard Worker pshufd xmm1, xmm5, 11011000b 646*fb1b10abSAndroid Build Coastguard Worker pshufd xmm3, xmm7, 11011000b 647*fb1b10abSAndroid Build Coastguard Worker 648*fb1b10abSAndroid Build Coastguard Worker pxor xmm7, xmm7 649*fb1b10abSAndroid Build Coastguard Worker 650*fb1b10abSAndroid Build Coastguard Worker ; Load up predict blocks 651*fb1b10abSAndroid Build Coastguard Worker movsxd rdx, dword ptr arg(3) ; dst_stride 652*fb1b10abSAndroid Build Coastguard Worker movq xmm4, [rdi] 653*fb1b10abSAndroid Build Coastguard Worker movq xmm5, [rdi+rdx] 654*fb1b10abSAndroid Build Coastguard Worker lea rcx, [rdx + rdx*2] 655*fb1b10abSAndroid Build Coastguard Worker 656*fb1b10abSAndroid Build Coastguard Worker punpcklbw xmm4, xmm7 657*fb1b10abSAndroid Build Coastguard Worker punpcklbw xmm5, xmm7 658*fb1b10abSAndroid Build Coastguard Worker 659*fb1b10abSAndroid Build Coastguard Worker paddw xmm0, xmm4 660*fb1b10abSAndroid Build Coastguard Worker paddw xmm1, xmm5 661*fb1b10abSAndroid Build Coastguard Worker 662*fb1b10abSAndroid Build Coastguard Worker movq xmm4, [rdi+rdx*2] 663*fb1b10abSAndroid Build Coastguard Worker movq xmm5, [rdi+rcx] 664*fb1b10abSAndroid Build Coastguard Worker 665*fb1b10abSAndroid Build Coastguard Worker punpcklbw xmm4, xmm7 666*fb1b10abSAndroid Build Coastguard Worker punpcklbw xmm5, xmm7 667*fb1b10abSAndroid Build Coastguard Worker 668*fb1b10abSAndroid Build Coastguard Worker paddw xmm2, xmm4 669*fb1b10abSAndroid Build Coastguard Worker paddw xmm3, xmm5 670*fb1b10abSAndroid Build Coastguard Worker 671*fb1b10abSAndroid Build Coastguard Worker.finish: 672*fb1b10abSAndroid Build Coastguard Worker 673*fb1b10abSAndroid Build Coastguard Worker ; pack up before storing 674*fb1b10abSAndroid Build Coastguard Worker 
packuswb xmm0, xmm7 675*fb1b10abSAndroid Build Coastguard Worker packuswb xmm1, xmm7 676*fb1b10abSAndroid Build Coastguard Worker packuswb xmm2, xmm7 677*fb1b10abSAndroid Build Coastguard Worker packuswb xmm3, xmm7 678*fb1b10abSAndroid Build Coastguard Worker 679*fb1b10abSAndroid Build Coastguard Worker ; Load destination stride before writing out, 680*fb1b10abSAndroid Build Coastguard Worker ; doesn't need to persist 681*fb1b10abSAndroid Build Coastguard Worker movsxd rdx, dword ptr arg(3) ; dst_stride 682*fb1b10abSAndroid Build Coastguard Worker 683*fb1b10abSAndroid Build Coastguard Worker ; store blocks back out 684*fb1b10abSAndroid Build Coastguard Worker movq [rdi], xmm0 685*fb1b10abSAndroid Build Coastguard Worker movq [rdi + rdx], xmm1 686*fb1b10abSAndroid Build Coastguard Worker 687*fb1b10abSAndroid Build Coastguard Worker lea rdi, [rdi + 2*rdx] 688*fb1b10abSAndroid Build Coastguard Worker 689*fb1b10abSAndroid Build Coastguard Worker movq [rdi], xmm2 690*fb1b10abSAndroid Build Coastguard Worker movq [rdi + rdx], xmm3 691*fb1b10abSAndroid Build Coastguard Worker 692*fb1b10abSAndroid Build Coastguard Worker 693*fb1b10abSAndroid Build Coastguard Worker ; begin epilog 694*fb1b10abSAndroid Build Coastguard Worker pop rdi 695*fb1b10abSAndroid Build Coastguard Worker RESTORE_GOT 696*fb1b10abSAndroid Build Coastguard Worker RESTORE_XMM 697*fb1b10abSAndroid Build Coastguard Worker UNSHADOW_ARGS 698*fb1b10abSAndroid Build Coastguard Worker pop rbp 699*fb1b10abSAndroid Build Coastguard Worker ret 700*fb1b10abSAndroid Build Coastguard Worker 701*fb1b10abSAndroid Build Coastguard WorkerSECTION_RODATA 702*fb1b10abSAndroid Build Coastguard Workeralign 16 703*fb1b10abSAndroid Build Coastguard Workerfours: 704*fb1b10abSAndroid Build Coastguard Worker times 8 dw 0x0004 705*fb1b10abSAndroid Build Coastguard Workeralign 16 706*fb1b10abSAndroid Build Coastguard Workerx_s1sqr2: 707*fb1b10abSAndroid Build Coastguard Worker times 8 dw 0x8A8C 708*fb1b10abSAndroid Build Coastguard 
; x_c1sqr2less1: fixed-point multiplier used by both IDCT passes.
;   0x4E7B = 20091, and 20091 / 65536 ~= 0.30656 = sqrt(2) * cos(pi/8) - 1.
; The passes apply it as
;       pmulhw  reg, [GLOBAL(x_c1sqr2less1)]   ; signed high word: (x * 20091) >> 16
;       paddw   reg, x                         ; add the input back
; which together yield x * sqrt(2) * cos(pi/8). Only the "less 1" fraction is
; stored because the full factor (~1.30656) does not fit a 16-bit fraction.
; The word is replicated eight times to cover a whole 128-bit xmm operand, and
; the table is 16-byte aligned so it can be used directly as an SSE2 memory
; operand.
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B