1*fb1b10abSAndroid Build Coastguard Worker; 2*fb1b10abSAndroid Build Coastguard Worker; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3*fb1b10abSAndroid Build Coastguard Worker; 4*fb1b10abSAndroid Build Coastguard Worker; Use of this source code is governed by a BSD-style license 5*fb1b10abSAndroid Build Coastguard Worker; that can be found in the LICENSE file in the root of the source 6*fb1b10abSAndroid Build Coastguard Worker; tree. An additional intellectual property rights grant can be found 7*fb1b10abSAndroid Build Coastguard Worker; in the file PATENTS. All contributing project authors may 8*fb1b10abSAndroid Build Coastguard Worker; be found in the AUTHORS file in the root of the source tree. 9*fb1b10abSAndroid Build Coastguard Worker; 10*fb1b10abSAndroid Build Coastguard Worker 11*fb1b10abSAndroid Build Coastguard Worker 12*fb1b10abSAndroid Build Coastguard Worker%include "vpx_ports/x86_abi_support.asm" 13*fb1b10abSAndroid Build Coastguard Worker 14*fb1b10abSAndroid Build Coastguard Worker%define BLOCK_HEIGHT_WIDTH 4 15*fb1b10abSAndroid Build Coastguard Worker%define VP8_FILTER_WEIGHT 128 16*fb1b10abSAndroid Build Coastguard Worker%define VP8_FILTER_SHIFT 7 17*fb1b10abSAndroid Build Coastguard Worker 18*fb1b10abSAndroid Build Coastguard WorkerSECTION .text 19*fb1b10abSAndroid Build Coastguard Worker 20*fb1b10abSAndroid Build Coastguard Worker;/************************************************************************************ 21*fb1b10abSAndroid Build Coastguard Worker; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The 22*fb1b10abSAndroid Build Coastguard Worker; input pixel array has output_height rows. This routine assumes that output_height is an 23*fb1b10abSAndroid Build Coastguard Worker; even number. This function handles 8 pixels in horizontal direction, calculating ONE 24*fb1b10abSAndroid Build Coastguard Worker; rows each iteration to take advantage of the 128 bits operations. 
;*************************************************************************************/
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;
; Horizontal 6-tap pass: 8 outputs per row, one row per loop iteration.
;
;   src_ptr             - first pixel of the first row. Each row reads exactly
;                         the 13 bytes src[-2] .. src[10].
;   output_ptr          - destination. Every row is written as eight 16-bit
;                         words with movdqa, so each row address must be
;                         16-byte aligned.
;   src_pixels_per_line - added to the source pointer after each row.
;   pixel_step          - not read by this routine.
;   output_height       - loop count. The counter is only tested after a row
;                         has been processed, so it must be at least 1.
;   output_width        - added to the destination pointer after each row,
;                         i.e. it is used as a byte stride.
;   vp8_filter          - six 16-byte vectors, one per tap, at byte offsets
;                         0, 16, 32, 48, 64 and 80. They are used as pmullw
;                         memory operands. Presumably each vector holds one
;                         coefficient replicated in all eight lanes - confirm
;                         against the caller.
;
; Each output is (sum of the six products + rd) >> 7, clamped to 0..255 by
; packuswb and widened back to a word. rd is defined outside this excerpt;
; presumably it is the rounding term for the shift - confirm.
; The products are accumulated with saturating adds in the order tap 2, 5, 3,
; 1, 4, 6. Keep that order: saturation makes the sum order-dependent.
globalsym(vp8_filter_block1d8_h6_sse2)
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6)                  ;vp8_filter
    mov         rsi,        arg(0)                  ;src_ptr

    mov         rdi,        arg(1)                  ;output_ptr

    movsxd      rcx,        dword ptr arg(4)        ;output_height
    movsxd      rax,        dword ptr arg(2)        ;src_pixels_per_line ; Pitch for Source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)        ;output_width
%endif
    pxor        xmm0,       xmm0                    ; zero, used by every byte->word unpack

.filter_block1d8_h6_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]    ; bytes 0..7 = src[-2..5]

    ; Load the second quadword from src+3 instead of src+6. Only src[-2..10]
    ; feed the six taps, so this avoids touching src[11..13]. This is the same
    ; technique the 16-wide routine uses for its upper load.
    movq        xmm1,       MMWORD PTR [rsi + 3]    ; bytes 0..7 = src[3..10]

    prefetcht2  [rsi+rax-2]

    ; After the shift, bytes 5..7 overlap xmm3 and hold the same pixels
    ; (src[3..5]), so the OR needs no masking.
    pslldq      xmm1,       5                       ; bytes 5..12 = src[3..10]
    por         xmm1,       xmm3                    ; bytes 0..12 = src[-2..10], 13..15 = 0

    movdqa      xmm4,       xmm1
    movdqa      xmm5,       xmm1

    movdqa      xmm6,       xmm1
    movdqa      xmm7,       xmm1

    punpcklbw   xmm3,       xmm0                    ; words src[-2] .. src[5]
    psrldq      xmm4,       1                       ; byte 0 = src[-1]

    pmullw      xmm3,       XMMWORD PTR [rdx]       ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                    ; words src[-1] .. src[6]

    psrldq      xmm5,       2                       ; byte 0 = src[0]
    pmullw      xmm4,       XMMWORD PTR [rdx+16]    ; x[-1] * H[-1]; Tap 2


    punpcklbw   xmm5,       xmm0                    ; words src[0] .. src[7]
    psrldq      xmm6,       3                       ; byte 0 = src[1]

    pmullw      xmm5,       [rdx+32]                ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                    ; words src[1] .. src[8]
    psrldq      xmm7,       4                       ; byte 0 = src[2]

    pmullw      xmm6,       [rdx+48]                ; x[ 1] * h[ 1] ; Tap 4

    punpcklbw   xmm7,       xmm0                    ; words src[2] .. src[9]
    psrldq      xmm1,       5                       ; byte 0 = src[3]


    pmullw      xmm7,       [rdx+64]                ; x[ 2] * h[ 2] ; Tap 5

    punpcklbw   xmm1,       xmm0                    ; words src[3] .. src[10]
    pmullw      xmm1,       [rdx+80]                ; x[ 3] * h[ 3] ; Tap 6


    ; Saturating accumulation, order 2 5 3 1 4 6, then the rd term.
    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       xmm1
    paddsw      xmm4,       [GLOBAL(rd)]

    psraw       xmm4,       7

    packuswb    xmm4,       xmm0                    ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                    ; back to eight words

    movdqa      XMMWORD Ptr [rdi],         xmm4
    lea         rsi,        [rsi + rax]

%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(5)        ;[output_width]
%else
    add         rdi,        r8
%endif
    dec         rcx

    jnz         .filter_block1d8_h6_rowloop         ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; Notes: horizontal 6-tap pass over a 16-pixel-wide block, one row per loop
; iteration. Each row is filtered as two independent 8-pixel halves. The left
; half is built from src[-2..13] and the right half from src[6..18]. Both
; halves are written as 16-bit words with movdqa to [output_ptr] and
; [output_ptr+16], so each row address must be 16-byte aligned.
; output_width is added to the destination pointer as a byte stride.
; pixel_step is not read. output_height must be at least 1, because the
; counter is only tested after a row has been processed.
; Every tap product is folded into the accumulator with a saturating add, in
; the order tap 2, 5, 3, 1, 4, 6 followed by rd. Saturation makes that order
; significant.
;*************************************************************************************/
globalsym(vp8_filter_block1d16_h6_sse2)
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                  ; src_ptr
    mov         rdi,        arg(1)                  ; output_ptr
    mov         rdx,        arg(6)                  ; vp8_filter (six 16-byte tap vectors)
    movsxd      rax,        dword ptr arg(2)        ; src_pixels_per_line (source pitch)
    movsxd      rcx,        dword ptr arg(4)        ; output_height (row counter)
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)        ; output_width (destination stride)
%endif

    pxor        xmm0,       xmm0                    ; zero for the byte->word unpacks

.filter_block1d16_h6_sse2_rowloop:
    ; ---- gather the source bytes for both halves ----
    movq        xmm3,       MMWORD PTR [rsi - 2]    ; src[-2..5]
    movq        xmm1,       MMWORD PTR [rsi + 6]    ; src[6..13]

    ; Load from 11 to avoid reading out of bounds.
    movq        xmm2,       MMWORD PTR [rsi +11]    ; src[11..18]
    prefetcht2  [rsi+rax-2]

    ; The lower bytes are not cleared before the OR with xmm1. That is fine:
    ; the overlapping positions (src[11..13]) hold the same values in both.
    pslldq      xmm2,       5
    por         xmm2,       xmm1                    ; bytes 0..12 = src[6..18]

    pslldq      xmm1,       8
    por         xmm1,       xmm3                    ; bytes 0..15 = src[-2..13]

    ; ---- left half: outputs 0..7 ----
    movdqa      xmm4,       xmm1                    ; Tap 2
    psrldq      xmm4,       1
    punpcklbw   xmm4,       xmm0                    ; words src[-1] .. src[6]
    pmullw      xmm4,       XMMWORD PTR [rdx+16]

    movdqa      xmm7,       xmm1                    ; Tap 5
    psrldq      xmm7,       4
    punpcklbw   xmm7,       xmm0                    ; words src[2] .. src[9]
    pmullw      xmm7,       [rdx+64]
    paddsw      xmm4,       xmm7

    movdqa      xmm5,       xmm1                    ; Tap 3
    psrldq      xmm5,       2
    punpcklbw   xmm5,       xmm0                    ; words src[0] .. src[7]
    pmullw      xmm5,       [rdx+32]
    paddsw      xmm4,       xmm5

    punpcklbw   xmm3,       xmm0                    ; Tap 1: words src[-2] .. src[5]
    pmullw      xmm3,       XMMWORD PTR [rdx]
    paddsw      xmm4,       xmm3

    movdqa      xmm6,       xmm1                    ; Tap 4
    psrldq      xmm6,       3
    punpcklbw   xmm6,       xmm0                    ; words src[1] .. src[8]
    pmullw      xmm6,       [rdx+48]
    paddsw      xmm4,       xmm6

    psrldq      xmm1,       5                       ; Tap 6 (xmm1 is consumed here)
    punpcklbw   xmm1,       xmm0                    ; words src[3] .. src[10]
    pmullw      xmm1,       [rdx+80]
    paddsw      xmm4,       xmm1

    paddsw      xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       7

    packuswb    xmm4,       xmm0                    ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                    ; back to eight words

    movdqa      XMMWORD Ptr [rdi],         xmm4

    ; ---- right half: outputs 8..15, same taps applied to xmm2 ----
    movdqa      xmm4,       xmm2                    ; Tap 2
    psrldq      xmm4,       1
    punpcklbw   xmm4,       xmm0                    ; words src[7] .. src[14]
    pmullw      xmm4,       XMMWORD PTR [rdx+16]

    movdqa      xmm7,       xmm2                    ; Tap 5
    psrldq      xmm7,       4
    punpcklbw   xmm7,       xmm0                    ; words src[10] .. src[17]
    pmullw      xmm7,       [rdx+64]
    paddsw      xmm4,       xmm7

    movdqa      xmm5,       xmm2                    ; Tap 3
    psrldq      xmm5,       2
    punpcklbw   xmm5,       xmm0                    ; words src[8] .. src[15]
    pmullw      xmm5,       [rdx+32]
    paddsw      xmm4,       xmm5

    movdqa      xmm3,       xmm2                    ; Tap 1
    punpcklbw   xmm3,       xmm0                    ; words src[6] .. src[13]
    pmullw      xmm3,       XMMWORD PTR [rdx]
    paddsw      xmm4,       xmm3

    movdqa      xmm6,       xmm2                    ; Tap 4
    psrldq      xmm6,       3
    punpcklbw   xmm6,       xmm0                    ; words src[9] .. src[16]
    pmullw      xmm6,       [rdx+48]
    paddsw      xmm4,       xmm6

    psrldq      xmm2,       5                       ; Tap 6 (xmm2 is consumed here)
    punpcklbw   xmm2,       xmm0                    ; words src[11] .. src[18]
    pmullw      xmm2,       [rdx+80]
    paddsw      xmm4,       xmm2

    paddsw      xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       7

    packuswb    xmm4,       xmm0                    ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                    ; back to eight words

    movdqa      XMMWORD Ptr [rdi+16],      xmm4

    ; ---- advance to the next row ----
    lea         rsi,        [rsi + rax]
%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(5)        ;[output_width]
%else
    add         rdi,        r8
%endif

    dec         rcx
    jnz         .filter_block1d16_h6_sse2_rowloop   ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_sse2
;(
;    short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    short * vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
; input pixel array has output_height rows.
;*************************************************************************************/
; Vertical 6-tap pass over an 8-wide block of 16-bit intermediates.
;   - src_ptr is backed up by two strides before the loop, so output row r
;     combines source rows r-2 .. r+3.
;   - pixels_per_line is added to the byte address directly (not scaled).
;   - Each source row is read as eight words with movdqa, so every row address
;     must be 16-byte aligned.
;   - Each output row is 8 bytes (movq) and dst_pitch is the byte stride.
;   - pixel_step and output_width are not read.
;   - output_height must be at least 1, because the counter is only tested
;     after a row has been processed.
;   - Products are accumulated with saturating adds in the order tap 2, 5, 3,
;     1, 4, 6, then rd. Result is shifted right by 7 and clamped to 0..255.
globalsym(vp8_filter_block1d8_v6_sse2)
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                  ; src_ptr
    mov         rdi,        arg(1)                  ; output_ptr
    mov         rax,        arg(7)                  ; vp8_filter (six 16-byte tap vectors)
    movsxd      rdx,        dword ptr arg(3)        ; pixels_per_line (source stride)
    movsxd      rcx,        DWORD PTR arg(5)        ; output_height (row counter)
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(2)        ; dst_pitch
%endif

    ; start two rows above the first output row
    sub         rsi,        rdx
    sub         rsi,        rdx

    pxor        xmm0,       xmm0                    ; zero upper half for the final pack
    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rd term, loaded once

.vp8_filter_block1d8_v6_sse2_loop:
    ; Tap 2 starts the accumulator (xmm2).
    movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
    pmullw      xmm2,       [rax + 16]

    ; Tap 5
    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
    pmullw      xmm5,       [rax + 64]
    paddsw      xmm2,       xmm5

    ; Tap 3
    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
    pmullw      xmm3,       [rax + 32]
    paddsw      xmm2,       xmm3

    ; Tap 1
    movdqa      xmm1,       XMMWORD PTR [rsi]
    pmullw      xmm1,       [rax]
    paddsw      xmm2,       xmm1

    ; Step one row down. This is the loop's source advance. It also makes
    ; rows 3 and 5 of the old window reachable as rsi + 2*rdx and rsi + 4*rdx.
    add         rsi,        rdx

    ; Tap 4
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
    pmullw      xmm4,       [rax + 48]
    paddsw      xmm2,       xmm4

    ; Tap 6
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
    pmullw      xmm6,       [rax + 80]
    paddsw      xmm2,       xmm6

    paddsw      xmm2,       xmm7                    ; add rd
    psraw       xmm2,       7
    packuswb    xmm2,       xmm0                    ; pack and saturate

    movq        QWORD PTR [rdi], xmm2               ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(2)        ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                                 ; decrement count
    jnz         .vp8_filter_block1d8_v6_sse2_loop   ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_v6_sse2
;(
;   unsigned short *src_ptr,
;   unsigned char *output_ptr,
;   int dst_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   const short    *vp8_filter
;)
;/************************************************************************************
428*fb1b10abSAndroid Build Coastguard Worker; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The 429*fb1b10abSAndroid Build Coastguard Worker; input pixel array has output_height rows. 430*fb1b10abSAndroid Build Coastguard Worker;*************************************************************************************/ 431*fb1b10abSAndroid Build Coastguard Workerglobalsym(vp8_filter_block1d16_v6_sse2) 432*fb1b10abSAndroid Build Coastguard Workersym(vp8_filter_block1d16_v6_sse2): 433*fb1b10abSAndroid Build Coastguard Worker push rbp 434*fb1b10abSAndroid Build Coastguard Worker mov rbp, rsp 435*fb1b10abSAndroid Build Coastguard Worker SHADOW_ARGS_TO_STACK 8 436*fb1b10abSAndroid Build Coastguard Worker SAVE_XMM 7 437*fb1b10abSAndroid Build Coastguard Worker GET_GOT rbx 438*fb1b10abSAndroid Build Coastguard Worker push rsi 439*fb1b10abSAndroid Build Coastguard Worker push rdi 440*fb1b10abSAndroid Build Coastguard Worker ; end prolog 441*fb1b10abSAndroid Build Coastguard Worker 442*fb1b10abSAndroid Build Coastguard Worker mov rax, arg(7) ;vp8_filter 443*fb1b10abSAndroid Build Coastguard Worker movsxd rdx, dword ptr arg(3) ;pixels_per_line 444*fb1b10abSAndroid Build Coastguard Worker 445*fb1b10abSAndroid Build Coastguard Worker mov rdi, arg(1) ;output_ptr 446*fb1b10abSAndroid Build Coastguard Worker mov rsi, arg(0) ;src_ptr 447*fb1b10abSAndroid Build Coastguard Worker 448*fb1b10abSAndroid Build Coastguard Worker sub rsi, rdx 449*fb1b10abSAndroid Build Coastguard Worker sub rsi, rdx 450*fb1b10abSAndroid Build Coastguard Worker 451*fb1b10abSAndroid Build Coastguard Worker movsxd rcx, DWORD PTR arg(5) ;[output_height] 452*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT=0 453*fb1b10abSAndroid Build Coastguard Worker movsxd r8, dword ptr arg(2) ; dst_ptich 454*fb1b10abSAndroid Build Coastguard Worker%endif 455*fb1b10abSAndroid Build Coastguard Worker 456*fb1b10abSAndroid Build Coastguard Worker.vp8_filter_block1d16_v6_sse2_loop: 
457*fb1b10abSAndroid Build Coastguard Worker; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. 458*fb1b10abSAndroid Build Coastguard Worker movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 459*fb1b10abSAndroid Build Coastguard Worker movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] 460*fb1b10abSAndroid Build Coastguard Worker pmullw xmm1, [rax + 16] 461*fb1b10abSAndroid Build Coastguard Worker pmullw xmm2, [rax + 16] 462*fb1b10abSAndroid Build Coastguard Worker 463*fb1b10abSAndroid Build Coastguard Worker movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 464*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] 465*fb1b10abSAndroid Build Coastguard Worker pmullw xmm3, [rax + 64] 466*fb1b10abSAndroid Build Coastguard Worker pmullw xmm4, [rax + 64] 467*fb1b10abSAndroid Build Coastguard Worker 468*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 469*fb1b10abSAndroid Build Coastguard Worker movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] 470*fb1b10abSAndroid Build Coastguard Worker pmullw xmm5, [rax + 32] 471*fb1b10abSAndroid Build Coastguard Worker pmullw xmm6, [rax + 32] 472*fb1b10abSAndroid Build Coastguard Worker 473*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, XMMWORD PTR [rsi] ; line 1 474*fb1b10abSAndroid Build Coastguard Worker movdqa xmm0, XMMWORD PTR [rsi + 16] 475*fb1b10abSAndroid Build Coastguard Worker pmullw xmm7, [rax] 476*fb1b10abSAndroid Build Coastguard Worker pmullw xmm0, [rax] 477*fb1b10abSAndroid Build Coastguard Worker 478*fb1b10abSAndroid Build Coastguard Worker paddsw xmm1, xmm3 479*fb1b10abSAndroid Build Coastguard Worker paddsw xmm2, xmm4 480*fb1b10abSAndroid Build Coastguard Worker paddsw xmm1, xmm5 481*fb1b10abSAndroid Build Coastguard Worker paddsw xmm2, xmm6 482*fb1b10abSAndroid Build Coastguard Worker paddsw xmm1, xmm7 483*fb1b10abSAndroid Build Coastguard Worker paddsw xmm2, xmm0 484*fb1b10abSAndroid Build Coastguard Worker 485*fb1b10abSAndroid 
Build Coastguard Worker add rsi, rdx 486*fb1b10abSAndroid Build Coastguard Worker 487*fb1b10abSAndroid Build Coastguard Worker movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 488*fb1b10abSAndroid Build Coastguard Worker movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] 489*fb1b10abSAndroid Build Coastguard Worker pmullw xmm3, [rax + 48] 490*fb1b10abSAndroid Build Coastguard Worker pmullw xmm4, [rax + 48] 491*fb1b10abSAndroid Build Coastguard Worker 492*fb1b10abSAndroid Build Coastguard Worker movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 493*fb1b10abSAndroid Build Coastguard Worker movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] 494*fb1b10abSAndroid Build Coastguard Worker pmullw xmm5, [rax + 80] 495*fb1b10abSAndroid Build Coastguard Worker pmullw xmm6, [rax + 80] 496*fb1b10abSAndroid Build Coastguard Worker 497*fb1b10abSAndroid Build Coastguard Worker movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] 498*fb1b10abSAndroid Build Coastguard Worker pxor xmm0, xmm0 ; clear xmm0 499*fb1b10abSAndroid Build Coastguard Worker 500*fb1b10abSAndroid Build Coastguard Worker paddsw xmm1, xmm3 501*fb1b10abSAndroid Build Coastguard Worker paddsw xmm2, xmm4 502*fb1b10abSAndroid Build Coastguard Worker paddsw xmm1, xmm5 503*fb1b10abSAndroid Build Coastguard Worker paddsw xmm2, xmm6 504*fb1b10abSAndroid Build Coastguard Worker 505*fb1b10abSAndroid Build Coastguard Worker paddsw xmm1, xmm7 506*fb1b10abSAndroid Build Coastguard Worker paddsw xmm2, xmm7 507*fb1b10abSAndroid Build Coastguard Worker 508*fb1b10abSAndroid Build Coastguard Worker psraw xmm1, 7 509*fb1b10abSAndroid Build Coastguard Worker psraw xmm2, 7 510*fb1b10abSAndroid Build Coastguard Worker 511*fb1b10abSAndroid Build Coastguard Worker packuswb xmm1, xmm2 ; pack and saturate 512*fb1b10abSAndroid Build Coastguard Worker movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination 513*fb1b10abSAndroid Build Coastguard Worker%if ABI_IS_32BIT 514*fb1b10abSAndroid Build Coastguard Worker add rdi, DWORD PTR arg(2) ;[dst_ptich] 
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d16_v6_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_ptich,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Horizontal 6-tap filter producing 8 output bytes per row.
; Per row the routine reads the 16 source bytes x[-2] .. x[13] (relative to
; src_ptr), multiplies six shifted 8-pixel windows by the six 16-byte vectors
; found at vp8_filter + 0, 16, ..., 80, accumulates them with saturating
; 16-bit adds, adds the rounding vector `rd`, shifts right by 7 and packs the
; result to unsigned bytes with saturation.
;   src_ptr advances by src_pixels_per_line, output_ptr by dst_ptich.
;   The row loop is bottom-tested, so the body always runs at least once.
;   The filter vectors are used as memory operands of pmullw, which requires
;   them to be 16-byte aligned.
;   NOTE(review): each filter vector presumably holds one tap value replicated
;   across its 8 words - confirm against the caller's table.
globalsym(vp8_filter_block1d8_h6_only_sse2)
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch the arguments in declaration order.
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line (source pitch)
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ; dst_ptich (destination pitch)
%endif
    movsxd      rcx, dword ptr arg(4)           ; output_height (row counter)
    mov         rdx, arg(5)                     ; vp8_filter

    pxor        xmm0, xmm0                      ; zero, used to widen bytes to words

.h6only_w8_next_row:
    ; Assemble x[-2] .. x[13] in xmm1 from two 8-byte loads.
    movq        xmm3, MMWORD PTR [rsi - 2]      ; x[-2] .. x[5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; x[6]  .. x[13]
    pslldq      xmm1, 8
    por         xmm1, xmm3
    prefetcht2  [rsi+rax-2]                     ; hint for the next source row

    ; One working copy per tap. xmm3 already holds the low 8 bytes needed for
    ; tap 1 and xmm1 itself is consumed last, by tap 6.
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: window x[-2] .. x[5]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: window x[-1] .. x[6]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx+16]

    ; Tap 3: window x[0] .. x[7]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx+32]

    ; Tap 4: window x[1] .. x[8]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx+48]

    ; Tap 5: window x[2] .. x[9]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx+64]

    ; Tap 6: window x[3] .. x[10]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx+80]

    ; Saturating adds are not associative: keep the order 2 5 3 1 4 6.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1

    paddsw      xmm4, [GLOBAL(rd)]              ; rounding
    psraw       xmm4, 7
    packuswb    xmm4, xmm0                      ; clamp to 0..255

    movq        QWORD PTR [rdi], xmm4           ; 8 filtered pixels

    ; Step both pointers to the next row.
    lea         rsi, [rsi + rax]
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; dst_ptich
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .h6only_w8_next_row             ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_ptich,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Horizontal 6-tap filter producing 16 output bytes per row, computed as two
; independent 8-pixel halves. Per row the routine reads the 24 source bytes
; x[-2] .. x[21]; the arithmetic for each half is: six 16-bit multiplies by
; the vectors at vp8_filter + 0, 16, ..., 80, saturating accumulation, `rd`
; rounding, arithmetic shift right by 7 and an unsigned saturating pack.
;   src_ptr advances by src_pixels_per_line, output_ptr by dst_ptich.
;   The row loop is bottom-tested, so the body always runs at least once.
;   The filter vectors must be 16-byte aligned (pmullw memory operands).
globalsym(vp8_filter_block1d16_h6_only_sse2)
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch the arguments in declaration order.
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixels_per_line (source pitch)
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ; dst_ptich (destination pitch)
%endif
    movsxd      rcx, dword ptr arg(4)           ; output_height (row counter)
    mov         rdx, arg(5)                     ; vp8_filter

    pxor        xmm0, xmm0                      ; zero, used to widen bytes to words

.h6only_w16_next_row:
    ; All source bytes for this row are loaded before anything is stored.
    movq        xmm3, MMWORD PTR [rsi - 2]      ; x[-2] .. x[5]
    movq        xmm1, MMWORD PTR [rsi + 6]      ; x[6]  .. x[13]
    movq        xmm2, MMWORD PTR [rsi +14]      ; x[14] .. x[21]

    ; xmm2 = x[6] .. x[21], source window for output pixels 8..15.
    pslldq      xmm2, 8
    por         xmm2, xmm1

    ; xmm1 = x[-2] .. x[13], source window for output pixels 0..7.
    pslldq      xmm1, 8
    por         xmm1, xmm3

    prefetcht2  [rsi+rax-2]                     ; hint for the next source row

    ; ---------------- output pixels 0..7 ----------------
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: window x[-2] .. x[5]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: window x[-1] .. x[6]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx+16]

    ; Tap 3: window x[0] .. x[7]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx+32]

    ; Tap 4: window x[1] .. x[8]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx+48]

    ; Tap 5: window x[2] .. x[9]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx+64]

    ; Tap 6: window x[3] .. x[10]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rdx+80]

    ; Saturating adds are not associative: keep the order 2 5 3 1 4 6.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1

    paddsw      xmm4, [GLOBAL(rd)]              ; rounding
    psraw       xmm4, 7
    packuswb    xmm4, xmm0                      ; clamp to 0..255

    movq        QWORD PTR [rdi], xmm4           ; lower 8 output bytes

    ; ---------------- output pixels 8..15 ----------------
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2

    ; Tap 1: window x[6] .. x[13]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: window x[7] .. x[14]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, XMMWORD PTR [rdx+16]

    ; Tap 3: window x[8] .. x[15]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rdx+32]

    ; Tap 4: window x[9] .. x[16]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rdx+48]

    ; Tap 5: window x[10] .. x[17]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0
    pmullw      xmm7, [rdx+64]

    ; Tap 6: window x[11] .. x[18]
    psrldq      xmm2, 5
    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rdx+80]

    ; Same accumulation order as the lower half.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm2

    paddsw      xmm4, [GLOBAL(rd)]              ; rounding
    psraw       xmm4, 7
    packuswb    xmm4, xmm0                      ; clamp to 0..255

    movq        QWORD PTR [rdi+8], xmm4         ; higher 8 output bytes

    ; Step both pointers to the next row.
    lea         rsi, [rsi + rax]
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; dst_ptich
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .h6only_w16_next_row            ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int             dst_ptich,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; Second-pass filter only when xoffset==0
;
; Vertical 6-tap filter producing 8 output bytes per row.
; Each iteration loads 8 bytes from six consecutive source lines starting at
; src_ptr (line 1) through src_ptr + 5 * src_pixels_per_line (line 6), widens
; them to words, multiplies line k by the vector at vp8_filter + 16 * (k - 1),
; accumulates with saturating adds, adds `rd`, shifts right by 7 and packs to
; unsigned bytes with saturation.
;   src_ptr advances by one source line per iteration, output_ptr by dst_ptich.
;   The row loop is bottom-tested, so the body always runs at least once.
;   The filter vectors must be 16-byte aligned (pmullw memory operands).
globalsym(vp8_filter_block1d8_v6_only_sse2)
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch the arguments in declaration order.
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(1)           ; src_pixels_per_line
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)            ; dst_ptich
%endif
    movsxd      rcx, dword ptr arg(4)           ; output_height (row counter)
    mov         rax, arg(5)                     ; vp8_filter

    pxor        xmm0, xmm0                      ; zero, used to widen bytes to words
    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]  ; rounding vector, kept for the whole loop

.v6only_w8_next_row:
    ; Lines 1, 2, 3 and 5 are addressable from rsi directly; after stepping
    ; rsi by one line the same *2 / *4 scales reach lines 4 and 6.
    movq        xmm1, MMWORD PTR [rsi]              ; line 1
    movq        xmm2, MMWORD PTR [rsi + rdx]        ; line 2
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ; line 3
    movq        xmm5, MMWORD PTR [rsi + rdx * 4]    ; line 5
    add         rsi, rdx                            ; also the per-row source advance
    movq        xmm4, MMWORD PTR [rsi + rdx * 2]    ; line 4
    movq        xmm6, MMWORD PTR [rsi + rdx * 4]    ; line 6

    ; Widen each line to words and apply its tap, in line order.
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]

    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rax + 16]

    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax + 32]

    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rax + 48]

    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rax + 64]

    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rax + 80]

    ; Saturating adds are not associative: keep the order 2 5 3 1 4 6.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    paddsw      xmm2, xmm6

    paddsw      xmm2, xmm7                      ; rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm0                      ; pack and saturate

    movq        QWORD PTR [rdi], xmm2           ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; [dst_ptich]
%else
    add         rdi, r8
%endif
    dec         rcx                             ; decrement count
    jnz         .v6only_w8_next_row             ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
;
; Widens 16 source bytes per row to 16 zero-extended words; no filtering is
; applied. Each row reads src_ptr[0..15] and writes 32 bytes to output_ptr
; with two aligned (movdqa) stores, so every destination row address must be
; 16-byte aligned.
;   src_ptr advances by src_pixels_per_line per row.
;   output_ptr advances by output_width, which is added to the pointer as a
;   byte offset.
;   The row loop is bottom-tested, so the body always runs at least once.
globalsym(vp8_unpack_block1d16_h6_sse2)
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch the arguments in declaration order.
    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(1)                     ; output_ptr
    movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line (source pitch)
    movsxd      rcx, dword ptr arg(3)           ; output_height (row counter)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(4)            ; output_width (destination row step)
%endif

    pxor        xmm0, xmm0                      ; zero, used to widen bytes to words

.unpack_w16_next_row:
    ; Both loads are issued before either store.
    movq        xmm1, MMWORD PTR [rsi]          ; source bytes 0..7
    movq        xmm3, MMWORD PTR [rsi+8]        ; source bytes 8..15

    punpcklbw   xmm1, xmm0                      ; words 0..7
    punpcklbw   xmm3, xmm0                      ; words 8..15

    movdqa      XMMWORD PTR [rdi], xmm1
    movdqa      XMMWORD PTR [rdi + 16], xmm3

    ; Step both pointers to the next row.
    lea         rsi, [rsi + rax]
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(4)           ; [output_width]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .unpack_w16_next_row            ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Rounding term added before the `psraw ..., 7` in every filter above:
; eight words of 0x40, i.e. half of 1 << VP8_FILTER_SHIFT.
rd:
    times 8 dw 0x40