;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7

SECTION .text

;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal six-tap filter producing four 16-bit results per row.
;
; For each of output_height rows:
;   out[i] = clamp_0_255((sum over k = 0..5 of tap[k] * src[i - 2 + k] + 64) >> 7)
; for i = 0..3, where tap[k] is the 4-word vector at vp8_filter + 16*k bytes.
; Products keep their low 16 bits (pmullw) and the accumulation saturates
; (paddsw). The four results are stored as words at output_ptr.
;
; Per row the code reads 8 bytes at src_ptr-2 and 4 bytes at src_ptr+3.
; After each row src_ptr advances by src_pixels_per_line bytes and output_ptr
; by output_width bytes (output_width is used as a byte stride here).
; pixel_step is not read. output_height and output_width are sign-extended
; with movsxd; output_height == 0 is not handled (the loop is dec/jnz).
globalsym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6) ;vp8_filter

        ; Keep the four inner taps in registers; taps 0 and 5 are read from
        ; memory inside the loop because mm0..mm7 are all in use.
        movq        mm1,    [rdx + 16]       ; mm1 = tap 1
        movq        mm2,    [rdx + 32]       ; mm2 = tap 2
        movq        mm6,    [rdx + 48]       ; mm6 = tap 3
        movq        mm7,    [rdx + 64]       ; mm7 = tap 4

        mov         rdi,    arg(1) ;output_ptr
        mov         rsi,    arg(0) ;src_ptr
        movsxd      rcx,    dword ptr arg(4) ;output_height (row counter)
        movsxd      rax,    dword ptr arg(5) ;output_width  (added to rdi per row)
        pxor        mm0,    mm0              ; mm0 = 0, used to widen bytes to words

.nextrow:
        movq        mm3,    [rsi-2]          ; mm3 = bytes p-2..p5
        movq        mm4,    mm3              ; mm4 = bytes p-2..p5
        psrlq       mm3,    8                ; mm3 = bytes p-1..p5
        punpcklbw   mm3,    mm0              ; mm3 = words p-1..p2
        pmullw      mm3,    mm1              ; mm3 *= tap 1

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5
        punpckhbw   mm4,    mm0              ; mm4 = words p2..p5
        pmullw      mm4,    mm7              ; mm4 *= tap 4
        paddsw      mm3,    mm4              ; mm3 += mm4

        movq        mm4,    mm5              ; mm4 = bytes p-2..p5
        psrlq       mm5,    16               ; mm5 = bytes p0..p5
        punpcklbw   mm5,    mm0              ; mm5 = words p0..p3
        pmullw      mm5,    mm2              ; mm5 *= tap 2
        paddsw      mm3,    mm5              ; mm3 += mm5

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5
        psrlq       mm4,    24               ; mm4 = bytes p1..p5
        punpcklbw   mm4,    mm0              ; mm4 = words p1..p4
        pmullw      mm4,    mm6              ; mm4 *= tap 3
        paddsw      mm3,    mm4              ; mm3 += mm4

        ; outer taps (0 and 5), coefficients read from memory
        movd        mm4,    [rsi+3]          ; mm4 = bytes p3..p6
        punpcklbw   mm4,    mm0              ; mm4 = words p3..p6
        pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
        paddsw      mm3,    mm4              ; mm3 += mm4

        punpcklbw   mm5,    mm0              ; mm5 = words p-2..p1
        pmullw      mm5,    [rdx]            ; mm5 *= tap 0
        paddsw      mm3,    mm5              ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]     ; mm3 += rounding value (64 per word)
        psraw       mm3,    VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
        packuswb    mm3,    mm0              ; clamp each word to 0..255 ...
        punpcklbw   mm3,    mm0              ; ... and widen back to words

        movq        [rdi],  mm3              ; store four words

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next source row
        add         rdi,    rax              ; next output row
%else
        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
        add         rdi,    rax              ; next output row

        add         rsi,    r8               ; next source row
%endif

        dec         rcx                      ; decrement row count
        jnz         .nextrow                 ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical six-tap filter producing four 8-bit results per row.
;
; For each of output_height rows:
;   out[i] = clamp_0_255((sum over k = 0..5 of tap[k] * row[k - 2][i] + 64) >> 7)
; for i = 0..3, where row[r] is the four words at src_ptr + r * pixels_per_line
; (pixels_per_line is applied as a byte offset) and tap[k] is the 4-word
; vector at vp8_filter + 16*k bytes. Products keep their low 16 bits (pmullw)
; and the accumulation saturates (paddsw). The four results are stored as
; bytes at output_ptr.
;
; After each row the source advances by pixels_per_line bytes and output_ptr
; by output_pitch bytes. pixel_step and output_width are not read.
; output_height == 0 is not handled (the loop is dec/jnz).
globalsym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Load the rounding constant before rbx is reused as the filter
        ; pointer. NOTE(review): presumably GLOBAL() can depend on the rbx
        ; value set up by GET_GOT - confirm in x86_abi_support.asm.
        movq      mm5, [GLOBAL(rd)]
        push        rbx                   ; preserved so RESTORE_GOT sees the value left by GET_GOT
        mov         rbx, arg(7) ;vp8_filter
        ; Keep the four inner taps in registers; taps 0 and 5 are read from
        ; memory inside the loop.
        movq      mm1, [rbx + 16]         ; mm1 = tap 1
        movq      mm2, [rbx + 32]         ; mm2 = tap 2
        movq      mm6, [rbx + 48]         ; mm6 = tap 3
        movq      mm7, [rbx + 64]         ; mm7 = tap 4

        movsxd      rdx, dword ptr arg(3) ;pixels_per_line (byte offset between rows)
        mov         rdi, arg(1) ;output_ptr
        mov         rsi, arg(0) ;src_ptr
        sub         rsi, rdx
        sub         rsi, rdx              ; rsi = row -2
        movsxd      rcx, DWORD PTR arg(5) ;output_height (row counter)
        movsxd      rax, DWORD PTR arg(2) ;output_pitch  (added to rdi per row)
        pxor        mm0, mm0              ; mm0 = 0


.nextrow_cv:
        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
        pmullw      mm3, mm1              ; mm3 *= tap 1


        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 2
        pmullw      mm4, mm7              ; mm4 *= tap 4
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 2*rdx]    ; mm4 = p0..p3  = row 0
        pmullw      mm4, mm2              ; mm4 *= tap 2
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
        pmullw      mm4, [rbx]            ; mm4 *= tap 0
        paddsw      mm3, mm4              ; mm3 += mm4


        ; Step the source one row now: this avoids needing 3*pitch and 5*pitch
        ; addressing and also serves as the per-iteration row advance.
        add         rsi, rdx
        movq        mm4, [rsi + 2*rdx]    ; mm4 = p0..p3  = row 1
        pmullw      mm4, mm6              ; mm4 *= tap 3
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
        pmullw      mm4, [rbx +80]        ; mm4 *= tap 5
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, mm5              ; mm3 += rounding value (64 per word)
        psraw       mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
        packuswb    mm3, mm0              ; clamp each word to 0..255 and pack to bytes

        movd        [rdi],mm3             ; store four bytes
        ; The subsequent iterations re-read five of the six source rows that
        ; were loaded here. Since the block should be in cache this should
        ; not cost much, but it is avoidable.
        lea         rdi,  [rdi+rax]       ; next output row
        dec         rcx                   ; decrement row count
        jnz         .nextrow_cv           ; next row

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding constant: 0x40 in each of four words, added before the
; VP8_FILTER_SHIFT (7-bit) right shift.
align 16
rd:
    times 4 dw 0x40

; Six-tap coefficient table: 8 filter sets, each of 6 taps, each tap
; replicated into 8 words (16 bytes). The 16-byte tap spacing matches the
; [filter + 16*k] addressing used by the routines above, which load only the
; first four words of each tap with movq. The taps of every set sum to 128.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
    ; set 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; set 1
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; set 2
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; set 3
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; set 4
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; set 5
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; set 6
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; set 7
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0