;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; SSE2 sum-of-absolute-differences (SAD) kernels.
;
; Every kernel below follows the same outline:
;   1. SAD_FN emits the function prologue and sets up the stride registers.
;   2. A loop accumulates psadbw results into m0.
;   3. The two 64-bit partial sums held in m0 are folded together and the
;      low 32 bits are returned in eax.
;
; Three flavours are generated per block size:
;   normal - SAD of src against ref.
;   avg    - ref is first averaged (pavgb) with second_pred, then compared
;            against src. second_pred is read as one contiguous buffer; it
;            is advanced by a fixed byte count per iteration, never by a
;            stride.
;   skip   - both strides are doubled so only every other row is visited,
;            and the final sum is doubled to compensate.

; SAD_FN: emit the cglobal prologue for one SAD function.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
;
; With Arg 3 == 5 the named registers are the four arguments (plus
; second_pred for avg) and n_rows. With Arg 3 == 7 the two extra registers
; src_stride3 and ref_stride3 are named as well and are loaded with 3*stride.
;
; On exit src_strideq and ref_strideq hold the sign-extended strides
; (already doubled for the skip flavour).
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
; The seven names above leave no named register for the row counter, so
; n_rowsd is defined by hand: an eighth register on x86-64, a memory
; operand on x86-32.
; NOTE(review): r0m is presumably the stack home of argument 0, which is
; free to reuse once src is in a register - confirm against x86inc.
%if VPX_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2; skip rows so double the stride
lea           src_strided, [src_strided*2]
lea           ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD64XN height [, type]
;   height - number of rows in the block
;   type   - 0 normal (default), 1 avg, 2 skip
;
; One 64-byte row (four 16-byte loads) is handled per loop iteration.
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  ; strides were doubled, so only half of the rows are visited
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  ; average ref with the second predictor, then step past the 64 bytes used
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the high 64-bit partial sum into the low one
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
SAD64XN 64, 2 ; sad_skip_64x64_sse2
SAD64XN 32, 2 ; sad_skip_64x32_sse2

; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD32XN height [, type]
;   height - number of rows in the block
;   type   - 0 normal (default), 1 avg, 2 skip
;
; Two 32-byte rows are handled per loop iteration.
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  ; two rows per iteration and every other row skipped
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  ; average ref with the second predictor, then step past the 64 bytes used
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the high 64-bit partial sum into the low one
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN 64, 2 ; sad_skip_32x64_sse2
SAD32XN 32, 2 ; sad_skip_32x32_sse2
SAD32XN 16, 2 ; sad_skip_32x16_sse2

; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; SAD16XN height [, type]
;   height - number of rows in the block
;   type   - 0 normal (default), 1 avg, 2 skip
;
; Four 16-byte rows are handled per loop iteration, addressed as
; base, base+stride, base+stride*2 and base+stride3.
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  ; four rows per iteration and every other row skipped
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  ; average ref with the second predictor, then step past the 64 bytes used
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the high 64-bit partial sum into the low one
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN  8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN 32, 2 ; sad_skip_16x32_sse2
SAD16XN 16, 2 ; sad_skip_16x16_sse2
SAD16XN  8, 2 ; sad_skip_16x8_sse2

; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; SAD8XN height [, type]
;   height - number of rows in the block
;   type   - 0 normal (default), 1 avg, 2 skip
;
; Four 8-byte rows are handled per loop iteration. Two rows share one
; register: movh fills the low half and movhps the high half.
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  ; four rows per iteration and every other row skipped
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  ; average ref with the second predictor, then step past the 32 bytes used
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  ; fold the high 64-bit partial sum into the low one
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN  8 ; sad8x8_sse2
SAD8XN  4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2
SAD8XN 16, 2 ; sad_skip_8x16_sse2
SAD8XN  8, 2 ; sad_skip_8x8_sse2

; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; SAD4XN height [, type]
;   height - number of rows in the block
;   type   - 0 normal (default), 1 avg, 2 skip
;
; Four 4-byte rows are handled per loop iteration. They are gathered into a
; single register (punpckldq pairs rows, movlhps joins the pairs) so that
; one psadbw covers all four rows.
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  ; four rows per iteration and every other row skipped
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  ; average ref with the second predictor, then step past the 16 bytes used
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  ; fold the high 64-bit partial sum into the low one
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN  8 ; sad4x8_sse2
SAD4XN  4 ; sad4x4_sse2
SAD4XN  8, 1 ; sad4x8_avg_sse2
SAD4XN  4, 1 ; sad4x4_avg_sse2
SAD4XN  8, 2 ; sad_skip_4x8_sse2