;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; SAD_FN: shared prologue for every sum-of-absolute-differences kernel below.
; It declares the function with cglobal, doubles both strides for the
; "skip rows" flavour, sign-extends the 32-bit strides to pointer width and,
; for the 7-register layout, precomputes 3 * stride for src and ref.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Register layout selector, forwarded to cglobal as the GPR count:
;          5 - src, src_stride, ref, ref_stride, n_rows
;          7 - as above plus src_stride3 and ref_stride3 (3 * stride)
;        The avg flavour needs one more register for second_pred, so it asks
;        cglobal for 1 + Arg 3 (layout 5) or AOM_ARCH_X86_64 + Arg 3 (layout 7).
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
;
; Emitted names: sadWxH (normal), sadWxH_avg (avg), sad_skip_WxH (skip).
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                                  ref, ref_stride, \
                                                  second_pred, \
                                                  src_stride3, ref_stride3
; All seven named registers are in use here, so the row counter is defined by
; hand: the extra (eighth) register on x86-64, and r0m on x86-32.
; NOTE(review): r0m is x86inc's name for argument 0's home location;
; presumably reusable because src is already held in a register - confirm
; against x86inc.asm.
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2; skip rows so double the stride
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea          src_stride3q, [src_strideq*3]
  lea          ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; Notes common to all kernels below:
;  * ref is loaded with unaligned/partial loads, while src and second_pred are
;    used directly as psadbw/pavgb memory operands in the 16-byte-wide paths.
;    NOTE(review): legacy-SSE memory operands must be 16-byte aligned; this is
;    assumed to be guaranteed by the callers - confirm.
;  * avg flavour: ref is averaged with second_pred (pavgb) before the SAD;
;    second_pred is consumed as a contiguous buffer, advanced with lea.
;  * skip flavour: the strides were doubled in SAD_FN, the loop count is
;    halved, and the final sum is doubled (pslld by 1) to compensate.
;  * m0 accumulates the psadbw results; the two 64-bit halves are folded with
;    movhlps + paddd and the low dword is returned in eax.

; unsigned int aom_sad128x{64,128}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
; One 128-byte row (eight 16-byte vectors) per loop iteration.
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov          n_rowsd, %1/2
%else
  mov          n_rowsd, %1
%endif
  pxor         m0, m0

.loop:
  movu         m1, [refq]
  movu         m2, [refq+16]
  movu         m3, [refq+32]
  movu         m4, [refq+48]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+16]
  psadbw       m3, [srcq+32]
  psadbw       m4, [srcq+48]

  paddd        m1, m2
  paddd        m3, m4
  paddd        m0, m1
  paddd        m0, m3

  movu         m1, [refq+64]
  movu         m2, [refq+80]
  movu         m3, [refq+96]
  movu         m4, [refq+112]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*4]
  pavgb        m2, [second_predq+mmsize*5]
  pavgb        m3, [second_predq+mmsize*6]
  pavgb        m4, [second_predq+mmsize*7]
  lea          second_predq, [second_predq+mmsize*8]
%endif
  psadbw       m1, [srcq+64]
  psadbw       m2, [srcq+80]
  psadbw       m3, [srcq+96]
  psadbw       m4, [srcq+112]

  add          refq, ref_strideq
  add          srcq, src_strideq

  paddd        m1, m2
  paddd        m3, m4
  paddd        m0, m1
  paddd        m0, m3

  sub          n_rowsd, 1
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld        m0, 1
%endif
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad_skip_128x128_sse2
SAD128XN 64      ; sad128x64_sse2
SAD128XN 64, 1   ; sad128x64_avg_sse2
SAD128XN 64, 2   ; sad_skip_128x64_sse2


; unsigned int aom_sad64x{16,32,64,128}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
; One 64-byte row (four 16-byte vectors) per loop iteration.
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov          n_rowsd, %1/2
%else
  mov          n_rowsd, %1
%endif
  pxor         m0, m0
.loop:
  movu         m1, [refq]
  movu         m2, [refq+16]
  movu         m3, [refq+32]
  movu         m4, [refq+48]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+16]
  psadbw       m3, [srcq+32]
  psadbw       m4, [srcq+48]
  paddd        m1, m2
  paddd        m3, m4
  add          refq, ref_strideq
  paddd        m0, m1
  add          srcq, src_strideq
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld        m0, 1
%endif
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN 128, 2  ; sad_skip_64x128_sse2
SAD64XN  64, 2  ; sad_skip_64x64_sse2
SAD64XN  32, 2  ; sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
SAD64XN  16     ; sad64x16_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN  16, 2  ; sad_skip_64x16_sse2
%endif

; unsigned int aom_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                          uint8_t *ref, int ref_stride);
; Two 32-byte rows per loop iteration, hence the loop count of height/2
; (height/4 for the skip flavour).
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov          n_rowsd, %1/4
%else
  mov          n_rowsd, %1/2
%endif
  pxor         m0, m0
.loop:
  movu         m1, [refq]
  movu         m2, [refq+16]
  movu         m3, [refq+ref_strideq]
  movu         m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+16]
  psadbw       m3, [srcq+src_strideq]
  psadbw       m4, [srcq+src_strideq+16]
  paddd        m1, m2
  paddd        m3, m4
  lea          refq, [refq+ref_strideq*2]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*2]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld        m0, 1
%endif
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64     ; sad32x64_sse2
SAD32XN 32     ; sad32x32_sse2
SAD32XN 16     ; sad32x16_sse2
SAD32XN 64, 1  ; sad32x64_avg_sse2
SAD32XN 32, 1  ; sad32x32_avg_sse2
SAD32XN 16, 1  ; sad32x16_avg_sse2
SAD32XN 64, 2  ; sad_skip_32x64_sse2
SAD32XN 32, 2  ; sad_skip_32x32_sse2
SAD32XN 16, 2  ; sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD32XN  8     ; sad32x8_sse2
SAD32XN  8, 1  ; sad32x8_avg_sse2
SAD32XN  8, 2  ; sad_skip_32x8_sse2
%endif

; unsigned int aom_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
; Four 16-byte rows per loop iteration (rows 0..3 addressed via stride,
; stride*2 and stride3), hence the loop count of height/4 (height/8 for skip).
; The height must therefore be a multiple of 4 (8 for skip).
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov          n_rowsd, %1/8
%else
  mov          n_rowsd, %1/4
%endif
  pxor         m0, m0

.loop:
  movu         m1, [refq]
  movu         m2, [refq+ref_strideq]
  movu         m3, [refq+ref_strideq*2]
  movu         m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  pavgb        m3, [second_predq+mmsize*2]
  pavgb        m4, [second_predq+mmsize*3]
  lea          second_predq, [second_predq+mmsize*4]
%endif
  psadbw       m1, [srcq]
  psadbw       m2, [srcq+src_strideq]
  psadbw       m3, [srcq+src_strideq*2]
  psadbw       m4, [srcq+src_stride3q]
  paddd        m1, m2
  paddd        m3, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m3
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld        m0, 1
%endif
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32     ; sad16x32_sse2
SAD16XN 16     ; sad16x16_sse2
SAD16XN  8     ; sad16x8_sse2
SAD16XN 32, 1  ; sad16x32_avg_sse2
SAD16XN 16, 1  ; sad16x16_avg_sse2
SAD16XN  8, 1  ; sad16x8_avg_sse2
SAD16XN 32, 2  ; sad_skip_16x32_sse2
SAD16XN 16, 2  ; sad_skip_16x16_sse2
SAD16XN  8, 2  ; sad_skip_16x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD16XN 64     ; sad16x64_sse2
SAD16XN  4     ; sad16x4_sse2
SAD16XN 64, 1  ; sad16x64_avg_sse2
SAD16XN  4, 1  ; sad16x4_avg_sse2
SAD16XN 64, 2  ; sad_skip_16x64_sse2
%endif

; unsigned int aom_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
; Four 8-byte rows per loop iteration: movh/movhps pack two rows into each
; 16-byte register, for ref and src alike. Loop count is height/4
; (height/8 for skip), so the height must be a multiple of 4 (8 for skip).
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov          n_rowsd, %1/8
%else
  mov          n_rowsd, %1/4
%endif
  pxor         m0, m0

.loop:
  movh         m1, [refq]
  movhps       m1, [refq+ref_strideq]
  movh         m2, [refq+ref_strideq*2]
  movhps       m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  pavgb        m2, [second_predq+mmsize*1]
  lea          second_predq, [second_predq+mmsize*2]
%endif
  movh         m3, [srcq]
  movhps       m3, [srcq+src_strideq]
  movh         m4, [srcq+src_strideq*2]
  movhps       m4, [srcq+src_stride3q]
  psadbw       m1, m3
  psadbw       m2, m4
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  paddd        m0, m2
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld        m0, 1
%endif
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16     ; sad8x16_sse2
SAD8XN  8     ; sad8x8_sse2
SAD8XN  4     ; sad8x4_sse2
SAD8XN 16, 1  ; sad8x16_avg_sse2
SAD8XN  8, 1  ; sad8x8_avg_sse2
SAD8XN  4, 1  ; sad8x4_avg_sse2
SAD8XN 16, 2  ; sad_skip_8x16_sse2
SAD8XN  8, 2  ; sad_skip_8x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD8XN 32     ; sad8x32_sse2
SAD8XN 32, 1  ; sad8x32_avg_sse2
SAD8XN 32, 2  ; sad_skip_8x32_sse2
%endif

; unsigned int aom_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
; Four 4-byte rows per loop iteration: four movd loads are merged into one
; 16-byte register with punpckldq + movlhps, for ref and src alike. Loop
; count is height/4 (height/8 for skip), so the height must be a multiple
; of 4 (8 for skip).
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov          n_rowsd, %1/8
%else
  mov          n_rowsd, %1/4
%endif
  pxor         m0, m0

.loop:
  movd         m1, [refq]
  movd         m2, [refq+ref_strideq]
  movd         m3, [refq+ref_strideq*2]
  movd         m4, [refq+ref_stride3q]
  punpckldq    m1, m2
  punpckldq    m3, m4
  movlhps      m1, m3
%if %2 == 1
  pavgb        m1, [second_predq+mmsize*0]
  lea          second_predq, [second_predq+mmsize*1]
%endif
  movd         m2, [srcq]
  movd         m5, [srcq+src_strideq]
  movd         m4, [srcq+src_strideq*2]
  movd         m3, [srcq+src_stride3q]
  punpckldq    m2, m5
  punpckldq    m4, m3
  movlhps      m2, m4
  psadbw       m1, m2
  lea          refq, [refq+ref_strideq*4]
  paddd        m0, m1
  lea          srcq, [srcq+src_strideq*4]
  dec          n_rowsd
  jg .loop

  movhlps      m1, m0
  paddd        m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld        m0, 1
%endif
  movd         eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN  8     ; sad4x8_sse2
SAD4XN  4     ; sad4x4_sse2
SAD4XN  8, 1  ; sad4x8_avg_sse2
SAD4XN  4, 1  ; sad4x4_avg_sse2
SAD4XN  8, 2  ; sad_skip_4x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD4XN 16     ; sad4x16_sse2
SAD4XN 16, 1  ; sad4x16_avg_sse2
SAD4XN 16, 2  ; sad_skip_4x16_sse2
%endif