;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if VPX_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skip rows so double the stride
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
SAD64XN 64, 2 ; sad64x64_skip_sse2
SAD64XN 32, 2 ; sad64x32_skip_sse2

; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN 64, 2 ; sad32x64_skip_sse2
SAD32XN 32, 2 ; sad32x32_skip_sse2
SAD32XN 16, 2 ; sad32x16_skip_sse2

; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN  8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN 32, 2 ; sad16x32_skip_sse2
SAD16XN 16, 2 ; sad16x16_skip_sse2
SAD16XN  8, 2 ; sad16x8_skip_sse2

; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN  8 ; sad8x8_sse2
SAD8XN  4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2
SAD8XN 16, 2 ; sad8x16_skip_sse2
SAD8XN  8, 2 ; sad8x8_skip_sse2

; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
SAD4XN 8, 2 ; sad4x8_skip_sse
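
; For reference, the scalar model below is an illustrative sketch (C-style,
; kept inside comments; sad_skip_ref is a hypothetical name, not a libvpx
; function) of what the *_skip variants above compute: SAD_FN doubles both
; strides, each loop covers half as many rows, and the final "pslld m0, 1"
; doubles the accumulated sum, giving twice the SAD of the block's even rows.
;
;   static unsigned int sad_skip_ref(const uint8_t *src, int src_stride,
;                                    const uint8_t *ref, int ref_stride,
;                                    int width, int height) {
;     unsigned int sad = 0;
;     for (int r = 0; r < height; r += 2) {   /* even rows only */
;       for (int c = 0; c < width; ++c) {
;         int d = src[r * src_stride + c] - ref[r * ref_stride + c];
;         sad += d < 0 ? -d : d;
;       }
;     }
;     return 2 * sad;                         /* compensate for skipped rows */
;   }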