;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1:   Width
; Arg 2:   Height
; Arg 3:   Number of general purpose registers
; Arg 4:   Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%elif %4 == 1 ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if VPX_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%else ; %4 == 2, skip rows
%if %3 == 5
cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                                  n_rows
%else ; %3 == 7
cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; double the stride if we are skipping rows
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro
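
; All of the kernels below share one inner-loop idea: SSE2 has no 16-bit
; counterpart to psadbw, so per-word absolute differences are computed with
; two unsigned saturating subtracts and an OR:
;   mova     m5, [srcq]       ; m5 = src
;   psubusw  m5, m1           ; m5 = max(src - ref, 0)
;   psubusw  m1, [srcq]       ; m1 = max(ref - src, 0)
;   por      m1, m5           ; one side is always 0, so this is |src - ref|
; For example, src = 5, ref = 3 gives (5 -us 3) = 2, (3 -us 5) = 0, 2|0 = 2.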

; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2

; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
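
; Overflow note for the loops above: m1..m4 each hold eight 16-bit absolute
; differences, and at most four of them are ever summed as words (paddw in
; pairs, then movhlps + paddw folds the high half onto the low half) before
; punpcklwd against the zero register m6 widens the four word sums to dwords
; for the paddd into m0. With 12-bit input a single difference is at most
; 4095, so the worst-case word sum is 4 * 4095 = 16380, well below 65535.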

; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                              uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
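
; The 16-wide and 8-wide kernels process two and four rows per iteration,
; which is why n_rowsd is initialized to the height divided by 2 or 4 (and
; by 4 or 8 when skipping): the extra rows are addressed with stride-scaled
; offsets such as [refq+ref_strideq*2] (strides count 16-bit samples, hence
; the *2 to get bytes) instead of separate pointer updates. second_pred has
; no stride; it is a contiguous width-wide block, so each iteration simply
; consumes mmsize*4 bytes of it.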

; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m2
  psubusw               m2, [srcq+src_strideq*2]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*4]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*4]
  por                   m3, m5
  mova                  m5, [srcq+src_stride3q*2]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_stride3q*2]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
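
; The epilogue in each kernel (movhlps/paddd, punpckldq against zero, then a
; second movhlps/paddd) horizontally sums the four dword partials in m0, and
; the skip variants double the half-height SAD (pslld m0, 1) to approximate
; the full-height value. As a rough scalar model of the three variants (an
; illustrative sketch only, not part of the build; the uint8_t pointers
; alias uint16_t sample buffers, and skip and avg are never combined):
;
;   uint32_t sad = 0;
;   int step = skip ? 2 : 1;          // skip variants visit every other row
;   for (int r = 0; r < height; r += step)
;     for (int c = 0; c < width; ++c) {
;       int p = ref[r * ref_stride + c];
;       if (avg)                      // pavgw rounds: (a + b + 1) >> 1
;         p = (p + second_pred[r * width + c] + 1) >> 1;
;       sad += abs(src[r * src_stride + c] - p);
;     }
;   return skip ? 2 * sad : sad;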