;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_SAD_FN: shared function prologue for every SAD kernel in this file.
; Declares the function with cglobal, widens the strides to pointer width,
; optionally precomputes 3 * stride, and converts the incoming pointers to
; 16-bit sample pointers.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5, or 7 when the kernel also
;        needs src_stride3 / ref_stride3 (3 * stride)
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
; Arg 5: Number of xmm registers (default 7). 8xh needs 8, others only need 7
%macro HIGH_SAD_FN 4-5 7
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%elif %4 == 1 ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
; The avg variant already names 7 registers, so the row counter is not part
; of the cglobal name list: it lives in the extra register r7 on x86-64 and
; in r0m (x86inc's memory operand for argument 0) on x86-32.
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%else ; %4 == 2, skip rows
%if %3 == 5
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; double the stride if we are skipping rows
  lea           src_strided, [src_strided*2]
  lea           ref_strided, [ref_strided*2]
%endif
  movsxdifnidn  src_strideq, src_strided
  movsxdifnidn  ref_strideq, ref_strided
%if %3 == 7
  lea           src_stride3q, [src_strideq*3]
  lea           ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl           srcq, 1
  shl           refq, 1
%if %4 == 1
  shl           second_predq, 1
%endif
%endmacro

; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
;
; HIGH_SAD64XN height[, type]: one 64-sample row per loop iteration, handled
; as two halves of 32 samples (4 xmm registers of 8 words each).
;   m0 = running sum (dwords), m6 = zero, m1-m4 = ref / abs-diff, m5 = scratch
; src is read with mova and as a direct memory operand, so it has to be
; 16-byte aligned; ref is read with movu.
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov           n_rowsd, %1/2
%else
  mov           n_rowsd, %1
%endif
  pxor          m0, m0
  pxor          m6, m6

.loop:
  ; first half of each row
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  ; avg variant: ref = rounded average of ref and second_pred
  pavgw         m1, [second_predq+mmsize*0]
  pavgw         m2, [second_predq+mmsize*1]
  pavgw         m3, [second_predq+mmsize*2]
  pavgw         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| per word: saturating subtract in both directions, then OR
  ; (one of the two results is always zero)
  mova          m5, [srcq]
  psubusw       m5, m1
  psubusw       m1, [srcq]
  por           m1, m5
  mova          m5, [srcq+16]
  psubusw       m5, m2
  psubusw       m2, [srcq+16]
  por           m2, m5
  mova          m5, [srcq+32]
  psubusw       m5, m3
  psubusw       m3, [srcq+32]
  por           m3, m5
  mova          m5, [srcq+48]
  psubusw       m5, m4
  psubusw       m4, [srcq+48]
  por           m4, m5
  ; reduce 4x8 words to 2x4 words, widen to dwords against zero, accumulate
  paddw         m1, m2
  paddw         m3, m4
  movhlps       m2, m1
  movhlps       m4, m3
  paddw         m1, m2
  paddw         m3, m4
  punpcklwd     m1, m6
  punpcklwd     m3, m6
  paddd         m0, m1
  paddd         m0, m3
  ; second half of each row
  movu          m1, [refq+64]
  movu          m2, [refq+80]
  movu          m3, [refq+96]
  movu          m4, [refq+112]
%if %2 == 1
  pavgw         m1, [second_predq+mmsize*0]
  pavgw         m2, [second_predq+mmsize*1]
  pavgw         m3, [second_predq+mmsize*2]
  pavgw         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  mova          m5, [srcq+64]
  psubusw       m5, m1
  psubusw       m1, [srcq+64]
  por           m1, m5
  mova          m5, [srcq+80]
  psubusw       m5, m2
  psubusw       m2, [srcq+80]
  por           m2, m5
  mova          m5, [srcq+96]
  psubusw       m5, m3
  psubusw       m3, [srcq+96]
  por           m3, m5
  mova          m5, [srcq+112]
  psubusw       m5, m4
  psubusw       m4, [srcq+112]
  por           m4, m5
  paddw         m1, m2
  paddw         m3, m4
  movhlps       m2, m1
  movhlps       m4, m3
  paddw         m1, m2
  paddw         m3, m4
  punpcklwd     m1, m6
  punpcklwd     m3, m6
  ; advance one row; strides are in samples, pointers in bytes, hence *2
  lea           refq, [refq+ref_strideq*2]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*2]
  paddd         m0, m3

  dec           n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps       m1, m0
  paddd         m0, m1
  punpckldq     m0, m6
  movhlps       m1, m0
  paddd         m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld         m0, 1
%endif
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
%endif

; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
;
; HIGH_SAD32XN height[, type]: one 32-sample row per loop iteration.
;   m0 = running sum (dwords), m6 = zero, m1-m4 = ref / abs-diff, m5 = scratch
; src is read with mova and as a direct memory operand, so it has to be
; 16-byte aligned; ref is read with movu.
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov           n_rowsd, %1/2
%else
  mov           n_rowsd, %1
%endif
  pxor          m0, m0
  pxor          m6, m6

.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  ; avg variant: ref = rounded average of ref and second_pred
  pavgw         m1, [second_predq+mmsize*0]
  pavgw         m2, [second_predq+mmsize*1]
  pavgw         m3, [second_predq+mmsize*2]
  pavgw         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| per word: saturating subtract in both directions, then OR
  mova          m5, [srcq]
  psubusw       m5, m1
  psubusw       m1, [srcq]
  por           m1, m5
  mova          m5, [srcq+16]
  psubusw       m5, m2
  psubusw       m2, [srcq+16]
  por           m2, m5
  mova          m5, [srcq+32]
  psubusw       m5, m3
  psubusw       m3, [srcq+32]
  por           m3, m5
  mova          m5, [srcq+48]
  psubusw       m5, m4
  psubusw       m4, [srcq+48]
  por           m4, m5
  ; reduce 4x8 words to 2x4 words, widen to dwords against zero, accumulate
  paddw         m1, m2
  paddw         m3, m4
  movhlps       m2, m1
  movhlps       m4, m3
  paddw         m1, m2
  paddw         m3, m4
  punpcklwd     m1, m6
  punpcklwd     m3, m6
  lea           refq, [refq+ref_strideq*2]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*2]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps       m1, m0
  paddd         m0, m1
  punpckldq     m0, m6
  movhlps       m1, m0
  paddd         m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld         m0, 1
%endif
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2
HIGH_SAD32XN 8, 2 ; highbd_sad_skip_32x8_sse2
%endif

; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                              uint8_t *ref, int ref_stride);
;
; HIGH_SAD16XN height[, type]: two 16-sample rows per loop iteration.
;   m0 = running sum (dwords), m6 = zero, m1-m4 = ref / abs-diff, m5 = scratch
; src is read with mova and as a direct memory operand, so every src row has
; to be 16-byte aligned; ref is read with movu.
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  ; the loop consumes 2 rows per pass
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov           n_rowsd, %1/4
%else
  mov           n_rowsd, %1/2
%endif
  pxor          m0, m0
  pxor          m6, m6

.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+ref_strideq*2]
  movu          m4, [refq+ref_strideq*2+16]
%if %2 == 1
  ; avg variant: second_pred is read as 4 consecutive 16-byte chunks
  pavgw         m1, [second_predq+mmsize*0]
  pavgw         m2, [second_predq+16]
  pavgw         m3, [second_predq+mmsize*2]
  pavgw         m4, [second_predq+mmsize*2+16]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| per word: saturating subtract in both directions, then OR
  mova          m5, [srcq]
  psubusw       m5, m1
  psubusw       m1, [srcq]
  por           m1, m5
  mova          m5, [srcq+16]
  psubusw       m5, m2
  psubusw       m2, [srcq+16]
  por           m2, m5
  mova          m5, [srcq+src_strideq*2]
  psubusw       m5, m3
  psubusw       m3, [srcq+src_strideq*2]
  por           m3, m5
  mova          m5, [srcq+src_strideq*2+16]
  psubusw       m5, m4
  psubusw       m4, [srcq+src_strideq*2+16]
  por           m4, m5
  ; reduce 4x8 words to 2x4 words, widen to dwords against zero, accumulate
  paddw         m1, m2
  paddw         m3, m4
  movhlps       m2, m1
  movhlps       m4, m3
  paddw         m1, m2
  paddw         m3, m4
  punpcklwd     m1, m6
  punpcklwd     m3, m6
  ; advance two rows (stride in samples * 2 bytes * 2 rows)
  lea           refq, [refq+ref_strideq*4]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps       m1, m0
  paddd         m0, m1
  punpckldq     m0, m6
  movhlps       m1, m0
  paddd         m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld         m0, 1
%endif
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
HIGH_SAD16XN 4, 1 ; highbd_sad16x4_avg_sse2
HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
; Current code fails when there are only 2 rows
; HIGH_SAD16XN 4, 2 ; highbd_sad_skip_16x4_sse2
%endif

; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
;
; HIGH_SAD8XN height[, type]: four 8-sample rows per loop iteration.
;   m0 = running sum (dwords), m6 = zero, m1-m4 = ref / abs-diff,
;   m5 = src row, m7 = copy of ref (hence 8 xmm registers)
; Both src and ref are read with movu here.
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2, 8
  ; the loop consumes 4 rows per pass
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov           n_rowsd, %1/8
%else
  mov           n_rowsd, %1/4
%endif
  pxor          m0, m0
  pxor          m6, m6

.loop:
  movu          m1, [refq]
  movu          m2, [refq+ref_strideq*2]
  movu          m3, [refq+ref_strideq*4]
  movu          m4, [refq+ref_stride3q*2]
%if %2 == 1
  ; avg variant: ref = rounded average of ref and second_pred
  pavgw         m1, [second_predq+mmsize*0]
  pavgw         m2, [second_predq+mmsize*1]
  pavgw         m3, [second_predq+mmsize*2]
  pavgw         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  ; |ref - src| per word: saturating subtract in both directions, then OR
  mova          m7, m1
  movu          m5, [srcq]
  psubusw       m1, m5
  psubusw       m5, m7
  por           m1, m5

  mova          m7, m2
  movu          m5, [srcq+src_strideq*2]
  psubusw       m2, m5
  psubusw       m5, m7
  por           m2, m5

  mova          m7, m3
  movu          m5, [srcq+src_strideq*4]
  psubusw       m3, m5
  psubusw       m5, m7
  por           m3, m5

  mova          m7, m4
  movu          m5, [srcq+src_stride3q*2]
  psubusw       m4, m5
  psubusw       m5, m7
  por           m4, m5

  ; reduce 4x8 words to 2x4 words, widen to dwords against zero, accumulate
  paddw         m1, m2
  paddw         m3, m4
  movhlps       m2, m1
  movhlps       m4, m3
  paddw         m1, m2
  paddw         m3, m4
  punpcklwd     m1, m6
  punpcklwd     m3, m6
  ; advance four rows (stride in samples * 2 bytes * 4 rows)
  lea           refq, [refq+ref_strideq*8]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*8]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps       m1, m0
  paddd         m0, m1
  punpckldq     m0, m6
  movhlps       m1, m0
  paddd         m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld         m0, 1
%endif
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
; Current code fails when there are only 2 rows (the loop body always
; consumes 4 rows per pass)
; HIGH_SAD8XN 4, 2 ; highbd_sad_skip_8x4_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
%endif

; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
;
; HIGH_SAD4XN height[, type]: four 4-sample rows per loop iteration. Rows are
; loaded as 8 bytes each (movq) and pairs of rows are word-interleaved
; (rows 0+2 and rows 1+3) so that each pair fills one xmm register.
;   m0 = running sum (dwords), m6 = zero, m1/m2 = ref / abs-diff,
;   m3-m5 = scratch
%macro HIGH_SAD4XN 1-2 0
  HIGH_SAD_FN 4, %1, 7, %2
  ; the loop consumes 4 rows per pass
%if %2 == 2 ; skip rows, so divide number of rows by 2
  mov           n_rowsd, %1/8
%else
  mov           n_rowsd, %1/4
%endif
  pxor          m0, m0
  pxor          m6, m6

.loop:
  movq          m1, [refq]
  movq          m2, [refq+ref_strideq*2]
  movq          m3, [refq+ref_strideq*4]
  movq          m4, [refq+ref_stride3q*2]
  punpcklwd     m1, m3                 ; m1 = ref rows 0 and 2, interleaved
  punpcklwd     m2, m4                 ; m2 = ref rows 1 and 3, interleaved
%if %2 == 1
  ; avg variant: interleave second_pred the same way before averaging
  movq          m3, [second_predq+8*0]
  movq          m5, [second_predq+8*2]
  punpcklwd     m3, m5
  movq          m4, [second_predq+8*1]
  movq          m5, [second_predq+8*3]
  punpcklwd     m4, m5
  lea           second_predq, [second_predq+8*4]
  pavgw         m1, m3
  pavgw         m2, m4
%endif
  ; src rows 0 and 2, interleaved like the ref rows, then |ref - src|
  movq          m5, [srcq]
  movq          m3, [srcq+src_strideq*4]
  punpcklwd     m5, m3
  movdqa        m3, m1
  psubusw       m1, m5
  psubusw       m5, m3
  por           m1, m5
  ; src rows 1 and 3
  movq          m5, [srcq+src_strideq*2]
  movq          m4, [srcq+src_stride3q*2]
  punpcklwd     m5, m4
  movdqa        m4, m2
  psubusw       m2, m5
  psubusw       m5, m4
  por           m2, m5
  ; widen both halves of the word sums to dwords and accumulate
  paddw         m1, m2
  movdqa        m2, m1
  punpcklwd     m1, m6
  punpckhwd     m2, m6
  ; advance four rows (stride in samples * 2 bytes * 4 rows)
  lea           refq, [refq+ref_strideq*8]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*8]
  paddd         m0, m2
  dec           n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps       m1, m0
  paddd         m0, m1
  punpckldq     m0, m6
  movhlps       m1, m0
  paddd         m0, m1
%if %2 == 2 ; we skipped rows, so we need to double the sad
  pslld         m0, 1
%endif
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD4XN 8 ; highbd_sad4x8_sse2
HIGH_SAD4XN 4 ; highbd_sad4x4_sse2
HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2
HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2
HIGH_SAD4XN 8, 2 ; highbd_sad_skip_4x8_sse2
; Current code fails when there are only 2 rows (the loop body always
; consumes 4 rows per pass)
; HIGH_SAD4XN 4, 2 ; highbd_sad_skip_4x4_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
%endif