;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times  8 dw  8

; Bilinear filter taps, one 16-byte row per eighth-pel offset; each byte pair
; is (16 - 2*off, 2*off) and sums to 16.
bilin_filter_m_ssse3: times  8 db 16,  0
                      times  8 db 14,  2
                      times  8 db 12,  4
                      times  8 db 10,  6
                      times 16 db  8
                      times  8 db  6, 10
                      times  8 db  4, 12
                      times  8 db  2, 14

SECTION .text

; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) in the given pointer.

%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4           ; diffs for the second src/dst pair
  psubw                %1, %2           ; diffs for the first src/dst pair
  paddw                %5, %3           ; accumulate sum (words)
  pmaddwd              %3, %3           ; square diffs, pair-sum into dwords
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3           ; accumulate sse (dwords)
  paddd                %6, %1
%endmacro

%macro STORE_AND_RET 1
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%else ; 4xh
  pshuflw              m4, m6, 0xe
  pshuflw              m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshuflw              m4, m6, 0xe
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE  0
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64.

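; As an orientation sketch (pseudo-C under assumed names, not the library's
; actual reference code), every kernel expanded from this macro behaves like
; the following, shown for a single filter direction; the bilinear-in-both-
; directions case applies it horizontally first, then vertically:
;
;   int sum = 0;
;   uint32_t sse = 0;
;   for (int i = 0; i < h; i++) {
;     for (int j = 0; j < w; j++) {
;       // a + b == 16 are the taps selected by the sub-pel offset
;       const int pred = (a * src[j] + b * src_next[j] + 8) >> 4;
;       const int diff = pred - dst[j];
;       sum += diff;
;       sse += diff * diff;
;     }
;     src += src_stride;
;     dst += dst_stride;
;   }
;   *sse_ptr = sse;
;   return sum;  // callers form variance as sse - sum * sum / (w * h)
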
%if AOM_ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                        x_offset, y_offset, dst, dst_stride, \
                                        sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                    x_offset, y_offset, dst, dst_stride, \
                                    height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, dst, dst_stride, \
                                          sec, sec_stride, height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, dst, dst_stride, \
                                      height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store the bilin_filter and pw_8 locations on the stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4        ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1   ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, sec, sec_stride, \
                                          height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, dst, dst_stride, \
                                      height, sse
      %define block_height heightd
    %endif
    %define bilin_filter bilin_filter_m
  %endif
%endif

%if %1 == 4
  %define movx movd
%else
  %define movx movh
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor                 m5, m5           ; dedicated zero register
%if %1 < 16
  sar                  block_height, 1
%if %2 == 1 ; avg
  shl                  sec_str, 1
%endif
%endif

  ; FIXME(rbultje) replace by jumptable?
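  ; Overview of the nine specialized paths below (offset 4 is a half-pel
  ; step, handled with pavgb; any other nonzero offset uses the full
  ; bilinear filter):
  ;
  ;                  | y_offset == 0 | y_offset == 4  | y_offset == other
  ;  ----------------+---------------+----------------+-------------------
  ;  x_offset == 0   | compare only  | vert avg       | vert filter
  ;  x_offset == 4   | horiz avg     | horiz+vert avg | horiz avg,
  ;                  |               |                | vert filter
  ;  x_offset other  | horiz filter  | horiz filter,  | horiz filter,
  ;                  |               | vert avg       | vert filter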
  test                 x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test                 y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  mova                 m1, [dstq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5

%if %2 == 0 ; !avg
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
%endif

  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]

%if %2 == 1 ; avg
%if %1 > 4
  pavgb                m0, [secq]
%else
  movh                 m2, [secq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET %1

.x_zero_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq*2]
%else ; 4xh
  movx                 m1, [srcq+src_strideq*2]
  punpckldq            m2, m1
%endif
  movx                 m1, [dstq]
%if %1 > 4
  movlhps              m0, m2
%else ; 4xh
  punpckldq            m0, m2
%endif
  movx                 m3, [dstq+dst_strideq]
  pavgb                m0, m2
  punpcklbw            m1, m5
%if %1 > 4
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m4, [secq]
  pavgb                m0, m4
  punpcklbw            m3, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq*2]
  movx                 m1, [dstq]
  pavgb                m0, m2
  movx                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET %1

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add                  y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

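; How the generic filter works: the offset was scaled left by
; filter_idx_shift (4 for the ssse3 flavours built from this file) because
; each tap row in bilin_filter_m is 16 bytes wide, so the shifted offset is
; the byte offset of its row. Per pixel the filter computes
;
;   out = (filter_y_a * row0 + filter_y_b * row1 + 8) >> 4
;
; e.g. an eighth-pel offset of 3 uses taps (10, 6). On ssse3 the byte pair
; sits interleaved so a single pmaddubsw forms the whole dot product; the
; non-ssse3 path spells it out with two pmullw and adds.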
.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq*2]
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1

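; The half-pel cases rely on pavgb, which computes (a + b + 1) >> 1 per
; byte; since (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1, this is exactly the
; bilinear filter with the centre taps (8, 8), just much cheaper than the
; multiply-based path.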
.x_nonzero:
  cmp                  x_offsetd, 4
  jne .x_nonhalf
  ; x_offset == 0.5
  test                 y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m4, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
  movhps               m4, [srcq+src_strideq+1]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
  movx                 m2, [srcq+src_strideq+1]
  punpckldq            m4, m2
%endif
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  pavgb                m0, m4
  punpcklbw            m3, m5
%if %1 > 4
  pavgb                m0, [secq]
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m1, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
  movx                 m1, [dstq]
  pavgb                m0, m4
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET %1

.x_half_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m3
  punpckhbw            m3, m1, m5
  pavgb                m0, m4
%if %2 == 1 ; avg
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movx                 m2, [srcq]
  movx                 m3, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq]
  movhps               m3, [srcq+src_strideq+1]
%else
  movx                 m1, [srcq+src_strideq]
  punpckldq            m2, m1
  movx                 m1, [srcq+src_strideq+1]
  punpckldq            m3, m1
%endif
  pavgb                m2, m3
%if %1 > 4
  movlhps              m0, m2
  movhlps              m4, m2
%else ; 4xh
  punpckldq            m0, m2
  pshuflw              m4, m2, 0xe
%endif
  movx                 m1, [dstq]
  pavgb                m0, m2
  movx                 m3, [dstq+dst_strideq]
%if %1 > 4
  pavgb                m0, [secq]
%else
  movh                 m2, [secq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq]
  movx                 m1, [srcq+src_strideq+1]
  pavgb                m2, m3
  pavgb                m4, m1
  pavgb                m0, m2
  pavgb                m2, m4
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_half_loop
  STORE_AND_RET %1

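; Note the software pipelining in the vertical loops above and below: the
; bottom row of each iteration is computed into m4 and carried into the next
; iteration through "mova m0, m4", so every pass loads and averages (or
; filters) only one new source row.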
.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add                  y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
.x_half_y_other_loop:
  movu                 m4, [srcq]
  movu                 m2, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m2
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  punpcklbw            m0, m5
  paddw                m2, m3
  punpcklbw            m3, m4, m5
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
%endif
  punpckhbw            m3, m1, m5
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                  srcq, src_strideq
  pavgb                m0, m3
%if notcpuflag(ssse3)
  punpcklbw            m0, m5
%endif
.x_half_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
  pavgb                m2, m1
  pavgb                m4, m3
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  paddw                m0, m1
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m2, m1
  movx                 m1, [dstq]
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1

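; The ssse3 paths lean on pmaddubsw: it multiplies unsigned bytes from its
; first operand by signed bytes from its second and sums adjacent pairs into
; words. Interleaving the two source rows (or columns) with punpck[lh]bw
; first is what pairs each pixel with its neighbour, and the taps above all
; fit in signed bytes, so one instruction forms the whole 2-tap dot product.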
.x_nonhalf:
  test                 y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if AOM_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add                  x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  movx                 m1, [dstq]
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_x_a
  pmaddubsw            m2, filter_x_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1

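; Register-budget note: on x86-64 with W > 4 the taps and the rounding
; constant live in m8-m10 (m8-m12 when both directions filter), while 32-bit
; builds, with only 8 xmm registers, read them through memory operands
; instead; the filter_x_*/filter_y_*/filter_rnd %defines hide that
; difference from the loop bodies.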
.x_nonhalf_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if AOM_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add                  x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                  srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [dstq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [secq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                  srcq, src_strideq
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                  srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movx                 m1, [dstq]
  paddw                m4, m3
  movx                 m3, [dstq+dst_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2
  pavgw                m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                  srcq, [srcq+src_strideq*2]
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1

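; The fully general case below is a separable two-pass filter: each
; iteration bilinearly filters the new row(s) horizontally, then runs the
; vertical filter over the previous and current filtered rows. The
; intermediate row is carried across iterations in a register, packed back
; to bytes on ssse3 so the vertical pass can use pmaddubsw again.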
.x_nonhalf_y_nonhalf:
%if AOM_ARCH_X86_64
  lea                  bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
  shl                  y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                m11, [bilin_filter+y_offsetq+16]
%endif
  mova                m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. The src_stride register is
; repurposed as tempq; later, src_stride has to be reloaded from the stack
; whenever it is needed.
%define tempq src_strideq
  mov                  tempq, g_bilin_filterm
  add                  x_offsetq, tempq
  add                  y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov                  tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
  add                  y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
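; From here on the source pointer is advanced with INC_SRC_BY_SRC_STRIDE
; (defined near the top of this file): on 32-bit PIC builds src_strideq was
; just repurposed as tempq, so the stride must be re-read from its stack
; slot (src_stridemp) on every advance.
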
  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [dstq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  add                  dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movx                 m4, [srcq]
  movx                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m3, [dstq+dst_strideq]
  movx                 m1, [dstq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, m1
  movx                 m1, [dstq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea                  dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                  secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
  STORE_AND_RET %1
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.

INIT_XMM ssse3
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_XMM ssse3
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1
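
; A note on what the instantiations above emit: with the x86inc name
; mangling, SUBPEL_VARIANCE 16 becomes (roughly) a symbol like
; aom_sub_pixel_variance16xh_ssse3, and SUBPEL_VARIANCE 16, 1 its
; aom_sub_pixel_avg_variance16xh_ssse3 counterpart, which additionally
; averages the prediction with a second predictor (sec) before computing
; the SE/SSE. Height is a runtime argument, so the three widths cover all
; block sizes.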