;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "aom_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; High bit-depth (16-bit sample) 8-tap sub-pixel filters, SSE2.
;
; Syntax: NASM/Yasm, Intel operand order.
; ABI:    arguments are reached through arg(n); arg(), sym(), globalsym(),
;         SHADOW_ARGS_TO_STACK/UNSHADOW_ARGS, SAVE_XMM/RESTORE_XMM and
;         ALIGN_STACK all come from x86_abi_support.asm and are used here
;         exactly as paired prologue/epilogue helpers.
;
; Every exported function has the C prototype
;     void f(const uint16_t *src_ptr, const ptrdiff_t src_pitch,
;            uint16_t *output_ptr, ptrdiff_t out_pitch,
;            unsigned int output_height, const int16_t *filter, int bd);
; and computes, per output sample,
;     clamp((sum(tap[i] * pixel[i]) + HBD_8T_ROUND) >> HBD_8T_SHIFT,
;           0, (1 << bd) - 1)
;
; Note carried over from the original source: tap3 and tap4 have to be applied
; and added after other taps to avoid overflow. The accumulation order below
; keeps the k3k4 product as the last term added.
;-----------------------------------------------------------------------------

%define HBD_8T_ROUND        0x00000040      ; rounding bias, 1 << (HBD_8T_SHIFT - 1)
%define HBD_8T_SHIFT        7               ; right shift applied to the 32-bit sums
%define HBD_8T_WORD_ONES    0x00010001      ; the value 1 in both words of a dword
%define HBD_8T_SLOTS_4      7               ; 16-byte stack slots, 4-wide kernels
%define HBD_8T_SLOTS_8      8               ; 16-byte stack slots, 8/16-wide kernels

;-----------------------------------------------------------------------------
; HIGH_ENTER save_rbx
;   Common function entry: frame pointer, shadowed arguments, xmm6/xmm7
;   preservation (via SAVE_XMM 7), then rsi and rdi. rbx is pushed as well when
;   %1 is nonzero (the vertical kernels use it for the output pitch).
;-----------------------------------------------------------------------------
%macro HIGH_ENTER 1
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
%if %1
    push        rbx
%endif
%endm

;-----------------------------------------------------------------------------
; HIGH_LEAVE slots, restore_rbx
;   Mirror image of HIGH_ENTER plus HIGH_FRAME_4/HIGH_FRAME_8: drops the %1
;   scratch slots, reloads the rsp value that sits on top of the stack after
;   ALIGN_STACK, pops the saved registers in reverse order and returns.
;   %2 must match the argument that was given to HIGH_ENTER.
;-----------------------------------------------------------------------------
%macro HIGH_LEAVE 2
    add         rsp, 16 * %1
    pop         rsp
%if %2
    pop         rbx
%endif
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endm

;-----------------------------------------------------------------------------
; HIGH_FRAME_4
;   Aligns the stack to 16 bytes (rax is the scratch register handed to
;   ALIGN_STACK) and reserves the scratch slots of the 4-wide kernels.
;   Slot names pair the taps that share one pmaddwd.
;-----------------------------------------------------------------------------
%macro HIGH_FRAME_4 0
    ALIGN_STACK 16, rax
    sub         rsp, 16 * HBD_8T_SLOTS_4
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define max  [rsp + 16 * 5]
    %define min  [rsp + 16 * 6]
%endm

;-----------------------------------------------------------------------------
; HIGH_FRAME_8
;   Same as HIGH_FRAME_4 for the 8- and 16-wide kernels; adds the `temp`
;   spill slot used by HIGH_APPLY_FILTER_8.
;-----------------------------------------------------------------------------
%macro HIGH_FRAME_8 0
    ALIGN_STACK 16, rax
    sub         rsp, 16 * HBD_8T_SLOTS_8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max  [rsp + 16 * 6]
    %define min  [rsp + 16 * 7]
%endm

;-----------------------------------------------------------------------------
; HIGH_STORE_ROUND_CLAMP
;   Fills the krd, max and min stack slots.
;     krd = HBD_8T_ROUND in each dword
;     max = (1 << bd) - 1 in each word, bd read from arg(6)
;     min = 0
;   Clobbers: rcx, rdx, xmm0, xmm1, xmm2, xmm6.
;-----------------------------------------------------------------------------
%macro HIGH_STORE_ROUND_CLAMP 0
    mov         rcx, HBD_8T_ROUND
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ; broadcast the bias to all dwords
    movdqa      krd, xmm6

    movsxd      rcx, DWORD PTR arg(6)       ; bit depth
    mov         rdx, HBD_8T_WORD_ONES
    movq        xmm1, rcx                   ; shift count for psllw
    movq        xmm0, rdx
    pshufd      xmm0, xmm0, 0               ; 1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ; 1 << bd
    psubw       xmm0, xmm2                  ; (1 << bd) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ; upper clamp bound
    movdqa      min, xmm1                   ; lower clamp bound
%endm

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS_4
;   Loads the eight taps from arg(5) (movdqa: the table must be 16-byte
;   aligned) and stores them interleaved as k0k6 / k1k7 / k2k5 / k3k4, the
;   operand layout HIGH_APPLY_FILTER_4 feeds to pmaddwd. Also fills krd, max
;   and min.
;   Requires the HIGH_FRAME_4 slot names.
;   Clobbers: rcx, rdx, xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ; filter taps
    movdqa      xmm7, [rdx]                 ; k0 .. k7

    ; Taps 0-3 live in the low quadword: splat each over the low four words.
    pshuflw     xmm0, xmm7, 00000000b       ; k0
    pshuflw     xmm1, xmm7, 01010101b       ; k1
    pshuflw     xmm2, xmm7, 10101010b       ; k2
    pshuflw     xmm3, xmm7, 11111111b       ; k3
    psrldq      xmm7, 8                     ; move k4 .. k7 down to the low quadword

    pshuflw     xmm6, xmm7, 10101010b       ; k6
    punpcklwd   xmm0, xmm6                  ; k0 k6 k0 k6 ...
    movdqa      k0k6, xmm0

    pshuflw     xmm5, xmm7, 01010101b       ; k5
    punpcklwd   xmm2, xmm5                  ; k2 k5 k2 k5 ...
    movdqa      k2k5, xmm2

    pshuflw     xmm4, xmm7, 00000000b       ; k4
    punpcklwd   xmm3, xmm4                  ; k3 k4 k3 k4 ...
    movdqa      k3k4, xmm3

    pshuflw     xmm7, xmm7, 11111111b       ; k7 (last reader of the tap vector)
    punpcklwd   xmm1, xmm7                  ; k1 k7 k1 k7 ...
    movdqa      k1k7, xmm1

    HIGH_STORE_ROUND_CLAMP
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
;   In:  xmm0 .. xmm7 = the samples for taps 0 .. 7 (low four words used),
;        rdi = destination.
;   Out: four filtered, rounded, shifted and clamped words stored at [rdi].
;        When %1 is nonzero the result is first averaged (pavgw) with the
;        four words already at [rdi].
;   Clobbers: xmm0-xmm3.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    punpcklwd   xmm0, xmm6                  ; pair tap-0 and tap-6 samples
    pmaddwd     xmm0, k0k6                  ; k0 * p0 + k6 * p6

    punpcklwd   xmm1, xmm7
    pmaddwd     xmm1, k1k7
    paddd       xmm0, xmm1

    punpcklwd   xmm2, xmm5
    pmaddwd     xmm2, k2k5
    paddd       xmm0, xmm2

    punpcklwd   xmm3, xmm4
    pmaddwd     xmm3, k3k4
    paddd       xmm0, xmm3                  ; centre taps are accumulated last

    paddd       xmm0, krd                   ; round
    psrad       xmm0, HBD_8T_SHIFT          ; scale
    packssdw    xmm0, xmm0                  ; dwords -> signed-saturated words

    pminsw      xmm0, max                   ; clamp to [0, (1 << bd) - 1]
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif
    movq        [rdi], xmm0
%endm

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS
;   8/16-wide counterpart of HIGH_GET_FILTERS_4. Loads rsi = src_ptr (arg 0)
;   and rdi = output_ptr (arg 2), then stores the taps from arg(5) (movdqa:
;   16-byte aligned) interleaved as k0k1 / k6k7 / k2k5 / k3k4 across all eight
;   words, and fills krd, max and min.
;   Requires the HIGH_FRAME_8 slot names.
;   Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS 0
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    mov         rdx, arg(5)                 ; filter taps
    movdqa      xmm7, [rdx]                 ; k0 .. k7

    pshuflw     xmm0, xmm7, 00000000b       ; k0 in the low four words
    pshuflw     xmm1, xmm7, 01010101b       ; k1 in the low four words
    punpcklwd   xmm0, xmm1                  ; k0 k1 x4
    movdqa      k0k1, xmm0                  ; keep the factors on the stack

    pshuflw     xmm2, xmm7, 10101010b       ; k2 in the low four words
    punpcklqdq  xmm2, xmm2                  ; ... and in the high four
    pshufhw     xmm5, xmm7, 01010101b       ; k5 in the high four words
    punpckhwd   xmm2, xmm5                  ; k2 k5 x4
    movdqa      k2k5, xmm2

    pshuflw     xmm3, xmm7, 11111111b       ; k3 in the low four words
    punpcklqdq  xmm3, xmm3
    pshufhw     xmm4, xmm7, 00000000b       ; k4 in the high four words
    punpckhwd   xmm3, xmm4                  ; k3 k4 x4
    movdqa      k3k4, xmm3

    pshufhw     xmm6, xmm7, 10101010b       ; k6 in the high four words
    pshufhw     xmm7, xmm7, 11111111b       ; k7 (last reader of the tap vector)
    punpckhwd   xmm6, xmm7                  ; k6 k7 x4
    movdqa      k6k7, xmm6

    HIGH_STORE_ROUND_CLAMP
%endm

;-----------------------------------------------------------------------------
; HIGH_SETUP_VERT
;   Loop registers of the vertical kernels:
;     rax = src_pitch in bytes, rdx = 3 * rax, rbx = out_pitch in bytes,
;     rcx = output_height.
;   Pitches and height are read as 32-bit values and sign-extended.
;-----------------------------------------------------------------------------
%macro HIGH_SETUP_VERT 0
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
    movsxd      rbx, DWORD PTR arg(3)       ; out_pitch (samples)
    movsxd      rax, DWORD PTR arg(1)       ; src_pitch (samples)
    lea         rbx, [rbx + rbx]            ; -> bytes
    lea         rax, [rax + rax]            ; -> bytes
    lea         rdx, [rax + rax * 2]        ; three source rows
%endm

;-----------------------------------------------------------------------------
; HIGH_SETUP_HORZ
;   Loop registers of the horizontal kernels:
;     rax = src_pitch in bytes, rdx = out_pitch in bytes, rcx = output_height.
;   Pitches and height are read as 32-bit values and sign-extended.
;-----------------------------------------------------------------------------
%macro HIGH_SETUP_HORZ 0
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch (samples)
    movsxd      rax, DWORD PTR arg(1)       ; src_pitch (samples)
    lea         rdx, [rdx + rdx]            ; -> bytes
    lea         rax, [rax + rax]            ; -> bytes
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_8 byte_offset
;   Loads eight samples from each of eight consecutive source rows, starting
;   %1 bytes into the row at rsi, into xmm0 (row 0) .. xmm7 (row 7).
;   Expects rax = row pitch in bytes and rdx = 3 * rax.
;   Side effect: rsi is advanced by one row.
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movdqu      xmm0, [rsi + %1]            ; row 0
    movdqu      xmm1, [rsi + rax + %1]      ; row 1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ; row 6
    lea         rsi, [rsi + rax]            ; step down one row
    movdqu      xmm2, [rsi + rax + %1]      ; row 2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ; row 3
    movdqu      xmm4, [rsi + rdx + %1]      ; row 4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ; row 5
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ; row 7
%endm

;-----------------------------------------------------------------------------
; LOAD_HORZ_8 byte_offset
;   Loads the eight one-sample-apart windows needed for eight horizontal
;   outputs whose first output sample sits %1 bytes after rsi:
;   xmm0 starts three samples to the left, xmm7 four samples to the right.
;-----------------------------------------------------------------------------
%macro LOAD_HORZ_8 1
    movdqu      xmm0, [rsi + %1 - 6]
    movdqu      xmm1, [rsi + %1 - 4]
    movdqu      xmm2, [rsi + %1 - 2]
    movdqu      xmm3, [rsi + %1]
    movdqu      xmm4, [rsi + %1 + 2]
    movdqu      xmm5, [rsi + %1 + 4]
    movdqu      xmm6, [rsi + %1 + 6]
    movdqu      xmm7, [rsi + %1 + 8]
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg, dst_offset
;   In:  xmm0 .. xmm7 = eight samples for each of taps 0 .. 7, rdi = dest.
;   Out: eight filtered, rounded, shifted and clamped words stored at
;        [rdi + %2]. When %1 is nonzero the result is first averaged (pavgw)
;        with the eight words already there.
;   All eight xmm registers are live on entry, so the `temp` stack slot is
;   used to juggle one value while the rows are interleaved. The order of the
;   interleave steps matters: each step reuses a register freed by the
;   previous one.
;   Clobbers: xmm0-xmm7, temp.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 2
    movdqu      temp, xmm4                  ; park tap-4 samples, xmm4 becomes scratch

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ; taps 0/1, outputs 0-3
    punpckhwd   xmm4, xmm1                  ; taps 0/1, outputs 4-7

    movdqa      xmm1, xmm6                  ; xmm1 is free now
    punpcklwd   xmm6, xmm7                  ; taps 6/7, outputs 0-3
    punpckhwd   xmm1, xmm7                  ; taps 6/7, outputs 4-7

    movdqa      xmm7, xmm2                  ; xmm7 is free now
    punpcklwd   xmm2, xmm5                  ; taps 2/5, outputs 0-3
    punpckhwd   xmm7, xmm5                  ; taps 2/5, outputs 4-7

    movdqu      xmm5, temp                  ; tap-4 samples back
    movdqu      temp, xmm4                  ; park taps 0/1 high half instead
    movdqa      xmm4, xmm3
    punpcklwd   xmm3, xmm5                  ; taps 3/4, outputs 0-3
    punpckhwd   xmm4, xmm5                  ; taps 3/4, outputs 4-7
    movdqu      xmm5, temp                  ; taps 0/1, outputs 4-7

    ; Outputs 0-3 accumulate in xmm0.
    pmaddwd     xmm0, k0k1
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm3, k3k4
    paddd       xmm0, xmm6
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3                  ; centre taps are accumulated last
    paddd       xmm0, krd                   ; round
    psrad       xmm0, HBD_8T_SHIFT          ; scale

    ; Outputs 4-7 accumulate in xmm5.
    pmaddwd     xmm5, k0k1
    pmaddwd     xmm1, k6k7
    pmaddwd     xmm7, k2k5
    pmaddwd     xmm4, k3k4
    paddd       xmm5, xmm1
    paddd       xmm5, xmm7
    paddd       xmm5, xmm4
    paddd       xmm5, krd
    psrad       xmm5, HBD_8T_SHIFT

    packssdw    xmm0, xmm5                  ; eight signed-saturated words

    pminsw      xmm0, max                   ; clamp to [0, (1 << bd) - 1]
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]
    pavgw       xmm0, xmm1
%endif
    movdqu      [rdi + %2], xmm0
%endm

SECTION .text

;-----------------------------------------------------------------------------
; void aom_highbd_filter_block1d4_v8_sse2
; (
;     const uint16_t *src_ptr,
;     const ptrdiff_t src_pitch,
;     uint16_t *output_ptr,
;     ptrdiff_t out_pitch,
;     unsigned int output_height,
;     const int16_t *filter,
;     int bd
; )
; Vertical 8-tap filter, 4 samples wide. Output row r is built from source
; rows r .. r+7 starting at src_ptr.
; Loop registers: rsi = source row, rdi = destination row, rax = src pitch
; (bytes), rdx = 3 * rax, rbx = out pitch (bytes), rcx = rows left.
; The loop is bottom-tested, so output_height must be at least 1.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d4_v8_sse2)
sym(aom_highbd_filter_block1d4_v8_sse2):
    HIGH_ENTER  1
    HIGH_FRAME_4

    HIGH_GET_FILTERS_4

    mov         rdi, arg(2)                 ; output_ptr
    mov         rsi, arg(0)                 ; src_ptr
    HIGH_SETUP_VERT

.next_row:
    movq        xmm0, [rsi]                 ; row 0
    movq        xmm1, [rsi + rax]           ; row 1
    movq        xmm6, [rsi + rdx * 2]       ; row 6
    lea         rsi, [rsi + rax]            ; step down one row
    movq        xmm2, [rsi + rax]           ; row 2
    movq        xmm3, [rsi + rax * 2]       ; row 3
    movq        xmm4, [rsi + rdx]           ; row 4
    movq        xmm5, [rsi + rax * 4]       ; row 5
    movq        xmm7, [rsi + rdx * 2]       ; row 7

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .next_row

    HIGH_LEAVE  HBD_8T_SLOTS_4, 1

;-----------------------------------------------------------------------------
; void aom_highbd_filter_block1d8_v8_sse2
; (
;     const uint16_t *src_ptr,
;     const ptrdiff_t src_pitch,
;     uint16_t *output_ptr,
;     ptrdiff_t out_pitch,
;     unsigned int output_height,
;     const int16_t *filter,
;     int bd
; )
; Vertical 8-tap filter, 8 samples wide. rsi/rdi are loaded by
; HIGH_GET_FILTERS; LOAD_VERT_8 advances rsi by one row per iteration.
; The loop is bottom-tested, so output_height must be at least 1.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d8_v8_sse2)
sym(aom_highbd_filter_block1d8_v8_sse2):
    HIGH_ENTER  1
    HIGH_FRAME_8

    HIGH_GET_FILTERS
    HIGH_SETUP_VERT

.next_row:
    LOAD_VERT_8 0
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .next_row

    HIGH_LEAVE  HBD_8T_SLOTS_8, 1

;-----------------------------------------------------------------------------
; void aom_highbd_filter_block1d16_v8_sse2
; (
;     const uint16_t *src_ptr,
;     const ptrdiff_t src_pitch,
;     uint16_t *output_ptr,
;     ptrdiff_t out_pitch,
;     unsigned int output_height,
;     const int16_t *filter,
;     int bd
; )
; Vertical 8-tap filter, 16 samples wide, done as two 8-sample halves per
; output row (byte offsets 0 and 16).
; The loop is bottom-tested, so output_height must be at least 1.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d16_v8_sse2)
sym(aom_highbd_filter_block1d16_v8_sse2):
    HIGH_ENTER  1
    HIGH_FRAME_8

    HIGH_GET_FILTERS
    HIGH_SETUP_VERT

.next_row:
    LOAD_VERT_8 0
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ; undo the row step; right half uses the same rows

    LOAD_VERT_8 16
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx

    dec         rcx
    jnz         .next_row

    HIGH_LEAVE  HBD_8T_SLOTS_8, 1

;-----------------------------------------------------------------------------
; void aom_highbd_filter_block1d4_h8_sse2
; (
;     const uint16_t *src_ptr,
;     const ptrdiff_t src_pitch,
;     uint16_t *output_ptr,
;     ptrdiff_t out_pitch,
;     unsigned int output_height,
;     const int16_t *filter,
;     int bd
; )
; Horizontal 8-tap filter, 4 samples wide. Each row is read with two
; unaligned 16-byte loads at [rsi - 6] and [rsi + 2]; the remaining tap
; windows are derived from those by byte shifts.
; Loop registers: rsi = source row, rdi = destination row, rax = src pitch
; (bytes), rdx = out pitch (bytes), rcx = rows left.
; The loop is bottom-tested, so output_height must be at least 1.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d4_h8_sse2)
sym(aom_highbd_filter_block1d4_h8_sse2):
    HIGH_ENTER  0
    HIGH_FRAME_4

    HIGH_GET_FILTERS_4

    mov         rdi, arg(2)                 ; output_ptr
    mov         rsi, arg(0)                 ; src_ptr
    HIGH_SETUP_HORZ

.next_row:
    movdqu      xmm0, [rsi - 6]             ; tap 0 window
    movdqu      xmm4, [rsi + 2]             ; tap 4 window

    movdqa      xmm1, xmm0
    psrldq      xmm1, 2                     ; tap 1 window
    movdqa      xmm2, xmm0
    psrldq      xmm2, 4                     ; tap 2 window
    movdqa      xmm3, xmm0
    psrldq      xmm3, 6                     ; tap 3 window

    movdqa      xmm5, xmm4
    psrldq      xmm5, 2                     ; tap 5 window
    movdqa      xmm6, xmm4
    psrldq      xmm6, 4                     ; tap 6 window
    movdqa      xmm7, xmm4
    psrldq      xmm7, 6                     ; tap 7 window

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .next_row

    HIGH_LEAVE  HBD_8T_SLOTS_4, 0

;-----------------------------------------------------------------------------
; void aom_highbd_filter_block1d8_h8_sse2
; (
;     const uint16_t *src_ptr,
;     const ptrdiff_t src_pitch,
;     uint16_t *output_ptr,
;     ptrdiff_t out_pitch,
;     unsigned int output_height,
;     const int16_t *filter,
;     int bd
; )
; Horizontal 8-tap filter, 8 samples wide. Each row is read with eight
; unaligned 16-byte loads covering [rsi - 6, rsi + 24).
; The loop is bottom-tested, so output_height must be at least 1.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d8_h8_sse2)
sym(aom_highbd_filter_block1d8_h8_sse2):
    HIGH_ENTER  0
    HIGH_FRAME_8

    HIGH_GET_FILTERS
    HIGH_SETUP_HORZ

.next_row:
    LOAD_HORZ_8 0
    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .next_row

    HIGH_LEAVE  HBD_8T_SLOTS_8, 0

;-----------------------------------------------------------------------------
; void aom_highbd_filter_block1d16_h8_sse2
; (
;     const uint16_t *src_ptr,
;     const ptrdiff_t src_pitch,
;     uint16_t *output_ptr,
;     ptrdiff_t out_pitch,
;     unsigned int output_height,
;     const int16_t *filter,
;     int bd
; )
; Horizontal 8-tap filter, 16 samples wide, done as two 8-sample halves per
; row (byte offsets 0 and 16). Each row is read over [rsi - 6, rsi + 40).
; The loop is bottom-tested, so output_height must be at least 1.
;-----------------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d16_h8_sse2)
sym(aom_highbd_filter_block1d16_h8_sse2):
    HIGH_ENTER  0
    HIGH_FRAME_8

    HIGH_GET_FILTERS
    HIGH_SETUP_HORZ

.next_row:
    LOAD_HORZ_8 0
    HIGH_APPLY_FILTER_8 0, 0

    LOAD_HORZ_8 16
    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .next_row

    HIGH_LEAVE  HBD_8T_SLOTS_8, 0