1; Copyright © 2018-2021, VideoLAN and dav1d authors 2; Copyright © 2018, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 16 32 33; Note: The order of (at least some of) those constants matter! 
34 35const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 36 37%macro COEF_PAIR 2 38pw_%1_%2: dw %1, %2 39pw_m%2_%1: dw -%2, %1 40%endmacro 41 42; ADST-only 43pw_3803_1321: dw 3803, 1321 44pw_m1321_2482: dw -1321, 2482 45pw_2482_3344: dw 2482, 3344 46pw_m3344_3344: dw -3344, 3344 47pw_m3803_3344: dw -3803, 3344 48pw_m3803_m6688: dw -3803, -6688 49pw_2896_m2896: dw 2896, -2896 50 51const pw_5, times 2 dw 5 52const pw_2048, times 2 dw 2048 53const pw_4096, times 2 dw 4096 54const pw_8192, times 2 dw 8192 55const pw_16384, times 2 dw 16384 56const pw_1697x16, times 2 dw 1697*16 57const pw_1697x8, times 2 dw 1697*8 58const pw_2896x8, times 2 dw 2896*8 59const pd_2048, dd 2048 60 61const pw_2896_2896, dw 2896, 2896 62const pw_m2896_2896, dw -2896, 2896 63const pw_1567_3784, dw 1567, 3784 64const pw_m3784_1567, dw -3784, 1567 65COEF_PAIR 3784, 1567 66COEF_PAIR 201, 4091 67COEF_PAIR 995, 3973 68COEF_PAIR 1751, 3703 69COEF_PAIR 2440, 3290 70COEF_PAIR 3035, 2751 71COEF_PAIR 3513, 2106 72COEF_PAIR 3857, 1380 73COEF_PAIR 4052, 601 74COEF_PAIR 401, 4076 75COEF_PAIR 1931, 3612 76COEF_PAIR 3166, 2598 77COEF_PAIR 3920, 1189 78COEF_PAIR 799, 4017 79COEF_PAIR 3406, 2276 80pw_m799_m4017: dw -799, -4017 81const pw_m1567_m3784, dw -1567, -3784 82pw_m3406_m2276: dw -3406, -2276 83pw_m401_m4076: dw -401, -4076 84pw_m3166_m2598: dw -3166, -2598 85pw_m1931_m3612: dw -1931, -3612 86pw_m3920_m1189: dw -3920, -1189 87COEF_PAIR 2276, 3406 88COEF_PAIR 4017, 799 89 90%macro COEF_X8 1-* 91%rep %0 92 dw %1*8, %1*8 93 %rotate 1 94%endrep 95%endmacro 96 97pw_3703x8: COEF_X8 3703 98pw_1751x8: COEF_X8 1751 99pw_m1380x8: COEF_X8 -1380 100pw_3857x8: COEF_X8 3857 101pw_3973x8: COEF_X8 3973 102pw_995x8: COEF_X8 995 103pw_m2106x8: COEF_X8 -2106 104pw_3513x8: COEF_X8 3513 105pw_3290x8: COEF_X8 3290 106pw_2440x8: COEF_X8 2440 107pw_m601x8: COEF_X8 -601 108pw_4052x8: COEF_X8 4052 109 110const idct64_mul 111COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 112COEF_X8 3745, 1660, 
3564, 2019, 3822, -1474, 3948, -1092 113COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 114COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 115 116pw_201_4091x8: dw 201*8, 4091*8 117pw_m601_4052x8: dw -601*8, 4052*8 118pw_995_3973x8: dw 995*8, 3973*8 119pw_m1380_3857x8: dw -1380*8, 3857*8 120pw_1751_3703x8: dw 1751*8, 3703*8 121pw_m2106_3513x8: dw -2106*8, 3513*8 122pw_2440_3290x8: dw 2440*8, 3290*8 123pw_m2751_3035x8: dw -2751*8, 3035*8 124 125%define o_idct64_offset idct64_mul - (o_base) - 8 126 127SECTION .text 128 129; Code size reduction trickery: Instead of using rip-relative loads with 130; mandatory 4-byte offsets everywhere, we can set up a base pointer with a 131; single rip-relative lea and then address things relative from that with 132; 1-byte offsets as long as data is within +-128 bytes of the base pointer. 133%define o_base deint_shuf + 128 134%define o(x) (r6 - (o_base) + (x)) 135%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) 136 137; flags: 1 = swap, 2 = interleave, 4: coef_regs 138%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags 139%if %7 & 4 140 pmaddwd m%2, m%5, m%1 141 pmaddwd m%1, m%6 142%else 143%if %7 & 1 144 vpbroadcastd m%2, [o(pw_%5_%6)] 145 vpbroadcastd m%3, [o(pw_m%6_%5)] 146%else 147 vpbroadcastd m%2, [o(pw_m%6_%5)] 148 vpbroadcastd m%3, [o(pw_%5_%6)] 149%endif 150 pmaddwd m%2, m%1 151 pmaddwd m%1, m%3 152%endif 153 paddd m%2, m%4 154 paddd m%1, m%4 155%if %7 & 2 156 pslld m%2, 4 157 psrld m%1, 12 158 pblendw m%1, m%2, 0xaa 159%else 160 psrad m%2, 12 161 psrad m%1, 12 162 packssdw m%1, m%2 163%endif 164%endmacro 165 166; flags: 1 = swap, 2 = interleave, 4 = coef_regs 167%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags 168%if %10 & 1 169 vpbroadcastd m%3, [o(pw_%8_%9)] 170 vpbroadcastd m%4, [o(pw_m%9_%8)] 171 vpbroadcastd xm%2, [o(pw_%6_%7)] 172 vpblendd m%2, m%3, 0xf0 173 vpbroadcastd xm%3, [o(pw_m%7_%6)] 174%else 175 vpbroadcastd m%3, [o(pw_m%9_%8)] 176 vpbroadcastd 
m%4, [o(pw_%8_%9)] 177 vpbroadcastd xm%2, [o(pw_m%7_%6)] 178 vpblendd m%2, m%3, 0xf0 179 vpbroadcastd xm%3, [o(pw_%6_%7)] 180%endif 181 vpblendd m%3, m%4, 0xf0 182 ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) 183%endmacro 184 185; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 186; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 187%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 188 punpckhwd m%3, m%2, m%1 189 punpcklwd m%2, m%1 190%if %7 < 32 191 pmaddwd m%1, m%7, m%2 192 pmaddwd m%4, m%7, m%3 193%else 194 vpbroadcastd m%1, [o(pw_m%7_%6)] 195 pmaddwd m%4, m%3, m%1 196 pmaddwd m%1, m%2 197%endif 198 paddd m%4, m%5 199 paddd m%1, m%5 200 psrad m%4, 12 201 psrad m%1, 12 202 packssdw m%1, m%4 203%if %7 < 32 204 pmaddwd m%3, m%6 205 pmaddwd m%2, m%6 206%else 207 vpbroadcastd m%4, [o(pw_%6_%7)] 208 pmaddwd m%3, m%4 209 pmaddwd m%2, m%4 210%endif 211 paddd m%3, m%5 212 paddd m%2, m%5 213 psrad m%3, 12 214 psrad m%2, 12 215%if %0 == 8 216 packssdw m%8, m%2, m%3 217%else 218 packssdw m%2, m%3 219%endif 220%endmacro 221 222%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 223 ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 224 ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 225 psubsw m%3, m%1, m%2 226 paddsw m%2, m%1 227 paddsw m%1, m%4, m%5 228 psubsw m%4, m%5 229%endmacro 230 231%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 232 ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a 233 ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a 234 ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 235 paddsw m%9, m%2, m%6 ; t4 236 psubsw m%2, m%6 ; t5a 237 paddsw m%10, m%8, m%4 ; t7 238 psubsw m%8, m%4 ; t6a 239 ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 240 ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 241 psubsw m%6, m%1, m%3 ; dct4 out2 242 paddsw m%3, m%1 ; dct4 out1 243 paddsw m%1, m%5, m%7 ; dct4 out0 244 psubsw m%5, m%7 ; dct4 out3 245 psubsw m%7, m%3, m%2 ; out6 246 paddsw m%2, m%3 ; out1 
247 paddsw m%3, m%6, m%8 ; out2 248 psubsw m%6, m%8 ; out5 249 psubsw m%8, m%1, m%10 ; out7 250 paddsw m%1, m%10 ; out0 251 paddsw m%4, m%5, m%9 ; out3 252 psubsw m%5, m%9 ; out4 253%endmacro 254 255; in1 = %1, in3 = %2, in5 = %3, in7 = %4 256; in9 = %5, in11 = %6, in13 = %7, in15 = %8 257%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 258 ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a 259 ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a 260 ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a 261 ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a 262 psubsw m%9, m%2, m%6 ; t13 263 paddsw m%6, m%2 ; t12 264 psubsw m%2, m%8, m%4 ; t14 265 paddsw m%8, m%4 ; t15 266 psubsw m%4, m%7, m%3 ; t10 267 paddsw m%3, m%7 ; t11 268 psubsw m%7, m%1, m%5 ; t9 269 paddsw m%1, m%5 ; t8 270 ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a 271 ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a 272 psubsw m%5, m%1, m%3 ; t11a 273 paddsw m%1, m%3 ; t8a 274 psubsw m%3, m%7, m%4 ; t13 275 paddsw m%7, m%4 ; t14 276 psubsw m%4, m%8, m%6 ; t12a 277 paddsw m%8, m%6 ; t15a 278 psubsw m%6, m%2, m%9 ; t10 279 paddsw m%2, m%9 ; t9 280 ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a 281 ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 282%endmacro 283 284%macro WRAP_XMM 1+ 285 INIT_XMM cpuname 286 %1 287 INIT_YMM cpuname 288%endmacro 289 290%macro ITX4_END 4-5 2048 ; row[1-4], rnd 291%if %5 292 vpbroadcastd m2, [o(pw_%5)] 293 pmulhrsw m0, m2 294 pmulhrsw m1, m2 295%endif 296 lea r2, [dstq+strideq*2] 297%assign %%i 1 298%rep 4 299 %if %1 & 2 300 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) 301 %else 302 CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) 303 %endif 304 %assign %%i %%i + 1 305 %rotate 1 306%endrep 307 movd m2, [%%row_adr1] 308 pinsrd m2, [%%row_adr2], 1 309 movd m3, [%%row_adr3] 310 pinsrd m3, [%%row_adr4], 1 311 pmovzxbw m2, m2 312 pmovzxbw m3, m3 313 paddw m0, m2 314 paddw m1, m3 315 packuswb 
m0, m1 316 movd [%%row_adr1], m0 317 pextrd [%%row_adr2], m0, 1 318 pextrd [%%row_adr3], m0, 2 319 pextrd [%%row_adr4], m0, 3 320 ret 321%endmacro 322 323%macro IWHT4_1D_PACKED 0 324 punpckhqdq m3, m0, m1 ; in1 in3 325 punpcklqdq m0, m1 ; in0 in2 326 psubw m2, m0, m3 327 paddw m0, m3 328 punpckhqdq m2, m2 ; t2 t2 329 punpcklqdq m0, m0 ; t0 t0 330 psubw m1, m0, m2 331 psraw m1, 1 332 psubw m1, m3 ; t1 t3 333 psubw m0, m1 ; ____ out0 334 paddw m2, m1 ; out3 ____ 335%endmacro 336 337INIT_XMM avx2 338cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c 339 mova m0, [cq+16*0] 340 mova m1, [cq+16*1] 341 pxor m2, m2 342 mova [cq+16*0], m2 343 mova [cq+16*1], m2 344 psraw m0, 2 345 psraw m1, 2 346 IWHT4_1D_PACKED 347 punpckhwd m0, m1 348 punpcklwd m3, m1, m2 349 punpckhdq m1, m0, m3 350 punpckldq m0, m3 351 IWHT4_1D_PACKED 352 vpblendd m0, m2, 0x03 353 ITX4_END 3, 0, 2, 1, 0 354 355%macro INV_TXFM_FN 3 ; type1, type2, size 356cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 357 %define %%p1 m(i%1_%3_internal_8bpc) 358 lea r6, [o_base] 359 ; Jump to the 1st txfm function if we're not taking the fast path, which 360 ; in turn performs an indirect jump to the 2nd txfm function. 
361 lea tx2q, [m(i%2_%3_internal_8bpc).pass2] 362%ifidn %1_%2, dct_dct 363 test eobd, eobd 364 jnz %%p1 365%else 366 ; jump to the 1st txfm function unless it's located directly after this 367 times ((%%end - %%p1) >> 31) & 1 jmp %%p1 368ALIGN function_align 369%%end: 370%endif 371%endmacro 372 373%macro INV_TXFM_4X4_FN 2 ; type1, type2 374 INV_TXFM_FN %1, %2, 4x4 375%ifidn %1_%2, dct_dct 376 vpbroadcastw m0, [cq] 377 vpbroadcastd m1, [o(pw_2896x8)] 378 pmulhrsw m0, m1 379 mov [cq], eobd ; 0 380 pmulhrsw m0, m1 381 mova m1, m0 382 jmp m(iadst_4x4_internal_8bpc).end2 383%endif 384%endmacro 385 386%macro IDCT4_1D_PACKED 0 387 vpbroadcastd m4, [o(pd_2048)] 388 punpckhwd m2, m1, m0 389 punpcklwd m1, m0 390 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 391 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 392 paddsw m0, m1, m2 ; out0 out1 393 psubsw m1, m2 ; out3 out2 394%endmacro 395 396%macro IADST4_1D_PACKED 0 397 punpcklwd m2, m1, m0 398 punpckhwd m3, m1, m0 399 vpbroadcastd m5, [o(pw_m3344_3344)] 400 vpbroadcastd m0, [o(pw_3803_1321)] 401 vpbroadcastd m4, [o(pw_m1321_2482)] 402 pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 403 psrld m5, 16 404 pmaddwd m0, m2 405 pmaddwd m2, m4 406 pmaddwd m5, m3 ; 3344*in0 407 paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 408 vpbroadcastd m4, [o(pw_2482_3344)] 409 vpbroadcastd m5, [o(pw_m3803_3344)] 410 pmaddwd m4, m3 411 pmaddwd m5, m3 412 paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 413 vpbroadcastd m0, [o(pw_m3803_m6688)] 414 pmaddwd m3, m0 415 vpbroadcastd m0, [o(pd_2048)] 416 paddd m2, m0 417 paddd m1, m0 418 paddd m0, m4 419 paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 420 paddd m2, m4 421 paddd m2, m3 422 REPX {psrad x, 12}, m1, m2, m0, m5 423 packssdw m0, m5 ; out0 out1 424 packssdw m1, m2 ; out2 out3 425%endmacro 426 427INV_TXFM_4X4_FN dct, dct 428INV_TXFM_4X4_FN dct, adst 429INV_TXFM_4X4_FN dct, flipadst 430INV_TXFM_4X4_FN dct, identity 431 432cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 433 mova m0, 
[cq+16*0] 434 mova m1, [cq+16*1] 435 IDCT4_1D_PACKED 436 mova m2, [o(deint_shuf)] 437 shufps m3, m0, m1, q1331 438 shufps m0, m1, q0220 439 pshufb m0, m2 440 pshufb m1, m3, m2 441 jmp tx2q 442.pass2: 443 IDCT4_1D_PACKED 444 pxor m2, m2 445 mova [cq+16*0], m2 446 mova [cq+16*1], m2 447 ITX4_END 0, 1, 3, 2 448 449INV_TXFM_4X4_FN adst, dct 450INV_TXFM_4X4_FN adst, adst 451INV_TXFM_4X4_FN adst, flipadst 452INV_TXFM_4X4_FN adst, identity 453 454cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 455 mova m0, [cq+16*0] 456 mova m1, [cq+16*1] 457 call .main 458 punpckhwd m3, m0, m1 459 punpcklwd m0, m1 460 punpckhwd m1, m0, m3 461 punpcklwd m0, m3 462 jmp tx2q 463.pass2: 464 call .main 465.end: 466 pxor m2, m2 467 mova [cq+16*0], m2 468 mova [cq+16*1], m2 469.end2: 470 ITX4_END 0, 1, 2, 3 471ALIGN function_align 472cglobal_label .main 473 IADST4_1D_PACKED 474 ret 475 476INV_TXFM_4X4_FN flipadst, dct 477INV_TXFM_4X4_FN flipadst, adst 478INV_TXFM_4X4_FN flipadst, flipadst 479INV_TXFM_4X4_FN flipadst, identity 480 481cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 482 mova m0, [cq+16*0] 483 mova m1, [cq+16*1] 484 call m(iadst_4x4_internal_8bpc).main 485 punpcklwd m2, m1, m0 486 punpckhwd m1, m0 487 punpcklwd m0, m1, m2 488 punpckhwd m1, m2 489 jmp tx2q 490.pass2: 491 call m(iadst_4x4_internal_8bpc).main 492.end: 493 pxor m2, m2 494 mova [cq+16*0], m2 495 mova [cq+16*1], m2 496.end2: 497 ITX4_END 3, 2, 1, 0 498 499INV_TXFM_4X4_FN identity, dct 500INV_TXFM_4X4_FN identity, adst 501INV_TXFM_4X4_FN identity, flipadst 502INV_TXFM_4X4_FN identity, identity 503 504cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 505 mova m0, [cq+16*0] 506 mova m1, [cq+16*1] 507 vpbroadcastd m3, [o(pw_1697x8)] 508 pmulhrsw m2, m3, m0 509 pmulhrsw m3, m1 510 paddsw m0, m2 511 paddsw m1, m3 512 punpckhwd m2, m0, m1 513 punpcklwd m0, m1 514 punpckhwd m1, m0, m2 515 punpcklwd m0, m2 516 jmp tx2q 517.pass2: 518 vpbroadcastd m3, [o(pw_1697x8)] 
519 pmulhrsw m2, m3, m0 520 pmulhrsw m3, m1 521 paddsw m0, m2 522 paddsw m1, m3 523 jmp m(iadst_4x4_internal_8bpc).end 524 525%macro WRITE_4X8 2 ; coefs[1-2] 526 movd xm4, [dstq+strideq*0] 527 pinsrd xm4, [dstq+strideq*1], 1 528 movd xm5, [dstq+strideq*2] 529 pinsrd xm5, [dstq+r3 ], 1 530 pinsrd xm4, [r2 +strideq*0], 2 531 pinsrd xm4, [r2 +strideq*1], 3 532 pinsrd xm5, [r2 +strideq*2], 2 533 pinsrd xm5, [r2 +r3 ], 3 534 pmovzxbw m4, xm4 535 pmovzxbw m5, xm5 536 paddw m4, m%1 537 paddw m5, m%2 538 packuswb m4, m5 539 vextracti128 xm5, m4, 1 540 movd [dstq+strideq*0], xm4 541 pextrd [dstq+strideq*1], xm4, 1 542 pextrd [dstq+strideq*2], xm4, 2 543 pextrd [dstq+r3 ], xm4, 3 544 movd [r2 +strideq*0], xm5 545 pextrd [r2 +strideq*1], xm5, 1 546 pextrd [r2 +strideq*2], xm5, 2 547 pextrd [r2 +r3 ], xm5, 3 548%endmacro 549 550%macro INV_TXFM_4X8_FN 2 ; type1, type2 551 INV_TXFM_FN %1, %2, 4x8 552%ifidn %1_%2, dct_dct 553 movd xm1, [o(pw_2896x8)] 554 pmulhrsw xm0, xm1, [cq] 555 movd xm2, [o(pw_2048)] 556 mov [cq], eobd 557 pmulhrsw xm0, xm1 558 pmulhrsw xm0, xm1 559 pmulhrsw xm0, xm2 560 vpbroadcastw m0, xm0 561 mova m1, m0 562 jmp m(iadst_4x8_internal_8bpc).end3 563%endif 564%endmacro 565 566%macro IDCT8_1D_PACKED 0 567 vpbroadcastd m6, [o(pd_2048)] 568 punpckhwd m5, m3, m0 ; in7 in1 569 punpckhwd m4, m1, m2 ; in3 in5 570 punpcklwd m3, m1 ; in6 in2 571 punpcklwd m2, m0 ; in4 in0 572 ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a 573 ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a 574 ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 575 psubsw m0, m5, m4 ; t5a t6a (interleaved) 576 paddsw m4, m5 ; t4 t7 (interleaved) 577 ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 578 vpbroadcastd m1, [o(pw_m2896_2896)] 579 ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 580%if mmsize > 16 581 vbroadcasti128 m1, [o(deint_shuf)] 582 pshufb m4, m1 583%else 584 pshufb m4, [o(deint_shuf)] 585%endif 586 psubsw m1, m2, m3 ; tmp3 tmp2 587 paddsw m3, m2 ; tmp0 tmp1 588 shufps m2, m4, m0, q1032 ; t7 
t6 589 vpblendd m4, m0, 0xcc ; t4 t5 590 paddsw m0, m3, m2 ; out0 out1 591 psubsw m3, m2 ; out7 out6 592 psubsw m2, m1, m4 ; out4 out5 593 paddsw m1, m4 ; out3 out2 594%endmacro 595 596%macro IADST8_1D_PACKED 1 ; pass 597 vpbroadcastd m6, [o(pd_2048)] 598 punpckhwd m0, m4, m3 ; 0 7 599 punpckhwd m1, m5, m2 ; 2 5 600 punpcklwd m2, m5 ; 4 3 601 punpcklwd m3, m4 ; 6 1 602%if %1 == 1 603 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a 604 ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a 605 ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a 606 ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a 607 psubsw m4, m0, m2 ; t5 t4 608 paddsw m0, m2 ; t1 t0 609 psubsw m5, m1, m3 ; t6 t7 610 paddsw m1, m3 ; t2 t3 611 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a 612 ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a 613%if mmsize > 16 614 vbroadcasti128 m2, [o(deint_shuf)] 615%else 616 mova m2, [o(deint_shuf)] 617%endif 618 pshuflw m1, m1, q2301 619 pshufhw m1, m1, q2301 620 psubsw m3, m0, m1 ; t3 t2 621 paddsw m0, m1 ; -out7 out0 622 psubsw m1, m4, m5 ; t7 t6 623 paddsw m4, m5 ; out6 -out1 624 pshufb m0, m2 625 pshufb m4, m2 626 vpbroadcastd m5, [o(pw_m2896_2896)] 627 pmaddwd m2, m5, m3 628 pmaddwd m5, m1 629 paddd m2, m6 630 paddd m5, m6 631 psrad m2, 12 632 psrad m5, 12 633 packssdw m2, m5 ; out4 -out5 634 vpbroadcastd m5, [o(pw_2896_2896)] 635 pmaddwd m3, m5 636 pmaddwd m1, m5 637 paddd m3, m6 638 paddd m1, m6 639 psrad m3, 12 640 psrad m1, 12 641 packssdw m1, m3 ; out2 -out3 642 punpcklqdq m3, m4, m0 ; out6 -out7 643 punpckhqdq m0, m4 ; out0 -out1 644%else 645 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a 646 ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a 647 ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a 648 ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a 649 psubsw m4, m0, m2 ; t4 t5 650 paddsw m0, m2 ; t0 t1 651 psubsw m5, m1, m3 ; t6 t7 652 paddsw m1, m3 ; t2 t3 653 shufps m2, m5, m4, q1032 654 punpckhwd m4, m2 655 punpcklwd m5, m2 656 ITX_MUL2X_PACK 4, 2, 
3, 6, 1567, 3784, 1 ; t5a t4a 657 ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a 658 psubsw m2, m0, m1 ; t2 t3 659 paddsw m0, m1 ; out0 -out7 660 psubsw m1, m4, m5 ; t7 t6 661 paddsw m4, m5 ; out6 -out1 662 vpbroadcastd m5, [o(pw_2896x8)] 663 vpblendd m3, m0, m4, 0x33 ; out6 -out7 664 vpblendd m0, m4, 0xcc ; out0 -out1 665 shufps m4, m2, m1, q1032 ; t3 t7 666 vpblendd m1, m2, 0x33 ; t2 t6 667 psubsw m2, m1, m4 ; t2-t3 t6-t7 668 paddsw m1, m4 ; t2+t3 t6+t7 669 pmulhrsw m2, m5 ; out4 -out5 670 pshufd m1, m1, q1032 671 pmulhrsw m1, m5 ; out2 -out3 672%endif 673%endmacro 674 675INIT_YMM avx2 676INV_TXFM_4X8_FN dct, dct 677INV_TXFM_4X8_FN dct, adst 678INV_TXFM_4X8_FN dct, flipadst 679INV_TXFM_4X8_FN dct, identity 680 681cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 682 vpermq m0, [cq+32*0], q3120 683 vpermq m1, [cq+32*1], q3120 684 vpbroadcastd m2, [o(pw_2896x8)] 685 pmulhrsw m0, m2 686 pmulhrsw m1, m2 687 IDCT4_1D_PACKED 688 vbroadcasti128 m2, [o(deint_shuf)] 689 shufps m3, m0, m1, q1331 690 shufps m0, m1, q0220 691 pshufb m0, m2 692 pshufb m1, m3, m2 693 jmp tx2q 694.pass2: 695 vextracti128 xm2, m0, 1 696 vextracti128 xm3, m1, 1 697 call .main 698 vpbroadcastd m4, [o(pw_2048)] 699 vinserti128 m0, xm2, 1 700 vinserti128 m1, xm3, 1 701 pshufd m1, m1, q1032 702 jmp m(iadst_4x8_internal_8bpc).end2 703ALIGN function_align 704cglobal_label .main 705 WRAP_XMM IDCT8_1D_PACKED 706 ret 707 708INV_TXFM_4X8_FN adst, dct 709INV_TXFM_4X8_FN adst, adst 710INV_TXFM_4X8_FN adst, flipadst 711INV_TXFM_4X8_FN adst, identity 712 713cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 714 vpermq m0, [cq+32*0], q3120 715 vpermq m1, [cq+32*1], q3120 716 vpbroadcastd m2, [o(pw_2896x8)] 717 pmulhrsw m0, m2 718 pmulhrsw m1, m2 719 call m(iadst_8x4_internal_8bpc).main 720 punpckhwd m3, m0, m1 721 punpcklwd m0, m1 722 punpckhwd m1, m0, m3 723 punpcklwd m0, m3 724 jmp tx2q 725.pass2: 726 vextracti128 xm2, m0, 1 727 vextracti128 xm3, m1, 1 728 pshufd xm4, xm0, 
q1032 729 pshufd xm5, xm1, q1032 730 call .main_pass2 731 vpbroadcastd m4, [o(pw_2048)] 732 vinserti128 m0, xm2, 1 733 vinserti128 m1, xm3, 1 734 pxor m5, m5 735 psubw m5, m4 736.end: 737 vpblendd m4, m5, 0xcc 738.end2: 739 pmulhrsw m0, m4 740 pmulhrsw m1, m4 741 WIN64_RESTORE_XMM 742 pxor m2, m2 743 mova [cq+32*0], m2 744 mova [cq+32*1], m2 745.end3: 746 lea r2, [dstq+strideq*4] 747 lea r3, [strideq*3] 748 WRITE_4X8 0, 1 749 RET 750ALIGN function_align 751.main_pass1: 752 WRAP_XMM IADST8_1D_PACKED 1 753 ret 754ALIGN function_align 755cglobal_label .main_pass2 756 WRAP_XMM IADST8_1D_PACKED 2 757 ret 758 759INV_TXFM_4X8_FN flipadst, dct 760INV_TXFM_4X8_FN flipadst, adst 761INV_TXFM_4X8_FN flipadst, flipadst 762INV_TXFM_4X8_FN flipadst, identity 763 764cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 765 vpermq m0, [cq+32*0], q3120 766 vpermq m1, [cq+32*1], q3120 767 vpbroadcastd m2, [o(pw_2896x8)] 768 pmulhrsw m0, m2 769 pmulhrsw m1, m2 770 call m(iadst_8x4_internal_8bpc).main 771 punpcklwd m3, m1, m0 772 punpckhwd m1, m0 773 punpcklwd m0, m1, m3 774 punpckhwd m1, m3 775 jmp tx2q 776.pass2: 777 vextracti128 xm2, m0, 1 778 vextracti128 xm3, m1, 1 779 pshufd xm4, xm0, q1032 780 pshufd xm5, xm1, q1032 781 call m(iadst_4x8_internal_8bpc).main_pass2 782 vpbroadcastd m5, [o(pw_2048)] 783 vinserti128 m3, xm1, 1 784 vinserti128 m2, xm0, 1 785 pxor m4, m4 786 psubw m4, m5 787 pshufd m0, m3, q1032 788 pshufd m1, m2, q1032 789 jmp m(iadst_4x8_internal_8bpc).end 790 791INV_TXFM_4X8_FN identity, dct 792INV_TXFM_4X8_FN identity, adst 793INV_TXFM_4X8_FN identity, flipadst 794INV_TXFM_4X8_FN identity, identity 795 796cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 797 vpermq m2, [cq+32*0], q3120 798 vpermq m0, [cq+32*1], q3120 799 vpbroadcastd m3, [o(pw_2896x8)] 800 vpbroadcastd m4, [o(pw_1697x8)] 801 punpcklwd m1, m2, m0 802 punpckhwd m2, m0 803 pmulhrsw m1, m3 804 pmulhrsw m2, m3 805 punpcklwd m0, m1, m2 806 punpckhwd m1, m2 807 
pmulhrsw m2, m4, m0 808 pmulhrsw m4, m1 809 paddsw m0, m2 810 paddsw m1, m4 811 jmp tx2q 812.pass2: 813 vpbroadcastd m4, [o(pw_4096)] 814 jmp m(iadst_4x8_internal_8bpc).end2 815 816%macro INV_TXFM_4X16_FN 2 ; type1, type2 817 INV_TXFM_FN %1, %2, 4x16 818%ifidn %1_%2, dct_dct 819 movd xm1, [o(pw_2896x8)] 820 pmulhrsw xm0, xm1, [cq] 821 movd xm2, [o(pw_16384)] 822 movd xm3, [o(pw_2048)] 823 mov [cq], eobd 824 pmulhrsw xm0, xm2 825 pmulhrsw xm0, xm1 826 pmulhrsw xm0, xm3 827 vpbroadcastw m0, xm0 828 mova m1, m0 829 mova m2, m0 830 mova m3, m0 831 jmp m(iadst_4x16_internal_8bpc).end3 832%endif 833%endmacro 834 835%macro IDCT16_1D_PACKED 0 836 vpbroadcastd m10, [o(pd_2048)] 837.main2: 838 punpckhwd m8, m7, m0 ; dct16 in15 in1 839 punpcklwd m9, m4, m0 ; dct4 in2 in0 840 punpckhwd m0, m3, m4 ; dct16 in7 in9 841 punpcklwd m7, m1 ; dct8 in7 in1 842 punpckhwd m1, m6 ; dct16 in3 in13 843 punpcklwd m3, m5 ; dct8 in3 in5 844 punpckhwd m5, m2 ; dct16 in11 in5 845 punpcklwd m6, m2 ; dct4 in3 in1 846 ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a 847 ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a 848 ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a 849 ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a 850 ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a 851 ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a 852 ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 853 psubsw m2, m8, m0 ; t9 t14 854 paddsw m8, m0 ; t8 t15 855 psubsw m0, m1, m5 ; t10 t13 856 paddsw m1, m5 ; t11 t12 857 vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 858 ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a 859 vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 860 ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a 861 psubsw m4, m8, m1 ; t11a t12a 862 paddsw m8, m1 ; t8a t15a 863 psubsw m1, m7, m3 ; t5a t6a 864 paddsw m7, m3 ; t4 t7 865 paddsw m3, m2, m0 ; t9 t14 866 psubsw m2, m0 ; t10 t13 867%if mmsize > 16 868 vbroadcasti128 m0, [o(deint_shuf)] 869%else 870 mova m0, 
[o(deint_shuf)] 871%endif 872 pshufb m8, m0 873 pshufb m7, m0 874 pshufb m3, m0 875 ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 876 vpbroadcastd m0, [o(pw_m2896_2896)] 877 ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 878 vpbroadcastd m5, [o(pw_2896_2896)] 879 ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 880 vpbroadcastd m0, [o(pw_m2896_2896)] 881 ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a 882 punpckhqdq m0, m8, m3 ; t15a t14 883 punpcklqdq m8, m3 ; t8a t9 884 shufps m5, m4, m2, q1032 ; t12 t13a 885 vpblendd m4, m2, 0xcc ; t11 t10a 886 shufps m2, m7, m1, q1032 ; t7 t6 887 vpblendd m7, m1, 0xcc ; t4 t5 888 psubsw m1, m9, m6 ; dct4 out3 out2 889 paddsw m9, m6 ; dct4 out0 out1 890 psubsw m3, m9, m2 ; dct8 out7 out6 891 paddsw m9, m2 ; dct8 out0 out1 892 psubsw m2, m1, m7 ; dct8 out4 out5 893 paddsw m1, m7 ; dct8 out3 out2 894 psubsw m7, m9, m0 ; out15 out14 895 paddsw m0, m9 ; out0 out1 896 psubsw m6, m1, m5 ; out12 out13 897 paddsw m1, m5 ; out3 out2 898 psubsw m5, m2, m4 ; out11 out10 899 paddsw m2, m4 ; out4 out5 900 psubsw m4, m3, m8 ; out8 out9 901 paddsw m3, m8 ; out7 out6 902%endmacro 903 904INV_TXFM_4X16_FN dct, dct 905INV_TXFM_4X16_FN dct, adst 906INV_TXFM_4X16_FN dct, flipadst 907INV_TXFM_4X16_FN dct, identity 908 909cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 910 mova m0, [cq+32*0] 911 mova m1, [cq+32*1] 912 mova m2, [cq+32*2] 913 mova m3, [cq+32*3] 914 call m(idct_16x4_internal_8bpc).main 915 vpbroadcastd m5, [o(pw_16384)] 916 punpckhwd m4, m2, m3 917 punpcklwd m2, m3 918 punpckhwd m3, m0, m1 919 punpcklwd m0, m1 920 REPX {pmulhrsw x, m5}, m0, m4, m2, m3 921 punpckhdq m1, m0, m2 922 punpckldq m0, m2 923 punpckldq m2, m3, m4 924 punpckhdq m3, m4 925 jmp tx2q 926.pass2: 927 vextracti128 xm4, m0, 1 928 vextracti128 xm5, m1, 1 929 vextracti128 xm6, m2, 1 930 vextracti128 xm7, m3, 1 931 call .main 932 vinserti128 m0, xm4, 1 933 vinserti128 m1, xm5, 1 934 vpbroadcastd m5, [o(pw_2048)] 935 vinserti128 m2, xm6, 1 936 vinserti128 m3, 
xm7, 1 937 pshufd m1, m1, q1032 938 pshufd m3, m3, q1032 939 jmp m(iadst_4x16_internal_8bpc).end2 940ALIGN function_align 941cglobal_label .main 942 WRAP_XMM IDCT16_1D_PACKED 943 ret 944 945INV_TXFM_4X16_FN adst, dct 946INV_TXFM_4X16_FN adst, adst 947INV_TXFM_4X16_FN adst, flipadst 948INV_TXFM_4X16_FN adst, identity 949 950cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 951 mova m0, [cq+32*0] 952 mova m1, [cq+32*1] 953 mova m2, [cq+32*2] 954 mova m3, [cq+32*3] 955 call m(iadst_16x4_internal_8bpc).main 956 vpbroadcastd m5, [o(pw_16384)] 957 punpckhwd m4, m2, m3 958 punpcklwd m2, m3 959 punpckhwd m3, m0, m1 960 punpcklwd m0, m1 961 REPX {pmulhrsw x, m5}, m4, m2, m3, m0 962 punpckhdq m1, m0, m2 963 punpckldq m0, m2 964 punpckldq m2, m3, m4 965 punpckhdq m3, m4 966 jmp tx2q 967.pass2: 968 call .main 969 vpbroadcastd m5, [o(pw_2896x8)] 970 paddsw m1, m2, m4 971 psubsw m2, m4 972 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 973 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 974 vpbroadcastd m5, [o(pw_2048)] 975 pshufd m1, m1, q1032 976 vpblendd m4, m1, m0, 0x33 977 vpblendd m0, m2, 0x33 978 vpblendd m2, m3, 0x33 979 vpblendd m3, m1, 0x33 980 vpermq m0, m0, q2031 981 vpermq m1, m2, q1302 982 vpermq m2, m3, q3120 983 vpermq m3, m4, q0213 984 psubw m6, m7, m5 985.end: 986 vpblendd m5, m6, 0xcc 987.end2: 988 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 989 WIN64_RESTORE_XMM 990 pxor m4, m4 991 mova [cq+32*0], m4 992 mova [cq+32*1], m4 993 mova [cq+32*2], m4 994 mova [cq+32*3], m4 995.end3: 996 lea r2, [dstq+strideq*8] 997 lea r3, [strideq*3] 998 WRITE_4X8 0, 1 999 lea dstq, [dstq+strideq*4] 1000 lea r2, [r2 +strideq*4] 1001 WRITE_4X8 2, 3 1002 RET 1003ALIGN function_align 1004.main: 1005 vpblendd m4, m1, m0, 0xcc 1006 vpblendd m1, m0, 0x33 1007 vpblendd m5, m2, m3, 0xcc 1008 vpblendd m2, m3, 0x33 1009 vperm2i128 m3, m5, m2, 0x31 1010 vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 1011 vperm2i128 m4, m1, m4, 0x31 1012 vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 1013 pshufd 
m3, m3, q1032 ; in15 in12 in13 in14 1014 pshufd m2, m4, q1032 ; in11 in8 in9 in10 1015cglobal_label .main2 1016 vpbroadcastd m8, [o(pd_2048)] 1017 pxor m7, m7 1018 punpckhwd m4, m3, m0 ; in12 in3 in14 in1 1019 punpcklwd m0, m3 ; in0 in15 in2 in13 1020 punpckhwd m3, m2, m1 ; in8 in7 in10 in5 1021 punpcklwd m1, m2 ; in4 in11 in6 in9 1022 ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 1023 ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 1024 ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 1025 ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 1026 psubsw m2, m0, m3 ; t9a t8a t11a t10a 1027 paddsw m0, m3 ; t1a t0a t3a t2a 1028 psubsw m3, m1, m4 ; t13a t12a t15a t14a 1029 paddsw m1, m4 ; t5a t4a t7a t6a 1030 ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 1031 psubw m6, m7, m5 1032 ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 1033 vpbroadcastd m6, [o(pw_m3784_1567)] 1034 vpbroadcastd m5, [o(pw_1567_3784)] 1035 psubsw m4, m0, m1 ; t5 t4 t7 t6 1036 paddsw m0, m1 ; t1 t0 t3 t2 1037 psubsw m1, m2, m3 ; t13a t12a t15a t14a 1038 paddsw m2, m3 ; t9a t8a t11a t10a 1039 psubw m3, m7, m6 ; pw_3784_m1567 1040 vpblendd m6, m3, 0xf0 1041 ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a 1042 ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 1043 vbroadcasti128 m5, [o(deint_shuf)] 1044 pshufb m0, m5 1045 pshufb m2, m5 1046 vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a 1047 vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a 1048 vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 1049 vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 1050 pshufd m2, m2, q1032 ; t6a t7a t14 t15 1051 psubsw m1, m0, m3 ; t3a t2a t11 t10 1052 paddsw m0, m3 ; -out15 out0 out14 -out1 1053 paddsw m3, m4, m2 ; -out3 out12 out2 -out13 1054 psubsw m4, m2 ; t6 t7 t14a t15a 1055 shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a 1056 vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a 1057 ret 1058ALIGN function_align 1059.main_pass1_end: 1060 vpbroadcastd m5, [o(pw_m2896_2896)] 1061 vpbroadcastd m6, [o(pw_2896_2896)] 1062 
punpcklwd m1, m4, m2 1063 punpckhwd m4, m2 1064 pmaddwd m2, m5, m4 1065 pmaddwd m4, m6 1066 pmaddwd m5, m1 1067 pmaddwd m1, m6 1068 REPX {paddd x, m8}, m5, m1, m2, m4 1069 REPX {psrad x, 12}, m5, m2, m1, m4 1070 packssdw m2, m5 ; -out11 out8 out10 -out9 1071 packssdw m1, m4 ; -out7 out4 out6 -out5 1072 ret 1073 1074INV_TXFM_4X16_FN flipadst, dct 1075INV_TXFM_4X16_FN flipadst, adst 1076INV_TXFM_4X16_FN flipadst, flipadst 1077INV_TXFM_4X16_FN flipadst, identity 1078 1079cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 1080 mova m0, [cq+32*0] 1081 mova m1, [cq+32*1] 1082 mova m2, [cq+32*2] 1083 mova m3, [cq+32*3] 1084 call m(iadst_16x4_internal_8bpc).main 1085 vpbroadcastd m5, [o(pw_16384)] 1086 punpcklwd m4, m1, m0 1087 punpckhwd m1, m0 1088 punpcklwd m0, m3, m2 1089 punpckhwd m3, m2 1090 REPX {pmulhrsw x, m5}, m4, m1, m0, m3 1091 punpckldq m2, m3, m1 1092 punpckhdq m3, m1 1093 punpckhdq m1, m0, m4 1094 punpckldq m0, m4 1095 jmp tx2q 1096.pass2: 1097 call m(iadst_4x16_internal_8bpc).main 1098 vpbroadcastd m5, [o(pw_2896x8)] 1099 paddsw m1, m2, m4 1100 psubsw m2, m4 1101 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 1102 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 1103 vpbroadcastd m6, [o(pw_2048)] 1104 pshufd m1, m1, q1032 1105 vpblendd m4, m0, m2, 0x33 1106 vpblendd m0, m1, 0xcc 1107 vpblendd m1, m3, 0xcc 1108 vpblendd m2, m3, 0x33 1109 vpermq m0, m0, q3120 1110 vpermq m1, m1, q0213 1111 vpermq m2, m2, q2031 1112 vpermq m3, m4, q1302 1113 psubw m5, m7, m6 1114 jmp m(iadst_4x16_internal_8bpc).end 1115 1116INV_TXFM_4X16_FN identity, dct 1117INV_TXFM_4X16_FN identity, adst 1118INV_TXFM_4X16_FN identity, flipadst 1119INV_TXFM_4X16_FN identity, identity 1120 1121cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 1122 mova m3, [cq+32*0] 1123 mova m2, [cq+32*1] 1124 mova m4, [cq+32*2] 1125 mova m5, [cq+32*3] 1126 vpbroadcastd m8, [o(pw_1697x8)] 1127 pcmpeqw m0, m0 ; -1 1128 punpcklwd m1, m3, m2 1129 punpckhwd m3, m2 1130 punpcklwd m2, 
; WRITE_8X4 coefs[1-2], tmp[1-2], [off[1-3]]
; Add two ymm registers of 8x2 word residuals to four 8-pixel destination
; rows and store the result. Loads 4 rows of 8 bytes, zero-extends to
; words, adds the residuals, then packs back with unsigned saturation
; (which also performs the final pixel clamp to [0, 255]).
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
    movq               xm%3, [dstq   ]
    movhps             xm%3, [dstq+%5]
    movq               xm%4, [dstq+%6]
    movhps             xm%4, [dstq+%7]
    pmovzxbw            m%3, xm%3
    pmovzxbw            m%4, xm%4
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1            ; allow a memory operand
%endif
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vextracti128       xm%4, m%3, 1
    movq      [dstq   ], xm%3
    movhps    [dstq+%6], xm%3
    movq      [dstq+%5], xm%4
    movhps    [dstq+%7], xm%4
%endmacro

; Generate the 8x4 inverse-transform entry point for a type1/type2 pair.
; The dct_dct combination gets a DC-only fast path that scales the single
; DC coefficient and tail-calls the shared 8x8 dconly2 store loop.
%macro INV_TXFM_8X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x4
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    mov                [cq], eobd          ; NOTE(review): presumably eobd == 0 here,
                                           ; zeroing the DC coefficient — confirm
    pmulhrsw            xm0, xm1
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
%endif
%endmacro
; 8x4 inverse DCT, 8 bpc. Pass 1 pre-scales by 2896*8 (pmulhrsw by
; 2896*8/32768 = 2896/4096, i.e. 1/sqrt(2) in .12 fixed point), runs the
; shared 4x8 DCT kernel on the rows, then transposes. .pass2 runs the
; packed 4-point DCT on the columns.
cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    vpbroadcastd        xm3, [o(pw_2896x8)]
    pmulhrsw            xm0, xm3, [cq+16*0]
    pmulhrsw            xm1, xm3, [cq+16*1]
    pmulhrsw            xm2, xm3, [cq+16*2]
    pmulhrsw            xm3, [cq+16*3]
    call m(idct_4x8_internal_8bpc).main
    ; transpose: widen to ymm, shuffle 32-bit groups, then fix word order
    vbroadcasti128       m4, [o(deint_shuf)]
    vinserti128          m3, m1, xm3, 1
    vinserti128          m1, m0, xm2, 1
    shufps               m0, m1, m3, q0220
    shufps               m1, m3, q1331
    pshufb               m0, m4
    pshufb               m1, m4
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    jmp m(iadst_8x4_internal_8bpc).end2

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

; 8x4 inverse ADST, 8 bpc. Provides the shared .end/.end2/.end3 store
; path and the packed 4-point ADST .main used by the other 8x4 variants.
cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pshufd              xm4, [cq+16*0], q1032
    pmulhrsw            xm3, xm0, [cq+16*3]
    pshufd              xm5, [cq+16*1], q1032
    pmulhrsw            xm2, xm0, [cq+16*2]
    pmulhrsw            xm4, xm0
    pmulhrsw            xm5, xm0
    call m(iadst_4x8_internal_8bpc).main_pass1
    vinserti128          m0, xm2, 1
    vinserti128          m1, xm3, 1
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    pxor                 m3, m3
    psubsw               m3, m2              ; negate the odd outputs
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call .main
.end:
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q3120
.end2:
    vpbroadcastd         m2, [o(pw_2048)]    ; final-pass rounding (>> 4)
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    WIN64_RESTORE_XMM
.end3:
    ; clear the coefficient buffer and add the residuals to dst
    pxor                 m2, m2
    mova          [cq+32*0], m2
    mova          [cq+32*1], m2
    lea                  r3, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    RET
ALIGN function_align
cglobal_label .main
    IADST4_1D_PACKED
    ret
INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; 8x4 inverse identity transform, 8 bpc. Pass 1: transpose + scale by
; 1/sqrt(2) and double (identity4 row scale is 2x); .pass2 applies the
; identity8 column scale via pw_1697x8, sharing the 8x4 ADST store path.
cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    mova                xm2, [cq+16*0]
    mova                xm0, [cq+16*1]
    vinserti128          m2, [cq+16*2], 1
    vinserti128          m0, [cq+16*3], 1
    vpbroadcastd         m3, [o(pw_2896x8)]
    punpcklwd            m1, m2, m0
    punpckhwd            m2, m0
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    paddsw               m0, m0              ; identity4: out = 2*in
    paddsw               m1, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2              ; identity8: out = in + in*1697/4096
    paddsw               m1, m3
    jmp m(iadst_8x4_internal_8bpc).end

; Generate the 8x8 inverse-transform entry point for a type1/type2 pair.
; The dct_dct case handles DC-only blocks; the .dconly/.dconly2 labels are
; also tail-called from other block sizes (r3d = number of rows to write).
%macro INV_TXFM_8X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x8
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd           ; NOTE(review): presumably eobd == 0,
                                            ; clearing the DC coefficient — confirm
    or                  r3d, 8              ; row count for the store loop
.dconly:
    pmulhrsw            xm0, xm2
.dconly2:
    movd                xm2, [pw_2048]
    pmulhrsw            xm0, xm1
    lea                  r2, [strideq*3]
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0             ; broadcast the DC residual
.dconly_loop:
    WRITE_8X4             0, 0, 1, 2, strideq*1, strideq*2, r2
    lea                dstq, [dstq+strideq*4]
    sub                 r3d, 4
    jg .dconly_loop
    RET
%endif
%endmacro
INV_TXFM_8X8_FN dct, flipadst
INV_TXFM_8X8_FN dct, identity

; 8x8 inverse DCT, 8 bpc. Both passes run the packed 8-point DCT kernel
; (.main, two rows per ymm register); pass 1 transposes in between and
; rounds intermediates with pw_16384 (>> 1).
cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120 ; 0 1
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    vpermq               m1, [cq+32*1], q3120 ; 2 3
    call .main
    ; 8x8 word transpose across register pairs
    shufps               m4, m0, m1, q0220
    shufps               m5, m0, m1, q1331
    shufps               m1, m2, m3, q0220
    shufps               m3, m2, m3, q1331
    vbroadcasti128       m0, [o(deint_shuf)]
    vpbroadcastd         m2, [o(pw_16384)]
    REPX    {pshufb   x, m0}, m4, m5, m1, m3
    REPX    {pmulhrsw x, m2}, m4, m5, m1, m3
    vinserti128          m0, m4, xm1, 1
    vperm2i128           m2, m4, m1, 0x31
    vinserti128          m1, m5, xm3, 1
    vperm2i128           m3, m5, m3, 0x31
    jmp                tx2q
.pass2:
    call .main
    vpbroadcastd         m4, [o(pw_2048)]
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    vpermq               m2, m2, q3120
    vpermq               m3, m3, q2031
    jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret
m4, m0, q1032 1427 pshufd m5, m1, q1032 1428 call .main_pass2 1429 vpbroadcastd m5, [o(pw_2048)] 1430 vpbroadcastd xm4, [o(pw_4096)] 1431 psubw m4, m5 ; lower half = 2048, upper half = -2048 1432.end: 1433 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 1434.end2: 1435 pmulhrsw m0, m4 1436 pmulhrsw m1, m4 1437.end3: 1438 pmulhrsw m2, m4 1439 pmulhrsw m3, m4 1440 WIN64_RESTORE_XMM 1441.end4: 1442 pxor m4, m4 1443 mova [cq+32*0], m4 1444 mova [cq+32*1], m4 1445 mova [cq+32*2], m4 1446 mova [cq+32*3], m4 1447 lea r3, [strideq*3] 1448 WRITE_8X4 0, 1, 4, 5 1449 lea dstq, [dstq+strideq*4] 1450 WRITE_8X4 2, 3, 4, 5 1451 RET 1452ALIGN function_align 1453.main_pass1: 1454 IADST8_1D_PACKED 1 1455 ret 1456ALIGN function_align 1457cglobal_label .main_pass2 1458 IADST8_1D_PACKED 2 1459 ret 1460 1461INV_TXFM_8X8_FN flipadst, dct 1462INV_TXFM_8X8_FN flipadst, adst 1463INV_TXFM_8X8_FN flipadst, flipadst 1464INV_TXFM_8X8_FN flipadst, identity 1465 1466cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 1467 vpermq m4, [cq+32*0], q1302 ; 1 0 1468 vpermq m3, [cq+32*3], q3120 ; 6 7 1469 vpermq m5, [cq+32*1], q1302 ; 3 2 1470 vpermq m2, [cq+32*2], q3120 ; 4 5 1471 call m(iadst_8x8_internal_8bpc).main_pass1 1472 vpbroadcastd m5, [o(pw_16384)] 1473 punpckhwd m4, m3, m2 1474 punpcklwd m3, m2 1475 punpckhwd m2, m1, m0 1476 punpcklwd m1, m0 1477 pxor m0, m0 1478 psubw m0, m5 1479 pmulhrsw m4, m0 1480 pmulhrsw m3, m5 1481 pmulhrsw m2, m0 1482 pmulhrsw m1, m5 1483 punpckhwd m0, m4, m3 1484 punpcklwd m4, m3 1485 punpckhwd m3, m2, m1 1486 punpcklwd m2, m1 1487 vinserti128 m1, m0, xm3, 1 1488 vperm2i128 m3, m0, m3, 0x31 1489 vinserti128 m0, m4, xm2, 1 1490 vperm2i128 m2, m4, m2, 0x31 1491 jmp tx2q 1492.pass2: 1493 pshufd m4, m0, q1032 1494 pshufd m5, m1, q1032 1495 call m(iadst_8x8_internal_8bpc).main_pass2 1496 vpbroadcastd m4, [o(pw_2048)] 1497 vpbroadcastd xm5, [o(pw_4096)] 1498 psubw m4, m5 ; lower half = -2048, upper half = 2048 1499 vpermq m5, m3, q2031 1500 vpermq m3, m0, 
INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; 8x8 inverse identity transform, 8 bpc. Pass 1 is a pure 8x8 word
; transpose (identity8 scaling is folded into the pw_4096 rounding of
; pass 2, which reuses the ADST .end store path).
cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    mova                xm3, [cq+16*0]
    mova                xm2, [cq+16*1]
    vinserti128          m3, [cq+16*4], 1
    vinserti128          m2, [cq+16*5], 1
    mova                xm4, [cq+16*2]
    mova                xm0, [cq+16*3]
    vinserti128          m4, [cq+16*6], 1
    vinserti128          m0, [cq+16*7], 1
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_4096)]    ; 2x scale for both passes + >> 4
    jmp m(iadst_8x8_internal_8bpc).end

; Generate the 8x16 inverse-transform entry point for a type1/type2 pair;
; the dct_dct case scales DC twice (once per pass) and reuses the 8x8
; dconly loop with r3d = 16 rows.
%macro INV_TXFM_8X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x16
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    or                  r3d, 16
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro

; Load all 16 rows of an 8x16 coefficient block into m0-m7 (two rows per
; register), pre-scaled by 1/sqrt(2). Advances cq by half the buffer so
; negative offsets reach the first half.
%macro ITX_8X16_LOAD_COEFS 0
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m0, m4, [cq+32*0]
    add                  cq, 32*4
    pmulhrsw             m7, m4, [cq+32*3]
    pmulhrsw             m1, m4, [cq-32*3]
    pmulhrsw             m6, m4, [cq+32*2]
    pmulhrsw             m2, m4, [cq-32*2]
    pmulhrsw             m5, m4, [cq+32*1]
    pmulhrsw             m3, m4, [cq-32*1]
    pmulhrsw             m4, [cq+32*0]
%endmacro
vperm2i128 m8, m2, m6, 0x31 1573 vinserti128 m2, xm6, 1 1574 vperm2i128 m6, m1, m5, 0x31 1575 vinserti128 m1, xm5, 1 1576 vperm2i128 m5, m0, m4, 0x31 1577 vinserti128 m0, xm4, 1 1578 punpckhwd m4, m2, m3 1579 punpcklwd m2, m3 1580 punpckhwd m3, m0, m1 1581 punpcklwd m0, m1 1582.pass1_end2: 1583 punpckhwd m7, m5, m6 1584 punpcklwd m5, m6 1585 punpcklwd m6, m8, m9 1586 punpckhwd m8, m9 1587 REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 1588 punpckhdq m1, m0, m2 1589 punpckldq m0, m2 1590 punpckldq m2, m3, m4 1591 punpckhdq m3, m4 1592 punpckldq m4, m5, m6 1593 punpckhdq m5, m6 1594 punpckldq m6, m7, m8 1595 punpckhdq m7, m8 1596 jmp tx2q 1597.pass2: 1598 call .main 1599 REPX {vpermq x, x, q3120}, m0, m2, m4, m6 1600 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 1601.end: 1602 vpbroadcastd m8, [o(pw_2048)] 1603.end2: 1604 REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 1605.end3: 1606 pxor m8, m8 1607 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 1608 lea r3, [strideq*3] 1609 WRITE_8X4 0, 1, 8, 9 1610 lea dstq, [dstq+strideq*4] 1611 WRITE_8X4 2, 3, 0, 1 1612 lea dstq, [dstq+strideq*4] 1613 WRITE_8X4 4, 5, 0, 1 1614 lea dstq, [dstq+strideq*4] 1615 WRITE_8X4 6, 7, 0, 1 1616 RET 1617ALIGN function_align 1618cglobal_label .main 1619 IDCT16_1D_PACKED 1620 ret 1621 1622INV_TXFM_8X16_FN adst, dct 1623INV_TXFM_8X16_FN adst, adst 1624INV_TXFM_8X16_FN adst, flipadst 1625INV_TXFM_8X16_FN adst, identity 1626 1627cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 1628 ITX_8X16_LOAD_COEFS 1629 call m(iadst_16x8_internal_8bpc).main 1630 call m(iadst_16x8_internal_8bpc).main_pass1_end 1631 vpbroadcastd m10, [o(pw_16384)] 1632 pslld m9, m10, 17 1633 psubw m10, m9 ; 16384, -16384 1634 jmp m(idct_8x16_internal_8bpc).pass1_end 1635ALIGN function_align 1636.pass2: 1637 call .main 1638 call .main_pass2_end 1639 vpbroadcastd m9, [o(pw_2048)] 1640 vpbroadcastd xm8, [o(pw_4096)] 1641 psubw m8, m9 1642 REPX {vpermq x, x, q2031}, m0, m1, m2, m3 1643 REPX 
{vpermq x, x, q3120}, m4, m5, m6, m7 1644 jmp m(idct_8x16_internal_8bpc).end2 1645ALIGN function_align 1646cglobal_label .main 1647 REPX {pshufd x, x, q1032}, m7, m1, m5, m3 1648.main2: 1649 vpbroadcastd m10, [o(pd_2048)] 1650 punpckhwd m8, m7, m0 ; in14 in1 1651 punpcklwd m0, m7 ; in0 in15 1652 punpcklwd m7, m6, m1 ; in12 in3 1653 punpckhwd m1, m6 ; in2 in13 1654 punpckhwd m6, m5, m2 ; in10 in5 1655 punpcklwd m2, m5 ; in4 in11 1656 punpcklwd m5, m4, m3 ; in8 in7 1657 punpckhwd m3, m4 ; in6 in9 1658 ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 1659 ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 1660 ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 1661 ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 1662 ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 1663 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 1664 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 1665 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 1666 psubsw m4, m0, m5 ; t9a t8a 1667 paddsw m0, m5 ; t1a t0a 1668 psubsw m5, m1, m6 ; t11a t10a 1669 paddsw m1, m6 ; t3a t2a 1670 psubsw m6, m2, m7 ; t13a t12a 1671 paddsw m2, m7 ; t5a t4a 1672 psubsw m7, m3, m8 ; t15a t14a 1673 paddsw m3, m8 ; t7a t6a 1674 vpbroadcastd m11, [o(pw_m4017_799)] 1675 vpbroadcastd m12, [o(pw_799_4017)] 1676 pxor m9, m9 1677 ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 1678 psubw m8, m9, m11 ; pw_4017_m799 1679 ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 1680 vpbroadcastd m11, [o(pw_m2276_3406)] 1681 vpbroadcastd m12, [o(pw_3406_2276)] 1682 ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 1683 psubw m8, m9, m11 ; pw_2276_m3406 1684 ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 1685 psubsw m8, m1, m3 ; t7 t6 1686 paddsw m1, m3 ; t3 t2 1687 psubsw m3, m0, m2 ; t5 t4 1688 paddsw m0, m2 ; t1 t0 1689 psubsw m2, m5, m7 ; t14a t15a 1690 paddsw m7, m5 ; t10a t11a 1691 psubsw m5, m4, m6 ; t12a t13a 1692 paddsw m4, m6 ; t8a t9a 1693 vpbroadcastd m11, [o(pw_m3784_1567)] 1694 vpbroadcastd m12, [o(pw_1567_3784)] 1695 
ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a 1696 psubw m6, m9, m11 ; pw_3784_m1567 1697 ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a 1698 vpbroadcastd m11, [o(pw_m1567_3784)] 1699 vpbroadcastd m12, [o(pw_3784_1567)] 1700 ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 1701 psubw m6, m9, m11 ; pw_1567_m3784 1702 ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 1703 vbroadcasti128 m12, [o(deint_shuf)] 1704 paddsw m6, m4, m7 ; -out1 out14 1705 psubsw m4, m7 ; t10 t11 1706 psubsw m11, m3, m8 ; t7 t6 1707 paddsw m8, m3 ; out12 -out3 1708 psubsw m3, m0, m1 ; t3a t2a 1709 paddsw m0, m1 ; -out15 out0 1710 paddsw m1, m2, m5 ; -out13 out2 1711 psubsw m5, m2 ; t15a t14a 1712 pshufb m0, m12 1713 pshufb m6, m12 1714 pshufb m8, m12 1715 pshufb m1, m12 1716 shufps m7, m6, m0, q1032 ; out14 -out15 1717 vpblendd m0, m6, 0x33 ; -out1 out0 1718 punpcklqdq m6, m8, m1 ; out12 -out13 1719 punpckhqdq m1, m8, m1 ; -out3 out2 1720 ret 1721ALIGN function_align 1722.main_pass1_end: 1723 vpbroadcastd m8, [o(pw_m2896_2896)] 1724 vpbroadcastd m12, [o(pw_2896_2896)] 1725 pmaddwd m9, m8, m11 ; -out11 1726 pmaddwd m2, m12, m5 ; -out5 1727 pmaddwd m5, m8 ; out10 1728 pmaddwd m11, m12 ; out4 1729 REPX {paddd x, m10}, m9, m5, m2, m11 1730 REPX {psrad x, 12 }, m9, m5, m2, m11 1731 packssdw m5, m9 ; out10 -out11 1732 packssdw m2, m11 ; -out5 out4 1733 pmaddwd m11, m8, m3 ; out8 1734 vpbroadcastd m8, [o(pw_2896_m2896)] 1735 pmaddwd m3, m12 ; -out7 1736 pmaddwd m8, m4 ; -out9 1737 pmaddwd m4, m12 ; out6 1738 REPX {paddd x, m10}, m11, m3, m8, m4 1739 REPX {psrad x, 12 }, m11, m3, m8, m4 1740 packssdw m3, m4 ; -out7 out6 1741 packssdw m4, m11, m8 ; out8 -out9 1742 vpbroadcastd m10, [o(pw_16384)] 1743 pxor m9, m9 1744 ret 1745ALIGN function_align 1746cglobal_label .main_pass2_end 1747 vpbroadcastd m8, [o(pw_2896x8)] 1748 pshufb m2, m11, m12 1749 pshufb m5, m12 1750 pshufb m3, m12 1751 pshufb m4, m12 1752 punpcklqdq m11, m5, m2 ; t15a t7 1753 punpckhqdq m5, m2 ; t14a t6 1754 shufps m2, m3, m4, q1032 ; 
INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

; 8x16 inverse flipped-ADST, 8 bpc. Pass 1 runs the shared 16x8 ADST
; kernel, then transposes with the register order reversed to realize
; the flip; .pass2 runs the 8x16 ADST columns and permutes the outputs
; into reversed row order before the shared store path.
cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_8X16_LOAD_COEFS
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass1_end
    vpbroadcastd         m9, [o(pw_16384)]
    pslld               m10, m9, 17          ; build interleaved sign pattern
    psubw               m10, m9              ; -16384, 16384
    ; lane shuffles with source order reversed vs. the adst variant
    vperm2i128           m9, m4, m0, 0x31
    vinserti128          m0, m4, xm0, 1
    vperm2i128           m8, m5, m1, 0x31
    vinserti128          m4, m5, xm1, 1
    vperm2i128           m5, m7, m3, 0x31
    vinserti128          m3, m7, xm3, 1
    vinserti128          m1, m6, xm2, 1
    vperm2i128           m6, m6, m2, 0x31
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpcklwd            m0, m3, m1
    punpckhwd            m3, m1
    jmp m(idct_8x16_internal_8bpc).pass1_end2
.pass2:
    call m(iadst_8x16_internal_8bpc).main
    call m(iadst_8x16_internal_8bpc).main_pass2_end
    vpbroadcastd         m8, [o(pw_2048)]
    vpbroadcastd        xm9, [o(pw_4096)]
    psubw                m8, m9              ; lower lane -2048, upper lane 2048
    ; reverse the output register order (flip) while rounding
    vpermq               m9, m0, q3120
    vpermq               m0, m7, q2031
    vpermq               m7, m1, q3120
    vpermq               m1, m6, q2031
    vpermq               m6, m2, q3120
    vpermq               m2, m5, q2031
    vpermq               m5, m3, q3120
    vpermq               m3, m4, q2031
    pmulhrsw             m0, m8
    pmulhrsw             m1, m8
    pmulhrsw             m2, m8
    pmulhrsw             m3, m8
    pmulhrsw             m4, m5, m8
    pmulhrsw             m5, m6, m8
    pmulhrsw             m6, m7, m8
    pmulhrsw             m7, m9, m8
    jmp m(idct_8x16_internal_8bpc).end3
; 8x16 inverse identity transform, 8 bpc. Pass 1: load in interleaved
; order, pre-scale by 1/sqrt(2) and transpose; .pass2 applies the
; identity16 column scale (2*in + in*1697/2048 via IDTX16) after
; restoring row order.
cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
    mova                xm3, [cq+16*0]
    mova                xm2, [cq+16*2]
    add                  cq, 16*8           ; point at second half of coefs
    vinserti128          m3, [cq+16*0], 1
    vinserti128          m2, [cq+16*2], 1
    vpbroadcastd         m9, [o(pw_2896x8)]
    mova                xm4, [cq-16*4]
    mova                xm5, [cq-16*2]
    vinserti128          m4, [cq+16*4], 1
    vinserti128          m5, [cq+16*6], 1
    mova                xm7, [cq-16*7]
    mova                xm6, [cq-16*5]
    vinserti128          m7, [cq+16*1], 1
    vinserti128          m6, [cq+16*3], 1
    mova                xm8, [cq-16*3]
    mova                xm0, [cq-16*1]
    vinserti128          m8, [cq+16*5], 1
    vinserti128          m0, [cq+16*7], 1
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m5
    punpckhwd            m4, m5
    punpcklwd            m5, m7, m6
    punpckhwd            m7, m6
    punpcklwd            m6, m8, m0
    punpckhwd            m8, m0
    REPX   {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    punpckldq            m4, m5, m6
    punpckhdq            m5, m6
    punpckldq            m6, m7, m8
    punpckhdq            m7, m8
    jmp                tx2q
.pass2:
    vpbroadcastd         m8, [o(pw_1697x16)]
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX  {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
    jmp m(idct_8x16_internal_8bpc).end

; WRITE_16X2 coefs[1-2], tmp[1-2], offset[1-2]
; Add two ymm registers of 16-wide word residuals to two destination rows
; and store, clamping via unsigned-saturating pack.
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    pmovzxbw            m%3, [dstq+%5]
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1            ; allow a memory operand
%endif
    pmovzxbw            m%4, [dstq+%6]
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vpermq              m%3, m%3, q3120     ; undo the pack's lane interleave
    mova          [dstq+%5], xm%3
    vextracti128  [dstq+%6], m%3, 1
%endmacro
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

; 16x4 inverse DCT, 8 bpc. Pass 1 reuses the 4x16 DCT kernel on xmm
; halves, then widens and transposes via the shared 16x4 ADST pass1_end;
; .pass2 runs the full-width 4-point DCT on the columns.
cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
    mova                xm0, [cq+16*0]
    mova                xm1, [cq+16*1]
    mova                xm2, [cq+16*2]
    mova                xm3, [cq+16*3]
    mova                xm4, [cq+16*4]
    mova                xm5, [cq+16*5]
    mova                xm6, [cq+16*6]
    mova                xm7, [cq+16*7]
    call m(idct_4x16_internal_8bpc).main
    ; combine the two 8-row halves into ymm registers and interleave
    vinserti128          m6, m2, xm6, 1
    vinserti128          m2, m0, xm4, 1
    vinserti128          m0, m1, xm5, 1
    vinserti128          m1, m3, xm7, 1
    punpcklwd            m3, m2, m6
    punpckhwd            m2, m6
    vpbroadcastd         m6, [o(pw_16384)]
    punpckhwd            m4, m0, m1
    punpcklwd            m0, m1
    mova                 m1, m6              ; same rounding for both halves
    jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
    call .main
    jmp m(iadst_16x4_internal_8bpc).end
ALIGN function_align
cglobal_label .main
    vpbroadcastd         m6, [o(pd_2048)]
    IDCT4_1D              0, 1, 2, 3, 4, 5, 6
    ret
punpcklwd m4, m3, m1 1969 punpcklwd m5, m2, m0 1970 punpckhwd m0, m1 1971 punpckhwd m2, m3 1972 vpbroadcastd m1, [o(pw_16384)] 1973 vinserti128 m3, m0, xm2, 1 1974 vperm2i128 m2, m0, m2, 0x31 1975 vinserti128 m0, m4, xm5, 1 1976 vperm2i128 m4, m4, m5, 0x31 1977 psubw m6, m7, m1 1978.pass1_end: 1979 pmulhrsw m3, m1 1980 pmulhrsw m2, m6 1981 pmulhrsw m4, m1 1982 pmulhrsw m0, m6 1983 punpcklwd m1, m3, m2 1984 punpckhwd m3, m2 1985 punpcklwd m2, m4, m0 1986 punpckhwd m4, m0 1987 punpckldq m0, m1, m2 1988 punpckhdq m1, m2 1989 punpckldq m2, m3, m4 1990 punpckhdq m3, m4 1991 jmp tx2q 1992.pass2: 1993 call .main 1994.end: 1995 vpbroadcastd m4, [o(pw_2048)] 1996 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 1997 WIN64_RESTORE_XMM 1998.end2: 1999 pxor m4, m4 2000 mova [cq+32*0], m4 2001 mova [cq+32*1], m4 2002 mova [cq+32*2], m4 2003 mova [cq+32*3], m4 2004.end3: 2005 WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 2006 lea dstq, [dstq+strideq*2] 2007 WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 2008 RET 2009ALIGN function_align 2010cglobal_label .main 2011 vpbroadcastd m6, [o(pw_m3344_3344)] 2012 vpbroadcastd m7, [o(pw_3803_1321)] 2013 vpbroadcastd m8, [o(pw_m1321_2482)] 2014 vpbroadcastd m9, [o(pw_2482_3344)] 2015 punpcklwd m4, m2, m0 ; in2 in0 l 2016 punpckhwd m2, m0 ; in2 in0 h 2017 psrld m5, m6, 16 2018 pmaddwd m10, m6, m4 ; t2:02 l 2019 pmaddwd m6, m2 ; t2:02 h 2020 pmaddwd m0, m7, m4 ; t0:02 l 2021 pmaddwd m7, m2 ; t0:02 h 2022 pmaddwd m4, m8 ; t1:02 l 2023 pmaddwd m8, m2 ; t1:02 h 2024 punpckhwd m2, m3, m1 ; in3 in1 h 2025 punpcklwd m3, m1 ; in3 in1 l 2026 pmaddwd m1, m5, m2 ; t2:3 h 2027 pmaddwd m5, m3 ; t2:3 l 2028 paddd m6, m1 2029 vpbroadcastd m1, [o(pd_2048)] 2030 paddd m10, m5 2031 pmaddwd m5, m9, m3 2032 pmaddwd m9, m2 2033 paddd m0, m1 2034 paddd m7, m1 2035 paddd m0, m5 ; t0 + t3 + 2048 l 2036 paddd m7, m9 ; t0 + t3 + 2048 h 2037 vpbroadcastd m9, [o(pw_m3803_3344)] 2038 pmaddwd m5, m9, m2 2039 pmaddwd m9, m3 2040 paddd m10, m1 ; t2 + 2048 l 2041 paddd m6, m1 ; t2 + 2048 
INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; 16x4 inverse flipped-ADST, 8 bpc. Pass 1 runs the shared 4x16 ADST
; kernel, interleaving in reversed order for the flip and swapping the
; rounding signs vs. the non-flipped variant; .pass2 runs the 4-point
; ADST and writes the rows out in reversed order.
cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q1230
    vpermq               m3, [cq+32*3], q2103
    vpermq               m1, [cq+32*1], q1230
    vpermq               m2, [cq+32*2], q2103
    call m(iadst_4x16_internal_8bpc).main2
    call m(iadst_4x16_internal_8bpc).main_pass1_end
    punpckhwd            m4, m3, m2
    punpckhwd            m5, m1, m0
    punpcklwd            m0, m2
    punpcklwd            m1, m3
    vpbroadcastd         m6, [o(pw_16384)]
    vinserti128          m3, m0, xm1, 1
    vperm2i128           m2, m0, m1, 0x31
    vinserti128          m0, m4, xm5, 1
    vperm2i128           m4, m4, m5, 0x31
    psubw                m1, m7, m6          ; m1 = m7 - 16384; NOTE(review): assumes
                                             ; m7 was set by main_pass1_end — confirm
    jmp m(iadst_16x4_internal_8bpc).pass1_end
ALIGN function_align
.pass2:
    call m(iadst_16x4_internal_8bpc).main
    vpbroadcastd         m4, [o(pw_2048)]
    REPX   {pmulhrsw x, m4}, m3, m2, m1, m0
    ; clear coefficients, then store rows bottom-to-top (flip)
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
    WRITE_16X2            3, 2, 4, 5, strideq*0, strideq*1
    lea                dstq, [dstq+strideq*2]
    WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
    RET
; Generate the 16x8 inverse-transform entry point for a type1/type2 pair;
; the dct_dct case scales DC and reuses the 16x4 dconly loop with
; r3d = 8 rows.
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x8
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    or                  r3d, 8
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
%endif
%endmacro

; Load all 8 rows of a 16x8 coefficient block into m0-m7, pre-scaled by
; 1/sqrt(2). %1 selects the lane permutation applied to the odd rows
; (q3120 for DCT order, q1302 for the ADST variants).
%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpermq               m0, [cq+32*0], q3120
    add                  cq, 32*4            ; negative offsets reach the first half
    vpermq               m7, [cq+32*3], q%1
    vpermq               m1, [cq-32*3], q%1
    vpermq               m6, [cq+32*2], q3120
    vpermq               m2, [cq-32*2], q3120
    vpermq               m5, [cq+32*1], q%1
    vpermq               m3, [cq-32*1], q%1
    vpermq               m4, [cq+32*0], q3120
    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro
stride, c, eob, tx2 2181 ITX_16X8_LOAD_COEFS 3120 2182 call m(idct_8x16_internal_8bpc).main 2183 vpbroadcastd m10, [o(pw_16384)] 2184 punpckhwd m8, m0, m2 2185 punpcklwd m0, m2 2186 punpckhwd m2, m1, m3 2187 punpcklwd m1, m3 2188 punpcklwd m9, m4, m6 2189 punpckhwd m4, m6 2190 punpcklwd m6, m5, m7 2191 punpckhwd m5, m7 2192 REPX {pmulhrsw x, m10}, m8, m1, m4, m6 2193.pass1_end: 2194 REPX {pmulhrsw x, m10}, m0, m2, m9, m5 2195 punpckhwd m3, m0, m8 2196 punpcklwd m0, m8 2197 punpckhwd m8, m2, m1 2198 punpcklwd m2, m1 2199 punpcklwd m7, m9, m4 2200 punpckhwd m9, m4 2201 punpcklwd m4, m5, m6 2202 punpckhwd m5, m6 2203 punpckhdq m1, m0, m2 2204 punpckldq m0, m2 2205 punpckldq m2, m3, m8 2206 punpckhdq m3, m8 2207 punpckldq m6, m7, m4 2208 punpckhdq m7, m4 2209 punpckldq m8, m9, m5 2210 punpckhdq m9, m5 2211 vperm2i128 m4, m0, m6, 0x31 2212 vinserti128 m0, xm6, 1 2213 vperm2i128 m5, m1, m7, 0x31 2214 vinserti128 m1, xm7, 1 2215 vperm2i128 m6, m2, m8, 0x31 2216 vinserti128 m2, xm8, 1 2217 vperm2i128 m7, m3, m9, 0x31 2218 vinserti128 m3, xm9, 1 2219 jmp tx2q 2220.pass2: 2221 call .main 2222 vpbroadcastd m8, [o(pw_2048)] 2223.end: 2224 REPX {pmulhrsw x, m8}, m0, m2, m4, m6 2225.end2: 2226 REPX {pmulhrsw x, m8}, m1, m3, m5, m7 2227 lea r3, [strideq*3] 2228 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 2229 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 2230.end3: 2231 pxor m0, m0 2232 REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 2233.end4: 2234 lea dstq, [dstq+strideq*4] 2235 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 2236 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 2237 RET 2238ALIGN function_align 2239cglobal_label .main 2240 vpbroadcastd m10, [o(pd_2048)] 2241.main2: 2242 IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 2243 ret 2244 2245INV_TXFM_16X8_FN adst, dct 2246INV_TXFM_16X8_FN adst, adst 2247INV_TXFM_16X8_FN adst, flipadst 2248INV_TXFM_16X8_FN adst, identity 2249 2250cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 2251 ITX_16X8_LOAD_COEFS 1302 2252 call 
m(iadst_8x16_internal_8bpc).main2 ; continuation: the "call" mnemonic for this target lies before this chunk
    call m(iadst_8x16_internal_8bpc).main_pass1_end
    psubw                m11, m9, m10
    punpcklwd             m8, m0, m2
    punpckhwd             m0, m2
    punpckhwd             m2, m1, m3
    punpcklwd             m1, m3
    punpcklwd             m9, m4, m6
    punpckhwd             m4, m6
    punpckhwd             m6, m5, m7
    punpcklwd             m5, m7
    REPX    {pmulhrsw x, m11}, m8, m1, m4, m6
    jmp m(idct_16x8_internal_8bpc).pass1_end
ALIGN function_align
.pass2: ; iadst 16x8 second pass: 8-pt ADST on rows, then shared idct_16x8 store path
    call .main
    call .main_pass2_end
    pxor                  m8, m8
    psubw                 m8, m9 ; m8 = -rounding constant, used for negated (odd) outputs
    REPX    {pmulhrsw x, m9}, m0, m2, m4, m6
    jmp m(idct_16x8_internal_8bpc).end2
ALIGN function_align
cglobal_label .main
    ; 8-point inverse ADST (two 16-wide lanes in parallel).
    ; Produces out0/out6 positive and -out1/-out7 negated; out2..out5 are
    ; finished by main_pass1_end (32-bit precision) or main_pass2_end (16-bit).
    vpbroadcastd         m10, [o(pd_2048)]
    ITX_MULSUB_2W          7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
    ITX_MULSUB_2W          3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
    ITX_MULSUB_2W          1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
    ITX_MULSUB_2W          5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
    psubsw                m8, m2, m6 ; t6
    paddsw                m2, m6     ; t2
    psubsw                m6, m0, m4 ; t4
    paddsw                m0, m4     ; t0
    psubsw                m4, m5, m1 ; t7
    paddsw                m5, m1     ; t3
    psubsw                m1, m7, m3 ; t5
    paddsw                m7, m3     ; t1
    ITX_MULSUB_2W          6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2W          4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
    psubsw                m9, m6, m8 ; t7
    paddsw                m6, m8     ; out6
    psubsw                m3, m7, m5 ; t3
    paddsw                m7, m5     ; -out7
    psubsw                m5, m0, m2 ; t2
    paddsw                m0, m2     ; out0
    psubsw                m2, m1, m4 ; t6
    paddsw                m1, m4     ; -out1
    ret
ALIGN function_align
.main_pass1_end:
    ; Finish out2..out5 with 32-bit intermediates (pmaddwd) for pass-1 accuracy.
    vpbroadcastd         m11, [o(pw_m2896_2896)]
    vpbroadcastd         m12, [o(pw_2896_2896)]
    punpckhwd             m4, m3, m5
    punpcklwd             m3, m5
    pmaddwd               m5, m11, m4
    pmaddwd               m4, m12
    pmaddwd               m8, m11, m3
    pmaddwd               m3, m12
    REPX     {paddd x, m10}, m5, m4, m8, m3
    REPX     {psrad x, 12 }, m5, m8, m4, m3
    packssdw              m3, m4     ; -out3
    packssdw              m4, m8, m5 ; out4
    punpcklwd             m5, m9, m2
    punpckhwd             m9, m2
    pmaddwd               m2, m12, m5
    pmaddwd               m5, m11
    pmaddwd              m12, m9
    pmaddwd              m11, m9
    REPX     {paddd x, m10}, m2, m5, m12, m11
    REPX     {psrad x, 12 }, m2, m12, m5, m11
    packssdw              m2, m12    ; out2
    packssdw              m5, m11    ; -out5
    ret
ALIGN function_align
cglobal_label .main_pass2_end
    ; Cheaper 16-bit finish for pass 2 (results are clipped to pixels anyway).
    vpbroadcastd          m8, [o(pw_2896x8)]
    psubsw                m4, m5, m3
    paddsw                m3, m5
    psubsw                m5, m2, m9
    paddsw                m2, m9
    pmulhrsw              m2, m8 ; out2
    pmulhrsw              m3, m8 ; -out3
    pmulhrsw              m4, m8 ; out4
    pmulhrsw              m5, m8 ; -out5
    vpbroadcastd          m9, [o(pw_2048)]
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; 16x8 flipped-ADST: reuses the 8x16 ADST main, then transposes with the
; output order reversed relative to iadst_16x8.
cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_16X8_LOAD_COEFS 1302
    call m(iadst_8x16_internal_8bpc).main2
    call m(iadst_8x16_internal_8bpc).main_pass1_end
    psubw                 m9, m10
    punpcklwd             m8, m6, m4
    punpckhwd             m6, m4
    punpcklwd             m4, m7, m5
    punpckhwd             m7, m5
    punpckhwd             m5, m3, m1
    punpcklwd             m3, m1
    punpckhwd             m1, m2, m0
    punpcklwd             m2, m0
    REPX   {pmulhrsw x, m10}, m8, m4, m5, m1
    REPX   {pmulhrsw x, m9 }, m6, m7, m3, m2
    punpcklwd             m0, m7, m4
    punpckhwd             m7, m4
    punpckhwd             m4, m6, m8
    punpcklwd             m6, m8
    punpckhwd             m8, m3, m5
    punpcklwd             m3, m5
    punpcklwd             m5, m2, m1
    punpckhwd             m2, m1
    punpckhdq             m1, m0, m6
    punpckldq             m0, m6
    punpckldq             m6, m7, m4
    punpckhdq             m7, m4
    punpckhdq             m4, m3, m5
    punpckldq             m3, m5
    punpckldq             m5, m8, m2
    punpckhdq             m8, m2
    vinserti128           m2, m6, xm5, 1
    vperm2i128            m6, m5, 0x31
    vperm2i128            m5, m1, m4, 0x31
    vinserti128           m1, xm4, 1
    vperm2i128            m4, m0, m3, 0x31
    vinserti128           m0, xm3, 1
    vinserti128           m3, m7, xm8, 1
    vperm2i128            m7, m8, 0x31
    jmp tx2q
.pass2:
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass2_end
    pxor                  m8, m8
    psubw                 m8, m9 ; negated rounding constant for the sign-flipped outputs
    pmulhrsw             m10, m7, m8
    pmulhrsw              m7, m0, m9
    pmulhrsw              m0, m6, m9
    pmulhrsw              m6, m1, m8
    pmulhrsw              m1, m5, m8
    pmulhrsw              m5, m2, m9
    pmulhrsw              m2, m4, m9
    pmulhrsw              m4, m3, m8
    lea                   r3, [strideq*3]
    WRITE_16X2            10, 0, 8, 9, strideq*0, strideq*1
    WRITE_16X2             1, 2, 0, 1, strideq*2, r3
    jmp m(idct_16x8_internal_8bpc).end3

INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; 16x8 identity transform: rect2 scale (2896/4096) on load, IDTX16 scaling,
; and an 8x8-wise transpose; pass 2 just rescales and reuses the idct store.
cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
    mova                 xm7, [cq+16*0]
    mova                 xm2, [cq+16*1]
    add                   cq, 16*8
    vpbroadcastd          m3, [o(pw_2896x8)]
    vinserti128           m7, [cq+16*0], 1
    vinserti128           m2, [cq+16*1], 1
    mova                 xm6, [cq-16*6]
    mova                 xm4, [cq-16*5]
    vinserti128           m6, [cq+16*2], 1
    vinserti128           m4, [cq+16*3], 1
    mova                 xm8, [cq-16*4]
    mova                 xm5, [cq-16*3]
    vinserti128           m8, [cq+16*4], 1
    vinserti128           m5, [cq+16*5], 1
    mova                 xm0, [cq-16*2]
    mova                 xm1, [cq-16*1]
    vinserti128           m0, [cq+16*6], 1
    vinserti128           m1, [cq+16*7], 1
    vpbroadcastd         m10, [o(pw_1697x16)]
    vpbroadcastd         m11, [o(pw_16384)]
    REPX     {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
    punpcklwd             m3, m7, m2
    punpckhwd             m7, m2
    punpcklwd             m2, m6, m4
    punpckhwd             m6, m4
    punpcklwd             m4, m8, m5
    punpckhwd             m8, m5
    punpcklwd             m5, m0, m1
    punpckhwd             m0, m1
    punpckldq             m1, m3, m2
    punpckhdq             m3, m2
    punpckldq             m2, m4, m5
    punpckhdq             m4, m5
    punpckldq             m5, m7, m6
    punpckhdq             m7, m6
    punpckldq             m6, m8, m0
    punpckhdq             m8, m0
    REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
    punpcklqdq            m0, m1, m2
    punpckhqdq            m1, m2
    punpcklqdq            m2, m3, m4
    punpckhqdq            m3, m4
    punpcklqdq            m4, m5, m6
    punpckhqdq            m5, m6
    punpcklqdq            m6, m7, m8
    punpckhqdq            m7, m8
    jmp tx2q
.pass2:
    vpbroadcastd          m8, [o(pw_4096)]
    jmp m(idct_16x8_internal_8bpc).end

%define o_base pw_5 + 128

%macro INV_TXFM_16X16_FN 2 ; type1, type2
    ; Declares a 16x16 itx entry; the dct_dct variant gets a DC-only fast path
    ; that defers to the shared 16x4 dconly loop with 16 rows.
    INV_TXFM_FN %1, %2, 16x16
%ifidn %1_%2, dct_dct
    movd                 xm1, [o(pw_2896x8)]
    pmulhrsw             xm0, xm1, [cq]
    movd                 xm2, [o(pw_8192)]
    mov                 [cq], eobd
    or                   r3d, 16
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
%endif
%endmacro

%macro ITX_16X16_LOAD_COEFS 0
    ; Loads all 16 coefficient rows into m0-m15; m15 (in15) is spilled to the
    ; stack because the 1-D transforms need a scratch register.
    mova                  m0, [cq+32*0]
    mova                  m1, [cq+32*1]
    mova                  m2, [cq+32*2]
    mova                  m3, [cq+32*3]
    add                   cq, 32*8
    mova                  m4, [cq-32*4]
    mova                  m5, [cq-32*3]
    mova                  m6, [cq-32*2]
    mova                  m7, [cq-32*1]
    mova                  m8, [cq+32*0]
    mova                  m9, [cq+32*1]
    mova                 m10, [cq+32*2]
    mova                 m11, [cq+32*3]
    mova                 m12, [cq+32*4]
    mova                 m13, [cq+32*5]
    mova                 m14, [cq+32*6]
    mova                 m15, [cq+32*7]
    mova               [rsp], m15
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
INV_TXFM_16X16_FN dct, identity

; 16x16 inverse DCT. .pass1_end* and .end* are shared entry points also used
; by the adst/flipadst/identity 16x16 variants below.
cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    ITX_16X16_LOAD_COEFS
    call .main
.pass1_end:
    vpbroadcastd          m1, [o(pw_8192)]
    REPX     {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    vextracti128 [rsp+16*5], m8, 1
    mova          [rsp+16*1], xm8
.pass1_end2:
    vextracti128 [rsp+16*4], m0, 1
    mova          [rsp+16*0], xm0
    REPX     {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
    pmulhrsw              m1, [rsp+32*1]
    vperm2i128            m8, m1, m9, 0x31
    vinserti128           m1, xm9, 1
    vperm2i128            m9, m2, m10, 0x31
    vinserti128           m2, xm10, 1
    vperm2i128           m10, m3, m11, 0x31
    vinserti128           m3, xm11, 1
    vperm2i128           m11, m4, m12, 0x31
    vinserti128           m4, xm12, 1
    vperm2i128           m12, m5, m13, 0x31
    vinserti128           m5, xm13, 1
    vperm2i128           m13, m6, m14, 0x31
    vinserti128           m6, xm14, 1
    vperm2i128           m14, m7, m15, 0x31
    vinserti128           m7, xm15, 1
    mova                 m15, [rsp+32*2]
.pass1_end3: ; 16x16 word transpose of the two 8-register halves
    punpcklwd             m0, m9, m10
    punpckhwd             m9, m10
    punpcklwd            m10, m15, m8
    punpckhwd            m15, m8
    punpckhwd             m8, m11, m12
    punpcklwd            m11, m12
    punpckhwd            m12, m13, m14
    punpcklwd            m13, m14
    punpckhdq            m14, m11, m13
    punpckldq            m11, m13
    punpckldq            m13, m15, m9
    punpckhdq            m15, m9
    punpckldq             m9, m10, m0
    punpckhdq            m10, m0
    punpckhdq             m0, m8, m12
    punpckldq             m8, m12
    punpcklqdq           m12, m13, m8
    punpckhqdq           m13, m8
    punpcklqdq            m8, m9, m11
    punpckhqdq            m9, m11
    punpckhqdq           m11, m10, m14
    punpcklqdq           m10, m14
    punpcklqdq           m14, m15, m0
    punpckhqdq           m15, m0
    mova                  m0, [rsp]
    mova               [rsp], m15
    punpckhwd            m15, m4, m5
    punpcklwd             m4, m5
    punpckhwd             m5, m0, m1
    punpcklwd             m0, m1
    punpckhwd             m1, m6, m7
    punpcklwd             m6, m7
    punpckhwd             m7, m2, m3
    punpcklwd             m2, m3
    punpckhdq             m3, m0, m2
    punpckldq             m0, m2
    punpckldq             m2, m4, m6
    punpckhdq             m4, m6
    punpckhdq             m6, m5, m7
    punpckldq             m5, m7
    punpckldq             m7, m15, m1
    punpckhdq            m15, m1
    punpckhqdq            m1, m0, m2
    punpcklqdq            m0, m2
    punpcklqdq            m2, m3, m4
    punpckhqdq            m3, m4
    punpcklqdq            m4, m5, m7
    punpckhqdq            m5, m7
    punpckhqdq            m7, m6, m15
    punpcklqdq            m6, m15
    jmp tx2q
.pass2:
    call .main
.end:
    vpbroadcastd          m1, [o(pw_2048)]
    REPX     {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    mova               [rsp], m6
.end2:
    REPX     {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
    pmulhrsw              m1, [rsp+32*1]
    lea                   r3, [strideq*3]
    WRITE_16X2             0, 1, 6, 0, strideq*0, strideq*1
    WRITE_16X2             2, 3, 0, 1, strideq*2, r3
    lea                 dstq, [dstq+strideq*4]
    WRITE_16X2             4, 5, 0, 1, strideq*0, strideq*1
    WRITE_16X2         [rsp], 7, 0, 1, strideq*2, r3
.end3: ; store remaining rows and clear the coefficient buffer
    pxor                  m2, m2
    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
    lea                 dstq, [dstq+strideq*4]
    WRITE_16X2             8, 9, 0, 1, strideq*0, strideq*1
    WRITE_16X2            10, 11, 0, 1, strideq*2, r3
    REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7
    lea                 dstq, [dstq+strideq*4]
    WRITE_16X2            12, 13, 0, 1, strideq*0, strideq*1
    WRITE_16X2            14, 15, 0, 1, strideq*2, r3
    RET
ALIGN function_align
cglobal_label .main
    ; 16-point inverse DCT: even half via IDCT8_1D, odd half via
    ; IDCT16_1D_ODDHALF, then the final butterfly producing out0..out15.
    ; in1/in9/in15 are juggled through the stack scratch slots.
    vpbroadcastd         m15, [o(pd_2048)]
    mova [rsp+gprsize+32*1], m1
    mova [rsp+gprsize+32*2], m9
    IDCT8_1D               0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15
    mova                  m1, [rsp+gprsize+32*2] ; in9
    mova [rsp+gprsize+32*2], m14 ; tmp7
    mova                  m9, [rsp+gprsize+32*1] ; in1
    mova [rsp+gprsize+32*1], m10 ; tmp5
    mova                 m14, [rsp+gprsize+32*0] ; in15
    mova [rsp+gprsize+32*0], m6  ; tmp3
    IDCT16_1D_ODDHALF      9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
    mova                  m6, [rsp+gprsize+32*1] ; tmp5
    psubsw               m15, m0, m14 ; out15
    paddsw                m0, m14     ; out0
    psubsw               m14, m2, m13 ; out14
    paddsw                m2, m13     ; out1
    mova [rsp+gprsize+32*1], m2
    psubsw               m13, m4, m11 ; out13
    paddsw                m2, m4, m11 ; out2
    psubsw               m11, m8, m7  ; out11
    paddsw                m4, m8, m7  ; out4
    mova                  m7, [rsp+gprsize+32*2] ; tmp7
    psubsw               m10, m6, m5  ; out10
    paddsw                m5, m6      ; out5
    psubsw                m8, m7, m9  ; out8
    paddsw                m7, m9      ; out7
    psubsw                m9, m12, m3 ; out9
    paddsw                m6, m12, m3 ; out6
    mova                  m3, [rsp+gprsize+32*0] ; tmp3
    psubsw               m12, m3, m1  ; out12
    paddsw                m3, m1      ; out3
    ret

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

; 16x16 inverse ADST. Even outputs positive, odd outputs negated; the
; negation is folded into the rounding constant (m1 flipped via psubw).
cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    ITX_16X16_LOAD_COEFS
    call .main
    call .main_pass1_end
    pmulhrsw              m0, m1, [cq+32*0]
    pmulhrsw              m2, m1, [cq+32*1]
    REPX     {pmulhrsw x, m1}, m4, m6, m8, m10
    pmulhrsw             m12, m1, [cq+32*2]
    pmulhrsw             m14, m1, [cq+32*3]
    vextracti128 [rsp+16*5], m8, 1
    mova          [rsp+16*1], xm8
    pxor                  m8, m8
    psubw                 m1, m8, m1 ; negate rounding for the odd (negated) outputs
    jmp m(idct_16x16_internal_8bpc).pass1_end2
ALIGN function_align
.pass2:
    call .main
    call .main_pass2_end
    REPX     {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    mova          [rsp+32*0], m6
    pxor                  m6, m6
    psubw                 m1, m6, m1
    jmp m(idct_16x16_internal_8bpc).end2
ALIGN function_align
cglobal_label .main
    ; 16-point ADST butterfly network. Leaves out0,±out1..±out3,out12..±out15
    ; finished; the middle outputs (4,5,6,7,8,9,10,11) still need the final
    ; 2896-rotation done by main_pass1_end / main_pass2_end.
    vpbroadcastd         m15, [o(pd_2048)]
    mova [rsp+gprsize+32*1], m0
    mova [rsp+gprsize+32*2], m4
    ITX_MULSUB_2W         13, 2, 0, 4, 15,  995, 3973 ; t3,  t2
    ITX_MULSUB_2W          9, 6, 0, 4, 15, 2440, 3290 ; t7,  t6
    ITX_MULSUB_2W          5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
    ITX_MULSUB_2W          1, 14, 0, 4, 15, 4052,  601 ; t15, t14
    psubsw                m0, m2, m10  ; t10a
    paddsw                m2, m10      ; t2a
    psubsw               m10, m13, m5  ; t11a
    paddsw               m13, m5       ; t3a
    psubsw                m5, m6, m14  ; t14a
    paddsw                m6, m14      ; t6a
    psubsw               m14, m9, m1   ; t15a
    paddsw                m9, m1       ; t7a
    ITX_MULSUB_2W          0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
    ITX_MULSUB_2W         14, 5, 1, 4, 15, 2276, 3406  ; t14, t15
    psubsw                m1, m10, m14 ; t14a
    paddsw               m10, m14      ; t10a
    psubsw               m14, m0, m5   ; t15a
    paddsw                m0, m5       ; t11a
    psubsw                m5, m2, m6   ; t6
    paddsw                m2, m6       ; t2
    psubsw                m6, m13, m9  ; t7
    paddsw               m13, m9       ; t3
    ITX_MULSUB_2W          6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
    ITX_MULSUB_2W         14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
    mova                  m9, [rsp+gprsize+32*0] ; in15
    mova [rsp+gprsize+32*0], m10 ; t10a
    mova                  m4, [rsp+gprsize+32*1] ; in0
    mova [rsp+gprsize+32*1], m6  ; t6a
    mova                  m6, [rsp+gprsize+32*2] ; in4
    mova [rsp+gprsize+32*2], m2  ; t2
    ITX_MULSUB_2W          9, 4, 2, 10, 15,  201, 4091 ; t1,  t0
    ITX_MULSUB_2W         11, 6, 2, 10, 15, 1751, 3703 ; t5,  t4
    ITX_MULSUB_2W          7, 8, 2, 10, 15, 3035, 2751 ; t9,  t8
    ITX_MULSUB_2W          3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
    psubsw               m10, m4, m8   ; t8a
    paddsw                m8, m4       ; t0a
    psubsw                m4, m9, m7   ; t9a
    paddsw                m9, m7       ; t1a
    psubsw                m7, m6, m12  ; t12a
    paddsw                m6, m12      ; t4a
    psubsw               m12, m11, m3  ; t13a
    paddsw               m11, m3       ; t5a
    ITX_MULSUB_2W         10, 4, 2, 3, 15,  799, 4017 ; t9,  t8
    ITX_MULSUB_2W         12, 7, 2, 3, 15, 4017,  799 ; t12, t13
    psubsw                m3, m9, m11  ; t5
    paddsw                m9, m11      ; t1
    psubsw               m11, m4, m12  ; t12a
    paddsw                m4, m12      ; t8a
    paddsw               m12, m8, m6   ; t0
    psubsw                m8, m6       ; t4
    paddsw                m6, m10, m7  ; t9a
    psubsw               m10, m7       ; t13a
    ITX_MULSUB_2W          8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2W         11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
    mova                  m7, [rsp+gprsize+32*0] ; t10a
    mova                  m2, [rsp+gprsize+32*1] ; t6a
    paddsw               m15, m9, m13  ; -out15
    psubsw                m9, m13      ; t3a
    paddsw               m13, m11, m1  ; -out13
    psubsw               m11, m1       ; t15a
    psubsw                m1, m4, m7   ; t10
    paddsw                m7, m4       ; -out1
    psubsw                m4, m3, m2   ; t6
    paddsw                m3, m2       ; -out3
    paddsw                m2, m10, m14 ; out2
    psubsw               m10, m14      ; t14a
    paddsw               m14, m6, m0   ; out14
    psubsw                m6, m0       ; t11
    mova                  m0, [rsp+gprsize+32*2] ; t2
    mova [rsp+gprsize+32*1], m7
    psubsw                m7, m12, m0  ; t2a
    paddsw                m0, m12      ; out0
    paddsw               m12, m8, m5   ; out12
    psubsw                m8, m5       ; t7
    ret
ALIGN function_align
.main_pass1_end:
    ; 32-bit-precision finish of out4..out11; the finished out0/2/12/14 are
    ; parked in the coefficient buffer. The bare (non-o()) rip-relative
    ; constant loads avoid needing the r6 base here.
    mova           [cq+32*0], m0
    mova           [cq+32*1], m2
    mova           [cq+32*2], m12
    mova           [cq+32*3], m14
    vpbroadcastd         m14, [pw_m2896_2896]
    vpbroadcastd         m12, [pw_2896_2896]
    vpbroadcastd          m2, [pd_2048]
    punpcklwd             m5, m11, m10
    punpckhwd            m11, m10
    pmaddwd              m10, m14, m5
    pmaddwd               m0, m14, m11
    pmaddwd               m5, m12
    pmaddwd              m11, m12
    REPX      {paddd x, m2}, m10, m0, m5, m11
    REPX      {psrad x, 12}, m10, m0, m5, m11
    packssdw             m10, m0  ; out10
    packssdw              m5, m11 ; -out5
    punpcklwd            m11, m8, m4
    punpckhwd             m8, m4
    pmaddwd               m4, m12, m11
    pmaddwd               m0, m12, m8
    pmaddwd              m11, m14
    pmaddwd               m8, m14
    REPX      {paddd x, m2}, m4, m0, m11, m8
    REPX      {psrad x, 12}, m4, m0, m11, m8
    packssdw              m4, m0  ; out4
    packssdw             m11, m8  ; -out11
    punpcklwd             m8, m9, m7
    punpckhwd             m9, m7
    pmaddwd               m7, m12, m8
    pmaddwd               m0, m12, m9
    pmaddwd               m8, m14
    pmaddwd               m9, m14
    REPX      {paddd x, m2}, m7, m0, m8, m9
    REPX      {psrad x, 12}, m7, m0, m8, m9
    packssdw              m7, m0  ; -out7
    packssdw              m8, m9  ; out8
    punpckhwd             m0, m6, m1
    punpcklwd             m6, m1
    pmaddwd               m1, m14, m0
    pmaddwd               m9, m14, m6
    pmaddwd               m0, m12
    pmaddwd               m6, m12
    REPX      {paddd x, m2}, m1, m9, m0, m6
    REPX      {psrad x, 12}, m1, m9, m0, m6
    packssdw              m9, m1  ; -out9
    packssdw              m6, m0  ; out6
    vpbroadcastd          m1, [o(pw_8192)]
    ret
ALIGN function_align
cglobal_label .main_pass2_end
    ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
    ; 16-bit here will produce the same result as using 32-bit intermediates.
    paddsw                m5, m10, m11 ; -out5
    psubsw               m10, m11      ; out10
    psubsw               m11, m4, m8   ; -out11
    paddsw                m4, m8       ; out4
    psubsw                m8, m7, m9   ; out8
    paddsw                m7, m9       ; -out7
    psubsw                m9, m1, m6   ; -out9
    paddsw                m6, m1       ; out6
    vpbroadcastd          m1, [o(pw_2896x8)]
    REPX     {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
    vpbroadcastd          m1, [o(pw_2048)]
    ret

INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

; 16x16 flipped ADST: shares the iadst_16x16 main/pass-end routines and then
; permutes/negates the outputs in reversed order.
cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    ITX_16X16_LOAD_COEFS
    call m(iadst_16x16_internal_8bpc).main
    call m(iadst_16x16_internal_8bpc).main_pass1_end
    pmulhrsw              m6, m1
    pmulhrsw              m2, m1, m8
    mova          [rsp+32*2], m6
    pmulhrsw              m6, m1, m4
    pmulhrsw              m4, m1, m10
    pmulhrsw              m8, m1, [cq+32*3]
    pmulhrsw             m10, m1, [cq+32*2]
    pmulhrsw             m12, m1, [cq+32*1]
    pmulhrsw             m14, m1, [cq+32*0]
    pxor                  m0, m0
    psubw                 m0, m1 ; negated rounding constant
    REPX     {pmulhrsw x, m0}, m3, m5, m7, m11, m15
    pmulhrsw              m1, m0, m9
    pmulhrsw              m9, m0, m13
    pmulhrsw              m0, [rsp+32*1]
    mova          [rsp+16*0], xm15
    mova          [rsp+16*1], xm7
    vperm2i128           m15, m15, m7, 0x31
    vinserti128           m7, m2, xm14, 1
    vperm2i128           m14, m2, m14, 0x31
    vinserti128           m2, m9, xm5, 1
    vperm2i128            m9, m9, m5, 0x31
    vinserti128           m5, m4, xm12, 1
    vperm2i128           m12, m4, m12, 0x31
    vinserti128           m4, m11, xm3, 1
    vperm2i128           m11, m11, m3, 0x31
    vinserti128           m3, m10, xm6, 1
    vperm2i128           m10, m10, m6, 0x31
    vinserti128           m6, m1, xm0, 1
    vperm2i128           m13, m1, m0, 0x31
    vinserti128           m1, m8, [rsp+32*2], 1
    vperm2i128            m8, m8, [rsp+32*2], 0x31
    jmp m(idct_16x16_internal_8bpc).pass1_end3
.pass2:
    call m(iadst_16x16_internal_8bpc).main
    call m(iadst_16x16_internal_8bpc).main_pass2_end
    pmulhrsw              m0, m1
    pmulhrsw              m8, m1
    mova          [rsp+32*0], m0
    mova          [rsp+32*2], m8
    pxor                  m0, m0
    psubw                 m0, m1
    pmulhrsw              m8, m0, m7
    pmulhrsw              m7, m0, m9
    pmulhrsw              m9, m1, m6
    pmulhrsw              m6, m1, m10
    pmulhrsw             m10, m0, m5
    pmulhrsw              m5, m0, m11
    pmulhrsw             m11, m1, m4
    pmulhrsw              m4, m1, m12
    pmulhrsw             m12, m0, m3
    pmulhrsw              m3, m0, m13
    pmulhrsw             m13, m1, m2
    pmulhrsw              m1, m14
    pmulhrsw             m14, m0, [rsp+32*1]
    pmulhrsw              m0, m15
    lea                   r3, [strideq*3]
    WRITE_16X2             0, 1, 2, 0, strideq*0, strideq*1
    mova                 m15, [rsp+32*0]
    WRITE_16X2             3, 4, 0, 1, strideq*2, r3
    lea                 dstq, [dstq+strideq*4]
    WRITE_16X2             5, 6, 0, 1, strideq*0, strideq*1
    WRITE_16X2             7, [rsp+32*2], 0, 1, strideq*2, r3
    jmp m(idct_16x16_internal_8bpc).end3

%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
    ; dst = dst * (1 + 1697/2048) with round-half-up, computed without
    ; overflow via a halved product and pavgw.
    pmulhrsw             m%2, m%3, m%1
    psraw                m%2, 1
    pavgw                m%1, m%2 ; signs are guaranteed to be equal
%endmacro

INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity

; 16x16 identity transform, pass 1: load rows interleaved (low half from the
; first 8 columns, high half from the last), scale by 1697/1024, and jump
; into the shared 16x16 transpose.
cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    vpbroadcastd          m7, [o(pw_1697x16)]
    mova                 xm0, [cq+16* 0]
    vinserti128           m0, [cq+16*16], 1
    mova                xm15, [cq+16* 1]
    vinserti128          m15, [cq+16*17], 1
    mova                 xm1, [cq+16* 2]
    vinserti128           m1, [cq+16*18], 1
    mova                 xm8, [cq+16* 3]
    vinserti128           m8, [cq+16*19], 1
    mova                 xm2, [cq+16* 4]
    vinserti128           m2, [cq+16*20], 1
    mova                 xm9, [cq+16* 5]
    vinserti128           m9, [cq+16*21], 1
    mova                 xm3, [cq+16* 6]
    vinserti128           m3, [cq+16*22], 1
    mova                xm10, [cq+16* 7]
    add                   cq, 16*16
    vinserti128          m10, [cq+16* 7], 1
    mova                 xm4, [cq-16* 8]
    vinserti128           m4, [cq+16* 8], 1
    mova                xm11, [cq-16* 7]
    vinserti128          m11, [cq+16* 9], 1
    mova                 xm5, [cq-16* 6]
    vinserti128           m5, [cq+16*10], 1
    mova                xm12, [cq-16* 5]
    vinserti128          m12, [cq+16*11], 1
    mova                xm13, [cq-16* 3]
    vinserti128          m13, [cq+16*13], 1
    mova                xm14, [cq-16* 1]
    vinserti128          m14, [cq+16*15], 1
    REPX  {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
                             10, 4, 11, 5, 12, 13, 14
    mova                 xm6, [cq-16* 4]
    vinserti128           m6, [cq+16*12], 1
    mova               [rsp], m0
    IDTX16B                6, 0, 7
    mova                 xm0, [cq-16* 2]
    vinserti128           m0, [cq+16*14], 1
    pmulhrsw              m7, m0 ; inlined IDTX16B for the last row (m7 holds the constant)
    psraw                 m7, 1
    pavgw                 m7, m0
    jmp m(idct_16x16_internal_8bpc).pass1_end3
ALIGN function_align
.pass2: ; identity pass 2: x *= 2*(1 + 1697/2048), then shared store path
    vpbroadcastd         m15, [o(pw_1697x16)]
    mova          [rsp+32*1], m0
    REPX   {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
                              8, 9, 10, 11, 12, 13, 14
    mova                  m0, [rsp+32*1]
    mova          [rsp+32*1], m1
    IDTX16                 0, 1, 15
    mova                  m1, [rsp+32*0]
    pmulhrsw             m15, m1
    paddsw                m1, m1
    paddsw               m15, m1
    jmp m(idct_16x16_internal_8bpc).end

%define o_base deint_shuf + 128

%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
    ; Loads 8 rows into m0-m7; is_rect2 applies the sqrt(2) (2896/2048) scale.
%if %3
    vpbroadcastd         m15, [o(pw_2896x8)]
    pmulhrsw              m0, m15, [%1+%2*0]
    pmulhrsw              m1, m15, [%1+%2*1]
    pmulhrsw              m2, m15, [%1+%2*2]
    pmulhrsw              m3, m15, [%1+%2*3]
    pmulhrsw              m4, m15, [%1+%2*4]
    pmulhrsw              m5, m15, [%1+%2*5]
    pmulhrsw              m6, m15, [%1+%2*6]
    pmulhrsw              m7, m15, [%1+%2*7]
%else
    mova                  m0, [%1+%2*0]
    mova                  m1, [%1+%2*1]
    mova                  m2, [%1+%2*2]
    mova                  m3, [%1+%2*3]
    mova                  m4, [%1+%2*4]
    mova                  m5, [%1+%2*5]
    mova                  m6, [%1+%2*6]
    mova                  m7, [%1+%2*7]
%endif
%endmacro

%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
    ; Same as LOAD_8ROWS but for the high registers m8-m15.
    ; is_rect2 == 2 reuses an already-broadcast constant in m15.
%if %3
%if %3 == 1
    vpbroadcastd         m15, [o(pw_2896x8)]
%endif
    pmulhrsw              m8, m15, [%1+%2*0]
    pmulhrsw              m9, m15, [%1+%2*1]
    pmulhrsw             m10, m15, [%1+%2*2]
    pmulhrsw             m11, m15, [%1+%2*3]
    pmulhrsw             m12, m15, [%1+%2*4]
    pmulhrsw             m13, m15, [%1+%2*5]
    pmulhrsw             m14, m15, [%1+%2*6]
    pmulhrsw             m15, [%1+%2*7]
%else
    mova                  m8, [%1+%2*0]
    mova                  m9, [%1+%2*1]
    mova                 m10, [%1+%2*2]
    mova                 m11, [%1+%2*3]
    mova                 m12, [%1+%2*4]
    mova                 m13, [%1+%2*5]
    mova                 m14, [%1+%2*6]
    mova                 m15, [%1+%2*7]
%endif
%endmacro

%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
    ; Duplicates words and multiplies by packed coefficient pairs; r5 points
    ; at pw_201_4091x8 so the pw_*x8 constants are addressed relative to it.
    vpbroadcastd         m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
    punpcklwd            m%1, m%2, m%2
    pmulhrsw             m%1, m%3
    vpbroadcastd         m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
    punpckhwd            m%2, m%2
    pmulhrsw             m%2, m%3
%endmacro

; 8x32 inverse DCT+add. eob <= 106 means the odd 16 input rows are all zero
; (.fast); otherwise both halves are transformed (.full). .main/.main_fast
; are also reused by the 32x8 transform below.
cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
    lea                   r6, [o_base]
    test                eobd, eobd
    jz .dconly
    PROLOGUE               0, 4, 16, 32*3, dst, stride, c, eob
    %undef cmp
    cmp                 eobd, 106
    jle .fast
    LOAD_8ROWS     cq+32*1, 32*2
    call m(idct_16x8_internal_8bpc).main
    vperm2i128           m11, m0, m4, 0x31
    vinserti128           m0, xm4, 1
    vperm2i128            m4, m1, m5, 0x31
    vinserti128           m1, xm5, 1
    vperm2i128            m5, m2, m6, 0x31
    vinserti128           m2, xm6, 1
    vperm2i128            m6, m3, m7, 0x31
    vinserti128           m3, xm7, 1
    pxor                  m7, m7
    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
    punpckhwd             m7, m0, m1
    punpcklwd             m0, m1
    punpckhwd             m1, m2, m3
    punpcklwd             m2, m3
    punpcklwd             m3, m11, m4
    punpckhwd            m11, m4
    punpckhwd             m4, m5, m6
    punpcklwd             m5, m6
    punpckhdq             m6, m0, m2
    punpckldq             m0, m2
    punpckldq             m2, m3, m5
    punpckhdq             m3, m5
    punpckhdq             m5, m11, m4
    punpckldq            m11, m4
    punpckldq             m4, m7, m1
    punpckhdq             m7, m1
    punpckhqdq           m12, m6, m0
    punpcklqdq            m0, m6 ; out4
    punpckhqdq           m13, m7, m4
    punpcklqdq            m4, m7 ; out5
    punpckhqdq           m14, m3, m2
    punpcklqdq            m2, m3 ; out6
    punpckhqdq           m15, m5, m11
    punpcklqdq           m11, m5 ; out7
    mova          [rsp+32*0], m0
    mova          [rsp+32*1], m4
    mova          [rsp+32*2], m2
.fast:
    LOAD_8ROWS     cq+32*0, 32*2
    call m(idct_16x8_internal_8bpc).main
    vperm2i128            m8, m0, m4, 0x31
    vinserti128           m0, xm4, 1
    vperm2i128            m4, m1, m5, 0x31
    vinserti128           m1, xm5, 1
    vperm2i128            m5, m2, m6, 0x31
    vinserti128           m2, xm6, 1
    vperm2i128            m6, m3, m7, 0x31
    vinserti128           m3, xm7, 1
    vpbroadcastd          m9, [o(pw_8192)]
    pxor                  m7, m7
    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
    punpckhwd             m7, m0, m1
    punpcklwd             m0, m1
    punpckhwd             m1, m2, m3
    punpcklwd             m2, m3
    punpckhwd             m3, m8, m4
    punpcklwd             m8, m4
    punpckhwd             m4, m5, m6
    punpcklwd             m5, m6
    punpckhdq             m6, m0, m2
    punpckldq             m0, m2
    punpckldq             m2, m8, m5
    punpckhdq             m8, m5
    punpckhdq             m5, m3, m4
    punpckldq             m3, m4
    punpckhdq             m4, m7, m1
    punpckldq             m7, m1
    punpcklqdq            m1, m7, m4
    punpckhqdq            m7, m4 ; out9
    punpckhqdq            m4, m2, m8 ; out10
    punpcklqdq            m2, m8
    punpckhqdq            m8, m3, m5
    punpcklqdq            m3, m5
    punpckhqdq            m5, m0, m6 ; out8
    punpcklqdq            m0, m6
    REPX     {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
    cmp                 eobd, 106
    jg .full
    mova          [rsp+32*0], m5
    mova          [rsp+32*1], m7
    mova          [rsp+32*2], m4
    pmulhrsw             m11, m9, m8
    pxor                  m4, m4
    REPX        {mova x, m4}, m5, m6, m7 ; upper inputs are zero in the fast path
    call .main_fast
    jmp .pass2
.dconly:
    movd                 xm1, [o(pw_2896x8)]
    pmulhrsw             xm0, xm1, [cq]
    movd                 xm2, [o(pw_8192)]
    mov                 [cq], eobd
    or                   r3d, 32
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
.full:
    REPX     {pmulhrsw x, m9}, m12, m13, m14, m15
    pmulhrsw              m6, m9, [rsp+32*2]
    mova          [rsp+32*2], m4
    pmulhrsw              m4, m9, [rsp+32*0]
    mova          [rsp+32*0], m5
    pmulhrsw              m5, m9, [rsp+32*1]
    mova          [rsp+32*1], m7
    pmulhrsw              m7, m9, m11
    pmulhrsw             m11, m9, m8
    call .main
.pass2:
    vpbroadcastd         m12, [o(pw_2048)]
    REPX    {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \
                               m8, m9, m10, m11, m13, m14, m15
    pmulhrsw             m12, [rsp]
    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
    mova          [rsp+32*0], m4
    mova          [rsp+32*1], m6
    lea                   r3, [strideq*3]
    WRITE_8X4              0, 1, 4, 6
    lea                 dstq, [dstq+strideq*4]
    WRITE_8X4              2, 3, 4, 6
    lea                 dstq, [dstq+strideq*4]
    WRITE_8X4     [rsp+32*0], 5, 4, 6
    lea                 dstq, [dstq+strideq*4]
    WRITE_8X4     [rsp+32*1], 7, 4, 6
    lea                 dstq, [dstq+strideq*4]
    WRITE_8X4              8, 9, 4, 6
    lea                 dstq, [dstq+strideq*4]
    WRITE_8X4             10, 11, 4, 6
    lea                 dstq, [dstq+strideq*4]
    WRITE_8X4             12, 13, 4, 6
    lea                 dstq, [dstq+strideq*4]
    WRITE_8X4             14, 15, 4, 6
    RET
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    ; Zero odd inputs let the first rotation stage collapse to simple
    ; unpack+multiply (ITX_UNPACK_MULHRSW) instead of full butterflies.
    call m(idct_8x16_internal_8bpc).main
    mova                  m8, [rsp+gprsize+0*32]
    mova [rsp+gprsize+0*32], m0
    mova                  m9, [rsp+gprsize+1*32]
    mova [rsp+gprsize+1*32], m1
    mova                  m0, [rsp+gprsize+2*32]
    mova [rsp+gprsize+2*32], m6
    lea                   r5, [r6-(o_base)+pw_201_4091x8]
    ITX_UNPACK_MULHRSW     1, 8, 6, 201, 4091, m601, 4052   ; t16a, t31a, t23a, t24a
    ITX_UNPACK_MULHRSW    15, 9, 6, 995, 3973, m1380, 3857  ; t20a, t27a, t19a, t28a
    ITX_UNPACK_MULHRSW    14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
    ITX_UNPACK_MULHRSW    13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
    jmp .main2
ALIGN function_align
cglobal_label .main
    ; Full 32-point odd-half: interleave the 16 odd inputs into packed pairs,
    ; then run the rotation stages shared with .main_fast from .main2 on.
    call m(idct_8x16_internal_8bpc).main
    mova                  m8, [rsp+gprsize+0*32]
    mova [rsp+gprsize+0*32], m0
    mova                  m9, [rsp+gprsize+1*32]
    mova [rsp+gprsize+1*32], m1
    mova                  m0, [rsp+gprsize+2*32]
    mova [rsp+gprsize+2*32], m6
    punpcklwd             m1, m15, m8  ; in31 in1
    punpckhwd             m8, m15      ; in3  in29
    punpcklwd            m15, m14, m9  ; in27 in5
    punpckhwd             m9, m14      ; in7  in25
    punpcklwd            m14, m13, m0  ; in23 in9
    punpckhwd             m0, m13      ; in11 in21
    punpcklwd            m13, m12, m11 ; in19 in13
    punpckhwd            m11, m12      ; in15 in17
    ITX_MUL2X_PACK         1, 6, 12, 10,  201, 4091, 3 ; t16a, t31a
    ITX_MUL2X_PACK         8, 6, 12, 10, 4052,  601, 3 ; t23a, t24a
    ITX_MUL2X_PACK        15, 6, 12, 10,  995, 3973, 3 ; t20a, t27a
    ITX_MUL2X_PACK         9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a
    ITX_MUL2X_PACK        14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a
    ITX_MUL2X_PACK         0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a
    ITX_MUL2X_PACK        13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
    ITX_MUL2X_PACK        11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
.main2:
    psubsw                m6, m1, m11  ; t17 t30
    paddsw                m1, m11      ; t16 t31
    psubsw               m11, m9, m14  ; t18 t29
    paddsw                m9, m14      ; t19 t28
    psubsw               m14, m15, m0  ; t21 t26
    paddsw               m15, m0       ; t20 t27
    psubsw                m0, m8, m13  ; t22 t25
    paddsw                m8, m13      ; t23 t24
    ITX_MUL2X_PACK         6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
    ITX_MUL2X_PACK        11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
    ITX_MUL2X_PACK        14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
    ITX_MUL2X_PACK         0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
    psubsw               m13, m1, m9   ; t19a t28a
    paddsw                m1, m9       ; t16a t31a
    psubsw                m9, m8, m15  ; t20a t27a
    paddsw                m8, m15      ; t23a t24a
    psubsw               m15, m6, m11  ; t18  t29
    paddsw                m6, m11      ; t17  t30
    psubsw               m11, m0, m14  ; t21  t26
    paddsw                m0, m14      ; t22  t25
    ITX_MUL2X_PACK        15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
    ITX_MUL2X_PACK        13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
    ITX_MUL2X_PACK         9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
    ITX_MUL2X_PACK        11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
    vbroadcasti128       m12, [o(deint_shuf)]
    psubsw               m14, m1, m8   ; t23  t24
    paddsw                m1, m8       ; t16  t31
    psubsw                m8, m6, m0   ; t22a t25a
    paddsw                m6, m0       ; t17a t30a
    psubsw                m0, m15, m11 ; t21  t26
    paddsw               m15, m11      ; t18  t29
    psubsw               m11, m13, m9  ; t20a t27a
    paddsw               m13, m9       ; t19a t28a
    REPX     {pshufb x, m12}, m1, m6, m15, m13
    ITX_MUL2X_PACK        14, 9, 12, 10, 2896, 2896 ; t24a t23a
    vpbroadcastd          m9, [o(pw_m2896_2896)]
    ITX_MUL2X_PACK         8, 12, _, 10, 12, 9, 4 ; t22  t25
    vpbroadcastd         m12, [o(pw_2896_2896)] ; flag-4 PACK clobbers m12, so re-broadcast
    ITX_MUL2X_PACK         0, 12, _, 10, 12, 9, 4 ; t21a t26a
    vpbroadcastd         m12, [o(pw_2896_2896)]
    ITX_MUL2X_PACK        11, 9, _, 10, 9, 12, 4 ; t27  t20
    shufps                m9, m14, m8, q1032 ; t23a t22
    vpblendd             m14, m8, 0xcc       ; t24a t25
    shufps                m8, m11, m0, q1032 ; t20  t21a
    vpblendd             m11, m0, 0xcc       ; t27  t26a
    punpcklqdq            m0, m1, m6 ; t16  t17a
    punpckhqdq            m1, m6     ; t31  t30a
    psubsw               m10, m5, m8 ; out20 out21
    paddsw                m5, m8     ; out11 out10
    psubsw                m6, m3, m14 ; out24 out25
    paddsw                m3, m14     ; out7  out6
    psubsw                m8, m7, m0  ; out16 out17
    paddsw                m7, m0      ; out15 out14
    mova                  m0, [rsp+gprsize+0*32]
    punpcklqdq           m12, m13, m15 ; t19a t18
    punpckhqdq           m13, m15      ; t28a t29
    psubsw               m15, m0, m1   ; out31 out30
    paddsw                m0, m1       ; out0  out1
    mova                  m1, [rsp+gprsize+1*32]
    mova [rsp+gprsize+0*32], m6
    mova                  m6, [rsp+gprsize+2*32]
    psubsw               m14, m1, m13  ; out28 out29
    paddsw                m1, m13      ; out3  out2
    psubsw               m13, m2, m11  ; out27 out26
    paddsw                m2, m11      ; out4  out5
    psubsw               m11, m4, m9   ; out23 out22
    paddsw                m4, m9       ; out8  out9
    psubsw                m9, m6, m12  ; out19 out18
    paddsw                m6, m12      ; out12 out13
    ret

%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
    ; Packs two 16-byte coefficient rows into one ymm (row1 in low half of
    ; each lane, row2 in high half) via shufpd.
    vbroadcasti128       m%1, [cq+16*%3]
    vbroadcasti128       m%2, [cq+16*%4]
    shufpd               m%1, m%2, 0x0c
%endmacro

; 32x8 inverse DCT+add: shares the 8x32 odd-half main/main_fast routines and
; finishes with two 16x8 column transforms.
cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
    lea                   r6, [o_base]
    test                eobd, eobd
    jnz .normal
    movd                 xm1, [o(pw_2896x8)]
    pmulhrsw             xm0, xm1, [cq]
    movd                 xm2, [o(pw_8192)]
    mov                 [cq], eobd
    or                   r3d, 8
.dconly: ; DC-only: replicate the rounded DC value and add to r3d rows
    pmulhrsw             xm0, xm2
    movd                 xm2, [pw_2048] ; intentionally rip-relative
    pmulhrsw             xm0, xm1
    pmulhrsw             xm0, xm2
    vpbroadcastw          m0, xm0
    pxor                  m3, m3
.dconly_loop:
    mova                  m1, [dstq]
    punpckhbw             m2, m1, m3
    punpcklbw             m1, m3
    paddw                 m2, m0
    paddw                 m1, m0
    packuswb              m1, m2
    mova              [dstq], m1
    add                 dstq, strideq
    dec                  r3d
    jg .dconly_loop
    RET
.normal:
    PROLOGUE               0, 4, 16, 32*3, dst, stride, c, eob
    %undef cmp
    LOAD_PACKED_16X2       0, 7, 0, 2   ; in0  in2
    LOAD_PACKED_16X2       4, 7, 1, 3   ; in1  in3
    LOAD_PACKED_16X2       1, 7, 4, 6   ; in4  in6
    LOAD_PACKED_16X2       5, 7, 5, 7   ; in5  in7
    pxor                  m8, m8
    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
    add                   cq, 16*16
    LOAD_PACKED_16X2       2, 7, -8, -6 ; in8  in10
    LOAD_PACKED_16X2       6, 7, -7, -5 ; in9  in11
    LOAD_PACKED_16X2       3, 7, -4, -2 ; in12 in14
    LOAD_PACKED_16X2      11, 7, -3, -1 ; in13 in15
    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
    mova          [rsp+32*0], m4
    mova          [rsp+32*1], m5
    mova          [rsp+32*2], m6
    cmp                 eobd, 106
    jg .full
    pxor                  m4, m4
    REPX        {mova x, m4}, m5, m6, m7
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
    jmp .pass2
.full:
    LOAD_PACKED_16X2       4, 7, 0, 2   ; in16 in18
    LOAD_PACKED_16X2      12, 7, 3, 1   ; in19 in17
    LOAD_PACKED_16X2       5, 7, 4, 6   ; in20 in22
    LOAD_PACKED_16X2      13, 7, 7, 5   ; in23 in21
    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
    add                   cq, 16*8
    LOAD_PACKED_16X2       6, 7, 0, 2   ; in24 in26
    LOAD_PACKED_16X2      14, 7, 3, 1   ; in27 in25
    LOAD_PACKED_16X2       7, 8, 4, 6   ; in28 in30
    LOAD_PACKED_16X2      15, 8, 7, 5   ; in31 in29
    pxor                  m8, m8
    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
.pass2: ; transpose halves and run two 16x8 idct column passes
    vpbroadcastd         m12, [o(pw_8192)]
    REPX    {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
    mova          [rsp+32*1], m9
    mova          [rsp+32*2], m10
    punpckhwd             m9, m0, m2
    punpcklwd             m0, m2
    punpckhwd             m2, m1, m3
    punpcklwd             m1, m3
    punpcklwd            m10, m4, m6
    punpckhwd             m4, m6
    punpcklwd             m6, m5, m7
    punpckhwd             m5, m7
    punpckhwd             m3, m0, m9
    punpcklwd             m0, m9
    punpckhwd             m9, m2, m1
    punpcklwd             m2, m1
    punpcklwd             m7, m10, m4
    punpckhwd            m10, m4
    punpcklwd             m4, m5, m6
    punpckhwd             m5, m6
    punpckhdq             m1, m0, m2
    punpckldq             m0, m2
    punpckldq             m2, m3, m9
    punpckhdq             m3, m9
    punpckldq             m6, m7, m4
    punpckhdq             m7, m4
    punpckldq             m9, m10, m5
    punpckhdq            m10, m5
    REPX    {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
    pmulhrsw             m12, [rsp+32*0]
    mova          [rsp+32*0], m8
    vperm2i128            m4, m0, m6, 0x31
    vinserti128           m0, xm6, 1
    vperm2i128            m5, m1, m7, 0x31
    vinserti128           m1, xm7, 1
    vperm2i128            m6, m2, m9, 0x31
    vinserti128           m2, xm9, 1
    vperm2i128            m7, m3, m10, 0x31
    vinserti128           m3, xm10, 1
    call m(idct_16x8_internal_8bpc).main
    vpbroadcastd          m8, [o(pw_2048)]
    REPX     {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    lea                   r2, [strideq*3]
    WRITE_16X2             0, 1, 8, 0, strideq*0, strideq*1
    WRITE_16X2             2, 3, 0, 1, strideq*2, r2
    lea                   r3, [dstq+strideq*4]
    %define dstq r3
    WRITE_16X2             4, 5, 0, 1, strideq*0, strideq*1
    WRITE_16X2             6, 7, 0, 1, strideq*2, r2
    mova                  m0, [rsp+32*0]
    mova                  m1, [rsp+32*1]
    mova                  m2, [rsp+32*2]
    punpckhwd             m7, m0, m2
    punpcklwd             m0, m2
    punpckhwd             m2, m1, m11
    punpcklwd             m1, m11
    punpckhwd             m4, m12, m14
    punpcklwd            m12, m14
    punpckhwd             m5, m13, m15
    punpcklwd            m13, m15
    punpckhwd             m3, m0, m7
    punpcklwd             m0, m7
    punpckhwd             m9, m2, m1
    punpcklwd             m2, m1
    punpcklwd             m7, m12, m4
    punpckhwd            m12, m4
    punpcklwd             m4, m5, m13
    punpckhwd             m5, m13
    punpckhdq             m1, m0, m2
    punpckldq             m0, m2
    punpckldq             m2, m3, m9
    punpckhdq             m3, m9
    punpckldq             m6, m7, m4
    punpckhdq             m7, m4
    punpckldq             m9, m12, m5
    punpckhdq            m12, m5
    vperm2i128            m4, m0, m6, 0x31
    vinserti128           m0, xm6, 1
    vperm2i128            m5, m1, m7, 0x31
    vinserti128           m1, xm7, 1
    vperm2i128            m6, m2, m9, 0x31
    vinserti128           m2, xm9, 1
    vperm2i128            m7, m3, m12, 0x31
    vinserti128           m3, xm12, 1
    call m(idct_16x8_internal_8bpc).main2
    vpbroadcastd          m8, [o(pw_2048)]
    REPX     {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    add                   r0, 16
    add                   r3, 16
    %define dstq r0
    WRITE_16X2             0, 1, 8, 0, strideq*0, strideq*1
    WRITE_16X2             2, 3, 0, 1, strideq*2, r2
    %define dstq r3
    WRITE_16X2             4, 5, 0, 1, strideq*0, strideq*1
    WRITE_16X2             6, 7, 0, 1, strideq*2, r2
    RET

; 8x32 identity+identity: (x + 2) >> 2 scaling done as paddsw 5 then psraw 3
; (with rounding folded in); processes 8x16 halves per loop iteration.
cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
    vpbroadcastd          m9, [pw_5]
    lea                   r4, [strideq*3]
    sub                 eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
.loop:
    mova                 xm0, [cq+16* 0]
    mova                 xm1, [cq+16* 4]
    vinserti128           m0, [cq+16* 1], 1
    vinserti128           m1, [cq+16* 5], 1
    pxor                  m8, m8
    mova           [cq+32*0], m8
    mova           [cq+32*2], m8
    add                   cq, 16*16
    mova                 xm2, [cq-16* 8]
    mova                 xm3, [cq-16* 4]
    vinserti128           m2, [cq-16* 7], 1
    vinserti128           m3, [cq-16* 3], 1
    mova                 xm4, [cq+16* 0]
    mova                 xm5, [cq+16* 4]
    vinserti128           m4, [cq+16* 1], 1
    vinserti128           m5, [cq+16* 5], 1
    mova                 xm6, [cq+16* 8]
    mova                 xm7, [cq+16*12]
    vinserti128           m6, [cq+16* 9], 1
    vinserti128           m7, [cq+16*13], 1
    REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
    REPX      {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    call .transpose8x8
    REPX      {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    WRITE_8X4              0, 4, 8, 10, strideq*8, strideq*4, r4*4
    add                 dstq, strideq
    WRITE_8X4              1, 5, 0, 4, strideq*8, strideq*4, r4*4
    add                 dstq, strideq
    WRITE_8X4              2, 6, 0, 4, strideq*8, strideq*4, r4*4
    add                 dstq, strideq
    WRITE_8X4              3, 7, 0, 4, strideq*8, strideq*4, r4*4
    add                 dstq, strideq
    sub                   cq, 16*16-32
    lea                 dstq, [dstq+r4*4]
    add                 eobd, 0x80000000 ; second add overflows (sets CF... checked via jnc) after two iterations
    jnc .loop
    RET
ALIGN function_align
.transpose8x8: ; in-register 8x8 word transpose of m0-m7 (m8 is scratch)
    punpckhwd             m8, m4, m5
    punpcklwd             m4, m5
    punpckhwd             m5, m0, m1
    punpcklwd             m0, m1
    punpckhwd             m1, m6, m7
    punpcklwd             m6, m7
    punpckhwd             m7, m2, m3
    punpcklwd             m2, m3
    punpckhdq             m3, m0, m2
    punpckldq             m0, m2
    punpckldq             m2, m4, m6
    punpckhdq             m4, m6
    punpckhdq             m6, m5, m7
    punpckldq             m5, m7
    punpckldq             m7, m8, m1
    punpckhdq             m8, m1
    punpckhqdq            m1, m0, m2
    punpcklqdq            m0, m2
    punpcklqdq            m2, m3, m4
    punpckhqdq            m3, m4
    punpcklqdq            m4, m5, m7
    punpckhqdq            m5, m7
    punpckhqdq            m7, m6, m8
    punpcklqdq            m6, m8
    ret

; 32x8 identity+identity: 4096/32768 (i.e. >>3 with rounding) scaling via
; pmulhrsw; two 16-column halves per loop iteration.
cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
    add                   cq, 16*8
    vpbroadcastd          m9, [pw_4096]
    lea                   r4, [strideq*3]
    lea                   r5, [dstq+strideq*4]
    sub                 eobd, 107
.loop:
    mova                 xm0, [cq-16*8]
    mova                 xm1, [cq-16*7]
    vinserti128           m0, [cq+16*0], 1
    vinserti128           m1, [cq+16*1], 1
    mova                 xm2, [cq-16*6]
    mova                 xm3, [cq-16*5]
    vinserti128           m2, [cq+16*2], 1
    vinserti128           m3, [cq+16*3], 1
    mova                 xm4, [cq-16*4]
    mova                 xm5, [cq-16*3]
    vinserti128           m4, [cq+16*4], 1
    vinserti128           m5, [cq+16*5], 1
    mova                 xm6, [cq-16*2]
    mova                 xm7, [cq-16*1]
    vinserti128           m6, [cq+16*6], 1
    vinserti128           m7, [cq+16*7], 1
    pxor                  m8, m8
    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    REPX     {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    WRITE_16X2             0, 1, 8, 0, strideq*0, strideq*1
    WRITE_16X2             2, 3, 0, 1, strideq*2, r4
    %define dstq r5
    WRITE_16X2             4, 5, 0, 1, strideq*0, strideq*1
    WRITE_16X2             6, 7, 0, 1, strideq*2, r4
    add                   cq, 16*16
    add                   r0, 16
    add                   r5, 16
    add                 eobd, 0x80000000
    jnc .loop
    RET

%define o_base pw_5 + 128

%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
    ; NOTE(review): macro is truncated at this chunk boundary; remainder
    ; (including the zero_coefs handling) lies past the visible range.
%if %3
    vpbroadcastd         m15, [o(pw_2896x8)]
    pmulhrsw              m0, m15, [%1+%2* 0]
    pmulhrsw              m1, m15, [%1+%2* 1]
    pmulhrsw              m2, m15, [%1+%2* 2]
    pmulhrsw              m3, m15, [%1+%2* 3]
    pmulhrsw              m4, m15, [%1+%2* 4]
    pmulhrsw              m5, m15, [%1+%2* 5]
    pmulhrsw              m6, m15, [%1+%2* 6]
    pmulhrsw              m7, m15, [%1+%2* 7]
    pmulhrsw              m8, m15, [%1+%2* 8]
    pmulhrsw              m9, m15, [%1+%2* 9]
    pmulhrsw             m10, m15, [%1+%2*10]
    pmulhrsw             m11, m15, [%1+%2*11]
3561 pmulhrsw m12, m15, [%1+%2*12] 3562 pmulhrsw m13, m15, [%1+%2*13] 3563 pmulhrsw m14, m15, [%1+%2*14] 3564 pmulhrsw m15, [%1+%2*15] 3565%else 3566 mova m0, [%1+%2* 0] 3567 mova m1, [%1+%2* 1] 3568 mova m2, [%1+%2* 2] 3569 mova m3, [%1+%2* 3] 3570 mova m4, [%1+%2* 4] 3571 mova m5, [%1+%2* 5] 3572 mova m6, [%1+%2* 6] 3573 mova m7, [%1+%2* 7] 3574 mova m8, [%1+%2* 8] 3575 mova m9, [%1+%2* 9] 3576 mova m10, [%1+%2*10] 3577 mova m11, [%1+%2*11] 3578 mova m12, [%1+%2*12] 3579 mova m13, [%1+%2*13] 3580 mova m14, [%1+%2*14] 3581 mova m15, [%1+%2*15] 3582%endif 3583 mova [rsp], m15 3584%if %4 3585 pxor m15, m15 3586 REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ 3587 8, 9, 10, 11, 12, 13, 14, 15 3588%endif 3589%endmacro 3590 3591%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] 3592 mova m%4, [%2] 3593 paddsw m%3, m%1, m%4 3594 psubsw m%1, m%4 3595 pmovzxbw m%4, [dstq+%6] 3596 pmulhrsw m%3, m%5 3597 pmulhrsw m%1, m%5 3598 paddw m%3, m%4 3599 pmovzxbw m%4, [r2+%7] 3600 paddw m%1, m%4 3601 packuswb m%3, m%1 3602 vpermq m%3, m%3, q3120 3603 mova [dstq+%6], xm%3 3604 vextracti128 [r2+%7], m%3, 1 3605%endmacro 3606 3607cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob 3608 lea r6, [o_base] 3609 test eobd, eobd 3610 jz .dconly 3611 PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ 3612 base, tmp3 3613 %undef cmp 3614 LOAD_16ROWS cq, 64, 1 3615 call m(idct_16x16_internal_8bpc).main 3616 lea tmp1q, [rsp+32*7] 3617 lea tmp2q, [tmp1q+32*8] 3618 lea tmp3q, [tmp1q+32*16] 3619 mova m1, [rsp+32*1] 3620 mova [rsp+32*0], m6 3621 mova [rsp+32*1], m7 3622 vpbroadcastd m7, [o(pw_16384)] 3623 call .transpose_2x8x8_round 3624 mova m15, [rsp+32*0] 3625 mova [tmp3q-32*4+ 0], xm0 3626 vextracti128 [tmp3q+32*0+ 0], m0, 1 3627 mova [tmp3q-32*3+ 0], xm2 3628 vextracti128 [tmp3q+32*1+ 0], m2, 1 3629 mova [tmp3q-32*2+ 0], xm4 3630 vextracti128 [tmp3q+32*2+ 0], m4, 1 3631 mova [tmp3q-32*1+ 0], xm6 3632 vextracti128 [tmp3q+32*3+ 0], m6, 1 3633 
mova [tmp3q-32*4+16], xm8 3634 vextracti128 [tmp3q+32*0+16], m8, 1 3635 mova [tmp3q-32*3+16], xm10 3636 vextracti128 [tmp3q+32*1+16], m10, 1 3637 mova [tmp3q-32*2+16], xm12 3638 vextracti128 [tmp3q+32*2+16], m12, 1 3639 mova [tmp3q-32*1+16], xm14 3640 vextracti128 [tmp3q+32*3+16], m14, 1 3641 cmp eobd, 150 3642 jg .full 3643 vinserti128 m0, m1, xm9, 1 3644 vperm2i128 m4, m1, m9, 0x31 3645 vinserti128 m2, m5, xm13, 1 3646 vperm2i128 m6, m5, m13, 0x31 3647 vinserti128 m1, m3, xm11, 1 3648 vperm2i128 m5, m3, m11, 0x31 3649 vinserti128 m3, m7, xm15, 1 3650 vperm2i128 m7, m7, m15, 0x31 3651 call .main_oddhalf_fast 3652 pxor m8, m8 3653 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 3654 jmp .idct16 3655.dconly: 3656 movd xm1, [o(pw_2896x8)] 3657 pmulhrsw xm0, xm1, [cq] 3658 movd xm2, [o(pw_16384)] 3659 mov [cq], eobd 3660 pmulhrsw xm0, xm1 3661 or r3d, 32 3662 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly 3663.full: 3664 mova [tmp1q-32*4], m1 3665 mova [tmp1q-32*3], m3 3666 mova [tmp1q-32*2], m5 3667 mova [tmp1q-32*1], m7 3668 mova [tmp1q+32*0], m9 3669 mova [tmp1q+32*1], m11 3670 mova [tmp1q+32*2], m13 3671 mova [tmp1q+32*3], m15 3672 LOAD_16ROWS cq+32, 64, 1 3673 call m(idct_16x16_internal_8bpc).main 3674 lea r2, [tmp3q+32*8] 3675 mova m1, [rsp+32*1] 3676 mova [rsp+32*0], m6 3677 mova [rsp+32*1], m7 3678 vpbroadcastd m7, [o(pw_16384)] 3679 call .transpose_2x8x8_round 3680 mova m15, [rsp+32*0] 3681 mova [r2-32*4+ 0], xm0 3682 vextracti128 [r2+32*0+ 0], m0, 1 3683 mova [r2-32*3+ 0], xm2 3684 vextracti128 [r2+32*1+ 0], m2, 1 3685 mova [r2-32*2+ 0], xm4 3686 vextracti128 [r2+32*2+ 0], m4, 1 3687 mova [r2-32*1+ 0], xm6 3688 vextracti128 [r2+32*3+ 0], m6, 1 3689 mova [r2-32*4+16], xm8 3690 vextracti128 [r2+32*0+16], m8, 1 3691 mova [r2-32*3+16], xm10 3692 vextracti128 [r2+32*1+16], m10, 1 3693 mova [r2-32*2+16], xm12 3694 vextracti128 [r2+32*2+16], m12, 1 3695 mova [r2-32*1+16], xm14 3696 vextracti128 [r2+32*3+16], m14, 1 3697 vinserti128 m8, m1, xm9, 1 3698 vperm2i128 
m12, m1, m9, 0x31 3699 mova xm0, [tmp1q-32*4] 3700 mova xm1, [tmp1q-32*3] 3701 vinserti128 m0, [tmp1q+32*0], 1 3702 vinserti128 m1, [tmp1q+32*1], 1 3703 vinserti128 m10, m5, xm13, 1 3704 vperm2i128 m14, m5, m13, 0x31 3705 mova xm4, [tmp1q-32*4+16] 3706 mova xm5, [tmp1q-32*3+16] 3707 vinserti128 m4, [tmp1q+32*0+16], 1 3708 vinserti128 m5, [tmp1q+32*1+16], 1 3709 vinserti128 m9, m3, xm11, 1 3710 vperm2i128 m13, m3, m11, 0x31 3711 mova xm2, [tmp1q-32*2] 3712 mova xm3, [tmp1q-32*1] 3713 vinserti128 m2, [tmp1q+32*2], 1 3714 vinserti128 m3, [tmp1q+32*3], 1 3715 vinserti128 m11, m7, xm15, 1 3716 vperm2i128 m15, m7, m15, 0x31 3717 mova xm6, [tmp1q-32*2+16] 3718 mova xm7, [tmp1q-32*1+16] 3719 vinserti128 m6, [tmp1q+32*2+16], 1 3720 vinserti128 m7, [tmp1q+32*3+16], 1 3721 call .main_oddhalf 3722 LOAD_8ROWS_H r2-32*4, 32 3723.idct16: 3724 LOAD_8ROWS tmp3q-32*4, 32 3725 mova [rsp], m15 3726 call m(idct_16x16_internal_8bpc).main 3727 imul r2, strideq, 19 3728 lea r3, [strideq*3] 3729 add r2, dstq 3730 call .pass2_end 3731 RET 3732ALIGN function_align 3733cglobal_label .main_oddhalf_fast ; lower half is zero 3734 mova [rsp+gprsize+32*1], m7 3735 pxor m7, m7 3736 mova [rsp+gprsize+32*0], m7 3737 mova [rsp+gprsize+32*2], m7 3738 vpbroadcastd m11, [o(pw_3703x8)] 3739 vpbroadcastd m7, [o(pw_1751x8)] 3740 vpbroadcastd m12, [o(pw_m1380x8)] 3741 vpbroadcastd m8, [o(pw_3857x8)] 3742 vpbroadcastd m13, [o(pw_3973x8)] 3743 vpbroadcastd m15, [o(pw_995x8)] 3744 pmulhrsw m11, m4 ; t29a 3745 pmulhrsw m4, m7 ; t18a 3746 pmulhrsw m12, m3 ; t19a 3747 pmulhrsw m3, m8 ; t28a 3748 pmulhrsw m13, m2 ; t27a 3749 pmulhrsw m2, m15 ; t20a 3750 vpbroadcastd m10, [o(pw_m2106x8)] 3751 vpbroadcastd m7, [o(pw_3513x8)] 3752 vpbroadcastd m9, [o(pw_3290x8)] 3753 vpbroadcastd m8, [o(pw_2440x8)] 3754 vpbroadcastd m14, [o(pw_m601x8)] 3755 vpbroadcastd m15, [o(pw_4052x8)] 3756 pmulhrsw m10, m5 ; t21a 3757 pmulhrsw m5, m7 ; t26a 3758 pmulhrsw m9, m6 ; t25a 3759 pmulhrsw m6, m8 ; t22a 3760 pmulhrsw m14, m1 ; t23a 3761 
pmulhrsw m1, m15 ; t24a 3762 vpbroadcastd m15, [o(pd_2048)] 3763 jmp .main2 3764ALIGN function_align 3765cglobal_label .main_oddhalf 3766 mova [rsp+gprsize+32*0], m15 3767 mova [rsp+gprsize+32*1], m7 3768 mova [rsp+gprsize+32*2], m8 3769 vpbroadcastd m15, [o(pd_2048)] 3770 ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a 3771 ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a 3772 ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a 3773 ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a 3774 ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a 3775 ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a 3776.main2: 3777 psubsw m7, m12, m4 ; t18 3778 paddsw m12, m4 ; t19 3779 psubsw m4, m2, m10 ; t21 3780 paddsw m2, m10 ; t20 3781 psubsw m10, m14, m6 ; t22 3782 paddsw m14, m6 ; t23 3783 psubsw m6, m1, m9 ; t25 3784 paddsw m1, m9 ; t24 3785 psubsw m9, m13, m5 ; t26 3786 paddsw m13, m5 ; t27 3787 psubsw m5, m3, m11 ; t29 3788 paddsw m3, m11 ; t28 3789 ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a 3790 ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a 3791 ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a 3792 psubsw m8, m14, m2 ; t20a 3793 paddsw m14, m2 ; t23a 3794 psubsw m2, m1, m13 ; t27a 3795 paddsw m1, m13 ; t24a 3796 psubsw m13, m6, m9 ; t21 3797 paddsw m6, m9 ; t22 3798 psubsw m9, m10, m4 ; t26 3799 paddsw m10, m4 ; t25 3800 ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 3801 ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a 3802 mova m4, [rsp+gprsize+32*0] ; in31 3803 mova [rsp+gprsize+32*0], m6 ; t22 3804 mova m6, [rsp+gprsize+32*1] ; in15 3805 mova [rsp+gprsize+32*1], m14 ; t23a 3806 mova m14, [rsp+gprsize+32*2] ; in17 3807 mova [rsp+gprsize+32*2], m1 ; t24a 3808 ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a 3809 ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a 3810 psubsw m1, m0, m14 ; t17 3811 paddsw m0, m14 ; t16 3812 psubsw m14, m4, m6 ; t30 3813 paddsw m4, m6 ; t31 3814 ITX_MULSUB_2W 14, 
1, 6, 11, 15, 799, 4017 ; t17a, t30a 3815 psubsw m6, m0, m12 ; t19a 3816 paddsw m0, m12 ; t16a 3817 psubsw m12, m4, m3 ; t28a 3818 paddsw m4, m3 ; t31a 3819 psubsw m3, m14, m5 ; t18 3820 paddsw m14, m5 ; t17 3821 psubsw m5, m1, m7 ; t29 3822 paddsw m1, m7 ; t30 3823 ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a 3824 ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 3825 psubsw m7, m1, m10 ; t25a 3826 paddsw m1, m10 ; t30a 3827 psubsw m10, m5, m9 ; t21 3828 paddsw m5, m9 ; t18 3829 psubsw m9, m12, m2 ; t20a 3830 paddsw m12, m2 ; t19a 3831 psubsw m2, m3, m13 ; t26 3832 paddsw m3, m13 ; t29 3833 psubsw m13, m6, m8 ; t27a 3834 paddsw m6, m8 ; t28a 3835 mova [tmp1q-32*2], m5 3836 mova [tmp1q-32*1], m12 3837 mova [tmp2q+32*0], m6 3838 mova [tmp2q+32*1], m3 3839 mova [tmp2q+32*2], m1 3840 mova m5, [rsp+gprsize+32*0] ; t22 3841 mova m6, [rsp+gprsize+32*1] ; t23 3842 mova m3, [rsp+gprsize+32*2] ; t24a 3843 psubsw m1, m14, m5 ; t22a 3844 paddsw m14, m5 ; t17a 3845 psubsw m5, m0, m6 ; t23 3846 paddsw m0, m6 ; t16 3847 psubsw m6, m4, m3 ; t24 3848 paddsw m4, m3 ; t31 3849 vpbroadcastd m8, [o(pw_m2896_2896)] 3850 vpbroadcastd m3, [o(pw_2896_2896)] 3851 mova [tmp1q-32*4], m0 3852 mova [tmp1q-32*3], m14 3853 mova [tmp2q+32*3], m4 3854 ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 3855 ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a 3856 ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 3857 ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a 3858 mova [tmp1q+32*0], m13 3859 mova [tmp1q+32*1], m2 3860 mova [tmp1q+32*2], m7 3861 mova [tmp1q+32*3], m6 3862 mova [tmp2q-32*4], m5 3863 mova [tmp2q-32*3], m1 3864 mova [tmp2q-32*2], m10 3865 mova [tmp2q-32*1], m9 3866 ret 3867ALIGN function_align 3868.transpose_2x8x8_round: 3869 punpckhwd m6, m12, m13 3870 punpcklwd m12, m13 3871 punpckhwd m13, m8, m9 3872 punpcklwd m8, m9 3873 punpckhwd m9, m14, m15 3874 punpcklwd m14, m15 3875 punpckhwd m15, m10, m11 3876 punpcklwd m10, m11 3877 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 3878 
punpckhdq m11, m8, m10 3879 punpckldq m8, m10 3880 punpckldq m10, m12, m14 3881 punpckhdq m12, m14 3882 punpckhdq m14, m13, m15 3883 punpckldq m13, m15 3884 punpckldq m15, m6, m9 3885 punpckhdq m6, m9 3886 punpckhqdq m9, m8, m10 3887 punpcklqdq m8, m10 3888 punpcklqdq m10, m11, m12 3889 punpckhqdq m11, m12 3890 punpcklqdq m12, m13, m15 3891 punpckhqdq m13, m15 3892 punpckhqdq m15, m14, m6 3893 punpcklqdq m14, m6 3894 pmulhrsw m6, m7, [rsp+gprsize+32*0] 3895 REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 3896 pmulhrsw m7, [rsp+gprsize+32*1] 3897 mova [rsp+gprsize+32*0], m15 3898 punpckhwd m15, m4, m5 3899 punpcklwd m4, m5 3900 punpckhwd m5, m0, m1 3901 punpcklwd m0, m1 3902 punpckhwd m1, m6, m7 3903 punpcklwd m6, m7 3904 punpckhwd m7, m2, m3 3905 punpcklwd m2, m3 3906 punpckhdq m3, m0, m2 3907 punpckldq m0, m2 3908 punpckldq m2, m4, m6 3909 punpckhdq m4, m6 3910 punpckhdq m6, m5, m7 3911 punpckldq m5, m7 3912 punpckldq m7, m15, m1 3913 punpckhdq m15, m1 3914 punpckhqdq m1, m0, m2 3915 punpcklqdq m0, m2 3916 punpcklqdq m2, m3, m4 3917 punpckhqdq m3, m4 3918 punpcklqdq m4, m5, m7 3919 punpckhqdq m5, m7 3920 punpckhqdq m7, m6, m15 3921 punpcklqdq m6, m15 3922 ret 3923ALIGN function_align 3924.pass2_end: 3925 mova [rsp+gprsize+32*0], m7 3926 mova [rsp+gprsize+32*2], m15 3927 vpbroadcastd m15, [o(pw_2048)] 3928 IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 3929 IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 3930 IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 3931 IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 3932 add dstq, strideq 3933 sub r2, strideq 3934 mova m1, [rsp+gprsize+32*1] 3935 IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 3936 IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 3937 IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 3938 IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 3939 add dstq, strideq 3940 sub r2, strideq 3941 
IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 3942 IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 3943 IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 3944 IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 3945 add dstq, strideq 3946 sub r2, strideq 3947 mova m7, [rsp+gprsize+32*0] 3948 mova m1, [rsp+gprsize+32*2] 3949 IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 3950 IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 3951 IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 3952 IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 3953 ret 3954 3955; Perform the final sumsub step and YMM lane shuffling 3956%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] 3957 mova m%3, [tmp2q+32*( 3-%1)] 3958 psubsw m%4, m%1, m%3 3959 paddsw m%1, m%3 3960 mova m%3, [tmp1q+32*(11-%2)] 3961 mova [tmp1q+32*(11-%2)+16], xm%4 3962 vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 3963 paddsw m%4, m%2, m%3 3964 psubsw m%2, m%3 3965 mova [tmp1q+32*(11-%2)], xm%2 3966 vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 3967 vperm2i128 m%2, m%1, m%4, 0x31 3968 vinserti128 m%1, xm%4, 1 3969%endmacro 3970 3971cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob 3972 lea r6, [o_base] 3973 test eobd, eobd 3974 jnz .normal 3975 movd xm1, [o(pw_2896x8)] 3976 pmulhrsw xm0, xm1, [cq] 3977 movd xm2, [o(pw_16384)] 3978 mov [cq], eobd 3979 pmulhrsw xm0, xm1 3980 or r3d, 16 3981 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly 3982.normal: 3983 PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 3984 vpbroadcastd m15, [o(pw_2896x8)] 3985 pmulhrsw m0, m15, [cq+32* 1] 3986 pmulhrsw m1, m15, [cq+32* 3] 3987 pmulhrsw m2, m15, [cq+32* 5] 3988 pmulhrsw m3, m15, [cq+32* 7] 3989 pmulhrsw m4, m15, [cq+32* 9] 3990 pmulhrsw m5, m15, [cq+32*11] 3991 pmulhrsw m6, m15, [cq+32*13] 3992 pmulhrsw m7, m15, [cq+32*15] 3993 pmulhrsw m8, m15, [cq+32*17] 3994 pmulhrsw m9, m15, [cq+32*19] 3995 pmulhrsw m10, m15, [cq+32*21] 3996 
pmulhrsw m11, m15, [cq+32*23] 3997 pmulhrsw m12, m15, [cq+32*25] 3998 pmulhrsw m13, m15, [cq+32*27] 3999 pmulhrsw m14, m15, [cq+32*29] 4000 pmulhrsw m15, [cq+32*31] 4001 lea tmp1q, [rsp+32*7] 4002 lea tmp2q, [tmp1q+32*8] 4003 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf 4004 LOAD_16ROWS cq+32*0, 32*2, 1, 0 4005 pxor m15, m15 4006 mov r3d, 8 4007.zero_loop: 4008 mova [cq+32*0], m15 4009 mova [cq+32*1], m15 4010 mova [cq+32*2], m15 4011 mova [cq+32*3], m15 4012 add cq, 32*4 4013 dec r3d 4014 jg .zero_loop 4015 call m(idct_16x16_internal_8bpc).main 4016 call .pass1_end 4017 lea r2, [strideq*3] 4018 mov r3, dstq 4019.pass2: 4020 vpbroadcastd m7, [o(pw_16384)] 4021 call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round 4022 call m(idct_16x16_internal_8bpc).main 4023 mova [rsp+32*2], m15 4024 vpbroadcastd m15, [o(pw_2048)] 4025 REPX {pmulhrsw x, m15}, m2, m3, m0 4026 WRITE_16X2 2, 3, 1, 2, strideq*2, r2 4027 pmulhrsw m1, m15, [rsp+32*1] 4028 WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 4029 lea dstq, [dstq+strideq*4] 4030 REPX {pmulhrsw x, m15}, m4, m5, m6, m7 4031 WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 4032 WRITE_16X2 6, 7, 2, 3, strideq*2, r2 4033 lea dstq, [dstq+strideq*4] 4034 REPX {pmulhrsw x, m15}, m8, m9, m10, m11 4035 WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 4036 WRITE_16X2 10, 11, 2, 3, strideq*2, r2 4037 lea dstq, [dstq+strideq*4] 4038 REPX {pmulhrsw x, m15}, m11, m12, m13, m14 4039 pmulhrsw m15, [rsp+32*2] 4040 WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 4041 WRITE_16X2 14, 15, 2, 3, strideq*2, r2 4042 test r3, r3 4043 jnz .right_half 4044 RET 4045.right_half: 4046 LOAD_8ROWS tmp1q-32*4, 32 4047 LOAD_8ROWS_H tmp2q-32*4, 32 4048 lea dstq, [r3+16] 4049 xor r3d, r3d 4050 mova [rsp+32*0], m6 4051 mova [rsp+32*1], m7 4052 jmp .pass2 4053ALIGN function_align 4054.pass1_end: 4055 mova [rsp+gprsize+32*0], m9 4056 IDCT32_PASS1_END 0, 8, 1, 9 4057 IDCT32_PASS1_END 2, 10, 1, 9 4058 IDCT32_PASS1_END 3, 11, 1, 9 4059 IDCT32_PASS1_END 4, 12, 1, 9 
4060 IDCT32_PASS1_END 5, 13, 1, 9 4061 IDCT32_PASS1_END 6, 14, 1, 9 4062 IDCT32_PASS1_END 7, 15, 1, 9 4063 mova m1, [rsp+gprsize+32*1] 4064 mova m9, [rsp+gprsize+32*0] 4065 mova [rsp+gprsize+32*0], m6 4066 mova [rsp+gprsize+32*1], m7 4067 IDCT32_PASS1_END 1, 9, 6, 7 4068 ret 4069 4070cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob 4071%undef cmp 4072 lea r6, [o_base] 4073 vpbroadcastd m9, [o(pw_2896x8)] 4074 vpbroadcastd m10, [o(pw_1697x16)] 4075 vpbroadcastd m12, [o(pw_8192)] 4076 cmp eobd, 43 ; if (eob > 43) 4077 setg r4b ; iteration_count++ 4078 cmp eobd, 150 ; if (eob > 150) 4079 setg al ; iteration_count++ 4080 add eobd, -279 ; if (eob > 278) 4081 adc r4b, al ; iteration_count++ 4082 lea r3, [strideq*3] 4083 mov r6, cq 4084 paddw m11, m12, m12 ; pw_16384 4085.loop: 4086 mova xm0, [cq+64* 0] 4087 mova xm1, [cq+64* 1] 4088 vinserti128 m0, [cq+64* 8], 1 4089 vinserti128 m1, [cq+64* 9], 1 4090 mova xm2, [cq+64* 2] 4091 mova xm3, [cq+64* 3] 4092 vinserti128 m2, [cq+64*10], 1 4093 vinserti128 m3, [cq+64*11], 1 4094 mova xm4, [cq+64* 4] 4095 mova xm5, [cq+64* 5] 4096 vinserti128 m4, [cq+64*12], 1 4097 vinserti128 m5, [cq+64*13], 1 4098 mova xm6, [cq+64* 6] 4099 mova xm7, [cq+64* 7] 4100 vinserti128 m6, [cq+64*14], 1 4101 vinserti128 m7, [cq+64*15], 1 4102 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 4103 REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 4104 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 4105 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 4106 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 4107 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 4108 lea dstq, [dstq+strideq*4] 4109 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 4110 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 4111 lea dstq, [dstq+strideq*4] 4112 add cq, 16 4113 dec r4b 4114 jge .loop 4115 sub cq, 32 4116 pxor m0, m0 4117 mov r0d, 8 4118 cmp cq, r6 4119 ja .zero_loop 4120.zero_loop_half: 4121 mova [r6+64*0], m0 4122 mova [r6+64*1], m0 4123 add 
r6, 64*4 4124 mova [r6-64*2], m0 4125 mova [r6-64*1], m0 4126 sub r0d, 2 4127 jg .zero_loop_half 4128 RET 4129.zero_loop: 4130 mova [r6+32*0], m0 4131 mova [r6+32*1], m0 4132 mova [r6+32*2], m0 4133 mova [r6+32*3], m0 4134 add r6, 32*4 4135 dec r0d 4136 jg .zero_loop 4137 RET 4138 4139cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob 4140%undef cmp 4141 lea r6, [o_base] 4142 vpbroadcastd m9, [o(pw_2896x8)] 4143 vpbroadcastd m10, [o(pw_1697x16)] 4144 vpbroadcastd m11, [o(pw_2048)] 4145 cmp eobd, 35 ; if (eob > 35) 4146 setg r4b ; iteration_count++ 4147 cmp eobd, 150 ; if (eob > 150) 4148 setg r3b ; iteration_count += 2 4149 lea r4d, [r4+r3*2] 4150 lea r3, [strideq*3] 4151 mov r5, dstq 4152 mov r6, cq 4153.loop: 4154 mova xm0, [cq+32* 0] 4155 mova xm1, [cq+32* 1] 4156 vinserti128 m0, [cq+32* 8], 1 4157 vinserti128 m1, [cq+32* 9], 1 4158 mova xm2, [cq+32* 2] 4159 mova xm3, [cq+32* 3] 4160 vinserti128 m2, [cq+32*10], 1 4161 vinserti128 m3, [cq+32*11], 1 4162 mova xm4, [cq+32* 4] 4163 mova xm5, [cq+32* 5] 4164 vinserti128 m4, [cq+32*12], 1 4165 vinserti128 m5, [cq+32*13], 1 4166 mova xm6, [cq+32* 6] 4167 mova xm7, [cq+32* 7] 4168 vinserti128 m6, [cq+32*14], 1 4169 vinserti128 m7, [cq+32*15], 1 4170 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 4171 REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 4172 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 4173 REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 4174 REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 4175 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 4176 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 4177 lea dstq, [dstq+strideq*4] 4178 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 4179 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 4180 lea dstq, [dstq+strideq*4] 4181 add cq, 16 4182 dec r4b 4183 jl .ret 4184 test r4b, 1 4185 jz .loop 4186 add cq, 32*15 4187 lea dstq, [r5+16] 4188 jmp .loop 4189.ret: 4190 sub cd, eax 4191 pxor m0, m0 4192 add cd, 384 4193.zero_loop: 4194 mova 
[r6+32*0], m0 4195 mova [r6+32*1], m0 4196 mova [r6+32*2], m0 4197 mova [r6+32*3], m0 4198 add r6, 32*4 4199 sub cd, 128 4200 jge .zero_loop 4201 RET 4202 4203cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob 4204 lea r6, [o_base] 4205 test eobd, eobd 4206 jnz .normal 4207 movd xm1, [o(pw_2896x8)] 4208 pmulhrsw xm0, xm1, [cq] 4209 movd xm2, [o(pw_8192)] 4210 mov [cq], eobd 4211 or r3d, 32 4212 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly 4213.normal: 4214 PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ 4215 base, tmp3, tmp4 4216 %undef cmp 4217 lea tmp1q, [rsp+32*7] 4218 lea tmp2q, [tmp1q+32*8] 4219 sub eobd, 136 4220 mov tmp4d, eobd 4221.pass1_loop: 4222 LOAD_8ROWS cq+64*1, 64*2 4223 pxor m8, m8 4224 REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 4225 test tmp4d, tmp4d 4226 jl .fast 4227 LOAD_8ROWS_H cq+64*17, 64*2 4228 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf 4229 LOAD_8ROWS_H cq+64*16, 64*2 4230 pxor m0, m0 4231 REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ 4232 24, 25, 26, 27, 28, 29, 30, 31 4233 mova [rsp], m15 4234 jmp .idct16 4235.fast: 4236 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 4237 pxor m8, m8 4238 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 4239 mova [rsp], m8 4240.idct16: 4241 LOAD_8ROWS cq+64*0, 64*2 4242 pxor m15, m15 4243 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 4244 call m(idct_16x16_internal_8bpc).main 4245 call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end 4246 vpbroadcastd m7, [o(pw_8192)] 4247 call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round 4248 lea tmp3q, [tmp1q+32*32] 4249 mova m15, [rsp] 4250 mova [tmp3q-32*4], m0 4251 mova [tmp3q-32*3], m2 4252 mova [tmp3q-32*2], m4 4253 mova [tmp3q-32*1], m6 4254 mova [tmp3q+32*0], m8 4255 mova [tmp3q+32*1], m10 4256 mova [tmp3q+32*2], m12 4257 mova [tmp3q+32*3], m14 4258 add tmp3q, 32*8 4259 mova [tmp3q-32*4], m1 4260 mova [tmp3q-32*3], m3 4261 mova [tmp3q-32*2], m5 4262 mova [tmp3q-32*1], m7 
4263 mova [tmp3q+32*0], m9 4264 mova [tmp3q+32*1], m11 4265 mova [tmp3q+32*2], m13 4266 mova [tmp3q+32*3], m15 4267 vpbroadcastd m9, [o(pw_8192)] 4268 pmulhrsw m0, m9, [tmp1q-32*4] 4269 pmulhrsw m1, m9, [tmp1q-32*3] 4270 pmulhrsw m2, m9, [tmp1q-32*2] 4271 pmulhrsw m3, m9, [tmp1q-32*1] 4272 pmulhrsw m4, m9, [tmp1q+32*0] 4273 pmulhrsw m5, m9, [tmp1q+32*1] 4274 pmulhrsw m6, m9, [tmp1q+32*2] 4275 pmulhrsw m7, m9, [tmp1q+32*3] 4276 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 4277 mova [tmp1q-32*4], m0 4278 pmulhrsw m0, m9, [tmp2q-32*4] 4279 mova [tmp2q-32*4], m1 4280 pmulhrsw m1, m9, [tmp2q-32*3] 4281 mova [tmp1q-32*3], m2 4282 pmulhrsw m2, m9, [tmp2q-32*2] 4283 mova [tmp2q-32*3], m3 4284 pmulhrsw m3, m9, [tmp2q-32*1] 4285 mova [tmp1q-32*2], m4 4286 pmulhrsw m4, m9, [tmp2q+32*0] 4287 mova [tmp2q-32*2], m5 4288 pmulhrsw m5, m9, [tmp2q+32*1] 4289 mova [tmp1q-32*1], m6 4290 pmulhrsw m6, m9, [tmp2q+32*2] 4291 mova [tmp2q-32*1], m7 4292 pmulhrsw m7, m9, [tmp2q+32*3] 4293 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 4294 mova [tmp1q+32*0], m0 4295 mova [tmp2q+32*0], m1 4296 mova [tmp1q+32*1], m2 4297 mova [tmp2q+32*1], m3 4298 mova [tmp1q+32*2], m4 4299 mova [tmp2q+32*2], m5 4300 mova [tmp1q+32*3], m6 4301 mova [tmp2q+32*3], m7 4302 add cq, 32 4303 add tmp1q, 32*16 4304 add tmp2q, 32*16 4305 add eobd, 0x80000000 4306 jnc .pass1_loop 4307 add tmp1q, 32*24 4308 imul r2, strideq, 19 4309 lea r3, [strideq*3] 4310 add r2, dstq 4311 test tmp4d, tmp4d 4312 jge .pass2_loop 4313 add tmp1q, 32*16 4314 add tmp2q, 32*16 4315 add tmp3q, 32*16 4316.pass2_loop: 4317 LOAD_8ROWS tmp2q-32*4, 32 4318 test tmp4d, tmp4d 4319 jl .fast2 4320 LOAD_8ROWS_H tmp3q-32*4, 32 4321 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf 4322 sub tmp3q, 32*8 4323 LOAD_8ROWS_H tmp3q-32*4, 32 4324 sub tmp3q, 32*16 4325 jmp .pass2_loop_end 4326.fast2: 4327 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 4328 sub tmp3q, 32*24 4329 pxor m8, m8 4330 REPX {mova x, m8}, m9, 
m10, m11, m12, m13, m14, m15 4331.pass2_loop_end: 4332 LOAD_8ROWS tmp3q-32*4, 32 4333 mova [rsp], m15 4334 call m(idct_16x16_internal_8bpc).main 4335 call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end 4336 lea tmp3q, [tmp1q-32*32] 4337 cmp tmp2q, tmp3q 4338 jb .ret 4339 sub tmp2q, 32*32 4340 sub dstq, r3 4341 lea r2, [r2+r3+16] 4342 add dstq, 16 4343 jmp .pass2_loop 4344.ret: 4345 RET 4346 4347cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob 4348 %undef cmp 4349 vpbroadcastd m9, [pw_8192] 4350 sub eobd, 136 ; if (eob < 136) 4351 shr eobd, 30 ; topleft 16x16 only 4352 lea eobd, [eobq*2-8] 4353 lea r4, [strideq*3] 4354 mov r5, dstq 4355 lea r6, [cq+32] 4356.loop: 4357 mova xm0, [cq+64* 0] 4358 mova xm1, [cq+64* 1] 4359 vinserti128 m0, [cq+64* 8], 1 4360 vinserti128 m1, [cq+64* 9], 1 4361 mova xm2, [cq+64* 2] 4362 mova xm3, [cq+64* 3] 4363 vinserti128 m2, [cq+64*10], 1 4364 vinserti128 m3, [cq+64*11], 1 4365 mova xm4, [cq+64* 4] 4366 mova xm5, [cq+64* 5] 4367 vinserti128 m4, [cq+64*12], 1 4368 vinserti128 m5, [cq+64*13], 1 4369 mova xm6, [cq+64* 6] 4370 mova xm7, [cq+64* 7] 4371 vinserti128 m6, [cq+64*14], 1 4372 vinserti128 m7, [cq+64*15], 1 4373 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 4374 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 4375 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 4376 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 4377 lea dstq, [dstq+strideq*4] 4378 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 4379 WRITE_16X2 6, 7, 0, 1, strideq*2, r4 4380 lea dstq, [dstq+strideq*4] 4381 add cq, 16 4382 inc eobd 4383 jz .ret 4384 test eobd, 3 4385 jnz .loop 4386 add cq, 64*15 4387 lea dstq, [r5+16] 4388 jmp .loop 4389.ret: 4390 pxor m0, m0 4391 mov r0d, 16 4392 cmp cq, r6 4393 jne .zero_loop 4394.zero_loop_topleft: 4395 mova [r6-32*1], m0 4396 mova [r6+32*1], m0 4397 mova [r6+32*3], m0 4398 mova [r6+32*5], m0 4399 add r6, 64*4 4400 sub r0d, 4 4401 jg .zero_loop_topleft 4402 RET 4403.zero_loop: 4404 mova 
[r6-32*1], m0
    mova   [r6+32*0], m0
    mova   [r6+32*1], m0
    mova   [r6+32*2], m0
    add          r6, 32*4
    dec          r0d
    jg .zero_loop
    RET

; Final sum/sub + store stage shared by all idct64 transforms.
;   %1      = output pair index n (0..15); parity selects which scratch bank
;             (tmp1q vs tmp2q) holds the idct16/idct32 partial results
;   %2-%3   = registers holding idct64 partials (consumed)
;   %4-%6   = temporary registers
;   %7-%10  = destination row offsets (pass 2 only; %0 == 6 selects pass 1)
; Pass 1 writes the combined rows back to the scratch buffer; pass 2 rounds
; (m14 = rounding constant), adds to the 8-bit destination and packs.
%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
%if %1 & 1
    mova        m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
    mova        m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
%else
    mova        m%5, [tmp1q-32*(45-%1)]
    mova        m%4, [tmp2q-32*(20+%1)]
%endif
    psubsw      m%6, m%5, m%4 ; idct32 out31-n
    paddsw      m%5, m%4      ; idct32 out 0+n
    psubsw      m%4, m%6, m%3 ; out32+n
    paddsw      m%6, m%3      ; out31-n
    psubsw      m%3, m%5, m%2 ; out63-n
    paddsw      m%5, m%2      ; out 0+n
%if %0 == 6 ; pass 1
%if %1 & 1
    mova   [tmp2q-32*(19-%1)], m%4
    mova   [tmp1q-32*(14+%1)], m%6
    mova   [tmp1q+32*(18-%1)], m%3
    mova   [tmp2q-32*(51-%1)], m%5
%else
    mova   [tmp1q-32*(13-%1)], m%4
    mova   [tmp2q-32*(20+%1)], m%6
    mova   [tmp2q+32*(12-%1)], m%3
    mova   [tmp1q-32*(45-%1)], m%5
%endif
%else ; pass 2
    REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
    ; odd/even output pairs go to opposite halves of the destination;
    ; r2 walks backwards from the bottom while dstq walks forwards
%if %1 & 1
    %define %%d0 r2
    %define %%d1 dstq
%else
    %define %%d0 dstq
    %define %%d1 r2
%endif
    pmovzxbw    m%2, [%%d0+%9 ]
    paddw       m%2, m%4
    pmovzxbw    m%4, [%%d1+%8 ]
    paddw       m%4, m%6
    pmovzxbw    m%6, [%%d1+%10]
    paddw       m%3, m%6
    pmovzxbw    m%6, [%%d0+%7 ]
    paddw       m%5, m%6
    packuswb    m%2, m%4
    packuswb    m%3, m%5
    ; undo the lane interleave introduced by pmovzxbw/packuswb
    vpermq      m%2, m%2, q3120
    vpermq      m%3, m%3, q3120
    mova   [%%d0+%9 ], xm%2
    vextracti128 [%%d1+%8 ], m%2, 1
    mova   [%%d1+%10], xm%3
    vextracti128 [%%d0+%7 ], m%3, 1
%endif
%endmacro

; 16x64 inverse DCT-DCT, 8-bit, AVX2.
cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
    lea          r6, [o_base]
    test       eobd, eobd
    jnz .normal
    ; DC-only shortcut: scale the single coefficient and defer the
    ; broadcast-add loop to the shared 16-wide dconly path.
    movd        xm1, [o(pw_2896x8)]
    pmulhrsw    xm0, xm1, [cq]
    movd        xm2, [o(pw_8192)]
    mov        [cq], eobd
    or          r3d, 64
    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.normal:
    PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
    %undef cmp
    lea       tmp1q, [rsp+32*23]
    lea       tmp2q, [tmp1q+32*24]
    ; eob <= 150 means the high-frequency half of the rows is all zero;
    ; the sign of r7d is tested (jl) to take the .fast* paths below
    sub        eobd, 151
    mov         r7d, eobd
.pass1_loop:
    LOAD_16ROWS  cq, 64
    call m(idct_16x16_internal_8bpc).main
    mova         m1, [rsp+32*1]
    mova [rsp+32*0], m6
    mova [rsp+32*1], m7
    vpbroadcastd m7, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
    mova        m15, [rsp+32*0]
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m2
    mova [tmp1q-32*2], m4
    mova [tmp1q-32*1], m6
    mova [tmp1q+32*0], m8
    mova [tmp1q+32*1], m10
    mova [tmp1q+32*2], m12
    mova [tmp1q+32*3], m14
    mova [tmp2q-32*4], m1
    mova [tmp2q-32*3], m3
    mova [tmp2q-32*2], m5
    mova [tmp2q-32*1], m7
    mova [tmp2q+32*0], m9
    mova [tmp2q+32*1], m11
    mova [tmp2q+32*2], m13
    mova [tmp2q+32*3], m15
    add          cq, 32
    add       tmp1q, 32*8
    add       tmp2q, 32*8
    ; 2-iteration loop via the sign bit: the second 0x80000000 add carries
    add        eobd, 0x80000000
    jnc .pass1_loop
    ; --- pass 2: column idct64 on the transposed coefficients ---
    lea          r2, [rsp+32*23]
    mova        xm0, [r2-32*4+ 0]
    mova        xm1, [r2-32*2+ 0]
    vinserti128  m0, [r2+32*0+ 0], 1
    vinserti128  m1, [r2+32*2+ 0], 1
    mova        xm2, [r2-32*4+16]
    mova        xm3, [r2-32*2+16]
    vinserti128  m2, [r2+32*0+16], 1
    vinserti128  m3, [r2+32*2+16], 1
    pxor         m4, m4
    REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
    test        r7d, r7d
    jl .fast
    lea          r3, [r2+32*8]
    mova        xm4, [r3-32*4+ 0]
    mova        xm5, [r3-32*2+ 0]
    vinserti128  m4, [r3+32*0+ 0], 1
    vinserti128  m5, [r3+32*2+ 0], 1
    mova        xm6, [r3-32*4+16]
    mova        xm7, [r3-32*2+16]
    vinserti128  m6, [r3+32*0+16], 1
    vinserti128  m7, [r3+32*2+16], 1
.fast:
    mova      [rsp], m8
    lea       tmp1q, [rsp+32*7]
    call m(idct_16x16_internal_8bpc).main
    mova         m1, [rsp+32*1]
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m1
    mova [tmp1q-32*2], m2
    mova [tmp1q-32*1], m3
    mova [tmp1q+32*0], m4
    mova [tmp1q+32*1], m5
    mova [tmp1q+32*2], m6
    mova [tmp1q+32*3], m7
    add       tmp1q, 32*8
    mova [tmp1q-32*4], m8
    mova [tmp1q-32*3], m9
    mova [tmp1q-32*2], m10
    mova [tmp1q-32*1], m11
    mova [tmp1q+32*0], m12
    mova [tmp1q+32*1], m13
    mova [tmp1q+32*2], m14
    mova [tmp1q+32*3], m15
    mova        xm0, [r2-32*3+ 0]
    mova        xm1, [r2-32*1+ 0]
    vinserti128  m0, [r2+32*1+ 0], 1
    vinserti128  m1, [r2+32*3+ 0], 1
    mova        xm2, [r2-32*3+16]
    mova        xm3, [r2-32*1+16]
    vinserti128  m2, [r2+32*1+16], 1
    vinserti128  m3, [r2+32*3+16], 1
    pxor         m4, m4
    REPX {mova x, m4}, m5, m6, m7
    test        r7d, r7d
    jl .fast2
    mova        xm4, [r3-32*3+ 0]
    mova        xm5, [r3-32*1+ 0]
    vinserti128  m4, [r3+32*1+ 0], 1
    vinserti128  m5, [r3+32*3+ 0], 1
    mova        xm6, [r3-32*3+16]
    mova        xm7, [r3-32*1+16]
    vinserti128  m6, [r3+32*1+16], 1
    vinserti128  m7, [r3+32*3+16], 1
.fast2:
    add       tmp1q, 32*8
    lea       tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    add          r2, 32*24
    vpbroadcastd m15, [o(pd_2048)]
    add       tmp1q, 32*16
    add       tmp2q, 32*32
    ; gather the odd rows feeding idct64 part 1 (first input group)
    mova        xm0, [r2-32*4+ 0]
    mova        xm3, [r2-32*1+16]
    vinserti128  m0, [r2+32*0+ 0], 1
    vinserti128  m3, [r2+32*3+16], 1
    mova        xm4, [r2-32*4+16]
    mova        xm7, [r2-32*1+ 0]
    vinserti128  m4, [r2+32*0+16], 1
    vinserti128  m7, [r2+32*3+ 0], 1
    pxor         m1, m1
    REPX {mova x, m1}, m2, m5, m6
    test        r7d, r7d
    jl .fast3
    add          r3, 32*24
    mova        xm1, [r3-32*1+16]
    mova        xm2, [r3-32*4+ 0]
    vinserti128  m1, [r3+32*3+16], 1
    vinserti128  m2, [r3+32*0+ 0], 1
    mova        xm5, [r3-32*1+ 0]
    mova        xm6, [r3-32*4+16]
    vinserti128  m5, [r3+32*3+ 0], 1
    vinserti128  m6, [r3+32*0+16], 1
.fast3:
    add          r6, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    add          r6, 8
    add       tmp1q, 32*8
    sub       tmp2q, 32*8
    ; second input group for idct64 part 1
    mova        xm0, [r2-32*2+ 0]
    mova        xm3, [r2-32*3+16]
    vinserti128  m0, [r2+32*2+ 0], 1
    vinserti128  m3, [r2+32*1+16], 1
    mova        xm4, [r2-32*2+16]
    mova        xm7, [r2-32*3+ 0]
    vinserti128  m4, [r2+32*2+16], 1
    vinserti128  m7, [r2+32*1+ 0], 1
    pxor         m1, m1
    REPX {mova x, m1}, m2, m5, m6
    test        r7d, r7d
    jl .fast4
    mova        xm1, [r3-32*3+16]
    mova        xm2, [r3-32*2+ 0]
    vinserti128  m1, [r3+32*1+16], 1
    vinserti128  m2, [r3+32*2+ 0], 1
    mova        xm5, [r3-32*3+ 0]
    mova        xm6, [r3-32*2+16]
    vinserti128  m5, [r3+32*1+ 0], 1
    vinserti128  m6, [r3+32*2+16], 1
.fast4:
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
    RET
ALIGN function_align
%define o_base idct64_mul - 8
cglobal_label .main_part1
    ; idct64 steps 1-5:
    ; in1/31/17/15/ 9/23/25/ 7 ->
    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
    ; in5/27/21/11/13/19/29/ 3 ->
    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
    ; (same code serves both groups; r6 selects the coefficient table slice)
    vpbroadcastd m11, [o(idct64_mul+4* 0)]
    vpbroadcastd m13, [o(idct64_mul+4* 1)]
    vpbroadcastd m10, [o(idct64_mul+4* 4)]
    vpbroadcastd m12, [o(idct64_mul+4* 5)]
    pmulhrsw    m11, m0  ; t63a
    pmulhrsw     m0, m13 ; t32a
    pmulhrsw    m10, m1  ; t62a
    pmulhrsw     m1, m12 ; t33a
    vpbroadcastd m9, [o(idct64_mul+4* 8)]
    vpbroadcastd m13, [o(idct64_mul+4* 9)]
    vpbroadcastd m8, [o(idct64_mul+4*12)]
    vpbroadcastd m12, [o(idct64_mul+4*13)]
    pmulhrsw     m9, m2  ; t61a
    pmulhrsw     m2, m13 ; t34a
    pmulhrsw     m8, m3  ; t60a
    pmulhrsw     m3, m12 ; t35a
    psubsw      m12, m0, m1   ; t33
    paddsw       m0, m1       ; t32
    psubsw       m1, m3, m2   ; t34
    paddsw       m3, m2       ; t35
    psubsw       m2, m8, m9   ; t61
    paddsw       m8, m9       ; t60
    psubsw       m9, m11, m10 ; t62
    paddsw      m11, m10      ; t63
    ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a
    vpbroadcastd m14, [o(pw_401_4076)]
    ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
    psubsw      m10, m0, m3  ; t35a
    paddsw       m0, m3      ; t32a
    psubsw       m3, m11, m8 ; t60a
    paddsw      m11, m8      ; t63a
    psubsw       m8, m9, m2  ; t34
    paddsw       m9, m2      ; t33
    psubsw       m2, m12, m1 ; t61
    paddsw      m12, m1      ; t62
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m9
    mova [tmp2q+32*2], m12
    mova [tmp2q+32*3], m11
    vpbroadcastd m13, [o(pw_m4017_799)]
    vpbroadcastd m14, [o(pw_799_4017)]
    ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a
    ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60
    mova [tmp1q-32*2], m2
    mova [tmp1q-32*1], m3
    mova [tmp2q+32*0], m10
    mova [tmp2q+32*1], m8
    ; second half of the butterfly network (t36..t39 / t56..t59)
    vpbroadcastd m3, [o(idct64_mul+4*16)]
    vpbroadcastd m11, [o(idct64_mul+4*17)]
    vpbroadcastd m2, [o(idct64_mul+4*20)]
    vpbroadcastd m10, [o(idct64_mul+4*21)]
    vpbroadcastd m1, [o(idct64_mul+4*24)]
    vpbroadcastd m9, [o(idct64_mul+4*25)]
    vpbroadcastd m0, [o(idct64_mul+4*28)]
    vpbroadcastd m8, [o(idct64_mul+4*29)]
    pmulhrsw     m3, m4  ; t59a
    pmulhrsw     m4, m11 ; t36a
    pmulhrsw     m2, m5  ; t58a
    pmulhrsw     m5, m10 ; t37a
    pmulhrsw     m1, m6  ; t57a
    pmulhrsw     m6, m9  ; t38a
    pmulhrsw     m0, m7  ; t56a
    pmulhrsw     m7, m8  ; t39a
    psubsw       m8, m4, m5 ; t37
    paddsw       m4, m5     ; t36
    psubsw       m5, m7, m6 ; t38
    paddsw       m7, m6     ; t39
    psubsw       m6, m0, m1 ; t57
    paddsw       m0, m1     ; t56
    psubsw       m1, m3, m2 ; t58
    paddsw       m3, m2     ; t59
    ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a
    vpbroadcastd m10, [o(pw_3166_2598)]
    ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a
    psubsw       m2, m7, m4 ; t36a
    paddsw       m7, m4     ; t39a
    psubsw       m4, m0, m3 ; t59a
    paddsw       m0, m3     ; t56a
    psubsw       m3, m6, m1 ; t37
    paddsw       m6, m1     ; t38
    psubsw       m1, m5, m8 ; t58
    paddsw       m5, m8     ; t57
    mova [tmp1q+32*2], m6
    mova [tmp1q+32*3], m7
    mova [tmp2q-32*4], m0
    mova [tmp2q-32*3], m5
    vpbroadcastd m6, [o(pw_m799_m4017)]
    vpbroadcastd m7, [o(pw_m4017_799)]
    ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59
    ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a
    mova [tmp1q+32*0], m4
    mova [tmp1q+32*1], m1
    mova [tmp2q-32*2], m3
    mova [tmp2q-32*1], m2
    ret
%define o_base pw_5 + 128
.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
    sub          r6, o_idct64_offset + 8
    vpbroadcastd m11, [o(pw_1567_3784)]
    vpbroadcastd m12, [o(pw_m3784_1567)]
    vpbroadcastd m13, [o(pw_2896_2896)]
    vpbroadcastd m14, [o(pw_m2896_2896)]
.main_part2_pass1_loop:
    call .main_part2_internal
    IDCT64_PART2_END 0, 7, 0, 6, 9, 10
    IDCT64_PART2_END 7, 8, 5, 0, 6, 7
    IDCT64_PART2_END 8, 2, 1, 0, 6, 7
    IDCT64_PART2_END 15, 3, 4, 0, 6, 7
    ; tmp1q advances / tmp2q retreats inside main_part2_internal;
    ; loop ends when the two pointers meet in the middle
    cmp       tmp1q, tmp2q
    jne .main_part2_pass1_loop
    ret
cglobal_label .main_part2_internal
    ; one slice of idct64 steps 6-9; expects m11-m14 preloaded with the
    ; rotation constants and m15 = pd_2048 (pass 1) / rounding reg (pass 2)
    mova         m0, [tmp1q-32*12] ; t32a
    mova         m6, [tmp2q-32*13] ; t39a
    mova         m1, [tmp1q-32* 4] ; t40a
    mova         m5, [tmp2q+32* 3] ; t55a
    add       tmp1q, 32
    sub       tmp2q, 32
    mova         m2, [tmp1q+32* 3] ; t48a
    mova         m4, [tmp2q-32* 4] ; t47a
    mova         m3, [tmp1q+32*11] ; t56a
    mova         m7, [tmp2q+32*12] ; t63a
    psubsw       m8, m0, m6 ; t39
    paddsw       m0, m6     ; t32
    psubsw       m6, m4, m1 ; t40
    paddsw       m4, m1     ; t47
    psubsw       m1, m2, m5 ; t55
    paddsw       m2, m5     ; t48
    psubsw       m5, m7, m3 ; t56
    paddsw       m7, m3     ; t63
    ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a
    vpbroadcastd m9, [o(pw_m1567_m3784)]
    ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a
    psubsw       m3, m0, m4 ; t47a
    paddsw       m0, m4     ; t32a
    psubsw       m4, m7, m2 ; t48a
    paddsw       m7, m2     ; t63a
    psubsw       m2, m5, m1 ; t40
    paddsw       m5, m1     ; t39
    psubsw       m1, m8, m6 ; t55
    paddsw       m8, m6     ; t56
    ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48
    ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a
    ret
.main_part2_pass2:
    sub          r6, o_idct64_offset + 8
    vpbroadcastd m11, [o(pw_1567_3784)]
    vpbroadcastd m12, [o(pw_m3784_1567)]
    vpbroadcastd m13, [o(pw_2896_2896)]
    ; precompute stride multiples used as IDCT64_PART2_END dst offsets
    lea          r9, [strideq*5] ; stride*5
    lea          r3, [r9+strideq*1] ; stride*6
    lea          r7, [r9+strideq*2] ; stride*7
    lea          r8, [r3+strideq*2] ; stride*8
    lea          r2, [dstq+r7]
.main_part2_pass2_loop:
    ; m14 is reused: rotation constant for the butterflies, then the
    ; rounding constant for the stores
    vpbroadcastd m14, [o(pw_m2896_2896)]
    call .main_part2_internal
    vpbroadcastd m14, [o(pw_2048)]
    IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
    IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8
    IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
    IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
    add        dstq, strideq
    sub          r2, strideq
    cmp       tmp1q, tmp2q
    jne .main_part2_pass2_loop
    ret

; 64x16 inverse DCT-DCT, 8-bit, AVX2.
cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
    lea          r6, [o_base]
    test       eobd, eobd
    jnz .normal
    ; DC-only shortcut; .dconly below is shared by 64x32 and 64x64
    movd        xm1, [o(pw_2896x8)]
    pmulhrsw    xm0, xm1, [cq]
    movd        xm2, [o(pw_8192)]
    mov        [cq], eobd
    or          r3d, 16
.dconly:
    pmulhrsw    xm0, xm2
    movd        xm2, [o(pw_2048)]
    pmulhrsw    xm0, xm1
    pmulhrsw    xm0, xm2
    vpbroadcastw m0, xm0
    pxor         m1, m1
.dconly_loop:
    ; add the broadcast DC value to one 64-pixel row per iteration
    mova         m2, [dstq+32*0]
    mova         m3, [dstq+32*1]
    punpckhbw    m4, m2, m1
    punpcklbw    m2, m1
    punpckhbw    m5, m3, m1
    punpcklbw    m3, m1
    paddw        m4, m0
    paddw        m2, m0
    paddw        m5, m0
    paddw        m3, m0
    packuswb     m2, m4
    packuswb     m3, m5
    mova [dstq+32*0], m2
    mova [dstq+32*1], m3
    add        dstq, strideq
    dec         r3d
    jg .dconly_loop
    RET
.normal:
    PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
    ; pass 1: one 64-point row transform (idct16 + odd half + idct64 parts)
    LOAD_8ROWS cq+32*0, 32*4
    pxor         m8, m8
    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova      [rsp], m8
    lea       tmp1q, [rsp+32*7]
    call m(idct_16x16_internal_8bpc).main
    mova         m1, [rsp+32*1]
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m1
    mova [tmp1q-32*2], m2
    mova [tmp1q-32*1], m3
    mova [tmp1q+32*0], m4
    mova [tmp1q+32*1], m5
    mova [tmp1q+32*2], m6
    mova [tmp1q+32*3], m7
    add       tmp1q, 32*8
    mova [tmp1q-32*4], m8
    mova [tmp1q-32*3], m9
    mova [tmp1q-32*2], m10
    mova [tmp1q-32*1], m11
    mova [tmp1q+32*0], m12
    mova [tmp1q+32*1], m13
    mova [tmp1q+32*2], m14
    mova [tmp1q+32*3], m15
    LOAD_8ROWS cq+32*2, 32*4
    pxor         m8, m8
    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
    add       tmp1q, 32*8
    lea       tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd m15, [o(pd_2048)]
    add       tmp1q, 32*16
    add       tmp2q, 32*32
    ; odd coefficients feeding idct64 part 1, first group
    mova         m0, [cq+32* 1]
    mova         m1, [cq+32*31]
    mova         m2, [cq+32*17]
    mova         m3, [cq+32*15]
    mova         m4, [cq+32* 9]
    mova         m5, [cq+32*23]
    mova         m6, [cq+32*25]
    mova         m7, [cq+32* 7]
    pxor         m8, m8
    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
    add          r6, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    add          r6, 8
    add       tmp1q, 32*8
    sub       tmp2q, 32*8
    ; second group
    mova         m0, [cq+32* 5]
    mova         m1, [cq+32*27]
    mova         m2, [cq+32*21]
    mova         m3, [cq+32*11]
    mova         m4, [cq+32*13]
    mova         m5, [cq+32*19]
    mova         m6, [cq+32*29]
    mova         m7, [cq+32* 3]
    pxor         m8, m8
    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
    sub       tmp1q, 32*36
    lea          r2, [strideq*3]
    mov       tmp2d, 4 ; 4 blocks of 16 output columns
.pass2_loop:
    ; interleave the two halves of each 16-column strip into full ymm rows
    lea          r3, [tmp1q-32*8]
    mova        xm0, [r3   -32*4]
    mova        xm1, [r3   -32*3]
    vinserti128  m0, [tmp1q-32*4], 1
    vinserti128  m1, [tmp1q-32*3], 1
    mova        xm2, [r3   -32*2]
    mova        xm3, [r3   -32*1]
    vinserti128  m2, [tmp1q-32*2], 1
    vinserti128  m3, [tmp1q-32*1], 1
    mova        xm4, [r3   +32*0]
    mova        xm5, [r3   +32*1]
    vinserti128  m4, [tmp1q+32*0], 1
    vinserti128  m5, [tmp1q+32*1], 1
    mova        xm6, [r3   +32*2]
    mova        xm7, [r3   +32*3]
    vinserti128  m6, [tmp1q+32*2], 1
    vinserti128  m7, [tmp1q+32*3], 1
    mova        xm8, [r3   -32*4+16]
    mova        xm9, [r3   -32*3+16]
    vinserti128  m8, [tmp1q-32*4+16], 1
    vinserti128  m9, [tmp1q-32*3+16], 1
    mova       xm10, [r3   -32*2+16]
    mova       xm11, [r3   -32*1+16]
    vinserti128 m10, [tmp1q-32*2+16], 1
    vinserti128 m11, [tmp1q-32*1+16], 1
    mova       xm12, [r3   +32*0+16]
    mova       xm13, [r3   +32*1+16]
    vinserti128 m12, [tmp1q+32*0+16], 1
    vinserti128 m13, [tmp1q+32*1+16], 1
    mova       xm14, [r3   +32*2+16]
    mova       xm15, [r3   +32*3+16]
    vinserti128 m14, [tmp1q+32*2+16], 1
    vinserti128 m15, [tmp1q+32*3+16], 1
    mova [rsp+32*0], m6
    mova [rsp+32*1], m7
    vpbroadcastd m7, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
    call m(idct_16x16_internal_8bpc).main
    mova [rsp+32*0], m15
    vpbroadcastd m15, [o(pw_2048)]
    REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
    WRITE_16X2 2, 3, 1, 2, strideq*2, r2
    pmulhrsw     m1, m15, [rsp+32*1]
    WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
    ; alias dstq to r3 so the WRITE_16X2 macro advances a private pointer
    lea          r3, [dstq+strideq*4]
    %define dstq r3
    WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
    WRITE_16X2 6, 7, 2, 3, strideq*2, r2
    REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
    lea          r3, [r3+strideq*4]
    WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
    WRITE_16X2 10, 11, 2, 3, strideq*2, r2
    pmulhrsw    m15, [rsp+32*0]
    lea          r3, [r3+strideq*4]
    WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
    WRITE_16X2 14, 15, 2, 3, strideq*2, r2
    add       tmp1q, 32*16
    add          r0, 16 ; r0 is the real dstq; step to the next 16 columns
    dec       tmp2d
    jg .pass2_loop
    RET

; 32x64 inverse DCT-DCT, 8-bit, AVX2.
cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
    lea          r6, [o_base]
    test       eobd, eobd
    jnz .normal
    movd        xm1, [o(pw_2896x8)]
    pmulhrsw    xm0, xm1, [cq]
    movd        xm2, [o(pw_16384)]
    mov        [cq], eobd
    pmulhrsw    xm0, xm1
    or          r3d, 64
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
.normal:
    PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
    lea       tmp1q, [rsp+32*7]
    ; r10d = sign-extended (eob-136): all-ones when the high-frequency
    ; half is empty, selecting the .fast* paths
    lea        r10d, [eobq-136]
    sar        r10d, 31
.pass1_loop:
    lea       tmp2q, [tmp1q+32*16]
    LOAD_8ROWS cq+64*1, 64*2, 1
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
    test       r10b, r10b
    jnz .fast
    LOAD_8ROWS_H cq+64*17, 64*2, 2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    LOAD_8ROWS_H cq+64*16, 64*2, 1
    mova      [rsp], m15
    pxor        m15, m15
    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
                                24, 25, 26, 27, 28, 29, 30, 31
    jmp .idct16
.fast:
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    pxor         m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova      [rsp], m8
.idct16:
    LOAD_8ROWS cq+64*0, 64*2, 1
    pxor        m15, m15
    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
    call m(idct_16x16_internal_8bpc).main
    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
    vpbroadcastd m7, [o(pw_16384)]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
    lea          r3, [tmp1q+32*48]
    mova        m15, [rsp]
    mova  [r3-32*4], m0
    mova  [r3-32*3], m2
    mova  [r3-32*2], m4
    mova  [r3-32*1], m6
    mova  [r3+32*0], m8
    mova  [r3+32*1], m10
    mova  [r3+32*2], m12
    mova  [r3+32*3], m14
    add          r3, 32*24
    mova  [r3-32*4], m1
    mova  [r3-32*3], m3
    mova  [r3-32*2], m5
    mova  [r3-32*1], m7
    mova  [r3+32*0], m9
    mova  [r3+32*1], m11
    mova  [r3+32*2], m13
    mova  [r3+32*3], m15
    ; round + transpose the low 16 rows in place
    vpbroadcastd m9, [o(pw_16384)]
    pmulhrsw     m0, m9, [tmp1q-32*4]
    pmulhrsw     m1, m9, [tmp1q-32*3]
    pmulhrsw     m2, m9, [tmp1q-32*2]
    pmulhrsw     m3, m9, [tmp1q-32*1]
    pmulhrsw     m4, m9, [tmp1q+32*0]
    pmulhrsw     m5, m9, [tmp1q+32*1]
    pmulhrsw     m6, m9, [tmp1q+32*2]
    pmulhrsw     m7, m9, [tmp1q+32*3]
    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    mova [tmp1q-32*4], m0
    pmulhrsw     m0, m9, [tmp2q-32*4]
    mova [tmp2q-32*4], m1
    pmulhrsw     m1, m9, [tmp2q-32*3]
    mova [tmp1q-32*3], m2
    pmulhrsw     m2, m9, [tmp2q-32*2]
    mova [tmp2q-32*3], m3
    pmulhrsw     m3, m9, [tmp2q-32*1]
    mova [tmp1q-32*2], m4
    pmulhrsw     m4, m9, [tmp2q+32*0]
    mova [tmp2q-32*2], m5
    pmulhrsw     m5, m9, [tmp2q+32*1]
    mova [tmp1q-32*1], m6
    pmulhrsw     m6, m9, [tmp2q+32*2]
    mova [tmp2q-32*1], m7
    pmulhrsw     m7, m9, [tmp2q+32*3]
    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    mova [tmp1q+32*0], m0
    mova [tmp2q+32*0], m1
    mova [tmp1q+32*1], m2
    mova [tmp2q+32*1], m3
    mova [tmp1q+32*2], m4
    mova [tmp2q+32*2], m5
    mova [tmp1q+32*3], m6
    mova [tmp2q+32*3], m7
    add          cq, 32
    add       tmp1q, 32*8
    add        r10d, 0x80000000
    jnc .pass1_loop
    ; --- pass 2: two idct64 column passes over the transposed halves ---
    lea          r2, [rsp+32*55]
    lea          r7, [r2+32*24]
.pass2_loop:
    lea          r3, [r2+32*8]
    lea          r8, [r7+32*8]
    mova         m0, [r2-32*4]
    mova         m1, [r2-32*2]
    mova         m2, [r2+32*0]
    mova         m3, [r2+32*2]
    pxor         m4, m4
    REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
    test       r10b, r10b
    jnz .fast2
    mova         m4, [r3-32*4]
    mova         m5, [r3-32*2]
    mova         m6, [r3+32*0]
    mova         m7, [r3+32*2]
.fast2:
    mova      [rsp], m8
    lea       tmp1q, [rsp+32*39]
    call m(idct_16x16_internal_8bpc).main
    mova         m1, [rsp+32*1]
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m1
    mova [tmp1q-32*2], m2
    mova [tmp1q-32*1], m3
    mova [tmp1q+32*0], m4
    mova [tmp1q+32*1], m5
    mova [tmp1q+32*2], m6
    mova [tmp1q+32*3], m7
    add       tmp1q, 32*8
    mova [tmp1q-32*4], m8
    mova [tmp1q-32*3], m9
    mova [tmp1q-32*2], m10
    mova [tmp1q-32*1], m11
    mova [tmp1q+32*0], m12
    mova [tmp1q+32*1], m13
    mova [tmp1q+32*2], m14
    mova [tmp1q+32*3], m15
    mova         m0, [r2-32*3]
    mova         m1, [r2-32*1]
    mova         m2, [r2+32*1]
    mova         m3, [r2+32*3]
    pxor         m4, m4
    REPX {mova x, m4}, m5, m6, m7
    test       r10b, r10b
    jnz .fast3
    mova         m4, [r3-32*3]
    mova         m5, [r3-32*1]
    mova         m6, [r3+32*1]
    mova         m7, [r3+32*3]
.fast3:
    add       tmp1q, 32*8
    lea       tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd m15, [o(pd_2048)]
    add       tmp1q, 32*16
    add       tmp2q, 32*32
    mova         m0, [r7-32*4]
    mova         m3, [r7+32*3]
    mova         m4, [r7+32*0]
    mova         m7, [r7-32*1]
    pxor         m1, m1
    REPX {mova x, m1}, m2, m5, m6
    test       r10b, r10b
    jnz .fast4
    mova         m1, [r8+32*3]
    mova         m2, [r8-32*4]
    mova         m5, [r8-32*1]
    mova         m6, [r8+32*0]
.fast4:
    add          r6, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    add          r6, 8
    add       tmp1q, 32*8
    sub       tmp2q, 32*8
    mova         m0, [r7-32*2]
    mova         m3, [r7+32*1]
    mova         m4, [r7+32*2]
    mova         m7, [r7-32*3]
    pxor         m1, m1
    REPX {mova x, m1}, m2, m5, m6
    test       r10b, r10b
    jnz .fast5
    mova         m1, [r8+32*1]
    mova         m2, [r8-32*2]
    mova         m5, [r8-32*3]
    mova         m6, [r8+32*2]
.fast5:
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
    ; same sign-bit trick: carry set on the second iteration
    add        r10d, 0x80000000
    jc .ret
    lea          r2, [rsp+32*7]
    lea          r7, [r2+32*16]
    sub        dstq, r8 ; r8 = stride*8, set up by main_part2_pass2
    lea        dstq, [dstq+strideq*4+16]
    jmp .pass2_loop
.ret:
    RET

; 64x32 inverse DCT-DCT, 8-bit, AVX2.
cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
    lea          r6, [o_base]
    test       eobd, eobd
    jnz .normal
    movd        xm1, [o(pw_2896x8)]
    pmulhrsw    xm0, xm1, [cq]
    movd        xm2, [o(pw_16384)]
    mov        [cq], eobd
    pmulhrsw    xm0, xm1
    or          r3d, 32
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
.normal:
    PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
                                base, tmp3, tmp4
    lea       tmp1q, [rsp+32*7]
    ; sign of tmp4d (eob-136) selects the fast paths in pass 2
    lea       tmp4d, [eobq-136]
.pass1_loop:
    LOAD_8ROWS cq+64*0, 64*4, 1
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova      [rsp], m8
    call m(idct_16x16_internal_8bpc).main
    mova         m1, [rsp+32*1]
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m1
    mova [tmp1q-32*2], m2
    mova [tmp1q-32*1], m3
    mova [tmp1q+32*0], m4
    mova [tmp1q+32*1], m5
    mova [tmp1q+32*2], m6
    mova [tmp1q+32*3], m7
    add       tmp1q, 32*8
    mova [tmp1q-32*4], m8
    mova [tmp1q-32*3], m9
    mova [tmp1q-32*2], m10
    mova [tmp1q-32*1], m11
    mova [tmp1q+32*0], m12
    mova [tmp1q+32*1], m13
    mova [tmp1q+32*2], m14
    mova [tmp1q+32*3], m15
    LOAD_8ROWS cq+64*2, 64*4, 1
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
    add       tmp1q, 32*8
    lea       tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd m15, [o(pd_2048)]
    add       tmp1q, 32*16
    add       tmp2q, 32*32
    ; odd input rows are pre-scaled by 2896/4096 (rect2 64-wide scaling)
    vpbroadcastd m7, [o(pw_2896x8)]
    pmulhrsw     m0, m7, [cq+64* 1]
    pmulhrsw     m1, m7, [cq+64*31]
    pmulhrsw     m2, m7, [cq+64*17]
    pmulhrsw     m3, m7, [cq+64*15]
    pmulhrsw     m4, m7, [cq+64* 9]
    pmulhrsw     m5, m7, [cq+64*23]
    pmulhrsw     m6, m7, [cq+64*25]
    pmulhrsw     m7, [cq+64* 7]
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
    add          r6, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    ; r6 now carries o_idct64_offset, so compensate in the address
    vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))]
    add          r6, 8
    add       tmp1q, 32*8
    sub       tmp2q, 32*8
    pmulhrsw     m0, m7, [cq+64* 5]
    pmulhrsw     m1, m7, [cq+64*27]
    pmulhrsw     m2, m7, [cq+64*21]
    pmulhrsw     m3, m7, [cq+64*11]
    pmulhrsw     m4, m7, [cq+64*13]
    pmulhrsw     m5, m7, [cq+64*19]
    pmulhrsw     m6, m7, [cq+64*29]
    pmulhrsw     m7, [cq+64* 3]
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
    sub       tmp1q, 32*44
    vpbroadcastd m10, [o(pw_16384)]
    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
    add          cq, 32
    add       tmp4d, 0x80000000
    jnc .pass1_loop
    ; --- pass 2: 32-point column transforms, 4 strips of 16 columns ---
    lea       tmp1q, [rsp+32*15]
    imul         r2, strideq, 19
    lea          r3, [strideq*3]
    add          r2, dstq ; r2 = bottom-half write cursor
    mov       tmp4b, 4
.pass2_loop:
    lea       tmp2q, [tmp1q+32*64]
    LOAD_8ROWS tmp1q-32*4, 32
    test      tmp4d, 0x40000000 ; set iff eob was small in pass 1
    jnz .fast
    LOAD_8ROWS_H tmp2q-32*4, 32
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    lea       tmp3q, [tmp2q-32*8]
    LOAD_8ROWS_H tmp3q-32*4, 32
    mova      [rsp], m15
    jmp .idct16
.fast:
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    pxor         m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova      [rsp], m8
.idct16:
    lea       tmp3q, [tmp1q-32*8]
    LOAD_8ROWS tmp3q-32*4, 32
    call m(idct_16x16_internal_8bpc).main
    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
    add       tmp1q, 32*16
    sub        dstq, r3
    lea          r2, [r2+r3+16]
    add        dstq, 16
    dec       tmp4b
    jg .pass2_loop
    RET
ALIGN function_align
.transpose_round_interleave:
    ; Round (by m10), transpose 8x8 sub-blocks and interleave the two
    ; 16-row halves at tmp1q/tmp2q back into the same buffer.
    mov       tmp3d, 4
.loop:
    lea       tmp2q, [tmp1q+32*8]
    mova        xm0, [tmp1q-32*4]
    mova        xm1, [tmp1q-32*3]
    vinserti128  m0, [tmp2q-32*4], 1
    vinserti128  m1, [tmp2q-32*3], 1
    mova        xm2, [tmp1q-32*2]
    mova        xm3, [tmp1q-32*1]
    vinserti128  m2, [tmp2q-32*2], 1
    vinserti128  m3, [tmp2q-32*1], 1
    mova        xm4, [tmp1q+32*0]
    mova        xm5, [tmp1q+32*1]
    vinserti128  m4, [tmp2q+32*0], 1
    vinserti128  m5, [tmp2q+32*1], 1
    mova        xm6, [tmp1q+32*2]
    mova        xm7, [tmp1q+32*3]
    vinserti128  m6, [tmp2q+32*2], 1
    vinserti128  m7, [tmp2q+32*3], 1
    REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    ; load the high 16-byte halves while storing the finished rows
    mova        xm8, [tmp1q-32*4+16]
    mova        xm9, [tmp1q-32*3+16]
    vinserti128  m8, [tmp2q-32*4+16], 1
    vinserti128  m9, [tmp2q-32*3+16], 1
    mova [tmp1q-32*4], m0
    mova [tmp2q-32*4], m1
    mova [tmp1q-32*3], m2
    mova [tmp2q-32*3], m3
    mova        xm2, [tmp1q-32*2+16]
    mova        xm3, [tmp1q-32*1+16]
    vinserti128  m2, [tmp2q-32*2+16], 1
    vinserti128  m3, [tmp2q-32*1+16], 1
    mova [tmp1q-32*2], m4
    mova [tmp2q-32*2], m5
    mova [tmp1q-32*1], m6
    mova [tmp2q-32*1], m7
    mova        xm4, [tmp1q+32*0+16]
    mova        xm5, [tmp1q+32*1+16]
    vinserti128  m4, [tmp2q+32*0+16], 1
    vinserti128  m5, [tmp2q+32*1+16], 1
    mova        xm6, [tmp1q+32*2+16]
    mova        xm7, [tmp1q+32*3+16]
    vinserti128  m6, [tmp2q+32*2+16], 1
    vinserti128  m7, [tmp2q+32*3+16], 1
    pmulhrsw     m0, m8, m10
    pmulhrsw     m1, m9, m10
    REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    mova [tmp1q+32*0], m0
    mova [tmp2q+32*0], m1
    mova [tmp1q+32*1], m2
    mova [tmp2q+32*1], m3
    mova [tmp1q+32*2], m4
    mova [tmp2q+32*2], m5
    mova [tmp1q+32*3], m6
    mova [tmp2q+32*3], m7
    add       tmp1q, 32*16
    dec       tmp3d
    jg .loop
    ret

; 64x64 inverse DCT-DCT, 8-bit, AVX2.
cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
    lea          r6, [o_base]
    test       eobd, eobd
    jnz .normal
    movd        xm1, [o(pw_2896x8)]
    pmulhrsw    xm0, xm1, [cq]
    movd        xm2, [o(pw_8192)]
    mov        [cq], eobd
    or          r3d, 64
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
.normal:
    PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
    lea       tmp1q, [rsp+32*71]
    lea        r10d, [eobq-136]
.pass1_loop:
    LOAD_8ROWS cq+64*0, 64*4
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova      [rsp], m8
    call m(idct_16x16_internal_8bpc).main
    mova         m1, [rsp+32*1]
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m1
    mova [tmp1q-32*2], m2
    mova [tmp1q-32*1], m3
    mova [tmp1q+32*0], m4
    mova [tmp1q+32*1], m5
    mova [tmp1q+32*2], m6
    mova [tmp1q+32*3], m7
    add       tmp1q, 32*8
    mova [tmp1q-32*4], m8
    mova [tmp1q-32*3], m9
    mova [tmp1q-32*2], m10
    mova [tmp1q-32*1], m11
    mova [tmp1q+32*0], m12
    mova [tmp1q+32*1], m13
    mova [tmp1q+32*2], m14
    mova [tmp1q+32*3], m15
    LOAD_8ROWS cq+64*2, 64*4
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
    add       tmp1q, 32*8
    lea       tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd m15, [o(pd_2048)]
    add       tmp1q, 32*16
    add       tmp2q, 32*32
    mova         m0, [cq+64* 1]
    mova         m1, [cq+64*31]
    mova         m2, [cq+64*17]
    mova         m3, [cq+64*15]
    mova         m4, [cq+64* 9]
    mova         m5, [cq+64*23]
    mova         m6, [cq+64*25]
    mova         m7, [cq+64* 7]
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
    add          r6, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    add          r6, 8
    add       tmp1q, 32*8
    sub       tmp2q, 32*8
    mova         m0, [cq+64* 5]
    mova         m1, [cq+64*27]
    mova         m2, [cq+64*21]
    mova         m3, [cq+64*11]
    mova         m4, [cq+64*13]
    mova         m5, [cq+64*19]
    mova         m6, [cq+64*29]
    mova         m7, [cq+64* 3]
    pxor         m8, m8
    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
    sub       tmp1q, 32*44
    vpbroadcastd m10, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
    add          cq, 32
    add        r10d, 0x80000000
    jnc .pass1_loop
    ; --- pass 2: idct64 columns, 4 strips of 16 columns ---
    lea       tmp1q, [rsp+32*7]
    mov        r10b, 4
.pass2_loop:
    lea          r2, [tmp1q+32*64]
    mova         m0, [r2-32*4]
    mova         m1, [r2-32*2]
    mova         m2, [r2+32*0]
    mova         m3, [r2+32*2]
    pxor         m4, m4
    REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
    mova      [rsp], m4
    test       r10d, 0x40000000
    jnz .fast
    lea          r3, [r2+32*64]
    mova         m4, [r3-32*4]
    mova         m5, [r3-32*2]
    mova         m6, [r3+32*0]
    mova         m7, [r3+32*2]
.fast:
    call m(idct_16x16_internal_8bpc).main
    mova         m1, [rsp+32*1]
    mova [tmp1q-32*4], m0
    mova [tmp1q-32*3], m1
    mova [tmp1q-32*2], m2
    mova [tmp1q-32*1], m3
    mova [tmp1q+32*0], m4
    mova [tmp1q+32*1], m5
    mova [tmp1q+32*2], m6
    mova [tmp1q+32*3], m7
    add       tmp1q, 32*8
    mova [tmp1q-32*4], m8
    mova [tmp1q-32*3], m9
    mova [tmp1q-32*2], m10
    mova [tmp1q-32*1], m11
    mova [tmp1q+32*0], m12
    mova [tmp1q+32*1], m13
    mova [tmp1q+32*2], m14
    mova [tmp1q+32*3], m15
    mova         m0, [r2-32*3]
    mova         m1, [r2-32*1]
    mova         m2, [r2+32*1]
    mova         m3, [r2+32*3]
    pxor         m4, m4
    REPX {mova x, m4}, m5, m6, m7
    test       r10d, 0x40000000
    jnz .fast2
    mova         m4, [r3-32*3]
    mova         m5, [r3-32*1]
    mova         m6, [r3+32*1]
    mova         m7, [r3+32*3]
.fast2:
    add       tmp1q, 32*8
    lea       tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd m15, [o(pd_2048)]
    add          r2, 32*8
    add          r3, 32*8
    add       tmp1q, 32*16
    add       tmp2q, 32*32
    mova         m0, [r2-32*4] ; 1
    mova         m3, [r2+32*3] ; 15
    mova         m4, [r2+32*0] ; 9
    mova         m7, [r2-32*1] ; 7
    pxor         m1, m1
    REPX {mova x, m1}, m2, m5, m6
    test       r10d, 0x40000000
    jnz .fast3
    mova         m1, [r3+32*3] ; 31
    mova         m2, [r3-32*4] ; 17
    mova         m5, [r3-32*1] ; 23
    mova         m6, [r3+32*0] ; 25
.fast3:
    add          r6, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    add          r6, 8
    add       tmp1q, 32*8
    sub       tmp2q, 32*8
    mova         m0, [r2-32*2] ; 5
    mova         m3, [r2+32*1] ; 11
    mova         m4, [r2+32*2] ; 13
    mova         m7, [r2-32*3] ; 3
    pxor         m1, m1
    REPX {mova x, m1}, m2, m5, m6
    test       r10d, 0x40000000
    jnz .fast4
    mova         m1, [r3+32*1] ; 27
    mova         m2, [r3-32*2] ; 21
    mova         m5, [r3-32*3] ; 19
    mova         m6, [r3+32*2] ; 29
.fast4:
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
    sub       tmp1q, 32*28
    sub        dstq, r8 ; r8 = stride*8, set up by main_part2_pass2
    lea        dstq, [dstq+strideq*4+16]
    dec        r10b
    jg .pass2_loop
    RET

%endif ; ARCH_X86_64