1; Copyright © 2020-2023, VideoLAN and dav1d authors 2; Copyright © 2020-2023, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 64 32const \ 33dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 34 db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 35 db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 36 db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 37const \ 38int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 39 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 40 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 41 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 42int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 43 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 44 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 45 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 46int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 47 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 48 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 49 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 50idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 51 db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 52 db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 53 db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63 54idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9 55 db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17 56 db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51 57 db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35 58idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35 59 db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21 60 db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51 61 db 54, 55, 
10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37 62end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60 63 db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61 64 db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63 65 db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62 66 67; packed 4-bit qword shuffle indices 68permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262 69 dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373 70 dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb 71 dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea 72permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604 73 dq 0xc824352d56128751, 0xd906171e74301e15 74 dq 0x6271604b03472d62, 0x735342782165b426 75 dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37 76permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486 77 dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597 78 dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e 79 dq 0x5115049dd9045b79, 0x733726bffb263d1f 80permD: dq 0x0cda098800041504, 0x0edb09b2028c3726 81 dq 0x0f11fa9c01150415, 0x0988f326039d2637 82 dq 0x05640f1108269d8c, 0x05290edb0aaebfae 83 dq 0x0005000509378c9d, 0xffffffff0bbfaebf 84 85pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 86gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 87gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 88gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 89gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 90 91int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 92int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 93int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 94int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7 95deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 96int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0 97 98pb_32: times 4 db 32 99pw_2048: times 2 dw 2048 100pw_4096: times 2 dw 4096 101pw_8192: times 2 dw 8192 102pw_16384: times 2 dw 16384 103pw_1697x16: times 2 dw 1697*16 104pw_1697x8: times 2 dw 1697*8 105pw_2896x8: times 2 dw 2896*8 106pd_2048: dd 2048 
107 108%define pw_5 (permD+52) 109%define pd_m1 (permD+60) 110%define pw_3803_1321 (permD+44) 111%define pw_2482_3803 (permD+12) 112%define pw_2440_3290 (permD+ 4) 113%define pw_m3290_2440 (permD+28) 114%define pw_3857_1380 (permD+36) 115%define pw_m1380_3857 (permD+20) 116 117pw_8192_m8192: dw 8192, -8192 118pw_m8192_8192: dw -8192, 8192 119pw_16384_m16384: dw 16384, -16384 120pw_m16384_16384: dw -16384, 16384 121 122pw_m1321_2482: dw -1321, 2482 123pw_m3344_3344: dw -3344, 3344 124pw_2482_3344: dw 2482, 3344 125pw_m3803_3344: dw -3803, 3344 126pd_3344: dd 3344 127pw_m1321_m3344: dw -1321, -3344 128pw_2896_m2896: dw 2896, -2896 129 130pw_1567_m3784: dw 1567, -3784 131pw_3784_m1567: dw 3784, -1567 132pw_4017_m799: dw 4017, -799 133pw_2276_m3406: dw 2276, -3406 134pw_m799_m4017: dw -799, -4017 135pw_m3406_m2276: dw -3406, -2276 136 137%macro COEF_PAIR 2-3 0 138pw_%1_%2: dw %1, %2 139pw_m%2_%1: dw -%2, %1 140%if %3 141pw_m%1_m%2: dw -%1, -%2 142%endif 143%endmacro 144 145COEF_PAIR 2896, 2896 146COEF_PAIR 1567, 3784, 1 147COEF_PAIR 3784, 1567 148COEF_PAIR 201, 4091 149COEF_PAIR 995, 3973 150COEF_PAIR 1751, 3703 151COEF_PAIR 3035, 2751 152COEF_PAIR 3513, 2106 153COEF_PAIR 4052, 601 154COEF_PAIR 3166, 2598, 1 155COEF_PAIR 3920, 1189, 1 156COEF_PAIR 2276, 3406 157COEF_PAIR 4017, 799 158 159%macro COEF_X8 1-* 160%rep %0 161 dw %1*8, %1*8 162 %rotate 1 163%endrep 164%endmacro 165 166pw_m2276x8: COEF_X8 -2276 167pw_3406x8: COEF_X8 3406 168pw_4017x8: COEF_X8 4017 169pw_799x8: COEF_X8 799 170pw_3784x8: COEF_X8 3784 171pw_1567x8: COEF_X8 1567 172 173pw_4076x8: COEF_X8 4076 174pw_401x8: COEF_X8 401 175pw_m2598x8: COEF_X8 -2598 176pw_3166x8: COEF_X8 3166 177pw_3612x8: COEF_X8 3612 178pw_1931x8: COEF_X8 1931 179pw_m1189x8: COEF_X8 -1189 180pw_3920x8: COEF_X8 3920 181 182pw_4091x8: COEF_X8 4091 183pw_201x8: COEF_X8 201 184pw_m2751x8: COEF_X8 -2751 185pw_3035x8: COEF_X8 3035 186pw_3703x8: COEF_X8 3703 187pw_1751x8: COEF_X8 1751 188pw_m1380x8: COEF_X8 -1380 189pw_3857x8: COEF_X8 
3857 190pw_3973x8: COEF_X8 3973 191pw_995x8: COEF_X8 995 192pw_m2106x8: COEF_X8 -2106 193pw_3513x8: COEF_X8 3513 194pw_3290x8: COEF_X8 3290 195pw_2440x8: COEF_X8 2440 196pw_m601x8: COEF_X8 -601 197pw_4052x8: COEF_X8 4052 198 199pw_401_4076x8: dw 401*8, 4076*8 200pw_m2598_3166x8: dw -2598*8, 3166*8 201pw_1931_3612x8: dw 1931*8, 3612*8 202pw_m1189_3920x8: dw -1189*8, 3920*8 203pw_799_4017x8: dw 799*8, 4017*8 204pw_m2276_3406x8: dw -2276*8, 3406*8 205 206pw_201_4091x8: dw 201*8, 4091*8 207pw_m601_4052x8: dw -601*8, 4052*8 208pw_995_3973x8: dw 995*8, 3973*8 209pw_m1380_3857x8: dw -1380*8, 3857*8 210pw_1751_3703x8: dw 1751*8, 3703*8 211pw_m2106_3513x8: dw -2106*8, 3513*8 212pw_2440_3290x8: dw 2440*8, 3290*8 213pw_m2751_3035x8: dw -2751*8, 3035*8 214 215pw_101_4095x8: dw 101*8, 4095*8 216pw_m2824_2967x8: dw -2824*8, 2967*8 217pw_1660_3745x8: dw 1660*8, 3745*8 218pw_m1474_3822x8: dw -1474*8, 3822*8 219pw_897_3996x8: dw 897*8, 3996*8 220pw_m2191_3461x8: dw -2191*8, 3461*8 221pw_2359_3349x8: dw 2359*8, 3349*8 222pw_m700_4036x8: dw -700*8, 4036*8 223pw_501_4065x8: dw 501*8, 4065*8 224pw_m2520_3229x8: dw -2520*8, 3229*8 225pw_2019_3564x8: dw 2019*8, 3564*8 226pw_m1092_3948x8: dw -1092*8, 3948*8 227pw_1285_3889x8: dw 1285*8, 3889*8 228pw_m1842_3659x8: dw -1842*8, 3659*8 229pw_2675_3102x8: dw 2675*8, 3102*8 230pw_m301_4085x8: dw -301*8, 4085*8 231 232idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474 233COEF_PAIR 401, 4076, 1 234COEF_PAIR 799, 4017 235 COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996 236dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017 237 COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092 238COEF_PAIR 1931, 3612, 1 239COEF_PAIR 3406, 2276 240 COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889 241dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276 242 243SECTION .text 244 245%define o_base int8_permA+64*18 246%define o(x) (r5 - (o_base) + (x)) 247%define m(x) mangle(private_prefix %+ _ %+ x %+ 
SUFFIX) 248 249; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, 250; 16 = special_mul1, 32 = special_mul2 251%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags 252 mova m%2, m%4 253%if %7 & 16 254 vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd} 255 mova m%3, m%4 256%if %7 & 32 257 vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} 258%else 259 vpdpwssd m%3, m%1, m%6 260%endif 261%elif %7 & 32 262 vpdpwssd m%2, m%1, m%5 263 mova m%3, m%4 264 vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} 265%elif %6 < 32 266 vpdpwssd m%2, m%1, m%5 267 mova m%3, m%4 268 vpdpwssd m%3, m%1, m%6 269%elif %7 & 1 270 vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd} 271 mova m%3, m%4 272 vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd} 273%else 274 vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd} 275 mova m%3, m%4 276 vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd} 277%endif 278%if %7 & 2 279 psrld m%2, 12 280 pslld m%3, 4 281 vpshrdd m%1, m%3, m%2, 16 282%elif %7 & 4 283 ; compared to using shifts (as above) this has better throughput, 284 ; but worse latency and requires setting up the opmask/index 285 ; registers, so only use this method for the larger transforms 286 pslld m%1, m%2, 4 287 vpmultishiftqb m%1{k7}, m13, m%3 288%else 289 psrad m%2, 12 290 psrad m%3, 12 291%if %7 & 8 == 0 292 packssdw m%1, m%3, m%2 293%endif 294%endif 295%endmacro 296 297; flags: same as ITX_MUL2X_PACK 298%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags 299%if %11 & 1 300 vpbroadcastd m%4, [o(pw_%9_%10)] 301 vpbroadcastd m%4{k1}, [o(pw_%7_%8)] 302 vpbroadcastd m%5, [o(pw_m%10_%9)] 303 vpbroadcastd m%5{k1}, [o(pw_m%8_%7)] 304%else 305 vpbroadcastd m%4, [o(pw_m%10_%9)] 306 vpbroadcastd m%4{k1}, [o(pw_m%8_%7)] 307 vpbroadcastd m%5, [o(pw_%9_%10)] 308 vpbroadcastd m%5{k1}, [o(pw_%7_%8)] 309%endif 310 ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11 311%endmacro 312 313; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 314; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 315%macro ITX_MULSUB_2W 
7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 316 punpcklwd m%3, m%2, m%1 317 punpckhwd m%2, m%1 318%if %7 < 32 319 mova m%1, m%5 320 vpdpwssd m%1, m%3, m%7 321 mova m%4, m%5 322 vpdpwssd m%4, m%2, m%7 323%else 324 mova m%1, m%5 325 vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd} 326 mova m%4, m%5 327 vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd} 328%endif 329 psrad m%1, 12 330 psrad m%4, 12 331 packssdw m%1, m%4 332 mova m%4, m%5 333%if %7 < 32 334 vpdpwssd m%4, m%2, m%6 335 mova m%2, m%5 336 vpdpwssd m%2, m%3, m%6 337%else 338 vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd} 339 mova m%2, m%5 340 vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd} 341%endif 342 psrad m%4, 12 343 psrad m%2, 12 344%if %0 == 8 345 packssdw m%8, m%2, m%4 346%else 347 packssdw m%2, m%4 348%endif 349%endmacro 350 351%macro WRAP_XMM 1+ 352 %xdefine %%reset RESET_MM_PERMUTATION 353 INIT_XMM cpuname 354 DEFINE_MMREGS xmm 355 AVX512_MM_PERMUTATION 356 %1 357 %%reset 358%endmacro 359 360%macro WRAP_YMM 1+ 361 INIT_YMM cpuname 362 %1 363 INIT_ZMM cpuname 364%endmacro 365 366%macro ITX4_END 4-5 2048 ; row[1-4], rnd 367%if %5 368 vpbroadcastd m2, [o(pw_%5)] 369 pmulhrsw m0, m2 370 pmulhrsw m1, m2 371%endif 372 lea r2, [dstq+strideq*2] 373%assign %%i 1 374%rep 4 375 %if %1 & 2 376 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) 377 %else 378 CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) 379 %endif 380 %assign %%i %%i + 1 381 %rotate 1 382%endrep 383 movd m2, [%%row_adr1] 384 pinsrd m2, [%%row_adr2], 1 385 movd m3, [%%row_adr3] 386 pinsrd m3, [%%row_adr4], 1 387 pmovzxbw m2, m2 388 pmovzxbw m3, m3 389 paddw m0, m2 390 paddw m1, m3 391 packuswb m0, m1 392 movd [%%row_adr1], m0 393 pextrd [%%row_adr2], m0, 1 394 pextrd [%%row_adr3], m0, 2 395 pextrd [%%row_adr4], m0, 3 396 ret 397%endmacro 398 399%macro INV_TXFM_FN 3 ; type1, type2, size 400cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base 401 %define %%p1 m(i%1_%3_internal_8bpc) 402 lea baseq, [o_base] 403 ; Jump to the 1st txfm function if 
we're not taking the fast path, which 404 ; in turn performs an indirect jump to the 2nd txfm function. 405 lea tx2q, [m(i%2_%3_internal_8bpc).pass2] 406%ifidn %1_%2, dct_dct 407 test eobd, eobd 408 jnz %%p1 409%else 410 ; jump to the 1st txfm function unless it's located directly after this 411 times ((%%end - %%p1) >> 31) & 1 jmp %%p1 412ALIGN function_align 413%%end: 414%endif 415%endmacro 416 417%macro INV_TXFM_4X4_FN 2 ; type1, type2 418 INV_TXFM_FN %1, %2, 4x4 419%ifidn %1_%2, dct_dct 420 vpbroadcastw m0, [cq] 421 vpbroadcastd m1, [o(pw_2896x8)] 422 pmulhrsw m0, m1 423 mov [cq], eobd 424 pmulhrsw m0, m1 425 mova m1, m0 426 jmp m(iadst_4x4_internal_8bpc).end2 427%endif 428%endmacro 429 430%macro IDCT4_1D_PACKED 0 431 vpbroadcastd m4, [o(pd_2048)] 432 punpckhwd m2, m1, m0 433 punpcklwd m1, m0 434 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 435 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 436 paddsw m0, m1, m2 ; out0 out1 437 psubsw m1, m2 ; out3 out2 438%endmacro 439 440%macro IADST4_1D_PACKED 0 441 punpcklwd m4, m1, m0 ; in2 in0 442 punpckhwd m5, m1, m0 ; in3 in1 443.main2: 444 vpbroadcastd m3, [o(pd_2048)] 445 mova m0, m3 446 vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd} 447 mova m2, m3 448 vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd} 449 mova m1, m3 450 vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd} 451 vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd} 452 vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd} 453 vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd} 454 vpdpwssd m1, m5, [o(pd_3344)] {bcstd} 455 vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd} 456 REPX {psrad x, 12}, m0, m2, m1, m3 457 packssdw m0, m2 ; out0 out1 458 packssdw m1, m3 ; out2 out3 459%endmacro 460 461INIT_XMM avx512icl 462INV_TXFM_4X4_FN dct, dct 463INV_TXFM_4X4_FN dct, adst 464INV_TXFM_4X4_FN dct, flipadst 465INV_TXFM_4X4_FN dct, identity 466 467cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 468 mova m0, [cq+16*0] 469 mova m1, [cq+16*1] 470 IDCT4_1D_PACKED 471 mova m2, [o(deint_shuf)] 472 shufps m3, m0, m1, 
q1331 473 shufps m0, m0, m1, q0220 474 pshufb m0, m2 475 pshufb m1, m3, m2 476 jmp tx2q 477.pass2: 478 IDCT4_1D_PACKED 479 pxor ymm16, ymm16 480 mova [cq], ymm16 481 ITX4_END 0, 1, 3, 2 482 483INV_TXFM_4X4_FN adst, dct 484INV_TXFM_4X4_FN adst, adst 485INV_TXFM_4X4_FN adst, flipadst 486INV_TXFM_4X4_FN adst, identity 487 488cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 489 mova m0, [cq+16*0] 490 mova m1, [cq+16*1] 491 call .main 492 punpckhwd m3, m0, m1 493 punpcklwd m0, m1 494 punpckhwd m1, m0, m3 495 punpcklwd m0, m3 496 jmp tx2q 497.pass2: 498 call .main 499.end: 500 pxor ymm16, ymm16 501 mova [cq], ymm16 502.end2: 503 ITX4_END 0, 1, 2, 3 504ALIGN function_align 505.main: 506 IADST4_1D_PACKED 507 ret 508 509INV_TXFM_4X4_FN flipadst, dct 510INV_TXFM_4X4_FN flipadst, adst 511INV_TXFM_4X4_FN flipadst, flipadst 512INV_TXFM_4X4_FN flipadst, identity 513 514cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 515 mova m0, [cq+16*0] 516 mova m1, [cq+16*1] 517 call m(iadst_4x4_internal_8bpc).main 518 punpcklwd m2, m1, m0 519 punpckhwd m1, m0 520 punpcklwd m0, m1, m2 521 punpckhwd m1, m2 522 jmp tx2q 523.pass2: 524 call m(iadst_4x4_internal_8bpc).main 525.end: 526 pxor ymm16, ymm16 527 mova [cq], ymm16 528.end2: 529 ITX4_END 3, 2, 1, 0 530 531INV_TXFM_4X4_FN identity, dct 532INV_TXFM_4X4_FN identity, adst 533INV_TXFM_4X4_FN identity, flipadst 534INV_TXFM_4X4_FN identity, identity 535 536cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 537 mova m0, [cq+16*0] 538 mova m1, [cq+16*1] 539 vpbroadcastd m3, [o(pw_1697x8)] 540 pmulhrsw m2, m3, m0 541 pmulhrsw m3, m1 542 paddsw m0, m2 543 paddsw m1, m3 544 punpckhwd m2, m0, m1 545 punpcklwd m0, m1 546 punpckhwd m1, m0, m2 547 punpcklwd m0, m2 548 jmp tx2q 549.pass2: 550 vpbroadcastd m3, [o(pw_1697x8)] 551 pmulhrsw m2, m3, m0 552 pmulhrsw m3, m1 553 paddsw m0, m2 554 paddsw m1, m3 555 jmp m(iadst_4x4_internal_8bpc).end 556 557%macro INV_TXFM_4X8_FN 2 ; type1, type2 558 
INV_TXFM_FN %1, %2, 4x8 559%ifidn %1_%2, dct_dct 560 movd xmm1, [o(pw_2896x8)] 561 pmulhrsw xmm0, xmm1, [cq] 562 movd xmm2, [o(pw_2048)] 563 pmulhrsw xmm0, xmm1 564 pmulhrsw xmm0, xmm1 565 pmulhrsw xmm0, xmm2 566 vpbroadcastw ym0, xmm0 567 mova ym1, ym0 568 jmp m(iadst_4x8_internal_8bpc).end3 569%endif 570%endmacro 571 572%macro IDCT8_1D_PACKED 0 573 punpckhwd m5, m3, m0 ; in7 in1 574 punpckhwd m4, m1, m2 ; in3 in5 575 punpcklwd m3, m1 ; in6 in2 576 punpcklwd m2, m0 ; in4 in0 577.main2: 578 vpbroadcastd m6, [o(pd_2048)] 579 ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a 580 ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a 581 ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 582 psubsw m0, m5, m4 ; t5a t6a (interleaved) 583 paddsw m4, m5 ; t4 t7 (interleaved) 584 ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 585 ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 586%if mmsize > 16 587 vbroadcasti32x4 m1, [o(deint_shuf)] 588 pshufb m4, m1 589%else 590 pshufb m4, [o(deint_shuf)] 591%endif 592 psubsw m1, m2, m3 ; tmp3 tmp2 593 paddsw m3, m2 ; tmp0 tmp1 594 punpckhqdq m2, m4, m0 ; t7 t6 595 punpcklqdq m4, m0 ; t4 t5 596 paddsw m0, m3, m2 ; out0 out1 597 psubsw m3, m2 ; out7 out6 598 psubsw m2, m1, m4 ; out4 out5 599 paddsw m1, m4 ; out3 out2 600%endmacro 601 602%macro IADST8_1D_PACKED 1 ; pass 603 vpbroadcastd m6, [o(pd_2048)] 604%if %1 == 1 605 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a 606 ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a 607 ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a 608 ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a 609 psubsw m4, m0, m2 ; t5 t4 610 paddsw m0, m2 ; t1 t0 611 psubsw m5, m1, m3 ; t6 t7 612 paddsw m1, m3 ; t2 t3 613 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a 614 ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a 615%if mmsize > 16 616 vbroadcasti32x4 m2, [o(deint_shuf)] 617%else 618 mova m2, [o(deint_shuf)] 619%endif 620 vprord m1, 16 621 psubsw m3, m0, m1 ; t3 t2 622 paddsw m0, m1 ; -out7 out0 623 psubsw 
m1, m4, m5 ; t7 t6 624 paddsw m4, m5 ; out6 -out1 625 pshufb m0, m2 626 pshufb m4, m2 627 mova m2, m6 628 vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} 629 mova m5, m6 630 vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} 631 psrad m2, 12 632 psrad m5, 12 633 packssdw m2, m5 ; out4 -out5 634 mova m5, m6 635 vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} 636 mova m3, m6 637 vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} 638 psrad m5, 12 639 psrad m3, 12 640 packssdw m1, m3, m5 ; out2 -out3 641%else 642 punpckhwd m0, m4, m3 ; 0 7 643 punpckhwd m1, m5, m2 ; 2 5 644 punpcklwd m2, m5 ; 4 3 645 punpcklwd m3, m4 ; 6 1 646 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a 647 ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a 648 ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a 649 ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a 650 psubsw m4, m0, m2 ; t4 t5 651 paddsw m0, m2 ; t0 t1 652 psubsw m5, m1, m3 ; t6 t7 653 paddsw m1, m3 ; t2 t3 654 shufps m2, m5, m4, q1032 655 punpckhwd m4, m2 656 punpcklwd m5, m2 657 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a 658 ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a 659 psubsw m2, m0, m1 ; t2 t3 660 paddsw m0, m1 ; out0 -out7 661 psubsw m1, m4, m5 ; t6 t7 662 paddsw m4, m5 ; -out1 out6 663 vpbroadcastd m5, [o(pw_2896x8)] 664 punpckhqdq m3, m2, m1 ; t3 t7 665 punpcklqdq m2, m1 ; t2 t6 666 paddsw m1, m2, m3 ; t2+t3 t6+t7 667 psubsw m2, m3 ; t2-t3 t6-t7 668 punpckhqdq m3, m4, m0 ; out6 -out7 669 punpcklqdq m0, m4 ; out0 -out1 670 pmulhrsw m2, m5 ; out4 -out5 671 pshufd m1, m1, q1032 672 pmulhrsw m1, m5 ; out2 -out3 673%endif 674%endmacro 675 676INIT_YMM avx512icl 677INV_TXFM_4X8_FN dct, dct 678INV_TXFM_4X8_FN dct, identity 679INV_TXFM_4X8_FN dct, adst 680INV_TXFM_4X8_FN dct, flipadst 681 682cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 683 vpermq m0, [cq+32*0], q3120 684 vpermq m1, [cq+32*1], q3120 685 vpbroadcastd m2, [o(pw_2896x8)] 686 pmulhrsw m0, m2 687 pmulhrsw m1, m2 688 IDCT4_1D_PACKED 689 vbroadcasti32x4 m2, [o(deint_shuf)] 
690 shufps m3, m0, m1, q1331 691 shufps m0, m0, m1, q0220 692 pshufb m0, m2 693 pshufb m1, m3, m2 694 jmp tx2q 695.pass2: 696 vextracti32x4 xm2, m0, 1 697 vextracti32x4 xm3, m1, 1 698 call .main 699 vpbroadcastd m4, [o(pw_2048)] 700 vinserti32x4 m0, m0, xm2, 1 701 vinserti32x4 m1, m1, xm3, 1 702 pshufd m1, m1, q1032 703 jmp m(iadst_4x8_internal_8bpc).end2 704ALIGN function_align 705.main: 706 WRAP_XMM IDCT8_1D_PACKED 707 ret 708 709INV_TXFM_4X8_FN adst, dct 710INV_TXFM_4X8_FN adst, adst 711INV_TXFM_4X8_FN adst, flipadst 712INV_TXFM_4X8_FN adst, identity 713 714cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 715 vpermq m0, [cq+32*0], q3120 716 vpermq m1, [cq+32*1], q3120 717 vpbroadcastd m2, [o(pw_2896x8)] 718 pmulhrsw m0, m2 719 pmulhrsw m1, m2 720 call m(iadst_8x4_internal_8bpc).main 721 punpckhwd m3, m0, m1 722 punpcklwd m0, m1 723 punpckhwd m1, m0, m3 724 punpcklwd m0, m3 725 jmp tx2q 726.pass2: 727 vextracti32x4 xm2, m0, 1 728 vextracti32x4 xm3, m1, 1 729 pshufd xm4, xm0, q1032 730 pshufd xm5, xm1, q1032 731 call .main_pass2 732 vpbroadcastd m4, [o(pw_2048)] 733 vinserti32x4 m0, xm2, 1 734 vinserti32x4 m1, xm3, 1 735 pxor m5, m5 736 psubw m5, m4 737.end: 738 punpcklqdq m4, m5 739.end2: 740 pmulhrsw m0, m4 741 pmulhrsw m1, m4 742.end3: 743 vpbroadcastd m3, strided 744 pmulld m5, m3, [o(pd_0to15)] 745 kxnorb k1, k1, k1 746 kmovb k2, k1 747 vpgatherdd m3{k1}, [dstq+m5] 748 pxor m4, m4 749 mova [cq], zmm20 750 punpcklbw m2, m3, m4 751 punpckhbw m3, m4 752 paddw m0, m2 753 paddw m1, m3 754 packuswb m0, m1 755 vpscatterdd [dstq+m5]{k2}, m0 756 RET 757ALIGN function_align 758.main_pass1: 759 punpckhwd xm0, xm4, xm3 ; 0 7 760 punpckhwd xm1, xm5, xm2 ; 2 5 761 punpcklwd xm2, xm5 ; 4 3 762 punpcklwd xm3, xm4 ; 6 1 763 WRAP_XMM IADST8_1D_PACKED 1 764 punpcklqdq xm3, xm4, xm0 ; out6 -out7 765 punpckhqdq xm0, xm4 ; out0 -out1 766 ret 767ALIGN function_align 768.main_pass2: 769 WRAP_XMM IADST8_1D_PACKED 2 770 ret 771 772INV_TXFM_4X8_FN flipadst, dct 
773INV_TXFM_4X8_FN flipadst, adst 774INV_TXFM_4X8_FN flipadst, flipadst 775INV_TXFM_4X8_FN flipadst, identity 776 777cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 778 vpermq m0, [cq+32*0], q3120 779 vpermq m1, [cq+32*1], q3120 780 vpbroadcastd m2, [o(pw_2896x8)] 781 pmulhrsw m0, m2 782 pmulhrsw m1, m2 783 call m(iadst_8x4_internal_8bpc).main 784 punpcklwd m3, m1, m0 785 punpckhwd m1, m0 786 punpcklwd m0, m1, m3 787 punpckhwd m1, m3 788 jmp tx2q 789.pass2: 790 vextracti32x4 xm2, m0, 1 791 vextracti32x4 xm3, m1, 1 792 pshufd xm4, xm0, q1032 793 pshufd xm5, xm1, q1032 794 call m(iadst_4x8_internal_8bpc).main_pass2 795 vpbroadcastd m5, [o(pw_2048)] 796 vinserti32x4 m3, xm1, 1 797 vinserti32x4 m2, xm0, 1 798 pxor m4, m4 799 psubw m4, m5 800 pshufd m0, m3, q1032 801 pshufd m1, m2, q1032 802 jmp m(iadst_4x8_internal_8bpc).end 803 804INIT_ZMM avx512icl 805INV_TXFM_4X8_FN identity, dct 806INV_TXFM_4X8_FN identity, adst 807INV_TXFM_4X8_FN identity, flipadst 808INV_TXFM_4X8_FN identity, identity 809 810cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 811 vpbroadcastd m0, [o(pw_2896x8)] 812 pmulhrsw m0, [cq] 813 mova m1, [o(int8_permB)] 814 vpbroadcastd m2, [o(pw_1697x8)] 815 vpermb m0, m1, m0 816 pmulhrsw m2, m0 817 paddsw m0, m2 818 vextracti32x8 ym1, m0, 1 819 jmp tx2q 820.pass2: 821 vpbroadcastd ym4, [o(pw_4096)] 822 jmp m(iadst_4x8_internal_8bpc).end2 823 824%macro INV_TXFM_4X16_FN 2 ; type1, type2 825 INV_TXFM_FN %1, %2, 4x16 826%ifidn %1_%2, dct_dct 827 movsx r6d, word [cq] 828 mov [cq], eobd 829 imul r6d, 181 830 add r6d, 128+256 831 sar r6d, 8+1 832 imul r6d, 181 833 add r6d, 128+2048 834 sar r6d, 8+4 835 vpbroadcastw m0, r6d 836 mova m1, m0 837 jmp m(iadst_4x16_internal_8bpc).end3 838%endif 839%endmacro 840 841%macro IDCT16_1D_PACKED 0 842 punpckhwd m8, m7, m0 ; dct16 in15 in1 843 punpcklwd m9, m4, m0 ; dct4 in2 in0 844 punpckhwd m0, m3, m4 ; dct16 in7 in9 845 punpcklwd m7, m1 ; dct8 in7 in1 846 punpckhwd m1, m6 ; dct16 
in3 in13 847 punpcklwd m3, m5 ; dct8 in3 in5 848 punpckhwd m5, m2 ; dct16 in11 in5 849 punpcklwd m6, m2 ; dct4 in3 in1 850cglobal_label .main2 851 vpbroadcastd m10, [o(pd_2048)] 852.main3: 853 vpbroadcastq m13, [o(int_mshift)] 854 vpcmpub k7, m13, m10, 6 ; 0x33... 855 ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a 856 ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a 857 ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a 858 ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a 859 ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a 860 ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a 861.main4: 862 psubsw m2, m8, m0 ; t9 t14 863 paddsw m8, m0 ; t8 t15 864 psubsw m4, m1, m5 ; t10 t13 865 paddsw m1, m5 ; t11 t12 866 ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2 867 psubsw m0, m8, m1 ; t11a t12a 868 paddsw m8, m1 ; t8a t15a 869 psubsw m1, m7, m3 ; t5a t6a 870 paddsw m7, m3 ; t4 t7 871.main5: 872 ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a 873 ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a 874%if mmsize > 16 875 vbroadcasti32x4 m5, [o(deint_shuf)] 876%else 877 mova m5, [o(deint_shuf)] 878%endif 879 vpbroadcastd m11, [o(pw_m2896_2896)] 880 vpbroadcastd m12, [o(pw_2896_2896)] 881 paddsw m3, m2, m4 ; t9 t14 882 psubsw m2, m4 ; t10 t13 883 pshufb m8, m5 884 pshufb m7, m5 885 pshufb m3, m5 886 ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1 887 ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6 888 ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 889 ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a 890 punpckhqdq m2, m7, m1 ; t7 t6 891 punpcklqdq m7, m1 ; t4 t5 892 psubsw m1, m9, m6 ; dct4 out3 out2 893 paddsw m9, m6 ; dct4 out0 out1 894 packssdw m5, m11 ; t12 t13a 895 packssdw m4, m0 ; t11 t10a 896 punpckhqdq m0, m8, m3 ; t15a t14 897 punpcklqdq m8, m3 ; t8a t9 898 psubsw m3, m9, m2 ; dct8 out7 out6 899 paddsw m9, m2 ; dct8 out0 out1 900 psubsw m2, m1, m7 ; dct8 out4 out5 901 paddsw m1, m7 ; dct8 out3 out2 902 psubsw m7, m9, m0 ; out15 out14 903 paddsw 
m0, m9 ; out0 out1 904 psubsw m6, m1, m5 ; out12 out13 905 paddsw m1, m5 ; out3 out2 906 psubsw m5, m2, m4 ; out11 out10 907 paddsw m2, m4 ; out4 out5 908 psubsw m4, m3, m8 ; out8 out9 909 paddsw m3, m8 ; out7 out6 910%endmacro 911 912INV_TXFM_4X16_FN dct, dct 913INV_TXFM_4X16_FN dct, identity 914INV_TXFM_4X16_FN dct, adst 915INV_TXFM_4X16_FN dct, flipadst 916 917cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 918 mova ym1, [cq+32*2] 919 vinserti32x8 m1, [cq+32*0], 1 920 mova m0, [o(int16_perm)] 921 mova ym2, [cq+32*3] 922 vinserti32x8 m2, [cq+32*1], 1 923 vpbroadcastd m4, [o(pd_2048)] 924 vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3 925 vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3 926 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2 927 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2 928 vpbroadcastd m4, [o(pw_16384)] 929 psubsw m3, m1, m2 930 paddsw m1, m2 ; out0 out1 931 vprord m3, 16 ; out2 out3 932 punpckldq m0, m1, m3 933 punpckhdq m1, m3 934 pmulhrsw m0, m4 935 pmulhrsw m1, m4 936 jmp tx2q 937.pass2: 938 vextracti32x4 xm2, ym0, 1 939 vextracti32x4 xm3, ym1, 1 940 vextracti32x4 xm4, m0, 2 941 vextracti32x4 xm5, m1, 2 942 vextracti32x4 xm6, m0, 3 943 vextracti32x4 xm7, m1, 3 944 call .main 945 vinserti32x4 ym0, xm2, 1 946 vinserti32x4 ym1, xm3, 1 947 vinserti32x4 ym4, xm6, 1 948 vinserti32x4 ym5, xm7, 1 949 vinserti32x8 m0, ym4, 1 950 vinserti32x8 m1, ym5, 1 951 vpbroadcastd m5, [o(pw_2048)] 952 pshufd m1, m1, q1032 953 jmp m(iadst_4x16_internal_8bpc).end2 954ALIGN function_align 955.main: 956 WRAP_XMM IDCT16_1D_PACKED 957 ret 958 959INV_TXFM_4X16_FN adst, dct 960INV_TXFM_4X16_FN adst, adst 961INV_TXFM_4X16_FN adst, flipadst 962INV_TXFM_4X16_FN adst, identity 963 964cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 965 mova m1, [o(permB)] 966 vpermq m0, m1, [cq+64*0] 967 vpermq m1, m1, [cq+64*1] 968 call m(iadst_16x4_internal_8bpc).main 969 vpbroadcastd m3, [o(pw_16384)] 970 punpckhwd m2, m0, m1 971 punpcklwd m0, m1 972 pmulhrsw m2, m3 973 
pmulhrsw m0, m3 974 punpckhwd m1, m0, m2 975 punpcklwd m0, m2 976 jmp tx2q 977.pass2: 978 call .main 979 vpbroadcastd m5, [o(pw_2048)] 980 psrlq m10, 4 981 psubw m6, m8, m5 982.end: 983 vpbroadcastd m7, [o(pw_2896x8)] 984 paddsw ym1, ym2, ym4 985 psubsw ym2, ym4 986 vinserti32x8 m1, ym2, 1 987 pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10 988 psrlq m0, m10, 4 989 vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d 990 vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f 991 punpcklqdq m5, m6 992.end2: 993 pmulhrsw m0, m5 994 pmulhrsw m1, m5 995.end3: 996 vpbroadcastd m3, strided 997 pmulld m5, m3, [o(pd_0to15)] 998 kxnorw k1, k1, k1 999 kmovw k2, k1 1000 vpgatherdd m3{k1}, [dstq+m5] 1001 pxor m4, m4 1002 mova [cq+64*0], m4 1003 mova [cq+64*1], m4 1004 punpcklbw m2, m3, m4 1005 punpckhbw m3, m4 1006 paddw m0, m2 1007 paddw m1, m3 1008 packuswb m0, m1 1009 vpscatterdd [dstq+m5]{k2}, m0 1010 RET 1011ALIGN function_align 1012.main: 1013 movu m3, [o(permB+1)] 1014 psrlq m10, m3, 4 1015.main2: 1016 vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 1017 vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 1018 vpbroadcastd m9, [o(pd_2048)] 1019 vpbroadcastq ym13, [o(int_mshift)] 1020 kxnorb k1, k1, k1 1021 punpckhwd m4, m3, m0 ; in12 in3 in14 in1 1022 punpcklwd m0, m3 ; in0 in15 in2 in13 1023 kshiftrb k1, k1, 4 1024 vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 1025 vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 1026INIT_YMM avx512icl 1027 vpcmpub k7, m13, m9, 6 ; 0x33... 
; NOTE(review): the lines below are the tail of a 16-point ADST main routine
; whose entry label lies above this chunk — presumably the shared 4x16 ADST
; helper called as m(iadst_4x16_internal_8bpc).main* by the functions further
; down; confirm against the full file.
    pxor                 m8, m8
    ITX_MUL4X_PACK        0, 2, 5, 6, 7, 9,  201, 4091,  995, 3973, 5
    ITX_MUL4X_PACK        1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
    ITX_MUL4X_PACK        3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK        4, 2, 5, 6, 7, 9, 3857, 1380, 4052,  601, 5
    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
    paddsw               m4, m1     ; t5a  t4a  t7a  t6a
    ITX_MUL4X_PACK        2, 1, 5, 6, 7, 9,  799, 4017, 3406, 2276, 5
    psubw                m7, m8, m7 ; negate odd-lane coefficients for the 2x pack
    ITX_MUL2X_PACK        3, 1, 5, 9, 7, 6, 4
    vpbroadcastd         m6, [o(pw_3784_m1567)]
    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
    psubsw               m1, m0, m4 ; t5   t4   t7   t6
    paddsw               m0, m4     ; t1   t0   t3   t2
    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
    vbroadcasti32x4      m5, [o(deint_shuf)]
    pshufb               m0, m5
    pshufb               m2, m5
    vshufi32x4           m3, m0, m2, 0x03 ; t3  t2  t11a t10a
    vinserti32x4         m0, xm2, 1       ; t1  t0  t9a  t8a
    vshufi32x4           m2, m1, m4, 0x03 ; t7a t6a t15  t14
    vinserti32x4         m1, xm4, 1       ; t4a t5a t12  t13
    pshufd               m2, m2, q1032    ; t6a t7a t14  t15
    psubsw               m4, m0, m3 ; t3a t2a t11 t10
    paddsw               m0, m3     ; -out15  out0   out14 -out1
    paddsw               m3, m1, m2 ;  out12 -out3  -out13  out2
    psubsw               m1, m2     ; t7  t6  t15a t14a
    punpckhqdq           m2, m4, m1 ; t2a t6  t10  t14a
    punpcklqdq           m4, m1     ; t3a t7  t11  t15a
INIT_ZMM avx512icl
    vinserti32x8         m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1
    ret

INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

; 4x16 inverse flipped-ADST, 8bpc. Pass 1 reuses the 16x4 ADST main and then
; swaps/interleaves the halves to realize the flip; .pass2 defers to the 4x16
; ADST main and its shared store path.
cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m1, [o(permB)]
    vpermq               m0, m1, [cq+64*0]
    vpermq               m1, m1, [cq+64*1]
    call m(iadst_16x4_internal_8bpc).main
    vpbroadcastd         m3, [o(pw_16384)]
    punpcklwd            m2, m1, m0 ; operand order reversed vs. adst = row flip
    punpckhwd            m1, m0
    pmulhrsw             m2, m3
    pmulhrsw             m1, m3
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    jmp                tx2q
.pass2:
    call m(iadst_4x16_internal_8bpc).main
    vpbroadcastd         m6, [o(pw_2048)]
    psrlq               m10, 12 ; different output permutation than adst's .pass2
    psubw                m5, m8, m6
    jmp m(iadst_4x16_internal_8bpc).end

INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

; 4x16 inverse identity transform, 8bpc. Pass 1 scales by 1697/2048 via
; pmulhrsw and uses a masked pavgw to emulate a signed average (see comments).
cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m2, [o(int16_perm)]
    vpermb               m1, m2, [cq+64*0]
    vpermb               m2, m2, [cq+64*1]
    vpbroadcastd         m4, [o(pw_1697x8)]
    vpbroadcastd         m0, [o(pd_m1)]
    pmulhrsw             m3, m4, m1    ; we want to do a signed avg, but pavgw is
    vpcmpw               k1, m1, m0, 4 ; unsigned. as long as both signs are equal
    pmulhrsw             m4, m2        ; it still works, but if the input is -1 the
    vpcmpw               k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
    vpavgw        m1{k1}{z}, m3        ; pavgw to output -32768 instead of 0 unless
    vpavgw        m2{k2}{z}, m4        ; we explicitly deal with that case here.
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x16)]
    vpbroadcastd         m5, [o(pw_2048)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m0 ; identity_16 scale: 2*x + round(x*1697/1024)
    paddsw               m1, m1
    paddsw               m0, m2
    paddsw               m1, m3
    jmp m(iadst_4x16_internal_8bpc).end2

; Add two packed-word coefficient rows (%1, %2) to 8x4 dst pixels and store
; with unsigned saturation. %3/%4 are temporaries; %5-%7 are row offsets
; (defaulting to strideq*1, strideq*2, r6 = strideq*3).
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
    movq               xm%3, [dstq   ]
    movhps             xm%3, [dstq+%5]
    movq               xm%4, [dstq+%6]
    movhps             xm%4, [dstq+%7]
    pmovzxbw            m%3, xm%3
    pmovzxbw            m%4, xm%4
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vextracti32x4      xm%4, m%3, 1
    movq          [dstq   ], xm%3
    movhps        [dstq+%6], xm%3
    movq          [dstq+%5], xm%4
    movhps        [dstq+%7], xm%4
%endmacro

; Declare an 8x4 transform entry point; for dct_dct, inline the DC-only fast
; path (three 2896/4096 scalings + 2048 rounding, broadcast to both rows).
%macro INV_TXFM_8X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x4
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_2048)]
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    jmp m(iadst_8x4_internal_8bpc).end3
%endif
%endmacro

INIT_YMM avx512icl
INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
INV_TXFM_8X4_FN dct, identity

; 8x4 inverse DCT, 8bpc. Pass 1: 4-point DCT on rows (via the 4x8 main)
; followed by a transpose; .pass2: packed 4-point DCT on columns.
cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd        xm3, [o(pw_2896x8)]
    pmulhrsw            xm0, xm3, [cq+16*0]
    pmulhrsw            xm1, xm3, [cq+16*1]
    pmulhrsw            xm2, xm3, [cq+16*2]
    pmulhrsw            xm3, [cq+16*3]
    call m(idct_4x8_internal_8bpc).main
    vbroadcasti32x4      m4, [o(deint_shuf)]
    vinserti32x4         m3, m1, xm3, 1
    vinserti32x4         m1, m0, xm2, 1
    shufps               m0, m1, m3, q0220
    shufps               m1, m3, q1331
    pshufb               m0, m4
    pshufb               m1, m4
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    jmp m(iadst_8x4_internal_8bpc).end2

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

; 8x4 inverse ADST, 8bpc. Hosts the shared .end/.end2/.end3 store path used
; by the other 8x4 variants and by the INV_TXFM_8X4_FN dconly path.
cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pshufd              xm4, [cq+16*0], q1032
    pmulhrsw            xm3, xm0, [cq+16*3]
    pshufd              xm5, [cq+16*1], q1032
    pmulhrsw            xm2, xm0, [cq+16*2]
    pmulhrsw            xm4, xm0
    pmulhrsw            xm5, xm0
    call m(iadst_4x8_internal_8bpc).main_pass1
    vinserti32x4         m0, xm2, 1
    vinserti32x4         m1, xm3, 1
    pxor                 m3, m3
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    psubsw               m3, m2 ; negate odd outputs (ADST sign pattern)
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call .main
.end:
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q3120
.end2:
    vpbroadcastd         m2, [o(pw_2048)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
.end3:
    pxor                 m2, m2
    ; With INIT_YMM avx512icl, x86inc maps m2 to (y/z)mm18, so the pxor above
    ; zeroes all of zmm18 and this single store clears the whole 64-byte
    ; coefficient buffer.
    mova               [cq], zmm18
    lea                  r6, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

; 8x4 inverse flipped-ADST, 8bpc: same math as iadst_8x4 with the output
; row/column order reversed via the interleave and vpermq patterns.
cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pshufd              xm4, [cq+16*0], q1032
    pmulhrsw            xm3, xm0, [cq+16*3]
    pshufd              xm5, [cq+16*1], q1032
    pmulhrsw            xm2, xm0, [cq+16*2]
    pmulhrsw            xm4, xm0
    pmulhrsw            xm5, xm0
    call m(iadst_4x8_internal_8bpc).main_pass1
    vinserti32x4         m3, m3, xm1, 1
    vinserti32x4         m2, m2, xm0, 1
    punpckhwd            m1, m3, m2
    punpcklwd            m3, m2
    pxor                 m0, m0
    psubsw               m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call m(iadst_8x4_internal_8bpc).main
    mova                 m2, m1
    vpermq               m1, m0, q2031 ; swap + reverse rows for the flip
    vpermq               m0, m2, q2031
    jmp m(iadst_8x4_internal_8bpc).end2

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; 8x4 inverse identity transform, 8bpc (x2 scale in pass 1 via paddsw,
; 1697/4096 correction in pass 2).
cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm2, [cq+16*0]
    mova                xm0, [cq+16*1]
    vinserti32x4         m2, [cq+16*2], 1
    vinserti32x4         m0, [cq+16*3], 1
    vpbroadcastd         m3, [o(pw_2896x8)]
    punpcklwd            m1, m2, m0
    punpckhwd            m2, m0
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    paddsw               m0, m0
    paddsw               m1, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    jmp m(iadst_8x4_internal_8bpc).end

; Declare an 8x8 transform entry point; the dct_dct case inlines a DC-only
; path with gather/scatter dst access (.dconly/.dconly2 are shared jump
; targets for other sizes). r3d holds the remaining row count.
%macro INV_TXFM_8X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x8
%ifidn %1_%2, dct_dct
INIT_ZMM avx512icl
    movsx               r6d, word [cq]
    mov                [cq], eobd
.dconly:
    imul                r6d, 181 ; 181/256 ~= 1/sqrt(2)
    add                 r6d, 128+256
    sar                 r6d, 8+1
.dconly2:
    vpbroadcastd        ym2, strided
    imul                r6d, 181
    pmulld              ym5, ym2, [o(pd_0to15)] ; per-row byte offsets
    kxnorb               k1, k1, k1
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    pxor                 m3, m3
    vpbroadcastw         m4, r6d
.dconly_loop:
    kmovb                k2, k1
    vpgatherdq       m2{k1}, [dstq+ym5] ; gather 8 dst rows at once
    punpcklbw            m0, m2, m3
    punpckhbw            m1, m2, m3
    paddw                m0, m4
    paddw                m1, m4
    packuswb             m0, m1
    kmovb                k1, k2
    vpscatterdq [dstq+ym5]{k2}, m0
    lea                dstq, [dstq+strideq*8]
    sub                 r3d, 8
    jg .dconly_loop
    RET
INIT_YMM avx512icl
%endif
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

; 8x8 inverse DCT, 8bpc: packed 8-point DCT per pass with a transpose
; (shufps + pshufb + lane shuffles) between passes.
cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120 ; 0 1
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    vpermq               m1, [cq+32*1], q3120 ; 2 3
    call .main
    shufps               m4, m0, m1, q0220
    shufps               m5, m0, m1, q1331
    shufps               m1, m2, m3, q0220
    shufps               m3, m2, m3, q1331
    vbroadcasti32x4      m0, [o(deint_shuf)]
    vpbroadcastd         m2, [o(pw_16384)]
    REPX    {pshufb x, m0}, m4, m5, m1, m3
    REPX  {pmulhrsw x, m2}, m4, m5, m1, m3
    vinserti32x4         m0, m4, xm1, 1
    vshufi32x4           m2, m4, m1, 0x03
    vinserti32x4         m1, m5, xm3, 1
    vshufi32x4           m3, m5, m3, 0x03
    jmp                tx2q
.pass2:
    call .main
    vpbroadcastd         m4, [o(pw_2048)]
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    vpermq               m2, m2, q3120
    vpermq               m3, m3, q2031
    jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret

INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

; 8x8 inverse ADST, 8bpc. Hosts the shared .end* store/clear path and the
; .main_pass1/.main_pass2 helpers reused by iflipadst_8x8.
cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call .main_pass1
    vpbroadcastd         m5, [o(pw_16384_m16384)] ; +scale / -scale halves
    punpcklwd            m4, m0, m1
    punpckhwd            m0, m1
    punpcklwd            m1, m2, m3
    punpckhwd            m2, m3
    punpcklwd            m3, m4, m0
    punpckhwd            m4, m0
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    REPX  {pmulhrsw x, m5}, m3, m4, m0, m1
    vshufi32x4           m2, m3, m0, 0x03
    vinserti32x4         m0, m3, xm0, 1
    vshufi32x4           m3, m4, m1, 0x03
    vinserti32x4         m1, m4, xm1, 1
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call .main_pass2
    vpbroadcastd         m5, [o(pw_2048)]
    vpbroadcastd        xm4, [o(pw_4096)]
    psubw                m4, m5 ; lower half = 2048, upper half = -2048
.end:
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
.end2:
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
.end3:
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
.end4:
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
    lea                  r6, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             2, 3, 4, 5
    RET
ALIGN function_align
.main_pass1:
    punpckhwd            m0, m4, m3 ; 0 7
    punpckhwd            m1, m5, m2 ; 2 5
    punpcklwd            m2, m5     ; 4 3
    punpcklwd            m3, m4     ; 6 1
    IADST8_1D_PACKED      1
    punpcklqdq           m3, m4, m0 ; out6 -out7
    punpckhqdq           m0, m4     ; out0 -out1
    ret
ALIGN function_align
cglobal_label .main_pass2
    IADST8_1D_PACKED      2
    ret

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

; 8x8 inverse flipped-ADST, 8bpc: reuses iadst_8x8's main passes and
; reverses the output ordering (note the mirrored m16384/16384 constant and
; q2031 permutes).
cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call m(iadst_8x8_internal_8bpc).main_pass1
    vpbroadcastd         m5, [o(pw_m16384_16384)]
    punpckhwd            m4, m3, m2
    punpcklwd            m3, m2
    punpckhwd            m2, m1, m0
    punpcklwd            m1, m0
    punpckhwd            m0, m4, m3
    punpcklwd            m4, m3
    punpckhwd            m3, m2, m1
    punpcklwd            m2, m1
    REPX  {pmulhrsw x, m5}, m0, m4, m3, m2
    vinserti32x4         m1, m0, xm3, 1
    vshufi32x4           m3, m0, m3, 0x03
    vinserti32x4         m0, m4, xm2, 1
    vshufi32x4           m2, m4, m2, 0x03
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call m(iadst_8x8_internal_8bpc).main_pass2
    vpbroadcastd         m4, [o(pw_2048)]
    vpbroadcastd        xm5, [o(pw_4096)]
    psubw                m4, m5 ; lower half = -2048, upper half = 2048
    vpermq               m5, m3, q2031
    vpermq               m3, m0, q2031
    vpermq               m0, m2, q2031
    vpermq               m2, m1, q2031
    pmulhrsw             m1, m0, m4
    pmulhrsw             m0, m5, m4
    jmp m(iadst_8x8_internal_8bpc).end3

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; 8x8 inverse identity transform, 8bpc: pass 1 is a pure transpose; pass 2
; just scales by 4096 via the shared iadst_8x8 .end path.
cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm3, [cq+16*0]
    mova                xm2, [cq+16*1]
    vinserti32x4         m3, [cq+16*4], 1
    vinserti32x4         m2, [cq+16*5], 1
    mova                xm4, [cq+16*2]
    mova                xm0, [cq+16*3]
    vinserti32x4         m4, [cq+16*6], 1
    vinserti32x4         m0, [cq+16*7], 1
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_4096)]
    jmp m(iadst_8x8_internal_8bpc).end

; Declare an 8x16 transform entry point; dct_dct reuses the 8x8 DC-only
; loop with r3d preset to 16 rows.
%macro INV_TXFM_8X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x16
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 16
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro

; Load 8 coefficient rows into m0-m7, each prescaled by 2896/4096.
%macro ITX_8X16_LOAD_COEFS 0
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m0, m4, [cq+32*0]
    add                  cq, 32*4
    pmulhrsw             m7, m4, [cq+32*3]
    pmulhrsw             m1, m4, [cq-32*3]
    pmulhrsw             m6, m4, [cq+32*2]
    pmulhrsw             m2, m4, [cq-32*2]
    pmulhrsw             m5, m4, [cq+32*1]
    pmulhrsw             m3, m4, [cq-32*1]
    pmulhrsw             m4, [cq+32*0]
%endmacro

INIT_ZMM avx512icl
INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

; 8x16 inverse DCT, 8bpc. Pass 2 runs a 16-point DCT in ymm registers; the
; shared .end*/.end4 path writes out via dword gathers/scatters. The
; .main_fast2/.main_fast variants shortcut when only the top quarter/half of
; the coefficients are nonzero (they jump into the full IDCT16 at .main5 /
; .main4, labels provided by IDCT16_1D_PACKED above this chunk — presumably;
; confirm against the full file).
cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m3, [o(permB)]
    vpermq               m0, m3, [cq+64*0]
    vpbroadcastd         m4, [o(pw_2896x8)]
    vpermq               m1, m3, [cq+64*1]
    vpermq               m2, m3, [cq+64*2]
    vpermq               m3, m3, [cq+64*3]
    REPX  {pmulhrsw x, m4}, m0, m1, m2, m3
    call m(idct_16x8_internal_8bpc).main
    vpbroadcastd         m5, [o(pw_16384)]
    punpckhwd            m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd            m0, m2     ; a0 e0 a1 e1 a2 e2 a3 e3
    punpckhwd            m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
    punpcklwd            m1, m3     ; d0 h0 d1 h1 d2 h2 d3 h3
    REPX  {pmulhrsw x, m5}, m4, m0, m2, m1
    punpckhwd            m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
    punpcklwd            m0, m4     ; a0 b0 e0 f0 a1 b1 e1 f1
    punpckhwd            m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
    punpcklwd            m2, m1     ; c0 d0 g0 h0 c1 d1 g1 h1
    punpckhdq            m1, m0, m2 ; 1 5  9 13
    punpckldq            m0, m2     ; 0 4  8 12
    punpckldq            m2, m3, m4 ; 2 6 10 14
    punpckhdq            m3, m4     ; 3 7 11 15
    jmp                tx2q
.pass2:
    vprord               m5, [o(int16_perm)], 16
    vshufi32x4           m2, m2, q1320     ;  2 10 14  6
    vshufi32x4           m4, m1, m3, q2310 ;  1  5 15 11
    vshufi32x4           m1, m3, q0132     ;  9 13  7  3
    vpermb               m9, m5, m0
    vpermb               m7, m5, m2
    vpermb               m8, m5, m4
    vpermb               m0, m5, m1
    vextracti32x8       ym6, m9, 1
    vextracti32x8       ym3, m7, 1
    vextracti32x8       ym5, m8, 1
    vextracti32x8       ym1, m0, 1
    call .main2
    mova                ym8, [o(gather8a)]
    lea                  r3, [dstq+strideq*4]
    pmovzxdq             m9, ym8
    pshufd              ym8, ym8, q1230
    vpermt2q             m0, m9, m4
    vpermt2q             m1, m9, m5
    vpermt2q             m2, m9, m6
    vpermt2q             m3, m9, m7
.end:
    vpbroadcastd         m7, [o(pw_2048)]
.end2:
    pmulhrsw             m0, m7
    pmulhrsw             m1, m7
.end3:
    pmulhrsw             m2, m7
    pmulhrsw             m3, m7
.end4:
    vpbroadcastd        ym6, strided
    kxnorb               k1, k1, k1
    pxor                 m4, m4
    pmulld              ym8, ym6 ; gather offsets = index * stride
    kmovb                k2, k1
    vpgatherdq       m6{k1}, [dstq+ym8]
    kmovb                k1, k2
    vpgatherdq       m7{k2}, [r3+ym8]
    mova          [cq+64*0], m4
    mova          [cq+64*1], m4
    kmovb                k2, k1
    mova          [cq+64*2], m4
    mova          [cq+64*3], m4
    punpcklbw            m5, m6, m4
    punpckhbw            m6, m4
    paddw                m0, m5
    paddw                m1, m6
    packuswb             m0, m1
    vpscatterdq [dstq+ym8]{k1}, m0
    punpcklbw            m6, m7, m4
    punpckhbw            m7, m4
    paddw                m2, m6
    paddw                m3, m7
    packuswb             m2, m3
    vpscatterdq [r3+ym8]{k2}, m2
    RET
ALIGN function_align
cglobal_label .main_fast2 ; bottom three-quarters are zero
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    vpbroadcastd        ym3, [o(pw_401_4076x8)]
    vpbroadcastd        ym5, [o(pw_799_4017x8)]
    vpbroadcastd        ym4, [o(pw_m1189_3920x8)]
    pxor                ym6, ym6
    punpckhwd           ym2, ym0, ym0
    pmulhrsw            ym2, ym3 ; t8a t15a
    punpcklwd           ym7, ym1, ym1
    pmulhrsw            ym7, ym5 ; t4a t7a
    punpckhwd           ym1, ym1
    pmulhrsw            ym4, ym1 ; t11a t12a
    vpcmpub              k7, ym13, ym10, 6
    punpcklwd           ym9, ym6, ym0
    psubsw              ym0, ym2, ym4 ; t11a t12a
    paddsw              ym8, ym2, ym4 ; t8a  t15a
    mova                ym1, ym7
    jmp .main5
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    pxor                ym6, ym6
    punpckhwd           ym8, ym0, ym0
    punpckhwd           ym4, ym3, ym3
    punpckhwd           ym5, ym2, ym2
    punpcklwd           ym7, ym1, ym1
    punpckhwd           ym1, ym1
    punpcklwd           ym3, ym3
    punpcklwd           ym9, ym6, ym0
    punpcklwd           ym6, ym2
    vpbroadcastd        ym2, [o(pw_401_4076x8)]
    vpbroadcastd        ym0, [o(pw_m2598_3166x8)]
    vpbroadcastd       ym11, [o(pw_1931_3612x8)]
    vpbroadcastd       ym12, [o(pw_m1189_3920x8)]
    pmulhrsw            ym8, ym2  ; t8a  t15a
    vpbroadcastd        ym2, [o(pw_799_4017x8)]
    pmulhrsw            ym0, ym4  ; t9a  t14a
    vpbroadcastd        ym4, [o(pw_m2276_3406x8)]
    pmulhrsw            ym5, ym11 ; t10a t13a
    pmulhrsw            ym1, ym12 ; t11a t12a
    pmulhrsw            ym7, ym2  ; t4a  t7a
    pmulhrsw            ym3, ym4  ; t5a  t6a
    vpcmpub              k7, ym13, ym10, 6
    jmp .main4
ALIGN function_align
cglobal_label .main
    WRAP_YMM IDCT16_1D_PACKED
    ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity

; 8x16 inverse ADST, 8bpc. Hosts the shared .pass1_end/.pass2_end and the
; 16-point ADST core (.main/.main2) used by the 8x16/16x8 (flip)adst kernels.
cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf1)]
    vpbroadcastd         m7, [o(pw_16384_m16384)]
    punpckhwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpcklwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m6     ; e0 f0 e1 f1 e2 f2 e3 f3
.pass1_end:
    REPX  {pmulhrsw x, m7}, m3, m5, m4, m2
    punpckldq            m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m5     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckhdq            m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m2, m4     ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m5
    punpckhqdq           m3, m5
    jmp                tx2q
.pass2:
    call .main_pass2
    vpbroadcastd         m6, [o(pw_2048)]
    psrlq               m10, 4
    psubw                m7, m8, m6
.pass2_end:
    vpbroadcastd         m5, [o(pw_2896x8)]
    paddsw               m1, m2, m4
    psubsw               m2, m4
    pmulhrsw             m1, m5 ; -out7   out4   out6  -out5
    pmulhrsw             m5, m2 ;  out8  -out11 -out9   out10
    mova                ym8, [o(gather8c)]
    lea                  r3, [dstq+strideq]
    psrlq                m2, m10, 4
    vpermi2q             m2, m0, m3  ; 1 3 13 15
    vpermt2q             m0, m10, m3 ; 0 2 12 14
    psrlq                m3, m10, 8
    vpermi2q             m3, m1, m5  ; 5 7  9 11
    psrlq               m10, 12
    vpermt2q             m1, m10, m5 ; 4 6  8 10
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp m(idct_8x16_internal_8bpc).end3
ALIGN function_align
.main_pass1:
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m5, m2, [cq+64*0]
    pmulhrsw             m3, m2, [cq+64*3]
    pmulhrsw             m1, m2, [cq+64*1]
    pmulhrsw             m2, [cq+64*2]
    movu                 m4, [o(permA+3)]
    psrlq               m10, m4, 4
    mova                 m6, m4
    vpermi2q             m4, m5, m3  ; in0  in12 in2  in14
    vpermt2q             m5, m10, m3 ; in15 in3  in13 in1
    vpermi2q             m6, m1, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m2 ; in11 in7  in9  in5
    jmp .main
ALIGN function_align
.main_pass2:
    mova                 m4, [o(permC)]
    psrlq                m5, m4, 4
    vpermi2q             m4, m0, m2  ; in0  in12 in2  in14
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3  ; in15 in3  in13 in1
    psrlq               m10, m6, 4
    vpermi2q             m6, m0, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m3 ; in11 in7  in9  in5
.main:
    punpcklwd            m0, m4, m5 ; in0  in15 in2  in13
    punpckhwd            m4, m5     ; in12 in3  in14 in1
    punpcklwd            m5, m6, m1 ; in4  in11 in6  in9
    punpckhwd            m6, m1     ; in8  in7  in10 in5
cglobal_label .main2
    vpbroadcastd         m9, [o(pd_2048)]
    vpbroadcastq        m13, [o(int_mshift)]
    kxnorb               k1, k1, k1
    vpcmpub              k7, m13, m9, 6 ; 0x33...
    pxor                 m8, m8
    ITX_MUL4X_PACK        0, 1, 2, 3, 7, 9,  201, 4091,  995, 3973, 5
    ITX_MUL4X_PACK        6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK        4, 1, 2, 3, 7, 9, 3857, 1380, 4052,  601, 5
    ITX_MUL4X_PACK        5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
    psubsw               m2, m0, m6 ; t9a  t8a  t11a t10a
    paddsw               m0, m6     ; t1a  t0a  t3a  t2a
    psubsw               m3, m5, m4 ; t13a t12a t15a t14a
    paddsw               m5, m4     ; t5a  t4a  t7a  t6a
    ITX_MUL4X_PACK        2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5
    psubw                m7, m8, m7
    ITX_MUL2X_PACK        3, 4, 1, 9, 7, 6, 4
    vpbroadcastd         m6, [o(pw_3784_m1567)]
    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
    psubsw               m1, m0, m5 ; t5   t4   t7   t6
    paddsw               m0, m5     ; t1   t0   t3   t2
    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
    vbroadcasti32x4      m5, [o(deint_shuf)]
    pshufb               m0, m5
    pshufb               m2, m5
    vshufi32x4           m3, m0, m2, q3232 ; t3   t2   t11a t10a
    vinserti32x8         m0, ym2, 1        ; t1   t0   t9a  t8a
    vshufi32x4           m2, m1, m4, q3232 ; t6a  t7a  t14  t15
    vinserti32x8         m1, ym4, 1        ; t5a  t4a  t13  t12
    pshufd               m2, m2, q1032     ; t7a  t6a  t15  t14
    psubsw               m4, m0, m3 ; t3a t2a t11 t10
    paddsw               m0, m3     ; -out15  out0   out14 -out1
    paddsw               m3, m1, m2 ;  out12 -out3  -out13  out2
    psubsw               m1, m2     ; t7  t6  t15a t14a
    punpckhqdq           m2, m4, m1 ; t2a t6  t10  t14a
    punpcklqdq           m4, m1     ; t3a t7  t11  t15a
    ret

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

; 8x16 inverse flipped-ADST, 8bpc: shares both passes with iadst_8x16, with
; mirrored shuffles/constants and a different output permutation (m10 shift).
cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf2)]
    vpbroadcastd         m7, [o(pw_m16384_16384)]
    punpcklwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
    jmp m(iadst_8x16_internal_8bpc).pass1_end
.pass2:
    call m(iadst_8x16_internal_8bpc).main_pass2
    vpbroadcastd         m7, [o(pw_2048)]
    psrlq               m10, 36
    psubw                m6, m8, m7
    jmp m(iadst_8x16_internal_8bpc).pass2_end

INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; 8x16 inverse identity transform, 8bpc: byte-permute transpose in pass 1,
; 2*x + 1697/65536-style scaling in pass 2, stored via idct_8x16's .end.
cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [o(int16_perm)]
    vpermb               m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
    vpermb               m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
    vpermb               m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
    vpermb               m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
    vpbroadcastd         m5, [o(pw_2896x8)]
    punpckldq            m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m2     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m2, m4, m0 ; e0 f0 g0 h0 a1 f1 g1 h1
    punpckhdq            m4, m0     ; e2 f2 g2 h2 e3 f3 g3 h3
    REPX  {pmulhrsw x, m5}, m1, m2, m3, m4
    punpcklqdq           m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
    punpckhqdq           m1, m2     ; a1 b1 c1 d1 e1 f1 g1 h1
    punpcklqdq           m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2
    punpckhqdq           m3, m4     ; a3 b3 c3 d3 e3 f3 g3 h3
    jmp                tx2q
.pass2:
    vpbroadcastd         m7, [o(pw_1697x16)]
    mova                ym8, [o(gather8b)]
    lea                  r3, [dstq+strideq*2]
    pmulhrsw             m4, m7, m0
    pmulhrsw             m5, m7, m1
    pmulhrsw             m6, m7, m2
    pmulhrsw             m7, m3
    REPX   {paddsw x, x}, m0, m1, m2, m3
    paddsw               m0, m4
    paddsw               m1, m5
    paddsw               m2, m6
    paddsw               m3, m7
    jmp m(idct_8x16_internal_8bpc).end

; Add two packed-word coefficient rows (%1, %2) to two 16-pixel dst rows at
; offsets %5/%6 and store with unsigned saturation; %3/%4 are temporaries.
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    pmovzxbw            m%3, [dstq+%5]
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
    pmovzxbw            m%4, [dstq+%6]
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vpermq              m%3, m%3, q3120
    mova          [dstq+%5], xm%3
    vextracti32x4 [dstq+%6], m%3, 1
%endmacro

; Declare a 16x4 transform entry point; dct_dct defers to the 16x8 DC-only
; path (entered at .dconly2 with r3d already holding the row count).
%macro INV_TXFM_16X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x4
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
%endif
%endmacro

INIT_ZMM avx512icl
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

; 16x4 inverse DCT, 8bpc: 16-point DCT on rows via the 4x16 main, then a
; packed 4-point DCT on columns, output through iadst_16x4's shared .end.
cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm0, [cq+16*0]
    mova                xm1, [cq+16*1]
    mova                xm2, [cq+16*2]
    mova                xm3, [cq+16*3]
    mova                xm4, [cq+16*4]
    mova                xm5, [cq+16*5]
    mova                xm6, [cq+16*6]
    mova                xm7, [cq+16*7]
    call m(idct_4x16_internal_8bpc).main
    vpbroadcastd         m8, [o(pw_16384)]
    vinserti32x4        ym1, xm3, 1 ; 3 2   7 6
    vinserti32x4        ym5, xm7, 1 ; b a   f e
    vinserti32x4        ym0, xm2, 1 ; 0 1   4 5
    vinserti32x4        ym4, xm6, 1 ; 8 9   c d
    vinserti32x8         m1, ym5, 1 ; 3 2   7 6   b a   f e
    vinserti32x8         m0, ym4, 1 ; 0 1   4 5   8 9   c d
    pmulhrsw             m1, m8
    pmulhrsw             m0, m8
    pshufd               m1, m1, q1032
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    mova                 m2, [o(permA)]
    jmp m(iadst_16x4_internal_8bpc).end

INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

; 16x4 inverse ADST, 8bpc. Hosts .pass1_end (shared with iflipadst_16x4)
; and the shared .end/.end2/.end3 store path used by all 16x4 variants.
cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_16384_m16384)]
    psrlq                m0, m10, 4
    psrlq               m10, 8
.pass1_end:
    punpcklwd           ym5, ym4, ym2
    punpckhwd           ym4, ym2
    vinserti32x8         m5, ym4, 1
    mova                 m1, m9
    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
    mova                 m4, m9
    vpdpwssd             m4, m5, [o(pw_2896_2896)] {1to16}
    psrad                m1, 12
    psrad                m4, 12
    packssdw             m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5
    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp                tx2q
.pass2:
    call .main
    movu                 m2, [o(permA+1)]
.end:
    vpbroadcastd         m3, [o(pw_2048)]
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
.end2:
    psrlq                m3, m2, 4
    vpermi2q             m2, m0, m1
    vpermi2q             m3, m0, m1
.end3:
    lea                  r3, [dstq+strideq*2]
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [r3  +strideq*0], 2
    vinserti32x4         m1, [r3  +strideq*1], 3
    pxor                 m4, m4
    mova          [cq+64*0], m4
    mova          [cq+64*1], m4
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    paddw                m0, m2
    paddw                m1, m3
    packuswb             m0, m1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; 16x4 inverse flipped-ADST, 8bpc: same passes as iadst_16x4 with mirrored
; scale constant and different qword permutations (m10 shifts, permA+2).
cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_m16384_16384)]
    psrlq                m0, m10, 12
    psrlq               m10, 16
    jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
    call m(iadst_16x4_internal_8bpc).main
    movu                 m2, [o(permA+2)]
    jmp m(iadst_16x4_internal_8bpc).end

INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
; 16x4 inverse identity transform, 8bpc: 1697/4096 correction per pass, with
; a byte-permute (idtx_16x4p) doing the pass-1 transpose.
cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m1, [cq+64*0]
    mova                 m2, [cq+64*1]
    vpbroadcastd         m3, [o(pw_1697x16)]
    vpbroadcastd         m4, [o(pw_16384)]
    mova                 m5, [o(idtx_16x4p)]
    shufps               m0, m1, m2, q2020
    shufps               m1, m2, q3131
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddsw               m0, m2
    paddsw               m1, m3
    vpermb               m0, m5, m0
    vpermb               m1, m5, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    movu                 m2, [o(permA+1)]
    jmp m(iadst_16x4_internal_8bpc).end

; Declare a 16x8 transform entry point; dct_dct inlines the DC-only path.
; .dconly/.dconly2/.dconly3 are entered from other sizes with r3d = rows.
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x8
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 8
.dconly:
    imul                r6d, 181 ; 181/256 ~= 1/sqrt(2)
    add                 r6d, 128
    sar                 r6d, 8
.dconly2:
    imul                r6d, 181
    add                 r6d, 128+256
    sar                 r6d, 8+1
.dconly3:
    imul                r6d, 181
    lea                  r2, [strideq*3]
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    pxor                 m2, m2
    vpbroadcastw         m3, r6d
.dconly_loop:
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [dstq+strideq*2], 2
    vinserti32x4         m1, [dstq+r2       ], 3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    paddw                m0, m3
    paddw                m1, m3
    packuswb             m0, m1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                 r3d, 4
    jg .dconly_loop
    RET
%endif
%endmacro

; Load 8 coefficient rows into m0-m7 with per-row qword permutation (%1
; selects the shuffle for the odd rows), each prescaled by 2896/4096.
%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpermq               m0, [cq+32*0], q3120
    add                  cq, 32*4
    vpermq               m7, [cq+32*3], q%1
    vpermq               m1, [cq-32*3], q%1
    vpermq               m6, [cq+32*2], q3120
    vpermq               m2, [cq-32*2], q3120
    vpermq               m5, [cq+32*1], q%1
    vpermq               m3, [cq-32*1], q%1
    vpermq               m4, [cq+32*0], q3120
    REPX  {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

; 16x8 inverse DCT, 8bpc: 16-point DCT on rows (via the 8x16 main split
; across ymm halves), transpose, then a packed 8-point DCT on columns.
; .end/.end2 is the shared store/clear path reused by iadst_16x8.
cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd         m1, [o(pw_2896x8)]
    vpermq               m0, [cq+64*0], q3120
    vpermq               m2, [cq+64*1], q3120
    vpermq               m4, [cq+64*2], q3120
    vpermq               m6, [cq+64*3], q3120
    REPX  {pmulhrsw x, m1}, m0, m2, m4, m6
    vextracti32x8       ym1, m0, 1
    vextracti32x8       ym3, m2, 1
    vextracti32x8       ym5, m4, 1
    vextracti32x8       ym7, m6, 1
    call m(idct_8x16_internal_8bpc).main
    vbroadcasti32x4      m8, [o(int_shuf1)]
    vbroadcasti32x4      m9, [o(int_shuf2)]
    vinserti32x8         m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
    vinserti32x8         m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
    vinserti32x8         m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
    vinserti32x8         m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
    vpbroadcastd         m2, [o(pw_16384)]
    pshufb               m0, m8     ; a0 b0 a1 b1 a2 b2 a3 b3
    pshufb               m1, m9     ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
    pshufb               m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3
    REPX  {pmulhrsw x, m2}, m0, m1, m6, m7
    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
    jmp                tx2q
.pass2:
    vshufi32x4           m0, m2, m4, q2020 ; 0 1
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020 ; 2 3
    vshufi32x4           m3, m5, q3131     ; 6 7
    call .main
    movshdup             m4, [o(permC)]
    psrlq                m6, m4, 4
    vpermq               m5, m4, q1032
    vpermi2q             m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3
    vpermt2q             m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3
    vpermt2q             m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1
    vpbroadcastd         m6, [o(pw_2048)]
.end:
    REPX  {pmulhrsw x, m6}, m0, m4, m1, m5
.end2:
    lea                  r3, [dstq+strideq*4]
    lea                  r4, [strideq*3]
    mova                xm3, [dstq+strideq*0]
    mova                xm6, [dstq+strideq*2]
    vinserti32x4        ym3, [dstq+strideq*1], 1
    vinserti32x4        ym6, [dstq+r4       ], 1
    vinserti32x4         m3, [r3  +strideq*0], 2
    vinserti32x4         m6, [r3  +strideq*2], 2
    vinserti32x4         m3, [r3  +strideq*1], 3
    vinserti32x4         m6, [r3  +r4       ], 3
    pxor                 m7, m7
    mova          [cq+64*0], m7
    mova          [cq+64*1], m7
    mova          [cq+64*2], m7
    mova          [cq+64*3], m7
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    paddw                m0, m2
    paddw                m4, m3
    packuswb             m0, m4
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    punpcklbw            m3, m6, m7
    punpckhbw            m6, m7
    paddw                m1, m3
    paddw                m5, m6
    packuswb             m1, m5
    mova [dstq+strideq*2], xm1
    vextracti32x4 [dstq+r4       ], ym1, 1
    vextracti32x4 [r3  +strideq*2], m1, 2
    vextracti32x4 [r3  +r4       ], m1, 3
    RET
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; 16x8 inverse ADST, 8bpc. Hosts .pass1_end (shared with iflipadst_16x8) and
; the .main_pass1/.main_pass2 helpers also called by the 8x16 ADST kernels.
cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd         m7, [o(pw_16384_m16384)]
    psrlq               m10, 4
.pass1_end:
    punpcklwd            m5, m4, m2
    punpckhwd            m4, m2
    mova                 m1, m9
    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
    mova                 m6, m9
    vpdpwssd             m6, m5, [o(pw_2896_2896)] {1to16}
    mova                 m2, m9
    vpdpwssd             m2, m4, [o(pw_m2896_2896)] {1to16}
    vpdpwssd             m9, m4, [o(pw_2896_2896)] {1to16}
    psrad                m1, 12
    psrad                m6, 12
    packssdw             m1, m6 ;  out8 -out7 -out9  out6
    psrad                m2, 12
    psrad                m9, 12
    packssdw             m2, m9 ; -out11 out4  out10 -out5
    psrlq                m4, m10, 4
    vpermi2q             m4, m0, m2
    vpermt2q             m0, m10, m2
    psrlq                m5, m10, 8
    vpermi2q             m5, m1, m3
    psrlq               m10, 12
    vpermt2q             m1, m10, m3
    punpcklwd            m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
    punpckhwd            m4, m5     ; b0 d0 b1 d1 b2 d2 b3 d3
    punpcklwd            m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3
    punpckhwd            m1, m0     ; j0 l0 j1 l1 j2 l2 j3 l3
    punpcklwd            m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhwd            m3, m4     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpcklwd            m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhwd            m5, m1     ; i2 j2 k2 l2 i3 j3 k3 l3
    REPX  {pmulhrsw x, m7}, m2, m3, m4, m5
    jmp                tx2q
.pass2:
    vshufi32x4           m0, m2, m4, q2020
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020
    vshufi32x4           m3, m5, q3131     ; 6 7
    pshufd               m4, m0, q1032    ; 1 0
    pshufd               m5, m1, q1032    ; 3 2
    call .main_pass2
    movshdup             m4, [o(permC)]
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    psrlq                m6, m4, 4
    mova                 m5, m4
    vpermi2q             m4, m0, m2
    vpermt2q             m0, m6, m2
    vpermi2q             m5, m1, m3
    vpermt2q             m1, m6, m3
    jmp m(idct_16x8_internal_8bpc).end2
ALIGN function_align
.main_pass1:
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m3, m4, [cq+64*0]
    pmulhrsw             m1, m4, [cq+64*3]
    pmulhrsw             m2, m4, [cq+64*1]
    pmulhrsw             m4, [cq+64*2]
    mova                 m5, [o(int16_perm)]
    kxnorb               k1, k1, k1 ; masked blends pair rows 0/7, 6/1, 2/5, 4/3
    vpblendmd        m0{k1}, m1, m3 ; 0 7
    vmovdqa32        m3{k1}, m1     ; 6 1
    vpblendmd        m1{k1}, m4, m2 ; 2 5
    vmovdqa32        m2{k1}, m4     ; 4 3
    REPX {vpermb x, m5, x}, m0, m1, m2, m3
    IADST8_1D_PACKED      1
    ret
ALIGN function_align
cglobal_label .main_pass2
    IADST8_1D_PACKED      2
    pxor                 m5, m5
    psubd                m5, m6
    packssdw             m6, m5 ; +scale in low half, -scale in high half
    pmulhrsw             m2, m6
    pmulhrsw             m3, m6
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; 16x8 inverse flipped-ADST, 8bpc.
; NOTE(review): this function is truncated at the end of this chunk; the
; remainder of its body lies beyond the visible source.
cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd         m7,
[o(pw_m16384_16384)] 2265 psrlq m10, 20 2266 jmp m(iadst_16x8_internal_8bpc).pass1_end 2267.pass2: 2268 vshufi32x4 m0, m2, m4, q2020 2269 vshufi32x4 m2, m4, q3131 ; 4 5 2270 vshufi32x4 m1, m3, m5, q2020 2271 vshufi32x4 m3, m5, q3131 ; 6 7 2272 pshufd m4, m0, q1032 ; 1 0 2273 pshufd m5, m1, q1032 ; 3 2 2274 call m(iadst_16x8_internal_8bpc).main_pass2 2275 movshdup m4, [o(permC)] 2276 pmulhrsw m5, m6, m0 2277 pmulhrsw m0, m6, m1 2278 psrlq m1, m4, 12 2279 psrlq m4, 8 2280 mova m7, m4 2281 vpermi2q m4, m0, m3 2282 vpermt2q m0, m1, m3 2283 vpermi2q m1, m5, m2 2284 vpermt2q m5, m7, m2 2285 jmp m(idct_16x8_internal_8bpc).end2 2286 2287INV_TXFM_16X8_FN identity, dct 2288INV_TXFM_16X8_FN identity, adst 2289INV_TXFM_16X8_FN identity, flipadst 2290INV_TXFM_16X8_FN identity, identity 2291 2292cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2293 vpbroadcastd m0, [o(pw_2896x8)] 2294 pmulhrsw m3, m0, [cq+64*0] 2295 pmulhrsw m4, m0, [cq+64*1] 2296 pmulhrsw m5, m0, [cq+64*2] 2297 pmulhrsw m0, [cq+64*3] 2298 vpbroadcastd m7, [o(pw_1697x16)] 2299 vpbroadcastd m8, [o(pw_16384)] 2300 shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5 2301 shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7 2302 shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5 2303 shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7 2304 mova m9, [o(int8_permA)] 2305 pmulhrsw m0, m7, m2 2306 pmulhrsw m1, m7, m3 2307 pmulhrsw m6, m7, m4 2308 pmulhrsw m7, m5 2309 REPX {pmulhrsw x, m8}, m0, m1, m6, m7 2310 paddsw m2, m0 2311 paddsw m3, m1 2312 paddsw m4, m6 2313 paddsw m5, m7 2314 REPX {vpermb x, m9, x}, m2, m3, m4, m5 2315 jmp tx2q 2316.pass2: 2317 mova m7, [o(permB)] 2318 vpbroadcastd m6, [o(pw_4096)] 2319 vpermq m0, m7, m2 2320 vpermq m4, m7, m4 2321 vpermq m1, m7, m3 2322 vpermq m5, m7, m5 2323 jmp m(idct_16x8_internal_8bpc).end 2324 2325%macro INV_TXFM_16X16_FN 2 ; type1, type2 2326 INV_TXFM_FN %1, %2, 16x16 2327%ifidn %1_%2, dct_dct 2328 movsx r6d, word [cq] 2329 mov [cq], eobd 2330 or r3d, 16 2331 
imul r6d, 181 2332 add r6d, 128+512 2333 sar r6d, 8+2 2334 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 2335%endif 2336%endmacro 2337 2338INV_TXFM_16X16_FN dct, dct 2339INV_TXFM_16X16_FN dct, identity 2340INV_TXFM_16X16_FN dct, adst 2341INV_TXFM_16X16_FN dct, flipadst 2342 2343cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2344 mova m7, [o(permB)] 2345 vpermq m0, m7, [cq+64*0] 2346 vpermq m1, m7, [cq+64*1] 2347 vpermq m2, m7, [cq+64*2] 2348 vpermq m3, m7, [cq+64*3] 2349 vpermq m4, m7, [cq+64*4] 2350 vpermq m5, m7, [cq+64*5] 2351 vpermq m6, m7, [cq+64*6] 2352 vpermq m7, m7, [cq+64*7] 2353 call .main 2354 vbroadcasti32x4 m12, [o(int_shuf1)] 2355 vbroadcasti32x4 m11, [o(int_shuf2)] 2356 vpbroadcastd m13, [o(pw_8192)] 2357 pshufb m0, m12 2358 pshufb m8, m1, m11 2359 pshufb m2, m12 2360 pshufb m9, m3, m11 2361 pshufb m4, m12 2362 pshufb m10, m5, m11 2363 pshufb m6, m12 2364 pshufb m11, m7, m11 2365 REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 2366 punpckhdq m1, m0, m8 2367 punpckldq m0, m8 2368 punpckhdq m3, m2, m9 2369 punpckldq m2, m9 2370 punpckhdq m5, m4, m10 2371 punpckldq m4, m10 2372 punpckhdq m7, m6, m11 2373 punpckldq m6, m11 2374 jmp tx2q 2375.pass2: 2376 vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc 2377 vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 2378 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec 2379 vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 2380 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me 2381 vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 2382 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee 2383 vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 2384 vshufi32x4 m2, m0, m4, q3131 ; 4 5 2385 vshufi32x4 m0, m4, q2020 ; 0 1 2386 vshufi32x4 m4, m6, m8, q2020 ; 8 9 2387 vshufi32x4 m6, m8, q3131 ; 12 13 2388 vshufi32x4 m3, m1, m5, q3131 ; 6 7 2389 vshufi32x4 m1, m5, q2020 ; 2 3 2390 vshufi32x4 m5, m7, m9, q2020 ; 10 11 2391 vshufi32x4 m7, m9, q3131 ; 14 15 2392 call .main 2393 mova m8, [o(permD)] 2394 psrlq m12, m8, 4 2395 psrlq m9, m8, 8 2396 psrlq m13, m8, 12 2397 mova 
m10, m8 2398 vpermi2q m8, m0, m2 ; 0 1 4 5 2399 vpermt2q m0, m12, m2 2400 mova m11, m9 2401 vpermi2q m9, m1, m3 ; 2 3 6 7 2402 vpermt2q m1, m13, m3 2403 vpermi2q m10, m4, m6 ; 8 9 12 13 2404 vpermt2q m4, m12, m6 2405 vpermi2q m11, m5, m7 ; 10 11 14 15 2406 vpermt2q m5, m13, m7 2407.end: 2408 vpbroadcastd m12, [o(pw_2048)] 2409.end2: 2410 REPX {pmulhrsw x, m12}, m0, m1, m4, m5 2411.end3: 2412 REPX {pmulhrsw x, m12}, m8, m9, m10, m11 2413 lea r3, [strideq*3] 2414 lea r4, [dstq+strideq*4] 2415 lea r5, [dstq+strideq*8] 2416 lea r6, [r4 +strideq*8] 2417 mova xm3, [dstq+strideq*0] 2418 mova xm6, [dstq+strideq*2] 2419 vinserti32x4 ym3, [dstq+strideq*1], 1 2420 vinserti32x4 ym6, [dstq+r3 ], 1 2421 vinserti32x4 m3, [r4+strideq*0], 2 2422 vinserti32x4 m6, [r4+strideq*2], 2 2423 vinserti32x4 m3, [r4+strideq*1], 3 2424 vinserti32x4 m6, [r4+r3 ], 3 2425 mova xm12, [r5+strideq*0] 2426 mova xm13, [r5+strideq*2] 2427 vinserti32x4 ym12, [r5+strideq*1], 1 2428 vinserti32x4 ym13, [r5+r3 ], 1 2429 vinserti32x4 m12, [r6+strideq*0], 2 2430 vinserti32x4 m13, [r6+strideq*2], 2 2431 vinserti32x4 m12, [r6+strideq*1], 3 2432 vinserti32x4 m13, [r6+r3 ], 3 2433 pxor m7, m7 2434 REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 2435 punpcklbw m2, m3, m7 2436 punpckhbw m3, m7 2437 paddw m0, m2 2438 paddw m8, m3 2439 packuswb m0, m8 2440 punpcklbw m2, m6, m7 2441 punpckhbw m6, m7 2442 paddw m1, m2 2443 paddw m9, m6 2444 packuswb m1, m9 2445 punpcklbw m2, m12, m7 2446 punpckhbw m12, m7 2447 paddw m2, m4 2448 paddw m10, m12 2449 packuswb m2, m10 2450 punpcklbw m3, m13, m7 2451 punpckhbw m13, m7 2452 paddw m3, m5 2453 paddw m11, m13 2454 packuswb m3, m11 2455 mova [dstq+strideq*0], xm0 2456 vextracti32x4 [dstq+strideq*1], ym0, 1 2457 mova [dstq+strideq*2], xm1 2458 vextracti32x4 [dstq+r3 ], ym1, 1 2459 vextracti32x4 [r4+strideq*0], m0, 2 2460 vextracti32x4 [r4+strideq*1], m0, 3 2461 vextracti32x4 [r4+strideq*2], m1, 2 2462 vextracti32x4 [r4+r3 ], m1, 3 2463 mova [r5+strideq*0], xm2 2464 
vextracti32x4 [r5+strideq*1], ym2, 1 2465 mova [r5+strideq*2], xm3 2466 vextracti32x4 [r5+r3 ], ym3, 1 2467 vextracti32x4 [r6+strideq*0], m2, 2 2468 vextracti32x4 [r6+strideq*1], m2, 3 2469 vextracti32x4 [r6+strideq*2], m3, 2 2470 vextracti32x4 [r6+r3 ], m3, 3 2471 RET 2472ALIGN function_align 2473cglobal_label .main_fast2 ; bottom three-quarters are zero 2474 vpbroadcastd m10, [o(pd_2048)] 2475 vpbroadcastq m13, [o(int_mshift)] 2476 vpcmpub k7, m13, m10, 6 2477.main_fast4: 2478 vpbroadcastd m2, [o(pw_401_4076x8)] 2479 vpbroadcastd m4, [o(pw_m1189_3920x8)] 2480 vpbroadcastd m3, [o(pw_799_4017x8)] 2481 pmulhrsw m2, m8 ; t8a t15a 2482 pmulhrsw m4, m1 ; t11a t12a 2483 pmulhrsw m7, m3 ; t4a t7a 2484 pxor m6, m6 2485 psubsw m0, m2, m4 ; t11a t12a 2486 paddsw m8, m2, m4 ; t8a t15a 2487 mova m1, m7 2488 jmp .main5 2489ALIGN function_align 2490cglobal_label .main_fast ; bottom half is zero 2491 vpbroadcastd m10, [o(pd_2048)] 2492.main_fast3: 2493 vpbroadcastq m13, [o(int_mshift)] 2494 vpcmpub k7, m13, m10, 6 2495.main_fast5: 2496 vpbroadcastd m2, [o(pw_401_4076x8)] 2497 vpbroadcastd m4, [o(pw_m2598_3166x8)] 2498 vpbroadcastd m11, [o(pw_1931_3612x8)] 2499 vpbroadcastd m12, [o(pw_m1189_3920x8)] 2500 pmulhrsw m8, m2 ; t8a t15a 2501 vpbroadcastd m2, [o(pw_799_4017x8)] 2502 pmulhrsw m0, m4 ; t9a t14a 2503 vpbroadcastd m4, [o(pw_m2276_3406x8)] 2504 pmulhrsw m5, m11 ; t10a t13a 2505 pmulhrsw m1, m12 ; t11a t12a 2506 pmulhrsw m7, m2 ; t4a t7a 2507 pmulhrsw m3, m4 ; t5a t6a 2508 jmp .main4 2509ALIGN function_align 2510cglobal_label .main 2511 IDCT16_1D_PACKED 2512 ret 2513 2514INV_TXFM_16X16_FN adst, dct 2515INV_TXFM_16X16_FN adst, adst 2516INV_TXFM_16X16_FN adst, flipadst 2517 2518cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2519 call .main_pass1 2520 vpbroadcastd m10, [o(pw_8192_m8192)] 2521 punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 2522 punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 2523 punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 2524 punpcklwd m0, 
m8 ; a0 b0 c0 d0 a1 b1 c1 d1 2525 punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 2526 punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 2527 punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 2528 punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 2529 punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 2530 punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 2531 punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 2532 punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 2533 punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 2534 punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 2535 punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 2536 punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 2537.pass1_end: 2538 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 2539 jmp tx2q 2540.pass2: 2541 call .main_pass2 2542 mova m10, [o(permD)] 2543 psrlq m8, m10, 8 2544 psrlq m12, m10, 12 2545 psrlq m13, m10, 4 2546 mova m9, m8 2547 vpermi2q m8, m0, m2 ; 0 1 4 5 2548 vpermt2q m0, m12, m2 2549 vpermi2q m9, m1, m3 ; 2 3 6 7 2550 vpermt2q m1, m12, m3 2551 vpbroadcastd m12, [o(pw_2048)] 2552 mov r3d, 0xff00ff00 2553 mova m11, m10 2554 vpermi2q m10, m4, m6 ; 8 9 12 13 2555 vpermt2q m4, m13, m6 2556 kmovd k1, r3d 2557 vpermi2q m11, m5, m7 ; 10 11 14 15 2558 vpermt2q m5, m13, m7 2559 pxor m7, m7 2560 vpsubw m12{k1}, m7, m12 2561 jmp m(idct_16x16_internal_8bpc).end2 2562ALIGN function_align 2563.main_pass1: 2564 mova m4, [o(permB)] 2565 psrlq m3, m4, 4 2566 vpermq m0, m4, [cq+64*0] 2567 vpermq m7, m3, [cq+64*7] 2568 vpermq m6, m4, [cq+64*6] 2569 vpermq m1, m3, [cq+64*1] 2570 vpermq m2, m4, [cq+64*2] 2571 vpermq m5, m3, [cq+64*5] 2572 vpermq m4, m4, [cq+64*4] 2573 vpermq m3, m3, [cq+64*3] 2574 call .main 2575 vpbroadcastd m13, [o(pw_2896_2896)] 2576 vpbroadcastd m12, [o(pw_m2896_2896)] 2577 mova m2, m10 2578 vpdpwssd m2, m5, m13 ; -out5 2579 mova m8, m10 2580 vpdpwssd m8, m11, m13 ; out4 2581 mova m9, m10 2582 vpdpwssd m9, m5, m12 ; out10 2583 mova m5, m10 2584 vpdpwssd m5, m11, m12 ; -out11 2585 mova m11, m10 2586 vpdpwssd m11, m3, m13 ; -out7 2587 
mova m14, m10 2588 vpdpwssd m14, m4, m13 ; out6 2589 mova m13, m10 2590 vpdpwssd m13, m3, m12 ; out8 2591 vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 2592 REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 2593 packssdw m2, m8 ; -out5 out4 2594 packssdw m5, m9, m5 ; out10 -out11 2595 packssdw m3, m11, m14 ; -out7 out6 2596 packssdw m4, m13, m10 ; out8 -out9 2597 ret 2598ALIGN function_align 2599.main_pass2: 2600 vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc 2601 vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 2602 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec 2603 vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 2604 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me 2605 vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 2606 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee 2607 vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 2608 vshufi32x4 m2, m0, m4, q3131 ; 4 5 2609 vshufi32x4 m0, m4, q2020 ; 0 1 2610 vshufi32x4 m4, m6, m8, q2020 ; 8 9 2611 vshufi32x4 m6, m8, q3131 ; 12 13 2612 vshufi32x4 m3, m1, m5, q3131 ; 6 7 2613 vshufi32x4 m1, m5, q2020 ; 2 3 2614 vshufi32x4 m5, m7, m9, q2020 ; 10 11 2615 vshufi32x4 m7, m9, q3131 ; 14 15 2616cglobal_label .main_pass2b 2617 REPX {pshufd x, x, q1032}, m1, m3, m5, m7 2618 call .main 2619 vpbroadcastd m8, [o(pw_2896x8)] 2620 pshufb m2, m11, m12 2621 pshufb m5, m12 2622 pshufb m3, m12 2623 pshufb m4, m12 2624 punpcklqdq m9, m5, m2 ; t15a t7 2625 punpckhqdq m5, m2 ; t14a t6 2626 shufps m2, m3, m4, q1032 ; t2a t10 2627 shufps m3, m4, q3210 ; t3a t11 2628 psubsw m4, m2, m3 ; out8 -out9 2629 paddsw m3, m2 ; -out7 out6 2630 paddsw m2, m5, m9 ; -out5 out4 2631 psubsw m5, m9 ; out10 -out11 2632 REPX {pmulhrsw x, m8}, m2, m3, m4, m5 2633 ret 2634ALIGN function_align 2635.main: 2636 vpbroadcastd m10, [o(pd_2048)] 2637 vpbroadcastq m13, [o(int_mshift)] 2638 punpckhwd m8, m7, m0 ; in14 in1 2639 punpcklwd m0, m7 ; in0 in15 2640 punpcklwd m7, m6, m1 ; in12 in3 2641 punpckhwd m1, m6 ; in2 in13 2642 punpckhwd m6, m5, m2 ; in10 in5 2643 punpcklwd m2, m5 ; in4 in11 2644 punpcklwd m5, m4, m3 ; 
in8 in7 2645 punpckhwd m3, m4 ; in6 in9 2646 vpcmpub k7, m13, m10, 6 ; 0x33... 2647 ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 2648 ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 2649 ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 2650 ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 2651 ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 2652 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 2653 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 2654 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 2655 psubsw m4, m0, m5 ; t9a t8a 2656 paddsw m0, m5 ; t1a t0a 2657 psubsw m5, m1, m6 ; t11a t10a 2658 paddsw m1, m6 ; t3a t2a 2659 psubsw m6, m2, m7 ; t13a t12a 2660 paddsw m2, m7 ; t5a t4a 2661 psubsw m7, m3, m8 ; t15a t14a 2662 paddsw m3, m8 ; t7a t6a 2663 ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 2664 ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 2665 ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 2666 ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 2667 psubsw m8, m1, m3 ; t7 t6 2668 paddsw m1, m3 ; t3 t2 2669 psubsw m3, m0, m2 ; t5 t4 2670 paddsw m0, m2 ; t1 t0 2671 psubsw m2, m5, m7 ; t14a t15a 2672 paddsw m7, m5 ; t10a t11a 2673 psubsw m5, m4, m6 ; t12a t13a 2674 paddsw m4, m6 ; t8a t9a 2675 ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a 2676 ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a 2677 ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 2678 ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 2679 vbroadcasti32x4 m12, [o(deint_shuf)] 2680 paddsw m6, m4, m7 ; -out1 out14 2681 psubsw m4, m7 ; t10 t11 2682 psubsw m11, m3, m8 ; t7 t6 2683 paddsw m8, m3 ; out12 -out3 2684 psubsw m3, m0, m1 ; t3a t2a 2685 paddsw m0, m1 ; -out15 out0 2686 paddsw m1, m2, m5 ; -out13 out2 2687 psubsw m5, m2 ; t15a t14a 2688 pshufb m0, m12 2689 pshufb m6, m12 2690 pshufb m8, m12 2691 pshufb m1, m12 2692 shufps m7, m6, m0, q1032 ; out14 -out15 2693 shufps m0, m6, m0, q3210 ; -out1 out0 
2694 punpcklqdq m6, m8, m1 ; out12 -out13 2695 punpckhqdq m1, m8, m1 ; -out3 out2 2696 ret 2697 2698INV_TXFM_16X16_FN flipadst, dct 2699INV_TXFM_16X16_FN flipadst, adst 2700INV_TXFM_16X16_FN flipadst, flipadst 2701 2702cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2703 call m(iadst_16x16_internal_8bpc).main_pass1 2704 vpbroadcastd m10, [o(pw_m8192_8192)] 2705 punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 2706 punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 2707 punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 2708 punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 2709 punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 2710 punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 2711 punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 2712 punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 2713 punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 2714 punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 2715 punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 2716 punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 2717 punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 2718 punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 2719 punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 2720 punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 2721 jmp m(iadst_16x16_internal_8bpc).pass1_end 2722.pass2: 2723 call m(iadst_16x16_internal_8bpc).main_pass2 2724 mova m10, [o(permD)] 2725 psrlq m8, m10, 8 2726 psrlq m12, m10, 12 2727 psrlq m13, m10, 4 2728 mova m9, m8 2729 vpermi2q m8, m7, m5 ; 0 1 4 5 2730 vpermt2q m7, m12, m5 2731 vpermi2q m9, m6, m4 ; 2 3 6 7 2732 vpermt2q m6, m12, m4 2733 vpbroadcastd m12, [o(pw_2048)] 2734 mov r3d, 0x00ff00ff 2735 mova m11, m10 2736 vpermi2q m10, m3, m1 ; 8 9 12 13 2737 vpermt2q m3, m13, m1 2738 kmovd k1, r3d 2739 vpermi2q m11, m2, m0 ; 10 11 14 15 2740 vpermt2q m2, m13, m0 2741 pxor m0, m0 2742 vpsubw m12{k1}, m0, m12 2743 pmulhrsw m0, m7, m12 2744 pmulhrsw m1, m6, m12 2745 pmulhrsw m4, m3, m12 2746 pmulhrsw m5, m2, m12 2747 jmp m(idct_16x16_internal_8bpc).end3 2748 2749INV_TXFM_16X16_FN 
identity, dct 2750INV_TXFM_16X16_FN identity, identity 2751 2752cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2753 mova m8, [o(int16_perm)] 2754 vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 2755 vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 2756 vpbroadcastd m0, [o(pw_1697x16)] 2757 vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 2758 vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 2759 vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 2760 vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 2761 vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 2762 vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 2763 pmulhrsw m9, m0, m1 2764 pmulhrsw m10, m0, m2 2765 pmulhrsw m11, m0, m3 2766 pmulhrsw m12, m0, m4 2767 pmulhrsw m13, m0, m5 2768 pmulhrsw m14, m0, m6 2769 pmulhrsw m15, m0, m7 2770 pmulhrsw m0, m8 2771 REPX {psraw x, 1}, m9, m10, m11, m12 2772 pavgw m1, m9 2773 pavgw m2, m10 2774 pavgw m3, m11 2775 pavgw m4, m12 2776 REPX {psraw x, 1}, m13, m14, m15, m0 2777 pavgw m5, m13 2778 pavgw m6, m14 2779 pavgw m7, m15 2780 pavgw m8, m0 2781 punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 2782 punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 2783 punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 2784 punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 2785 punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 2786 punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 2787 punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 2788 punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 2789 jmp tx2q 2790ALIGN function_align 2791.pass2: 2792 vpbroadcastd m11, [o(pw_1697x16)] 2793 pmulhrsw m12, m11, m0 2794 pmulhrsw m13, m11, m1 2795 pmulhrsw m14, m11, m2 2796 pmulhrsw m15, m11, m3 2797 pmulhrsw m8, m11, m4 2798 pmulhrsw m9, m11, m5 2799 pmulhrsw m10, m11, m6 2800 pmulhrsw m11, m7 2801 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 2802 paddsw m0, m12 2803 paddsw m1, m13 2804 paddsw m2, m14 2805 paddsw m3, m15 2806 paddsw m8, m4 2807 movu m4, [o(permD+2)] 2808 paddsw m9, m5 2809 
paddsw m6, m10 2810 paddsw m7, m11 2811 psrlq m12, m4, 4 2812 mova m5, m4 2813 mova m10, m4 2814 mova m11, m4 2815 vpermi2q m4, m0, m2 ; 8 9 12 13 2816 vpermt2q m0, m12, m2 ; 0 1 4 5 2817 vpermi2q m5, m1, m3 ; 10 11 14 15 2818 vpermt2q m1, m12, m3 ; 2 3 6 7 2819 vpermi2q m10, m8, m6 2820 vpermt2q m8, m12, m6 2821 vpermi2q m11, m9, m7 2822 vpermt2q m9, m12, m7 2823 jmp m(idct_16x16_internal_8bpc).end 2824 2825%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] 2826 vpbroadcastd m%4, [o(pw_%5_%6x8)] 2827 punpcklwd m%1, m%3, m%3 2828 pmulhrsw m%1, m%4 2829 vpbroadcastd m%4, [o(pw_%7_%8x8)] 2830 punpckhwd m%2, m%3, m%3 2831 pmulhrsw m%2, m%4 2832%endmacro 2833 2834cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob 2835%undef cmp 2836 lea r5, [o_base] 2837 test eobd, eobd 2838 jz .dconly 2839 cmp eobd, 107 2840 jb .fast 2841 mova m5, [cq+64*5] 2842 mova m3, [cq+64*3] 2843 mova m1, [cq+64*1] 2844 mova m7, [cq+64*7] 2845 mova m2, [cq+64*2] 2846 mova m6, [cq+64*6] 2847 mova m0, [cq+64*0] 2848 mova m4, [cq+64*4] 2849 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 2850 mova m8, [o(idct_8x32p)] 2851 vpbroadcastd m9, [o(pw_8192)] 2852 REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 2853 punpckldq m8, m0, m1 ; ab 2854 punpckhdq m0, m1 2855 punpckldq m1, m2, m3 ; cd 2856 punpckhdq m2, m3 2857 punpckldq m3, m4, m5 ; ef 2858 punpckhdq m4, m5 2859 punpckldq m5, m6, m7 ; gh 2860 punpckhdq m6, m7 2861 REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 2862 punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 2863 punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 2864 punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 2865 punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17 2866 punpcklqdq m20, m3, m5 2867 punpckhqdq m16, m3, m5 2868 punpcklqdq m19, m4, m6 2869 punpckhqdq m17, m4, m6 2870 vinserti32x4 ym8, ym18, xm20, 1 2871 vshufi32x4 ym1, ym18, ym20, 0x03 2872 vinserti32x4 ym9, ym14, xm16, 1 2873 vshufi32x4 ym3, ym14, ym16, 0x03 2874 vinserti32x4 ym0, ym21, xm19, 1 2875 
vshufi32x4 ym5, ym21, ym19, 0x03 2876 vinserti32x4 ym7, ym15, xm17, 1 2877 vshufi32x4 ym6, ym15, ym17, 0x03 2878 call m(idct_8x16_internal_8bpc).main2 2879 psrlq m12, [o(permB)], 60 2880 vpermt2q m14, m12, m16 2881 vpermt2q m21, m12, m19 2882 vpermt2q m15, m12, m17 2883 vpermi2q m12, m18, m20 2884 vextracti32x8 ym16, m14, 1 2885 vextracti32x8 ym19, m21, 1 2886 vextracti32x8 ym17, m15, 1 2887 vextracti32x8 ym20, m12, 1 2888 call .main2 2889 jmp .end 2890.fast: ; right half is zero 2891 mova m0, [o(int16_perm)] 2892 mova ym2, [cq+64*4] 2893 vinserti32x8 m2, [cq+64*0], 1 2894 mova ym3, [cq+64*6] 2895 vinserti32x8 m3, [cq+64*2], 1 2896 mova ym4, [cq+64*3] 2897 vinserti32x8 m4, [cq+64*5], 1 2898 mova ym5, [cq+64*7] 2899 vinserti32x8 m5, [cq+64*1], 1 2900 REPX {vpermb x, m0, x}, m2, m3, m4, m5 2901 call m(idct_16x8_internal_8bpc).main2 2902 vbroadcasti32x4 m4, [o(int_shuf3)] 2903 vbroadcasti32x4 m5, [o(int_shuf4)] 2904 pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 2905 pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 2906 pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 2907 pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 2908 vpbroadcastd m4, [o(pw_8192)] 2909 psrlq m5, [o(permB)], 60 2910 punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 2911 punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 2912 punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 2913 punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 2914 REPX {pmulhrsw x, m4}, m6, m17, m2, m16 2915 vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 2916 vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 2917 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 2918 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 2919 vpermt2q m2, m5, m6 ; 8 10 2920 vpermt2q m16, m5, m17 ; 9 11 2921 vextracti32x8 ym3, m2, 1 ; 12 14 2922 vextracti32x8 ym17, m16, 1 ; 13 15 2923 call m(idct_8x16_internal_8bpc).main_fast 2924 call .main_fast 2925.end: 2926 vpbroadcastd ym8, strided 2927 pmulld ym8, [o(gather8d)] 2928 call .main_end 2929 lea r3, [dstq+strideq*4] 2930 kxnorb k1, k1, k1 2931 lea r4, [dstq+strideq*8] 2932 pxor m9, m9 
2933 lea r1, [r3+strideq*8] 2934 kmovb k2, k1 2935 vpgatherdq m12{k1}, [r0+ym8] 2936 kmovb k1, k2 2937 vpgatherdq m13{k2}, [r3+ym8] 2938 kmovb k2, k1 2939 vpgatherdq m14{k1}, [r4+ym8] 2940 kmovb k1, k2 2941 vpgatherdq m15{k2}, [r1+ym8] 2942 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 2943 REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 2944 punpcklbw m11, m12, m9 2945 punpckhbw m12, m9 2946 paddw m0, m11 2947 paddw m1, m12 2948 packuswb m0, m1 2949 kmovb k2, k1 2950 vpscatterdq [r0+ym8]{k1}, m0 2951 punpcklbw m12, m13, m9 2952 punpckhbw m13, m9 2953 paddw m2, m12 2954 paddw m3, m13 2955 packuswb m2, m3 2956 kmovb k1, k2 2957 vpscatterdq [r3+ym8]{k2}, m2 2958 punpcklbw m13, m14, m9 2959 punpckhbw m14, m9 2960 paddw m4, m13 2961 paddw m5, m14 2962 packuswb m4, m5 2963 kmovb k2, k1 2964 vpscatterdq [r4+ym8]{k1}, m4 2965 punpcklbw m14, m15, m9 2966 punpckhbw m15, m9 2967 paddw m6, m14 2968 paddw m7, m15 2969 packuswb m6, m7 2970 vpscatterdq [r1+ym8]{k2}, m6 2971 RET 2972.dconly: 2973 movsx r6d, word [cq] 2974 mov [cq], eobd 2975 or r3d, 32 2976 imul r6d, 181 2977 add r6d, 128+512 2978 sar r6d, 8+2 2979 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 2980INIT_YMM avx512icl 2981ALIGN function_align 2982cglobal_label .main_fast2 ; bottom three-quarters are zero 2983 ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a 2984 ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a 2985 mova m11, m12 2986 mova m17, m20 2987 mova m15, m21 2988 mova m16, m14 2989 jmp .main4 2990ALIGN function_align 2991cglobal_label .main_fast ; bottom half is zero 2992 ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a 2993 ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a 2994 ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a 2995 ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a 2996 jmp .main3 
2997ALIGN function_align 2998cglobal_label .main 2999 punpcklwd m12, m21, m14 ; in31 in1 3000 punpckhwd m14, m21 ; in3 in29 3001 punpcklwd m21, m20, m15 ; in27 in5 3002 punpckhwd m15, m20 ; in7 in25 3003 punpcklwd m20, m19, m16 ; in23 in9 3004 punpckhwd m16, m19 ; in11 in21 3005 punpcklwd m19, m18, m17 ; in19 in13 3006 punpckhwd m17, m18 ; in15 in17 3007.main2: 3008 ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a 3009 ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a 3010 ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a 3011 ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a 3012 ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a 3013 ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a 3014 ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a 3015 ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a 3016.main3: 3017 psubsw m11, m12, m17 ; t17 t30 3018 paddsw m12, m17 ; t16 t31 3019 psubsw m17, m15, m20 ; t18 t29 3020 paddsw m20, m15 ; t19 t28 3021 psubsw m15, m21, m16 ; t21 t26 3022 paddsw m21, m16 ; t20 t27 3023 psubsw m16, m14, m19 ; t22 t25 3024 paddsw m14, m19 ; t23 t24 3025.main4: 3026 ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a 3027 ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a 3028 ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a 3029 ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a 3030 vpbroadcastd m8, [o(pw_m3784_1567)] 3031 psubsw m19, m12, m20 ; t19a t28a 3032 paddsw m20, m12 ; t16a t31a 3033 psubsw m12, m14, m21 ; t20a t27a 3034 paddsw m14, m21 ; t23a t24a 3035 psubsw m21, m11, m17 ; t18 t29 3036 paddsw m11, m17 ; t17 t30 3037 psubsw m17, m16, m15 ; t21 t26 3038 paddsw m16, m15 ; t22 t25 3039 ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a 3040 ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 3041 ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 3042 ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a 3043 vbroadcasti32x4 m18, 
[o(deint_shuf)] 3044 vpbroadcastd m8, [o(pw_m2896_2896)] 3045 vpbroadcastd m9, [o(pw_2896_2896)] 3046 psubsw m15, m20, m14 ; t23 t24 3047 paddsw m20, m14 ; t16 t31 3048 psubsw m14, m11, m16 ; t22a t25a 3049 paddsw m11, m16 ; t17a t30a 3050 psubsw m16, m21, m17 ; t21 t26 3051 paddsw m21, m17 ; t18 t29 3052 psubsw m17, m19, m12 ; t20a t27a 3053 paddsw m19, m12 ; t19a t28a 3054 REPX {pshufb x, m18}, m20, m11, m21, m19 3055 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a 3056 ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 3057 packssdw m18, m13 ; t23a t22 3058 packssdw m12, m15 ; t24a t25 3059 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a 3060 ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 3061 packssdw m16, m13 ; t20 t21a 3062 packssdw m14, m15 ; t27 t26a 3063 punpcklqdq m13, m19, m21 ; t19a t18 3064 punpckhqdq m19, m21 ; t28a t29 3065 punpcklqdq m21, m20, m11 ; t16 t17a 3066 punpckhqdq m20, m11 ; t31 t30a 3067INIT_ZMM avx512icl 3068 mova m15, [o(permA)] 3069 ret 3070cglobal_label .main_end 3071 vpbroadcastd m10, [o(pw_2048)] 3072 vpermt2q m0, m15, m1 ; t0 t1 t2 t3 3073 vpermt2q m20, m15, m19 ; t31 t30a t29 t28a 3074 vpermt2q m2, m15, m3 ; t4 t5 t6 t7 3075 vpermt2q m14, m15, m12 ; t27 t26a t25 t24a 3076 vpermt2q m4, m15, m5 ; t8 t9 t10 t11 3077 vpermt2q m18, m15, m16 ; t23a t22 t21a t20 3078 vpermt2q m6, m15, m7 ; t12 t13 t14 t15 3079 vpermt2q m13, m15, m21 ; t19a t18 t17a t16 3080 psubsw m7, m0, m20 ; out31 out30 out29 out28 3081 paddsw m0, m20 ; out0 out1 out2 out3 3082 psubsw m5, m2, m14 ; out27 out26 out25 out24 3083 paddsw m2, m14 ; out4 out5 out6 out7 3084 psubsw m3, m4, m18 ; out23 out22 out21 out20 3085 paddsw m4, m18 ; out8 out9 out10 out11 3086 psubsw m1, m6, m13 ; out19 out18 out17 out16 3087 paddsw m6, m13 ; out12 out13 out14 out15 3088 vzeroupper 3089 ret 3090 3091%macro LOAD_PACKED_16X2 3 ; dst, row[1-2] 3092 vbroadcasti32x4 ym%1, [cq+16*%2] 3093 vbroadcasti32x4 ym8, [cq+16*%3] 3094 shufpd ym%1, ym8, 0x0c 3095%endmacro 3096 3097cglobal 
inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob 3098%undef cmp 3099 test eobd, eobd 3100 jz .dconly 3101 lea r5, [o_base] 3102 LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 3103 LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 3104 LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 3105 LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 3106 LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 3107 LOAD_PACKED_16X2 15, 5, 7 ; in5 in7 3108 LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 3109 LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 3110 pxor m4, m4 3111 REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 3112 cmp eobd, 107 3113 jb .fast 3114 LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 3115 LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 3116 LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 3117 LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 3118 call m(idct_8x16_internal_8bpc).main 3119 LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 3120 LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 3121 LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 3122 LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 3123 pxor m8, m8 3124 REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 3125 call m(inv_txfm_add_dct_dct_8x32_8bpc).main 3126 jmp .pass2 3127.fast: ; bottom half is zero 3128 mova ym5, ym4 3129 mova ym6, ym4 3130 mova ym7, ym4 3131 call m(idct_8x16_internal_8bpc).main 3132 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast 3133.pass2: 3134 vpbroadcastd m10, [o(pw_8192)] 3135 vpermt2q m0, m15, m4 ; t0 t1 t9 t8 3136 vpermt2q m20, m15, m18 ; t31 t30a t23a t22 3137 vpermt2q m3, m15, m7 ; t7 t6 t14 t15 3138 vpermt2q m12, m15, m21 ; t25 t24a t17a t16 3139 vpermt2q m2, m15, m6 ; t4 t5 t13 t12 3140 vpermt2q m14, m15, m13 ; t23a t22 t21a t20 3141 vpermt2q m1, m15, m5 ; t3 t2 t10 t11 3142 vpermt2q m19, m15, m16 ; t27 t26a t19a t18 3143 psubsw m8, m0, m20 ; out31 out30 out22 out23 3144 paddsw m0, m20 ; out0 out1 out9 out8 3145 paddsw m6, m3, m12 ; out7 out6 out14 out15 3146 psubsw m3, m12 ; out24 out25 out17 out16 3147 psubsw m5, m2, m14 ; out27 out26 out18 out19 3148 paddsw m4, m2, m14 ; out4 out5 out13 out12 3149 psubsw m7, m1, m19 ; out28 
out29 out21 out20 3150 paddsw m2, m1, m19 ; out3 out2 out10 out11 3151 vzeroupper 3152 vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 3153 vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 3154 vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 3155 vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 3156 vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 3157 vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 3158 vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 3159 vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 3160 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 3161 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 3162 call .main 3163 vpbroadcastd m8, [o(pw_2048)] 3164 REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 3165 lea r2, [strideq*3] 3166 lea r3, [dstq+strideq*4] 3167 movshdup m12, [o(permD)] 3168 pmovzxbw m8, [dstq+strideq*0] 3169 pmovzxbw m9, [dstq+strideq*1] 3170 pmovzxbw m10, [dstq+strideq*2] 3171 pmovzxbw m11, [dstq+r2 ] 3172 paddw m0, m8 3173 paddw m1, m9 3174 paddw m2, m10 3175 paddw m3, m11 3176 pmovzxbw m8, [r3+strideq*0] 3177 pmovzxbw m9, [r3+strideq*1] 3178 pmovzxbw m10, [r3+strideq*2] 3179 pmovzxbw m11, [r3+r2 ] 3180 paddw m4, m8 3181 paddw m5, m9 3182 paddw m6, m10 3183 paddw m7, m11 3184 packuswb m0, m1 3185 packuswb m2, m3 3186 vpermq m0, m12, m0 3187 vpermq m2, m12, m2 3188 mova [dstq+strideq*0], ym0 3189 vextracti32x8 [dstq+strideq*1], m0, 1 3190 mova [dstq+strideq*2], ym2 3191 vextracti32x8 [dstq+r2 ], m2, 1 3192 packuswb m4, m5 3193 packuswb m6, m7 3194 vpermq m4, m12, m4 3195 vpermq m6, m12, m6 3196 mova [r3+strideq*0], ym4 3197 vextracti32x8 [r3+strideq*1], m4, 1 3198 mova [r3+strideq*2], ym6 3199 vextracti32x8 [r3+r2 ], m6, 1 3200 RET 3201.dconly: 3202 movsx r6d, word [cq] 3203 mov [cq], eobd 3204 or r3d, 8 3205.dconly2: 3206 imul r6d, 181 3207 add r6d, 128+512 3208 sar r6d, 8+2 3209.dconly3: 3210 imul r6d, 181 3211 add r6d, 128+2048 3212 sar r6d, 8+4 3213 pxor m2, m2 3214 vpbroadcastw m3, r6d 
3215.dconly_loop: 3216 mova ym1, [dstq+strideq*0] 3217 vinserti32x8 m1, [dstq+strideq*1], 1 3218 punpcklbw m0, m1, m2 3219 punpckhbw m1, m2 3220 paddw m0, m3 3221 paddw m1, m3 3222 packuswb m0, m1 3223 mova [dstq+strideq*0], ym0 3224 vextracti32x8 [dstq+strideq*1], m0, 1 3225 lea dstq, [dstq+strideq*2] 3226 sub r3d, 2 3227 jg .dconly_loop 3228 RET 3229ALIGN function_align 3230cglobal_label .main 3231 vpbroadcastd m10, [o(pd_2048)] 3232.main2: 3233 ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a 3234 ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a 3235 ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3 3236 vpbroadcastd m11, [o(pw_2896_2896)] 3237 vpbroadcastd m12, [o(pw_m2896_2896)] 3238 ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0 3239.main3: 3240 paddsw m8, m1, m5 ; t4 3241 psubsw m1, m5 ; t5a 3242 paddsw m9, m7, m3 ; t7 3243 psubsw m7, m3 ; t6a 3244 ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6 3245 psubsw m5, m0, m2 ; dct4 out2 3246 paddsw m2, m0 ; dct4 out1 3247 paddsw m0, m4, m6 ; dct4 out0 3248 psubsw m4, m6 ; dct4 out3 3249 psubsw m6, m2, m1 ; out6 3250 paddsw m1, m2 ; out1 3251 paddsw m2, m5, m7 ; out2 3252 psubsw m5, m7 ; out5 3253 psubsw m7, m0, m9 ; out7 3254 paddsw m0, m9 ; out0 3255 paddsw m3, m4, m8 ; out3 3256 psubsw m4, m8 ; out4 3257 ret 3258 3259cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c 3260 vpbroadcastd m7, [pw_5] 3261 paddsw m0, m7, [cq+64*0] 3262 paddsw m1, m7, [cq+64*1] 3263 vpbroadcastd ym9, strided 3264 paddsw m2, m7, [cq+64*2] 3265 paddsw m3, m7, [cq+64*3] 3266 paddsw m4, m7, [cq+64*4] 3267 paddsw m5, m7, [cq+64*5] 3268 paddsw m6, m7, [cq+64*6] 3269 paddsw m7, [cq+64*7] 3270 pmulld ym14, ym9, [pd_0to15] 3271 lea r3, [dstq+strideq*1] 3272 lea r4, [dstq+strideq*2] 3273 kxnorb k1, k1, k1 3274 pxor m13, m13 3275 add r1, r4 ; dstq+strideq*3 3276 kmovb k2, k1 3277 vpgatherdq m9{k1}, [r0+ym14*4] 3278 kmovb k1, k2 3279 vpgatherdq m10{k2}, [r3+ym14*4] 3280 kmovb k2, k1 3281 call 
m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 3282 REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 3283 vpgatherdq m11{k1}, [r4+ym14*4] 3284 kmovb k1, k2 3285 vpgatherdq m12{k2}, [r1+ym14*4] 3286 REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 3287 punpcklbw m8, m9, m13 ; 0 8 16 24 3288 punpckhbw m9, m13 ; 4 12 20 28 3289 paddw m0, m8 3290 paddw m4, m9 3291 packuswb m0, m4 3292 kmovb k2, k1 3293 vpscatterdq [r0+ym14*4]{k1}, m0 3294 punpcklbw m8, m10, m13 ; 1 9 17 25 3295 punpckhbw m10, m13 ; 5 13 21 29 3296 paddw m1, m8 3297 paddw m5, m10 3298 packuswb m1, m5 3299 kmovb k1, k2 3300 vpscatterdq [r3+ym14*4]{k2}, m1 3301 punpcklbw m8, m11, m13 ; 2 10 18 26 3302 punpckhbw m11, m13 ; 6 14 22 30 3303 paddw m2, m8 3304 paddw m6, m11 3305 packuswb m2, m6 3306 kmovb k2, k1 3307 vpscatterdq [r4+ym14*4]{k1}, m2 3308 punpcklbw m8, m12, m13 ; 3 11 19 27 3309 punpckhbw m12, m13 ; 7 15 23 31 3310 paddw m3, m8 3311 paddw m7, m12 3312 packuswb m3, m7 3313 vpscatterdq [r1+ym14*4]{k2}, m3 3314 RET 3315 3316cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c 3317 vpbroadcastd m0, [pw_4096] 3318 pmulhrsw m3, m0, [cq+64*0] 3319 pmulhrsw m4, m0, [cq+64*4] 3320 pmulhrsw m6, m0, [cq+64*1] 3321 pmulhrsw m5, m0, [cq+64*5] 3322 pmulhrsw m7, m0, [cq+64*2] 3323 pmulhrsw m2, m0, [cq+64*6] 3324 pmulhrsw m8, m0, [cq+64*3] 3325 pmulhrsw m0, [cq+64*7] 3326 mova m13, [int8_permA] 3327 lea r3, [strideq*3] 3328 lea r4, [dstq+strideq*4] 3329 punpckldq m1, m3, m4 3330 punpckhdq m3, m4 3331 punpckldq m4, m6, m5 3332 punpckhdq m6, m5 3333 punpckldq m5, m7, m2 3334 punpckhdq m7, m2 3335 punpckldq m2, m8, m0 3336 punpckhdq m8, m0 3337 mova ym9, [dstq+strideq*0] 3338 vinserti32x8 m9, [dstq+strideq*2], 1 3339 mova ym10, [dstq+strideq*1] 3340 vinserti32x8 m10, [dstq+r3 ], 1 3341 mova ym11, [r4+strideq*0] 3342 vinserti32x8 m11, [r4+strideq*2], 1 3343 mova ym12, [r4+strideq*1] 3344 vinserti32x8 m12, [r4+r3 ], 1 3345 REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 3346 pxor 
m13, m13 ; (operand tail of the preceding pxor: m13 = 0, zero reg used below)
    ; identity_identity_32x8, second half: the vpermb-permuted rows are
    ; interleaved back into per-line order, then added to the dst pixels.
    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear coeff buffer
    punpcklqdq      m0, m1, m4  ; a0 a2 c0 c2
    punpckhqdq      m1, m4      ; b0 b2 d0 d2
    punpcklqdq      m4, m5, m2  ; a1 a3 c1 c3
    punpckhqdq      m5, m2      ; b1 b3 d1 d3
    punpcklqdq      m2, m3, m6  ; e0 e2 g0 g2
    punpckhqdq      m3, m6      ; f0 f2 h0 h2
    punpcklqdq      m6, m7, m8  ; e1 e3 g1 g3
    punpckhqdq      m7, m8      ; f1 f3 h1 h3
    ; dst lines were preloaded into m9-m12; widen bytes to words against
    ; zero (m13), add the residual, pack back with unsigned saturation.
    punpcklbw       m8, m9, m13
    punpckhbw       m9, m13
    paddw           m0, m8
    paddw           m4, m9
    packuswb        m0, m4
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*2], m0, 1
    punpcklbw       m8, m10, m13
    punpckhbw      m10, m13
    paddw           m1, m8
    paddw           m5, m10
    packuswb        m1, m5
    mova          [dstq+strideq*1], ym1
    vextracti32x8 [dstq+r3       ], m1, 1
    punpcklbw       m8, m11, m13
    punpckhbw      m11, m13
    paddw           m2, m8
    paddw           m6, m11
    packuswb        m2, m6
    mova          [r4+strideq*0], ym2
    vextracti32x8 [r4+strideq*2], m2, 1
    punpcklbw       m8, m12, m13
    punpckhbw      m12, m13
    paddw           m3, m8
    paddw           m7, m12
    packuswb        m3, m7
    mova          [r4+strideq*1], ym3
    vextracti32x8 [r4+r3       ], m3, 1
    RET

; Output stage for one group of rows of the 16x32 inverse DCT.
; %1/%2 = registers holding two output rows each, %3 = group index used to
; address the two coefficient rows in cq. Expects (set up in .pass2):
; m10 = pw_2048 rounding constant, m11 = end_16x32p byte permute,
; m12 = m11 >> 8 (dword permute back to raster order), m13 = 0,
; r3 = strideq*3.
%macro IDCT_16x32_END 3 ; src[1-2], row
    mova           xm8, [dstq+strideq*0]
    vinserti32x4   ym8, [dstq+strideq*1], 1
    mova           xm9, [dstq+r3       ]
    vinserti32x4   ym9, [dstq+strideq*2], 1
    pmulhrsw       m%1, m10       ; round by pw_2048
    pmulhrsw       m%2, m10
    vpermb          m8, m11, m8   ; reorder dst bytes into transform order
    vpermb          m9, m11, m9
    mova [cq+64*(%3*2+0)], m13    ; clear the two consumed coefficient rows
    mova [cq+64*(%3*2+1)], m13
    paddw           m8, m%1
    paddw           m9, m%2
    packuswb        m8, m9        ; clip to 8-bit
    vpermd          m8, m12, m8   ; back into raster line order
    mova          [dstq+strideq*0], xm8
    vextracti32x4 [dstq+strideq*1], ym8, 1
    vextracti32x4 [dstq+strideq*2], m8, 2
    vextracti32x4 [dstq+r3       ], m8, 3
%if %1 != 20 ; every group except the last (regs 20/21) advances 4 lines
    lea           dstq, [dstq+strideq*4]
%endif
%endmacro

; 16x32 inverse DCT+DCT add, 8-bit. eob selects between the full path and
; a fast path used when the right half of the coefficients is zero.
cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test          eobd, eobd
    jz .dconly                    ; DC-only block
    vpbroadcastd   m15, [o(pw_2896x8)]
    cmp           eobd, 151
    jb .fast
pmulhrsw m5, m15, [cq+64*10] 3419 pmulhrsw m3, m15, [cq+64* 6] 3420 pmulhrsw m1, m15, [cq+64* 2] 3421 pmulhrsw m7, m15, [cq+64*14] 3422 pmulhrsw m2, m15, [cq+64* 4] 3423 pmulhrsw m6, m15, [cq+64*12] 3424 pmulhrsw m0, m15, [cq+64* 0] 3425 pmulhrsw m4, m15, [cq+64* 8] 3426 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 3427 pmulhrsw m14, m15, [cq+64* 1] 3428 pmulhrsw m21, m15, [cq+64*15] 3429 pmulhrsw m18, m15, [cq+64* 9] 3430 pmulhrsw m17, m15, [cq+64* 7] 3431 pmulhrsw m16, m15, [cq+64* 5] 3432 pmulhrsw m19, m15, [cq+64*11] 3433 pmulhrsw m20, m15, [cq+64*13] 3434 pmulhrsw m15, [cq+64* 3] 3435 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 3436 mova m8, [o(idct_16x32p)] 3437 vpbroadcastd m9, [o(pw_16384)] 3438 REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3439 m14, m15, m16, m17, m18, m19, m20, m21 3440 punpckldq m8, m0, m1 3441 punpckhdq m0, m1 3442 punpckldq m1, m2, m3 3443 punpckhdq m2, m3 3444 REPX {pmulhrsw x, m9}, m8, m0, m1, m2 3445 punpckldq m3, m4, m5 3446 punpckhdq m4, m5 3447 punpckldq m5, m6, m7 3448 punpckhdq m6, m7 3449 REPX {pmulhrsw x, m9}, m3, m4, m5, m6 3450 punpckldq m7, m14, m15 3451 punpckhdq m14, m15 3452 punpckldq m15, m16, m17 3453 punpckhdq m16, m17 3454 REPX {pmulhrsw x, m9}, m7, m14, m15, m16 3455 punpckldq m17, m18, m19 3456 punpckhdq m18, m19 3457 punpckldq m19, m20, m21 3458 punpckhdq m20, m21 3459 REPX {pmulhrsw x, m9}, m17, m18, m19, m20 3460 punpcklqdq m21, m8, m1 3461 punpckhqdq m8, m1 3462 punpcklqdq m1, m0, m2 3463 punpckhqdq m0, m2 3464 punpcklqdq m2, m3, m5 3465 punpckhqdq m3, m5 3466 punpcklqdq m5, m4, m6 3467 punpckhqdq m4, m6 3468 punpcklqdq m6, m7, m15 3469 punpckhqdq m7, m15 3470 punpcklqdq m15, m14, m16 3471 punpckhqdq m14, m16 3472 punpcklqdq m16, m17, m19 3473 punpckhqdq m17, m19 3474 punpcklqdq m19, m18, m20 3475 punpckhqdq m18, m20 3476 vinserti32x8 m20, m21, ym2, 1 3477 vshufi32x4 m21, m2, q3232 3478 vinserti32x8 m2, m8, ym3, 1 3479 vshufi32x4 m8, m3, q3232 3480 vinserti32x8 m3, m1, ym5, 1 3481 
vshufi32x4 m1, m5, q3232 3482 vinserti32x8 m5, m0, ym4, 1 3483 vshufi32x4 m0, m4, q3232 3484 vinserti32x8 m4, m6, ym16, 1 3485 vshufi32x4 m6, m16, q3232 3486 vinserti32x8 m16, m7, ym17, 1 3487 vshufi32x4 m7, m17, q3232 3488 vinserti32x8 m17, m15, ym19, 1 3489 vshufi32x4 m15, m19, q3232 3490 vinserti32x8 m19, m14, ym18, 1 3491 vshufi32x4 m14, m18, q3232 3492 vshufi32x4 m18, m21, m6, q3131 ; 27 5 3493 vshufi32x4 m21, m6, q2020 ; 31 1 3494 vshufi32x4 m6, m8, m7, q2020 ; 24 8 3495 vshufi32x4 m8, m7, q3131 ; 30 2 3496 vshufi32x4 m7, m1, m15, q2020 ; 28 4 3497 vshufi32x4 m1, m15, q3131 ; 6 26 3498 vshufi32x4 m15, m0, m14, q2020 ; 7 25 3499 vshufi32x4 m0, m14, q3131 ; 14 18 3500 vshufi32x4 m14, m20, m4, q2020 ; 3 29 3501 vshufi32x4 m20, m4, q3131 ; 23 9 3502 vshufi32x4 m9, m3, m17, q2020 ; 16 0 3503 vshufi32x4 m3, m17, q3131 ; 12 20 3504 vshufi32x4 m17, m5, m19, q2020 ; 15 17 3505 vshufi32x4 m5, m19, q3131 ; 22 10 3506 vshufi32x4 m19, m2, m16, q2020 ; 19 13 3507 vshufi32x4 m16, m2, m16, q3131 ; 11 21 3508 call m(idct_16x16_internal_8bpc).main3 3509 call .main_oddhalf 3510 jmp .pass2 3511.fast: ; right half is zero 3512 mova ym8, [cq+64*15] 3513 vinserti32x8 m8, [cq+64* 1], 1 3514 mova m2, [o(int16_perm)] 3515 mova ym9, [cq+64* 8] 3516 vinserti32x8 m9, [cq+64* 0], 1 3517 mova ym0, [cq+64* 7] 3518 vinserti32x8 m0, [cq+64* 9], 1 3519 mova ym7, [cq+64*14] 3520 vinserti32x8 m7, [cq+64* 2], 1 3521 mova ym1, [cq+64* 3] 3522 vinserti32x8 m1, [cq+64*13], 1 3523 mova ym3, [cq+64* 6] 3524 vinserti32x8 m3, [cq+64*10], 1 3525 mova ym5, [cq+64*11] 3526 vinserti32x8 m5, [cq+64* 5], 1 3527 mova ym6, [cq+64*12] 3528 vinserti32x8 m6, [cq+64* 4], 1 3529 REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 3530 REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 3531 call m(idct_16x16_internal_8bpc).main2 3532 vbroadcasti32x4 m8, [o(int_shuf3)] 3533 vbroadcasti32x4 m9, [o(int_shuf4)] 3534 vpbroadcastd m11, [o(pw_16384)] 3535 pshufb m0, m8 3536 pshufb m1, m9 3537 pshufb m2, m8 3538 
pshufb m3, m9 3539 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 3540 pshufb m4, m8 3541 pshufb m5, m9 3542 pshufb m6, m8 3543 pshufb m7, m9 3544 REPX {pmulhrsw x, m11}, m4, m5, m6, m7 3545 punpckhdq m17, m0, m1 3546 punpckldq m0, m1 3547 punpckhdq m16, m2, m3 3548 punpckldq m2, m3 3549 punpckhdq m18, m4, m5 3550 punpckldq m4, m5 3551 punpckhdq m5, m6, m7 3552 punpckldq m6, m7 3553 vinserti32x8 m1, m0, ym2, 1 3554 vshufi32x4 m3, m0, m2, q3232 3555 vinserti32x8 m2, m4, ym6, 1 3556 vshufi32x4 m4, m6, q3232 3557 vinserti32x8 m15, m17, ym16, 1 3558 vshufi32x4 m17, m16, q3232 3559 vinserti32x8 m16, m18, ym5, 1 3560 vshufi32x4 m18, m5, q3232 3561 vshufi32x4 m0, m1, m2, q2020 ; 0 2 3562 vshufi32x4 m1, m2, q3131 ; 4 6 3563 vshufi32x4 m2, m3, m4, q2020 ; 8 10 3564 vshufi32x4 m3, m4, q3131 ; 12 14 3565 vshufi32x4 m14, m15, m16, q2020 ; 1 3 3566 vshufi32x4 m15, m16, q3131 ; 5 7 3567 vshufi32x4 m16, m17, m18, q2020 ; 9 11 3568 vshufi32x4 m17, m18, q3131 ; 13 15 3569 pxor m6, m6 3570 punpckhwd m8, m0, m0 3571 punpcklwd m9, m6, m0 3572 punpckhwd m0, m3, m3 3573 punpckhwd m5, m2, m2 3574 punpcklwd m7, m1, m1 3575 punpckhwd m1, m1 3576 punpcklwd m3, m3 3577 punpcklwd m6, m2 3578 call m(idct_16x16_internal_8bpc).main_fast5 3579 punpcklwd m21, m14, m14 3580 punpckhwd m14, m14 3581 punpcklwd m18, m15, m15 3582 punpckhwd m15, m15 3583 punpcklwd m20, m16, m16 3584 punpckhwd m16, m16 3585 punpcklwd m19, m17, m17 3586 punpckhwd m17, m17 3587 call .main_oddhalf_fast 3588.pass2: 3589 vpbroadcastd m10, [o(pw_2048)] 3590 mova m11, [o(end_16x32p)] 3591 lea r3, [strideq*3] 3592 pxor m13, m13 3593 psrld m12, m11, 8 3594 IDCT_16x32_END 0, 1, 0 3595 IDCT_16x32_END 2, 3, 1 3596 IDCT_16x32_END 4, 5, 2 3597 IDCT_16x32_END 6, 7, 3 3598 IDCT_16x32_END 14, 15, 4 3599 IDCT_16x32_END 16, 17, 5 3600 IDCT_16x32_END 18, 19, 6 3601 IDCT_16x32_END 20, 21, 7 3602 RET 3603ALIGN function_align 3604.dconly: 3605 movsx r6d, word [cq] 3606 mov [cq], eobd 3607 or r3d, 32 3608 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly 
ALIGN function_align
; Odd half (t16..t31) of the 32-point inverse DCT columns for the 16x32
; transform. Each zmm register carries two interleaved rows; the trailing
; comments track the conventional tNN butterfly names. m10 holds the
; rounding bias used by the packed-rotation macros (added, then >> 12).
cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    ; With only the first quarter of the inputs live, each first-stage
    ; rotation has a single non-zero operand, so one pmulhrsw per register
    ; replaces the full butterfly, and the next stage's sums/differences
    ; degenerate into plain copies.
    vpbroadcastd    m8, [o(pw_201_4091x8)]
    vpbroadcastd   m20, [o(pw_m1380_3857x8)]
    vpbroadcastd    m9, [o(pw_995_3973x8)]
    vpbroadcastd   m16, [o(pw_m601_4052x8)]
    pmulhrsw       m21, m8  ; t16a, t31a
    pmulhrsw       m20, m15 ; t19a, t28a
    pmulhrsw       m18, m9  ; t20a, t27a
    pmulhrsw       m14, m16 ; t23a, t24a
    mova            m8, m21
    mova           m17, m20
    mova           m15, m18
    mova           m16, m14
    jmp .main3
ALIGN function_align
cglobal_label .main_oddhalf_fast ; bottom half is zero
    ; Single-multiply first stage; constant loads are interleaved with the
    ; multiplies to reuse m8/m9/m11/m12 as broadcast scratch.
    vpbroadcastd    m8, [o(pw_201_4091x8)]
    vpbroadcastd    m9, [o(pw_m2751_3035x8)]
    vpbroadcastd   m11, [o(pw_1751_3703x8)]
    vpbroadcastd   m12, [o(pw_m1380_3857x8)]
    pmulhrsw       m21, m8  ; t16a, t31a
    vpbroadcastd    m8, [o(pw_995_3973x8)]
    pmulhrsw       m17, m9  ; t17a, t30a
    vpbroadcastd    m9, [o(pw_m2106_3513x8)]
    pmulhrsw       m20, m11 ; t18a, t29a
    vpbroadcastd   m11, [o(pw_2440_3290x8)]
    pmulhrsw       m15, m12 ; t19a, t28a
    vpbroadcastd   m12, [o(pw_m601_4052x8)]
    pmulhrsw       m18, m8  ; t20a, t27a
    pmulhrsw       m16, m9  ; t21a, t26a
    pmulhrsw       m19, m11 ; t22a, t25a
    pmulhrsw       m14, m12 ; t23a, t24a
    jmp .main2
ALIGN function_align
cglobal_label .main_oddhalf
    ; Full first-stage packed rotations (cos/sin twiddle pairs).
    ITX_MUL2X_PACK  21, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
    ITX_MUL2X_PACK  17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
    ITX_MUL2X_PACK  20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
    ITX_MUL2X_PACK  15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
    ITX_MUL2X_PACK  18, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
    ITX_MUL2X_PACK  16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
    ITX_MUL2X_PACK  19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
    ITX_MUL2X_PACK  14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
.main2:
    ; Second stage: pairwise sums/differences.
    psubsw          m8, m21, m17 ; t17 t30
    paddsw         m21, m17      ; t16 t31
    psubsw         m17, m15, m20 ; t18 t29
    paddsw         m20, m15      ; t19 t28
    psubsw         m15, m18, m16 ; t21 t26
    paddsw         m18, m16      ; t20 t27
    psubsw         m16, m14,
m19 ; t22 t25
    paddsw         m14, m19      ; t23 t24
.main3:
    ; Third stage: 799/4017 and 3406/2276 rotations on the difference terms.
    ITX_MUL2X_PACK   8, 9, 19, 10,   799, 4017, 5 ; t17a t30a
    ITX_MUL2X_PACK  17, 9, 19, 10, m4017,  799, 5 ; t18a t29a
    ITX_MUL2X_PACK  15, 9, 19, 10,  3406, 2276, 5 ; t21a t26a
    ITX_MUL2X_PACK  16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
    vpbroadcastd   m11, [o(pw_m3784_1567)]
    psubsw         m19, m21, m20 ; t19a t28a
    paddsw         m21, m20      ; t16a t31a
    psubsw         m20, m14, m18 ; t20a t27a
    paddsw         m14, m18      ; t23a t24a
    psubsw         m18, m8, m17  ; t18 t29
    paddsw          m8, m17      ; t17 t30
    psubsw         m17, m16, m15 ; t21 t26
    paddsw         m15, m16      ; t22 t25
    ; 1567/3784 rotations.
    ITX_MUL2X_PACK  18, 9, 16, 10, 1567_3784, 11, 20   ; t18a t29a
    ITX_MUL2X_PACK  19, 9, 16, 10, 1567_3784, 11, 20   ; t19 t28
    ITX_MUL2X_PACK  20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27
    ITX_MUL2X_PACK  17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
    vbroadcasti32x4 m9, [o(deint_shuf)]
    psubsw         m16, m21, m14 ; t23 t24
    paddsw         m14, m21      ; t16 t31
    psubsw         m21, m8, m15  ; t22a t25a
    paddsw         m15, m8       ; t17a t30a
    psubsw          m8, m18, m17 ; t21 t26
    paddsw         m18, m17      ; t18 t29
    paddsw         m17, m19, m20 ; t19a t28a
    psubsw         m19, m20      ; t20a t27a
    ; Final rotations by +/-2896 (2896/4096 ~= 1/sqrt(2)), expanded to
    ; 32-bit via vpdpwssd with the bias in m10, then narrowed (>> 12).
    vpbroadcastd   m11, [o(pw_m2896_2896)]
    vpbroadcastd   m12, [o(pw_2896_2896)]
    REPX {pshufb x, m9}, m14, m15, m18, m17
    mova            m9, m10
    vpdpwssd        m9, m16, m11
    mova           m20, m10
    vpdpwssd       m20, m21, m11
    psrad           m9, 12
    psrad          m20, 12
    packssdw        m9, m20      ; t23a t22
    mova           m20, m10
    vpdpwssd       m20, m16, m12
    mova           m16, m10
    vpdpwssd       m16, m21, m12
    psrad          m20, 12
    psrad          m16, 12
    packssdw       m16, m20, m16 ; t24a t25
    ITX_MUL2X_PACK   8, 21, 20, 10, 11, 12, 8 ; t21a t26a
    ITX_MUL2X_PACK  19,  8, 11, 10, 11, 12, 8 ; t20 t27
    packssdw       m11, m20      ; t27 t26a
    packssdw        m8, m21      ; t20 t21a
    punpcklqdq     m20, m14, m15 ; t16 t17a
    punpckhqdq     m14, m15      ; t31 t30a
    punpckhqdq     m15, m17, m18 ; t28a t29
    punpcklqdq     m17, m18      ; t19a t18
    ; Last stage: combine with the even half (m0-m7) into out0..out31
    ; (operands continue on the next source line).
    psubsw
m21, m0, m14 ; out31 out30 3715 paddsw m0, m14 ; out0 out1 3716 psubsw m14, m7, m20 ; out16 out17 3717 paddsw m7, m20 ; out15 out14 3718 psubsw m20, m1, m15 ; out28 out29 3719 paddsw m1, m15 ; out3 out2 3720 psubsw m15, m6, m17 ; out19 out18 3721 paddsw m6, m17 ; out12 out13 3722 psubsw m17, m4, m9 ; out23 out22 3723 paddsw m4, m9 ; out8 out9 3724 psubsw m18, m3, m16 ; out24 out25 3725 paddsw m3, m16 ; out7 out6 3726 psubsw m16, m5, m8 ; out20 out21 3727 paddsw m5, m8 ; out11 out10 3728 psubsw m19, m2, m11 ; out27 out26 3729 paddsw m2, m11 ; out4 out5 3730 ret 3731 3732cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob 3733%undef cmp 3734 lea r5, [o_base] 3735 test eobd, eobd 3736 jz .dconly 3737 mova m21, [o(permB)] 3738 vpermq m1, m21, [cq+64* 0] ; 0 1 3739 vpermq m14, m21, [cq+64* 1] ; 2 3 3740 vpermq m20, m21, [cq+64* 2] ; 4 5 3741 vpermq m15, m21, [cq+64* 3] ; 6 7 3742 vpbroadcastd m8, [o(pw_2896x8)] 3743 vpermq m2, m21, [cq+64* 4] ; 8 9 3744 vpermq m16, m21, [cq+64* 5] ; 10 11 3745 vpermq m3, m21, [cq+64* 6] ; 12 13 3746 vpermq m17, m21, [cq+64* 7] ; 14 15 3747 REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 3748 pxor m12, m12 3749 REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 3750 cmp eobd, 151 3751 jb .fast 3752 vpermq m9, m21, [cq+64* 8] ; 16 17 3753 vpermq m19, m21, [cq+64* 9] ; 18 19 3754 vpermq m4, m21, [cq+64*10] ; 20 21 3755 vpermq m5, m21, [cq+64*11] ; 22 23 3756 vpermq m6, m21, [cq+64*12] ; 24 25 3757 vpermq m18, m21, [cq+64*13] ; 26 27 3758 vpermq m7, m21, [cq+64*14] ; 28 29 3759 vpermq m21, m21, [cq+64*15] ; 30 31 3760 REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 3761 REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 3762 punpcklwd m8, m21, m14 ; 30 2 3763 punpckhwd m21, m1 ; 31 1 3764 punpcklwd m0, m17, m19 ; 14 18 3765 punpckhwd m17, m9 ; 15 17 3766 punpcklwd m9, m1 ; 16 0 3767 punpckhwd m14, m7 ; 3 29 3768 punpcklwd m1, m15, m18 ; 6 26 3769 punpckhwd m15, m6 ; 7 25 3770 punpcklwd m6, m2 ; 
24 8 3771 punpckhwd m19, m3 ; 19 13 3772 punpcklwd m3, m4 ; 12 20 3773 punpckhwd m18, m20 ; 27 5 3774 punpcklwd m7, m20 ; 28 4 3775 punpckhwd m20, m5, m2 ; 23 9 3776 punpcklwd m5, m16 ; 22 10 3777 punpckhwd m16, m4 ; 11 21 3778 call m(idct_16x16_internal_8bpc).main2 3779 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf 3780 jmp .pass2 3781.fast: ; bottom half zero 3782 punpcklwd m8, m14, m14 ; 2 3783 punpcklwd m0, m17, m17 ; 14 3784 punpcklwd m5, m16, m16 ; 10 3785 punpcklwd m9, m12, m1 ; __ 0 3786 punpckhwd m21, m1, m1 ; 1 3787 punpcklwd m1, m15, m15 ; 6 3788 punpcklwd m7, m20, m20 ; 4 3789 punpckhwd m19, m3, m3 ; 13 3790 punpcklwd m3, m3 ; 12 3791 punpcklwd m6, m12, m2 ; __ 8 3792 punpckhwd m18, m20, m20 ; 5 3793 punpckhwd m20, m2, m2 ; 9 3794 call m(idct_16x16_internal_8bpc).main_fast 3795 punpckhwd m15, m15 ; 7 3796 punpckhwd m14, m14 ; 3 3797 punpckhwd m16, m16 ; 11 3798 punpckhwd m17, m17 ; 15 3799 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 3800.pass2: 3801 vpbroadcastd m9, [o(pw_16384)] 3802 call .transpose_round 3803 vshufi32x4 m16, m14, m2, q3131 ; 5 3804 vshufi32x4 m14, m2, q2020 ; 1 3805 vshufi32x4 m2, m0, m3, q3131 ; 4 3806 vshufi32x4 m0, m3, q2020 ; 0 3807 vshufi32x4 m3, m1, m18, q3131 ; 6 3808 vshufi32x4 m1, m18, q2020 ; 2 3809 vshufi32x4 m18, m20, m6, q2020 ; 9 3810 vshufi32x4 m20, m6, q3131 ; 13 3811 vshufi32x4 m6, m21, m4, q3131 ; 12 3812 vshufi32x4 m4, m21, m4, q2020 ; 8 3813 vshufi32x4 m21, m19, m7, q3131 ; 15 3814 vshufi32x4 m19, m7, q2020 ; 11 3815 vshufi32x4 m7, m5, m15, q3131 ; 14 3816 vshufi32x4 m5, m15, q2020 ; 10 3817 vshufi32x4 m15, m17, m9, q2020 ; 3 3818 vshufi32x4 m17, m9, q3131 ; 7 3819 call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 3820 call .main_oddhalf 3821 vpbroadcastd m12, [o(pw_2048)] 3822 movshdup m13, [o(permD)] 3823 lea r2, [strideq*3] 3824 pmovzxbw m8, [dstq+strideq*0] 3825 pmovzxbw m9, [dstq+strideq*1] 3826 pmovzxbw m10, [dstq+strideq*2] 3827 pmovzxbw m11, [dstq+r2 ] 3828 REPX {pmulhrsw x, m12}, m0, m1, 
m2, m3 3829 lea r3, [dstq+strideq*4] 3830 paddw m0, m8 3831 paddw m1, m9 3832 paddw m2, m10 3833 paddw m3, m11 3834 pmovzxbw m8, [r3+strideq*0] 3835 pmovzxbw m9, [r3+strideq*1] 3836 pmovzxbw m10, [r3+strideq*2] 3837 pmovzxbw m11, [r3+r2 ] 3838 REPX {pmulhrsw x, m12}, m4, m5, m6, m7 3839 lea r4, [dstq+strideq*8] 3840 packuswb m0, m1 3841 paddw m4, m8 3842 paddw m5, m9 3843 packuswb m2, m3 3844 paddw m6, m10 3845 paddw m7, m11 3846 pmovzxbw m8, [r4+strideq*0] 3847 pmovzxbw m9, [r4+strideq*1] 3848 pmovzxbw m10, [r4+strideq*2] 3849 pmovzxbw m11, [r4+r2 ] 3850 REPX {pmulhrsw x, m12}, m14, m15, m16, m17 3851 lea r5, [r3+strideq*8] 3852 packuswb m4, m5 3853 paddw m14, m8 3854 paddw m15, m9 3855 packuswb m6, m7 3856 paddw m16, m10 3857 paddw m17, m11 3858 pmovzxbw m8, [r5+strideq*0] 3859 pmovzxbw m9, [r5+strideq*1] 3860 pmovzxbw m10, [r5+strideq*2] 3861 pmovzxbw m11, [r5+r2 ] 3862 REPX {pmulhrsw x, m12}, m18, m19, m20, m21 3863 packuswb m14, m15 3864 paddw m18, m8 3865 paddw m19, m9 3866 packuswb m16, m17 3867 paddw m20, m10 3868 paddw m21, m11 3869 packuswb m18, m19 3870 packuswb m20, m21 3871 REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 3872 mova [dstq+strideq*0], ym0 3873 vextracti32x8 [dstq+strideq*1], m0, 1 3874 mova [dstq+strideq*2], ym2 3875 vextracti32x8 [dstq+r2 ], m2, 1 3876 mova [r3+strideq*0], ym4 3877 vextracti32x8 [r3+strideq*1], m4, 1 3878 mova [r3+strideq*2], ym6 3879 vextracti32x8 [r3+r2 ], m6, 1 3880 mova [r4+strideq*0], ym14 3881 vextracti32x8 [r4+strideq*1], m14, 1 3882 mova [r4+strideq*2], ym16 3883 vextracti32x8 [r4+r2 ], m16, 1 3884 mova [r5+strideq*0], ym18 3885 vextracti32x8 [r5+strideq*1], m18, 1 3886 mova [r5+strideq*2], ym20 3887 vextracti32x8 [r5+r2 ], m20, 1 3888 RET 3889ALIGN function_align 3890.dconly: 3891 movsx r6d, word [cq] 3892 mov [cq], eobd 3893 or r3d, 16 3894 imul r6d, 181 3895 add r6d, 128 3896 sar r6d, 8 3897 imul r6d, 181 3898 add r6d, 128+256 3899 sar r6d, 8+1 3900 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 
ALIGN function_align
; Odd half (t8..t15) of the 16-point inverse DCT columns for the 32x16
; transform; produces out0..out15 in m0-m7 / m14-m21.
cglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
    ; Only the first two input rows (m0, m14) are live: the even half
    ; collapses to a single 2896-scale of row 0 (m8 = t0 = t1 = ...), and
    ; the odd half to one 401/4076 multiply pair plus a 1567/3784 rotation
    ; of (t8a, t15a) done with dword dot-products against m10's bias.
    vpbroadcastd    m8, [o(pw_2896x8)]
    vpbroadcastd    m4, [o(pw_4076x8)]
    vpbroadcastd    m3, [o(pw_401x8)]
    pmulhrsw        m8, m0  ; t0
    pmulhrsw        m4, m14 ; t15a
    pmulhrsw        m3, m14 ; t8a
    punpcklwd       m9, m3, m4
    punpckhwd       m5, m3, m4
    mova            m2, m10
    vpdpwssd        m2, m9, [o(pw_m3784_1567)] {bcstd}
    mova            m1, m10
    vpdpwssd        m1, m5, [o(pw_m3784_1567)] {bcstd}
    mova            m6, m10
    vpdpwssd        m6, m5, [o(pw_1567_3784)] {bcstd}
    mova            m5, m10
    vpdpwssd        m5, m9, [o(pw_1567_3784)] {bcstd}
    vpbroadcastd   m11, [o(pw_2896_2896)]
    vpbroadcastd   m12, [o(pw_m2896_2896)]
    psubsw         m21, m8, m4 ; out15
    paddsw          m0, m8, m4 ; out0
    psubsw         m14, m8, m3 ; out8
    paddsw          m7, m8, m3 ; out7
    REPX {psrad x, 12}, m2, m1, m6, m5
    packssdw        m2, m1 ; t9a
    packssdw        m5, m6 ; t14a
    ITX_MULSUB_2W    4, 3, 16, 17, 10, 11, 12 ; t11, t12
    psubsw         m20, m8, m5 ; out14
    paddsw          m1, m8, m5 ; out1
    psubsw         m15, m8, m2 ; out9
    paddsw          m6, m8, m2 ; out6
    ITX_MULSUB_2W    5, 2, 16, 17, 10, 11, 12 ; t10a, t13a
    psubsw         m18, m8, m3 ; out12
    paddsw          m3, m8     ; out3
    psubsw         m17, m8, m4 ; out11
    paddsw          m4, m8     ; out4
    psubsw         m19, m8, m2 ; out13
    paddsw          m2, m8     ; out2
    psubsw         m16, m8, m5 ; out10
    paddsw          m5, m8     ; out5
    ret
cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    ; Four live rows: single-multiply first stages for both halves; the
    ; copies below substitute for the degenerate sum/difference stage
    ; before falling through to the shared .main3 tail.
    vpbroadcastd    m9, [o(pw_2896x8)]
    vpbroadcastd    m2, [o(pw_4017x8)]
    vpbroadcastd    m3, [o(pw_799x8)]
    vpbroadcastd   m18, [o(pw_4076x8)]
    vpbroadcastd   m19, [o(pw_401x8)]
    vpbroadcastd   m20, [o(pw_m1189x8)]
    vpbroadcastd   m16, [o(pw_3920x8)]
    pmulhrsw        m9, m0  ; t0
    pmulhrsw        m2, m1  ; t7a
    pmulhrsw        m1, m3  ; t4a
    pmulhrsw       m18, m14 ; t15a
    pmulhrsw       m14, m19 ; t8a
    pmulhrsw       m20, m15 ; t11a
    pmulhrsw       m15, m16 ; t12a
    psubsw          m7, m9, m2 ; idct8 out7
    paddsw          m0, m9, m2 ; idct8 out0
    psubsw
m4, m9, m1 ; idct8 out4
    paddsw          m3, m9, m1 ; idct8 out3
    ITX_MULSUB_2W    2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
    mova           m21, m18
    mova           m19, m14
    mova           m16, m15
    mova            m8, m20
    psubsw          m6, m9, m1 ; idct8 out6
    paddsw          m1, m9     ; idct8 out1
    psubsw          m5, m9, m2 ; idct8 out5
    paddsw          m2, m9     ; idct8 out2
    jmp .main3
ALIGN function_align
cglobal_label .main_oddhalf_fast ; bottom half is zero
    ; Eight live rows: the even (idct8) part is computed via the shared
    ; 32x8 .main3 tail; first stages are single multiplies per input.
    vpbroadcastd    m5, [o(pw_m2276x8)]
    vpbroadcastd   m11, [o(pw_3406x8)]
    vpbroadcastd    m7, [o(pw_4017x8)]
    vpbroadcastd   m12, [o(pw_799x8)]
    vpbroadcastd    m6, [o(pw_3784x8)]
    vpbroadcastd   m10, [o(pw_1567x8)]
    vpbroadcastd    m4, [o(pw_2896x8)]
    pmulhrsw        m5, m3  ; t5a
    pmulhrsw        m3, m11 ; t6a
    pmulhrsw        m7, m1  ; t7a
    pmulhrsw        m1, m12 ; t4a
    pmulhrsw        m6, m2  ; t3
    pmulhrsw        m2, m10 ; t2
    pmulhrsw        m4, m0  ; t0
    vpbroadcastd   m11, [o(pw_2896_2896)]
    vpbroadcastd   m12, [o(pw_m2896_2896)]
    vpbroadcastd   m10, [o(pd_2048)] ; rounding bias for the rotations
    mova            m0, m4 ; t1
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
    vpbroadcastd   m21, [o(pw_4076x8)]
    vpbroadcastd    m8, [o(pw_401x8)]
    vpbroadcastd   m18, [o(pw_m2598x8)]
    vpbroadcastd    m9, [o(pw_3166x8)]
    vpbroadcastd   m19, [o(pw_3612x8)]
    vpbroadcastd   m11, [o(pw_1931x8)]
    vpbroadcastd   m20, [o(pw_m1189x8)]
    vpbroadcastd   m12, [o(pw_3920x8)]
    pmulhrsw       m21, m14 ; t15a
    pmulhrsw       m14, m8  ; t8a
    pmulhrsw       m18, m17 ; t9a
    pmulhrsw       m17, m9  ; t14a
    pmulhrsw       m19, m16 ; t13a
    pmulhrsw       m16, m11 ; t10a
    pmulhrsw       m20, m15 ; t11a
    pmulhrsw       m15, m12 ; t12a
    jmp .main2
ALIGN function_align
cglobal_label .main_oddhalf
    ; Full first-stage rotations for t8..t15.
    ITX_MULSUB_2W   14, 21, 8, 9, 10,  401, 4076 ; t8a, t15a
    ITX_MULSUB_2W   18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a
    ITX_MULSUB_2W   16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
    ITX_MULSUB_2W   20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
.main2:
    ; Second stage: pairwise sums/differences.
    paddsw          m8, m20, m16 ; t11
    psubsw         m20, m16      ; t10
paddsw m16, m15, m19 ; t12 4020 psubsw m15, m19 ; t13 4021 psubsw m19, m14, m18 ; t9 4022 paddsw m14, m18 ; t8 4023 psubsw m18, m21, m17 ; t14 4024 paddsw m21, m17 ; t15 4025.main3: 4026 vpbroadcastd m11, [o(pw_1567_3784)] 4027 vpbroadcastd m12, [o(pw_m3784_1567)] 4028 ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a 4029 vpbroadcastd m11, [o(pw_m1567_m3784)] 4030 ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a 4031 vpbroadcastd m11, [o(pw_2896_2896)] 4032 vpbroadcastd m12, [o(pw_m2896_2896)] 4033 psubsw m17, m14, m8 ; t11a 4034 paddsw m8, m14 ; t8a 4035 paddsw m14, m18, m15 ; t9 4036 psubsw m18, m15 ; t10 4037 psubsw m15, m19, m20 ; t13 4038 paddsw m19, m20 ; t14 4039 paddsw m20, m21, m16 ; t15a 4040 psubsw m16, m21, m16 ; t12a 4041 ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a 4042 ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 4043 psubsw m21, m0, m20 ; out15 4044 paddsw m0, m20 ; out0 4045 psubsw m20, m1, m19 ; out14 4046 paddsw m1, m19 ; out1 4047 psubsw m19, m2, m18 ; out13 4048 paddsw m2, m18 ; out2 4049 psubsw m18, m3, m17 ; out12 4050 paddsw m3, m17 ; out3 4051 psubsw m17, m4, m16 ; out11 4052 paddsw m4, m16 ; out4 4053 psubsw m16, m5, m15 ; out10 4054 paddsw m5, m15 ; out5 4055 psubsw m15, m6, m14 ; out9 4056 paddsw m6, m14 ; out6 4057 psubsw m14, m7, m8 ; out8 4058 paddsw m7, m8 ; out7 4059 ret 4060.transpose_round: 4061 punpcklwd m8, m0, m2 4062 punpckhwd m0, m2 4063 punpcklwd m2, m1, m3 4064 punpckhwd m1, m3 4065 punpcklwd m3, m4, m6 4066 punpckhwd m4, m6 4067 punpcklwd m6, m5, m7 4068 punpckhwd m5, m7 4069 punpcklwd m7, m14, m16 4070 punpckhwd m14, m16 4071 punpcklwd m16, m15, m17 4072 punpckhwd m15, m17 4073 punpcklwd m17, m19, m21 4074 punpckhwd m19, m21 4075 punpckhwd m21, m18, m20 4076 punpcklwd m18, m20 4077 punpcklwd m20, m8, m1 4078 punpckhwd m8, m1 4079 punpcklwd m1, m0, m2 4080 punpckhwd m0, m2 4081 punpcklwd m2, m3, m5 4082 punpckhwd m3, m5 4083 punpcklwd m5, m4, m6 4084 punpckhwd m4, m6 4085 REPX {pmulhrsw x, m9}, m20, 
m8, m1, m0 4086 punpcklwd m6, m7, m15 4087 punpckhwd m7, m15 4088 punpcklwd m15, m14, m16 4089 punpckhwd m14, m16 4090 REPX {pmulhrsw x, m9}, m2, m3, m5, m4 4091 punpckhwd m16, m18, m19 4092 punpcklwd m18, m19 4093 punpcklwd m19, m21, m17 4094 punpckhwd m21, m17 4095 REPX {pmulhrsw x, m9}, m6, m7, m15, m14 4096 punpcklwd m17, m8, m0 ; a2 a6 aa ae 4097 punpckhwd m8, m0 ; a3 a7 ab af 4098 punpcklwd m0, m20, m1 ; a0 a4 a8 ac 4099 punpckhwd m20, m1 ; a1 a5 a9 ad 4100 REPX {pmulhrsw x, m9}, m16, m18, m19, m21 4101 punpcklwd m1, m2, m5 ; b0 b4 b8 bc 4102 punpckhwd m2, m5 ; b1 b5 b9 bd 4103 punpcklwd m5, m3, m4 ; b2 b6 ba be 4104 punpckhwd m3, m4 ; b3 b7 bb bf 4105 punpcklwd m4, m6, m15 ; c0 c4 c8 cc 4106 punpckhwd m6, m15 ; c1 c5 c9 cd 4107 punpcklwd m15, m7, m14 ; c2 c6 ca ce 4108 punpckhwd m7, m14 ; c3 c7 cb cf 4109 punpcklwd m14, m18, m19 ; d0 d4 d8 dc 4110 punpckhwd m18, m19 ; d1 d5 d9 dd 4111 punpcklwd m9, m16, m21 ; d2 d6 da de 4112 punpckhwd m16, m21 ; d3 d7 db df 4113 vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc 4114 vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 4115 vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 4116 vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be 4117 vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 4118 vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf 4119 vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 4120 vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc 4121 vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 4122 vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd 4123 vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 4124 vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd 4125 vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 4126 vshufi32x4 m15, m9, q3232 ; ca ce da de 4127 vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 4128 vshufi32x4 m7, m16, q3232 ; cb cf db df 4129 ret 4130 4131%macro IDTX_16x32 4 ; src/dst[1-4] 4132 pmulhrsw m%1, m15, [cq+64*%1] 4133 pmulhrsw m%2, m15, [cq+64*%2] 4134 pmulhrsw m%3, m15, [cq+64*%3] 4135 pmulhrsw m%4, m15, [cq+64*%4] 4136 pmulhrsw m18, m16, m%1 4137 pmulhrsw m19, m16, m%2 4138 pmulhrsw 
m20, m16, m%3 ; (cont. of IDTX_16x32: pw_1697x16 scale of row %3)
    pmulhrsw       m21, m16, m%4
    REPX {pmulhrsw x, m17}, m18, m19, m20, m21 ; x pw_16384 (i.e. halve)
    paddsw         m%1, m18
    paddsw         m%2, m19
    paddsw         m%3, m20
    paddsw         m%4, m21
%endmacro

; Adds two transformed row-registers (%1, %2) to four dst lines spaced 8
; rows apart (r3 = strideq*2, r4 = strideq*3, so offsets 0/8/16/24 rows),
; and clears the two coefficient rows for slot %1. Expects m18 = 0.
%macro IDTX_16x32_STORE 2 ; src[1-2]
    mova          xm17, [dstq+r3*0]
    vinserti128   ym17, [dstq+r3*4], 1
    vinserti32x4   m17, [dstq+r3*8], 2
    vinserti32x4   m17, [dstq+r4*8], 3
    mova [cq+64*(%1*2+0)], m18 ; clear consumed coefficients
    mova [cq+64*(%1*2+1)], m18
    punpcklbw      m16, m17, m18 ; widen dst bytes to words
    punpckhbw      m17, m18
    paddw          m16, m%1
    paddw          m17, m%2
    packuswb       m16, m17      ; clip to 8-bit
    mova          [dstq+r3*0], xm16
    vextracti128  [dstq+r3*4], ym16, 1
    vextracti32x4 [dstq+r3*8], m16, 2
    vextracti32x4 [dstq+r4*8], m16, 3
%if %1 != 7 ; every call except the last advances one line
    add           dstq, strideq
%endif
%endmacro

; 16x32 identity-identity add, 8-bit: scale all coefficient rows
; (IDTX_16x32), transpose, then accumulate into dst (IDTX_16x32_STORE).
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
    vpbroadcastd   m15, [pw_2896x8]
    vpbroadcastd   m16, [pw_1697x16]
    vpbroadcastd   m17, [pw_16384]
    IDTX_16x32      0, 1, 2, 3
    IDTX_16x32      4, 5, 6, 7
    IDTX_16x32      8, 9, 10, 11
    IDTX_16x32     12, 13, 14, 15
    vpbroadcastd   m16, [pw_8192] ; rounding applied inside the transpose
    call .transpose_2x8x8_round
    lea             r3, [strideq*2]
    lea             r4, [strideq*3]
    pxor           m18, m18
    IDTX_16x32_STORE 0, 8
    IDTX_16x32_STORE 1, 9
    IDTX_16x32_STORE 2, 10
    IDTX_16x32_STORE 3, 11
    IDTX_16x32_STORE 4, 12
    IDTX_16x32_STORE 5, 13
    IDTX_16x32_STORE 6, 14
    IDTX_16x32_STORE 7, 15
    RET
ALIGN function_align
; Transposes two 8x8 word blocks in parallel (m0-m7 and m8-m15), applying a
; pmulhrsw by m16 at the dword stage. Also called by the 32x16 identity
; transform with a different m16 constant.
.transpose_2x8x8_round:
    punpckhwd      m17, m4, m5
    punpcklwd       m4, m5
    punpckhwd       m5, m0, m1
    punpcklwd       m0, m1
    punpckhwd       m1, m6, m7
    punpcklwd       m6, m7
    punpckhwd       m7, m2, m3
    punpcklwd       m2, m3
    punpckhdq       m3, m0, m2
    punpckldq       m0, m2
    punpckldq       m2, m4, m6
    punpckhdq       m4, m6
    punpckhdq       m6, m5, m7
    punpckldq       m5, m7
    punpckldq       m7, m17, m1
    punpckhdq      m17, m1
    REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
    punpckhqdq      m1, m0, m2
    punpcklqdq      m0, m2
punpcklqdq m2, m3, m4 4212 punpckhqdq m3, m4 4213 punpcklqdq m4, m5, m7 4214 punpckhqdq m5, m7 4215 punpckhqdq m7, m6, m17 4216 punpcklqdq m6, m17 4217 punpckhwd m17, m12, m13 4218 punpcklwd m12, m13 4219 punpckhwd m13, m8, m9 4220 punpcklwd m8, m9 4221 punpckhwd m9, m14, m15 4222 punpcklwd m14, m15 4223 punpckhwd m15, m10, m11 4224 punpcklwd m10, m11 4225 punpckhdq m11, m8, m10 4226 punpckldq m8, m10 4227 punpckldq m10, m12, m14 4228 punpckhdq m12, m14 4229 punpckhdq m14, m13, m15 4230 punpckldq m13, m15 4231 punpckldq m15, m17, m9 4232 punpckhdq m17, m9 4233 REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17 4234 punpckhqdq m9, m8, m10 4235 punpcklqdq m8, m10 4236 punpcklqdq m10, m11, m12 4237 punpckhqdq m11, m12 4238 punpcklqdq m12, m13, m15 4239 punpckhqdq m13, m15 4240 punpckhqdq m15, m14, m17 4241 punpcklqdq m14, m17 4242 ret 4243 4244%macro IDTX_32x16 4 ; dst[1-4] 4245 pmulhrsw m%2, m12, [cq+32*(%1+ 0)] 4246 pmulhrsw m18, m12, [cq+32*(%1+16)] 4247 pmulhrsw m%4, m12, [cq+32*(%3+ 0)] 4248 pmulhrsw m19, m12, [cq+32*(%3+16)] 4249 REPX {paddsw x, x}, m%2, m18, m%4, m19 4250 mova m%1, m14 4251 vpermi2q m%1, m%2, m18 4252 vpermt2q m%2, m16, m18 4253%if %3 != 14 4254 mova m%3, m14 4255%endif 4256 vpermi2q m%3, m%4, m19 4257 vpermt2q m%4, m16, m19 4258 pmulhrsw m18, m17, m%1 4259 pmulhrsw m19, m17, m%2 4260 pmulhrsw m20, m17, m%3 4261 pmulhrsw m21, m17, m%4 4262 REPX {paddsw x, x}, m%1, m%2, m%3, m%4 4263 paddsw m%1, m18 4264 paddsw m%2, m19 4265 paddsw m%3, m20 4266 paddsw m%4, m21 4267%endmacro 4268 4269%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 4270 mova ym19, [dstq+strideq*0] 4271 vinserti32x8 m19, [dstq+strideq*8], 1 4272%if %3 == 0 4273 mova [cq+64*(%1*2+0)], m20 4274 mova [cq+64*(%1*2+1)], m20 4275%endif 4276 punpcklbw m18, m19, m20 4277 punpckhbw m19, m20 4278 paddw m18, m%1 4279 paddw m19, m%2 4280 packuswb m18, m19 4281 mova [dstq+strideq*0], ym18 4282 vextracti32x8 [dstq+strideq*8], m18, 1 4283%if %3 || %1 != 7 4284 add dstq, strideq 
%endif
%endmacro

; 32x16 identity/identity inverse transform + add, 8-bit.
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
    vpbroadcastd         m12, [pw_2896x8]
    movu                 m14, [permB+7]
    vpbroadcastd         m17, [pw_1697x16]
    psrlq                m16, m14, 4           ; second qword-permute pattern
    IDTX_32x16            0,  1,  2,  3
    IDTX_32x16            4,  5,  6,  7
    IDTX_32x16            8,  9, 10, 11
    IDTX_32x16           12, 13, 14, 15
    vpbroadcastd         m16, [pw_2048]
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    pxor                 m20, m20              ; zero register for the stores
    IDTX_32x16_STORE      0,  8
    IDTX_32x16_STORE      1,  9
    IDTX_32x16_STORE      2, 10
    IDTX_32x16_STORE      3, 11
    IDTX_32x16_STORE      4, 12
    IDTX_32x16_STORE      5, 13
    IDTX_32x16_STORE      6, 14
    IDTX_32x16_STORE      7, 15
    RET

; Final butterfly + store for one 32-wide row pair of the 32x32 idct:
; combines m%1 with its mirror (from a register for rows 0-7, from the
; coefficient buffer for rows 8+, which is then zeroed), rounds, adds the
; dst pixels and writes the top row to dstq and the mirrored row to r3.
%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
    pmovzxbw             m10, [dstq+%3]
    pmovzxbw             m11, [r3  +%4]
%if %2 < 8
    paddsw                m8, m%2, m%1
    psubsw                m9, m%2, m%1
%else
    mova                  m9, [cq+64*(%2*2-16)]
    paddsw                m8, m9, m%1
    psubsw                m9, m%1
%endif
    pmulhrsw              m8, m12
    pmulhrsw              m9, m12
%if %2 >= 8
%if %2 == 8
    pxor                  m0, m0
%endif
    mova [cq+64*(%2*2-16)], m0
    mova [cq+64*(%2*2-15)], m0
%endif
    paddw                 m8, m10
    paddw                 m9, m11
    packuswb              m8, m9
    vpermq                m8, m13, m8
    mova          [dstq+%3], ym8
    vextracti32x8 [r3  +%4], m8, 1
%if %2 == 3 || %2 == 7 || %2 == 11
    add                 dstq, r5
    sub                   r3, r5
%endif
%endmacro

; 32x32 dct/dct inverse transform + add, 8-bit.
cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
%undef cmp
    lea                   r5, [o_base]
    test                eobd, eobd
    jz .dconly
    WIN64_SPILL_XMM       30
    cmp                 eobd, 136
    jb .fast
    ; full path: even rows 0,4,8,...,28 through the 32x8 idct main
    mova                  m5, [cq+64*20]
    mova                  m3, [cq+64*12]
    mova                  m1, [cq+64* 4]
    mova                  m7, [cq+64*28]
    mova                  m2, [cq+64* 8]
    mova                  m6, [cq+64*24]
    mova                  m0, [cq+64* 0]
    mova                  m4, [cq+64*16]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    mova                 m14, [cq+64* 2]
    mova                 m21, [cq+64*30]
    mova                 m18, [cq+64*18]
    mova                 m17, [cq+64*14]
    mova                 m16, [cq+64*10]
    mova                 m19, [cq+64*22]
    mova                 m20, [cq+64*26]
    mova                 m15, [cq+64* 6]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    ; spill the even-half results while the odd half is computed
    mova          [cq+64* 0], m14
    mova          [cq+64* 2], m15
    mova          [cq+64* 4], m16
    mova          [cq+64* 6], m17
    mova          [cq+64* 8], m18
    mova          [cq+64*10], m19
    mova          [cq+64*12], m20
    mova          [cq+64*14], m21
    mova                 m22, [cq+64* 1]
    mova                 m21, [cq+64*31]
    mova                 m14, [cq+64*17]
    mova                 m29, [cq+64*15]
    mova                 m26, [cq+64* 9]
    mova                 m17, [cq+64*23]
    mova                 m18, [cq+64*25]
    mova                 m25, [cq+64* 7]
    mova                 m24, [cq+64* 5]
    mova                 m19, [cq+64*27]
    mova                 m16, [cq+64*21]
    mova                 m27, [cq+64*11]
    mova                 m28, [cq+64*13]
    mova                 m15, [cq+64*19]
    mova                 m20, [cq+64*29]
    mova                 m23, [cq+64* 3]
    call .main_oddhalf
    vpbroadcastd         m10, [o(pw_8192)]
    ; final stage butterflies combining even (m0-m7) and odd halves
    psubsw               m13, m0, m29 ; 31
    paddsw                m0, m29     ;  0
    psubsw               m29, m1, m28 ; 30
    paddsw                m1, m28     ;  1
    psubsw               m28, m2, m27 ; 29
    paddsw                m2, m27     ;  2
    psubsw               m27, m3, m26 ; 28
    paddsw                m3, m26     ;  3
    psubsw               m26, m4, m25 ; 27
    paddsw                m4, m25     ;  4
    psubsw               m25, m5, m24 ; 26
    paddsw                m5, m24     ;  5
    psubsw               m24, m6, m23 ; 25
    paddsw                m6, m23     ;  6
    psubsw               m23, m7, m22 ; 24
    paddsw                m7, m22     ;  7
    pxor                  m9, m9
    punpckhwd             m8, m0, m1  ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd             m0, m1      ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd             m1, m2, m3  ; c4 d4 c5 d5 c6 d6 c7 d7
    punpcklwd             m2, m3      ; c0 d0 c1 d1 c2 d2 c3 d3
    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19  ; zero consumed coefficients
    punpckhwd            m22, m4, m5  ; e4 f4 e5 f5 e6 f6 e7 f7
    punpcklwd             m4, m5      ; e0 f0 e1 f1 e2 f2 e3 f3
    punpckhwd             m5, m6, m7  ; g4 h4 g5 h5 g6 h6 g7 h7
    punpcklwd             m6, m7      ; g0 h0 g1 h1 g2 h2 g3 h3
    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
    punpckhwd             m3, m23, m24
    punpcklwd            m23, m24
    punpckhwd            m24, m25, m26
    punpcklwd            m25, m26
    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
    punpckhwd            m26, m27, m28
    punpcklwd            m27, m28
    punpckhwd            m28, m29, m13
    ; continue the word/dword/qword transpose of the combined halves
    punpcklwd            m29, m13
    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31  ; zero consumed coefficients
    punpckhdq             m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq             m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq             m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq             m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq             m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq             m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq             m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
    punpckldq            m22, m5      ; e4 f4 g4 h5 e5 f5 g5 h5
    REPX   {pmulhrsw x, m10}, m0, m4, m8, m22
    punpckhdq            m13, m23, m25
    punpckldq            m23, m25
    punpckhdq            m25, m27, m29
    punpckldq            m27, m29
    REPX   {pmulhrsw x, m10}, m13, m23, m25, m27
    punpckhdq             m9, m3, m24
    punpckldq             m3, m24
    punpckhdq            m24, m26, m28
    punpckldq            m26, m28
    punpcklqdq            m5, m23, m27 ; d00 d08 d16 d24
    punpckhqdq           m23, m27      ; d01 d09 d17 d25
    punpckhqdq           m27, m13, m25 ; d03 d11 d19 d27
    punpcklqdq           m13, m25      ; d02 d10 d18 d26
    punpckhqdq           m25, m3, m26  ; d05 d13 d21 d29
    punpcklqdq            m3, m26      ; d04 d12 d20 d28
    punpckhqdq           m26, m9, m24  ; d07 d15 d23 d31
    punpcklqdq            m9, m24      ; d06 d14 d22 d30
    REPX   {pmulhrsw x, m10}, m25, m3, m26
    ; spill the odd d-columns; they are reloaded after the even pass
    mova          [cq+64* 9], m23
    mova          [cq+64*11], m27
    mova          [cq+64*13], m25
    mova          [cq+64*15], m26
    punpckhqdq           m24, m8, m22 ; a05 a13 a21 a29
    punpcklqdq            m8, m22     ; a04 a12 a20 a28
    punpckhqdq           m22, m0, m4  ; a01 a09 a17 a25
    punpcklqdq            m0, m4      ; a00 a08 a16 a24
    punpckhqdq           m23, m7, m2  ; a03 a11 a19 a27
    punpcklqdq            m7, m2      ; a02 a10 a18 a26
    punpckhqdq           m25, m6, m1  ; a07 a15 a23 a31
    punpcklqdq            m6, m1      ; a06 a14 a22 a30
    ; reload the spilled even-half rows and finish their butterflies
    mova                  m2, [cq+64* 0]
    mova                 m11, [cq+64* 2]
    mova                 m12, [cq+64* 4]
    mova                 m29, [cq+64* 6]
    mova                 m27, [cq+64* 8]
    mova                 m26, [cq+64*10]
    mova                  m4, [cq+64*12]
    mova                 m28, [cq+64*14]
    psubsw                m1, m2, m21  ; 23
    paddsw                m2, m21     ;  8
    psubsw               m21, m11, m20 ; 22
    paddsw               m11, m20     ;  9
    psubsw               m20, m12, m19 ; 21
    paddsw               m12, m19     ; 10
    psubsw               m19, m29, m18 ; 20
    paddsw               m29, m18      ; 11
    psubsw               m18, m27, m17 ; 19
    paddsw               m27, m17      ; 12
    psubsw               m17, m26, m16 ; 18
    paddsw               m26, m16      ; 13
    paddsw               m16, m4, m15  ; 14
    psubsw                m4, m15      ; 17
    pmulhrsw             m15, m6, m10
    psubsw                m6, m28, m14 ; 16
    paddsw               m28, m14      ; 15
    pmulhrsw             m14, m7, m10
    ; word-interleave rows 16-23 and 8-15 for the b/c column transpose
    punpcklwd             m7, m6, m4
    punpckhwd             m6, m4
    punpckhwd             m4, m17, m18
    punpcklwd            m17, m18
    punpckhwd            m18, m19, m20
    punpcklwd            m19, m20
    punpckhwd            m20, m21, m1
    punpcklwd            m21, m1
    punpckhwd             m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
    punpcklwd             m2, m11      ; i0 j1 i1 j1 i2 j2 i3 j3
    punpckhwd            m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
    punpcklwd            m12, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
    punpckhwd            m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
    punpcklwd            m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
    punpckhwd            m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
    punpcklwd            m16, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
    pmulhrsw             m23, m10
    pmulhrsw             m25, m10
    punpckhdq            m28, m2, m12  ; i2 j2 k2 l2 i3 j3 k3 l3
    punpckldq             m2, m12      ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
    punpckldq            m27, m16      ; m0 n0 o0 p0 m1 n1 o1 p1
    REPX   {pmulhrsw x, m10}, m28, m2, m12, m27
    punpckhdq            m16, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
    punpckldq             m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
    punpckhdq            m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
    punpckldq            m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
    REPX   {pmulhrsw x, m10}, m16, m1, m11, m29
    punpckhdq            m26, m19, m21
    punpckldq            m19, m21
    punpckhdq            m21, m6, m4
    punpckldq             m6, m4
    REPX   {pmulhrsw x, m10}, m26, m19, m21, m6
    punpckhdq             m4, m18, m20
    punpckldq            m18, m20
    punpckhdq            m20, m7, m17
    punpckldq             m7, m17
    REPX   {pmulhrsw x, m10}, m4, m18, m20, m7
    punpcklqdq           m17, m28, m12 ; b02 b10 b18 b26
    punpckhqdq           m28, m12      ; b03 b11 b19 b27
    punpckhqdq           m12, m2, m27  ; b01 b09 b17 b25
    punpcklqdq            m2, m27      ; b00 b08 b16 b24
    punpckhqdq           m27, m1, m29  ; b05 b13 b21 b29
    punpcklqdq            m1, m29      ; b04 b12 b20 b28
    punpckhqdq           m29, m16, m11 ; b07 b15 b23 b31
    punpcklqdq           m16, m11      ; b06 b14 b22 b30
    ; spill the odd b-columns for the second-pass odd half
    mova          [cq+64* 1], m12
    mova          [cq+64* 3], m28
    mova          [cq+64* 5], m27
    mova          [cq+64* 7], m29
    punpckhqdq           m27, m20, m26 ; c03 c11 c19 c27
    punpcklqdq           m20, m26      ; c02 c10 c18 c26
    punpckhqdq           m26, m7, m19  ; c01 c09 c17 c25
    punpcklqdq            m7, m19      ; c00 c08 c16 c24
    punpckhqdq           m28, m6, m18  ; c05 c13 c21 c29
    punpcklqdq            m6, m18      ; c04 c12 c20 c28
    punpckhqdq           m29, m21, m4  ; c07 c15 c23 c31
    punpcklqdq           m21, m4       ; c06 c14 c22 c30
    pmulhrsw             m19, m9, m10
    ; gather the even columns (0,4,8,...,28) across the 128-bit lanes
    vshufi32x4            m4, m0, m2, q3232  ; a16 a24 b16 b24
    vinserti32x8          m0, ym2, 1         ; a00 a08 b00 b08
    vshufi32x4            m2, m7, m5, q3232  ; c16 c24 d16 d24
    vinserti32x8          m7, ym5, 1         ; c00 c08 d00 d08
    vshufi32x4            m5, m8, m1, q3232  ; a20 a28 b20 b28
    vinserti32x8          m1, m8, ym1, 1     ; a04 a12 b04 b12
    vshufi32x4            m8, m6, m3, q3232  ; c20 c28 d20 d28
    vinserti32x8          m6, ym3, 1         ; c04 c12 d04 d12
    vshufi32x4            m3, m1, m6, q3131  ; 12
    vshufi32x4            m1, m6, q2020      ;  4
    vshufi32x4            m6, m4, m2, q3131  ; 24
    vshufi32x4            m4, m2, q2020      ; 16
    vshufi32x4            m2, m0, m7, q3131  ;  8
    vshufi32x4            m0, m7, q2020      ;  0
    vshufi32x4            m7, m5, m8, q3131  ; 28
    vshufi32x4            m5, m8, q2020      ; 20
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    ; gather columns 2,6,10,...,30 for the second-pass odd half
    vshufi32x4           m18, m14, m17, q3232 ; a18 a26 b18 b26
    vinserti32x8         m14, ym17, 1         ; a02 a10 b02 b10
    vshufi32x4           m17, m20, m13, q3232 ; c18 c26 d18 d26
    vinserti32x8         m20, ym13, 1         ; c02 c10 d02 d10
    vshufi32x4           m13, m21, m19, q3232 ; c22 c30 d22 d30
    vinserti32x8         m21, ym19, 1         ; c06 c14 d06 d14
    vshufi32x4           m19, m15, m16, q3232 ; a22 a30 b22 b30
    vinserti32x8         m15, ym16, 1         ; a06 a14 b06 b14
    vshufi32x4           m16, m14, m20, q3131 ; 10
    vshufi32x4           m14, m20, q2020      ;  2
    vshufi32x4           m20, m18, m17, q3131 ; 26
    vshufi32x4           m18, m17, q2020      ; 18
    vshufi32x4           m17, m15, m21, q3131 ; 14
    vshufi32x4           m15, m21, q2020      ;  6
    vshufi32x4           m21, m19, m13, q3131 ; 30
    vshufi32x4           m19, m13, q2020      ; 22
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    ; spill the 2..30 results while the 1..31 odd half is computed
    mova          [cq+64* 0], m14
    mova          [cq+64* 2], m15
    mova          [cq+64* 4], m16
    mova          [cq+64* 6], m17
    mova          [cq+64* 8], m18
    mova          [cq+64*10], m19
    mova          [cq+64*12], m20
    mova          [cq+64*14], m21
    mova                 m15, [cq+64* 1]
    mova                 m16, [cq+64* 3]
    mova                 m17, [cq+64* 5]
    mova                 m19, [cq+64* 7]
    mova                 m20, [cq+64* 9]
    mova                 m21, [cq+64*11]
    mova                 m13, [cq+64*13]
    mova                 m18, [cq+64*15]
    vshufi32x4           m14, m22, m15, q3232 ; a17 a25 b17 b25
    vinserti32x8         m22, ym15, 1         ; a01 a09 b01 b09
    vshufi32x4           m15, m23, m16, q3232 ; a19 a27 b19 b27
    vinserti32x8         m23, ym16, 1         ; a03 a11 b03 b11
    vshufi32x4           m16, m24, m17, q3232 ; a21 a29 b21 b29
    vinserti32x8         m24, ym17, 1         ; a05 a13 b05 b13
    vshufi32x4           m17, m25, m19, q3232 ; a23 a31 b23 b31
    vinserti32x8         m25, ym19, 1         ; a07 a15 b07 b15
    vinserti32x8          m8, m26, ym20, 1    ; c01 c09 d01 d09
    vshufi32x4           m26, m20, q3232      ; c17 c25 d17 d25
    vinserti32x8          m9, m27, ym21, 1    ; c03 c11 d03 d11
    vshufi32x4           m27, m21, q3232      ; c19 c27 d19 d27
    vinserti32x8         m11, m28, ym13, 1    ; c05 c13 d05 d13
    vshufi32x4           m28, m13, q3232      ; c21 c29 d21 d29
    vinserti32x8         m12, m29, ym18, 1    ; c07 c15 d07 d15
    vshufi32x4           m29, m18, q3232      ; c23 c31 d23 d31
    vshufi32x4           m18, m14, m26, q3131 ; 25
    vshufi32x4           m14, m26, q2020      ; 17
    vshufi32x4           m19, m15, m27, q3131 ; 27
    vshufi32x4           m15, m27, q2020      ; 19
    vshufi32x4           m20, m16, m28, q3131 ; 29
    vshufi32x4           m16, m28, q2020      ; 21
    vshufi32x4           m21, m17, m29, q3131 ; 31
    vshufi32x4           m17, m29, q2020      ; 23
    vshufi32x4           m26, m22, m8, q3131  ;  9
    vshufi32x4           m22, m8, q2020       ;  1
    vshufi32x4           m27, m23, m9, q3131  ; 11
    vshufi32x4           m23, m9, q2020       ;  3
    vshufi32x4           m28, m24, m11, q3131 ; 13
    vshufi32x4           m24, m11, q2020      ;  5
    vshufi32x4           m29, m25, m12, q3131 ; 15
    vshufi32x4           m25, m12, q2020      ;  7
    call .main_oddhalf
    jmp .end
.fast: ; bottom/right halves are zero
    mova                 m14, [o(dup16_perm)]
    pmovzxwd              m9, [cq+64* 0]
    pmovzxwd              m6, [cq+64* 8]
    vpermb                m8, m14, [cq+64* 2]
    vpermb               ym0, ym14, [cq+64*14]
    vpermb               ym5, ym14, [cq+64*10]
    vpermb                m1, m14, [cq+64* 6]
    vpermb                m7, m14, [cq+64* 4]
    vpermb               ym3, ym14, [cq+64*12]
    pslld                 m9, 16
    pslld                 m6, 16
    call m(idct_16x16_internal_8bpc).main_fast
    vpermb               m21, m14, [cq+64* 1]
    vpermb              ym17, ym14, [cq+64*15]
    vpermb              ym20, ym14, [cq+64* 9]
    vpermb               m15, m14, [cq+64* 7]
    vpermb               m18, m14, [cq+64* 5]
    vpermb              ym16, ym14, [cq+64*11]
    vpermb              ym19, ym14, [cq+64*13]
    vpermb               m14, m14, [cq+64* 3]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd          m9, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
    ; distribute the transposed rows for the second pass
    vshufi32x4           m22, m14, m2, q2020 ;  1
    vshufi32x4           m24, m14, m2, q3131 ;  5
    vshufi32x4           m23, m17, m9, q2020 ;  3
    vshufi32x4           m25, m17, m9, q3131 ;  7
    vshufi32x4           m16, m5, m15, q2020 ; 10
    vshufi32x4           m17, m5, m15, q3131 ; 14
    vshufi32x4           m14, m1, m18, q2020 ;  2
    vshufi32x4           m15, m1, m18, q3131 ;  6
    vshufi32x4            m1, m0, m3, q3131  ;  4
    vshufi32x4            m0, m3, q2020      ;  0
    vshufi32x4            m3, m21, m4, q3131 ; 12
    vshufi32x4            m2, m21, m4, q2020 ;  8
    vshufi32x4           m26, m20, m6, q2020 ;  9
    vshufi32x4           m28, m20, m6, q3131 ; 13
    vshufi32x4           m27, m19, m7, q2020 ; 11
    vshufi32x4           m29, m19, m7, q3131 ; 15
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    mova          [cq+64* 0], m14
    mova          [cq+64* 2], m15
    mova          [cq+64* 4], m16
    mova          [cq+64* 6], m17
    mova          [cq+64* 8], m18
    mova          [cq+64*10], m19
    mova          [cq+64*12], m20
    mova          [cq+64*14], m21
    call .main_oddhalf_fast
.end:
    lea                   r4, [strideq*3]
    vpbroadcastd         m12, [o(pw_2048)]
    movshdup             m13, [o(permD)]
    lea                   r3, [dstq+r4*8]
    lea                   r5, [strideq+r4] ; stride*4
    add                   r3, r5           ; dst+stride*28
    ; write out: top rows walk down from dstq, mirrored rows walk up from r3
    IDCT_32x32_END        29,  0, strideq*0, r4
    IDCT_32x32_END        28,  1, strideq*1, strideq*2
    IDCT_32x32_END        27,  2, strideq*2, strideq*1
    IDCT_32x32_END        26,  3, r4       , strideq*0
    IDCT_32x32_END        25,  4, strideq*0, r4
    IDCT_32x32_END        24,  5, strideq*1, strideq*2
    IDCT_32x32_END        23,  6, strideq*2, strideq*1
    IDCT_32x32_END        22,  7, r4       , strideq*0
    IDCT_32x32_END        21,  8, strideq*0, r4
    IDCT_32x32_END        20,  9, strideq*1, strideq*2
    IDCT_32x32_END        19, 10, strideq*2, strideq*1
    IDCT_32x32_END        18, 11, r4       , strideq*0
    IDCT_32x32_END        17, 12, strideq*0, r4
    IDCT_32x32_END        16, 13, strideq*1, strideq*2
    IDCT_32x32_END        15, 14, strideq*2, strideq*1
    IDCT_32x32_END        14, 15, r4       , strideq*0
    RET
.dconly:
    movsx                r6d, word [cq]
    mov                 [cq], eobd
    or                   r3d, 32
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
ALIGN function_align
cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero
    ; only rows 1 and 3 are non-zero: scale them directly by the
    ; first-stage cosine constants, then feed stage 2 via vpdpwssd
    vpbroadcastd         m21, [o(pw_4091x8)]
    vpbroadcastd          m8, [o(pw_201x8)]
    vpbroadcastd         m24, [o(pw_m601x8)]
    vpbroadcastd         m12, [o(pw_4052x8)]
    pmulhrsw             m21, m22 ; t31a
    pmulhrsw             m22, m8  ; t16a
    pmulhrsw             m24, m23 ; t23a
    pmulhrsw             m23, m12 ; t24a

    punpcklwd             m9, m22, m21
    punpckhwd             m8, m22, m21
    mova                 m15, m10
    vpdpwssd             m15, m9, [o(pw_m4017_799)] {bcstd}
    mova                 m17, m10
    vpdpwssd             m17, m8, [o(pw_m4017_799)] {bcstd}
    REPX      {psrad x, 12}, m15, m17
    packssdw             m15, m17
    mova                 m17, m10
    vpdpwssd             m17, m8, [o(pw_799_4017)] {bcstd}
    mova                  m8, m10
    vpdpwssd              m8, m9, [o(pw_799_4017)] {bcstd}
    REPX      {psrad x, 12}, m17, m8
    packssdw              m8, m17

    punpcklwd             m9, m24, m23
    punpckhwd            m16, m24, m23
    mova                 m20, m10
    vpdpwssd             m20, m9, [o(pw_m3406_m2276)] {bcstd}
    mova                 m17, m10
    vpdpwssd             m17, m16, [o(pw_m3406_m2276)] {bcstd}
    REPX      {psrad x, 12}, m20, m17
    packssdw             m20, m17
    mova                 m17, m10
    vpdpwssd             m17, m16, [o(pw_m2276_3406)] {bcstd}
    mova                 m16, m10
    vpdpwssd             m16, m9, [o(pw_m2276_3406)] {bcstd}
    REPX      {psrad x, 12}, m17, m16
    packssdw             m16, m17

    ; with all other inputs zero, the butterfly partners equal the inputs
    mova                 m17, m21
    mova                 m27, m15
    mova                 m25, m20
    mova                 m29, m8
    mova                 m18, m22
    mova                 m14, m24
    mova                 m28, m16
    mova                 m26, m23
    jmp .main4
cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    ; rows 1,3,5,7 non-zero: first stage collapses to scalar multiplies
    vpbroadcastd         m21, [o(pw_4091x8)]
    vpbroadcastd          m8, [o(pw_201x8)]
    vpbroadcastd         m18, [o(pw_m1380x8)]
    vpbroadcastd          m9, [o(pw_3857x8)]
    vpbroadcastd         m19, [o(pw_3973x8)]
    vpbroadcastd         m11, [o(pw_995x8)]
    vpbroadcastd         m28, [o(pw_m601x8)]
    vpbroadcastd         m12, [o(pw_4052x8)]
    pmulhrsw             m21, m22 ; t31a
    pmulhrsw             m22, m8  ; t16a
    pmulhrsw             m18, m25 ; t19a
    pmulhrsw             m25, m9  ; t28a
    pmulhrsw             m19, m24 ; t27a
    pmulhrsw             m24, m11 ; t20a
    pmulhrsw             m28, m23 ; t23a
    pmulhrsw             m23, m12 ; t24a
    mova                 m15, m21
    mova                  m8, m22
    mova                 m14, m18
    mova                 m27, m25
    mova                 m29, m19
    mova                 m26, m24
    mova                 m16, m28
    mova                 m20, m23
    jmp .main3
ALIGN function_align
cglobal_label .main_oddhalf_fast ; bottom half is zero
    ; rows 1,3,...,15 non-zero: first stage as scalar multiplies,
    ; constants loaded interleaved with the multiplies to hide latency
    vpbroadcastd         m21, [o(pw_4091x8)]
    vpbroadcastd          m8, [o(pw_201x8)]
    vpbroadcastd         m14, [o(pw_m2751x8)]
    vpbroadcastd          m9, [o(pw_3035x8)]
    vpbroadcastd         m17, [o(pw_3703x8)]
    vpbroadcastd         m11, [o(pw_1751x8)]
    vpbroadcastd         m18, [o(pw_m1380x8)]
    vpbroadcastd         m12, [o(pw_3857x8)]
    pmulhrsw             m21, m22 ; t31a
    vpbroadcastd         m19, [o(pw_3973x8)]
    pmulhrsw             m22, m8  ; t16a
    vpbroadcastd          m8, [o(pw_995x8)]
    pmulhrsw             m14, m29 ; t30a
    vpbroadcastd         m16, [o(pw_m2106x8)]
    pmulhrsw             m29, m9  ; t17a
    vpbroadcastd          m9, [o(pw_3513x8)]
    pmulhrsw             m17, m26 ; t29a
    vpbroadcastd         m15, [o(pw_3290x8)]
    pmulhrsw             m26, m11 ; t18a
    vpbroadcastd         m11, [o(pw_2440x8)]
    pmulhrsw             m18, m25 ; t19a
    vpbroadcastd         m20, [o(pw_m601x8)]
    pmulhrsw             m25, m12 ; t28a
    vpbroadcastd         m12, [o(pw_4052x8)]
    pmulhrsw             m19, m24 ; t27a
    pmulhrsw             m24, m8  ; t20a
    pmulhrsw             m16, m27 ; t21a
    pmulhrsw             m27, m9  ; t26a
    pmulhrsw             m15, m28 ; t25a
    pmulhrsw             m28, m11 ; t22a
    pmulhrsw             m20, m23 ; t23a
    pmulhrsw             m23, m12 ; t24a
    jmp .main2
ALIGN function_align
; Full odd half (rows 1,3,...,31) of the 32-point idct; inputs in
; m14-m29/m22-m23, m10 holds the rounding constant used by ITX_MULSUB_2W.
cglobal_label .main_oddhalf
    ITX_MULSUB_2W         22, 21, 8, 9, 10,  201, 4091 ; t16a, t31a
    ITX_MULSUB_2W         14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a
    ITX_MULSUB_2W         26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a
    ITX_MULSUB_2W         18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a
    ITX_MULSUB_2W         24, 19, 8, 9, 10,  995, 3973 ; t20a, t27a
    ITX_MULSUB_2W         16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a
    ITX_MULSUB_2W         28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a
    ITX_MULSUB_2W         20, 23, 8, 9, 10, 4052,  601 ; t23a, t24a
.main2:
    ; stage 2 butterflies
    psubsw                m8, m22, m14 ; t17
    paddsw               m22, m14      ; t16
    paddsw               m14, m18, m26 ; t19
    psubsw               m18, m26      ; t18
    psubsw               m26, m24, m16 ; t21
    paddsw               m24, m16      ; t20
    psubsw               m16, m20, m28 ; t22
    paddsw               m28, m20      ; t23
    psubsw               m20, m23, m15 ; t25
    paddsw               m23, m15      ; t24
    psubsw               m15, m21, m29 ; t30
    paddsw               m21, m29      ; t31
    psubsw               m29, m19, m27 ; t26
    paddsw               m19, m27      ; t27
    paddsw               m27, m25, m17 ; t28
    psubsw               m25, m17      ; t29
.main3:
    ; stage 3 rotations and stage 4 butterflies
    ITX_MULSUB_2W         15,  8, 9, 17, 10,   799, 4017 ; t17a, t30a
    ITX_MULSUB_2W         25, 18, 9, 17, 10, m4017,  799 ; t18a, t29a
    ITX_MULSUB_2W         29, 26, 9, 17, 10,  3406, 2276 ; t21a, t26a
    ITX_MULSUB_2W         20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a
    psubsw               m17, m21, m27 ; t28a
    paddsw               m21, m27      ; t31a
    psubsw               m27, m15, m25 ; t18
    paddsw               m15, m25      ; t17
    psubsw               m25, m20, m29 ; t21
    paddsw               m20, m29      ; t22
    psubsw               m29, m8, m18  ; t29
    paddsw                m8, m18      ; t30
    psubsw               m18, m22, m14 ; t19a
    paddsw               m22, m14      ; t16a
    psubsw               m14, m28, m24 ; t20a
    paddsw               m24, m28      ; t23a
    paddsw               m28, m16, m26 ; t25
    psubsw               m16, m26      ; t26
    psubsw               m26, m23, m19 ; t27a
    paddsw               m23, m19      ; t24a
.main4:
    ; final rotation stages shared by all odd-half entry points
    vpbroadcastd         m12, [o(pw_m3784_1567)]
    vpbroadcastd         m11, [o(pw_1567_3784)]
    ITX_MULSUB_2W         29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
    ITX_MULSUB_2W         17, 18, 9, 19, 10, 11, 12 ; t19,  t28
    vpbroadcastd         m11, [o(pw_m1567_m3784)]
    ITX_MULSUB_2W         16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
    ITX_MULSUB_2W         26, 14, 9, 19, 10, 12, 11 ; t20,  t27
    vpbroadcastd         m12, [o(pw_m2896_2896)]
    vpbroadcastd         m11, [o(pw_2896_2896)]
    psubsw               m19, m27, m25 ; t26
    paddsw               m27, m25      ; t29
    psubsw               m25, m17, m26 ; t20a
    paddsw               m17, m26      ; t19a
    paddsw               m26, m18, m14 ; t28a
    psubsw               m18, m14      ; t27a
    paddsw               m14, m22, m24 ; t16
    psubsw               m22, m24      ; t23
    psubsw               m24, m29, m16 ; t21
    paddsw               m16, m29      ; t18
    paddsw               m29, m21, m23 ; t31
    psubsw               m21, m23      ; t24
    psubsw               m23, m15, m20 ; t22a
    paddsw               m15, m20      ; t17a
    psubsw               m20, m8, m28  ; t25a
    paddsw               m28, m8       ; t30a
    ITX_MULSUB_2W         18, 25, 8, 9, 10, 11, 12 ; t20,  t27
    ITX_MULSUB_2W         19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
    ITX_MULSUB_2W         21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
    ITX_MULSUB_2W         20, 23, 8, 9, 10, 11, 12 ; t22,  t25
    ret

; Load one register pair for the 32x32 identity transform, merging the
; left/right halves of each row with a qword permute (m21).
%macro IDTX_32x32 2 ; dst[1-2]
    vmovdqa32           ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
    vmovdqa32           ym17, [cq+64*(%1+16)] ; reduces code size due to
    vmovdqa32           ym%2, [cq+64*(%2+ 0)] ; compressed displacements
    vmovdqa32           ym18, [cq+64*(%2+16)]
    vpermt2q             m%1, m21, m17
    vpermt2q             m%2, m21, m18
%endmacro

; 32x32 identity/identity inverse transform + add, 8-bit. Processes the
; two 32x16 halves in a loop (bit 5 of cq toggles between them), then
; zeroes the whole coefficient buffer.
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
    movu                 m21, [permB+7]
    vpbroadcastd         m16, [pw_8192]
    pxor                 m20, m20
.loop:
    IDTX_32x32            0,  1
    IDTX_32x32            2,  3
    IDTX_32x32            4,  5
    IDTX_32x32            6,  7
    IDTX_32x32            8,  9
    IDTX_32x32           10, 11
    IDTX_32x32           12, 13
    IDTX_32x32           14, 15
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    IDTX_32x16_STORE      0,  8, 1
    IDTX_32x16_STORE      1,  9, 1
    IDTX_32x16_STORE      2, 10, 1
    IDTX_32x16_STORE      3, 11, 1
    IDTX_32x16_STORE      4, 12, 1
    IDTX_32x16_STORE      5, 13, 1
    IDTX_32x16_STORE      6, 14, 1
    IDTX_32x16_STORE      7, 15, 1
    lea                 dstq, [dstq+strideq*8]
    btc                   cq, 5           ; flip to the second 32x16 half
    jnc .loop
    mov                  r0d, 8
.zero_loop:
    mova          [cq+64*0], m20
    mova          [cq+64*1], m20
    mova          [cq+64*2], m20
    mova          [cq+64*3], m20
    add                   cq, 64*4
    dec                  r0d
    jg .zero_loop
    RET

; 16x64 dct/dct inverse transform + add, 8-bit.
cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea                   r5, [o_base]
    test                eobd, eobd
    jz .dconly
    WIN64_SPILL_XMM       30
    cmp                 eobd, 151
    jb .fast
    mova                  m5, [cq+64*10]
    mova                  m3, [cq+64* 6]
    mova                  m1, [cq+64* 2]
    mova                  m7, [cq+64*14]
    mova                  m2, [cq+64* 4]
    mova                  m6, [cq+64*12]
    mova                  m0, [cq+64* 0]
    mova                  m4, [cq+64* 8]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    mova                 m14, [cq+64* 1]
    mova                 m21, [cq+64*15]
    mova                 m18, [cq+64* 9]
    mova                 m17, [cq+64* 7]
    mova                 m16, [cq+64* 5]
    mova                 m19, [cq+64*11]
    mova                 m20, [cq+64*13]
    mova                 m15, [cq+64* 3]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    vpbroadcastd          m9, [o(pw_8192)]
; Transpose four registers as an 8x4 word block, rounding with m9.
%macro TRANSPOSE_8x4_ROUND 4
    punpckhwd             m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
    punpcklwd            m%3, m%4      ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd            m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd            m%1, m%2      ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhdq            m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m%1, m%3      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckldq            m%3, m%4, m8  ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq            m%4, m8       ; a6 b6 c6 d6 a7 b7 c7 d7
    REPX    {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
%endmacro
    TRANSPOSE_8x4_ROUND    0,  1,  2,  3
    TRANSPOSE_8x4_ROUND    4,  5,  6,  7
    TRANSPOSE_8x4_ROUND   14, 15, 16, 17
    TRANSPOSE_8x4_ROUND   18, 19, 20, 21
    ; gather the transposed 128-bit lanes into row pairs 0..31
    vinserti32x8         m26, m0, ym4, 1     ; a0  a4  b0  b4
    vshufi32x4            m0, m4, q3232      ; a8  a12 b8  b12
    vinserti32x8         m27, m1, ym5, 1     ; a1  a5  b1  b5
    vshufi32x4            m1, m5, q3232      ; a9  a13 b9  b13
    vinserti32x8         m28, m2, ym6, 1     ; a2  a6  b2  b6
    vshufi32x4            m2, m6, q3232      ; a10 a14 b10 b14
    vinserti32x8         m29, m3, ym7, 1     ; a3  a7  b3  b7
    vshufi32x4            m8, m3, m7, q3232  ; a11 a15 b11 b15
    vinserti32x8          m4, m14, ym18, 1   ; c0  c4  d0  d4
    vshufi32x4           m14, m18, q3232     ; c8  c12 d8  d12
    vinserti32x8          m5, m15, ym19, 1   ; c1  c5  d1  d5
    vshufi32x4           m15, m19, q3232     ; c9  c13 d9  d13
    vinserti32x8          m6, m16, ym20, 1   ; c2  c6  d2  d6
    vshufi32x4           m16, m20, q3232     ; c10 c14 d10 d14
    vinserti32x8          m7, m17, ym21, 1   ; c3  c7  d3  d7
    vshufi32x4           m17, m21, q3232     ; c11 c15 d11 d15
    vshufi32x4           m22, m26, m4, q2020 ;  0  1
    vshufi32x4           m26, m4, q3131      ;  8  9
    vshufi32x4           m23, m27, m5, q2020 ;  2  3
    vshufi32x4           m27, m5, q3131      ; 10 11
    vshufi32x4           m24, m28, m6, q2020 ;  4  5
    vshufi32x4           m28, m6, q3131      ; 12 13
    vshufi32x4           m25, m29, m7, q2020 ;  6  7
    vshufi32x4           m29, m7, q3131      ; 14 15
    vshufi32x4            m4, m0, m14, q2020 ; 16 17
    vshufi32x4            m3, m0, m14, q3131 ; 24 25
    vshufi32x4           m20, m1, m15, q2020 ; 18 19
    vshufi32x4           m19, m1, m15, q3131 ; 26 27
    vshufi32x4            m5, m2, m16, q2020 ; 20 21
    vshufi32x4            m0, m2, m16, q3131 ; 28 29
    vshufi32x4           m16, m8, m17, q2020 ; 22 23
    vshufi32x4           m17, m8, m17, q3131 ; 30 31
    pxor                  m6, m6
    ; spill rows 16..29 (even) before the 16-point even-half idct
    mova          [cq+64* 0], m4
    mova          [cq+64* 2], m5
    mova          [cq+64* 4], m3
    mova          [cq+64* 6], m0
    ; duplicate the low words of each even row for the fast idct input
    punpcklwd             m8, m24, m24 ;  4
    punpcklwd             m0, m0       ; 28
    punpcklwd             m5, m5       ; 20
    punpcklwd             m1, m28, m28 ; 12
    punpcklwd             m7, m26, m26 ;  8
    punpcklwd             m3, m3       ; 24
    punpcklwd             m9, m6, m22  ; __  0
    punpcklwd             m6, m4       ; __ 16
    call m(idct_16x16_internal_8bpc).main_fast3
    mova          [cq+64* 1], m20
    mova          [cq+64* 3], m16
    mova          [cq+64* 5], m19
    mova          [cq+64* 7], m17
    punpcklwd            m21, m23, m23 ;  2
    punpcklwd            m17, m17      ; 30
    punpcklwd            m20, m20      ; 18
    punpcklwd            m15, m29, m29 ; 14
    punpcklwd            m18, m27, m27 ; 10
    punpcklwd            m16, m16      ; 22
    punpcklwd            m19, m19      ; 26
    punpcklwd            m14, m25, m25 ;  6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; spill the 32-point odd-half outputs for the final stores
    mova          [cq+64* 8], m14
    mova          [cq+64* 9], m15
    mova          [cq+64*10], m16
    mova          [cq+64*11], m17
    mova          [cq+64*12], m18
    mova          [cq+64*13], m19
    mova          [cq+64*14], m20
    mova          [cq+64*15], m21
    mova                 m21, [cq+64* 7]
    mova                 m14, [cq+64* 0]
    mova                 m17, [cq+64* 3]
    mova                 m18, [cq+64* 4]
    mova                 m19, [cq+64* 5]
    mova                 m16, [cq+64* 2]
    mova                 m15, [cq+64* 1]
    mova                 m20, [cq+64* 6]
    ; high words of each row pair feed the 64-point odd half
    REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
                           m24, m19, m16, m27, m28, m15, m20, m23
    call .main_oddhalf
    jmp .end
.fast: ; right half is zero
    mova                 ym8, [cq+64*15]
    vinserti32x8          m8, [cq+64* 1], 1
    mova                  m2, [o(int16_perm)]
    mova                 ym9, [cq+64* 8]
    vinserti32x8          m9, [cq+64* 0], 1
    mova                 ym0, [cq+64* 7]
    vinserti32x8          m0, [cq+64* 9], 1
    mova                 ym7, [cq+64*14]
    vinserti32x8          m7, [cq+64* 2], 1
    mova                 ym1, [cq+64* 3]
    vinserti32x8          m1, [cq+64*13], 1
    mova                 ym3, [cq+64* 6]
    vinserti32x8          m3, [cq+64*10], 1
    mova                 ym5, [cq+64*11]
    vinserti32x8          m5, [cq+64* 5], 1
    mova                 ym6, [cq+64*12]
    vinserti32x8          m6, [cq+64* 4], 1
    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
    call m(idct_16x16_internal_8bpc).main2
    vbroadcasti32x4       m8, [o(int_shuf3)]
    vbroadcasti32x4       m9, [o(int_shuf4)]
    vpbroadcastd         m11, [o(pw_8192)]
    pshufb                m0, m8
    pshufb                m1, m9
    pshufb                m2, m8
    pshufb                m3, m9
    REPX   {pmulhrsw x, m11}, m0, m1, m2, m3
    pshufb                m4, m8
    pshufb                m5, m9
    pshufb                m6, m8
    pshufb                m7, m9
    REPX   {pmulhrsw x, m11}, m4, m5, m6, m7
    punpckhdq            m28, m0, m1
    punpckldq             m0, m1
    punpckhdq            m27, m2, m3
    punpckldq             m2, m3
    punpckhdq            m22, m4, m5
    punpckldq             m4, m5
    punpckhdq            m23, m6, m7
    punpckldq             m6, m7
    vinserti32x8         m14, m0, ym2, 1
    vshufi32x4           m15, m0, m2, q3232
    vinserti32x8          m2, m4, ym6, 1
    vshufi32x4            m4, m6, q3232
    vshufi32x4           m21, m14, m2, q2020 ;  0  2
    vshufi32x4           m14, m2, q3131      ;  4  6
    vshufi32x4           m18, m15, m4, q2020 ;  8 10
    vshufi32x4           m15, m4, q3131      ; 12 14
    pxor                  m9, m9
    ; low words = even rows for the fast even-half idct
    punpcklwd             m8, m14, m14 ;  4
    punpcklwd             m1, m15, m15 ; 12
    punpcklwd             m7, m18, m18 ;  8
    punpcklwd             m9, m21      ; __  0
    call m(idct_16x16_internal_8bpc).main_fast4
    ; high words = rows 2,6,10,14 for the 32-point odd half
    punpckhwd            m21, m21 ;  2
    punpckhwd            m15, m15 ; 14
    punpckhwd            m18, m18 ; 10
    punpckhwd            m14, m14 ;  6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    vinserti32x8         m24, m28, ym27, 1
    vshufi32x4           m28, m27, q3232
    vinserti32x8         m27, m22, ym23, 1
    vshufi32x4           m22, m23, q3232
    vshufi32x4           m23, m24, m27, q2020 ;  1  3
    vshufi32x4           m24, m27, q3131      ;  5  7
    vshufi32x4           m27, m28, m22, q2020 ;  9 11
    vshufi32x4           m28, m22, q3131      ; 13 15
    punpcklwd            m22, m23, m23 ;  1
    punpckhwd            m29, m28, m28 ; 15
    punpcklwd            m26, m27, m27 ;  9
    punpckhwd            m25, m24, m24 ;  7
    mova          [cq+64* 8], m14
    mova          [cq+64* 9], m15
    mova          [cq+64*10], m16
    mova          [cq+64*11], m17
    punpcklwd            m24, m24 ;  5
    punpckhwd            m27, m27 ; 11
    punpcklwd            m28, m28 ; 13
    punpckhwd            m23, m23 ;  3
    mova          [cq+64*12], m18
    mova          [cq+64*13], m19
    mova          [cq+64*14], m20
    mova          [cq+64*15], m21
    call .main_oddhalf_fast
.end:
    imul                  r6, strideq, 60
    mova                 m10, [o(end_16x32p)]
    vpbroadcastd         m11, [o(pw_2048)]
    lea                   r3, [strideq*3]
    pxor                 m12, m12
    add                   r6, dstq ; dst+stride*60
    psrldq               m13, m10, 1
    lea                   r4, [strideq+r3] ; stride*4
; Final butterfly + store for one 4-row group of the 16x64 idct: combines
; the 32-point result (register or cq spill) with the 64-point odd half,
; rounds, adds dst pixels; rows alternate direction (dstq down, r6 up).
%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
%if %1 & 1
    %define %%s0 r3
    %define %%s1 strideq*2
    %define %%s2 strideq*1
    %define %%s3 strideq*0
%else
    %define %%s0 strideq*0
    %define %%s1 strideq*1
    %define %%s2 strideq*2
    %define %%s3 r3
%if %1
    add                 dstq, r4
    sub                   r6, r4
%endif
%endif
%if %1 < 8
    pmulhrsw              m8, m11, m%1
    pmulhrsw              m9, m11, m%2
%else
    mova                  m9, [cq+64*%1]
    paddsw                m8, m9, m%2 ; out  0+n,  1+n
    psubsw                m9, m%2     ; out 63-n, 62-n
    pmulhrsw              m8, m11
    pmulhrsw              m9, m11
%endif
    mova                xm29, [dstq+%%s0]
    vinserti128         ym29, [dstq+%%s1], 1
    mova                xm%3, [r6  +%%s3]
    vinserti128         ym%3, [r6  +%%s2], 1
    vpermb               m29, m10, m29
    vpermb               m%3, m10, m%3
    mova          [cq+64*%1], m12     ; zero the consumed coefficients
    paddw                m29, m8
    paddw                m%3, m9
    packuswb             m29, m%3
    vpermd               m29, m13, m29
    mova         [dstq+%%s0], xm29
    vextracti128 [dstq+%%s1], ym29, 1
    vextracti32x4 [r6 +%%s2], m29, 2
    vextracti32x4 [r6 +%%s3], m29, 3
%endmacro
    IDCT_16x64_END         0, 29,  0
    IDCT_16x64_END         1, 28, 28
    IDCT_16x64_END         2, 27, 28
    IDCT_16x64_END         3, 26, 28
    IDCT_16x64_END         4, 25, 28
    IDCT_16x64_END         5, 24, 28
    IDCT_16x64_END         6, 23, 28
    IDCT_16x64_END         7, 22, 28
    IDCT_16x64_END         8, 21, 28
    IDCT_16x64_END         9, 20, 28
    IDCT_16x64_END        10, 19, 28
    IDCT_16x64_END        11, 18, 28
    IDCT_16x64_END        12, 17, 28
    IDCT_16x64_END        13, 16, 28
    IDCT_16x64_END        14, 15, 28
    IDCT_16x64_END        15, 14, 28
    RET
.dconly:
    movsx                r6d, word [cq]
    mov                 [cq], eobd
    or                   r3d, 64
    imul                 r6d, 181
    add                  r6d, 128+512
    sar                  r6d, 8+2
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
ALIGN function_align
cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
    ; rows 1,3,...,15 only: first stage of the 64-point odd half reduces
    ; to scalar multiplies with packed (lo,hi) constants
    vpbroadcastd          m8, [o(pw_101_4095x8)]
    vpbroadcastd         m21, [o(pw_m1474_3822x8)]
    vpbroadcastd         m14, [o(pw_897_3996x8)]
    vpbroadcastd         m17, [o(pw_m700_4036x8)]
    vpbroadcastd         m18, [o(pw_501_4065x8)]
    vpbroadcastd         m19, [o(pw_m1092_3948x8)]
    vpbroadcastd         m16, [o(pw_1285_3889x8)]
    vpbroadcastd         m15, [o(pw_m301_4085x8)]
    pmulhrsw              m8, m22 ; t32a t63a
    pmulhrsw             m21, m29 ; t35a t60a
    pmulhrsw             m14, m26 ; t36a t59a
    pmulhrsw             m17, m25 ; t39a t56
    pmulhrsw             m18, m24 ; t40a t55a
    pmulhrsw             m19, m27 ; t43a t52a
    pmulhrsw             m16, m28 ; t44a t51a
    pmulhrsw             m15, m23 ; t47a t48a
    ; with the other inputs zero, the butterfly partners equal the inputs
    mova                 m22, m8
    mova                 m29, m21
    mova                 m26, m14
    mova                 m25, m17
    mova                 m24, m18
    mova                 m27, m19
    mova                 m28, m16
    mova                 m20, m15
    jmp .main_oddhalf2
ALIGN function_align
; Full 64-point odd half (rows 1,3,...,31 packed as lo/hi word pairs).
cglobal_label .main_oddhalf
    vpbroadcastd          m8, [o(pw_101_4095x8)]
    vpbroadcastd          m9, [o(pw_m2824_2967x8)]
    vpbroadcastd         m11, [o(pw_1660_3745x8)]
    vpbroadcastd         m12, [o(pw_m1474_3822x8)]
    pmulhrsw             m22, m8  ; t32a t63a
    vpbroadcastd          m8, [o(pw_897_3996x8)]
    pmulhrsw             m21, m9  ; t33a t62a
    vpbroadcastd          m9, [o(pw_m2191_3461x8)]
    pmulhrsw             m14, m11 ; t34a t61a
    vpbroadcastd         m11, [o(pw_2359_3349x8)]
    pmulhrsw             m29, m12 ; t35a t60a
    vpbroadcastd         m12, [o(pw_m700_4036x8)]
    pmulhrsw             m26, m8  ; t36a t59a
    vpbroadcastd          m8, [o(pw_501_4065x8)]
    pmulhrsw             m17, m9  ; t37a t58a
    vpbroadcastd          m9, [o(pw_m2520_3229x8)]
    pmulhrsw             m18, m11 ; t38a t57a
    vpbroadcastd         m11, [o(pw_2019_3564x8)]
    pmulhrsw             m25, m12 ; t39a t56a
    vpbroadcastd         m12, [o(pw_m1092_3948x8)]
    pmulhrsw             m24, m8  ; t40a t55a
    vpbroadcastd          m8, [o(pw_1285_3889x8)]
    pmulhrsw             m19, m9  ; t41a t54a
    vpbroadcastd          m9, [o(pw_m1842_3659x8)]
    pmulhrsw             m16, m11 ; t42a t53a
    vpbroadcastd         m11, [o(pw_2675_3102x8)]
    pmulhrsw             m27, m12 ; t43a t52a
    vpbroadcastd         m12, [o(pw_m301_4085x8)]
    pmulhrsw             m28, m8  ; t44a t51a
    pmulhrsw             m15, m9  ; t45a t50a
    pmulhrsw             m20, m11 ; t46a t49a
    pmulhrsw             m23, m12 ; t47a t48a
    ; stage 2 butterflies (each register holds a lo/hi pair)
    psubsw                m8, m22, m21 ; t33 t62
    paddsw               m22, m21      ; t32 t63
    psubsw               m21, m29, m14 ; t34 t61
    paddsw               m29, m14      ; t35 t60
    psubsw               m14, m26, m17 ; t37 t58
    paddsw               m26, m17      ; t36 t59
    psubsw               m17, m25, m18 ; t38 t57
    paddsw               m25, m18      ; t39 t56
    psubsw               m18, m24, m19 ; t41 t54
    paddsw               m24, m19      ; t40 t55
    psubsw              m19, m27, m16 ; t42 t53
    paddsw              m27, m16      ; t43 t52
    psubsw              m16, m28, m15 ; t45 t50
    paddsw              m28, m15      ; t44 t51
    psubsw              m15, m23, m20 ; t46 t49
    paddsw              m20, m23      ; t47 t48
.main_oddhalf2:
    ; stage-3 rotations on the difference terms (shared with the _fast
    ; entry); ITX_MUL2X_PACK multiplies packed pairs by (cos,sin) pairs
    ITX_MUL2X_PACK        8, 9, 23, 10,   401, 4076, 5 ; t33a t62a
    ITX_MUL2X_PACK       21, 9, 23, 10, m4076,  401, 5 ; t34a t61a
    ITX_MUL2X_PACK       14, 9, 23, 10,  3166, 2598, 5 ; t37a t58a
    ITX_MUL2X_PACK       17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
    ITX_MUL2X_PACK       18, 9, 23, 10,  1931, 3612, 5 ; t41a t54a
    ITX_MUL2X_PACK       19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
    ITX_MUL2X_PACK       16, 9, 23, 10,  3920, 1189, 5 ; t45a t50a
    ITX_MUL2X_PACK       15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
    vpbroadcastd        m11, [o(pw_m4017_799)]
    ; stage-4 butterflies
    psubsw              m23, m25, m26 ; t36a t59a
    paddsw              m25, m26      ; t39a t56a
    psubsw              m26, m24, m27 ; t43a t52a
    paddsw              m27, m24      ; t40a t55a
    psubsw              m24, m20, m28 ; t44a t51a
    paddsw              m20, m28      ; t47a t48a
    psubsw              m28, m8, m21  ; t34 t61
    paddsw               m8, m21      ; t33 t62
    psubsw              m21, m17, m14 ; t37 t58
    paddsw              m17, m14      ; t38 t57
    psubsw              m14, m18, m19 ; t42 t53
    paddsw              m18, m19      ; t41 t54
    psubsw              m19, m15, m16 ; t45 t50
    paddsw              m15, m16      ; t46 t49
    psubsw              m16, m22, m29 ; t35a t60a
    paddsw              m22, m29      ; t32a t63a
    ; stage-5 rotations by 799/4017 and 3406/2276
    ITX_MUL2X_PACK       16, 9, 29, 10, 799_4017, 11, 20    ; t35  t60
    ITX_MUL2X_PACK       28, 9, 29, 10, 799_4017, 11, 20    ; t34a t61a
    ITX_MUL2X_PACK       23, 9, 29, 10, 11, m799_m4017, 36  ; t36  t59
    ITX_MUL2X_PACK       21, 9, 29, 10, 11, m799_m4017, 36  ; t37a t58a
    vpbroadcastd        m11, [o(pw_m2276_3406)]
    ITX_MUL2X_PACK       26, 9, 29, 10, 3406_2276, 11, 20   ; t43  t52
    ITX_MUL2X_PACK       14, 9, 29, 10, 3406_2276, 11, 20   ; t42a t53a
    ITX_MUL2X_PACK       24, 9, 29, 10, 11, m3406_m2276, 36 ; t44  t51
    ITX_MUL2X_PACK       19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    ; stage-6 butterflies
    psubsw              m29, m22, m25 ; t39 t56
    paddsw              m22, m25      ; t32 t63
    psubsw              m25, m20, m27 ; t40 t55
    paddsw              m20, m27      ; t47 t48
    psubsw              m27, m8, m17  ; t38a t57a
    paddsw               m8, m17      ; t33a t62a
    psubsw              m17, m15, m18 ; t41a t54a
    paddsw              m15, m18      ; t46a t49a
    paddsw              m18, m16, m23 ; t35a t60a
    psubsw              m16, m23      ; t36a t59a
    psubsw              m23, m24, m26 ; t43a t52a
    paddsw              m24, m26      ; t44a t51a
    paddsw              m26, m28, m21 ; t34 t61
    psubsw              m28, m21      ; t37 t58
    psubsw              m21, m19, m14 ; t42 t53
    paddsw              m19, m14      ; t45 t50
    ; stage-7 rotations by 1567/3784
    ITX_MUL2X_PACK       29, 9, 14, 10, 11, 12, 4 ; t39a t56a
    ITX_MUL2X_PACK       27, 9, 14, 10, 11, 12, 4 ; t38  t57
    ITX_MUL2X_PACK       16, 9, 14, 10, 11, 12, 4 ; t36  t59
    ITX_MUL2X_PACK       28, 9, 14, 10, 11, 12, 4 ; t37a t58a
    vpbroadcastd        m11, [o(pw_m1567_m3784)]
    ITX_MUL2X_PACK       25, 9, 14, 10, 12, 11, 4 ; t40a t55a
    ITX_MUL2X_PACK       17, 9, 14, 10, 12, 11, 4 ; t41  t54
    ITX_MUL2X_PACK       23, 9, 14, 10, 12, 11, 4 ; t43  t52
    ITX_MUL2X_PACK       21, 9, 14, 10, 12, 11, 4 ; t42a t53a
    vbroadcasti32x4     m13, [o(deint_shuf)]
    vpbroadcastd        m11, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    ; stage-8 butterflies; sums are deinterleaved with pshufb for packing
    paddsw              m14, m22, m20 ; t32a t63a
    psubsw              m22, m20      ; t47a t48a
    psubsw              m20, m8, m15  ; t46 t49
    paddsw               m8, m15      ; t33 t62
    paddsw              m15, m18, m24 ; t35 t60
    psubsw              m18, m24      ; t44 t51
    psubsw              m24, m26, m19 ; t45a t50a
    paddsw              m26, m19      ; t34a t61a
    REPX {pshufb x, m13}, m14, m8, m15, m26
    psubsw              m19, m29, m25 ; t40 t55
    paddsw              m25, m29      ; t39 t56
    psubsw              m29, m27, m17 ; t41a t54a
    paddsw              m27, m17      ; t38a t57a
    psubsw              m17, m16, m23 ; t43a t52a
    paddsw              m16, m23      ; t36a t59a
    psubsw               m9, m28, m21 ; t42 t53
    paddsw              m28, m21      ; t37 t58
    REPX {pshufb x, m13}, m25, m27, m16, m28
    ; final sqrt(2)/2 rotations of the middle terms
    ITX_MUL2X_PACK       22, 13, 21, 10, 11, 12, 8 ; t47  t48
    ITX_MUL2X_PACK       20, 23, 22, 10, 11, 12, 8 ; t46a t49a
    packssdw            m21, m22      ; t47 t46a
    packssdw            m13, m23      ; t48 t49a
    ITX_MUL2X_PACK       18, 22, 20, 10, 11, 12, 8 ; t44a t51a
ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50 5402 packssdw m20, m18 ; t44a t45 5403 packssdw m22, m23 ; t51a t50 5404 ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a 5405 ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54 5406 packssdw m18, m19 ; t40a t41 5407 packssdw m24, m23 ; t55a t54 5408 ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52 5409 ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a 5410 packssdw m19, m17 ; t43 t42a 5411 packssdw m23, m29 ; t52 t53a 5412 punpcklqdq m17, m25, m27 ; t39 t38a 5413 punpckhqdq m25, m27 ; t56 t57a 5414 punpckhqdq m27, m15, m26 ; t60 t61a 5415 punpcklqdq m15, m26 ; t35 t34a 5416 punpckhqdq m26, m16, m28 ; t59a t58 5417 punpcklqdq m16, m28 ; t36a t37 5418 punpckhqdq m28, m14, m8 ; t63a t62 5419 punpcklqdq m14, m8 ; t32a t33 5420 psubsw m29, m0, m28 ; out63 out62 5421 paddsw m0, m28 ; out0 out1 5422 psubsw m28, m1, m27 ; out60 out61 5423 paddsw m1, m27 ; out3 out2 5424 psubsw m27, m2, m26 ; out59 out58 5425 paddsw m2, m26 ; out4 out5 5426 psubsw m26, m3, m25 ; out56 out57 5427 paddsw m3, m25 ; out7 out6 5428 psubsw m25, m4, m24 ; out55 out54 5429 paddsw m4, m24 ; out8 out9 5430 psubsw m24, m5, m23 ; out52 out53 5431 paddsw m5, m23 ; out11 out10 5432 psubsw m23, m6, m22 ; out51 out50 5433 paddsw m6, m22 ; out12 out13 5434 psubsw m22, m7, m13 ; out48 out49 5435 paddsw m7, m13 ; out15 out14 5436 ret 5437 5438cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob 5439%undef cmp 5440 lea r5, [o_base] 5441 test eobd, eobd 5442 jnz .normal 5443 movsx r6d, word [cq] 5444 mov [cq], eobd 5445 or r3d, 16 5446.dconly: 5447 imul r6d, 181 5448 add r6d, 128+512 5449 sar r6d, 8+2 5450.dconly2: 5451 imul r6d, 181 5452 add r6d, 128+2048 5453 sar r6d, 8+4 5454 pxor m2, m2 5455 vpbroadcastw m3, r6d 5456.dconly_loop: 5457 mova m1, [dstq] 5458 punpcklbw m0, m1, m2 5459 punpckhbw m1, m2 5460 paddw m0, m3 5461 paddw m1, m3 5462 packuswb m0, m1 5463 mova [dstq], m0 5464 add dstq, strideq 5465 dec r3d 5466 jg .dconly_loop 
5467 RET 5468.normal: 5469 WIN64_SPILL_XMM 31 5470 mova m19, [o(dup16_perm)] 5471 mova m24, [cq+64* 2] 5472 mova m28, [cq+64* 6] 5473 mova m26, [cq+64* 4] 5474 mova m22, [cq+64* 0] 5475 mova m23, [cq+64* 1] 5476 mova m29, [cq+64* 7] 5477 mova m27, [cq+64* 5] 5478 mova m25, [cq+64* 3] 5479 vpermb m8, m19, m24 ; 4 5480 vpermb m1, m19, m28 ; 12 5481 vpermb m7, m19, m26 ; 8 5482 vpermb m9, m19, m22 ; __ 0 5483 vpermb m21, m19, m23 ; 2 5484 vpermb m15, m19, m29 ; 14 5485 vpermb m18, m19, m27 ; 10 5486 vpermb m14, m19, m25 ; 6 5487 pslld m9, 16 5488 vpord m30, m19, [o(pb_32)] {1to16} 5489 REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23 5490 cmp eobd, 151 5491 jb .fast 5492 vpermb m0, m19, [cq+64*14] ; 28 5493 vpermb m5, m19, [cq+64*10] ; 20 5494 vpermb m3, m19, [cq+64*12] ; 24 5495 vpermb m6, m19, [cq+64* 8] ; __ 16 5496 pslld m6, 16 5497 call m(idct_16x16_internal_8bpc).main_fast 5498 vpermb m17, m19, [cq+64*15] ; 30 5499 vpermb m20, m19, [cq+64* 9] ; 18 5500 vpermb m16, m19, [cq+64*11] ; 22 5501 vpermb m19, m19, [cq+64*13] ; 26 5502 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 5503 mova [cq+64* 0], m14 5504 mova [cq+64* 1], m15 5505 mova [cq+64* 2], m16 5506 mova [cq+64* 3], m17 5507 mova [cq+64* 4], m18 5508 mova [cq+64* 5], m19 5509 mova [cq+64* 6], m20 5510 mova [cq+64* 7], m21 5511 vpermb m21, m30, [cq+64*15] 5512 vpermb m14, m30, [cq+64* 8] 5513 vpermb m17, m30, [cq+64*11] 5514 vpermb m18, m30, [cq+64*12] 5515 vpermb m19, m30, [cq+64*13] 5516 vpermb m16, m30, [cq+64*10] 5517 vpermb m15, m30, [cq+64* 9] 5518 vpermb m20, m30, [cq+64*14] 5519 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf 5520 jmp .end 5521.fast: ; bottom half is zero 5522 call m(idct_16x16_internal_8bpc).main_fast2 5523 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 5524 mova [cq+64* 0], m14 5525 mova [cq+64* 1], m15 5526 mova [cq+64* 2], m16 5527 mova [cq+64* 3], m17 5528 mova [cq+64* 4], m18 5529 mova [cq+64* 5], m19 5530 mova [cq+64* 6], m20 5531 
mova [cq+64* 7], m21 5532 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 5533.end: 5534 mova [cq+64* 8], m4 5535 mova [cq+64* 9], m5 5536 mova [cq+64*10], m6 5537 mova [cq+64*11], m7 5538 mova [cq+64*12], m26 5539 mova [cq+64*13], m27 5540 mova [cq+64*14], m28 5541 mova [cq+64*15], m29 5542 vpbroadcastd m13, [o(pw_8192)] 5543 call .pass1_end 5544 call .pass2 5545 mova [cq+64* 0], m0 5546 mova [cq+64* 1], m1 5547 mova [cq+64* 2], m2 5548 mova [cq+64* 3], m3 5549 mova [cq+64* 4], m4 5550 mova [cq+64* 5], m5 5551 mova [cq+64* 6], m6 5552 mova [cq+64* 7], m7 5553 pmulhrsw m0, m13, [cq+64* 8] 5554 pmulhrsw m1, m13, [cq+64* 9] 5555 pmulhrsw m2, m13, [cq+64*10] 5556 pmulhrsw m3, m13, [cq+64*11] 5557 vpbroadcastd m30, [o(pw_2048)] 5558 pmulhrsw m4, m13, m22 5559 pmulhrsw m5, m13, m23 5560 pmulhrsw m6, m13, m24 5561 pmulhrsw m7, m13, m25 5562 pmulhrsw m22, m30, m14 5563 pmulhrsw m14, m13, m26 5564 pmulhrsw m23, m30, m15 5565 pmulhrsw m15, m13, m27 5566 pmulhrsw m24, m30, m16 5567 pmulhrsw m16, m13, m28 5568 pmulhrsw m25, m30, m17 5569 pmulhrsw m17, m13, m29 5570 pmulhrsw m26, m30, m18 5571 pmulhrsw m18, m13, [cq+64*12] 5572 pmulhrsw m27, m30, m19 5573 pmulhrsw m19, m13, [cq+64*13] 5574 pmulhrsw m28, m30, m20 5575 pmulhrsw m20, m13, [cq+64*14] 5576 pmulhrsw m29, m30, m21 5577 pmulhrsw m21, m13, [cq+64*15] 5578 call .transpose_round 5579 call .pass2 5580 pxor m10, m10 5581 lea r3, [strideq*3] 5582%macro IDCT_64x16_END 4 5583 mova m9, [dstq+%4] 5584%if %1 < 8 5585 pmulhrsw m%3, m30, [cq+64*%1] 5586%endif 5587 pmulhrsw m%2, m30 5588 mova [cq+64*%1], m10 5589 punpcklbw m8, m9, m10 5590 punpckhbw m9, m10 5591 paddw m8, m%3 5592 paddw m9, m%2 5593 packuswb m8, m9 5594 mova [dstq+%4], m8 5595%if %1 == 3 || %1 == 7 || %1 == 11 5596 lea dstq, [dstq+strideq*4] 5597%endif 5598%endmacro 5599 IDCT_64x16_END 0, 0, 11, strideq*0 5600 IDCT_64x16_END 1, 1, 11, strideq*1 5601 IDCT_64x16_END 2, 2, 11, strideq*2 5602 IDCT_64x16_END 3, 3, 11, r3 5603 IDCT_64x16_END 4, 4, 11, strideq*0 
    IDCT_64x16_END        5,  5, 11, strideq*1
    IDCT_64x16_END        6,  6, 11, strideq*2
    IDCT_64x16_END        7,  7, 11, r3
    IDCT_64x16_END        8, 14, 22, strideq*0
    IDCT_64x16_END        9, 15, 23, strideq*1
    IDCT_64x16_END       10, 16, 24, strideq*2
    IDCT_64x16_END       11, 17, 25, r3
    IDCT_64x16_END       12, 18, 26, strideq*0
    IDCT_64x16_END       13, 19, 27, strideq*1
    IDCT_64x16_END       14, 20, 28, strideq*2
    IDCT_64x16_END       15, 21, 29, r3
    RET
ALIGN function_align
.pass1_end:
    ; Combine the even half (spilled in cq) with the odd half to produce
    ; outputs 16-47, scaling everything by m13 on the way out.
    mova                 m4, [cq+64* 0]
    mova                 m5, [cq+64* 1]
    mova                 m6, [cq+64* 2]
    mova                 m7, [cq+64* 3]
    mova                 m8, [cq+64* 4]
    mova                 m9, [cq+64* 5]
    mova                m11, [cq+64* 6]
    mova                m12, [cq+64* 7]
    psubsw              m29, m4, m21 ; out47 out46
    paddsw               m4, m21     ; out16 out17
    psubsw              m28, m5, m20 ; out44 out45
    paddsw               m5, m20     ; out19 out18
    REPX {pmulhrsw x, m13}, m0, m1, m2, m3
    psubsw              m27, m6, m19 ; out43 out42
    paddsw               m6, m19     ; out20 out21
    psubsw              m26, m7, m18 ; out40 out41
    paddsw               m7, m18     ; out23 out22
    pmulhrsw            m18, m13, m22
    pmulhrsw            m19, m13, m23
    pmulhrsw            m20, m13, m24
    pmulhrsw            m21, m13, m25
    paddsw              m25, m12, m14 ; out31 out30
    psubsw              m14, m12, m14 ; out32 out33
    paddsw              m24, m11, m15 ; out28 out29
    psubsw              m15, m11, m15 ; out35 out34
    REPX {pmulhrsw x, m13}, m4, m5, m6, m7
    paddsw              m23, m9, m16  ; out27 out26
    psubsw              m16, m9, m16  ; out36 out37
    paddsw              m22, m8, m17  ; out24 out25
    psubsw              m17, m8, m17  ; out39 out38
    REPX {pmulhrsw x, m13}, m14, m15, m16, m17
.transpose_round:
; 16-bit 8x4 transpose of four packed registers via word/dword unpacks.
%macro TRANSPOSE_8x4_PACKED 4
    punpckhwd            m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd           m%1, m%3      ; a0 e0 a1 e1 a2 e2 a3 e3
    punpcklwd           m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
    punpckhwd           m%2, m%4      ; c0 g0 c1 g1 c2 g2 c3 g3
    punpckhwd           m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
    punpcklwd           m%1, m%2      ; a0 c0 e0 g0 a1 c1 e1 g1
    punpckhwd           m%2, m8, m%3  ; b2 d2 f2 h2 b3 d3 f3 h3
    punpcklwd            m8, m%3      ; b0 d0 f0 h0 b1 d1 f1 h1
    punpcklwd           m%3, m%4, m%2 ; 2
    punpckhwd           m%4, m%2      ; 3
    punpckhwd           m%2, m%1, m8  ; 1
    punpcklwd           m%1, m8       ; 0
%endmacro
    TRANSPOSE_8x4_PACKED  0,  1,  2,  3
    TRANSPOSE_8x4_PACKED 18, 19, 20, 21
    TRANSPOSE_8x4_PACKED  4,  5,  6,  7
    TRANSPOSE_8x4_PACKED 14, 15, 16, 17
    ; shuffle 128-bit lanes to complete the transpose across registers
    vshufi32x4           m8, m0, m4, q3232   ; a02 a03 b02 b03
    vinserti32x8         m0, ym4, 1          ; a00 a01 b00 b01
    vshufi32x4           m4, m1, m5, q3232   ; a12 a13 b12 b13
    vinserti32x8         m9, m1, ym5, 1      ; a10 a11 b10 b11
    vshufi32x4           m5, m2, m6, q3232   ; a22 a23 b22 b23
    vinserti32x8         m1, m2, ym6, 1      ; a20 a21 b20 b21
    vshufi32x4           m6, m3, m7, q3232   ; a32 a33 b32 b33
    vinserti32x8        m11, m3, ym7, 1      ; a30 a31 b30 b31
    vshufi32x4           m2, m14, m18, q3232 ; c02 c03 d02 d03
    vinserti32x8         m3, m14, ym18, 1    ; c00 c01 d00 d01
    vshufi32x4          m18, m15, m19, q3232 ; c12 c13 d12 d13
    vinserti32x8        m15, ym19, 1         ; c10 c11 d10 d11
    vshufi32x4          m19, m16, m20, q3232 ; c22 c23 d22 d23
    vinserti32x8        m16, ym20, 1         ; c20 c21 d20 d21
    vshufi32x4          m20, m17, m21, q3232 ; c32 c33 d32 d33
    vinserti32x8        m17, ym21, 1         ; c30 c31 d30 d31
    ret
.pass2:
    ; distribute the transposed lanes into input order (comments give the
    ; logical input row index) and run the 32-point row transform
    vshufi32x4           m7, m5, m19, q3131  ; 14
    vshufi32x4           m5, m19, q2020      ; 10
    vshufi32x4          m21, m6, m20, q3131  ; 15
    vshufi32x4          m19, m6, m20, q2020  ; 11
    vshufi32x4          m20, m4, m18, q3131  ; 13
    vshufi32x4          m18, m4, m18, q2020  ; 9
    vshufi32x4           m6, m8, m2, q3131   ; 12
    vshufi32x4           m4, m8, m2, q2020   ; 8
    vshufi32x4           m2, m0, m3, q3131   ; 4
    vshufi32x4           m0, m3, q2020       ; 0
    vshufi32x4           m3, m1, m16, q3131  ; 6
    vshufi32x4           m1, m16, q2020      ; 2
    vshufi32x4          m16, m9, m15, q3131  ; 5
    vshufi32x4          m14, m9, m15, q2020  ; 1
    vshufi32x4          m15, m11, m17, q2020 ; 3
    vshufi32x4          m17, m11, m17, q3131 ; 7
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf

; void inv_txfm_add_dct_dct_32x64_8bpc(pixel *dst, ptrdiff_t stride,
;                                      coef *coeff, int eob)
; 32x64 inverse DCT + add for 8-bit content, AVX-512.
cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly
    PROLOGUE              0, 9, 30, 64*32, dst, stride, c, eob
    vpbroadcastd        m23, [o(pw_2896x8)]  ; 1/sqrt(2) pre-scale
%undef cmp
    cmp                eobd, 136
    jb .fast
    ; full path: run the 32-point column transform on all inputs
    pmulhrsw             m5, m23, [cq+64*20]
    pmulhrsw             m3, m23, [cq+64*12]
    pmulhrsw             m1, m23, [cq+64* 4]
    pmulhrsw             m7, m23, [cq+64*28]
    pmulhrsw             m2, m23, [cq+64* 8]
    pmulhrsw             m6, m23, [cq+64*24]
    pmulhrsw             m0, m23, [cq+64* 0]
    pmulhrsw             m4, m23, [cq+64*16]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    pmulhrsw            m14, m23, [cq+64* 2]
    pmulhrsw            m21, m23, [cq+64*30]
    pmulhrsw            m18, m23, [cq+64*18]
    pmulhrsw            m17, m23, [cq+64*14]
    pmulhrsw            m16, m23, [cq+64*10]
    pmulhrsw            m19, m23, [cq+64*22]
    pmulhrsw            m20, m23, [cq+64*26]
    pmulhrsw            m15, m23, [cq+64* 6]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    mova         [cq+64* 0], m14
    mova         [cq+64* 2], m15
    mova         [cq+64* 4], m16
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64*12], m20
    mova         [cq+64*14], m21
    pmulhrsw            m22, m23, [cq+64* 1]
    pmulhrsw            m21, m23, [cq+64*31]
    pmulhrsw            m14, m23, [cq+64*17]
    pmulhrsw            m29, m23, [cq+64*15]
    pmulhrsw            m26, m23, [cq+64* 9]
    pmulhrsw            m17, m23, [cq+64*23]
    pmulhrsw            m18, m23, [cq+64*25]
    pmulhrsw            m25, m23, [cq+64* 7]
    pmulhrsw            m24, m23, [cq+64* 5]
    pmulhrsw            m19, m23, [cq+64*27]
    pmulhrsw            m16, m23, [cq+64*21]
    pmulhrsw            m27, m23, [cq+64*11]
    pmulhrsw            m28, m23, [cq+64*13]
    pmulhrsw            m15, m23, [cq+64*19]
    pmulhrsw            m20, m23, [cq+64*29]
    pmulhrsw            m23, [cq+64* 3]
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
    vpbroadcastd        m12, [o(pw_16384)]
    ; final column butterflies: even half (m0-m7) +/- odd half
    psubsw              m13, m0, m29 ; 31
    paddsw               m0, m29     ; 0
    psubsw              m29, m1, m28 ; 30
    paddsw               m1, m28     ; 1
    psubsw              m28, m2, m27 ; 29
    paddsw               m2, m27     ; 2
    psubsw              m27, m3, m26 ; 28
    paddsw               m3, m26     ; 3
    psubsw              m26, m4, m25 ; 27
    paddsw               m4, m25     ; 4
    psubsw              m25, m5, m24 ; 26
    paddsw               m5, m24     ; 5
    psubsw              m24, m6, m23 ; 25
    paddsw               m6, m23     ; 6
    psubsw              m23, m7, m22 ; 24
    paddsw               m7, m22     ; 7
    pxor                 m9, m9
    ; transpose while clearing the consumed coefficient rows (m9 = zero)
    punpckhwd            m8, m0, m1  ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd            m0, m1      ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd            m1, m2, m3  ; c4 d4 c5 d5 c6 d6 c7 d7
    punpcklwd            m2, m3      ; c0 d0 c1 d1 c2 d2 c3 d3
    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
    punpckhwd           m22, m4, m5  ; e4 f4 e5 f5 e6 f6 e7 f7
    punpcklwd            m4, m5      ; e0 f0 e1 f1 e2 f2 e3 f3
    punpckhwd            m5, m6, m7  ; g4 h4 g5 h5 g6 h6 g7 h7
    punpcklwd            m6, m7      ; g0 h0 g1 h1 g2 h2 g3 h3
    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
    punpckhwd            m3, m23, m24
    punpcklwd           m23, m24
    punpckhwd           m24, m25, m26
    punpcklwd           m25, m26
    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
    punpckhwd           m26, m27, m28
    punpcklwd           m27, m28
    punpckhwd           m28, m29, m13
    punpcklwd           m29, m13
    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
    REPX {pmulhrsw x, m12}, m7, m0, m2, m4
    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
    punpckldq           m22, m5      ; e4 f4 g4 h5 e5 f5 g5 h5
    REPX {pmulhrsw x, m12}, m6, m8, m1, m22
    punpckhdq           m13, m23, m25
    punpckldq           m23, m25
    punpckhdq           m25, m27, m29
    punpckldq           m27, m29
    REPX {pmulhrsw x, m12}, m13, m23, m25, m27
    punpckhdq            m9, m3, m24
    punpckldq            m3, m24
    punpckhdq           m24, m26, m28
    punpckldq           m26, m28
    REPX {pmulhrsw x, m12}, m9, m3, m24, m26
    punpckhqdq           m5, m23, m27 ; d01 d09 d17 d25
    punpcklqdq          m23, m27      ; d00 d08 d16 d24
    punpcklqdq          m27, m13, m25 ; d02 d10 d18 d26
    punpckhqdq          m13, m25      ; d03 d11 d19 d27
    punpcklqdq          m25, m3, m26  ; d04 d12 d20 d28
    punpckhqdq           m3, m26      ; d05 d13 d21 d29
    punpcklqdq          m26, m9, m24  ; d06 d14 d22 d30
    punpckhqdq           m9, m24      ; d07 d15 d23 d31
    mova         [cq+64* 3], m23
    mova         [cq+64*13], m27
    mova         [cq+64* 7], m25
    mova         [cq+64*15], m26
    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
    punpcklqdq           m8, m22      ; a04 a12 a20 a28
    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
    punpcklqdq           m0, m4       ; a00 a08 a16 a24
    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
    punpcklqdq           m7, m2       ; a02 a10 a18 a26
    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
    punpcklqdq           m6, m1       ; a06 a14 a22 a30
    mova         [cq+64* 1], m0
    mova         [cq+64* 9], m7
    mova         [cq+64* 5], m8
    mova         [cq+64*11], m6
    ; second group of 8 column outputs (rows 8-23, mirrored 16-23)
    mova                 m2, [cq+64* 0]
    mova                m11, [cq+64* 2]
    mova                 m8, [cq+64* 4]
    mova                m29, [cq+64* 6]
    mova                m27, [cq+64* 8]
    mova                m26, [cq+64*10]
    mova                 m4, [cq+64*12]
    mova                m28, [cq+64*14]
    psubsw               m1, m2, m21  ; 23
    paddsw               m2, m21      ; 8
    psubsw              m21, m11, m20 ; 22
    paddsw              m11, m20      ; 9
    psubsw              m20, m8, m19  ; 21
    paddsw               m8, m19      ; 10
    psubsw              m19, m29, m18 ; 20
    paddsw              m29, m18      ; 11
    psubsw              m18, m27, m17 ; 19
    paddsw              m27, m17      ; 12
    psubsw              m17, m26, m16 ; 18
    paddsw              m26, m16      ; 13
    psubsw              m16, m4, m15  ; 17
    paddsw               m4, m15      ; 14
    psubsw              m15, m28, m14 ; 16
    paddsw              m28, m14      ; 15
    ; word/dword/qword transpose of these 16 rows
    punpcklwd           m14, m15, m16
    punpckhwd           m15, m16
    punpckhwd           m16, m17, m18
    punpcklwd           m17, m18
    punpckhwd           m18, m19, m20
    punpcklwd           m19, m20
    punpckhwd           m20, m21, m1
    punpcklwd           m21, m1
    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
    punpcklwd            m2, m11      ; i0 j1 i1 j1 i2 j2 i3 j3
    punpckhwd           m11, m8, m29  ; k4 l4 k5 l5 k6 l6 k7 l7
    punpcklwd            m8, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
    punpckhwd           m26, m4, m28  ; o4 p4 o5 p5 o6 p6 o7 p7
    punpcklwd            m4, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
    punpckhdq           m28, m2, m8   ; i2 j2 k2 l2 i3 j3 k3 l3
    punpckldq            m2, m8       ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m8, m27, m4  ; m2 n2 o2 p2 m3 n3 o3 p3
    punpckldq           m27, m4       ; m0 n0 o0 p0 m1 n1 o1 p1
    REPX {pmulhrsw x, m12}, m28, m2, m8, m27
    punpckhdq            m4, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
    REPX {pmulhrsw x, m12}, m4, m1, m11, m29
    punpckhdq           m26, m19, m21
    punpckldq           m19, m21
    punpckhdq           m21, m15, m16
    punpckldq           m15, m16
    REPX {pmulhrsw x, m12}, m26, m19, m21, m15
    punpckhdq           m16, m18, m20
    punpckldq           m18, m20
    punpckhdq           m20, m14, m17
    punpckldq           m14, m17
    REPX {pmulhrsw x, m12}, m16, m18, m20, m14
    punpckhqdq          m17, m28, m8  ; b03 b11 b19 b27
    punpcklqdq          m28, m8       ; b02 b10 b18 b26
    punpckhqdq           m8, m2, m27  ; b01 b09 b17 b25
    punpcklqdq           m2, m27      ; b00 b08 b16 b24
    punpcklqdq          m27, m1, m29  ; b04 b12 b20 b28
    punpckhqdq           m1, m29      ; b05 b13 b21 b29
    punpcklqdq          m29, m4, m11  ; b06 b14 b22 b30
    punpckhqdq           m4, m11      ; b07 b15 b23 b31
    mova         [cq+64* 0], m2
    mova         [cq+64* 8], m28
    mova         [cq+64* 4], m27
    mova         [cq+64*10], m29
    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
    punpcklqdq          m20, m26      ; c02 c10 c18 c26
    punpckhqdq          m26, m14, m19 ; c01 c09 c17 c25
    punpcklqdq          m14, m19      ; c00 c08 c16 c24
    punpckhqdq          m28, m15, m18 ; c05 c13 c21 c29
    punpcklqdq          m15, m18      ; c04 c12 c20 c28
    punpckhqdq          m29, m21, m16 ; c07 c15 c23 c31
    punpcklqdq          m21, m16      ; c06 c14 c22 c30
    mova         [cq+64* 2], m14
    mova         [cq+64*12], m20
    mova         [cq+64* 6], m15
    mova         [cq+64*14], m21
    ; gather the odd input rows for the four idct64 quarter passes
    vshufi32x4          m14, m22, m8, q3232  ; a17 a25 b17 b25
    vinserti32x8        m22, ym8, 1          ; a01 a09 b01 b09
    vshufi32x4          m15, m23, m17, q3232 ; a19 a27 b19 b27
    vinserti32x8        m23, ym17, 1         ; a03 a11 b03 b11
    vshufi32x4          m16, m24, m1, q3232  ; a21 a29 b21 b29
    vinserti32x8        m24, ym1, 1          ; a05 a13 b05 b13
    vshufi32x4          m17, m25, m4, q3232  ; a23 a31 b23 b31
    vinserti32x8        m25, ym4, 1          ; a07 a15 b07 b15
    vinserti32x8        m19, m26, ym5, 1     ; c01 c09 d01 d09
    vshufi32x4          m26, m5, q3232       ; c17 c25 d17 d25
    vinserti32x8        m20, m27, ym13, 1    ; c03 c11 d03 d11
    vshufi32x4          m27, m13, q3232      ; c19 c27 d19 d27
    vinserti32x8        m21, m28, ym3, 1     ; c05 c13 d05 d13
    vshufi32x4          m28, m3, q3232       ; c21 c29 d21 d29
    vinserti32x8        m18, m29, ym9, 1     ; c07 c15 d07 d15
    vshufi32x4          m29, m9, q3232       ; c23 c31 d23 d31
    mov                  r4, rsp             ; r4 = idct64 scratch cursor
    vshufi32x4           m0, m22, m19, q2020 ; 1
    vshufi32x4           m1, m17, m29, q3131 ; 31
    vshufi32x4           m2, m14, m26, q2020 ; 17
    vshufi32x4           m3, m25, m18, q3131 ; 15
    call .main_part1
    vshufi32x4           m0, m25, m18, q2020 ; 7
    vshufi32x4           m1, m14, m26, q3131 ; 25
    vshufi32x4           m2, m17, m29, q2020 ; 23
    vshufi32x4           m3, m22, m19, q3131 ; 9
    call .main_part1
    vshufi32x4           m0, m24, m21, q2020 ; 5
    vshufi32x4           m1, m15, m27, q3131 ; 27
    vshufi32x4           m2, m16, m28, q2020 ; 21
    vshufi32x4           m3, m23, m20, q3131 ; 11
    call .main_part1
    vshufi32x4           m0, m23, m20, q2020 ; 3
    vshufi32x4           m1, m16, m28, q3131 ; 29
    vshufi32x4           m2, m15, m27, q2020 ; 19
    vshufi32x4           m3, m24, m21, q3131 ; 13
    call .main_part1
    call .main_part2
    ; even 32-point half: reload transposed a/b/c/d groups
    mova                 m0, [cq+64* 1] ; a0
    mova                m15, [cq+64* 0] ; b0
    mova                 m3, [cq+64* 2] ; c0
    mova                m16, [cq+64* 3] ; d0
    mova                m14, [cq+64* 5] ; a4
    mova                 m8, [cq+64* 4] ; b4
    mova                m17, [cq+64* 6] ; c4
    mova                 m1, [cq+64* 7] ; d4
    vshufi32x4           m2, m0, m15, q3232  ; a16 a24 b16 b24
    vinserti32x8         m0, ym15, 1         ; a00 a08 b00 b08
    vshufi32x4          m15, m3, m16, q3232  ; c16 c24 d16 d24
    vinserti32x8         m3, ym16, 1         ; c00 c08 d00 d08
    vshufi32x4          m16, m14, m8, q3232  ; a20 a28 b20 b28
    vinserti32x8        m14, ym8, 1          ; a04 a12 b04 b12
    vshufi32x4           m8, m17, m1, q3232  ; c20 c28 d20 d28
    vinserti32x8        m17, ym1, 1          ; c04 c12 d04 d12
    vshufi32x4           m1, m0, m3, q3131   ; 8
    vshufi32x4           m0, m3, q2020       ; 0
    vshufi32x4           m3, m2, m15, q3131  ; 24
    vshufi32x4           m2, m15, q2020      ; 16
    vshufi32x4          m15, m14, m17, q3131 ; 12
    vshufi32x4          m14, m17, q2020      ; 4
    vshufi32x4          m17, m16, m8, q3131  ; 28
    vshufi32x4          m16, m8, q2020       ; 20
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    mova                 m8, [cq+64* 8]
    mova                 m9, [cq+64*12]
    mova                m11, [cq+64*10]
    mova                m12, [cq+64*14]
    mova         [cq+64* 0], m14
    mova         [cq+64* 2], m15
    mova         [cq+64* 4], m16
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64*12], m20
    mova         [cq+64*14], m21
    mova                m22, [cq+64* 9]
    mova                m27, [cq+64*13]
    mova                m23, [cq+64*11]
    mova                m24, [cq+64*15]
    vshufi32x4          m26, m22, m8, q3232  ; a18 a26 b18 b26
    vinserti32x8        m22, ym8, 1          ; a02 a10 b02 b10
    vshufi32x4           m8, m9, m27, q3232  ; c18 c26 d18 d26
    vinserti32x8         m9, ym27, 1         ; c02 c10 d02 d10
    vshufi32x4          m27, m23, m11, q3232 ; a22 a30 b22 b30
    vinserti32x8        m23, ym11, 1         ; a06 a14 b06 b14
    vshufi32x4          m11, m12, m24, q3232 ; c22 c30 d22 d30
    vinserti32x8        m12, ym24, 1         ; c06 c14 d06 d14
    vshufi32x4          m28, m26, m8, q3131  ; 26
    vshufi32x4          m26, m8, q2020       ; 18
    vshufi32x4          m24, m22, m9, q3131  ; 10
    vshufi32x4          m22, m9, q2020       ; 2
    vshufi32x4          m29, m27, m11, q3131 ; 30
    vshufi32x4          m27, m11, q2020      ; 22
    vshufi32x4          m25, m23, m12, q3131 ; 14
    vshufi32x4          m23, m12, q2020      ; 6
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    jmp .end
.fast: ; bottom/right halves are zero
    ; eob < 136: only the top-left 16x16 coefficients can be nonzero, so
    ; use narrower loads plus dup16_perm expansion
    pmulhrsw            ym9, ym23, [cq+64* 0]
    pmulhrsw            ym6, ym23, [cq+64* 8]
    mova                m14, [o(dup16_perm)]
    pmulhrsw            ym8, ym23, [cq+64* 2]
    pmulhrsw            xm0, xm23, [cq+64*14]
    pmulhrsw            xm5, xm23, [cq+64*10]
    pmulhrsw            ym1, ym23, [cq+64* 6]
    pmulhrsw            ym7, ym23, [cq+64* 4]
    pmulhrsw            xm3, xm23, [cq+64*12]
    pmovzxwd             m9, ym9
    pmovzxwd             m6, ym6
    vpermb               m8, m14, m8
    punpcklwd           xm0, xm0
    vpermb              ym5, ym14, ym5
    vpermb               m1, m14, m1
    vpermb               m7, m14, m7
    punpcklwd           xm3, xm3
    pslld                m9, 16
    pslld                m6, 16
    call m(idct_16x16_internal_8bpc).main_fast
    vpmulhrsw           ym21, ym23, [cq+64* 1]
    {evex}vpmulhrsw     xm17, xm23, [cq+64*15] ; force EVEX encoding, which
    {evex}vpmulhrsw     xm20, xm23, [cq+64* 9] ; reduces code size due to
    {evex}vpmulhrsw     ym15, ym23, [cq+64* 7] ; compressed displacements
    {evex}vpmulhrsw     ym18, ym23, [cq+64* 5]
    {evex}vpmulhrsw     xm16, xm23, [cq+64*11]
    {evex}vpmulhrsw     xm19, xm23, [cq+64*13]
    {evex}vpmulhrsw     ym23, [cq+64* 3]
    vpermb              m21, m14, m21
    punpcklwd           xm17, xm17
    vpermb              ym20, ym14, ym20
    vpermb              m15, m14, m15
    vpermb              m18, m14, m18
    vpermb              ym16, ym14, ym16
    punpcklwd           xm19, xm19
    vpermb              m14, m14, m23
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd         m9, [o(pw_16384)]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
    ; lane shuffles put each logical input row (comment) in place
    vshufi32x4          m16, m0, m3, q2020   ; 0
    vshufi32x4          m26, m0, m3, q3131   ; 4
    vshufi32x4           m0, m14, m2, q2020  ; 1
    vshufi32x4          m14, m2, q3131       ; 5
    vshufi32x4           m3, m19, m7, q3131  ; 15
    vshufi32x4          m19, m7, q2020       ; 11
    vshufi32x4          m27, m17, m9, q2020  ; 3
    vshufi32x4          m17, m9, q3131       ; 7
    vshufi32x4          m28, m20, m6, q2020  ; 9
    vshufi32x4          m20, m6, q3131       ; 13
    vshufi32x4          m22, m1, m18, q2020  ; 2
    vshufi32x4          m23, m1, m18, q3131  ; 6
    vshufi32x4          m24, m5, m15, q2020  ; 10
    vshufi32x4          m25, m5, m15, q3131  ; 14
    vshufi32x4          m15, m21, m4, q3131  ; 12
    vshufi32x4          m21, m21, m4, q2020  ; 8
    mov                  r4, rsp
    ; four idct64 quarter passes, fast variants (only in0/in3 nonzero)
    call .main_part1_fast
    mova                 m0, m17
    mova                 m3, m28
    call .main_part1_fast
    mova                 m0, m14
    mova                 m3, m19
    call .main_part1_fast
    mova                 m0, m27
    mova                 m3, m20
    call .main_part1_fast
    call .main_part2
    mova                 m0, m16
    mova                 m1, m21
    mova                m14, m26
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    mova         [cq+64*14], m21
    mova         [cq+64* 0], m14
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64* 4], m16
    mova         [cq+64* 2], m15
    mova         [cq+64*12], m20
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
.end:
    ; Output stage: r4-r8 precompute the stride multiples used to address
    ; the four dst rows touched per IDCT_32x64_END invocation.
    lea                  r4, [strideq*3]
    vpbroadcastd        m12, [o(pw_2048)]
    movshdup            m13, [o(permD)]
    lea                  r5, [r4+strideq]   ; stride*4
    lea                  r3, [dstq+r4*8]
    lea                  r6, [strideq+r5*8] ; stride*33
    lea                  r8, [r4+r5*8]      ; stride*35
    add                  r3, r5             ; dst+stride*28
    lea                  r7, [r6+strideq]   ; stride*34
; Combines the 32-point even/odd halves with the idct64 temporaries on
; the stack, rounds, and adds four output rows (two near the top, two
; mirrored near the bottom) to dst.
%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
%if %2 < 8
    paddsw              m10, m%2, m%1
    psubsw              m11, m%2, m%1
%else
    mova                m11, [cq+64*(%2*2-16)]
    paddsw              m10, m11, m%1
    psubsw              m11, m%1
%endif
    mova                 m9, [rsp+64*(31-%2)]
    mova                m%1, [rsp+64*%2]
    paddsw               m8, m10, m9
    psubsw              m10, m9
    paddsw               m9, m11, m%1
    pmovzxbw             m0, [dstq+%3]
    psubsw              m11, m%1
    pmovzxbw            m%1, [r3  +%4]
    REPX {pmulhrsw x, m12}, m8, m10, m9, m11
    paddw                m8, m0
    pmovzxbw             m0, [r3  +%5]
    paddw               m10, m%1
    pmovzxbw            m%1, [dstq+%6]
    paddw                m9, m0
    paddw               m11, m%1
%if %2 >= 8
%if %2 == 8
    pxor                 m1, m1
%endif
    ; clear the two consumed coefficient rows
    mova [cq+64*(%2*2-16)], m1
    mova [cq+64*(%2*2-15)], m1
%endif
    packuswb             m8, m10
    packuswb             m9, m11
    vpermq               m8, m13, m8
    vpermq               m9, m13, m9
    mova          [dstq+%3], ym8
    vextracti32x8 [r3  +%4], m8, 1
    mova          [r3  +%5], ym9
    vextracti32x8 [dstq+%6], m9, 1
%if %2 == 3 || %2 == 7 || %2 == 11
    add                dstq, r5
    sub                  r3, r5
%endif
%endmacro
    IDCT_32x64_END       29,  0, strideq*0, r8, r4       , r5*8
    IDCT_32x64_END       28,  1, strideq*1, r7, strideq*2, r6
    IDCT_32x64_END       27,  2, strideq*2, r6, strideq*1, r7
    IDCT_32x64_END       26,  3, r4       , r5*8, strideq*0, r8
    IDCT_32x64_END       25,  4, strideq*0, r8, r4       , r5*8
    IDCT_32x64_END       24,  5, strideq*1, r7, strideq*2, r6
    IDCT_32x64_END       23,  6, strideq*2, r6, strideq*1, r7
    IDCT_32x64_END       22,  7, r4       , r5*8, strideq*0, r8
    IDCT_32x64_END       21,  8, strideq*0, r8, r4       , r5*8
    IDCT_32x64_END       20,  9, strideq*1, r7, strideq*2, r6
    IDCT_32x64_END       19, 10, strideq*2, r6, strideq*1, r7
    IDCT_32x64_END       18, 11, r4       , r5*8, strideq*0, r8
    IDCT_32x64_END       17, 12, strideq*0, r8, r4       , r5*8
    IDCT_32x64_END       16, 13, strideq*1, r7, strideq*2, r6
    IDCT_32x64_END       15, 14, strideq*2, r6, strideq*1, r7
    IDCT_32x64_END       14, 15, r4       , r5*8, strideq*0, r8
    RET
.dconly:
    ; DC-only: two *181>>8 scaling rounds (1/sqrt(2) each) for 32x64,
    ; then the shared add-DC loop
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 64
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
    imul                r6d, 181
    add                 r6d, 128+256
    sar                 r6d, 8+1
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
ALIGN function_align ; bottom three-quarters are zero
cglobal_label .main_part1_fast2
    ; idct64 quarter pass with only in1 (m0) nonzero: derive the rotated
    ; pairs directly with vpdpwssd instead of full butterflies
    vpbroadcastd         m7, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    pmulhrsw             m7, m0 ; t63a
    pmulhrsw             m0, m8 ; t32a

    punpcklwd            m4, m0, m7
    punpckhwd            m6, m0, m7
    mova                 m1, m10
    vpdpwssd             m1, m4, [o(idct64_mul+4*9)] {bcstd}
    mova                 m9, m10
    vpdpwssd             m9, m6, [o(idct64_mul+4*9)] {bcstd}
    REPX {psrad x, 12}, m1, m9
    packssdw             m1, m9
    mova                 m9, m10
    vpdpwssd             m9, m6, [o(idct64_mul+4*8)] {bcstd}
    mova                 m6, m10
    vpdpwssd             m6, m4, [o(idct64_mul+4*8)] {bcstd}
    REPX {psrad x, 12}, m9, m6
    packssdw             m6, m9

    mova                 m4, m0
    mova                 m3, m7
    mova                 m5, m1
    mova                 m2, m6
    jmp .main_part1c
cglobal_label .main_part1_fast
    ; idct64 quarter pass with only in1 (m0) and in3 (m3) nonzero
    vpbroadcastd         m1, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    vpbroadcastd         m2, [o(idct64_mul+4*6)]
    vpbroadcastd         m9, [o(idct64_mul+4*7)]
    pmulhrsw             m1, m0 ; t63a
    pmulhrsw             m0, m8 ; t32a
    pmulhrsw             m2, m3 ; t60a
    pmulhrsw             m3, m9 ; t35a
    mova                 m8, m0
    mova                 m7, m1
    mova                 m6, m3
    mova                 m5, m2
    jmp .main_part1b
cglobal_label .main_part1
    ; idct64 steps 1-5:
    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    vpbroadcastd         m7, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    vpbroadcastd         m6, [o(idct64_mul+4*2)]
    vpbroadcastd         m9, [o(idct64_mul+4*3)]
    pmulhrsw             m7, m0 ; t63a
    vpbroadcastd         m5, [o(idct64_mul+4*4)]
    pmulhrsw             m0, m8 ; t32a
    vpbroadcastd         m8, [o(idct64_mul+4*5)]
    pmulhrsw             m6, m1 ; t62a
    vpbroadcastd         m4, [o(idct64_mul+4*6)]
    pmulhrsw             m1, m9 ; t33a
    vpbroadcastd         m9, [o(idct64_mul+4*7)]
    pmulhrsw             m5, m2 ; t61a
    pmulhrsw             m2, m8 ; t34a
    pmulhrsw             m4, m3 ; t60a
    pmulhrsw             m3, m9 ; t35a
    psubsw               m8, m0, m1 ; t33
    paddsw               m0, m1     ; t32
    psubsw               m1, m7, m6 ; t62
    paddsw               m7, m6     ; t63
    psubsw               m6, m3, m2 ; t34
    paddsw               m3, m2     ; t35
    psubsw               m2, m4, m5 ; t61
    paddsw               m5, m4     ; t60
.main_part1b:
    vpbroadcastd        m11, [o(idct64_mul+4*8)]
    vpbroadcastd        m12, [o(idct64_mul+4*9)]
    ITX_MULSUB_2W         1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
    vpbroadcastd        m11, [o(idct64_mul+4*10)]
    ITX_MULSUB_2W         2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
    psubsw               m4, m0, m3 ; t35a
    paddsw               m0, m3     ; t32a
    psubsw               m3, m7, m5 ; t60a
    paddsw               m7, m5     ; t63a
    psubsw               m5, m1, m2 ; t34
    paddsw               m1, m2     ; t33
    psubsw               m2, m8, m6 ; t61
    paddsw               m6, m8     ; t62
.main_part1c:
    vpbroadcastd        m11, [o(idct64_mul+4*11)]
    vpbroadcastd        m12, [o(idct64_mul+4*12)]
    add                  r5, 4*13   ; advance constant pointer for next call
    ITX_MULSUB_2W         3, 4, 8, 9, 10, 11, 12 ; t35,  t60
    ITX_MULSUB_2W         2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
    ; store this quarter's 8 t-values to the stack scratch area (r4)
    mova          [r4+64*0], m0
    mova          [r4+64*7], m7
    mova          [r4+64*1], m1
    mova          [r4+64*6], m6
    mova          [r4+64*3], m3
    mova          [r4+64*4], m4
    mova          [r4+64*2], m2
    mova          [r4+64*5], m5
    add                  r4, 64*8
    ret
cglobal_label .main_part2
    ; idct64 steps 6-9 over the 32 spilled t-values: r4 walks forward
    ; from the start, r6 backward from the end, pairing mirrored rows.
    vpbroadcastd        m11, [o(pw_1567_3784  -16*13)]
    vpbroadcastd        m12, [o(pw_m3784_1567 -16*13)]
    lea                  r6, [r4+64*7]
    vpbroadcastd        m17, [o(pw_m1567_m3784-16*13)]
    vpbroadcastd        m18, [o(pw_2896_2896  -16*13)]
    vpbroadcastd        m19, [o(pw_m2896_2896 -16*13)]
    sub                  r5, 16*13  ; undo the 4 constant-pointer advances
.main_part2_loop:
    mova                 m0, [r4-64*32] ; t32a
    mova                 m1, [r6-64*24] ; t39a
    mova                 m2, [r6-64*32] ; t63a
    mova                 m3, [r4-64*24] ; t56a
    mova                 m4, [r4-64*16] ; t40a
    mova                 m5, [r6-64* 8] ; t47a
    mova                 m6, [r6-64*16] ; t55a
    mova                 m7, [r4-64* 8] ; t48a
    psubsw               m8, m0, m1 ; t39
    paddsw               m0, m1     ; t32
    psubsw               m1, m2, m3 ; t56
    paddsw               m2, m3     ; t63
    psubsw               m3, m5, m4 ; t40
    paddsw               m5, m4     ; t47
    psubsw               m4, m7, m6 ; t55
    paddsw               m7, m6     ; t48
    ITX_MULSUB_2W         1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
    ITX_MULSUB_2W         4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
    psubsw               m6, m2, m7 ; t48a
    paddsw               m2, m7     ; t63a
    psubsw               m7, m0, m5 ; t47a
    paddsw               m0, m5     ; t32a
    psubsw               m5, m8, m3 ; t55
    paddsw               m8, m3     ; t56
    psubsw               m3, m1, m4 ; t40
    paddsw               m1, m4     ; t39
    ITX_MULSUB_2W         6, 7, 4, 9, 10, 18, 19 ; t47,  t48
    ITX_MULSUB_2W         5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
    mova          [r6-64* 8], m2
    mova          [r4-64*32], m0
    mova          [r4-64* 8], m8
    mova          [r6-64*32], m1
    mova          [r6-64*24], m6
    mova          [r4-64*16], m7
    mova          [r4-64*24], m5
    mova          [r6-64*16], m3
    add                  r4, 64
    sub                  r6, 64
    cmp                  r4, r6
    jb .main_part2_loop
    ret

; void inv_txfm_add_dct_dct_64x32_8bpc(pixel *dst, ptrdiff_t stride,
;                                      coef *coeff, int eob)
; 64x32 inverse DCT + add for 8-bit content, AVX-512.
cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly
    PROLOGUE              0, 7, 30, 64*32, dst, stride, c, eob
    vpbroadcastd        m23, [o(pw_2896x8)]
%undef cmp
    cmp                eobd, 136
    jb .fast
    ; full path: run the idct64 column passes on the four odd-row groups
    pmulhrsw             m0, m23, [cq+64* 1]
    pmulhrsw             m1, m23, [cq+64*31]
    pmulhrsw             m2, m23, [cq+64*17]
    pmulhrsw             m3, m23, [cq+64*15]
    vpbroadcastd        m10, [o(pd_2048)]
    mov                  r4, rsp
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    pmulhrsw             m0, m23, [cq+64* 7]
    pmulhrsw             m1, m23, [cq+64*25]
    pmulhrsw             m2, m23, [cq+64*23]
    pmulhrsw             m3, m23, [cq+64* 9]
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    pmulhrsw             m0, m23, [cq+64* 5]
    pmulhrsw             m1, m23, [cq+64*27]
    pmulhrsw             m2, m23, [cq+64*21]
    pmulhrsw             m3, m23, [cq+64*11]
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    pmulhrsw             m0, m23, [cq+64* 3]
    pmulhrsw             m1, m23, [cq+64*29]
    pmulhrsw             m2, m23, [cq+64*19]
    pmulhrsw             m3, m23, [cq+64*13]
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
    ; even 32-point half on rows 0,4,8,...,28
    pmulhrsw             m3, m23, [cq+64*24]
    pmulhrsw             m1, m23, [cq+64* 8]
    pmulhrsw             m2, m23, [cq+64*16]
    pmulhrsw             m0, m23, [cq+64* 0]
    pmulhrsw            m14, m23, [cq+64* 4]
    pmulhrsw            m17, m23, [cq+64*28]
    pmulhrsw            m16, m23, [cq+64*20]
    pmulhrsw            m15, m23, [cq+64*12]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    pmulhrsw            m22, m23, [cq+64* 2]
    pmulhrsw            m29, m23, [cq+64*30]
    pmulhrsw            m26, m23, [cq+64*18]
    pmulhrsw            m25, m23, [cq+64*14]
    pmulhrsw            m24, m23, [cq+64*10]
    pmulhrsw            m27, m23, [cq+64*22]
    pmulhrsw            m28, m23, [cq+64*26]
    pmulhrsw            m23, [cq+64* 6]
    mova         [cq+64* 0], m14
    mova         [cq+64* 1], m15
    mova         [cq+64* 2], m16
    mova         [cq+64* 3], m17
    mova         [cq+64* 4], m18
    mova         [cq+64* 5], m19
    mova         [cq+64* 6], m20
    mova         [cq+64* 7], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    vpbroadcastd        m13, [o(pw_16384)]
    call .pass1_end_part1
    mova         [cq+64*16], m1
    mova         [cq+64*17], m3
    mova         [cq+64*18], m5
    mova         [cq+64*19], m7
    mova         [cq+64*24], m23
    mova         [cq+64*25], m25
    mova         [cq+64*26], m27
    mova         [cq+64*27], m29
    pmulhrsw            m23, m13, m0 ; a0
    pmulhrsw            m25, m13, m2 ; a2
    pmulhrsw            m27, m13, m4 ; a4
    pmulhrsw            m29, m13, m6 ; a6
    REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
    call .pass1_end_part2
    mova         [cq+64*20], m15
    mova         [cq+64*21], m17
    mova         [cq+64*22], m19
    mova         [cq+64*23], m21
    mova
[cq+64*28], m1 6414 mova [cq+64*29], m3 6415 mova [cq+64*30], m5 6416 mova [cq+64*31], m7 6417 REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 6418 REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 6419 vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 6420 vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 6421 vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 6422 vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 6423 vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 6424 vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 6425 vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 6426 vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43 6427 vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 6428 vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 6429 vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 6430 vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 6431 vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 6432 vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 6433 vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 6434 vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 6435 vshufi32x4 m2, m3, m15, q3131 ; 8 6436 vshufi32x4 m0, m3, m15, q2020 ; 0 6437 vshufi32x4 m6, m23, m22, q3131 ; 24 6438 vshufi32x4 m4, m23, m22, q2020 ; 16 6439 vshufi32x4 m3, m1, m18, q3131 ; 12 6440 vshufi32x4 m1, m18, q2020 ; 4 6441 vshufi32x4 m7, m27, m26, q3131 ; 28 6442 vshufi32x4 m5, m27, m26, q2020 ; 20 6443 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 6444 vshufi32x4 m16, m14, m17, q3131 ; 10 6445 vshufi32x4 m14, m17, q2020 ; 2 6446 vshufi32x4 m17, m19, m20, q3131 ; 14 6447 vshufi32x4 m15, m19, m20, q2020 ; 6 6448 vshufi32x4 m20, m25, m24, q3131 ; 26 6449 vshufi32x4 m18, m25, m24, q2020 ; 18 6450 vshufi32x4 m21, m29, m28, q3131 ; 30 6451 vshufi32x4 m19, m29, m28, q2020 ; 22 6452 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 6453 pmulhrsw m22, m13, [cq+64*16] ; a1 6454 pmulhrsw m23, m13, [cq+64*20] ; c1 6455 pmulhrsw m24, m13, [cq+64*24] ; e1 6456 pmulhrsw m25, m13, [cq+64*28] ; g1 6457 pmulhrsw m26, m13, [cq+64*17] ; a3 6458 pmulhrsw 
m27, m13, [cq+64*21] ; c3 6459 pmulhrsw m28, m13, [cq+64*25] ; e3 6460 pmulhrsw m29, m13, [cq+64*29] ; g3 6461 mova [cq+64* 8], m14 6462 mova [cq+64* 9], m15 6463 mova [cq+64*10], m16 6464 mova [cq+64*11], m17 6465 mova [cq+64*12], m18 6466 mova [cq+64*13], m19 6467 mova [cq+64*14], m20 6468 mova [cq+64*15], m21 6469 pmulhrsw m14, m13, [cq+64*18] ; a5 6470 pmulhrsw m15, m13, [cq+64*22] ; c5 6471 pmulhrsw m16, m13, [cq+64*26] ; e5 6472 pmulhrsw m17, m13, [cq+64*30] ; g5 6473 pmulhrsw m18, m13, [cq+64*19] ; a7 6474 pmulhrsw m19, m13, [cq+64*23] ; c7 6475 pmulhrsw m20, m13, [cq+64*27] ; e7 6476 pmulhrsw m21, m13, [cq+64*31] ; g7 6477 vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 6478 vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 6479 vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 6480 vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 6481 vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 6482 vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 6483 vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 6484 vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 6485 mova [cq+64* 0], m0 6486 mova [cq+64* 1], m1 6487 mova [cq+64* 2], m2 6488 mova [cq+64* 3], m3 6489 mova [cq+64* 4], m4 6490 mova [cq+64* 5], m5 6491 mova [cq+64* 6], m6 6492 mova [cq+64* 7], m7 6493 vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 6494 vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 6495 vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 6496 vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 6497 vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 6498 vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 6499 vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71 6500 vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 6501 vshufi32x4 m27, m23, m11, q3131 ; 11 m27 6502 vshufi32x4 m23, m11, q2020 ; 3 m23 6503 vshufi32x4 m19, m26, m28, q3131 ; 27 m19 6504 vshufi32x4 m15, m26, m28, q2020 ; 19 m15 6505 vshufi32x4 m29, m25, m17, q3131 ; 15 m29 6506 vshufi32x4 m25, m17, q2020 ; 7 m25 6507 vshufi32x4 m21, m18, m20, q3131 ; 31 m21 6508 vshufi32x4 m17, m18, 
m20, q2020 ; 23 m17 6509 vshufi32x4 m20, m14, m16, q3131 ; 29 m20 6510 vshufi32x4 m16, m14, m16, q2020 ; 21 m16 6511 vshufi32x4 m18, m22, m24, q3131 ; 25 m18 6512 vshufi32x4 m14, m22, m24, q2020 ; 17 m14 6513 vshufi32x4 m26, m8, m9, q3131 ; 9 m26 6514 vshufi32x4 m22, m8, m9, q2020 ; 1 m22 6515 vshufi32x4 m28, m12, m13, q3131 ; 13 m28 6516 vshufi32x4 m24, m12, m13, q2020 ; 5 m24 6517 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 6518 vpbroadcastd m13, [o(pw_16384)] 6519 pmulhrsw m0, m13, [r4-64*21] 6520 pmulhrsw m1, m13, [r4-64*22] 6521 pmulhrsw m2, m13, [r4-64*23] 6522 pmulhrsw m3, m13, [r4-64*24] 6523 pmulhrsw m4, m13, [r4-64*25] 6524 pmulhrsw m5, m13, [r4-64*26] 6525 pmulhrsw m6, m13, [r4-64*27] 6526 pmulhrsw m7, m13, [r4-64*28] 6527 mova [cq+64*16], m14 6528 mova [cq+64*17], m15 6529 mova [cq+64*18], m16 6530 mova [cq+64*19], m17 6531 mova [cq+64*20], m18 6532 mova [cq+64*21], m19 6533 mova [cq+64*22], m20 6534 mova [cq+64*23], m21 6535 pmulhrsw m14, m13, [r4-64*12] 6536 pmulhrsw m15, m13, [r4-64*11] 6537 pmulhrsw m16, m13, [r4-64*10] 6538 pmulhrsw m17, m13, [r4-64* 9] 6539 pmulhrsw m18, m13, [r4-64* 8] 6540 pmulhrsw m19, m13, [r4-64* 7] 6541 pmulhrsw m20, m13, [r4-64* 6] 6542 pmulhrsw m21, m13, [r4-64* 5] 6543 mova [cq+64*24], m22 6544 mova [cq+64*25], m23 6545 mova [cq+64*26], m24 6546 mova [cq+64*27], m25 6547 mova [cq+64*28], m26 6548 mova [cq+64*29], m27 6549 mova [cq+64*30], m28 6550 mova [cq+64*31], m29 6551 call .transpose_2x8x8_lo 6552 mova [r4-64*12], m1 6553 mova [r4-64*11], m3 6554 mova [r4-64*10], m5 6555 mova [r4-64* 9], m7 6556 mova [r4-64* 8], m15 6557 mova [r4-64* 7], m17 6558 mova [r4-64* 6], m19 6559 mova [r4-64* 5], m21 6560 vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 6561 vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 6562 vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 6563 vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 6564 vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 6565 vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 
6566 vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 6567 vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 6568 pmulhrsw m0, m13, [r4-64*20] 6569 pmulhrsw m1, m13, [r4-64*19] 6570 pmulhrsw m2, m13, [r4-64*18] 6571 pmulhrsw m3, m13, [r4-64*17] 6572 pmulhrsw m4, m13, [r4-64*16] 6573 pmulhrsw m5, m13, [r4-64*15] 6574 pmulhrsw m6, m13, [r4-64*14] 6575 pmulhrsw m7, m13, [r4-64*13] 6576 pmulhrsw m14, m13, [r4-64*29] 6577 pmulhrsw m15, m13, [r4-64*30] 6578 pmulhrsw m16, m13, [r4-64*31] 6579 pmulhrsw m17, m13, [r4-64*32] 6580 pmulhrsw m18, m13, [r4-64*33] 6581 pmulhrsw m19, m13, [r4-64*34] 6582 pmulhrsw m20, m13, [r4-64*35] 6583 pmulhrsw m21, m13, [r4-64*36] 6584 call .transpose_2x8x8_lo 6585 mova [r4-64*20], m1 6586 mova [r4-64*19], m3 6587 mova [r4-64*18], m5 6588 mova [r4-64*17], m7 6589 mova [r4-64*16], m15 6590 mova [r4-64*15], m17 6591 mova [r4-64*14], m19 6592 mova [r4-64*13], m21 6593 vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 6594 vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 6595 vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 6596 vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 6597 vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21 6598 vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 6599 vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 6600 vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 6601 vshufi32x4 m2, m0, m22, q3131 ; 8 6602 vshufi32x4 m0, m22, q2020 ; 0 6603 vshufi32x4 m3, m1, m26, q3131 ; 12 6604 vshufi32x4 m1, m26, q2020 ; 4 6605 vshufi32x4 m6, m4, m23, q3131 ; 24 6606 vshufi32x4 m4, m23, q2020 ; 16 6607 vshufi32x4 m7, m5, m27, q3131 ; 28 6608 vshufi32x4 m5, m27, q2020 ; 20 6609 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 6610 vshufi32x4 m16, m14, m24, q3131 ; 10 6611 vshufi32x4 m14, m24, q2020 ; 2 6612 vshufi32x4 m17, m15, m28, q3131 ; 14 6613 vshufi32x4 m15, m28, q2020 ; 6 6614 vshufi32x4 m20, m18, m25, q3131 ; 26 6615 vshufi32x4 m18, m25, q2020 ; 18 6616 vshufi32x4 m21, m19, m29, q3131 ; 30 6617 vshufi32x4 m19, m29, q2020 ; 22 6618 call 
m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 6619 mova m22, [r4-64*20] 6620 mova m26, [r4-64*16] 6621 mova m23, [r4-64*19] 6622 mova m27, [r4-64*15] 6623 mova m24, [r4-64*18] 6624 mova m28, [r4-64*14] 6625 mova m25, [r4-64*17] 6626 mova m29, [r4-64*13] 6627 mova [r4-64*20], m14 6628 mova [r4-64*19], m15 6629 mova [r4-64*18], m16 6630 mova [r4-64*17], m17 6631 mova [r4-64*16], m18 6632 mova [r4-64*15], m19 6633 mova [r4-64*14], m20 6634 mova [r4-64*13], m21 6635 mova m19, [r4-64*12] 6636 mova m11, [r4-64* 8] 6637 mova m20, [r4-64*11] 6638 mova m12, [r4-64* 7] 6639 mova m21, [r4-64*10] 6640 mova m8, [r4-64* 6] 6641 mova m9, [r4-64* 9] 6642 mova m18, [r4-64* 5] 6643 vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 6644 vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 6645 vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 6646 vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 6647 vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 6648 vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 6649 vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 6650 vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 6651 vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 6652 vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 6653 vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 6654 vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 6655 vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 6656 vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53 6657 vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 6658 vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 6659 vshufi32x4 m26, m22, m27, q3131 ; 9 6660 vshufi32x4 m22, m27, q2020 ; 1 6661 vshufi32x4 m27, m23, m28, q3131 ; 11 6662 vshufi32x4 m23, m28, q2020 ; 3 6663 vshufi32x4 m28, m24, m29, q3131 ; 13 6664 vshufi32x4 m24, m29, q2020 ; 5 6665 vshufi32x4 m29, m25, m8, q3131 ; 15 6666 vshufi32x4 m25, m8, q2020 ; 7 6667 vshufi32x4 m18, m14, m19, q3131 ; 25 6668 vshufi32x4 m14, m19, q2020 ; 17 6669 vshufi32x4 m19, m15, m20, q3131 ; 27 6670 vshufi32x4 m15, m20, q2020 ; 19 6671 vshufi32x4 m20, m16, m21, 
q3131 ; 29 6672 vshufi32x4 m16, m21, q2020 ; 21 6673 vshufi32x4 m21, m17, m9, q3131 ; 31 6674 vshufi32x4 m17, m9, q2020 ; 23 6675 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 6676 jmp .end 6677.fast: ; bottom/right halves are zero 6678 {evex}vpmulhrsw ym8, ym23, [cq+64* 4] 6679 {evex}vpmulhrsw xm1, xm23, [cq+64*12] 6680 mova m28, [o(dup16_perm)] 6681 {evex}vpmulhrsw ym7, ym23, [cq+64* 8] 6682 vpmulhrsw ym22, ym23, [cq+64* 0] 6683 vpermb m8, m28, m8 6684 vpermb ym1, ym28, ym1 6685 vpermb m7, m28, m7 6686 pmovzxwd m9, ym22 6687 pslld m9, 16 6688 call m(idct_16x16_internal_8bpc).main_fast2 6689 {evex}vpmulhrsw ym21, ym23, [cq+64* 2] 6690 {evex}vpmulhrsw xm15, xm23, [cq+64*14] 6691 {evex}vpmulhrsw xm18, xm23, [cq+64*10] 6692 {evex}vpmulhrsw ym14, ym23, [cq+64* 6] 6693 vpermb m21, m28, m21 6694 punpcklwd xm15, xm15 6695 vpermb ym18, ym28, ym18 6696 vpermb m14, m28, m14 6697 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 6698 vpmulhrsw ym22, ym23, [cq+64* 1] 6699 {evex}vpmulhrsw xm29, xm23, [cq+64*15] 6700 {evex}vpmulhrsw xm26, xm23, [cq+64* 9] 6701 {evex}vpmulhrsw ym25, ym23, [cq+64* 7] 6702 {evex}vpmulhrsw ym24, ym23, [cq+64* 5] 6703 {evex}vpmulhrsw xm27, xm23, [cq+64*11] 6704 {evex}vpmulhrsw xm8, xm23, [cq+64*13] 6705 {evex}vpmulhrsw ym23, [cq+64* 3] 6706 vpermb m22, m28, m22 6707 punpcklwd xm29, xm29 6708 vpermb ym26, ym28, ym26 6709 vpermb m25, m28, m25 6710 mova [cq+64* 0], m14 6711 mova [cq+64* 1], m15 6712 mova [cq+64* 2], m16 6713 mova [cq+64* 3], m17 6714 REPX {vpermb x, m28, x}, m24, m27, m23 6715 punpcklwd xm28, xm8, xm8 6716 mova [cq+64* 4], m18 6717 mova [cq+64* 5], m19 6718 mova [cq+64* 6], m20 6719 mova [cq+64* 7], m21 6720 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 6721 mov r4, rsp 6722 vpbroadcastd m13, [o(pw_16384)] 6723 mova [r4+64*16], m4 6724 mova [r4+64*17], m5 6725 mova [r4+64*18], m6 6726 mova [r4+64*19], m7 6727 mova [r4+64*28], m26 6728 mova [r4+64*29], m27 6729 mova [r4+64*30], m28 6730 mova [r4+64*31], m29 
    ; (.fast path continued) finish pass 1 of the second row group, then run
    ; pass 2 on both column halves, spilling intermediates to cq / the stack.
    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
    mova          [r4+64*20], m22
    mova          [r4+64*21], m23
    mova          [r4+64*22], m24
    mova          [r4+64*23], m25
    mova          [r4+64*24], m26
    mova          [r4+64*25], m27
    mova          [r4+64*26], m28
    mova          [r4+64*27], m29
    call .pass2_fast
    ; spill the idct32 even-half results while the odd half is computed
    mova          [cq+64* 8], m14
    mova          [cq+64* 9], m15
    mova          [cq+64*10], m16
    mova          [cq+64*11], m17
    mova          [cq+64*12], m18
    mova          [cq+64*13], m19
    mova          [cq+64*14], m20
    mova          [cq+64*15], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    mova          [cq+64* 0], m0
    mova          [cq+64* 1], m1
    mova          [cq+64* 2], m2
    mova          [cq+64* 3], m3
    mova          [cq+64* 4], m4
    mova          [cq+64* 5], m5
    mova          [cq+64* 6], m6
    mova          [cq+64* 7], m7
    ; second column half: reload pass-1 rows from the stack, rounding with m13
    pmulhrsw             m0, m13, [r4+64*16]
    pmulhrsw             m1, m13, [r4+64*17]
    pmulhrsw             m2, m13, [r4+64*18]
    pmulhrsw             m3, m13, [r4+64*19]
    pmulhrsw             m4, m13, [r4+64*20]
    pmulhrsw             m5, m13, [r4+64*21]
    pmulhrsw             m6, m13, [r4+64*22]
    pmulhrsw             m7, m13, [r4+64*23]
    mova          [cq+64*16], m14
    mova          [cq+64*17], m15
    mova          [cq+64*18], m16
    mova          [cq+64*19], m17
    mova          [cq+64*20], m18
    mova          [cq+64*21], m19
    mova          [cq+64*22], m20
    mova          [cq+64*23], m21
    pmulhrsw            m14, m13, [r4+64*24]
    pmulhrsw            m15, m13, [r4+64*25]
    pmulhrsw            m16, m13, [r4+64*26]
    pmulhrsw            m17, m13, [r4+64*27]
    pmulhrsw            m18, m13, [r4+64*28]
    pmulhrsw            m19, m13, [r4+64*29]
    pmulhrsw            m20, m13, [r4+64*30]
    pmulhrsw            m21, m13, [r4+64*31]
    mova          [cq+64*24], m22
    mova          [cq+64*25], m23
    mova          [cq+64*26], m24
    mova          [cq+64*27], m25
    mova          [cq+64*28], m26
    mova          [cq+64*29], m27
    mova          [cq+64*30], m28
    mova          [cq+64*31], m29
    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
    call .pass2_fast
    mova          [r4+64*16], m14
    mova          [r4+64*17], m15
    mova          [r4+64*18], m16
    mova          [r4+64*19], m17
    mova          [r4+64*20], m18
    mova          [r4+64*21], m19
    mova          [r4+64*22], m20
    mova          [r4+64*23], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
.end: ; final stage: combine the idct16/idct32 halves, round, add to dst
    vpbroadcastd        m13, [o(pw_2048)]   ; pass-2 rounding constant
    lea                  r5, [strideq*3]
    pxor                m12, m12            ; zero: used for byte unpacking
                                            ; and for clearing cq below
    lea                  r3, [dstq+r5*8]
    lea                  r6, [strideq+r5]   ; stride*4
    add                  r3, r6             ; dst+stride*28
; Add two mirrored output rows (0+n at dstq+off_lo, 31-n at r3+off_hi) to the
; destination: sum/difference of the src16/src32 partial results, rounded via
; pmulhrsw with pw_2048, packed to bytes. Also zeroes the two consumed
; coefficient rows in cq so the buffer is clean for the next call.
%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
    mova                m11, [cq+64*( %3)]   ; 0
    mova                 m9, [cq+64*(31-%3)] ; 31
%if %3 >= 8
    mova                m%1, [rsp+64*(%1+16)]
%endif
    mova                m10, [dstq+%4]
    paddsw               m8, m11, m9
    psubsw              m11, m9
    paddsw               m9, m%1, m%2
    psubsw              m%1, m%2
    punpcklbw           m%2, m10, m12
    punpckhbw           m10, m12
    pmulhrsw             m8, m13
    pmulhrsw             m9, m13
    paddw                m8, m%2
    paddw                m9, m10
    mova                m10, [r3+%5]
    pmulhrsw            m11, m13
    pmulhrsw            m%1, m13
    mova  [cq+64*( %3)], m12                 ; clear consumed coefficients
    mova  [cq+64*(31-%3)], m12
    punpcklbw           m%2, m10, m12
    punpckhbw           m10, m12
    packuswb             m8, m9
    paddw               m11, m%2
    paddw               m%1, m10
    packuswb            m11, m%1
    mova         [dstq+%4], m8
    mova          [r3 +%5], m11
%if %3 == 3 || %3 == 7 || %3 == 11
    add                dstq, r6              ; advance dst 4 rows and pull the
    sub                  r3, r6              ; mirrored row pointer up 4 rows
%endif
%endmacro
    IDCT_64x32_END        0, 29,  0, strideq*0, r5
    IDCT_64x32_END        1, 28,  1, strideq*1, strideq*2
    IDCT_64x32_END        2, 27,  2, strideq*2, strideq*1
    IDCT_64x32_END        3, 26,  3, r5       , strideq*0
    IDCT_64x32_END        4, 25,  4, strideq*0, r5
    IDCT_64x32_END        5, 24,  5, strideq*1, strideq*2
    IDCT_64x32_END        6, 23,  6, strideq*2, strideq*1
    IDCT_64x32_END        7, 22,  7, r5       , strideq*0
    IDCT_64x32_END        0, 21,  8, strideq*0, r5
    IDCT_64x32_END        1, 20,  9, strideq*1, strideq*2
    IDCT_64x32_END        2, 19, 10, strideq*2, strideq*1
    IDCT_64x32_END        3, 18, 11, r5       , strideq*0
    IDCT_64x32_END        4, 17, 12, strideq*0, r5
    IDCT_64x32_END        5, 16, 13, strideq*1, strideq*2
    IDCT_64x32_END        6, 15, 14, strideq*2, strideq*1
    IDCT_64x32_END        7, 14, 15, r5       , strideq*0
    RET
ALIGN function_align
.dconly: ; DC-only block: scale the single coefficient, reuse the shared path
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 32
    imul                r6d, 181        ; 181/256 ~= 1/sqrt(2)
    add                 r6d, 128
    sar                 r6d, 8
    imul                r6d, 181        ; second 1/sqrt(2) scale + rounding
    add                 r6d, 128+256
    sar                 r6d, 8+1
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
ALIGN function_align
.pass1_end_part1: ; combine idct32 results with the idct64 halves on the stack
; One invocation produces four pass-1 outputs (a/d/e/h or b/c/f/g groups);
; outputs not kept in registers overwrite the idct64 stack rows at r4.
%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
%if %1 != %3
    mova                m%1, [cq+64*%1]
%endif
    mova                 m9, [r4+64*(%3-36)] ; idct64 32+n
    mova                m11, [r4+64*(-5-%3)] ; idct64 63-n
    psubsw               m8, m%1, m%2        ; idct32 31-n
    paddsw              m%1, m%2             ; idct32 0+n
%if %1 == %3
    psubsw              m%2, m8, m9          ; out 32+n e
    paddsw               m8, m9              ; out 31-n d
    psubsw               m9, m%1, m11        ; out 63-n h
    paddsw              m%1, m11             ; out 0+n a
%else
    paddsw              m%2, m8, m9          ; out 23-n c
    psubsw               m8, m9              ; out 40+n f
    paddsw               m9, m%1, m11        ; out 8+n b
    psubsw              m%1, m11             ; out 55-n g
%endif
    mova  [r4+64*(%3-36)], m8
    mova  [r4+64*(-5-%3)], m9
%endmacro
    IDCT_64x32_PASS1_END  0, 29,  0
    IDCT_64x32_PASS1_END  1, 28,  1
    IDCT_64x32_PASS1_END  2, 27,  2
    IDCT_64x32_PASS1_END  3, 26,  3
    IDCT_64x32_PASS1_END  4, 25,  4
    IDCT_64x32_PASS1_END  5, 24,  5
    IDCT_64x32_PASS1_END  6, 23,  6
    IDCT_64x32_PASS1_END  7, 22,  7
.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
    ; word-granularity transpose of the high register group (falls through)
    punpcklwd            m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
    punpckhwd           m25, m24      ; e4 f4 e5 f5 e6 f6 e7 f7
    punpcklwd           m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
    punpckhwd           m23, m22      ; g4 h4 g5 h5 g6 h6 g7 h7
    punpcklwd           m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd           m29, m28      ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd           m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd           m27, m26      ; c4 d4 c5 d5 c6 d6 c7 d7
    punpckldq           m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq           m29, m27      ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq           m27, m8, m24  ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m8, m24      ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckhdq           m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq           m22, m28      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckldq           m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 6921 punpckhqdq m23, m22, m27 ; 1 23 6922 punpcklqdq m22, m27 ; 0 22 6923 punpckhqdq m27, m26, m28 ; 5 27 6924 punpcklqdq m26, m28 ; 4 26 6925 punpcklqdq m28, m29, m25 ; 6 28 6926 punpckhqdq m29, m25 ; 7 29 6927 punpckhqdq m25, m24, m8 ; 3 25 6928 punpcklqdq m24, m8 ; 2 24 6929.transpose_8x8: 6930 punpckhwd m8, m4, m5 6931 punpcklwd m4, m5 6932 punpckhwd m5, m0, m1 6933 punpcklwd m0, m1 6934 punpckhwd m1, m6, m7 6935 punpcklwd m6, m7 6936 punpckhwd m7, m2, m3 6937 punpcklwd m2, m3 6938 punpckhdq m3, m0, m2 6939 punpckldq m0, m2 6940 punpckldq m2, m4, m6 6941 punpckhdq m4, m6 6942 punpckhdq m6, m5, m7 6943 punpckldq m5, m7 6944 punpckldq m7, m8, m1 6945 punpckhdq m8, m1 6946 punpckhqdq m1, m0, m2 6947 punpcklqdq m0, m2 6948 punpcklqdq m2, m3, m4 6949 punpckhqdq m3, m4 6950 punpcklqdq m4, m5, m7 6951 punpckhqdq m5, m7 6952 punpckhqdq m7, m6, m8 6953 punpcklqdq m6, m8 6954 ret 6955.pass1_end_part2: 6956 IDCT_64x32_PASS1_END 0, 21, 8 6957 IDCT_64x32_PASS1_END 1, 20, 9 6958 IDCT_64x32_PASS1_END 2, 19, 10 6959 IDCT_64x32_PASS1_END 3, 18, 11 6960 IDCT_64x32_PASS1_END 4, 17, 12 6961 IDCT_64x32_PASS1_END 5, 16, 13 6962 IDCT_64x32_PASS1_END 6, 15, 14 6963 IDCT_64x32_PASS1_END 7, 14, 15 6964.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 6965 punpcklwd m8, m3, m2 6966 punpckhwd m3, m2 6967 punpcklwd m2, m1, m0 6968 punpckhwd m1, m0 6969 punpcklwd m0, m7, m6 6970 punpckhwd m7, m6 6971 punpcklwd m6, m5, m4 6972 punpckhwd m5, m4 6973 punpckldq m4, m7, m5 6974 punpckhdq m7, m5 6975 punpckldq m5, m8, m2 6976 punpckhdq m8, m2 6977 punpckhdq m2, m0, m6 6978 punpckldq m0, m6 6979 punpckldq m6, m3, m1 6980 punpckhdq m3, m1 6981 punpckhqdq m1, m0, m5 6982 punpcklqdq m0, m5 6983 punpckhqdq m5, m4, m6 6984 punpcklqdq m4, m6 6985 punpcklqdq m6, m7, m3 6986 punpckhqdq m7, m3 6987 punpckhqdq m3, m2, m8 6988 punpcklqdq m2, m8 6989 punpckhwd m8, m18, m19 6990 punpcklwd m18, m19 6991 punpckhwd m19, m14, m15 6992 punpcklwd m14, m15 6993 punpckhwd 
m15, m20, m21 6994 punpcklwd m20, m21 6995 punpckhwd m21, m16, m17 6996 punpcklwd m16, m17 6997 punpckhdq m17, m14, m16 6998 punpckldq m14, m16 6999 punpckldq m16, m18, m20 7000 punpckhdq m18, m20 7001 punpckhdq m20, m19, m21 7002 punpckldq m19, m21 7003 punpckldq m21, m8, m15 7004 punpckhdq m8, m15 7005 punpckhqdq m15, m14, m16 7006 punpcklqdq m14, m16 7007 punpcklqdq m16, m17, m18 7008 punpckhqdq m17, m18 7009 punpcklqdq m18, m19, m21 7010 punpckhqdq m19, m21 7011 punpckhqdq m21, m20, m8 7012 punpcklqdq m20, m8 7013 ret 7014.pass2_fast: 7015 vshufi32x4 m24, m9, m15, q3131 ; 5 7016 vshufi32x4 m22, m9, m15, q2020 ; 1 7017 vshufi32x4 m15, m1, m16, q3131 ; 6 7018 vshufi32x4 m14, m1, m16, q2020 ; 2 7019 vshufi32x4 m1, m0, m3, q3131 ; 4 7020 vshufi32x4 m0, m3, q2020 ; 0 7021 vshufi32x4 m3, m8, m2, q3131 ; 12 7022 vshufi32x4 m2, m8, m2, q2020 ; 8 7023 vshufi32x4 m25, m11, m17, q3131 ; 7 7024 vshufi32x4 m23, m11, m17, q2020 ; 3 7025 vshufi32x4 m17, m5, m19, q3131 ; 14 7026 vshufi32x4 m16, m5, m19, q2020 ; 10 7027 vshufi32x4 m29, m6, m20, q3131 ; 15 7028 vshufi32x4 m27, m6, m20, q2020 ; 11 7029 vshufi32x4 m28, m4, m18, q3131 ; 13 7030 vshufi32x4 m26, m4, m18, q2020 ; 9 7031 jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 7032 7033cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob 7034 lea r5, [o_base] 7035 test eobd, eobd 7036 jz .dconly 7037 PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob 7038%undef cmp 7039 cmp eobd, 136 7040 jb .fast 7041 mova m0, [cq+64* 1] 7042 mova m1, [cq+64*31] 7043 mova m2, [cq+64*17] 7044 mova m3, [cq+64*15] 7045 vpbroadcastd m10, [o(pd_2048)] 7046 mov r4, rsp 7047 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7048 mova m0, [cq+64* 7] 7049 mova m1, [cq+64*25] 7050 mova m2, [cq+64*23] 7051 mova m3, [cq+64* 9] 7052 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7053 mova m0, [cq+64* 5] 7054 mova m1, [cq+64*27] 7055 mova m2, [cq+64*21] 7056 mova m3, [cq+64*11] 7057 call 
m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7058 mova m0, [cq+64* 3] 7059 mova m1, [cq+64*29] 7060 mova m2, [cq+64*19] 7061 mova m3, [cq+64*13] 7062 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7063 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 7064 mova m0, [cq+64* 0] 7065 mova m1, [cq+64* 8] 7066 mova m2, [cq+64*16] 7067 mova m3, [cq+64*24] 7068 mova m14, [cq+64* 4] 7069 mova m15, [cq+64*12] 7070 mova m16, [cq+64*20] 7071 mova m17, [cq+64*28] 7072 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 7073 mova m22, [cq+64* 2] 7074 mova m29, [cq+64*30] 7075 mova m26, [cq+64*18] 7076 mova m25, [cq+64*14] 7077 mova m24, [cq+64*10] 7078 mova m27, [cq+64*22] 7079 mova m28, [cq+64*26] 7080 mova m23, [cq+64* 6] 7081 mova [cq+64* 0], m14 7082 mova [cq+64* 1], m15 7083 mova [cq+64* 2], m16 7084 mova [cq+64* 3], m17 7085 mova [cq+64* 4], m18 7086 mova [cq+64* 5], m19 7087 mova [cq+64* 6], m20 7088 mova [cq+64* 7], m21 7089 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 7090 vpbroadcastd m13, [o(pw_8192)] 7091 call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 7092 mova [r4+64*36], m1 7093 mova [r4+64*37], m3 7094 mova [r4+64*38], m5 7095 mova [r4+64*39], m7 7096 mova [r4+64*44], m23 7097 mova [r4+64*45], m25 7098 mova [r4+64*46], m27 7099 mova [r4+64*47], m29 7100 pmulhrsw m23, m13, m0 ; a0 7101 pmulhrsw m25, m13, m2 ; a2 7102 pmulhrsw m27, m13, m4 ; a4 7103 pmulhrsw m29, m13, m6 ; a6 7104 call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 7105 lea r6, [r4-64*4] 7106 add r4, 64*28 7107 call .pass2_end 7108 mov r4, rsp 7109 mova m0, [r4+64*23] 7110 mova m1, [r4+64*22] 7111 mova m2, [r4+64*21] 7112 mova m3, [r4+64*20] 7113 mova m4, [r4+64*19] 7114 mova m5, [r4+64*18] 7115 mova m6, [r4+64*17] 7116 mova m7, [r4+64*16] 7117 mova m22, [r4+64*15] 7118 mova m23, [r4+64*14] 7119 mova m24, [r4+64*13] 7120 mova m25, [r4+64*12] 7121 mova m26, [r4+64*11] 7122 mova m27, [r4+64*10] 7123 mova m28, [r4+64* 9] 7124 mova m29, [r4+64* 8] 7125 call 
m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi 7126 vpbroadcastd m13, [o(pw_8192)] 7127 mova [r4+64* 8], m1 7128 mova [r4+64* 9], m3 7129 mova [r4+64*10], m5 7130 mova [r4+64*11], m7 7131 mova [r4+64*16], m23 7132 mova [r4+64*17], m25 7133 mova [r4+64*18], m27 7134 mova [r4+64*19], m29 7135 pmulhrsw m23, m13, m0 ; b0 7136 pmulhrsw m25, m13, m2 ; b2 7137 pmulhrsw m27, m13, m4 ; b4 7138 pmulhrsw m29, m13, m6 ; b6 7139 mova m0, [r4+64*31] 7140 mova m1, [r4+64*30] 7141 mova m2, [r4+64*29] 7142 mova m3, [r4+64*28] 7143 mova m4, [r4+64*27] 7144 mova m5, [r4+64*26] 7145 mova m6, [r4+64*25] 7146 mova m7, [r4+64*24] 7147 mova m14, [r4+64* 7] 7148 mova m15, [r4+64* 6] 7149 mova m16, [r4+64* 5] 7150 mova m17, [r4+64* 4] 7151 mova m18, [r4+64* 3] 7152 mova m19, [r4+64* 2] 7153 mova m20, [r4+64* 1] 7154 mova m21, [r4+64* 0] 7155 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo 7156 mov r6, cq 7157 call .pass2_end 7158 jmp .end 7159.fast: ; bottom/right halves are zero 7160 mova m28, [o(dup16_perm)] 7161 pmovzxwd m9, [cq+64* 0] 7162 vpermb m8, m28, [cq+64* 4] 7163 vpermb ym1, ym28, [cq+64*12] 7164 vpermb m7, m28, [cq+64* 8] 7165 pslld m9, 16 7166 call m(idct_16x16_internal_8bpc).main_fast2 7167 vpermb m21, m28, [cq+64* 2] 7168 vpermb ym15, ym28, [cq+64*14] 7169 vpermb ym18, ym28, [cq+64*10] 7170 vpermb m14, m28, [cq+64* 6] 7171 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 7172 vpermb m22, m28, [cq+64* 1] 7173 vpermb ym29, ym28, [cq+64*15] 7174 vpermb ym26, ym28, [cq+64* 9] 7175 vpermb m25, m28, [cq+64* 7] 7176 vpermb m24, m28, [cq+64* 5] 7177 vpermb ym27, ym28, [cq+64*11] 7178 vpermb m23, m28, [cq+64* 3] 7179 vpermb ym28, ym28, [cq+64*13] 7180 mova [cq+64* 0], m14 7181 mova [cq+64* 1], m15 7182 mova [cq+64* 2], m16 7183 mova [cq+64* 3], m17 7184 mova [cq+64* 4], m18 7185 mova [cq+64* 5], m19 7186 mova [cq+64* 6], m20 7187 mova [cq+64* 7], m21 7188 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 7189 vpbroadcastd m13, [o(pw_8192)] 7190 
; NOTE(review): this chunk is the tail of a 64-row inverse DCT transform whose
; cglobal entry point lies above this excerpt; the local labels below (.end,
; .dconly, .pass2_end, .pass2_fast) are scoped to that function.  cq is the
; coefficient buffer (read and then cleared), rsp holds intermediate rows in
; 64-byte (one zmm) chunks.  Presumably this is the 64x64 function, given the
; 64x16/32x64 helpers it calls and the 64-row loop bound -- TODO confirm.

; Spill the remaining pass-1 rows, finish pass 1 via the shared 64x16 helper,
; then run pass 2 on each half of the coefficients through .pass2_fast.
    mova         [cq+64*16], m4
    mova         [cq+64*17], m5
    mova         [cq+64*18], m6
    mova         [cq+64*19], m7
    mova         [cq+64*28], m26
    mova         [cq+64*29], m27
    mova         [cq+64*30], m28
    mova         [cq+64*31], m29
    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
    mova         [cq+64*20], m22
    mova         [cq+64*21], m23
    mova         [cq+64*22], m24
    mova         [cq+64*23], m25
    mova         [cq+64*24], m26
    mova         [cq+64*25], m27
    mova         [cq+64*26], m28
    mova         [cq+64*27], m29
    lea          r4, [rsp+64*64]
    lea          r3, [rsp+64*32]
    call .pass2_fast
    ; Reload the spilled half, rescale, transpose, and run pass 2 on it too.
    pmulhrsw     m0, m13, [cq+64*16]
    pmulhrsw     m1, m13, [cq+64*17]
    pmulhrsw     m2, m13, [cq+64*18]
    pmulhrsw     m3, m13, [cq+64*19]
    pmulhrsw     m4, m13, [cq+64*20]
    pmulhrsw     m5, m13, [cq+64*21]
    pmulhrsw     m6, m13, [cq+64*22]
    pmulhrsw     m7, m13, [cq+64*23]
    pmulhrsw     m14, m13, [cq+64*24]
    pmulhrsw     m15, m13, [cq+64*25]
    pmulhrsw     m16, m13, [cq+64*26]
    pmulhrsw     m17, m13, [cq+64*27]
    pmulhrsw     m18, m13, [cq+64*28]
    pmulhrsw     m19, m13, [cq+64*29]
    pmulhrsw     m20, m13, [cq+64*30]
    pmulhrsw     m21, m13, [cq+64*31]
    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
    mov          r4, rsp
    mov          r3, cq
    call .pass2_fast
.end:
    ; Final output stage: combine the idct16/idct32/idct64 partial results
    ; (spread across rsp and cq) into full 64-point rows, round with
    ; pmulhrsw(x, pw_2048) (i.e. (x*2048 + 16384) >> 15 = round(x/16)),
    ; add to dst pixels, and pack back to bytes.  r6 walks forward from rsp,
    ; r4 walks backward; the loop ends when they meet (cmp r6, r4 / jb).
    vpbroadcastd m17, [o(pw_2048)]
    lea          r5, [strideq*8]
    mov          r3, dstq
    pxor         m16, m16             ; zero: used both for unpacking bytes
                                      ; and for clearing cq as it is read
    sub          r4, 64*5 ; rsp+64*31
    mov          r6, rsp
.end_loop:
    mova         m2, [r6+64*32]       ; idct16 0+n lo
    mova         m7, [r6+64*48]       ; idct32 31-n lo
    mova         m6, [cq+64* 0]       ; idct16 0+n hi
    mova         m0, [cq+64*16]       ; idct32 31-n hi
    mova         m4, [r4+64*64]       ; idct64 63-n lo
    mova         m1, [r4+64* 0]       ; idct64 63-n hi
    mova         m5, [r6+64*64]       ; idct64 32+n lo
    mova         m8, [r6+64* 0]       ; idct64 32+n hi
    sub          r3, strideq
    ; idct32 butterfly, interleaved with loading the destination pixels
    paddsw       m3, m2, m7           ; idct32 0+n lo
    mova         m12, [dstq+r5*0]
    psubsw       m2, m7               ; idct32 31-n lo
    mova         m15, [r3 +r5*8]
    paddsw       m7, m6, m0           ; idct32 0+n hi
    mova         m13, [r3 +r5*4]
    psubsw       m6, m0               ; idct32 31-n hi
    mova         m14, [dstq+r5*4]
    ; idct64 butterfly, interleaved with advancing the pointers and
    ; clearing the consumed coefficient slots
    paddsw       m0, m3, m4           ; out 0+n lo
    add          r6, 64
    psubsw       m3, m4               ; out 63-n lo
    sub          r4, 64
    paddsw       m4, m7, m1           ; out 0+n hi
    mova         [cq+64* 0], m16
    psubsw       m7, m1               ; out 63-n hi
    mova         [cq+64*16], m16
    paddsw       m1, m2, m5           ; out 31-n lo
    add          cq, 64
    psubsw       m2, m5               ; out 32+n lo
    paddsw       m5, m6, m8           ; out 31-n hi
    psubsw       m6, m8               ; out 32+n hi
    ; round outputs while unpacking dst bytes to words (zero-extend via m16)
    pmulhrsw     m0, m17
    punpcklbw    m8, m12, m16
    pmulhrsw     m4, m17
    punpckhbw    m12, m16
    pmulhrsw     m3, m17
    punpcklbw    m11, m15, m16
    pmulhrsw     m7, m17
    punpckhbw    m15, m16
    pmulhrsw     m1, m17
    punpcklbw    m9, m13, m16
    pmulhrsw     m5, m17
    punpckhbw    m13, m16
    pmulhrsw     m2, m17
    punpcklbw    m10, m14, m16
    pmulhrsw     m6, m17
    punpckhbw    m14, m16
    ; add residuals to pixels and pack back to unsigned bytes with saturation
    paddw        m0, m8
    paddw        m4, m12
    packuswb     m0, m4
    paddw        m3, m11
    paddw        m7, m15
    packuswb     m3, m7
    paddw        m1, m9
    paddw        m5, m13
    packuswb     m1, m5
    paddw        m2, m10
    paddw        m6, m14
    packuswb     m2, m6
    mova         [dstq+r5*0], m0      ; row 0+n
    mova         [r3 +r5*8], m3       ; row 63-n
    mova         [r3 +r5*4], m1       ; row 31-n
    mova         [dstq+r5*4], m2      ; row 32+n
    add          dstq, strideq
    cmp          r6, r4
    jb .end_loop
    RET
.dconly:
    ; DC-only block: forward to the shared 64x16 dconly handler with the
    ; row count in r3d (64; set via `or` -- presumably the relevant bits of
    ; r3d are clear at this point, as at the 64x16 dconly entry -- confirm).
    movsx        r6d, word [cq]
    mov          [cq], eobd
    or           r3d, 64
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
ALIGN function_align
.pass2_end:
    ; Full pass 2 (all 32 input rows significant).  The vinserti32x8 /
    ; vshufi32x4 sequences transpose 128-bit lanes to gather same-numbered
    ; coefficients (see lane comments), then the even half runs through the
    ; shared 32x16/32x32 odd-half helpers and the odd coefficients through
    ; four 32x64 main_part1 calls, finishing in main_part2 (tail call).
    REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
    mova         [r4+64*20], m1
    mova         [r4+64*21], m3
    mova         [r4+64*22], m5
    mova         [r4+64*23], m7
    vinserti32x8 m1, m23, ym14, 1     ; a00 a01 c00 c01
    vshufi32x4   m3, m23, m14, q3232  ; a02 a03 c02 c03
    vinserti32x8 m5, m22, ym0, 1      ; e00 e01 g00 g01
    vshufi32x4   m14, m22, m0, q3232  ; e02 e03 g02 g03
    mova         [r4+64*12], m15
    mova         [r4+64*13], m17
    mova         [r4+64*14], m19
    mova         [r4+64*15], m21
    vinserti32x8 m15, m27, ym18, 1    ; a40 a41 c40 c41
    vshufi32x4   m17, m27, m18, q3232 ; a42 a43 c42 c43
    vinserti32x8 m18, m26, ym4, 1     ; e40 e41 g40 g41
    vshufi32x4   m19, m26, m4, q3232  ; e42 e43 g42 g43
    vinserti32x8 m22, m25, ym16, 1    ; a20 a21 c20 c21
    vshufi32x4   m26, m25, m16, q3232 ; a22 a23 c22 c23
    vinserti32x8 m25, m24, ym2, 1     ; e20 e21 g20 g21
    vshufi32x4   m27, m24, m2, q3232  ; e22 e23 g22 g23
    vinserti32x8 m23, m29, ym20, 1    ; a60 a61 c60 c61
    vshufi32x4   m29, m20, q3232      ; a62 a63 c62 c63
    vshufi32x4   m13, m28, m6, q3232  ; e62 e63 g62 g63
    vinserti32x8 m28, ym6, 1          ; e60 e61 g60 g61
    vshufi32x4   m0, m1, m5, q2020    ; 0
    vshufi32x4   m1, m5, q3131        ; 8
    vshufi32x4   m2, m3, m14, q2020   ; 16
    vshufi32x4   m3, m14, q3131       ; 24
    vshufi32x4   m14, m15, m18, q2020 ; 4
    vshufi32x4   m15, m18, q3131      ; 12
    vshufi32x4   m16, m17, m19, q2020 ; 20
    vshufi32x4   m17, m19, q3131      ; 28
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    vshufi32x4   m24, m22, m25, q3131 ; 10
    vshufi32x4   m22, m25, q2020      ; 2
    vshufi32x4   m25, m23, m28, q3131 ; 14
    vshufi32x4   m23, m28, q2020      ; 6
    vshufi32x4   m28, m26, m27, q3131 ; 26
    vshufi32x4   m26, m27, q2020      ; 18
    vshufi32x4   m27, m29, m13, q2020 ; 22
    vshufi32x4   m29, m13, q3131      ; 30
    ; spill the idct16 half of the even results
    mova         [r6+64* 0], m0
    mova         [r6+64* 1], m1
    mova         [r6+64* 2], m2
    mova         [r6+64* 3], m3
    mova         [r6+64* 4], m4
    mova         [r6+64* 5], m5
    mova         [r6+64* 6], m6
    mova         [r6+64* 7], m7
    mova         [r6+64* 8], m14
    mova         [r6+64* 9], m15
    mova         [r6+64*10], m16
    mova         [r6+64*11], m17
    mova         [r6+64*12], m18
    mova         [r6+64*13], m19
    mova         [r6+64*14], m20
    mova         [r6+64*15], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    vpbroadcastd m13, [o(pw_8192)]
    ; spill the idct32 odd-half results (stored in reverse order)
    mova         [r6+64*16], m29
    mova         [r6+64*17], m28
    mova         [r6+64*18], m27
    mova         [r6+64*19], m26
    mova         [r6+64*20], m25
    mova         [r6+64*21], m24
    mova         [r6+64*22], m23
    mova         [r6+64*23], m22
    mova         [r6+64*24], m21
    mova         [r6+64*25], m20
    mova         [r6+64*26], m19
    mova         [r6+64*27], m18
    mova         [r6+64*28], m17
    mova         [r6+64*29], m16
    mova         [r6+64*30], m15
    mova         [r6+64*31], m14
    ; reload the odd coefficients, rescaled by pw_8192
    pmulhrsw     m15, m13, [r4+64* 8] ; 1 9 17 25
    pmulhrsw     m16, m13, [r4+64*12]
    pmulhrsw     m17, m13, [r4+64*16]
    pmulhrsw     m18, m13, [r4+64*20]
    pmulhrsw     m19, m13, [r4+64*11] ; 7 15 23 31
    pmulhrsw     m20, m13, [r4+64*15]
    pmulhrsw     m21, m13, [r4+64*19]
    pmulhrsw     m22, m13, [r4+64*23]
    vinserti32x8 m14, m15, ym16, 1    ; a1 a9 c1 c9
    vshufi32x4   m15, m16, q3232      ; a17 a25 c17 c25
    vinserti32x8 m16, m17, ym18, 1    ; e1 e9 g1 g9
    vshufi32x4   m17, m18, q3232      ; e17 e25 g17 g25
    pmulhrsw     m23, m13, [r4+64*10] ; 5 13 21 29
    pmulhrsw     m24, m13, [r4+64*14]
    pmulhrsw     m25, m13, [r4+64*18]
    pmulhrsw     m26, m13, [r4+64*22]
    vinserti32x8 m18, m19, ym20, 1    ; a7 a15 c7 c15
    vshufi32x4   m19, m20, q3232      ; a23 a31 c23 c31
    vinserti32x8 m20, m21, ym22, 1    ; e7 e15 g7 g15
    vshufi32x4   m21, m22, q3232      ; e23 e31 g23 g31
    pmulhrsw     m27, m13, [r4+64* 9] ; 3 11 19 27
    pmulhrsw     m28, m13, [r4+64*13]
    pmulhrsw     m29, m13, [r4+64*17]
    pmulhrsw     m13, [r4+64*21]
    ; idct64 odd part: four main_part1 calls, one coefficient pair each
    vshufi32x4   m0, m14, m16, q2020  ; 1
    vshufi32x4   m1, m19, m21, q3131  ; 31
    vshufi32x4   m2, m15, m17, q2020  ; 17
    vshufi32x4   m3, m18, m20, q3131  ; 15
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    vshufi32x4   m0, m18, m20, q2020  ; 7
    vshufi32x4   m1, m15, m17, q3131  ; 25
    vshufi32x4   m2, m19, m21, q2020  ; 23
    vshufi32x4   m3, m14, m16, q3131  ; 9
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    vinserti32x8 m22, m23, ym24, 1    ; a5 a13 c5 c13
    vshufi32x4   m23, m24, q3232      ; a21 a29 c21 c29
    vinserti32x8 m24, m25, ym26, 1    ; e5 e13 g5 g13
    vshufi32x4   m25, m26, q3232      ; e21 e29 g21 g29
    vinserti32x8 m26, m27, ym28, 1    ; a3 a11 c3 c11
    vshufi32x4   m27, m28, q3232      ; a19 a27 c19 c27
    vinserti32x8 m28, m29, ym13, 1    ; e3 e11 g3 g11
    vshufi32x4   m29, m13, q3232      ; e19 e27 g19 g27
    vshufi32x4   m0, m22, m24, q2020  ; 5
    vshufi32x4   m1, m27, m29, q3131  ; 27
    vshufi32x4   m2, m23, m25, q2020  ; 21
    vshufi32x4   m3, m26, m28, q3131  ; 11
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    vshufi32x4   m0, m26, m28, q2020  ; 3
    vshufi32x4   m1, m23, m25, q3131  ; 29
    vshufi32x4   m2, m27, m29, q2020  ; 19
    vshufi32x4   m3, m22, m24, q3131  ; 13
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
ALIGN function_align
.pass2_fast:
    ; Pass 2 when only the low-frequency quadrant is significant: gather the
    ; 16 live coefficient rows (lane comments give the coefficient index),
    ; then use the _fast / _fast2 helper variants which assume the upper
    ; inputs are zero.  Results are stored to [r3] in the same layout as
    ; .pass2_end uses for [r6].
    vshufi32x4   m23, m1, m16, q3131  ; 6
    vshufi32x4   m22, m1, m16, q2020  ; 2
    vshufi32x4   m14, m0, m3, q3131   ; 4
    vshufi32x4   m26, m0, m3, q2020   ; 0
    vshufi32x4   m28, m9, m15, q3131  ; 5
    vshufi32x4   m0, m9, m15, q2020   ; 1
    vshufi32x4   m16, m11, m17, q3131 ; 7
    vshufi32x4   m29, m11, m17, q2020 ; 3
    vshufi32x4   m15, m8, m2, q3131   ; 12
    vshufi32x4   m27, m8, m2, q2020   ; 8
    vshufi32x4   m25, m5, m19, q3131  ; 14
    vshufi32x4   m24, m5, m19, q2020  ; 10
    vshufi32x4   m3, m6, m20, q3131   ; 15
    vshufi32x4   m19, m6, m20, q2020  ; 11
    vshufi32x4   m17, m4, m18, q3131  ; 13
    vshufi32x4   m18, m4, m18, q2020  ; 9
    ; idct64 odd part on coefficient pairs (1,15), (7,9), (5,11), (3,13)
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova         m0, m16              ; 7
    mova         m3, m18              ; 9
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova         m0, m28              ; 5
    mova         m3, m19              ; 11
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova         m0, m29              ; 3
    mova         m3, m17              ; 13
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
    mova         m0, m26              ; 0
    mova         m1, m27              ; 8
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    ; spill the idct16 half of the even results
    mova         [r3+64* 0], m0
    mova         [r3+64* 1], m1
    mova         [r3+64* 2], m2
    mova         [r3+64* 3], m3
    mova         [r3+64* 4], m4
    mova         [r3+64* 5], m5
    mova         [r3+64* 6], m6
    mova         [r3+64* 7], m7
    mova         [r3+64* 8], m14
    mova         [r3+64* 9], m15
    mova         [r3+64*10], m16
    mova         [r3+64*11], m17
    mova         [r3+64*12], m18
    mova         [r3+64*13], m19
    mova         [r3+64*14], m20
    mova         [r3+64*15], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
    ; spill the idct32 odd-half results (stored in reverse order)
    mova         [r3+64*16], m29
    mova         [r3+64*17], m28
    mova         [r3+64*18], m27
    mova         [r3+64*19], m26
    mova         [r3+64*20], m25
    mova         [r3+64*21], m24
    mova         [r3+64*22], m23
    mova         [r3+64*23], m22
    mova         [r3+64*24], m21
    mova         [r3+64*25], m20
    mova         [r3+64*26], m19
    mova         [r3+64*27], m18
    mova         [r3+64*28], m17
    mova         [r3+64*29], m16
    mova         [r3+64*30], m15
    mova         [r3+64*31], m14
    ret

%endif ; ARCH_X86_64