; Copyright © 2022-2023, VideoLAN and dav1d authors
; Copyright © 2022-2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Shuffle-control tables. In the code below idct8x8p/idtx8x8p/idct8x16p/
; iadst8x16p are used as vpermb index vectors, while permB/permC supply the
; indices for vpermq/vpermd/vpermi2q/vpermt2q (often after a psrlq or an
; unaligned +N load that selects a different index byte).
idct8x8p:    db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23
             db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
             db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
             db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
idtx8x8p:    db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
             db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
             db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
             db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
idct8x16p:   db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51
             db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
             db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17
             db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25
iadst8x16p:  db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23
             db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
             db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19
             db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
permA:       db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13
             db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
             db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15
             db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
permB:       db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2
             db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6
             db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7
             db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3
permC:       db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6
             db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14
             db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7
             db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15
idct8x32p:   db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
             db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
             db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
             db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
idct32x8p:   db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25
             db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
             db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29
             db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
idtx32x8p:   db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30
             db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
             db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31
             db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63

; These three 32-byte rows must stay contiguous and in this order: the code
; loads 32 bytes from pw_2048_m2048+16 / pw_m2048_2048+16, which straddles two
; rows and yields a vector whose halves have opposite signs.
pw_2048_m2048: times 16 dw 2048
pw_m2048_2048: times 16 dw -2048
pw_2048:       times 16 dw 2048

; Emits the 32-bit constant pair (a, a, b, b) with the requested signs and
; defines pd_<a> / pd_<b> (or pd_m<..> for negated values) as aliases pointing
; into the pair, so a single constant can be broadcast from it as well.
; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4 = --
%macro COEF_PAIR 2-3 0 ; a, b, flags
%if %3 == 1
pd_%1_m%2: dd %1, %1, -%2, -%2
%define pd_%1 (pd_%1_m%2 + 4*0)
%define pd_m%2 (pd_%1_m%2 + 4*2)
%elif %3 == 2
pd_m%1_%2: dd -%1, -%1, %2, %2
%define pd_m%1 (pd_m%1_%2 + 4*0)
%define pd_%2 (pd_m%1_%2 + 4*2)
%elif %3 == 4
pd_m%1_m%2: dd -%1, -%1, -%2, -%2
%define pd_m%1 (pd_m%1_m%2 + 4*0)
%define pd_m%2 (pd_m%1_m%2 + 4*2)
%else
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
%define pd_%2 (pd_%1_%2 + 4*2)
%if %3 == 3
; flag 3 appends (-b, -b) so that pd_<b>_m<b> is also addressable
%define pd_%2_m%2 pd_%2
dd -%2, -%2
%endif
%endif
%endmacro

COEF_PAIR 101, 501
COEF_PAIR 201, 601, 1
COEF_PAIR 201, 995
COEF_PAIR 401, 1189, 1
COEF_PAIR 401, 1931
COEF_PAIR 401, 3920
COEF_PAIR 401, 4076
COEF_PAIR 700, 301, 4
COEF_PAIR 799, 2276, 1
COEF_PAIR 799, 3406
COEF_PAIR 799, 4017
COEF_PAIR 1380, 601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2598, 1931, 2
COEF_PAIR 2598, 3612
COEF_PAIR 2751, 2106
COEF_PAIR 2896, 1567, 3
COEF_PAIR 2896, 3784, 3
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 1931
COEF_PAIR 3166, 3612
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
COEF_PAIR 3857, 4052
COEF_PAIR 4017, 2276
COEF_PAIR 4017, 3406
COEF_PAIR 4036, 4085
COEF_PAIR 4076, 1189
COEF_PAIR 4076, 3612
COEF_PAIR 4076, 3920
COEF_PAIR 4091, 3973
COEF_PAIR 4091, 4052
COEF_PAIR 4095, 4065

pb_32:           times 4 db 32
pw_5:            times 2 dw 5
pw_4096:         times 2 dw 4096
pw_8192:         times 2 dw 8192
pw_1697x16:      times 2 dw 1697*16
pw_2896x8:       times 2 dw 2896*8
pixel_10bpc_max: times 2 dw 0x03ff
dconly_10bpc:    times 2 dw 0x7c00
clip_18b_min:    dd -0x20000
clip_18b_max:    dd 0x1ffff
pd_1:            dd 1
pd_2:            dd 2
pd_1448:         dd 1448
pd_2048:         dd 2048
pd_3071:         dd 3071 ; 1024 + 2048 - 1
pd_3072:         dd 3072 ; 1024 + 2048
pd_5119:         dd 5119 ; 1024 + 4096 - 1
pd_5120:         dd 5120 ; 1024 + 4096
pd_5793:         dd 5793

; Tables and kernels shared with the 8 bpc implementation.
cextern dup16_perm
cextern int8_permA
cextern idct64_mul_16bpc
cextern idct_8x8_internal_8bpc_avx512icl.main
cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
cextern idct_8x16_internal_8bpc_avx512icl.main
cextern idct_8x16_internal_8bpc_avx512icl.main2
cextern idct_8x16_internal_8bpc_avx512icl.main_fast
cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
cextern iadst_8x16_internal_8bpc_avx512icl.main2
cextern idct_16x8_internal_8bpc_avx512icl.main
cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
cextern idct_16x16_internal_8bpc_avx512icl.main
cextern idct_16x16_internal_8bpc_avx512icl.main2
cextern idct_16x16_internal_8bpc_avx512icl.main_fast
cextern idct_16x16_internal_8bpc_avx512icl.main_fast2
cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2

SECTION .text

; r5 holds a base pointer (o_base, or o_base_8bpc before calling into the
; 8 bpc kernels); o(x) addresses constant x relative to that base.
%define o_base (pw_2048+4*128)
%define o_base_8bpc (int8_permA+64*18)
%define o(x) (r5 - o_base + (x))
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

INIT_ZMM avx512icl

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 1 = inv_dst1, 2 = inv_dst2
; skip round/shift if rnd is not a number
; A coefficient < 32 is taken as a register number that already holds the
; multiplier; one < 4096 is broadcast as a single dword; anything larger
; (e.g. 799_3406) names a COEF_PAIR and is broadcast as a 128-bit pair.
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
%if %8 < 32
    pmulld          m%4, m%1, m%8
    pmulld          m%3, m%2, m%8
%else
%if %8 < 4096
    vpbroadcastd    m%3, [o(pd_%8)]
%else
    vbroadcasti32x4 m%3, [o(pd_%8)]
%endif
    pmulld          m%4, m%1, m%3
    pmulld          m%3, m%2
%endif
%if %7 < 32
    pmulld          m%1, m%7
    pmulld          m%2, m%7
%else
%if %7 < 4096
    vpbroadcastd    m%5, [o(pd_%7)]
%else
    vbroadcasti32x4 m%5, [o(pd_%7)]
%endif
    pmulld          m%1, m%5
    pmulld          m%2, m%5
%endif
%if %9 & 2
    psubd           m%4, m%6, m%4
    psubd           m%2, m%4, m%2
%else
%ifnum %6
    paddd           m%4, m%6
%endif
    paddd           m%2, m%4
%endif
%ifnum %6
    paddd           m%1, m%6
%endif
%if %9 & 1
    psubd           m%1, m%3, m%1
%else
    psubd           m%1, m%3
%endif
%ifnum %6
    psrad           m%2, 12
    psrad           m%1, 12
%endif
%endmacro

; Declares the public inv_txfm_add_<type1>_<type2>_<size>_10bpc entry point.
; It loads r5 with o_base and tx2q with the address of the second-pass
; routine, then transfers control to the first-pass routine. For dct_dct the
; jump is only taken when eob != 0; otherwise execution falls through into the
; DC-only code that the size-specific wrapper macro places after this one.
%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
    %define %%p1 m(i%1_%4_internal_10bpc)
    lea             r5, [o_base]
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea             tx2q, [m(i%2_%4_internal_10bpc).pass2]
%ifidn %1_%2, dct_dct
    test            eobd, eobd
    jnz             %%p1
%else
%if %3
    add             eobd, %3
%endif
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro

; 8x8 entry points. The dct_dct instance additionally carries the DC-only
; path: the single coefficient is scaled, cleared in the buffer and added to
; every pixel, two rows per iteration, for r3d rows.
%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN     %1, %2, %3, 8x8
%ifidn %1_%2, dct_dct
    imul            r6d, [cq], 181
    mov             [cq], eobd ; 0
    or              r3d, 8
.dconly:
    add             r6d, 384
    sar             r6d, 9
.dconly2:
    vpbroadcastd    ym2, [o(dconly_10bpc)]
    imul            r6d, 181
    add             r6d, 2176
    sar             r6d, 12
    vpbroadcastw    ym1, r6d
    ; biasing by 0x7c00 lets paddsw/psubusw saturate to [0, 0x3ff]
    paddsw          ym1, ym2
.dconly_loop:
    mova            xm0, [dstq+strideq*0]
    vinserti32x4    ym0, [dstq+strideq*1], 1
    paddsw          ym0, ym1
    psubusw         ym0, ym2
    mova            [dstq+strideq*0], xm0
    vextracti32x4   [dstq+strideq*1], ym0, 1
    lea             dstq, [dstq+strideq*2]
    sub             r3d, 2
    jg              .dconly_loop
    RET
%endif
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
INV_TXFM_8X8_FN dct, identity

; 8x8 DCT. Pass 1 works on 32-bit coefficients and packs the result to 16 bits;
; pass 2 calls the 8 bpc kernel and then adds to dst, clamping to
; [0, pixel_10bpc_max] and zeroing the coefficient buffer along the way.
cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call            .load
    vpermi2q        m1, m0, m2 ; 1 5
    vpermi2q        m3, m6, m4 ; 7 3
    vpermt2q        m0, m5, m4 ; 0 2
    vpermt2q        m2, m5, m6 ; 4 6
    call            .main
    call            .main_end
    mova            m4, [o(idct8x8p)]
    packssdw        m0, m2 ; 0 1 4 5
    packssdw        m1, m3 ; 3 2 7 6
    vpermb          m0, m4, m0
    vprolq          m1, 32
    vpermb          m2, m4, m1
    punpckhdq       m1, m0, m2
    punpckldq       m0, m2
    jmp             tx2q
.pass2:
    lea             r5, [o_base_8bpc]
    vextracti32x8   ym2, m0, 1
    vextracti32x8   ym3, m1, 1
    call            m(idct_8x8_internal_8bpc).main
    mova            m10, [permC]
    vpbroadcastd    m12, [pw_2048]
.end:
    vpermt2q        m0, m10, m1
    vpermt2q        m2, m10, m3
.end2:
    vpbroadcastd    m11, [pixel_10bpc_max]
    lea             r6, [strideq*3]
    pxor            m10, m10
    pmulhrsw        m8, m12, m0
    call            .write_8x4_start
    pmulhrsw        m8, m12, m2
.write_8x4:
    lea             dstq, [dstq+strideq*4]
    add             cq, 64*2
.write_8x4_start:
    ; in: m8 = 4 rows of residual, m10 = 0, m11 = pixel max
    mova            xm9, [dstq+strideq*0]
    vinserti32x4    ym9, [dstq+strideq*1], 1
    vinserti32x4    m9, [dstq+strideq*2], 2
    vinserti32x4    m9, [dstq+r6], 3
    mova            [cq+64*0], m10
    mova            [cq+64*1], m10
    paddw           m9, m8
    pmaxsw          m9, m10
    pminsw          m9, m11
    mova            [dstq+strideq*0], xm9
    vextracti32x4   [dstq+strideq*1], ym9, 1
    vextracti32x4   [dstq+strideq*2], m9, 2
    vextracti32x4   [dstq+r6], m9, 3
    ret
ALIGN function_align
.load:
    ; loads the coefficients and the constants shared by the 8x8 kernels:
    ; m11 = 1, m12 = 2896, m13 = 2048, m14/m15 = 18-bit clip min/max
    mova            m0, [cq+64*0] ; 0 1
    mova            m4, [cq+64*1] ; 2 3
    mova            m1, [o(permB)]
    mova            m2, [cq+64*2] ; 4 5
    mova            m6, [cq+64*3] ; 6 7
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    psrlq           m5, m1, 32
    vpbroadcastd    m12, [o(pd_2896)]
    mova            m3, m1
    vpbroadcastd    m11, [o(pd_1)]
    ret
ALIGN function_align
.main_fast: ; bottom half is zero
    vbroadcasti32x4 m3, [o(pd_4017_3406)]
    vbroadcasti32x4 m8, [o(pd_799_m2276)]
    vbroadcasti32x4 m2, [o(pd_2896_3784)]
    vbroadcasti32x4 m9, [o(pd_2896_1567)]
    pmulld          m3, m1 ; t4a t5a
    pmulld          m1, m8 ; t7a t6a
    pmulld          m2, m0 ; t0 t3
    pmulld          m0, m9 ; t1 t2
    jmp             .main2
.main:
    ITX_MULSUB_2D   1, 3, 8, 9, 10, _, 799_3406, 4017_2276
    ITX_MULSUB_2D   0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
.main2:
    REPX            {paddd x, m13}, m1, m3, m0, m2
    REPX            {psrad x, 12 }, m1, m3, m0, m2
    punpcklqdq      m8, m1, m3 ; t4a t7a
    punpckhqdq      m1, m3 ; t5a t6a
    psubd           m3, m8, m1 ; t5a t6a
    paddd           m8, m1 ; t4 t7
    pmaxsd          m3, m14
    punpckhqdq      m1, m2, m0 ; t3 t2
    pminsd          m3, m15
    punpcklqdq      m2, m0 ; t0 t1
    pmulld          m3, m12
    paddd           m0, m2, m1 ; dct4 out0 out1
    psubd           m2, m1 ; dct4 out3 out2
    REPX            {pmaxsd x, m14}, m8, m0, m2
    REPX            {pminsd x, m15}, m8, m0, m2
.main3:
    pshufd          m1, m3, q1032
    paddd           m3, m13
    psubd           m9, m3, m1
    paddd           m3, m1
    psrad           m9, 12
    psrad           m3, 12
    punpckhqdq      m1, m8, m3 ; t7 t6
    shufpd          m8, m9, 0xaa ; t4 t5
    ret
.main_end:
    ; final butterfly with rounding: (x + 1) >> 1, using m11 = 1 as both the
    ; bias and the per-element shift count
    paddd           m0, m11
    paddd           m2, m11
    psubd           m3, m0, m1 ; out7 out6
    paddd           m0, m1 ; out0 out1
    paddd           m1, m2, m8 ; out3 out2
    psubd           m2, m8 ; out4 out5
    REPX            {vpsravd x, m11}, m0, m2, m3, m1
    ret

INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity
INV_TXFM_8X8_FN adst, adst

; 8x8 ADST. Shares .load and the output stage of idct_8x8_internal_10bpc.
cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call            m(idct_8x8_internal_10bpc).load
    vpermi2q        m1, m6, m2 ; 7 5
    vpermi2q        m3, m4, m0 ; 3 1
    vpermt2q        m0, m5, m4 ; 0 2
    vpermt2q        m2, m5, m6 ; 4 6
    call            .main
    punpckldq       m1, m2, m4 ; out4 out6
    punpckhdq       m2, m0 ; -out5 -out7
    punpckldq       m0, m3 ; out0 out2
    punpckhdq       m4, m3 ; -out1 -out3
    ; negated outputs are fixed up by subtracting from the rounding bias
    paddd           m1, m11
    psubd           m3, m11, m2
    paddd           m0, m11
    psubd           m4, m11, m4
.pass1_end:
    REPX            {psrad x, 1}, m1, m0, m3, m4
    packssdw        m0, m1 ; 0 2 4 6
    packssdw        m4, m3 ; 1 3 5 7
    psrlq           m1, [o(permB)], 8
    punpckhwd       m3, m0, m4
    punpcklwd       m0, m4
    psrlq           m2, m1, 32
    vpermi2q        m1, m0, m3
    vpermt2q        m0, m2, m3
    jmp             tx2q
.pass2:
    call            .main_pass2
    movu            m10, [permC+2]
    vbroadcasti32x8 m12, [pw_2048_m2048+16]
    jmp             m(idct_8x8_internal_10bpc).end
.main_pass2:
    vextracti32x8   ym2, m0, 1
    vextracti32x8   ym3, m1, 1
    lea             r5, [o_base_8bpc]
    pshufd          ym4, ym0, q1032
    pshufd          ym5, ym1, q1032
    jmp             m(iadst_8x8_internal_8bpc).main_pass2
ALIGN function_align
.main:
    ITX_MULSUB_2D   1, 0, 4, 5, 6, 13, 401_1931, 4076_3612
    ITX_MULSUB_2D   3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
    psubd           m4, m0, m2 ; t4 t6
    paddd           m0, m2 ; t0 t2
    psubd           m2, m1, m3 ; t5 t7
    paddd           m1, m3 ; t1 t3
    REPX            {pmaxsd x, m14}, m4, m2, m0, m1
    REPX            {pminsd x, m15}, m4, m2, m0, m1
    pxor            m5, m5
    psubd           m5, m4
    shufpd          m4, m2, 0xaa ; t4 t7
    shufpd          m2, m5, 0xaa ; t5 -t6
    ITX_MULSUB_2D   4, 2, 3, 5, 6, 13, 1567, 3784
    punpckhqdq      m3, m0, m1
    punpcklqdq      m0, m1
    psubd           m1, m0, m3 ; t2 t3
    paddd           m0, m3 ; out0 -out7
    punpckhqdq      m3, m4, m2 ; t7a t6a
    punpcklqdq      m4, m2 ; t5a t4a
    psubd           m2, m4, m3 ; t7 t6
    paddd           m4, m3 ; out6 -out1
    REPX            {pmaxsd x, m14}, m1, m2
    REPX            {pminsd x, m15}, m1, m2
    shufpd          m3, m1, m2, 0xaa
    shufpd          m1, m2, 0x55
    pmulld          m3, m12
    pmulld          m1, m12
    paddd           m3, m13
    psubd           m2, m3, m1
    paddd           m3, m1
    psrad           m2, 12 ; out4 -out5
    pshufd          m3, m3, q1032
    psrad           m3, 12 ; out2 -out3
    ret

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, identity
INV_TXFM_8X8_FN flipadst, flipadst

; 8x8 flipped ADST: same kernel as iadst_8x8, outputs taken in reverse order.
cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call            m(idct_8x8_internal_10bpc).load
    vpermi2q        m1, m6, m2 ; 7 5
    vpermi2q        m3, m4, m0 ; 3 1
    vpermt2q        m0, m5, m4 ; 0 2
    vpermt2q        m2, m5, m6 ; 4 6
    call            m(iadst_8x8_internal_10bpc).main
    punpckhdq       m1, m3, m4 ; -out3 -out1
    punpckldq       m3, m0 ; out2 out0
    punpckhdq       m0, m2 ; -out7 -out5
    punpckldq       m4, m2 ; out6 out4
    psubd           m1, m11, m1
    paddd           m3, m11
    psubd           m0, m11, m0
    paddd           m4, m11
    jmp             m(iadst_8x8_internal_10bpc).pass1_end
.pass2:
    call            m(iadst_8x8_internal_10bpc).main_pass2
    movu            m10, [permC+1]
    vbroadcasti32x8 m12, [pw_m2048_2048+16]
    lea             r6, [strideq*3]
    vpermt2q        m0, m10, m1 ; 7 6 5 4
    vpbroadcastd    m11, [pixel_10bpc_max]
    vpermt2q        m2, m10, m3 ; 3 2 1 0
    pxor            m10, m10
    pmulhrsw        m8, m12, m2
    call            m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw        m8, m12, m0
    jmp             m(idct_8x8_internal_10bpc).write_8x4

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; 8x8 identity: pass 1 only packs to 16 bits and reorders; pass 2 scales with
; pw_4096 in the shared output stage.
cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    mova            m1, [cq+64*0]
    packssdw        m1, [cq+64*2] ; 0 4 1 5
    mova            m2, [cq+64*1] ; 2 6 3 7
    packssdw        m2, [cq+64*3]
    mova            m0, [o(idtx8x8p)]
    vpermb          m1, m0, m1
    vpermb          m2, m0, m2
    punpckldq       m0, m1, m2 ; 0 1 4 5
    punpckhdq       m1, m2 ; 2 3 6 7
    jmp             tx2q
.pass2:
    movu            m3, [o(permC+2)]
    vpbroadcastd    m12, [o(pw_4096)]
    psrlq           m2, m3, 32
    vpermi2q        m2, m0, m1
    vpermt2q        m0, m3, m1
    jmp             m(idct_8x8_internal_10bpc).end2

; 8x16 entry points. The DC-only case applies the extra rectangular scaling
; step and then reuses the 8x8 .dconly loop with r3d = 16 rows.
%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN     %1, %2, %3, 8x16
%ifidn %1_%2, dct_dct
    imul            r6d, [cq], 181
    mov             [cq], eobd ; 0
    or              r3d, 16
    add             r6d, 128
    sar             r6d, 8
    imul            r6d, 181
    jmp             m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
%endif
%endmacro

INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity, 35
INV_TXFM_8X16_FN dct, flipadst
INV_TXFM_8X16_FN dct, adst

; 8x16 DCT. When eob < 43 the .fast path loads only the low 256 bits of each
; coefficient row and runs the 8x8 kernel for pass 1.
cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp             eobd, 43
    jl              .fast
    call            .load
    call            .main
    call            .main_end
.pass1_end:
    packssdw        m0, m4
    packssdw        m1, m5
    packssdw        m2, m6
    packssdw        m3, m7
    jmp             tx2q
.pass2:
    mova            m8, [o(idct8x16p)]
    REPX            {vpermb x, m8, x}, m0, m1, m2, m3
    punpckhdq       m5, m0, m1
    punpckldq       m0, m1
    punpckhdq       m4, m2, m3
    punpckldq       m2, m3
    punpcklqdq      m8, m0, m2 ; 15 1
    punpckhqdq      m0, m2 ; 7 9
    punpckhqdq      m1, m5, m4 ; 3 13
    punpcklqdq      m5, m4 ; 11 5
    lea             r5, [o_base_8bpc]
    vextracti32x8   ym7, m8, 1 ; 14 2
    vextracti32x8   ym3, m0, 1 ; 6 10
    vextracti32x8   ym6, m1, 1 ; 12 4
    vextracti32x8   ym9, m5, 1 ; 8 0
    call            m(idct_8x16_internal_8bpc).main2
    mova            m8, [permC]
    vpbroadcastd    m12, [pw_2048]
    vpermt2q        m0, m8, m1
    lea             r6, [strideq*3]
    vpermt2q        m2, m8, m3
    vpbroadcastd    m11, [pixel_10bpc_max]
    vpermt2q        m4, m8, m5
    pxor            m10, m10
    vpermt2q        m6, m8, m7
    pmulhrsw        m8, m12, m0
    call            m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw        m8, m12, m2
    call            m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw        m8, m12, m4
    call            m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw        m8, m12, m6
    jmp             m(idct_8x8_internal_10bpc).write_8x4
.fast:
    mova            ym0, [cq+64*0]
    mova            ym4, [cq+64*2]
    mova            ym1, [cq+64*1]
    mova            ym5, [cq+64*5]
    mova            ym2, [cq+64*4]
    mova            ym6, [cq+64*6]
    mova            ym3, [cq+64*7]
    mova            ym7, [cq+64*3]
    call            .round_input_fast
    call            m(idct_8x8_internal_10bpc).main
    call            m(idct_8x8_internal_10bpc).main_end
    movu            m6, [o(permC+3)]
    packssdw        m3, m1, m3
    packssdw        m1, m0, m2
    vprolq          m3, 32
    vpermd          m1, m6, m1
    vpermd          m3, m6, m3
    mova            ym0, ym1 ; 0 4
    vextracti32x8   ym1, m1, 1 ; 1 5
    mova            ym2, ym3 ; 2 6
    vextracti32x8   ym3, m3, 1 ; 3 7
    jmp             tx2q
ALIGN function_align
.round_input_fast:
    ; merges the eight ymm rows into m0-m3, applies the rect2 scaling
    ; (x * 2896 + 2048) >> 12 and sets up m11-m15 as in the 8x8 .load
    movshdup        m8, [o(permB)]
    vpbroadcastd    m12, [o(pd_2896)]
    vpermt2q        m0, m8, m4
    vpermt2q        m1, m8, m5
    vpermt2q        m2, m8, m6
    vpermt2q        m3, m8, m7
    vpbroadcastd    m13, [o(pd_2048)]
    REPX            {pmulld x, m12}, m0, m1, m2, m3
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    REPX            {paddd x, m13}, m0, m1, m2, m3
    vpbroadcastd    m11, [o(pd_1)]
    REPX            {psrad x, 12 }, m0, m1, m2, m3
    ret
ALIGN function_align
.load:
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
.load2:
    vpbroadcastd    m12, [o(pd_2896)]
    pmulld          m0, m12, [cq+64*0]
    pmulld          m1, m12, [cq+64*1]
    pmulld          m2, m12, [cq+64*2]
    pmulld          m3, m12, [cq+64*3]
    vpbroadcastd    m13, [o(pd_2048)]
    pmulld          m4, m12, [cq+64*4]
    pmulld          m5, m12, [cq+64*5]
    pmulld          m6, m12, [cq+64*6]
    pmulld          m7, m12, [cq+64*7]
.round:
    REPX            {paddd x, m13}, m0, m1, m2, m3
    REPX            {psrad x, 12 }, m0, m1, m2, m3
    REPX            {paddd x, m13}, m4, m5, m6, m7
    REPX            {psrad x, 12 }, m4, m5, m6, m7
    ret
ALIGN function_align
.main_fast2_rect2:
    REPX            {paddd x, m13}, m0, m1
    REPX            {psrad x, 12 }, m0, m1
.main_fast2:
    ; only m0 and m1 are read as inputs here
    pmulld          m0, m12
    pmulld          m6, m1, [o(pd_4017)] {1to16} ; t7a
    pmulld          m8, m1, [o(pd_799)] {1to16} ; t4a
    REPX            {paddd x, m13}, m0, m6, m8
    REPX            {psrad x, 12 }, m0, m6, m8
    pmulld          m5, m6, m12
    pmulld          m1, m8, m12
    paddd           m5, m13
    psubd           m4, m5, m1
    paddd           m5, m1
    REPX            {psrad x, 12 }, m4, m5
    REPX            {mova x, m0 }, m1, m2, m3
    ret
.main_fast_rect2:
    REPX            {paddd x, m13}, m0, m1, m2, m3
    REPX            {psrad x, 12 }, m0, m1, m2, m3
.main_fast:
    ; only m0-m3 are read as inputs here
    pmulld          m0, m12
    pmulld          m5, m3, [o(pd_2276)] {1to16} ; t5a
    pmulld          m3, [o(pd_3406)] {1to16} ; t6a
    pmulld          m7, m1, [o(pd_4017)] {1to16} ; t7a
    pmulld          m1, [o(pd_799)] {1to16} ; t4a
    pmulld          m6, m2, [o(pd_3784)] {1to16} ; t3
    pmulld          m2, [o(pd_1567)] {1to16} ; t2
    paddd           m0, m13
    psubd           m5, m13, m5
    psrad           m0, 12 ; t0
    mova            m9, m0 ; t1
    jmp             .main2
.main_rect2:
    call            .round
.main:
    pmulld          m0, m12
    ITX_MULSUB_2D   5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
    ITX_MULSUB_2D   1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
    ITX_MULSUB_2D   2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
    pmulld          m4, m12
    paddd           m0, m13
    paddd           m5, m13
    psubd           m9, m0, m4 ; t1
    paddd           m0, m4 ; t0
    psrad           m9, 12
    psrad           m0, 12
.main2:
    REPX            {paddd x, m13}, m3, m1, m7
    REPX            {psrad x, 12 }, m5, m1, m3, m7
    paddd           m8, m1, m5 ; t4
    psubd           m1, m5 ; t5a
    psubd           m5, m7, m3 ; t6a
    paddd           m7, m3 ; t7
    pmaxsd          m5, m14
    pmaxsd          m1, m14
    paddd           m2, m13
    paddd           m6, m13
    pminsd          m5, m15
    pminsd          m1, m15
    pmulld          m5, m12
    pmulld          m1, m12
    pmaxsd          m8, m14
    pmaxsd          m7, m14
    pminsd          m8, m15
    paddd           m5, m13
    psubd           m4, m5, m1
    paddd           m5, m1
    REPX            {psrad x, 12 }, m2, m6, m5, m4
    paddd           m1, m9, m2 ; dct4 out1
    psubd           m2, m9, m2 ; dct4 out2
    psubd           m3, m0, m6 ; dct4 out3
    paddd           m0, m6 ; dct4 out0
    pminsd          m6, m15, m7
    REPX            {pmaxsd x, m14}, m0, m1, m2, m3
    REPX            {pminsd x, m15}, m0, m1, m2, m3
    ret
.main_end:
    vpbroadcastd    m11, [o(pd_1)]
.main_end2:
    REPX            {paddd x, m11}, m0, m1, m2, m3
    psubd           m7, m0, m6 ; out7
    paddd           m0, m6 ; out0
    psubd           m6, m1, m5 ; out6
    paddd           m1, m5 ; out1
    psubd           m5, m2, m4 ; out5
    paddd           m2, m4 ; out2
    psubd           m4, m3, m8 ; out4
    paddd           m3, m8 ; out3
    REPX            {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, identity, 35
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, adst

; 8x16 ADST. The eob < 43 path reuses the 8x8 ADST kernel for pass 1.
cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp             eobd, 43
    jl              .fast
    call            m(idct_8x16_internal_10bpc).load
    call            .main
    psrad           m0, 1
    psrad           m1, 1
    psrad           m6, m10, 1
    psrad           m7, m11, 1
    psrad           m2, 12
    psrad           m3, 12
    psrad           m4, m8, 12
    psrad           m5, m9, 12
    jmp             m(idct_8x16_internal_10bpc).pass1_end
.fast:
    call            .fast_main
    punpcklqdq      m1, m2, m4 ; out4 out6
    punpckhqdq      m2, m0 ; -out5 -out7
    punpcklqdq      m0, m3 ; out0 out2
    punpckhqdq      m4, m3 ; -out1 -out3
    paddd           m1, m11
    psubd           m3, m11, m2
    paddd           m0, m11
    psubd           m4, m11, m4
.fast_end:
    movu            m5, [o(permC+3)]
    REPX            {psrad x, 1}, m1, m0, m3, m4
    packssdw        m2, m0, m1 ; 0 2 4 6
    packssdw        m3, m4, m3 ; 1 3 5 7
    vpermd          m2, m5, m2
    vpermd          m3, m5, m3
    mova            ym0, ym2
    vextracti32x8   ym2, m2, 1
    mova            ym1, ym3
    vextracti32x8   ym3, m3, 1
    jmp             tx2q
.pass2:
    call            .pass2_main
    movu            m4, [permB+2]
    vbroadcasti32x8 m12, [pw_2048_m2048+16]
    psrlq           m7, m4, 8
    vpermi2q        m4, m0, m3 ; 0 1 2 3
    psrlq           m5, m7, 24
    vpermi2q        m7, m0, m3 ; 12 13 14 15
    psrlq           m6, m5, 8
    vpermq          m5, m5, m1 ; 4 5 6 7
    vpermq          m6, m6, m2 ; 8 9 10 11
.pass2_end:
    vpbroadcastd    m11, [pixel_10bpc_max]
    pxor            m10, m10
    lea             r6, [strideq*3]
    pmulhrsw        m8, m12, m4
    call            m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw        m8, m12, m5
    call            m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw        m8, m12, m6
    call            m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw        m8, m12, m7
    jmp             m(idct_8x8_internal_10bpc).write_8x4
ALIGN function_align
.main:
    ; Results are returned pre-biased but not yet shifted: the caller applies
    ; >> 1 to m0, m1, m10, m11 and >> 12 to m2, m3, m8, m9.
    ITX_MULSUB_2D   7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a
    ITX_MULSUB_2D   1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
    ITX_MULSUB_2D   5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
    ITX_MULSUB_2D   3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
    psubd           m8, m2, m6 ; t6
    paddd           m2, m6 ; t2
    psubd           m6, m0, m4 ; t4
    paddd           m0, m4 ; t0
    psubd           m4, m5, m1 ; t7
    paddd           m5, m1 ; t3
    psubd           m1, m7, m3 ; t5
    paddd           m7, m3 ; t1
    REPX            {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
    REPX            {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
    vpbroadcastd    m10, [o(pd_1567)]
    vpbroadcastd    m11, [o(pd_3784)]
    ITX_MULSUB_2D   6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
    ITX_MULSUB_2D   4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
    vpbroadcastd    m12, [o(pd_1448)]
    psubd           m9, m6, m8 ; t7
    paddd           m6, m8 ; out6
    psubd           m3, m7, m5 ; t3
    paddd           m7, m5 ; -out7
    psubd           m5, m0, m2 ; t2
    paddd           m0, m2 ; out0
    psubd           m2, m1, m4 ; t6
    paddd           m1, m4 ; -out1
    REPX            {pmaxsd x, m14}, m5, m3, m2, m9
    REPX            {pminsd x, m15}, m5, m3, m2, m9
    REPX            {pmulld x, m12}, m5, m3, m2, m9
    vpbroadcastd    m4, [o(pd_1)]
    psubd           m8, m5, m3 ; (t2 - t3) * 1448
    paddd           m3, m5 ; (t2 + t3) * 1448
    psubd           m5, m2, m9 ; (t6 - t7) * 1448
    paddd           m2, m9 ; (t6 + t7) * 1448
    vpbroadcastd    m9, [o(pd_3072)]
    paddd           m0, m4
    psubd           m1, m4, m1
    paddd           m10, m6, m4
    psubd           m11, m4, m7
    paddd           m2, m9
    paddd           m8, m9
    vpbroadcastd    m9, [o(pd_3071)]
    psubd           m3, m9, m3
    psubd           m9, m5
    ret
ALIGN function_align
.fast_main:
    mova            ym0, [cq+64*0]
    mova            ym4, [cq+64*2]
    mova            ym1, [cq+64*7]
    mova            ym5, [cq+64*5]
    mova            ym2, [cq+64*4]
    mova            ym6, [cq+64*6]
    mova            ym3, [cq+64*3]
    mova            ym7, [cq+64*1]
    call            m(idct_8x16_internal_10bpc).round_input_fast
    jmp             m(iadst_8x8_internal_10bpc).main
ALIGN function_align
.pass2_main:
    mova            m8, [o(iadst8x16p)]
    REPX            {vpermb x, m8, x}, m0, m1, m2, m3
    vpbroadcastd    m10, [o(pw_2896x8)]
    punpckhdq       m5, m0, m1
    punpckldq       m0, m1
    punpckhdq       m1, m2, m3
    punpckldq       m2, m3
    lea             r5, [o_base_8bpc]
    punpckhqdq      m4, m0, m2 ; 12 3 14 1
    punpcklqdq      m0, m2 ; 0 15 2 13
    punpckhqdq      m6, m5, m1 ; 8 7 10 5
    punpcklqdq      m5, m1 ; 4 11 6 9
    call            m(iadst_8x16_internal_8bpc).main2
    paddsw          m1, m2, m4
    psubsw          m2, m4
    pmulhrsw        m1, m10 ; -out7 out4 out6 -out5
    pmulhrsw        m2, m10 ; out8 -out11 -out9 out10
    ret

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, identity, 35
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst

; 8x16 flipped ADST: same kernels as iadst_8x16, outputs taken in reverse.
cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp             eobd, 43
    jl              .fast
    call            m(idct_8x16_internal_10bpc).load
    call            m(iadst_8x16_internal_10bpc).main
    psrad           m7, m0, 1
    psrad           m0, m11, 1
    psrad           m6, m1, 1
    psrad           m1, m10, 1
    psrad           m5, m2, 12
    psrad           m2, m9, 12
    psrad           m4, m3, 12
    psrad           m3, m8, 12
    jmp             m(idct_8x16_internal_10bpc).pass1_end
.fast:
    call            m(iadst_8x16_internal_10bpc).fast_main
    punpckhqdq      m1, m3, m4 ; -out3 -out1
    punpcklqdq      m3, m0 ; out2 out0
    punpckhqdq      m0, m2 ; -out7 -out5
    punpcklqdq      m4, m2 ; out6 out4
    psubd           m1, m11, m1
    paddd           m3, m11
    psubd           m0, m11, m0
    paddd           m4, m11
    jmp             m(iadst_8x16_internal_10bpc).fast_end
.pass2:
    call            m(iadst_8x16_internal_10bpc).pass2_main
    movu            m7, [permB+2]
    vbroadcasti32x8 m12, [pw_m2048_2048+16]
    psrlq           m4, m7, 8
    vpermi2q        m7, m3, m0 ; 3 2 1 0
    psrlq           m5, m4, 24
    vpermi2q        m4, m3, m0 ; 15 14 13 12
    psrlq           m6, m5, 8
    vpermq          m5, m5, m2 ; 11 10 9 8
    vpermq          m6, m6, m1 ; 7 6 5 4
    jmp             m(iadst_8x16_internal_10bpc).pass2_end

INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; 8x16 identity. Pass 1 is just the rect2 scaling from .load2 plus packing.
; Pass 2 has its own write routine because each register holds rows that are
; four apart (see the row comments below).
cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call            m(idct_8x16_internal_10bpc).load2
    jmp             m(idct_8x16_internal_10bpc).pass1_end
.pass2:
    vpbroadcastd    m8, [o(pw_1697x16)]
    pmulhrsw        m4, m8, m0
    pmulhrsw        m5, m8, m1
    pmulhrsw        m6, m8, m2
    pmulhrsw        m7, m8, m3
    REPX            {paddsw x, x}, m0, m1, m2, m3
    paddsw          m0, m4
    paddsw          m1, m5
    paddsw          m2, m6
    paddsw          m3, m7
    vpbroadcastd    m7, [o(pw_2048)]
    punpckhwd       m4, m0, m1
    punpcklwd       m0, m1
    punpckhwd       m1, m2, m3
    punpcklwd       m2, m3
    vpbroadcastd    m6, [o(pixel_10bpc_max)]
    punpckhdq       m3, m0, m2
    punpckldq       m0, m2
    punpckldq       m2, m4, m1
    punpckhdq       m4, m1
    pxor            m5, m5
    punpckhqdq      m1, m0, m2 ; 1 5 9 13
    punpcklqdq      m0, m2 ; 0 4 8 12
    punpcklqdq      m2, m3, m4 ; 2 6 10 14
    punpckhqdq      m3, m4 ; 3 7 11 15
    lea             r6, [strideq*3]
    pmulhrsw        m0, m7
    call            .write_8x4_start
    pmulhrsw        m0, m7, m1
    call            .write_8x4
    pmulhrsw        m0, m7, m2
    call            .write_8x4
    pmulhrsw        m0, m7, m3
.write_8x4:
    add             dstq, strideq
    add             cq, 64*2
.write_8x4_start:
    ; in: m0 = residual for rows n, n+4, n+8, n+12; m5 = 0; m6 = pixel max
    mova            xm4, [dstq+strideq*0]
    vinserti32x4    ym4, [dstq+strideq*4], 1
    vinserti32x4    m4, [dstq+strideq*8], 2
    vinserti32x4    m4, [dstq+r6*4], 3
    mova            [cq+64*0], m5
    mova            [cq+64*1], m5
    paddw           m4, m0
    pmaxsw          m4, m5
    pminsw          m4, m6
    mova            [dstq+strideq*0], xm4
    vextracti32x4   [dstq+strideq*4], ym4, 1
    vextracti32x4   [dstq+strideq*8], m4, 2
    vextracti32x4   [dstq+r6*4], m4, 3
    ret

; 16x8 entry points. The dct_dct instance carries the 16-pixel-wide DC-only
; loop (one 256-bit row per lane, two rows per iteration).
%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN     %1, %2, %3, 16x8
%ifidn %1_%2, dct_dct
    imul            r6d, [cq], 181
    mov             [cq], eobd ; 0
    or              r3d, 8
.dconly:
    add             r6d, 128
    sar             r6d, 8
    imul            r6d, 181
    add             r6d, 384
    sar             r6d, 9
.dconly2:
    vpbroadcastd    m2, [o(dconly_10bpc)]
    imul            r6d, 181
    add             r6d, 2176
    sar             r6d, 12
    vpbroadcastw    m1, r6d
    paddsw          m1, m2
.dconly_loop:
    mova            ym0, [dstq+strideq*0]
    vinserti32x8    m0, [dstq+strideq*1], 1
    paddsw          m0, m1
    psubusw         m0, m2
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    lea             dstq, [dstq+strideq*2]
    sub             r3d, 2
    jg              .dconly_loop
    RET
%endif
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity, -21
INV_TXFM_16X8_FN dct, flipadst
INV_TXFM_16X8_FN dct, adst

; 16x8 DCT. Coefficients are rect2-scaled and cleared while loading; when
; eob < 43 only the first four coefficient rows are processed.
cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    vpbroadcastd    m12, [o(pd_2896)]
    pmulld          m4, m12, [cq+64*0] ; 0 1
    pmulld          m9, m12, [cq+64*1] ; 2 3
    pmulld          m8, m12, [cq+64*2] ; 4 5
    pmulld          m7, m12, [cq+64*3] ; 6 7
    vpbroadcastd    m13, [o(pd_2048)]
    pxor            m2, m2
    mova            m15, [o(permB)]
    REPX            {mova [cq+64*x], m2}, 0, 1, 2, 3
    psrlq           m0, m15, 32
    REPX            {paddd x, m13}, m4, m9, m8, m7
    vpbroadcastd    m14, [o(clip_18b_min)]
    REPX            {psrad x, 12 }, m4, m8, m9, m7
    mova            m1, m0
    vpermi2q        m0, m4, m8 ; 0 4
    cmp             eobd, 43
    jl              .fast
    pmulld          m5, m12, [cq+64*4] ; 8 9
    pmulld          m10, m12, [cq+64*5] ; 10 11
    pmulld          m11, m12, [cq+64*6] ; 12 13
    pmulld          m6, m12, [cq+64*7] ; 14 15
    REPX            {mova [cq+64*x], m2}, 4, 5, 6, 7
    REPX            {paddd x, m13}, m5, m10, m11, m6
    REPX            {psrad x, 12 }, m10, m5, m11, m6
    mova            m2, m1
    vpermi2q        m1, m9, m10 ; 2 10
    mova            m3, m2
    vpermi2q        m2, m5, m11 ; 8 12
    vpermi2q        m3, m6, m7 ; 14 6
    vpermt2q        m4, m15, m11 ; 1 13
    vpermt2q        m6, m15, m9 ; 15 3
    vpermt2q        m5, m15, m8 ; 9 5
    vpermt2q        m7, m15, m10 ; 7 11
    ; m15 doubled as the permB index vector until here
    vpbroadcastd    m15, [o(clip_18b_max)]
    call            m(idct_8x8_internal_10bpc).main
    call            .main
    jmp             .pass1_end
.fast:
    vpermi2q        m1, m9, m7 ; 2 6
    vpermt2q        m4, m15, m9 ; 1 3
    vpermt2q        m7, m15, m8 ; 7 5
    vpbroadcastd    m15, [o(clip_18b_max)]
    call            m(idct_8x8_internal_10bpc).main_fast
    call            .main_fast
.pass1_end:
    call            m(idct_8x16_internal_10bpc).main_end
    mova            m8, [o(permA)]
    psrlq           m9, m8, 8
.pass1_end2:
    mova            m10, m9
    mova            m11, m8
    call            .transpose_16x8
    jmp             tx2q
.pass2:
    lea             r5, [o_base_8bpc]
    call            m(idct_16x8_internal_8bpc).main
    movshdup        m4, [permC]
    vpbroadcastd    m11, [pw_2048]
    psrlq           m5, m4, 8
.end:
    vpbroadcastd    m13, [pixel_10bpc_max]
    pxor            m12, m12
    vpermq          m8, m4, m0
    vpermq          m9, m5, m1
    lea             r6, [strideq*3]
    call            .write_16x4
    vpermq          m8, m4, m2
    vpermq          m9, m5, m3
.write_16x4:
    pmulhrsw        m8, m11
    pmulhrsw        m9, m11
.write_16x4_noround:
    ; in: m8/m9 = residual for rows 0-1 / 2-3, m12 = 0, m13 = pixel max
    mova            ym10, [dstq+strideq*0]
    vinserti32x8    m10, [dstq+strideq*1], 1
    paddw           m8, m10
    mova            ym10, [dstq+strideq*2]
    vinserti32x8    m10, [dstq+r6], 1
    paddw           m9, m10
    pmaxsw          m8, m12
    pmaxsw          m9, m12
    pminsw          m8, m13
    pminsw          m9, m13
    mova            [dstq+strideq*0], ym8
    vextracti32x8   [dstq+strideq*1], m8, 1
    mova            [dstq+strideq*2], ym9
    vextracti32x8   [dstq+r6], m9, 1
    lea             dstq, [dstq+strideq*4]
    ret
ALIGN function_align
.main_fast: ; bottom half is zero
    vbroadcasti32x4 m6, [o(pd_4076_3920)]
    vbroadcasti32x4 m3, [o(pd_401_m1189)]
    vbroadcasti32x4 m5, [o(pd_m2598_1931)]
    vbroadcasti32x4 m9, [o(pd_3166_3612)]
    pmulld
m6, m4 ; t15a t12a 1175 pmulld m4, m3 ; t8a t11a 1176 pmulld m5, m7 ; t9a t10a 1177 pmulld m7, m9 ; t14a t13a 1178 jmp .main2 1179.main: 1180 ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189 1181 ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612 1182.main2: 1183 REPX {paddd x, m13}, m4, m6, m5, m7 1184 REPX {psrad x, 12 }, m4, m5, m6, m7 1185 paddd m9, m4, m5 ; t8 t11 1186 psubd m4, m5 ; t9 t10 1187 psubd m5, m6, m7 ; t14 t13 1188 paddd m6, m7 ; t15 t12 1189 REPX {pmaxsd x, m14}, m5, m4, m9, m6 1190 REPX {pminsd x, m15}, m5, m4, m9, m6 1191.main3: 1192 psubd m3, m0, m1 ; dct8 out7 out6 1193 paddd m0, m1 ; dct8 out0 out1 1194 vbroadcasti32x4 m7, [o(pd_3784_m3784)] 1195 pmulld m7, m5 1196 vpmulld m5, [o(pd_1567)] {1to16} 1197 paddd m1, m2, m8 ; dct8 out3 out2 1198 psubd m2, m8 ; dct8 out4 out5 1199 vbroadcasti32x4 m8, [o(pd_1567_m1567)] 1200 pmulld m8, m4 1201 vpmulld m4, [o(pd_3784)] {1to16} 1202 REPX {pmaxsd x, m14}, m0, m1 1203 REPX {pminsd x, m15}, m0, m1 1204 paddd m7, m13 1205 paddd m5, m13 1206 paddd m7, m8 1207 psubd m5, m4 1208 psrad m7, 12 ; t14a t10a 1209 psrad m5, 12 ; t9a t13a 1210 punpckhqdq m4, m9, m7 1211 punpcklqdq m8, m9, m5 1212 punpckhqdq m5, m6, m5 1213 punpcklqdq m6, m7 1214 psubd m7, m8, m4 ; t11a t10 1215 paddd m8, m4 ; t8a t9 1216 psubd m4, m6, m5 ; t12a t13 1217 paddd m6, m5 ; t15a t14 1218 REPX {pmaxsd x, m14}, m4, m7 1219 REPX {pminsd x, m15}, m4, m7 1220 pmulld m4, m12 1221 pmulld m7, m12 1222 REPX {pmaxsd x, m14}, m2, m3, m6, m8 1223 REPX {pminsd x, m15}, m2, m3, m6, m8 1224 paddd m4, m13 1225 paddd m5, m4, m7 1226 psubd m4, m7 1227 psrad m4, 12 ; t11 t10a 1228 psrad m5, 12 ; t12 t13a 1229 ret 1230ALIGN function_align 1231.transpose_16x8: 1232 packssdw m0, m4 1233 packssdw m1, m5 1234 packssdw m2, m6 1235 packssdw m3, m7 1236 vpermi2d m8, m0, m2 1237 vpermt2d m0, m9, m2 1238 vpermi2d m10, m1, m3 1239 vpermi2d m11, m1, m3 1240 punpckhwd m3, m8, m0 1241 punpcklwd m1, m8, m0 1242 punpckhwd m4, m10, m11 1243 punpcklwd m2, m10, m11 
; Last steps of idct_16x8_internal_10bpc.transpose_16x8 (the routine starts on
; the preceding lines): final dword interleave, results are left in m0-m3.
    punpckldq       m0, m1, m2
    punpckhdq       m1, m2
    punpckldq       m2, m3, m4
    punpckhdq       m3, m4
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, identity, -21
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, adst

; iadst_16x8_internal_10bpc
;   First pass: .main_pass1 leaves its results in m0-m3 and m5-m8. They are
;   rounded with pd_1 and moved into m0-m7 (four of them as pd_1 - x, i.e.
;   negated), shifted right by 1 at .pass1_end, and then handed to the
;   idct_16x8 .pass1_end2 code, which transposes and continues at tx2q.
;   .pass2 calls .main_pass2 and stores two groups of four rows through
;   idct_16x8's .write_16x4_noround.
cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    call            .main_pass1
    vpbroadcastd    m9, [o(pd_1)]
    paddd           m0, m9
    psubd           m1, m9, m1
    paddd           m2, m9
    psubd           m3, m9, m3
    paddd           m4, m9, m5
    psubd           m5, m9, m6
    paddd           m6, m9, m7
    psubd           m7, m9, m8
.pass1_end:
    ; also entered from iflipadst_16x8_internal_10bpc.
    ; Note the register roles: m9 = permA, m8 = permA shifted right by 8 bits.
    mova            m9, [o(permA)]
    psrlq           m8, m9, 8
    REPX            {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
    jmp             m(idct_16x8_internal_10bpc).pass1_end2
.pass2:
    call            .main_pass2
    vpermq          m8, m11, m0
    vpermq          m9, m11, m1
    call            m(idct_16x8_internal_10bpc).write_16x4_noround
    vpermq          m8, m11, m2
    vpermq          m9, m11, m3
    ; tail call: the `ret` of the write routine returns to this function's caller
    jmp             m(idct_16x8_internal_10bpc).write_16x4_noround
ALIGN function_align
.main_pass1:
    ; rows 0-3: multiply by pd_2896, add pd_2048, shift right by 12; the rows
    ; are cleared with m4 = 0 after being read
    vpbroadcastd    m12, [o(pd_2896)]
    pmulld          m2, m12, [cq+64*0]
    pmulld          m7, m12, [cq+64*1]
    pmulld          m1, m12, [cq+64*2]
    pmulld          m5, m12, [cq+64*3]
    vpbroadcastd    m13, [o(pd_2048)]
    pxor            m4, m4
    mova            m10, [o(permB)]
    REPX            {mova [cq+64*x], m4}, 0, 1, 2, 3
    REPX            {paddd x, m13}, m2, m7, m1, m5
    psrlq           m6, m10, 32
    REPX            {psrad x, 12 }, m2, m7, m1, m5
    mova            m0, m6
    vpermi2q        m0, m2, m7 ; 0 2
    vpermt2q        m7, m10, m2 ; 3 1
    mova            m2, m6
    vpermi2q        m2, m1, m5 ; 4 6
    vpermt2q        m5, m10, m1 ; 7 5
    ; with eob below 43, rows 4-7 are neither read nor cleared
    cmp             eobd, 43
    jl              .main_fast
    pmulld          m8, m12, [cq+64*4]
    pmulld          m3, m12, [cq+64*5]
    pmulld          m9, m12, [cq+64*6]
    pmulld          m1, m12, [cq+64*7]
    REPX            {mova [cq+64*x], m4}, 4, 5, 6, 7
    REPX            {paddd x, m13}, m8, m3, m9, m1
    REPX            {psrad x, 12 }, m8, m3, m9, m1
    mova            m4, m6
    vpermi2q        m4, m8, m3 ; 8 10
    vpermt2q        m3, m10, m8 ; 11 9
    vpermi2q        m6, m9, m1 ; 12 14
    vpermt2q        m1, m10, m9 ; 15 13
; Middle part of iadst_16x8_internal_10bpc: the .main / .main_fast / .main2
; stages. The function header and .main_pass1 are on the preceding lines and
; the remainder of .main2 follows on the next lines.
.main:
    ITX_MULSUB_2D   1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1
    ITX_MULSUB_2D   3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
    ITX_MULSUB_2D   5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
    ITX_MULSUB_2D   7, 6, 8, 9, 10, _, 3857_4052, 1380_601
    jmp             .main2
.main_fast:
    ; reached from .main_pass1 when eob is below 43, and by a jump from
    ; iadst_16x16_internal_10bpc.main_pass1_fast. Only m0, m2, m5 and m7 are
    ; used as inputs; each is multiplied by two constant vectors.
    vbroadcasti32x4 m1, [o(pd_4091_3973)]
    vbroadcasti32x4 m8, [o(pd_201_995)]
    vbroadcasti32x4 m3, [o(pd_3703_3290)]
    vbroadcasti32x4 m9, [o(pd_1751_2440)]
    vbroadcasti32x4 m4, [o(pd_2751_2106)]
    vbroadcasti32x4 m10, [o(pd_3035_3513)]
    vbroadcasti32x4 m6, [o(pd_1380_601)]
    vbroadcasti32x4 m11, [o(pd_3857_4052)]
    pmulld          m1, m0
    pmulld          m0, m8
    pmulld          m3, m2
    pmulld          m2, m9
    pmulld          m4, m5
    pmulld          m5, m10
    pmulld          m6, m7
    pmulld          m7, m11
.main2:
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    ; m1 and m3 are rounded as (m13 - x), which also negates them; the others
    ; get m13 added. All eight are then shifted right by 12.
    REPX            {psubd x, m13, x}, m1, m3
    REPX            {paddd x, m13 }, m0, m2, m4, m5, m6, m7
    REPX            {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7
    psubd           m8, m0, m4 ; t8a t10a
    paddd           m0, m4 ; t0a t2a
    psubd           m4, m1, m5 ; t9a t11a
    paddd           m1, m5 ; t1a t3a
    psubd           m5, m2, m6 ; t12a t14a
    paddd           m2, m6 ; t4a t6a
    psubd           m6, m3, m7 ; t13a t15a
    paddd           m3, m7 ; t5a t7a
    ; differences are clipped to [m14, m15] before they are multiplied
    REPX            {pmaxsd x, m14}, m8, m4, m5, m6
    REPX            {pminsd x, m15}, m8, m4, m5, m6
    vbroadcasti32x4 m11, [o(pd_4017_2276)]
    vbroadcasti32x4 m10, [o(pd_799_3406)]
    ITX_MULSUB_2D   8, 4, 7, 9, _, 13, 10, 11
    ITX_MULSUB_2D   6, 5, 7, 9, _, 13, 11, 10
    REPX            {pmaxsd x, m14}, m0, m2, m1, m3
    REPX            {pminsd x, m15}, m0, m2, m1, m3
    psubd           m7, m0, m2 ; t4 t6
    paddd           m0, m2 ; t0 t2
    psubd           m2, m1, m3 ; t5 t7
    paddd           m1, m3 ; t1 t3
    psubd           m3, m4, m6 ; t12a t14a
    paddd           m4, m6 ; t8a t10a
    psubd           m6, m8, m5 ; t13a t15a
    paddd           m8, m5 ; t9a t11a
    REPX            {pmaxsd x, m14}, m7, m3, m2, m6
    REPX            {pminsd x, m15}, m7, m3, m2, m6
    punpcklqdq      m5, m3, m7 ; t12a t4
    punpckhqdq      m3, m7 ; t14a t6
; Remainder of iadst_16x8_internal_10bpc.main2 (the stage begins on the
; preceding lines).
    punpckhqdq      m7, m6, m2 ; t15a t7
    punpcklqdq      m6, m2 ; t13a t5
    vpbroadcastd    m11, [o(pd_1567)]
    vpbroadcastd    m10, [o(pd_3784)]
    ITX_MULSUB_2D   7, 3, 2, 9, 10, 13, 10, 11
    ITX_MULSUB_2D   5, 6, 2, 9, 10, 13, 11, 10
    REPX            {pmaxsd x, m14}, m0, m4, m1, m8
    REPX            {pminsd x, m15}, m0, m4, m1, m8
    punpckhqdq      m2, m4, m0 ; t10a t2
    punpcklqdq      m4, m0 ; t8a t0
    punpckhqdq      m0, m8, m1 ; t11a t3
    punpcklqdq      m8, m1 ; t9a t1
    paddd           m1, m6, m7 ; out2 -out3
    psubd           m6, m7 ; t14a t6
    paddd           m7, m5, m3 ; -out13 out12
    psubd           m5, m3 ; t15a t7
    psubd           m3, m8, m0 ; t11 t3a
    paddd           m8, m0 ; out14 -out15
    paddd           m0, m4, m2 ; -out1 out0
    psubd           m4, m2 ; t10 t2a
    REPX            {pmaxsd x, m14}, m6, m5, m3, m4
    ; k1 = 0x3333 selects dword lanes 0 and 1 of every 128-bit group
    mov             r6d, 0x3333
    REPX            {pminsd x, m15}, m6, m5, m3, m4
    kmovw           k1, r6d
    REPX            {pmulld x, m12}, m6, m5, m3, m4
    pxor            m9, m9
    ; merge-masked 0 - x: only the lanes selected by k1 are negated
    REPX            {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
    paddd           m6, m13
    paddd           m4, m13
    paddd           m2, m6, m5 ; -out5 out4
    psubd           m6, m5 ; out10 -out11
    psubd           m5, m4, m3 ; -out9 out8
    paddd           m3, m4 ; out6 -out7
    REPX            {psrad x, 12}, m2, m3, m5, m6
    REPX            {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
    ret
ALIGN function_align
.main_pass2:
    ; Second-pass helper shared with iflipadst_16x8_internal_10bpc: points r5
    ; at o_base_8bpc, passes copies of m0/m1 with the two 64-bit halves of each
    ; 128-bit lane swapped (pshufd q1032) in m4/m5, and calls the 8bpc routine.
    ; Afterwards it prepares what the callers need for storing: m11 = permC
    ; (via movshdup), m12 = 0, m13 = pixel_10bpc_max, r6 = strideq*3.
    lea             r5, [o_base_8bpc]
    pshufd          m4, m0, q1032
    pshufd          m5, m1, q1032
    call            m(iadst_16x8_internal_8bpc).main_pass2
    movshdup        m11, [permC]
    ; NOTE(review): m6 is presumably a scale factor left behind by the 8bpc
    ; routine - not visible here, confirm.
    pmulhrsw        m0, m6
    pmulhrsw        m1, m6
    vpbroadcastd    m13, [pixel_10bpc_max]
    pxor            m12, m12
    lea             r6, [strideq*3]
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, identity, -21
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst

; iflipadst_16x8_internal_10bpc
;   First pass: reuses iadst_16x8's .main_pass1, then applies the same pd_1
;   rounding as iadst_16x8 while assigning the results to m0-m7 in the
;   opposite order, and joins iadst_16x8's .pass1_end.
;   Second pass: reuses iadst_16x8's .main_pass2, shifts the permutation in
;   m11 right by 8 bits and stores m3/m2 before m1/m0.
cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call            m(iadst_16x8_internal_10bpc).main_pass1
    vpbroadcastd    m9, [o(pd_1)]
    psubd           m4, m9, m3
    paddd           m3, m9, m5
    paddd           m5, m9, m2
    psubd           m2, m9, m6
    psubd           m6, m9, m1
    paddd           m1, m9, m7
    paddd           m7, m9, m0
    psubd           m0, m9, m8
    jmp             m(iadst_16x8_internal_10bpc).pass1_end
.pass2:
    call            m(iadst_16x8_internal_10bpc).main_pass2
    psrlq           m11, 8
    vpermq          m8, m11, m3
    vpermq          m9, m11, m2
    call            m(idct_16x8_internal_10bpc).write_16x4_noround
    vpermq          m8, m11, m1
    vpermq          m9, m11, m0
    jmp             m(idct_16x8_internal_10bpc).write_16x4_noround

INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; iidentity_16x8_internal_10bpc
;   First pass: m0-m7 are filled by idct_8x16's .load2, multiplied by pd_5793,
;   the eight coefficient rows are cleared, and idct_8x16's .round is called
;   with m13 = pd_3072 (both helpers are defined outside this excerpt). The
;   result is transposed with permutations derived from permA.
;   Second pass: sets m4 = m5 = permC (via movshdup) and m11 = pw_4096, then
;   continues at idct_16x8's .end.
cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call            m(idct_8x16_internal_10bpc).load2
    vpbroadcastd    m8, [o(pd_5793)]
    vpbroadcastd    m13, [o(pd_3072)]
    pxor            m10, m10
    REPX            {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX            {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
    call            m(idct_8x16_internal_10bpc).round
    psrlq           m8, [o(permA)], 16
    psrlq           m9, m8, 8
    mova            m10, m8
    mova            m11, m9
    call            m(idct_16x8_internal_10bpc).transpose_16x8
    jmp             tx2q
.pass2:
    movshdup        m4, [o(permC)]
    vpbroadcastd    m11, [o(pw_4096)]
    mova            m5, m4
    jmp             m(idct_16x8_internal_10bpc).end

; INV_TXFM_16X16_FN type1, type2[, eob_offset = 0]
;   Passes its arguments plus the size tag 16x16 on to INV_TXFM_FN (defined
;   outside this excerpt). For dct_dct it also emits the DC-only setup:
;   r6d = ([cq] * 181 + 640) shifted right by 10, [cq] is overwritten with
;   eobd, 16 is or-ed into r3d, and the shared loop of the 16x8 dct_dct
;   function is entered at .dconly2.
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN     %1, %2, %3, 16x16
%ifidn %1_%2, dct_dct
    imul            r6d, [cq], 181
    mov             [cq], eobd ; 0
    or              r3d, 16
    add             r6d, 640
    sar             r6d, 10
    jmp             m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity, 28
INV_TXFM_16X16_FN dct, flipadst
INV_TXFM_16X16_FN dct, adst

; idct_16x16_internal_10bpc
;   First pass: with eob below 36 the .fast path is taken. Otherwise the even
;   rows 0..14 are loaded into m0-m7 for idct_8x16's .main and the odd rows
;   1..15 into m16-m23 for .main; .main_end combines both halves. All 16
;   coefficient rows are cleared in .zero_loop before continuing at tx2q.
;   .pass2 runs the 8bpc 16x16 main routine and stores 16 rows through
;   idct_16x8's .write_16x4_noround.
;   .pass1_end, .pass1_end2, .pass1_end3 and .pass2_end are also entered from
;   the other 16x16 functions in this file.
cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m12, [o(pd_2896)]
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    cmp             eobd, 36
    jl              .fast
    mova            m0, [cq+64* 0]
    mova            m1, [cq+64* 2]
    mova            m2, [cq+64* 4]
    mova            m3, [cq+64* 6]
    mova            m4, [cq+64* 8]
    mova            m5, [cq+64*10]
    mova            m6, [cq+64*12]
    mova            m7, [cq+64*14]
%if WIN64
    ; xmm6/xmm7 are parked in the first 32 bytes of the coefficient buffer
    ; (row 0 has already been loaded into m0) and restored at .pass1_end
    movaps          [cq+16*0], xmm6
    movaps          [cq+16*1], xmm7
%endif
    call            m(idct_8x16_internal_10bpc).main
    mova            m16, [cq+64* 1]
    mova            m17, [cq+64* 3]
    mova            m18, [cq+64* 5]
    mova            m19, [cq+64* 7]
    mova            m20, [cq+64* 9]
    mova            m21, [cq+64*11]
    mova            m22, [cq+64*13]
    mova            m23, [cq+64*15]
    call            .main
    call            .main_end
.pass1_end:
%if WIN64
    movaps          xmm6, [cq+16*0]
    movaps          xmm7, [cq+16*1]
%endif
    vzeroupper
.pass1_end2:
    call            .main_end3
.pass1_end3:
    ; clears cq+64*0 .. cq+64*15, four rows per iteration, from the top down
    mov             r6d, 64*12
    pxor            m8, m8
.zero_loop:
    mova            [cq+r6+64*3], m8
    mova            [cq+r6+64*2], m8
    mova            [cq+r6+64*1], m8
    mova            [cq+r6+64*0], m8
    sub             r6d, 64*4
    jge             .zero_loop
    jmp             tx2q
.pass2:
    lea             r5, [o_base_8bpc]
    call            m(idct_16x16_internal_8bpc).main
    movshdup        m12, [permC]
    vpbroadcastd    m11, [pw_2048]
    psrlq           m13, m12, 8
    ; reorder the outputs into the register order .pass2_end stores them in
    vpermq          m8, m12, m0
    vpermq          m0, m13, m7
    vpermq          m7, m13, m1
    vpermq          m1, m12, m6
    vpermq          m6, m12, m2
    vpermq          m2, m13, m5
    vpermq          m5, m13, m3
    vpermq          m3, m12, m4
.pass2_end:
    ; expects m11 = pmulhrsw factor; stores the pairs m8/m7, m6/m5, m3/m2 and
    ; m1/m0, four rows per call. m12 and m13 are reloaded here as the clamp
    ; range (0 and pixel_10bpc_max) for the store routine.
    lea             r6, [strideq*3]
    vpbroadcastd    m13, [pixel_10bpc_max]
    pxor            m12, m12
    pmulhrsw        m8, m11, m8
    pmulhrsw        m9, m11, m7
    call            m(idct_16x8_internal_10bpc).write_16x4_noround
    pmulhrsw        m8, m11, m6
    pmulhrsw        m9, m11, m5
    call            m(idct_16x8_internal_10bpc).write_16x4_noround
    pmulhrsw        m8, m11, m3
    pmulhrsw        m9, m11, m2
    call            m(idct_16x8_internal_10bpc).write_16x4_noround
    pmulhrsw        m8, m11, m1
    pmulhrsw        m9, m11, m0
    jmp             m(idct_16x8_internal_10bpc).write_16x4_noround
.fast:
    ; only the low 32 bytes (ymm) of rows 0-7 are read on this path
    mova            ym0, [cq+64*0]
    mova            ym2, [cq+64*4]
    movshdup        m8, [o(permB)]
    mova            ym1, [cq+64*2]
    mova            ym3, [cq+64*6]
    mova            ym4, [cq+64*1]
    mova            ym5, [cq+64*3]
    mova            ym6, [cq+64*5]
; Continuation of idct_16x16_internal_10bpc: the rest of the .fast path (its
; first loads are on the preceding lines), followed by the .main* helpers.
    mova            ym7, [cq+64*7]
    vpermt2q        m0, m8, m2 ; 0 4
    vpermt2q        m1, m8, m3 ; 2 6
    vpermt2q        m4, m8, m5 ; 1 3
    vpermt2q        m7, m8, m6 ; 7 5
    call            m(idct_8x8_internal_10bpc).main_fast
    call            m(idct_16x8_internal_10bpc).main_fast
    vpbroadcastd    m11, [o(pd_2)]
    call            m(idct_8x16_internal_10bpc).main_end2
    mova            m8, [o(permA)]
    psrlq           m9, m8, 8
    jmp             m(iadst_16x16_internal_10bpc).pass1_fast_end2
ALIGN function_align
; Entry points of the odd-half helper. Inputs are in m16-m23; m13 is used as
; the rounding constant, m14/m15 as the clip bounds and m12 as a multiplier.
; The *_rect2 entries first apply (x + m13) shifted right by 12 to their
; inputs and then fall through to the plain entry.
;   .main_fast2 reads only m16 and m17
;   .main_fast  reads only m16-m19
;   .main       reads m16-m23
.main_fast2_rect2:
    REPX            {paddd x, m13}, m16, m17
    REPX            {psrad x, 12 }, m16, m17
.main_fast2:
    pmulld          m22, m16, [o(pd_4076)] {1to16} ; t15a
    pmulld          m9, m16, [o(pd_401)] {1to16} ; t8a
    pmulld          m18, m17, [o(pd_1189)] {1to16} ; t11a
    pmulld          m17, [o(pd_3920)] {1to16} ; t12a
    ; m18 is rounded as (m13 - x), which also negates it
    psubd           m18, m13, m18
    REPX            {paddd x, m13}, m22, m9, m17
    REPX            {psrad x, 12 }, m18, m22, m9, m17

    ; the butterflies of .main2 are skipped: the four values are copied into
    ; the registers .main3 expects
    mova            m20, m9
    mova            m16, m18
    mova            m23, m22
    mova            m19, m17
    jmp             .main3
.main_fast_rect2:
    REPX            {paddd x, m13}, m16, m17, m18, m19
    REPX            {psrad x, 12 }, m16, m17, m18, m19
.main_fast:
    pmulld          m23, m16, [o(pd_4076)] {1to16} ; t15a
    pmulld          m16, [o(pd_401)] {1to16} ; t8a
    pmulld          m20, m19, [o(pd_2598)] {1to16} ; t9a
    pmulld          m19, [o(pd_3166)] {1to16} ; t14a
    pmulld          m22, m17, [o(pd_1189)] {1to16} ; t11a
    pmulld          m17, [o(pd_3920)] {1to16} ; t12a
    pmulld          m21, m18, [o(pd_3612)] {1to16} ; t13a
    pmulld          m18, [o(pd_1931)] {1to16} ; t10a
    ; m20 and m22 get the rounding constant here (as m13 - x), so the helper
    ; is entered at .round2, which skips adding m13 to those two registers
    psubd           m20, m13, m20
    psubd           m22, m13, m22
    call            .round2
    jmp             .main2
.main_rect2:
    call            .round
.main:
    ITX_MULSUB_2D   16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a
    ITX_MULSUB_2D   20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a
    ITX_MULSUB_2D   22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
    ITX_MULSUB_2D   18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
    call            .round
.main2:
    paddd           m9, m20, m16 ; t8
    psubd           m20, m16, m20 ; t9
    psubd           m16, m22, m18 ; t10
    paddd           m18, m22 ; t11
    paddd           m22, m23, m19 ; t15
    psubd           m23, m19 ; t14
    psubd           m19, m17, m21 ; t13
    paddd           m17, m21 ; t12
    REPX            {pmaxsd x, m14}, m20, m23, m16, m19
    REPX            {pminsd x, m15}, m20, m23, m16, m19
    REPX            {pmaxsd x, m14}, m9, m18, m22, m17
    REPX            {pminsd x, m15}, m9, m18, m22, m17
.main3:
    vpbroadcastd    m11, [o(pd_3784)]
    vpbroadcastd    m10, [o(pd_1567)]
    ITX_MULSUB_2D   23, 20, 21, 7, _, 13, 10, 11
    ITX_MULSUB_2D   19, 16, 21, 7, _, 13, 10, 11, 2
    paddd           m21, m20, m19 ; t14
    psubd           m20, m19 ; t13
    psubd           m19, m9, m18 ; t11a
    paddd           m9, m18 ; t8a
    psubd           m18, m23, m16 ; t10
    paddd           m16, m23 ; t9
    psubd           m23, m22, m17 ; t12a
    paddd           m22, m17 ; t15a
    REPX            {pmaxsd x, m14}, m20, m23, m18, m19
    REPX            {pminsd x, m15}, m20, m23, m18, m19
    REPX            {pmulld x, m12}, m20, m23, m18, m19
    ; the final butterflies of the values in m0-m6 and m8 (labelled
    ; "dct8 out" below) are interleaved with the remaining odd-half work
    psubd           m7, m0, m6 ; dct8 out7
    paddd           m0, m6 ; dct8 out0
    psubd           m6, m1, m5 ; dct8 out6
    paddd           m1, m5 ; dct8 out1
    REPX            {pmaxsd x, m14}, m7, m0, m6, m1
    psubd           m5, m2, m4 ; dct8 out5
    paddd           m2, m4 ; dct8 out2
    REPX            {pminsd x, m15}, m7, m0, m6, m1
    psubd           m4, m3, m8 ; dct8 out4
    paddd           m3, m8 ; dct8 out3
    REPX            {pmaxsd x, m14}, m5, m2, m4, m3
    ; the rounding constant is added to one operand of each sum/difference pair
    paddd           m20, m13
    paddd           m23, m13
    REPX            {pminsd x, m15}, m5, m2, m4, m3
    psubd           m17, m20, m18 ; t10a
    paddd           m20, m18 ; t13a
    REPX            {pmaxsd x, m14}, m22, m21, m16, m9
    psubd           m18, m23, m19 ; t11
    paddd           m19, m23 ; t12
    REPX            {pminsd x, m15}, m22, m21, m16, m9
    REPX            {psrad x, 12 }, m20, m19, m18, m17
    ret
.main_end:
    vpbroadcastd    m11, [o(pd_2)]
.main_end2:
    ; m11 is both the rounding addend for m0-m7 and, further down, the
    ; per-lane shift count of vpsravd. m0-m7 are combined with m22..m16 and m9
    ; into sixteen outputs.
    REPX            {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    psubd           m23, m0, m22 ; out15
    paddd           m0, m22 ; out0
    psubd           m22, m1, m21 ; out14
    paddd           m1, m21 ; out1
    psubd           m21, m2, m20 ; out13
    paddd           m2, m20 ; out2
    psubd           m20, m3, m19 ; out12
    paddd           m3, m19 ; out3
    psubd           m19, m4, m18 ; out11
    paddd           m4, m18 ; out4
    psubd           m18, m5, m17 ; out10
; Continuation of idct_16x16_internal_10bpc.main_end2 (it starts on the
; preceding lines): last sums/differences, shift by the per-lane count in m11,
; and saturating pack of out0-7 with out8-15 into m0-m7.
    paddd           m5, m17 ; out5
    psubd           m17, m6, m16 ; out9
    paddd           m6, m16 ; out6
    psubd           m16, m7, m9 ; out8
    paddd           m7, m9 ; out7
    REPX            {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
                                      m4, m20, m5, m21, m6, m22, m7, m23
    packssdw        m0, m16
    packssdw        m1, m17
    packssdw        m2, m18
    packssdw        m3, m19
    packssdw        m4, m20
    packssdw        m5, m21
    packssdw        m6, m22
    packssdw        m7, m23
    ret
.main_end3:
    ; Word and dword interleaves followed by 128-bit lane shuffles of m0-m7
    ; (m8 is used as a temporary). The trailing remarks name the row pair each
    ; result register holds.
    ; NOTE(review): presumably this is the transpose between the two passes -
    ; confirm against the second-pass input layout.
    punpckhwd       m8, m0, m1
    punpcklwd       m0, m1
    punpckhwd       m1, m2, m3
    punpcklwd       m2, m3
    punpckhwd       m3, m4, m5
    punpcklwd       m4, m5
    punpcklwd       m5, m6, m7
    punpckhwd       m6, m7
    punpckhdq       m7, m0, m2
    punpckldq       m0, m2
    punpckhdq       m2, m8, m1
    punpckldq       m8, m1
    punpckhdq       m1, m4, m5
    punpckldq       m4, m5
    punpckhdq       m5, m3, m6
    punpckldq       m3, m6
    vshufi32x4      m6, m0, m4, q3232
    vinserti32x8    m0, ym4, 1
    vinserti32x8    m4, m8, ym3, 1
    vshufi32x4      m8, m3, q3232
    vinserti32x8    m3, m7, ym1, 1
    vshufi32x4      m7, m1, q3232
    vshufi32x4      m1, m2, m5, q3232
    vinserti32x8    m2, ym5, 1
    vshufi32x4      m5, m7, m1, q2020 ; 10 11
    vshufi32x4      m7, m1, q3131 ; 14 15
    vshufi32x4      m1, m3, m2, q2020 ; 2 3
    vshufi32x4      m3, m2, q3131 ; 6 7
    vshufi32x4      m2, m0, m4, q3131 ; 4 5
    vshufi32x4      m0, m4, q2020 ; 0 1
    vshufi32x4      m4, m6, m8, q2020 ; 8 9
    vshufi32x4      m6, m8, q3131 ; 12 13
    ret
ALIGN function_align
; Staged rounding helper for m16-m23: every register ends up as
; (x + m13) shifted right by 12. Callers that have already added m13 to some
; registers enter at a later label:
;   .round  adds m13 to m20 and m22, then continues
;   .round2 adds m13 to m16 and m18, then continues
;   .round3 shifts m16/m18/m20/m22 and rounds and shifts m17/m19/m21/m23
.round:
    paddd           m20, m13
    paddd           m22, m13
.round2:
    paddd           m16, m13
    paddd           m18, m13
.round3:
    REPX            {psrad x, 12 }, m16, m18, m20, m22
    REPX            {paddd x, m13}, m17, m19, m21, m23
    REPX            {psrad x, 12 }, m17, m19, m21, m23
    ret

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, flipadst
INV_TXFM_16X16_FN adst, adst

; iadst_16x16_internal_10bpc
;   First pass: with eob below 36 the .fast path is taken; otherwise
;   .main_pass1 is called and its results are packed to words into m0-m7
;   (the packing continues on the following lines) before joining
;   idct_16x16's .pass1_end.
cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp             eobd, 36
    jl              .fast
    call            .main_pass1
    packssdw        m0, m16
    packssdw        m1, m17
; Continuation of iadst_16x16_internal_10bpc (header and the first two packs
; are on the preceding lines). .main_pass1 leaves out0-7 in m0-m3 and m5-m8
; and out8-15 in m16-m23, so the last four packs also shift the register
; numbers down by one.
    packssdw        m2, m18
    packssdw        m3, m19
    packssdw        m4, m5, m20
    packssdw        m5, m6, m21
    packssdw        m6, m7, m22
    packssdw        m7, m8, m23
    jmp             m(idct_16x16_internal_10bpc).pass1_end
.fast:
    call            .main_pass1_fast
    ; round with pd_2 and move the results into m0-m7; four of them are formed
    ; as pd_2 - x, i.e. negated. The shift by 2 follows at .pass1_fast_end.
    vpbroadcastd    m9, [o(pd_2)]
    paddd           m0, m9
    psubd           m1, m9, m1
    paddd           m2, m9
    psubd           m3, m9, m3
    paddd           m4, m9, m5
    psubd           m5, m9, m6
    paddd           m6, m9, m7
    psubd           m7, m9, m8
.pass1_fast_end:
    ; also entered from iflipadst_16x16_internal_10bpc
    mova            m9, [o(permA)]
    psrlq           m8, m9, 8
    REPX            {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
.pass1_fast_end2:
    ; also entered from idct_16x16_internal_10bpc.fast; expects the two
    ; permutation vectors in m8 and m9
    mova            m10, m9
    mova            m11, m8
    call            m(idct_16x8_internal_10bpc).transpose_16x8
    ; m4-m7 are zeroed for the second pass, and the low 32 bytes of
    ; coefficient rows 0-7 are cleared
    pxor            m4, m4
    REPX            {mova x, m4}, m5, m6, m7
    REPX            {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
    jmp             tx2q
.pass2:
    lea             r5, [o_base_8bpc]
    call            m(iadst_16x16_internal_8bpc).main_pass2b
    movshdup        m12, [permC]
    ; a full 64-byte vector of factors is loaded here (not a broadcast)
    mova            m11, [pw_2048_m2048]
    psrlq           m13, m12, 8
    vpermq          m8, m13, m0
    vpermq          m0, m12, m7
    vpermq          m7, m13, m1
    vpermq          m1, m12, m6
    vpermq          m6, m13, m2
    vpermq          m2, m12, m5
    vpermq          m5, m13, m3
    vpermq          m3, m12, m4
    jmp             m(idct_16x16_internal_10bpc).pass2_end
ALIGN function_align
.main_pass1:
    ; Full first pass over all 16 coefficient rows; also called by
    ; iflipadst_16x16_internal_10bpc. The rows are loaded in pairs directly
    ; ahead of the ITX_MULSUB_2D that consumes them. Results: m0-m3 and m5-m8
    ; hold out0-7, m16-m23 hold out8-15 (see the remarks on the final lines).
    mova            m0, [cq+64* 0]
%if WIN64
    ; xmm6/xmm7 are parked in the first 32 bytes of the coefficient buffer;
    ; row 0 has already been loaded into m0 above. This routine does not
    ; restore them itself.
    movaps          [cq+16*0], xmm6
    movaps          [cq+16*1], xmm7
%endif
    mova            m23, [cq+64*15]
    vpbroadcastd    m13, [o(pd_2048)]
    ITX_MULSUB_2D   23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0
    mova            m7, [cq+64* 7]
    mova            m16, [cq+64* 8]
    ITX_MULSUB_2D   7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8
    mova            m2, [cq+64* 2]
    mova            m21, [cq+64*13]
    ITX_MULSUB_2D   21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2
    mova            m5, [cq+64* 5]
    mova            m18, [cq+64*10]
    ITX_MULSUB_2D   5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
    mova            m4, [cq+64* 4]
    mova            m19, [cq+64*11]
    ITX_MULSUB_2D   19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4
    mova            m3, [cq+64* 3]
    mova            m20, [cq+64*12]
    ITX_MULSUB_2D   3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
    mova            m6, [cq+64* 6]
    mova            m17, [cq+64* 9]
    ITX_MULSUB_2D   17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6
    mova            m1, [cq+64* 1]
    mova            m22, [cq+64*14]
    ITX_MULSUB_2D   1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    ; butterflies are interleaved with the clips to [m14, m15] of the
    ; previously computed group
    psubd           m9, m23, m7 ; t9a
    paddd           m23, m7 ; t1a
    psubd           m7, m2, m18 ; t10a
    paddd           m18, m2 ; t2a
    REPX            {pmaxsd x, m14}, m9, m23, m7, m18
    psubd           m2, m17, m1 ; t15a
    paddd           m17, m1 ; t7a
    REPX            {pminsd x, m15}, m9, m23, m7, m18
    psubd           m1, m21, m5 ; t11a
    paddd           m21, m5 ; t3a
    REPX            {pmaxsd x, m14}, m2, m17, m1, m21
    psubd           m5, m4, m20 ; t12a
    paddd           m4, m20 ; t4a
    REPX            {pminsd x, m15}, m2, m17, m1, m21
    psubd           m20, m19, m3 ; t13a
    paddd           m19, m3 ; t5a
    REPX            {pmaxsd x, m14}, m5, m4, m20, m19
    psubd           m8, m6, m22 ; t14a
    paddd           m6, m22 ; t6a
    REPX            {pminsd x, m15}, m5, m4, m20, m19
    psubd           m22, m0, m16 ; t8a
    paddd           m16, m0 ; t0a
    REPX            {pmaxsd x, m14}, m8, m6, m22, m16
    vpbroadcastd    m11, [o(pd_4017)]
    vpbroadcastd    m10, [o(pd_799)]
    REPX            {pminsd x, m15}, m8, m6, m22, m16
    ITX_MULSUB_2D   22, 9, 0, 3, _, 13, 10, 11 ; t9 t8
    ITX_MULSUB_2D   20, 5, 0, 3, _, 13, 11, 10 ; t12 t13
    vpbroadcastd    m11, [o(pd_2276)]
    vpbroadcastd    m10, [o(pd_3406)]
    ITX_MULSUB_2D   7, 1, 0, 3, _, 13, 10, 11 ; t11 t10
    ITX_MULSUB_2D   2, 8, 0, 3, _, 13, 11, 10 ; t14 t15
    paddd           m0, m16, m4 ; t0
    psubd           m16, m4 ; t4
    psubd           m3, m23, m19 ; t5
    paddd           m23, m19 ; t1
    REPX            {pmaxsd x, m14}, m0, m16, m3, m23
    psubd           m19, m18, m6 ; t6
    paddd           m18, m6 ; t2
    REPX            {pminsd x, m15}, m0, m16, m3, m23
    psubd           m6, m21, m17 ; t7
    paddd           m21, m17 ; t3
    REPX            {pmaxsd x, m14}, m19, m18, m6, m21
    paddd           m17, m9, m20 ; t8a
    psubd           m9, m20 ; t12a
    REPX            {pminsd x, m15}, m19, m18, m6, m21
    psubd           m20, m22, m5 ; t13a
    paddd           m22, m5 ; t9a
    REPX            {pmaxsd x, m14}, m17, m9, m20, m22
    psubd           m5, m1, m2 ; t14a
    paddd           m1, m2 ; t10a
    REPX            {pminsd x, m15}, m17, m9, m20, m22
    psubd           m2, m7, m8 ; t15a
    paddd           m7, m8 ; t11a
    REPX            {pmaxsd x, m14}, m5, m1, m2, m7
    vpbroadcastd    m11, [o(pd_3784)]
    vpbroadcastd    m10, [o(pd_1567)]
    REPX            {pminsd x, m15}, m5, m1, m2, m7
    ITX_MULSUB_2D   16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a
    ITX_MULSUB_2D   6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
    ITX_MULSUB_2D   9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
    ITX_MULSUB_2D   2, 5, 4, 8, _, 13, 11, 10 ; t14 t15
    psubd           m8, m0, m18 ; t2a
    paddd           m0, m18 ; out0
    psubd           m18, m23, m21 ; t3a
    paddd           m23, m21 ; -out15
    paddd           m21, m9, m5 ; -out13
    psubd           m9, m5 ; t15a
    psubd           m5, m3, m6 ; t6
    paddd           m3, m6 ; -out3
    REPX            {pmaxsd x, m14}, m8, m18, m9, m5
    psubd           m6, m20, m2 ; t14a
    paddd           m2, m20 ; out2
    paddd           m20, m16, m19 ; out12
    psubd           m16, m19 ; t7
    REPX            {pminsd x, m15}, m8, m18, m9, m5
    psubd           m19, m22, m7 ; t11
    paddd           m22, m7 ; out14
    psubd           m7, m17, m1 ; t10
    paddd           m1, m17 ; -out1
    REPX            {pmaxsd x, m14}, m6, m16, m19, m7
    ; constants for the final scaling: multiplier pd_1448 with a shift by 13,
    ; and pd_2 with a shift by 2 for the outputs that need no multiply.
    ; pd_5120 and pd_5119 are the two rounding terms used with the multiplied
    ; outputs (added, or subtracted from when the value is negated).
    vpbroadcastd    m12, [o(pd_1448)]
    vpbroadcastd    m4, [o(pd_2)]
    vpbroadcastd    m10, [o(pd_5120)]
    vpbroadcastd    m11, [o(pd_5119)]
    REPX            {pminsd x, m15}, m6, m16, m19, m7
    psubd           m17, m7, m19 ; -out9
    paddd           m7, m19 ; out6
    psubd           m19, m5, m16 ; -out11
    paddd           m5, m16 ; out4
    REPX            {pmulld x, m12}, m17, m7, m19, m5
    psubd           m16, m8, m18 ; out8
    paddd           m8, m18 ; -out7
    psubd           m18, m6, m9 ; out10
    paddd           m6, m9 ; -out5
    REPX            {pmulld x, m12}, m16, m8, m18, m6
    ; outputs marked with a minus sign above are negated as (constant - x)
    REPX            {paddd x, m4 }, m0, m2, m20, m22
    REPX            {psubd x, m4, x}, m1, m3, m21, m23
    REPX            {paddd x, m10 }, m7, m5, m16, m18
    REPX            {psubd x, m11, x}, m17, m19, m8, m6
    REPX            {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
    REPX            {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
    ret
ALIGN function_align
; Reduced first pass; also called by iflipadst_16x16_internal_10bpc. The body
; follows on the next lines.
.main_pass1_fast:
; Body of iadst_16x16_internal_10bpc.main_pass1_fast (its label is on the
; preceding line): reads the low 32 bytes (ymm) of coefficient rows 0-7,
; pairs them with qword permutes, loads m13 = pd_2048 and m12 = pd_2896 and
; continues in the 16x8 adst code at .main_fast.
    mova            ym0, [cq+64*0]
    mova            ym1, [cq+64*2]
    movshdup        m8, [o(permB)]
    mova            ym6, [cq+64*1]
    mova            ym7, [cq+64*3]
    mova            ym2, [cq+64*4]
    mova            ym3, [cq+64*6]
    mova            ym4, [cq+64*5]
    mova            ym5, [cq+64*7]
    vpermt2q        m0, m8, m1 ; 0 2
    vpermt2q        m7, m8, m6 ; 3 1
    vpermt2q        m2, m8, m3 ; 4 6
    vpermt2q        m5, m8, m4 ; 7 5
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m12, [o(pd_2896)]
    jmp             m(iadst_16x8_internal_10bpc).main_fast

INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

; iflipadst_16x16_internal_10bpc
;   Reuses the iadst_16x16 first-pass routines and assigns their results to
;   m0-m7 in the opposite order. The full path (eob of 36 or more) packs pairs
;   of result registers to words and joins idct_16x16's .pass1_end. The .fast
;   path rounds with pd_2 (four registers as pd_2 - x, i.e. negated) and joins
;   iadst_16x16's .pass1_fast_end.
;   .pass2 calls the 8bpc iadst 16x16 second pass, loads the factor vector
;   pw_m2048_2048 and reorders the rows for idct_16x16's .pass2_end.
cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp             eobd, 36
    jl              .fast
    call            m(iadst_16x16_internal_10bpc).main_pass1
    packssdw        m4, m19, m3
    packssdw        m3, m20, m5
    packssdw        m5, m18, m2
    packssdw        m2, m21, m6
    packssdw        m6, m17, m1
    packssdw        m1, m22, m7
    packssdw        m7, m16, m0
    packssdw        m0, m23, m8
    jmp             m(idct_16x16_internal_10bpc).pass1_end
.fast:
    call            m(iadst_16x16_internal_10bpc).main_pass1_fast
    vpbroadcastd    m9, [o(pd_2)]
    psubd           m4, m9, m3
    paddd           m3, m9, m5
    paddd           m5, m9, m2
    psubd           m2, m9, m6
    psubd           m6, m9, m1
    paddd           m1, m9, m7
    paddd           m7, m9, m0
    psubd           m0, m9, m8
    jmp             m(iadst_16x16_internal_10bpc).pass1_fast_end
.pass2:
    lea             r5, [o_base_8bpc]
    call            m(iadst_16x16_internal_8bpc).main_pass2b
    movshdup        m12, [permC]
    ; loaded with movu, i.e. without an alignment requirement on the address
    movu            m11, [pw_m2048_2048]
    psrlq           m13, m12, 8
    vpermq          m8, m13, m7
    vpermq          m7, m13, m6
    vpermq          m6, m13, m5
    vpermq          m5, m13, m4
    vpermq          m3, m12, m3
    vpermq          m2, m12, m2
    vpermq          m1, m12, m1
    vpermq          m0, m12, m0
    jmp             m(idct_16x16_internal_10bpc).pass2_end

INV_TXFM_16X16_FN identity, dct, -92
INV_TXFM_16X16_FN identity, identity

; iidentity_16x16_internal_10bpc
;   First pass: coefficients are multiplied by the constant broadcast into m10
;   (pd_5793); the rest of the setup and both passes follow on the next lines.
cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    vpbroadcastd    m10, [o(pd_5793)]
vpbroadcastd m11, [o(pd_5120)] 2026 mov r6, cq 2027 cmp eobd, 36 2028 jl .fast 2029 call .pass1_main 2030 packssdw m0, m6, m8 2031 packssdw m1, m7, m9 2032 call .pass1_main 2033 packssdw m2, m6, m8 2034 packssdw m3, m7, m9 2035 call .pass1_main 2036 packssdw m4, m6, m8 2037 packssdw m5, m7, m9 2038 call .pass1_main 2039 packssdw m6, m8 2040 packssdw m7, m9 2041 jmp m(idct_16x16_internal_10bpc).pass1_end2 2042.fast: 2043 call .pass1_main_fast 2044 packssdw m0, m6, m7 2045 call .pass1_main_fast 2046 packssdw m1, m6, m7 2047 call .pass1_main_fast 2048 packssdw m2, m6, m7 2049 call .pass1_main_fast 2050 packssdw m3, m6, m7 2051 punpckhwd m4, m0, m1 2052 punpcklwd m0, m1 2053 punpckhwd m1, m2, m3 2054 punpcklwd m2, m3 2055 punpckldq m3, m4, m1 2056 punpckhdq m4, m1 2057 punpckhdq m1, m0, m2 2058 punpckldq m0, m2 2059 pxor m7, m7 2060 vshufi32x4 m2, m0, m3, q3131 2061 vshufi32x4 m0, m3, q2020 2062 vshufi32x4 m3, m1, m4, q3131 2063 vshufi32x4 m1, m4, q2020 2064 REPX {mova x, m7}, m4, m5, m6 2065 jmp m(idct_16x16_internal_10bpc).pass1_end3 2066.pass2: 2067 movshdup m14, [o(permC)] 2068 vpbroadcastd m15, [o(pw_1697x16)] 2069 lea r6, [strideq*3] 2070 vpbroadcastd m11, [o(pw_2048)] 2071 pxor m12, m12 2072 vpbroadcastd m13, [pixel_10bpc_max] 2073 vpermq m8, m14, m0 2074 vpermq m9, m14, m1 2075 call .pass2_main 2076 vpermq m8, m14, m2 2077 vpermq m9, m14, m3 2078 call .pass2_main 2079 vpermq m8, m14, m4 2080 vpermq m9, m14, m5 2081 call .pass2_main 2082 vpermq m8, m14, m6 2083 vpermq m9, m14, m7 2084.pass2_main: 2085 pmulhrsw m0, m15, m8 2086 pmulhrsw m1, m15, m9 2087 paddsw m8, m8 2088 paddsw m9, m9 2089 paddsw m8, m0 2090 paddsw m9, m1 2091 jmp m(idct_16x8_internal_10bpc).write_16x4 2092ALIGN function_align 2093.pass1_main: 2094 pmulld m6, m10, [r6+64*0] 2095 pmulld m7, m10, [r6+64*1] 2096 pmulld m8, m10, [r6+64*8] 2097 pmulld m9, m10, [r6+64*9] 2098 add r6, 64*2 2099 REPX {paddd x, m11}, m6, m7, m8, m9 2100 REPX {psrad x, 13 }, m6, m8, m7, m9 2101 ret 2102ALIGN 
function_align 2103.pass1_main_fast: 2104 mova ym6, [r6+64* 0] 2105 vinserti32x8 m6, [r6+64* 4], 1 2106 mova ym7, [r6+64* 8] 2107 vinserti32x8 m7, [r6+64*12], 1 2108 add r6, 64 2109 REPX {pmulld x, m10}, m6, m7 2110 REPX {paddd x, m11}, m6, m7 2111 REPX {psrad x, 13 }, m6, m7 2112 ret 2113 2114cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob 2115%undef cmp 2116 lea r5, [o_base] 2117 test eobd, eobd 2118 jz .dconly 2119 vpbroadcastd m12, [o(pd_2896)] 2120 vpbroadcastd m13, [o(pd_2048)] 2121 vpbroadcastd m14, [o(clip_18b_min)] 2122 vpbroadcastd m15, [o(clip_18b_max)] 2123 vpbroadcastd m11, [o(pd_2)] 2124 mova m20, [o(idct8x32p)] 2125 pxor m21, m21 2126 cmp eobd, 43 2127 jl .fast 2128 call .pass1_main 2129 punpcklwd m16, m0, m1 2130 punpcklwd m17, m2, m3 2131 punpckhwd m18, m0, m1 2132 punpckhwd m19, m2, m3 2133 cmp eobd, 107 2134 jge .full 2135 punpckldq m0, m16, m17 ; 0 2 2136 punpckhdq m1, m16, m17 ; 4 6 2137 punpckldq m2, m18, m19 ; 8 10 2138 punpckhdq m3, m18, m19 ; 12 14 2139 lea r5, [o_base_8bpc] 2140 vextracti32x8 ym14, m0, 1 2141 vextracti32x8 ym15, m1, 1 2142 vextracti32x8 ym16, m2, 1 2143 vextracti32x8 ym17, m3, 1 2144 call m(idct_8x16_internal_8bpc).main_fast 2145 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast 2146 jmp .end 2147.full: 2148 add cq, 64 2149 call .pass1_main 2150 punpcklwd m5, m0, m1 2151 punpcklwd m6, m2, m3 2152 punpckhwd m7, m0, m1 2153 punpckhwd m8, m2, m3 2154 punpckldq m0, m16, m17 ; 0 2 2155 punpckhdq m1, m16, m17 ; 4 6 2156 punpckldq m2, m18, m19 ; 8 10 2157 punpckhdq m3, m18, m19 ; 12 14 2158 punpckldq m4, m5, m6 ; 16 18 2159 punpckhdq m5, m6 ; 20 22 2160 punpckldq m6, m7, m8 ; 24 26 2161 punpckhdq m7, m8 ; 28 30 2162 lea r5, [o_base_8bpc] 2163 vextracti32x8 ym14, m0, 1 2164 vextracti32x8 ym15, m1, 1 2165 vextracti32x8 ym16, m2, 1 2166 vextracti32x8 ym17, m3, 1 2167 vextracti32x8 ym18, m4, 1 2168 vextracti32x8 ym19, m5, 1 2169 vextracti32x8 ym20, m6, 1 2170 vextracti32x8 ym21, m7, 1 2171 call 
m(idct_8x16_internal_8bpc).main 2172 REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21 2173 call m(inv_txfm_add_dct_dct_8x32_8bpc).main 2174 jmp .end 2175.fast: 2176 movshdup m8, [o(permB)] 2177 mova ym1, [cq+128*1] 2178 mova ym5, [cq+128*5] 2179 mova ym7, [cq+128*3] 2180 mova ym3, [cq+128*7] 2181 mova ym0, [cq+128*0] 2182 mova ym4, [cq+128*2] 2183 mova ym2, [cq+128*4] 2184 mova ym6, [cq+128*6] 2185 vpermt2q m1, m8, m5 ; 1 5 2186 vpermt2q m3, m8, m7 ; 7 3 2187 vpermt2q m0, m8, m4 ; 0 2 2188 vpermt2q m2, m8, m6 ; 4 6 2189 mova [cq+128*0], ym21 2190 REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7 2191 call m(idct_8x8_internal_10bpc).main 2192 call m(idct_8x8_internal_10bpc).main_end 2193 packssdw m0, m2 2194 packssdw m1, m3 2195 vpermb m0, m20, m0 2196 vprold m20, 16 2197 vpermb m2, m20, m1 2198 punpckhdq m1, m0, m2 2199 punpckldq m0, m2 2200 lea r5, [o_base_8bpc] 2201 vextracti32x8 ym14, m0, 1 2202 vextracti32x8 ym15, m1, 1 2203 call m(idct_8x16_internal_8bpc).main_fast2 2204 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2 2205.end: 2206 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper 2207 lea r3, [strideq*2] 2208 vpbroadcastd m12, [pixel_10bpc_max] 2209 lea r6, [strideq*3] 2210 pxor m11, m11 2211 lea r3, [dstq+r3*8] 2212 pmulhrsw m0, m10 2213 pmulhrsw m1, m10 2214 call .write_8x4x2 2215 pmulhrsw m0, m10, m2 2216 pmulhrsw m1, m10, m3 2217 call .write_8x4x2 2218 pmulhrsw m0, m10, m4 2219 pmulhrsw m1, m10, m5 2220 call .write_8x4x2 2221 pmulhrsw m0, m10, m6 2222 pmulhrsw m1, m10, m7 2223.write_8x4x2: 2224 mova xm8, [dstq+strideq*0] 2225 vinserti32x4 ym8, [dstq+strideq*1], 1 2226 vinserti32x4 m8, [dstq+strideq*2], 2 2227 vinserti32x4 m8, [dstq+r6 ], 3 2228 mova xm9, [r3 +r6 ] 2229 vinserti32x4 ym9, [r3 +strideq*2], 1 2230 vinserti32x4 m9, [r3 +strideq*1], 2 2231 vinserti32x4 m9, [r3 +strideq*0], 3 2232 paddw m8, m0 2233 paddw m9, m1 2234 pmaxsw m8, m11 2235 pmaxsw m9, m11 2236 pminsw m8, m12 2237 pminsw m9, m12 2238 mova [dstq+strideq*0], 
xm8                                         ; operand completing the row-0 store begun on the previous line
    vextracti32x4   [dstq+strideq*1], ym8, 1
    vextracti32x4   [dstq+strideq*2], m8, 2
    vextracti32x4   [dstq+r6       ], m8, 3
    lea             dstq, [dstq+strideq*4]
    ; m9 goes out in reverse lane order: lane 3 -> [r3] ... lane 0 -> [r3+r6]
    vextracti32x4   [r3  +strideq*0], m9, 3
    vextracti32x4   [r3  +strideq*1], m9, 2
    vextracti32x4   [r3  +strideq*2], ym9, 1
    mova            [r3  +r6       ], xm9
    lea             r3, [r3+strideq*4]
    ret
.dconly:
    ; DC-only shortcut: r6d = (coef[0]*181 + 640) >> 10, then the 8x8 .dconly2
    ; tail takes over.
    ; NOTE(review): r3d presumably aliases eobd and is 0 on this path, which
    ; would make the OR a "rows = 32" assignment -- the branch into .dconly is
    ; not visible in this excerpt; confirm.
    imul            r6d, [cq], 181
    mov             [cq], eobd
    or              r3d, 32
    add             r6d, 640
    sar             r6d, 10
    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
ALIGN function_align
.pass1_main:
    ; Loads eight 64-byte coefficient vectors (128-byte pitch) into m0-m7,
    ; overwrites them in memory with m21, runs the idct_8x16 main/main_end2
    ; helpers, packs the dword results to words with signed saturation and
    ; byte-permutes each packed vector by m20.
    ; NOTE(review): m21 is presumably all-zero and m20 a permute control set
    ; up by the caller -- confirm.
    mova            m0, [cq+128*0]
    mova            m1, [cq+128*1]
    mova            m2, [cq+128*2]
    mova            m3, [cq+128*3]
    mova            m4, [cq+128*4]
    mova            m5, [cq+128*5]
    mova            m6, [cq+128*6]
    mova            m7, [cq+128*7]
    REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
    call m(idct_8x16_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_end2
    packssdw        m0, m4
    packssdw        m1, m5
    packssdw        m2, m6
    packssdw        m3, m7
    REPX {vpermb x, m20, x}, m0, m1, m2, m3
    ret

; 8x32 identity/identity inverse transform + add, 10 bpc.
; Per .loop iteration: eight 64-byte coefficient vectors (128-byte pitch) are
; packed pairwise to 16 bit with signed saturation and cleared in memory, the
; pw_5 constant is added (saturating) and the sum is shifted right
; arithmetically by 3, the words are transposed, added to 16 rows of 8 pixels,
; clamped to [0, pixel_10bpc_max] and stored.  cq advances by 64 bytes and
; dstq by 16 rows per iteration.  The body runs twice when eob >= 107 and once
; otherwise.
cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
    vpbroadcastd    m9, [pw_5]
    lea             r4, [strideq*3]
    pxor            m10, m10                ; zero: clears coefficients, lower clamp bound
    lea             r5, [strideq*5]
    vpbroadcastd    m11, [pixel_10bpc_max]  ; upper clamp bound
    sub             eobd, 107               ; sign bit set <=> eob < 107 (tested at the loop end)
    lea             r6, [strideq+r4*2]      ; stride*7
.loop:
    mova            m0, [cq+128*0]
    packssdw        m0, [cq+128*1]
    mova            m1, [cq+128*2]
    packssdw        m1, [cq+128*3]
    mova            m2, [cq+128*4]
    packssdw        m2, [cq+128*5]
    mova            m3, [cq+128*6]
    packssdw        m3, [cq+128*7]
    lea             r7, [dstq+strideq*8]    ; r7 -> rows 8..15 of this iteration
    REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
    REPX {paddsw x, m9}, m0, m1, m2, m3
    REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
    REPX {psraw x, 3 }, m0, m1, m2, m3
    add             cq, 64
    ; The destination loads (rows 0-3 -> lane 0, 4-7 -> lane 1, 8-11 -> lane 2,
    ; 12-15 -> lane 3 of m4-m7) are interleaved with the word transpose.
    mova            xm4, [dstq+strideq*0]
    mova            xm5, [dstq+strideq*1]
    mova            xm6, [dstq+strideq*2]
    mova            xm7, [dstq+r4     *1]
    punpckhwd       m8, m0, m1
    vinserti32x4    ym4, [dstq+strideq*4], 1
    punpcklwd       m0, m1
    vinserti32x4    ym5, [dstq+r5     *1], 1
    punpckhwd       m1, m2, m3
    vinserti32x4    ym6, [dstq+r4     *2], 1
    punpcklwd       m2, m3
    vinserti32x4    ym7, [dstq+r6     *1], 1
    punpckhwd       m3, m0, m8
    vinserti32x4    m4, [r7  +strideq*0], 2
    punpcklwd       m0, m8
    vinserti32x4    m5, [r7  +strideq*1], 2
    punpckhwd       m8, m2, m1
    vinserti32x4    m6, [r7  +strideq*2], 2
    punpcklwd       m2, m1
    vinserti32x4    m7, [r7  +r4     *1], 2
    punpckhqdq      m1, m0, m2
    vinserti32x4    m4, [r7  +strideq*4], 3
    punpcklqdq      m0, m2
    vinserti32x4    m5, [r7  +r5     *1], 3
    punpcklqdq      m2, m3, m8
    vinserti32x4    m6, [r7  +r4     *2], 3
    punpckhqdq      m3, m8
    vinserti32x4    m7, [r7  +r6     *1], 3
    paddw           m0, m4
    paddw           m1, m5
    paddw           m2, m6
    paddw           m3, m7
    REPX {pmaxsw x, m10}, m0, m1, m2, m3
    REPX {pminsw x, m11}, m0, m1, m2, m3
    mova            [dstq+strideq*0], xm0
    mova            [dstq+strideq*1], xm1
    mova            [dstq+strideq*2], xm2
    mova            [dstq+r4     *1], xm3
    vextracti32x4   [dstq+strideq*4], ym0, 1
    vextracti32x4   [dstq+r5     *1], ym1, 1
    vextracti32x4   [dstq+r4     *2], ym2, 1
    vextracti32x4   [dstq+r6     *1], ym3, 1
    lea             dstq, [r7+strideq*8]    ; next iteration starts 16 rows further down
    vextracti32x4   [r7  +strideq*0], m0, 2
    vextracti32x4   [r7  +strideq*1], m1, 2
    vextracti32x4   [r7  +strideq*2], m2, 2
    vextracti32x4   [r7  +r4     *1], m3, 2
    vextracti32x4   [r7  +strideq*4], m0, 3
    vextracti32x4   [r7  +r5     *1], m1, 3
    vextracti32x4   [r7  +r4     *2], m2, 3
    vextracti32x4   [r7  +r6     *1], m3, 3
    ; Adding 0x80000000 carries exactly when the sign bit is already set, so
    ; the first pass loops again only if eob - 107 >= 0; the add itself sets
    ; the sign bit, so a second pass always exits.
    add             eobd, 0x80000000
    jnc .loop
    RET

; 32x8 DCT/DCT inverse transform + add, 10 bpc.
; eob == 0 takes the .dconly shortcut.  Otherwise the first four 64-byte
; coefficient vectors are loaded, the rounding/clamping constants are
; broadcast into m12-m15, and two qword-permute controls are prepared:
; m11 = permB and m10 = permB with every qword shifted right by 32 bits.
cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test            eobd, eobd
    jz .dconly
    mova            m11, [o(permB)]
    mova            m0, [cq+64* 0]          ;  0  1
    mova            m4, [cq+64* 1]          ;  2  3
    mova            m1, [cq+64* 2]          ;  4  5
    mova            m8, [cq+64* 3]          ;  6  7
    vpbroadcastd    m12, [o(pd_2896)]
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    psrlq           m10, m11, 32
%if WIN64
; inv_txfm_add_dct_dct_32x8_10bpc, continued (these first lines sit inside the
; %if WIN64 opened at the end of the previous line).
; Win64: xmm6/xmm7 are parked in the first 32 bytes of the coefficient buffer,
; whose contents were already loaded into m0; they are reloaded at .end.
    movaps          [cq+16*0], xmm6
    movaps          [cq+16*1], xmm7
%endif
    mova            m16, m11
    vpermi2q        m16, m0, m1             ;  1  5
    mova            m17, m11
    vpermi2q        m17, m8, m4             ;  7  3
    ; eob dispatch: eob < 43 -> .fast (only the 4 vectors loaded so far),
    ; eob < 107 -> fall through (8 vectors), otherwise .full (16 vectors).
    ; Each path also sets r6d, which drives the coefficient-clearing loop that
    ; follows .end.
    cmp             eobd, 43
    jl .fast
    mova            m18, [cq+64* 4]         ;  8  9
    mova            m20, [cq+64* 5]         ; 10 11
    mova            m6, [cq+64* 6]          ; 12 13
    mova            m7, [cq+64* 7]          ; 14 15
    vpermt2q        m0, m10, m18            ;  0  8
    vpermt2q        m18, m11, m6            ;  9 13
    mova            m19, m11
    vpermi2q        m19, m7, m20            ; 15 11
    cmp             eobd, 107
    jge .full
    vpermt2q        m1, m10, m6             ;  4 12
    vpermt2q        m4, m10, m8             ;  2  6
    vpermt2q        m7, m10, m20            ; 14 10
    mov             r6d, 64*1
    call m(idct_8x8_internal_10bpc).main_fast
    call m(idct_16x8_internal_10bpc).main_fast
    call .main_fast
    call m(idct_16x16_internal_10bpc).main_end
    jmp .end
.full:
    mova            m2, [cq+64* 8]          ; 16 17
    mova            m5, [cq+64* 9]          ; 18 19
    mova            m9, [cq+64*10]          ; 20 21
    mova            m21, [cq+64*11]         ; 22 23
    vpermt2q        m1, m10, m9             ;  4 20
    vpermt2q        m7, m10, m21            ; 14 22
    vpermt2q        m21, m11, m5            ; 23 19
    vpermt2q        m5, m10, m20            ; 18 10
    mova            m20, m11
    vpermi2q        m20, m2, m9             ; 17 21
    mova            m22, [cq+64*12]         ; 24 25
    mova            m9, [cq+64*13]          ; 26 27
    mova            m3, [cq+64*14]          ; 28 29
    mova            m23, [cq+64*15]         ; 30 31
    vpermt2q        m2, m10, m22            ; 16 24
    vpermt2q        m22, m11, m3            ; 25 29
    vpermt2q        m3, m10, m6             ; 28 12
    vpermt2q        m4, m10, m9             ;  2 26
    mova            m6, m10
    vpermi2q        m6, m23, m8             ; 30  6
    vpermt2q        m23, m11, m9            ; 31 27
    mov             r6d, 64*3
    call m(idct_8x8_internal_10bpc).main
    call m(idct_16x8_internal_10bpc).main
    call .main
    call m(idct_16x16_internal_10bpc).main_end
    jmp .end
.fast:
    vpermq          m0, m10, m0             ;  0  0
    vpermq          m1, m10, m1             ;  4  4
    vpermt2q        m4, m10, m8             ;  2  6
    xor             r6d, r6d
    call .main_fast2
    call m(idct_16x16_internal_10bpc).main_end
.end:
%if WIN64
    movaps          xmm6, [cq+16*0]
    movaps          xmm7, [cq+16*1]
%endif
    ; NOTE(review): vzeroupper is issued while m0-m9 are still consumed by
    ; .transpose_8x32; this is presumably safe only because x86inc maps
    ; m0-m15 onto zmm16-zmm31 in AVX-512 builds -- confirm against x86inc.asm.
    vzeroupper
    call .transpose_8x32
    pxor            m14, m14                ; zero source for the clearing loop that follows
; inv_txfm_add_dct_dct_32x8_10bpc, continued.
; Clears the consumed coefficients: 256 bytes at cq+r6*4 per iteration, with
; r6 stepping down by 64 until it goes negative (r6d was chosen by the eob
; dispatch; m14 was zeroed just before this label).
.zero_loop:
    mova            [cq+r6*4+64*3], m14
    mova            [cq+r6*4+64*2], m14
    mova            [cq+r6*4+64*1], m14
    mova            [cq+r6*4+64*0], m14
    sub             r6d, 64
    jge .zero_loop
    lea             r5, [o_base_8bpc]       ; NOTE(review): presumably the o() base expected by the 8 bpc helper below -- confirm
    punpckhqdq      m1, m0, m2
    punpcklqdq      m0, m2
    punpcklqdq      m2, m3, m4
    punpckhqdq      m3, m4
    punpcklqdq      m4, m5, m7
    punpckhqdq      m5, m7
    punpckhqdq      m7, m6, m8
    punpcklqdq      m6, m8
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    pxor            m12, m12                ; lower clamp bound used by .write_32x4
.write_32x8_start:
    vpbroadcastd    m11, [pw_2048]
    vpbroadcastd    m13, [pixel_10bpc_max]  ; upper clamp bound used by .write_32x4
    lea             r3, [strideq*3]
.write_32x8:
    ; Scales m0-m3 by m11 (pmulhrsw) and writes four rows, then scales m4-m7
    ; into m0-m3 and falls through into .write_32x4, whose ret ends the routine.
    pmulhrsw        m0, m11
    pmulhrsw        m1, m11
    pmulhrsw        m2, m11
    pmulhrsw        m3, m11
    call .write_32x4
    pmulhrsw        m0, m11, m4
    pmulhrsw        m1, m11, m5
    pmulhrsw        m2, m11, m6
    pmulhrsw        m3, m11, m7
.write_32x4:
    ; Adds four 64-byte destination rows to m0-m3, clamps the sums to
    ; [m12, m13], stores them back and advances dstq by four rows.
    paddw           m0, [dstq+strideq*0]
    paddw           m1, [dstq+strideq*1]
    paddw           m2, [dstq+strideq*2]
    paddw           m3, [dstq+r3       ]
    REPX {pmaxsw x, m12}, m0, m1, m2, m3
    REPX {pminsw x, m13}, m0, m1, m2, m3
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    mova            [dstq+strideq*2], m2
    mova            [dstq+r3       ], m3
    lea             dstq, [dstq+strideq*4]
    ret
.dconly:
    ; Reached only through the test/jz at function entry, so eobd is zero and
    ; the store below clears the DC coefficient after it has been read.
    ; r6d = (coef[0]*181 + 640) >> 10; the 32x16 .dconly2 tail finishes the job.
    ; NOTE(review): r3d is eob's register under x86inc's argument mapping, so
    ; the OR presumably sets a row count of 8 for the shared loop -- confirm.
    imul            r6d, [cq], 181
    mov             [cq], eobd
    or              r3d, 8
    add             r6d, 640
    sar             r6d, 10
    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
ALIGN function_align
.main_fast3:
    ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
    vbroadcasti32x4 m5, [o(pd_401_4076)]
    pmulld          m3, m0, m12
    pmulld          m4, m5
    REPX {paddd x, m13}, m3, m4
    REPX {psrad x, 12 }, m3, m4             ; m3=idct8:t0-7, m4=t8a t15a

    ; t8a t15a -> t8/9 t14/15

    vbroadcasti32x4 m5, [o(pd_3784_m3784)]
    pshufd          m7, m4, q1032
    pmulld          m6, m4, [o(pd_1567)]{bcstd}
    pmulld          m5, m7
    paddd           m6, m13
    paddd           m5, m6
    psrad           m5, 12                  ; m5=t9a t14a

    ; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4]

    shufps          m6, m4, m5, q1032       ; t12 t13
; inv_txfm_add_dct_dct_32x8_10bpc.main_fast3, continued.
; Products are rounded by adding m13 and shifting right by 12; butterfly
; outputs are clamped to [m14, m15].  The function entry broadcasts pd_2048
; into m13 and clip_18b_min/clip_18b_max into m14/m15.
    shufps          m8, m4, m5, q3210       ; t11a t10
    pmulld          m9, m6, m12
    pmulld          m7, m8, m12
    paddd           m9, m13
    paddd           m5, m9, m7              ; t12 t13a
    psubd           m4, m9, m7              ; t11 t10a
    REPX {psrad x, 12 }, m5, m4

    psubd           m7, m3, m6              ; dct16 out15 out14
    paddd           m0, m3, m6              ; dct16 out0  out1
    psubd           m6, m3, m5              ; dct16 out12 out13
    paddd           m1, m3, m5              ; dct16 out3  out2
    psubd           m5, m3, m4              ; dct16 out11 out10
    paddd           m2, m3, m4              ; dct16 out4  out5
    psubd           m4, m3, m8              ; dct16 out8  out9
    paddd           m3, m8                  ; dct16 out7  out6
    REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
    REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7

    ; idct32_bottomhalf
    vbroadcasti32x4 m18, [o(pd_201_m601)]
    vbroadcasti32x4 m19, [o(pd_4091_4052)]
    pmulld          m17, m16, m19
    pmulld          m16, m18
    REPX {paddd x, m13}, m17, m16
    REPX {psrad x, 12 }, m17, m16

    ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]

    vbroadcasti32x4 m10, [o(pd_799_m2276)]
    vbroadcasti32x4 m11, [o(pd_4017_3406)]
    pmulld          m18, m17, m10
    pmulld          m19, m17, m11
    pmulld          m8, m16, m11
    pmulld          m9, m16, m10
    REPX {paddd x, m13}, m18, m19
    psubd           m18, m8
    paddd           m19, m9
    REPX {psrad x, 12 }, m18, m19

    ; m17=t31 t24 -> t28/31a t24/27a, m16=t16 t23 -> t16/19a t20/23a
    ; m18=t17a t22a -> t17/18 t21/22, m19=t30a t25a -> t29/30 t25/26

    punpckhqdq      m23, m17, m19           ; t24a t25 [or t27a t26]
    punpcklqdq      m20, m16, m18           ; t16a t17 [or t19a t18]
    punpckhqdq      m22, m16, m18           ; t23a t22 [or t20a t21]
    punpcklqdq      m16, m17, m19           ; t28a t29 [or t31a t30]
    mova            m21, m23
    mova            m18, m20
    mova            m17, m22
    mova            m19, m16

    jmp .main4                              ; joins the common path at its last stage
.main_fast2: ; bottom three-quarters are zero
    vbroadcasti32x4 m8, [o(pd_799_4017)]
    pmulld          m8, m1                  ; t4 t7
    vpmulld         m0, [o(pd_2896)] {1to16} ; t0 t1
    REPX {paddd x, m13}, m8, m0
    REPX {psrad x, 12 }, m8, m0
    pmulld          m3, m8, m12
; inv_txfm_add_dct_dct_32x8_10bpc.main_fast2, continued.
    mova            m2, m0                  ; t3 t2
    call m(idct_8x8_internal_10bpc).main3
    vbroadcasti32x4 m6, [o(pd_4076_3920)]
    vbroadcasti32x4 m3, [o(pd_401_m1189)]
    pmulld          m6, m4                  ; t15 t12
    pmulld          m4, m3                  ; t9  t10
    REPX {paddd x, m13}, m6, m4
    REPX {psrad x, 12 }, m6, m4
    mova            m5, m6                  ; t14 t13
    mova            m9, m4                  ; t8  t11
    call m(idct_16x8_internal_10bpc).main3
    vbroadcasti32x4 m23, [o(pd_4091_3973)]
    vbroadcasti32x4 m7, [o(pd_201_995)]
    vbroadcasti32x4 m22, [o(pd_1380_601)]
    vbroadcasti32x4 m9, [o(pd_3857_4052)]
    pmulld          m23, m16                ; t16 t20
    pmulld          m16, m7                 ; t31 t27
    pmulld          m22, m17                ; -t19 -t23
    pmulld          m17, m9                 ; t28 t24
    REPX {paddd x, m13}, m23, m16, m17
    psubd           m22, m13, m22           ; negate and add the rounding bias in one step
    REPX {psrad x, 12 }, m23, m16, m22, m17
    mova            m20, m23                ; t30 t26
    mova            m9, m16                 ; t17 t21
    mova            m19, m22                ; t18 t22
    mova            m18, m17                ; t29 t25
    jmp .main3
.main_fast: ; bottom half is zero
    ; NOTE(review): .main2 below uses m16/m18 as the t16/t18 side and m23/m21
    ; as the t31/t29 side, so the labels on those four pmulld lines look
    ; pairwise swapped (likewise the first two in .main_fast2) -- confirm.
    vbroadcasti32x4 m23, [o(pd_4091_3973)]
    vbroadcasti32x4 m7, [o(pd_201_995)]
    vbroadcasti32x4 m20, [o(pd_2751_2106)]
    vbroadcasti32x4 m9, [o(pd_3035_3513)]
    vbroadcasti32x4 m21, [o(pd_3703_3290)]
    vbroadcasti32x4 m10, [o(pd_1751_2440)]
    vbroadcasti32x4 m22, [o(pd_1380_601)]
    vbroadcasti32x4 m11, [o(pd_3857_4052)]
    pmulld          m23, m16                ; t16a t20a
    pmulld          m16, m7                 ; t31a t27a
    pmulld          m20, m19                ; -t17a -t21a
    pmulld          m19, m9                 ; t30a t26a
    pmulld          m21, m18                ; t18a t22a
    pmulld          m18, m10                ; t29a t25a
    pmulld          m22, m17                ; -t19a -t23a
    pmulld          m17, m11                ; t28a t24a
    psubd           m20, m13, m20           ; negate and add the rounding bias in one step
    psubd           m22, m13, m22
    jmp .main2
.main:
    ITX_MULSUB_2D   16, 23, 7, 9, 10, _, 201_995, 4091_3973
    ITX_MULSUB_2D   20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
    ITX_MULSUB_2D   18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
    ITX_MULSUB_2D   22, 17, 7, 9, 10, _, 3857_4052, 1380_601
    paddd           m20, m13
    paddd           m22, m13
.main2:
    ; m20/m22 already carry the rounding bias on every path into this label;
    ; the other six registers get it here before the common >> 12.
    REPX {paddd x, m13}, m16, m23, m19
    REPX {psrad x, 12 }, m16, m20, m23, m19
    psubd           m9, m16, m20            ; t17 t21
    paddd           m16, m20                ; t16 t20
    psubd           m20, m23, m19           ; t30 t26
    paddd           m23, m19                ; t31 t27
    REPX {pmaxsd x, m14}, m9, m16, m20, m23
    REPX {paddd x, m13}, m21, m18, m17
    REPX {psrad x, 12 }, m18, m22, m21, m17
    psubd           m19, m22, m18           ; t18 t22
    paddd           m22, m18                ; t19 t23
    psubd           m18, m17, m21           ; t29 t25
    paddd           m17, m21                ; t28 t24
    REPX {pmaxsd x, m14}, m19, m22, m18, m17
    REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
.main3:
    ; The final dct16 butterflies (m0-m8) are interleaved with the next
    ; rotation stage of the odd half.
    vbroadcasti32x4 m11, [o(pd_4017_2276)]
    vbroadcasti32x4 m10, [o(pd_799_3406)]
    psubd           m7, m0, m6              ; dct16 out15 out14
    paddd           m0, m6                  ; dct16 out0  out1
    psubd           m6, m1, m5              ; dct16 out12 out13
    paddd           m1, m5                  ; dct16 out3  out2
    psubd           m5, m2, m4              ; dct16 out11 out10
    paddd           m2, m4                  ; dct16 out4  out5
    psubd           m4, m3, m8              ; dct16 out8  out9
    paddd           m3, m8                  ; dct16 out7  out6
    ITX_MULSUB_2D   20, 9, 8, 21, _, 13, 10, 11
    ITX_MULSUB_2D   18, 19, 8, 21, _, 13, 10, 11, 2
    REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
    punpckhqdq      m21, m16, m20           ; t20 t21a
    punpcklqdq      m16, m20                ; t16 t17a
    punpcklqdq      m20, m22, m19           ; t19 t18a
    punpckhqdq      m22, m19                ; t23 t22a
    REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
    punpcklqdq      m19, m23, m9            ; t31 t30a
    punpckhqdq      m23, m9                 ; t27 t26a
    punpckhqdq      m9, m17, m18            ; t24 t25a
    punpcklqdq      m17, m18                ; t28 t29a
    psubd           m18, m16, m20           ; t19a t18
    paddd           m20, m16                ; t16a t17
    psubd           m16, m19, m17           ; t28a t29
    paddd           m19, m17                ; t31a t30
    psubd           m17, m22, m21           ; t20a t21
    paddd           m22, m21                ; t23a t22
    psubd           m21, m9, m23            ; t27a t26
    paddd           m23, m9                 ; t24a t25
    REPX {pmaxsd x, m14}, m18, m16, m17, m21
    REPX {pminsd x, m15}, m16, m18, m21, m17
    REPX {pmaxsd x, m14}, m20, m22, m19, m23
    REPX {pminsd x, m15}, m20, m22, m19, m23
.main4:
    vpbroadcastd    m11, [o(pd_3784)]
    vpbroadcastd    m10, [o(pd_1567)]
    ITX_MULSUB_2D   16, 18, 8, 9, _, 13, 10, 11
    ITX_MULSUB_2D   21, 17, 8, 9, _, 13, 10, 11, 2
    paddd           m9, m20, m22            ; t16 t17a
    psubd           m20, m22                ; t23 t22a
    paddd           m22, m19, m23           ; t31 t30a
    psubd           m19, m23                ; t24 t25a
    psubd           m23, m16, m17           ; t20a t21
    paddd           m16, m17                ; t19a t18
    psubd           m17, m18, m21           ; t27a t26
    paddd           m21, m18                ; t28a t29
    REPX {pmaxsd x, m14}, m20, m19, m23, m17
    REPX {pminsd x, m15}, m19, m20, m17, m23
    REPX {pmulld x, m12}, m19, m20, m17, m23
    REPX {pmaxsd x, m14}, m22, m21, m16, m9
    paddd           m19, m13                ; bias added once per pair; it reaches both the sum and the difference
    paddd           m17, m13
    REPX {pminsd x, m15}, m22, m21, m16, m9
    psubd           m18, m19, m20           ; t23a t22
    paddd           m19, m20                ; t24a t25
    paddd           m20, m17, m23           ; t27 t26a
    psubd           m17, m23                ; t20 t21a
    REPX {psrad x, 12 }, m20, m19, m18, m17
    ret
.transpose_8x32:
    ; Word-level shuffle of m0-m7 driven by the idct32x8p table: the control
    ; vector is used as loaded, shifted right by 8 within each word, and each
    ; of those rotated by 16 within each dword, followed by dword interleaves.
    ; Results end up in m0-m8.
    mova            m10, [o(idct32x8p)]
    psrlw           m8, m10, 8
    mova            m9, m8
    vpermi2w        m8, m1, m5
    vpermt2w        m1, m10, m5
    vprold          m5, m9, 16
    vpermi2w        m9, m3, m7
    vpermt2w        m3, m10, m7
    vprold          m10, 16
    mova            m7, m5
    vpermi2w        m5, m0, m4
    vpermt2w        m0, m10, m4
    vpermi2w        m7, m2, m6
    vpermt2w        m2, m10, m6
    punpckhdq       m6, m5, m8
    punpckldq       m5, m8
    punpckhdq       m8, m7, m9
    punpckldq       m7, m9
    punpckhdq       m4, m2, m3
    punpckldq       m2, m3
    punpckhdq       m3, m0, m1
    punpckldq       m0, m1
    ret

; 32x8 identity/identity inverse transform + add, 10 bpc.
; Per .loop iteration: eight consecutive 64-byte coefficient vectors are
; packed pairwise to 16 bit with signed saturation, scaled with pmulhrsw by
; the pw_4096 constant and cleared in memory; the words are rearranged with
; the idtx32x8p permutation (and its per-word >> 8 variant), added to 8 rows
; of 16 pixels, clamped to [0, pixel_10bpc_max] and stored.  cq advances by
; 64*8 bytes and dstq by 32 bytes per iteration.  The body runs twice when
; eob >= 107 and once otherwise.
cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
    vpbroadcastd    m5, [pw_4096]
    lea             r4, [strideq*3]
    mova            m6, [idtx32x8p]
    lea             r5, [strideq*5]
    vpbroadcastd    m9, [pixel_10bpc_max]   ; upper clamp bound
    lea             r6, [strideq+r4*2]      ; stride*7
    pxor            m8, m8                  ; zero: clears coefficients, lower clamp bound
    sub             eobd, 107               ; sign bit set <=> eob < 107 (tested at the loop end)
    psrlw           m7, m6, 8
.loop:
    mova            m0, [cq+64*0]
    packssdw        m0, [cq+64*1]           ; 02 13
    mova            m1, [cq+64*2]
    packssdw        m1, [cq+64*3]           ; 46 57
    mova            m2, [cq+64*4]
    packssdw        m2, [cq+64*5]           ; 8a 9b
    mova            m3, [cq+64*6]
    packssdw        m3, [cq+64*7]           ; ce df
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
    mova            m4, m6
    vpermi2w        m4, m1, m3
    vpermt2w        m1, m7, m3
    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
    mova            m3, m7
    vpermi2w        m3, m0, m2
    vpermt2w        m0, m6, m2
    add             cq, 64*8
    punpcklqdq      m2, m3, m1              ; 4 5
    punpckhqdq      m3, m1                  ; 6 7
    punpckhqdq      m1, m0, m4              ; 2 3
    punpcklqdq      m0, m4                  ; 0 1
    ; m4 is reused as the destination temp: two 32-byte rows per 64-byte vector.
    mova            ym4, [dstq+strideq*0]
    vinserti32x8    m4, [dstq+strideq*1], 1
    paddw           m0, m4
    mova            ym4, [dstq+strideq*2]
    vinserti32x8    m4, [dstq+r4     *1], 1
    paddw           m1, m4
    mova            ym4, [dstq+strideq*4]
    vinserti32x8    m4, [dstq+r5     *1], 1
    paddw           m2, m4
    mova            ym4, [dstq+r4     *2]
    vinserti32x8    m4, [dstq+r6     *1], 1
    paddw           m3, m4
    REPX {pmaxsw x, m8}, m0, m1, m2, m3
    REPX {pminsw x, m9}, m0, m1, m2, m3
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    mova            [dstq+strideq*2], ym1
    vextracti32x8   [dstq+r4     *1], m1, 1
    mova            [dstq+strideq*4], ym2
    vextracti32x8   [dstq+r5     *1], m2, 1
    mova            [dstq+r4     *2], ym3
    vextracti32x8   [dstq+r6     *1], m3, 1
    add             dstq, 32
    ; Adding 0x80000000 carries exactly when the sign bit is already set, so
    ; the first pass loops again only if eob - 107 >= 0; a second pass exits.
    add             eobd, 0x80000000
    jnc .loop
    RET

; 16x32 DCT/DCT inverse transform + add, 10 bpc.
; eob == 0 -> .dconly; eob < 36 -> .fast; otherwise .pass1 runs once and
; eob >= 151 additionally takes .full.
cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test            eobd, eobd
    jz .dconly
    vpbroadcastd    m12, [o(pd_2896)]
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
%if WIN64
    ; NOTE(review): [rsp+8]/[rsp+24] is presumably the caller-provided Win64
    ; shadow space -- confirm that nothing has been pushed by the prologue.
    movaps          [rsp+ 8], xmm6
    movaps          [rsp+24], xmm7
%endif
    cmp             eobd, 36
    jl .fast
    call .pass1
    cmp             eobd, 151
    jge .full
    ; 36 <= eob < 151: the packed first-pass output is expanded word-wise into
    ; the registers the 8 bpc second pass is called with (x,x = word duplicated;
    ; "__ n" = zero in the low word, n in the high word).
    lea             r5, [o_base_8bpc]
    pxor            m9, m9
    punpcklwd       m8, m1, m1              ;  2
    punpckhwd       m14, m1, m1             ;  3
    punpcklwd       m1, m3, m3              ;  6
    punpckhwd       m15, m3, m3             ;  7
    punpcklwd       m3, m6, m6              ; 12
    punpckhwd       m19, m6, m6             ; 13
    punpcklwd       m6, m9, m4              ; __  8
    punpckhwd       m20, m4, m4             ;  9
    punpckhwd       m16, m5, m5             ; 11
    punpcklwd       m5, m5                  ; 10
    punpcklwd       m9, m0                  ; __  0
    punpckhwd       m21, m0, m0
;  1    <- lane label belonging to the punpckhwd m21 on the previous line
; inv_txfm_add_dct_dct_16x32_10bpc, continued (36 <= eob < 151 path).
    punpcklwd       m0, m7, m7              ; 14
    punpckhwd       m17, m7, m7             ; 15
    punpcklwd       m7, m2, m2              ;  4
    punpckhwd       m18, m2, m2             ;  5
    call m(idct_16x16_internal_8bpc).main_fast
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    mov             r6d, 64*3
    pxor            m8, m8
.zero_loop:
    ; Clears the first 64 bytes of each of the sixteen 128-byte rows in
    ; cq[0..2047]: four rows per iteration, r6 = 192, 128, 64, 0.
    REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
    sub             r6d, 64
    jge .zero_loop
    jmp .pass2_end
.full:
    ; eob >= 151: the packed results of the first .pass1 call are parked in
    ; the first halves of rows 0-7, cq is moved to the second 64-byte half of
    ; each row, and .pass1 runs again.
    mova            [cq+128*0], m0
    mova            [cq+128*1], m1
    mova            [cq+128*2], m2
    mova            [cq+128*3], m3
    mova            [cq+128*4], m4
    mova            [cq+128*5], m5
    mova            [cq+128*6], m6
    mova            [cq+128*7], m7
    add             cq, 64
    call .pass1
    ; Reload the parked vectors; cq-64 is the original cq.
    mova            m9, [cq-64* 1]          ;  0  1
    mova            m14, [cq+64* 1]         ;  2  3
    mova            m18, [cq+64* 3]         ;  4  5
    mova            m15, [cq+64* 5]         ;  6  7
    mova            m20, [cq+64* 7]         ;  8  9
    mova            m16, [cq+64* 9]         ; 10 11
    mova            m22, [cq+64*11]         ; 12 13
    mova            m19, [cq+64*13]         ; 14 15
    lea             r5, [o_base_8bpc]
    punpcklwd       m8, m7, m14             ; 30  2
    punpckhwd       m21, m7, m9             ; 31  1
    punpcklwd       m7, m6, m18             ; 28  4
    punpckhwd       m14, m6                 ;  3 29
    punpcklwd       m9, m0, m9              ; 16  0
    punpckhwd       m17, m19, m0            ; 15 17
    punpcklwd       m0, m19, m1             ; 14 18
    punpckhwd       m19, m1, m22            ; 19 13
    punpcklwd       m1, m15, m5             ;  6 26
    punpckhwd       m18, m5, m18            ; 27  5
    punpcklwd       m6, m4, m20             ; 24  8
    punpckhwd       m15, m4                 ;  7 25
    punpcklwd       m5, m3, m16             ; 22 10
    punpckhwd       m20, m3, m20            ; 23  9
    punpcklwd       m3, m22, m2             ; 12 20
    punpckhwd       m16, m2                 ; 11 21
    call m(idct_16x16_internal_8bpc).main2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    mov             r6d, 32*7
    pxor            m8, m8
.full_zero_loop:
    ; cq was advanced by 64 above, so offsets -64..+128 around cq+r6*8 cover
    ; 256 bytes per iteration and the eight iterations clear all 2048 bytes
    ; of the coefficient buffer.
    REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
    sub             r6d, 32
    jge .full_zero_loop
    jmp .pass2_end
.fast:
    ; eob < 36: only the first 32 bytes of rows 0-7 are read.
    mova            ym0, [cq+128*0]
    mova            ym2, [cq+128*4]
    movshdup        m8, [o(permB)]
    mova            ym1, [cq+128*2]
    mova            ym3, [cq+128*6]
    mova            ym4, [cq+128*1]
    mova            ym5, [cq+128*3]
    mova            ym6, [cq+128*5]
    mova            ym7, [cq+128*7]
; inv_txfm_add_dct_dct_16x32_10bpc.fast, continued.
; The eight 32-byte inputs are merged pairwise into full vectors, scaled as
; (x*m12 + m13) >> 12 (the function entry broadcasts pd_2896 and pd_2048 into
; m12/m13), and the 32 bytes that were read from each row are cleared.
    vpermt2q        m0, m8, m2              ; 0 4
    vpermt2q        m1, m8, m3              ; 2 6
    vpermt2q        m4, m8, m5              ; 1 3
    vpermt2q        m7, m8, m6              ; 7 5
    REPX {pmulld x, m12}, m0, m1, m4, m7
    pxor            ym16, ym16
    mova            [cq+128*0], ym16
    REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
    REPX {paddd x, m13}, m0, m1, m4, m7
    REPX {psrad x, 12 }, m0, m1, m4, m7
    call m(idct_8x8_internal_10bpc).main_fast
    call m(idct_16x8_internal_10bpc).main_fast
    vpbroadcastd    m11, [o(pd_1)]
    call m(idct_8x16_internal_10bpc).main_end2
    mova            m8, [o(idct8x32p)]
    packssdw        m0, m4
    packssdw        m1, m5
    packssdw        m2, m6
    packssdw        m3, m7
    mova            m6, [dup16_perm]
    vpermb          m0, m8, m0
    vpermb          m2, m8, m2
    vprold          m8, 16
    vpermb          m1, m8, m1
    vpermb          m3, m8, m3
    punpckldq       m4, m0, m2
    punpckhdq       m0, m2
    punpckldq       m2, m1, m3
    punpckhdq       m1, m3
    punpckldq       m21, m4, m2
    punpckhdq       m14, m4, m2
    punpckldq       m18, m0, m1
    punpckhdq       m15, m0, m1
    vpermb          m8, m6, m14             ; 2
    vpermb          m1, m6, m15             ; 6
    vpermb          m7, m6, m18             ; 4
    pmovzxwd        m9, ym21                ; 0
    vpord           m6, [o(pb_32)] {1to16}  ; OR the broadcast pb_32 constant into the byte-permute control
    lea             r5, [o_base_8bpc]
    vpermb          m21, m6, m21            ; 1
    vpermb          m15, m6, m15            ; 7
    vpermb          m18, m6, m18            ; 5
    vpermb          m14, m6, m14            ; 3
    pslld           m9, 16                  ; move the zero-extended words into the high half of each dword
    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
.pass2_end:
    ; Common output stage: each register pair is qword-permuted (controls
    ; m22 = movshdup of permC, m23 = m22 >> 8 per qword) into m8/m9 and handed
    ; to the 16x8 .write_16x4 helper.  m11/m12/m13/r6 are loaded here before
    ; the first call.
    ; NOTE(review): they are presumably the helper's rounding multiplier,
    ; lower/upper clamp bounds and stride*3 -- .write_16x4 is not visible;
    ; confirm.
    movshdup        m22, [permC]
    vpbroadcastd    m11, [pw_2048]
    vpbroadcastd    m13, [pixel_10bpc_max]
    lea             r6, [strideq*3]
    pxor            m12, m12
    psrlq           m23, m22, 8
    vpermq          m8, m22, m0
    vpermq          m9, m23, m1
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m2
    vpermq          m9, m23, m3
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m4
    vpermq          m9, m23, m5
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m6
    vpermq          m9, m23, m7
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m14
; inv_txfm_add_dct_dct_16x32_10bpc.pass2_end, continued: the remaining
; register pairs are permuted into m8/m9 and written four rows at a time.
    vpermq          m9, m23, m15
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m16
    vpermq          m9, m23, m17
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m18
    vpermq          m9, m23, m19
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m20
    vpermq          m9, m23, m21
%if WIN64
    movaps          xmm6, [rsp+ 8]          ; restore the registers saved at function entry
    movaps          xmm7, [rsp+24]
%endif
    ; NOTE(review): m8/m9 are still live for the tail call below; vzeroupper
    ; presumably leaves them intact because x86inc maps m0-m15 onto
    ; zmm16-zmm31 in AVX-512 builds -- confirm against x86inc.asm.
    vzeroupper
    jmp m(idct_16x8_internal_10bpc).write_16x4  ; tail call: its ret returns to our caller
.pass1:
    ; First pass over one 64-byte half of sixteen 128-byte rows: even rows are
    ; multiplied by m12 into m0-m7, odd rows into m16-m23, each group followed
    ; by the matching *_rect2 helper; m11 = pd_1 is set for main_end2, and
    ; main_end3 is tail-called.
    pmulld          m0, m12, [cq+128* 0]
    pmulld          m1, m12, [cq+128* 2]
    pmulld          m2, m12, [cq+128* 4]
    pmulld          m3, m12, [cq+128* 6]
    pmulld          m4, m12, [cq+128* 8]
    pmulld          m5, m12, [cq+128*10]
    pmulld          m6, m12, [cq+128*12]
    pmulld          m7, m12, [cq+128*14]
    call m(idct_8x16_internal_10bpc).main_rect2
    pmulld          m16, m12, [cq+128* 1]
    pmulld          m17, m12, [cq+128* 3]
    pmulld          m18, m12, [cq+128* 5]
    pmulld          m19, m12, [cq+128* 7]
    pmulld          m20, m12, [cq+128* 9]
    pmulld          m21, m12, [cq+128*11]
    pmulld          m22, m12, [cq+128*13]
    pmulld          m23, m12, [cq+128*15]
    call m(idct_16x16_internal_10bpc).main_rect2
    vpbroadcastd    m11, [o(pd_1)]
    call m(idct_16x16_internal_10bpc).main_end2
    jmp m(idct_16x16_internal_10bpc).main_end3
.dconly:
    ; Reached only through the test/jz at function entry, so eobd is zero and
    ; the store below clears the DC coefficient after it has been read.
    ; NOTE(review): r3d is eob's register under x86inc's argument mapping, so
    ; the OR presumably sets a row count of 32 for the shared 16x8 path --
    ; confirm.
    imul            r6d, [cq], 181
    mov             [cq], eobd
    or              r3d, 32
    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly

; 16x32 identity/identity inverse transform + add, 10 bpc.
; .main handles 16 destination rows by calling .main_internal twice.  With
; eob >= 151 it is first invoked as a subroutine, then cq is moved to the
; other 64-byte half of the rows (net +64), dstq is moved down another 8 rows
; (the writes already advanced it by 8) and execution falls into .main a
; second time.
cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
%undef cmp
    vpbroadcastd    m10, [pw_2896x8]
    vpbroadcastd    m11, [pw_1697x16]
    vpbroadcastd    m13, [pw_8192]
    vpbroadcastd    m15, [pixel_10bpc_max]  ; upper clamp bound
    lea             r6, [strideq*9]
    pxor            m14, m14                ; zero: clears coefficients, lower clamp bound
    paddw           m12, m13, m13           ; pw_16384
    cmp             eobd, 151
    jl .main
    call .main
    add             cq, 64-128*4
    lea             dstq, [dstq+strideq*8]
.main:
    call .main_internal
    add             cq, 128*4
    ; Scale the first call's outputs (m2/m4/m6/m8) into m1/m3/m5/m7 so the
    ; second call can reuse the even registers.
    pmulhrsw        m1, m13, m2
    pmulhrsw        m3, m13, m4
    pmulhrsw        m5, m13, m6
    pmulhrsw        m7, m13, m8
    call .main_internal
.main2:
; inv_txfm_add_identity_identity_16x32_10bpc.main2 body (the label itself ends
; the previous line).  The second .main_internal result is scaled by m13, then
; each qword interleave with the first result yields two vectors that
; .write_16x2x2 adds to rows n, n+8 (m0) and n+1, n+9 (m1).
    pmulhrsw        m2, m13
    pmulhrsw        m4, m13
    pmulhrsw        m6, m13
    pmulhrsw        m8, m13
    punpcklqdq      m0, m1, m2              ; 0  8
    punpckhqdq      m1, m2                  ; 1  9
    call .write_16x2x2
    punpcklqdq      m0, m3, m4              ; 2 10
    punpckhqdq      m1, m3, m4              ; 3 11
    call .write_16x2x2
    punpcklqdq      m0, m5, m6              ; 4 12
    punpckhqdq      m1, m5, m6              ; 5 13
    call .write_16x2x2
    punpcklqdq      m0, m7, m8              ; 6 14
    punpckhqdq      m1, m7, m8              ; 7 15
    ; falls through: the ret of .write_16x2x2 also ends .main
.write_16x2x2:
    ; Adds m0 to rows dstq+0 / dstq+stride*8 and m1 to rows dstq+stride /
    ; dstq+r6 (32 bytes each), clamps to [m14, m15], stores the rows back and
    ; advances dstq by two rows.  Clobbers m2 and m9.
    mova            ym2, [dstq+strideq*0]
    vinserti32x8    m2, [dstq+strideq*8], 1
    mova            ym9, [dstq+strideq*1]
    vinserti32x8    m9, [dstq+r6       ], 1
    paddw           m0, m2
    paddw           m1, m9
    pmaxsw          m0, m14
    pmaxsw          m1, m14
    pminsw          m0, m15
    pminsw          m1, m15
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*8], m0, 1
    mova            [dstq+strideq*1], ym1
    vextracti32x8   [dstq+r6       ], m1, 1
    lea             dstq, [dstq+strideq*2]
    ret
.main_internal:
    ; Packs rows 0-3 with rows 8-11 (128-byte pitch) to 16 bit with signed
    ; saturation, scales by m10 (pmulhrsw), then adds to each value x the term
    ; pmulhrsw(pmulhrsw(x, m11), m12) with saturation, clears the eight source
    ; rows and transposes at word/dword granularity.  Results are returned in
    ; m2, m4, m6, m8; m0 and m9 are clobbered.
    mova            m8, [cq+128* 0]
    packssdw        m8, [cq+128* 8]
    mova            m6, [cq+128* 1]
    packssdw        m6, [cq+128* 9]
    mova            m0, [cq+128* 2]
    packssdw        m0, [cq+128*10]
    mova            m2, [cq+128* 3]
    packssdw        m2, [cq+128*11]
    REPX {pmulhrsw x, m10}, m8, m6, m0, m2
    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    pmulhrsw        m4, m11, m8
    pmulhrsw        m9, m11, m6
    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
    pmulhrsw        m4, m12
    pmulhrsw        m9, m12
    paddsw          m8, m4
    paddsw          m6, m9
    pmulhrsw        m4, m11, m0
    pmulhrsw        m9, m11, m2
    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
    pmulhrsw        m4, m12
    pmulhrsw        m9, m12
    paddsw          m0, m4
    paddsw          m2, m9
    punpcklwd       m4, m8, m6
    punpckhwd       m8, m6
    punpcklwd       m6, m0, m2
    punpckhwd       m0, m2
    punpckldq       m2, m4, m6              ; 0 1
    punpckhdq       m4, m6                  ; 2 3
    punpckldq       m6, m8, m0              ; 4 5
    punpckhdq       m8, m0                  ; 6 7
    ret

; 32x16 DCT/DCT inverse transform + add, 10 bpc (the function continues on the
; following lines).  eob == 0 takes the .dconly shortcut.
cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test            eobd, eobd
    jz .dconly
    vpbroadcastd    m12, [o(pd_2896)]
3097 vpbroadcastd m13, [o(pd_2048)] 3098 vpbroadcastd m14, [o(clip_18b_min)] 3099 vpbroadcastd m15, [o(clip_18b_max)] 3100%if WIN64 3101 movaps [rsp+ 8], xmm6 3102 movaps [rsp+24], xmm7 3103%endif 3104 mov r6d, 8*12 3105 cmp eobd, 36 3106 jl .fast 3107 pmulld m0, m12, [cq+64* 0] 3108 pmulld m1, m12, [cq+64* 4] 3109 pmulld m2, m12, [cq+64* 8] 3110 pmulld m3, m12, [cq+64*12] 3111 pmulld m16, m12, [cq+64* 2] 3112 pmulld m17, m12, [cq+64* 6] 3113 pmulld m18, m12, [cq+64*10] 3114 pmulld m19, m12, [cq+64*14] 3115 cmp eobd, 151 3116 jge .full 3117 call m(idct_8x16_internal_10bpc).main_fast_rect2 3118 call m(idct_16x16_internal_10bpc).main_fast_rect2 3119 call .idct16_sumsub 3120 call .pass1_load_spill 3121 call .main_fast_rect2 3122 jmp .pass1_end 3123.full: 3124 pmulld m4, m12, [cq+64*16] 3125 pmulld m5, m12, [cq+64*20] 3126 pmulld m6, m12, [cq+64*24] 3127 pmulld m7, m12, [cq+64*28] 3128 pmulld m20, m12, [cq+64*18] 3129 pmulld m21, m12, [cq+64*22] 3130 pmulld m22, m12, [cq+64*26] 3131 pmulld m23, m12, [cq+64*30] 3132 add r6d, 8*16 3133 call m(idct_8x16_internal_10bpc).main_rect2 3134 call m(idct_16x16_internal_10bpc).main_rect2 3135 call .idct16_sumsub 3136 call .pass1_load_spill 3137 pmulld m16, m12, [cq+64*17] 3138 pmulld m17, m12, [cq+64*19] 3139 pmulld m18, m12, [cq+64*21] 3140 pmulld m19, m12, [cq+64*23] 3141 pmulld m20, m12, [cq+64*25] 3142 pmulld m21, m12, [cq+64*27] 3143 pmulld m22, m12, [cq+64*29] 3144 pmulld m23, m12, [cq+64*31] 3145 call .main_rect2 3146.pass1_end: 3147 vpbroadcastd m11, [o(pd_1)] 3148 lea r4, [cq+64] 3149 call .idct32_pass1_end 3150 lea r5, [o_base_8bpc] 3151 punpckhqdq m19, m5, m16 ; 11 3152 punpcklqdq m5, m16 ; 10 3153 punpckhqdq m16, m2, m1 ; 5 3154 punpcklqdq m2, m1 ; 4 3155 punpcklqdq m1, m15, m4 ; 2 3156 punpckhqdq m15, m4 ; 3 3157 punpcklqdq m4, m14, m18 ; 8 3158 punpckhqdq m18, m14, m18 ; 9 3159 punpckhqdq m14, m0, m20 ; 1 3160 punpcklqdq m0, m20 ; 0 3161 punpckhqdq m20, m6, m17 ; 13 3162 punpcklqdq m6, m17 ; 12 3163 punpckhqdq m17, 
m3, m21 ; 7 3164 punpcklqdq m3, m21 ; 6 3165 punpckhqdq m21, m7, m8 ; 15 3166 punpcklqdq m7, m8 ; 14 3167 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 3168 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 3169 jmp .end 3170.fast: 3171 pmulld ym0, ym12, [cq+64*0] 3172 pmulld ym1, ym12, [cq+64*4] 3173 movshdup m7, [o(permB)] 3174 mova ym4, [cq+64*2] 3175 mova ym5, [cq+64*6] 3176 mova ym16, [cq+64*1] 3177 mova ym2, [cq+64*5] 3178 mova ym3, [cq+64*3] 3179 mova ym17, [cq+64*7] 3180 vpermt2q m4, m7, m5 ; 2 6 3181 vpermt2q m16, m7, m2 ; 1 5 3182 vpermt2q m17, m7, m3 ; 7 3 3183 paddd ym0, ym13 3184 paddd ym1, ym13 3185 psrad ym0, 12 3186 psrad ym1, 12 3187 vpermq m0, m7, m0 ; 0 0 3188 vpermq m1, m7, m1 ; 4 4 3189 REPX {pmulld x, m12}, m4, m16, m17 3190 REPX {paddd x, m13}, m4, m16, m17 3191 REPX {psrad x, 12 }, m4, m16, m17 3192 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 3193 vpbroadcastd m11, [o(pd_1)] 3194 call m(idct_16x16_internal_10bpc).main_end2 3195 call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 3196 lea r5, [o_base_8bpc] 3197 punpckhqdq m14, m0, m2 ; 1 3198 punpcklqdq m0, m2 ; 0 3199 punpcklqdq m1, m3, m4 ; 2 3200 punpckhqdq m15, m3, m4 ; 3 3201 punpcklqdq m2, m5, m7 ; 4 3202 punpckhqdq m16, m5, m7 ; 5 3203 punpcklqdq m3, m6, m8 ; 6 3204 punpckhqdq m17, m6, m8 ; 7 3205 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 3206.end: 3207%if WIN64 3208 movaps xmm6, [rsp+ 8] 3209 movaps xmm7, [rsp+24] 3210%endif 3211 pxor m12, m12 3212.zero_loop: 3213 mova [cq+r6*8+64*3], m12 3214 mova [cq+r6*8+64*2], m12 3215 mova [cq+r6*8+64*1], m12 3216 mova [cq+r6*8+64*0], m12 3217 sub r6d, 8*4 3218 jge .zero_loop 3219 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start 3220 pmulhrsw m0, m11, m14 3221 pmulhrsw m1, m11, m15 3222 pmulhrsw m2, m11, m16 3223 pmulhrsw m3, m11, m17 3224 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 3225 pmulhrsw m0, m11, m18 3226 pmulhrsw m1, m11, m19 3227 pmulhrsw m2, m11, m20 3228 pmulhrsw m3, m11, m21 3229 vzeroupper 
3230 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 3231.dconly: 3232 imul r6d, [cq], 181 3233 mov [cq], eobd 3234 or r3d, 16 3235.dconly3: 3236 add r6d, 128 3237 sar r6d, 8 3238 imul r6d, 181 3239 add r6d, 384 3240 sar r6d, 9 3241.dconly2: 3242 vpbroadcastd m3, [o(dconly_10bpc)] 3243 imul r6d, 181 3244 add r6d, 2176 3245 sar r6d, 12 3246 vpbroadcastw m2, r6d 3247 paddsw m2, m3 3248.dconly_loop: 3249 paddsw m0, m2, [dstq+strideq*0] 3250 paddsw m1, m2, [dstq+strideq*1] 3251 psubusw m0, m3 3252 psubusw m1, m3 3253 mova [dstq+strideq*0], m0 3254 mova [dstq+strideq*1], m1 3255 lea dstq, [dstq+strideq*2] 3256 sub r3d, 2 3257 jg .dconly_loop 3258 RET 3259ALIGN function_align 3260.idct16_sumsub: 3261 psubd m23, m0, m22 ; t15 3262 paddd m0, m22 ; t0 3263 psubd m22, m1, m21 ; t14 3264 paddd m1, m21 ; t1 3265 REPX {pmaxsd x, m14}, m23, m0, m22, m1 3266 psubd m21, m2, m20 ; t13 3267 paddd m2, m20 ; t2 3268 REPX {pminsd x, m15}, m23, m0, m22, m1 3269 psubd m20, m3, m19 ; t12 3270 paddd m3, m19 ; t3 3271 REPX {pmaxsd x, m14}, m21, m2, m20, m3 3272 psubd m19, m4, m18 ; t11 3273 paddd m4, m18 ; t4 3274 REPX {pminsd x, m15}, m21, m2, m20, m3 3275 psubd m18, m5, m17 ; t10 3276 paddd m5, m17 ; t5 3277 REPX {pmaxsd x, m14}, m19, m4, m18, m5 3278 psubd m17, m6, m16 ; t9 3279 paddd m6, m16 ; t6 3280 REPX {pminsd x, m15}, m19, m4, m18, m5 3281 psubd m16, m7, m9 ; t8 3282 paddd m7, m9 ; t7 3283 REPX {pmaxsd x, m14}, m17, m6, m16, m7 3284 REPX {pminsd x, m15}, m17, m6, m16, m7 3285 ret 3286.idct32_pass1_end: 3287 psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11 3288 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 3289%macro IDCT32_PASS1_END 2 ; low, high 3290 paddd m8, m11, [r4+128*%1] 3291 paddd m9, m11, [cq+128*%1] 3292 psubd m10, m8, m%1 ; out 16+n 3293 paddd m8, m%1 ; out 15-n 3294 paddd m%1, m9, m%2 ; out 0+n 3295 psubd m9, m%2 ; out 31-n 3296 REPX {vpsravd x, m11}, m10, m%1, m8, m9 3297 packssdw m%1, m10 ; 0+n 16+n 3298 packssdw m%2, m8, m9 ; 15-n 31-n 3299%endmacro 3300 IDCT32_PASS1_END 
0, 23 ; 0 16, 15 31 3301 IDCT32_PASS1_END 7, 16 ; 7 23, 8 24 3302 IDCT32_PASS1_END 1, 22 ; 1 17, 14 30 3303 IDCT32_PASS1_END 6, 17 ; 6 22, 9 25 3304 IDCT32_PASS1_END 2, 21 ; 2 18, 13 29 3305 IDCT32_PASS1_END 5, 18 ; 5 21, 10 26 3306 IDCT32_PASS1_END 3, 20 ; 3 19, 12 28 3307 IDCT32_PASS1_END 4, 19 ; 4 20, 11 27 3308.transpose_16x32: 3309 mova m14, m13 3310 vpermi2q m14, m0, m16 3311 vpermt2q m0, m12, m16 3312 mova m15, m13 3313 vpermi2q m15, m1, m17 3314 vpermt2q m1, m12, m17 3315 mova m16, m13 3316 vpermi2q m16, m2, m18 3317 vpermt2q m2, m12, m18 3318 mova m17, m13 3319 vpermi2q m17, m3, m19 3320 vpermt2q m3, m12, m19 3321 mova m18, m13 3322 vpermi2q m18, m4, m20 3323 vpermt2q m4, m12, m20 3324 mova m19, m13 3325 vpermi2q m19, m5, m21 3326 vpermt2q m5, m12, m21 3327 mova m20, m13 3328 vpermi2q m20, m6, m22 3329 vpermt2q m6, m12, m22 3330 mova m21, m13 3331 vpermi2q m21, m7, m23 3332 vpermt2q m7, m12, m23 3333 punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07 3334 punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03 3335 punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07 3336 punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03 3337 punpckhwd m1, m4, m5 ; e04 f04 e05 f05 e06 f06 e07 f07 3338 punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03 3339 punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07 3340 punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03 3341 punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15 3342 punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11 3343 punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15 3344 punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11 3345 punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15 3346 punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11 3347 punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15 3348 punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11 3349 punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07 3350 punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05 3351 punpckhdq 
m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11 3352 punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09 3353 punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11 3354 punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09 3355 punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01 3356 punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03 3357 punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13 3358 punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15 3359 punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03 3360 punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01 3361 punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05 3362 punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07 3363 punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15 3364 punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13 3365 ret 3366.pass1_load_spill: 3367 mova [cq+64* 0], m0 3368 mova [cq+64* 2], m1 3369 mova [cq+64* 4], m2 3370 mova [cq+64* 6], m3 3371 mova [cq+64* 8], m4 3372 mova [cq+64*10], m5 3373 mova [cq+64*12], m6 3374 mova [cq+64*14], m7 3375 pmulld m0, m12, [cq+64* 1] 3376 pmulld m1, m12, [cq+64* 3] 3377 pmulld m2, m12, [cq+64* 5] 3378 pmulld m3, m12, [cq+64* 7] 3379 pmulld m4, m12, [cq+64* 9] 3380 pmulld m5, m12, [cq+64*11] 3381 pmulld m6, m12, [cq+64*13] 3382 pmulld m7, m12, [cq+64*15] 3383 mova [cq+64* 1], m23 3384 mova [cq+64* 3], m22 3385 mova [cq+64* 5], m21 3386 mova [cq+64* 7], m20 3387 mova [cq+64* 9], m19 3388 mova [cq+64*11], m18 3389 mova [cq+64*13], m17 3390 mova [cq+64*15], m16 3391 ret 3392.main_fast2_rect2: 3393 REPX {paddd x, m13}, m0, m1, m2, m3 3394 REPX {psrad x, 12 }, m0, m1, m2, m3 3395.main_fast2: ; bottom 3/4 is zero 3396 pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a 3397 pmulld m0, [o(pd_201)] {1to16} ; t16a 3398 pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a 3399 pmulld m3, [o(pd_3857)] {1to16} ; t28a 3400 pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a 3401 pmulld m2, [o(pd_995)] {1to16} ; t20a 3402 pmulld m6, m1, [o(pd_601)] {1to16} ; t23a 3403 pmulld 
m17, m1, [o(pd_4052)] {1to16} ; t24a 3404 REPX {psubd x, m13, x}, m20, m6 3405 REPX {paddd x, m13}, m23, m0, m3, m21, m2, m17 3406 REPX {psrad x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17 3407 mova m8, m0 3408 mova m16, m23 3409 mova m7, m20 3410 mova m4, m3 3411 mova m19, m2 3412 mova m18, m21 3413 mova m5, m6 3414 mova m22, m17 3415 jmp .main3 3416.main_fast_rect2: 3417 call m(idct_8x16_internal_10bpc).round 3418.main_fast: ; bottom half is zero 3419 pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a 3420 pmulld m0, [o(pd_201)] {1to16} ; t16a 3421 pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a 3422 pmulld m7, [o(pd_3035)] {1to16} ; t30a 3423 pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a 3424 pmulld m4, [o(pd_1751)] {1to16} ; t18a 3425 pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a 3426 pmulld m3, [o(pd_3857)] {1to16} ; t28a 3427 pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a 3428 pmulld m2, [o(pd_995)] {1to16} ; t20a 3429 pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a 3430 pmulld m5, [o(pd_3513)] {1to16} ; t26a 3431 pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a 3432 pmulld m6, [o(pd_2440)] {1to16} ; t22a 3433 pmulld m22, m1, [o(pd_601)] {1to16} ; t23a 3434 pmulld m1, [o(pd_4052)] {1to16} ; t24a 3435 REPX {psubd x, m13, x}, m16, m20, m18, m22 3436 call m(idct_16x16_internal_10bpc).round3 3437 jmp .main2 3438.main_rect2: 3439 call m(idct_8x16_internal_10bpc).round 3440 call m(idct_16x16_internal_10bpc).round 3441.main: 3442 ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a 3443 ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; t17a, t30a 3444 ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a 3445 ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a 3446 ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 ; t20a, t27a 3447 ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a 3448 ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a 3449 ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a 3450 call m(idct_16x16_internal_10bpc).round 3451.main2: 3452 call 
m(idct_8x16_internal_10bpc).round 3453 psubd m8, m0, m16 ; t17 3454 paddd m0, m16 ; t16 3455 psubd m16, m23, m7 ; t30 3456 paddd m23, m7 ; t31 3457 REPX {pmaxsd x, m14}, m8, m0, m16, m23 3458 paddd m7, m20, m4 ; t19 3459 psubd m20, m4 ; t18 3460 REPX {pminsd x, m15}, m8, m0, m16, m23 3461 paddd m4, m3, m19 ; t28 3462 psubd m3, m19 ; t29 3463 REPX {pmaxsd x, m14}, m7, m20, m4, m3 3464 psubd m19, m2, m18 ; t21 3465 paddd m2, m18 ; t20 3466 REPX {pminsd x, m15}, m7, m20, m4, m3 3467 psubd m18, m21, m5 ; t26 3468 paddd m21, m5 ; t27 3469 REPX {pmaxsd x, m14}, m19, m2, m18, m21 3470 psubd m5, m22, m6 ; t22 3471 paddd m6, m22 ; t23 3472 REPX {pminsd x, m15}, m19, m2, m18, m21 3473 psubd m22, m1, m17 ; t25 3474 paddd m17, m1 ; t24 3475 REPX {pmaxsd x, m14}, m5, m6, m22, m17 3476 REPX {pminsd x, m15}, m5, m6, m22, m17 3477.main3: 3478 vpbroadcastd m11, [o(pd_4017)] 3479 vpbroadcastd m10, [o(pd_799)] 3480 ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a 3481 ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a 3482 vpbroadcastd m11, [o(pd_2276)] 3483 vpbroadcastd m10, [o(pd_3406)] 3484 ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a 3485 ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a 3486 paddd m1, m6, m2 ; t23a 3487 psubd m6, m2 ; t20a 3488 psubd m2, m17, m21 ; t27a 3489 paddd m17, m21 ; t24a 3490 REPX {pmaxsd x, m14}, m1, m6, m2, m17 3491 psubd m21, m23, m4 ; t28a 3492 paddd m23, m4 ; t31a 3493 REPX {pminsd x, m15}, m1, m6, m2, m17 3494 psubd m4, m16, m20 ; t18 3495 paddd m16, m20 ; t17 3496 REPX {pmaxsd x, m14}, m21, m23, m4, m16 3497 psubd m20, m0, m7 ; t19a 3498 paddd m0, m7 ; t16a 3499 REPX {pminsd x, m15}, m21, m23, m4, m16 3500 psubd m7, m8, m3 ; t29 3501 paddd m3, m8 ; t30 3502 REPX {pmaxsd x, m14}, m20, m0, m7, m3 3503 paddd m8, m5, m18 ; t22 3504 psubd m5, m18 ; t21 3505 REPX {pminsd x, m15}, m20, m0, m7, m3 3506 psubd m18, m22, m19 ; t26 3507 paddd m22, m19 ; t25 3508 REPX {pmaxsd x, m14}, m8, m5, m18, m22 3509 vpbroadcastd m11, 
[o(pd_3784)] ; operand completing the "vpbroadcastd m11," that ends the preceding line
    vpbroadcastd m10, [o(pd_1567)]
    REPX {pminsd x, m15}, m8, m5, m18, m22
    ; The four ITX_MULSUB_2D invocations below (macro defined outside this
    ; chunk) all receive registers 10 and 11, i.e. the pd_1567 / pd_3784
    ; broadcasts made just above.
    ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28
    ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20
    ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a
    ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
    ; Add/sub butterflies. Each group of four results is clamped with
    ; pmaxsd against m14 and pminsd against m15; the clamps are interleaved
    ; with the following butterfly.
    psubd        m19, m0, m1 ; t23
    paddd        m0, m1 ; t16
    paddd        m1, m8, m16 ; t17a
    psubd        m8, m16, m8 ; t22a
    REPX {pmaxsd x, m14}, m19, m0, m1, m8
    psubd        m16, m23, m17 ; t24
    paddd        m23, m17 ; t31
    REPX {pminsd x, m15}, m19, m0, m1, m8
    psubd        m17, m3, m22 ; t25a
    paddd        m22, m3 ; t30a
    REPX {pmaxsd x, m14}, m16, m23, m17, m22
    paddd        m3, m6, m21 ; t19a
    psubd        m6, m21, m6 ; t20a
    REPX {pminsd x, m15}, m16, m23, m17, m22
    paddd        m21, m18, m4 ; t29
    psubd        m18, m4, m18 ; t26
    REPX {pmaxsd x, m14}, m3, m6, m21, m18
    psubd        m4, m20, m2 ; t27a
    paddd        m20, m2 ; t28a
    REPX {pminsd x, m15}, m3, m6, m21, m18
    paddd        m2, m7, m5 ; t18
    psubd        m7, m5 ; t21
    REPX {pmaxsd x, m14}, m4, m20, m2, m7
    REPX {pminsd x, m15}, m4, m20, m2, m7
    ; Multiply eight terms by m12. The m13 bias is added to only one member
    ; of each (a, b) pair, so both a+b and a-b carry it exactly once before
    ; the arithmetic shift right by 12.
    REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
    REPX {paddd x, m13}, m18, m16, m4, m17
    psubd        m5, m18, m7 ; t21a
    paddd        m18, m7 ; t26a
    psubd        m7, m16, m19 ; t23a
    paddd        m16, m19 ; t24a
    REPX {psrad x, 12 }, m5, m18, m7, m16
    paddd        m19, m4, m6 ; t27
    psubd        m4, m6 ; t20
    psubd        m6, m17, m8 ; t22
    paddd        m17, m8 ; t25
    REPX {psrad x, 12 }, m19, m4, m6, m17
    ret

; inv_txfm_add_identity_identity_32x16_10bpc(dst, stride, c, eob)
;
; Set-up performed here:
;   m10 = pw_2896x8, m11 = pw_1697x16, m13 = pw_2048, m15 = pixel_10bpc_max,
;   m14 = 0 (stored over coefficients once they have been read),
;   r6  = stride*9.
; m15 and r6 are not referenced again in this function's own code; they are
; presumably inputs of the tail-called .main2 of the 16x32 identity function
; (outside this chunk) - confirm there.
;
; Flow: with eob < 151 a single .main pass runs (it ends in a tail jump, so
; its ret returns to our caller). Otherwise .main is first called, then cq is
; advanced by a further 64*12 bytes, dst is set to the original dst (kept in
; r4) plus 32 bytes, and execution falls into .main a second time.
cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
%undef cmp
    vpbroadcastd m10, [pw_2896x8]
    vpbroadcastd m11, [pw_1697x16]
    vpbroadcastd m13, [pw_2048]
    vpbroadcastd m15, [pixel_10bpc_max]
    lea          r6, [strideq*9]
    pxor         m14, m14
    cmp          eobd, 151
    jl .main
    mov          r4, dstq ; remember the original dst for the second pass
    call .main
    add          cq, 64*12
    lea          dstq, [r4+32]
.main:
    ; .main_internal leaves its results in m2, m4, m6, m8 and does not touch
    ; m1/m3/m5/m7, so the first set is multiplied by m13 (pmulhrsw) and
    ; parked in the odd registers before the second call.
    call .main_internal
    add          cq, 64*4
    pmulhrsw     m1, m13, m2
    pmulhrsw     m3, m13, m4
    pmulhrsw     m5, m13, m6
    pmulhrsw     m7, m13, m8
    call .main_internal
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
.main_internal:
    ; Load 64-byte rows 0..3 and pack each with row 8..11 (dword -> word,
    ; signed saturation).
    mova         m8, [cq+64* 0]
    packssdw     m8, [cq+64* 8]
    mova         m6, [cq+64* 1]
    packssdw     m6, [cq+64* 9]
    mova         m0, [cq+64* 2]
    packssdw     m0, [cq+64*10]
    mova         m2, [cq+64* 3]
    packssdw     m2, [cq+64*11]
    ; pmulhrsw by m10 (pw_2896x8), then double with signed saturation.
    REPX {pmulhrsw x, m10}, m8, m6, m0, m2
    REPX {paddsw x, x }, m8, m6, m0, m2
    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    ; For each register x: x = 2*x + pmulhrsw(x, m11), with m11 = pw_1697x16.
    ; The m14 stores in between clear the coefficient rows just consumed.
    pmulhrsw     m4, m11, m8
    pmulhrsw     m9, m11, m6
    paddsw       m8, m8
    paddsw       m6, m6
    REPX {mova [cq+64*x], m14}, 0, 1, 2, 3
    paddsw       m8, m4
    paddsw       m6, m9
    pmulhrsw     m4, m11, m0
    pmulhrsw     m9, m11, m2
    paddsw       m0, m0
    paddsw       m2, m2
    REPX {mova [cq+64*x], m14}, 8, 9, 10, 11
    paddsw       m0, m4
    paddsw       m2, m9
    ; Word, then dword interleave; results end up in m2, m4, m6, m8.
    punpcklwd    m4, m8, m6
    punpckhwd    m8, m6
    punpcklwd    m6, m0, m2
    punpckhwd    m0, m2
    punpckldq    m2, m4, m6 ; 0 1
    punpckhdq    m4, m6 ; 2 3
    punpckldq    m6, m8, m0 ; 4 5
    punpckhdq    m8, m0 ; 6 7
    ret

; inv_txfm_add_dct_dct_32x32_10bpc(dst, stride, c, eob)
;
; Entry and dispatch (the local routines it calls follow further down):
;   eob == 0          -> .dconly
;   eob <  136        -> .fast
;   136 <= eob < 543  -> .pass1_fast with cq biased by +64, r6d = 16*12
;   eob >= 543        -> .pass1      with cq biased by +64, r6d = 16*28
; r6d is consumed later as the down-counter of the loops that clear the
; coefficient buffer.
cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea          r5, [o_base] ; base register for the o() operands (macro defined outside this chunk)
    test         eobd, eobd
    jz .dconly
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    WIN64_SPILL_XMM 30
    cmp          eobd, 136
    jl .fast
    ; +64 makes the [cq+128*n] accesses of the pass-1 routines address the
    ; second 64-byte half of every 128-byte row.
    add          cq, 64
    cmp          eobd, 543
    jge .full
    call .pass1_fast ; bottomright 16x16 zero
    ; NOTE(review): .pass1_fast already loads r6d with 16*12 itself, so this
    ; looks redundant unless one of its callees clobbers r6 - confirm.
    mov          r6d, 16*12
    jmp .lefthalf
.full:
    call .pass1
    mov          r6d, 16*28
.lefthalf:
    ; Spill the result registers of that pass-1 call into the coefficient
    ; buffer (cq is still biased by +64 here). The store sequence continues
    ; after this point.
    mova         [cq+128* 0], m0
    mova         [cq+128* 1], m1
    mova         [cq+128* 2], m2
    mova         [cq+128* 3], m3
    mova         [cq+128* 4], m14
    mova         [cq+128* 5], m15
    mova         [cq+128* 6], m16
    mova         [cq+128* 7], m17
    mova         [cq+128* 8], m22
    mova         [cq+128* 9], m23
; (continuation of the .lefthalf register spill of inv_txfm_add_dct_dct_32x32_10bpc)
    mova         [cq+128*10], m24
    mova         [cq+128*11], m25
    mova         [cq+128*12], m26
    mova         [cq+128*13], m27
    mova         [cq+128*14], m28
    mova         [cq+128*15], m29
    sub          cq, 64 ; undo the +64 bias
    ; Reload the four constants before running .pass1 again on the unbiased cq.
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    call .pass1
    lea          r5, [o_base_8bpc] ; switch the o() base before entering the *_8bpc routines
    call .pass2_start
    pxor         m12, m12
.right_zero_loop:
    ; Clear the +64 half of four 128-byte rows per iteration, walking r6
    ; down to 0 in steps of 16*4 (r6*8 is the byte offset of the row group).
    mova         [cq+r6*8+64+128*3], m12
    mova         [cq+r6*8+64+128*2], m12
    mova         [cq+r6*8+64+128*1], m12
    mova         [cq+r6*8+64+128*0], m12
    sub          r6d, 16*4
    jge .right_zero_loop
    mov          r6d, 16*28 ; counter for .zero_loop, which then covers the other half
    jmp .end2 ; m12 is already zero, so skip the pxor at .end
.pass2_start:
    ; Fetch the values spilled at .lefthalf (rows 0..7 of the +64 half).
    mova         m4, [cq+64+128* 0]
    mova         m5, [cq+64+128* 1]
    mova         m6, [cq+64+128* 2]
    mova         m7, [cq+64+128* 3]
    mova         m18, [cq+64+128* 4]
    mova         m19, [cq+64+128* 5]
    mova         m20, [cq+64+128* 6]
    mova         m21, [cq+64+128* 7]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    ; Park m14..m21 in rows 0..7; .pass2_end reads them back as m0..m7.
    mova         [cq+128*0], m14
    mova         [cq+128*1], m15
    mova         [cq+128*2], m16
    mova         [cq+128*3], m17
    mova         [cq+128*4], m18
    mova         [cq+128*5], m19
    mova         [cq+128*6], m20
    mova         [cq+128*7], m21
    mova         m14, [cq+64+128* 8]
    mova         m15, [cq+64+128* 9]
    mova         m16, [cq+64+128*10]
    mova         m17, [cq+64+128*11]
    mova         m18, [cq+64+128*12]
    mova         m19, [cq+64+128*13]
    mova         m20, [cq+64+128*14]
    mova         m21, [cq+64+128*15]
    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf ; tail call; its ret returns to our caller
.fast: ; topleft 16x16 nonzero
    cmp          eobd, 36
    jl .fast2
    call .pass1_fast
    lea          r5, [o_base_8bpc]
    call .pass2_fast_start
    jmp .end
.pass2_fast_start:
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    ; Same parking of m14..m21 in rows 0..7 as in .pass2_start.
    mova         [cq+128*0], m14
    mova         [cq+128*1], m15
    mova         [cq+128*2], m16
    mova         [cq+128*3], m17
    mova         [cq+128*4], m18
    mova         [cq+128*5], m19
    mova         [cq+128*6], m20
    mova         [cq+128*7], m21
    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
.fast2: ; topleft 8x8 nonzero
    ; Only 32 bytes (ymm) of rows 0..7 are read on this path.
    movshdup     m7, [o(permB)]
    mova         ym0, [cq+128*0]
    mova         ym1, [cq+128*4]
    mova         ym4, [cq+128*2]
    mova         ym5, [cq+128*6]
    mova         ym16, [cq+128*1]
    mova         ym2, [cq+128*5]
    mova         ym3, [cq+128*3]
    mova         ym17, [cq+128*7]
    mov          r6d, 16*4 ; .zero_loop counter: only rows 0..7 get cleared
    vpermq       m0, m7, m0 ; 0 0
    vpermq       m1, m7, m1 ; 4 4
    vpermt2q     m4, m7, m5 ; 2 6
    vpermt2q     m16, m7, m2 ; 1 5
    vpermt2q     m17, m7, m3 ; 7 3
    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
    call m(idct_16x16_internal_10bpc).main_end
    call .pass2_fast2_start
.end:
    pxor         m12, m12
.end2:
    call .pass2_end
.zero_loop:
    ; Clear the first 64 bytes of four 128-byte rows per iteration, from the
    ; row group selected by r6 down to row 0. This runs after .pass2_end
    ; because that routine still reads rows 0..7.
    mova         [cq+r6*8+128*3], m12
    mova         [cq+r6*8+128*2], m12
    mova         [cq+r6*8+128*1], m12
    mova         [cq+r6*8+128*0], m12
    sub          r6d, 16*4
    jge .zero_loop
    ; Explicit epilogue (a plain ret rather than the RET macro), matching
    ; the WIN64_SPILL_XMM at function entry.
    WIN64_RESTORE_XMM
    vzeroupper
    ret
.pass2_fast2_start:
    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
    lea          r5, [o_base_8bpc]
    punpckhqdq   m22, m0, m2 ; 1
    punpcklqdq   m0, m2 ; 0
    punpcklqdq   m1, m5, m7 ; 4
    punpckhqdq   m24, m5, m7 ; 5
    punpcklqdq   m14, m3, m4 ; 2
    punpckhqdq   m23, m3, m4 ; 3
    punpcklqdq   m15, m6, m8 ; 6
    punpckhqdq   m25, m6, m8 ; 7
    mova         m10, m13 ; presumably the 8bpc routine expects this constant in m10 - confirm
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    mova         [cq+128*0], m14
    mova         [cq+128*1], m15
    mova         [cq+128*2], m16
    mova         [cq+128*3], m17
    mova         [cq+128*4], m18
    mova         [cq+128*5], m19
    mova         [cq+128*6], m20
    mova         [cq+128*7], m21
    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
.pass2_end:
    ; Final butterflies with signed-saturating word add/sub: m0..m7 against
    ; m29..m22 first, then the values parked in rows 0..7 against m21..m14.
    ; Sums stay in m0..m7; differences go to m9 and back into m29..m14.
    psubsw       m9, m0, m29 ; out31
    paddsw       m0, m29 ; out0
    psubsw       m29, m1, m28 ; out30
    paddsw       m1, m28 ; out1
    psubsw       m28, m2, m27 ; out29
    paddsw       m2, m27 ; out2
    psubsw       m27, m3, m26 ; out28
    paddsw       m3, m26 ; out3
    psubsw       m26, m4, m25 ; out27
    paddsw       m4, m25 ; out4
    psubsw       m25, m5, m24 ; out26
    paddsw       m5, m24 ; out5
    psubsw       m24, m6, m23 ; out25
    paddsw       m6, m23 ; out6
    psubsw       m23, m7, m22 ; out24
    paddsw       m7, m22 ; out7
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
    mova         m0, [cq+128*0]
    mova         m1, [cq+128*1]
    mova         m2, [cq+128*2]
    mova         m3, [cq+128*3]
    mova         m4, [cq+128*4]
    mova         m5, [cq+128*5]
    mova         m6, [cq+128*6]
    mova         m7, [cq+128*7]
    psubsw       m22, m0, m21 ; out23
    paddsw       m0, m21 ; out8
    psubsw       m21, m1, m20 ; out22
    paddsw       m1, m20 ; out9
    psubsw       m20, m2, m19 ; out21
    paddsw       m2, m19 ; out10
    psubsw       m19, m3, m18 ; out20
    paddsw       m3, m18 ; out11
    psubsw       m18, m4, m17 ; out19
    paddsw       m4, m17 ; out12
    psubsw       m17, m5, m16 ; out18
    paddsw       m5, m16 ; out13
    psubsw       m16, m6, m15 ; out17
    paddsw       m6, m15 ; out14
    psubsw       m15, m7, m14 ; out16
    paddsw       m7, m14 ; out15
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
    ; The remaining outputs (m15..m29, m9) are scaled by m11 four at a time
    ; and passed to write_32x4 in m0..m3. m11 is not loaded in this routine;
    ; presumably one of the write_32x8* routines sets it - confirm.
    pmulhrsw     m0, m11, m15
    pmulhrsw     m1, m11, m16
    pmulhrsw     m2, m11, m17
    pmulhrsw     m3, m11, m18
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    pmulhrsw     m0, m11, m19
    pmulhrsw     m1, m11, m20
    pmulhrsw     m2, m11, m21
    pmulhrsw     m3, m11, m22
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    pmulhrsw     m0, m11, m23
    pmulhrsw     m1, m11, m24
    pmulhrsw     m2, m11, m25
    pmulhrsw     m3, m11, m26
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    pmulhrsw     m0, m11, m27
    pmulhrsw     m1, m11, m28
    pmulhrsw     m2, m11, m29
    pmulhrsw     m3, m11, m9
    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
.dconly:
    ; Reached only with eob == 0, so the eobd store zeroes the DC
    ; coefficient and "or r3d, 32" leaves r3d = 32 (eob is r3). r3d is the
    ; counter of the shared .dconly_loop, which subtracts 2 per iteration.
    imul         r6d, [cq], 181
    mov          [cq], eobd
    or           r3d, 32
    add          r6d, 640
    sar          r6d, 10 ; r6d = (dc*181 + 640) >> 10
    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
.pass1_fast:
    ; Reduced first pass: only rows 0..15 of the current 64-byte column are
    ; loaded (even rows here, odd rows in .pass1_load_spill).
    mova         m0, [cq+128* 0]
    mova         m1, [cq+128* 4]
    mova         m2, [cq+128* 8]
    mova         m3, [cq+128*12]
    mov          r6d, 16*12
    call m(idct_8x16_internal_10bpc).main_fast
    mova         m16, [cq+128* 2]
    mova         m17, [cq+128* 6]
    mova         m18, [cq+128*10]
    mova         m19, [cq+128*14]
    call m(idct_16x16_internal_10bpc).main_fast
    call .pass1_load_spill
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
    jmp .pass1_end
.pass1:
    ; Full first pass over one 64-byte column: rows 0,4,..,28, then rows
    ; 2,6,..,30, then the odd rows (1..15 via .pass1_load_spill, 17..31 here).
    mova         m0, [cq+128* 0]
    mova         m1, [cq+128* 4]
    mova         m2, [cq+128* 8]
    mova         m3, [cq+128*12]
    mova         m4, [cq+128*16]
    mova         m5, [cq+128*20]
    mova         m6, [cq+128*24]
    mova         m7, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main
    mova         m16, [cq+128* 2]
    mova         m17, [cq+128* 6]
    mova         m18, [cq+128*10]
    mova         m19, [cq+128*14]
    mova         m20, [cq+128*18]
    mova         m21, [cq+128*22]
    mova         m22, [cq+128*26]
    mova         m23, [cq+128*30]
    call m(idct_16x16_internal_10bpc).main
    call .pass1_load_spill
    mova         m16, [cq+128*17]
    mova         m17, [cq+128*19]
    mova         m18, [cq+128*21]
    mova         m19, [cq+128*23]
    mova         m20, [cq+128*25]
    mova         m21, [cq+128*27]
    mova         m22, [cq+128*29]
    mova         m23, [cq+128*31]
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main
.pass1_end:
    ; idct32_pass1_end (in the 32x16 function) uses m11 both as the value it
    ; adds and as the vpsravd shift count, and reads the rows spilled by
    ; .pass1_load_spill through cq (rows 0..7) and r4 (rows 8..15).
    vpbroadcastd m11, [o(pd_2)]
    lea          r4, [cq+128*8]
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
    punpckhqdq   m22, m0, m20 ; 1
    punpcklqdq   m0, m20 ; 0
    punpckhqdq   m24, m2, m1 ; 5
    punpcklqdq   m1, m2, m1 ; 4
    punpcklqdq   m2, m14, m18 ; 8
    punpckhqdq   m26, m14, m18 ; 9
    punpcklqdq   m14, m15, m4 ; 2
    punpckhqdq   m23, m15, m4 ; 3
    punpckhqdq   m25, m3, m21 ; 7
    punpcklqdq   m15, m3, m21 ; 6
    punpckhqdq   m28, m6, m17 ; 13
    punpcklqdq   m3, m6, m17 ; 12
    punpckhqdq   m27, m5, m16 ; 11
    punpcklqdq   m16, m5, m16 ; 10
    punpckhqdq   m29, m7, m8 ; 15
    punpcklqdq   m17, m7, m8 ; 14
    ret
.pass1_load_spill:
    ; After idct16_sumsub, swap register contents with the buffer: m0..m7
    ; are written to rows 0..7 while the odd rows are fetched into m0..m7.
    ; The order matters - every odd row is loaded before the store that
    ; overwrites it.
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    mova         [cq+128* 0], m0
    mova         m0, [cq+128* 1]
    mova         [cq+128* 1], m1
    mova         [cq+128* 2], m2
    mova         m1, [cq+128* 3]
    mova         m2, [cq+128* 5]
    mova         [cq+128* 3], m3
    mova         [cq+128* 4], m4
    mova         m3,
[cq+128* 7] ; operand completing the "mova m3," that ends the preceding line
    ; Remainder of the 32x32 .pass1_load_spill: keep fetching odd rows into
    ; m4..m7 before those rows are overwritten, then park m23..m16 in rows
    ; 8..15.
    mova         m4, [cq+128* 9]
    mova         [cq+128* 5], m5
    mova         [cq+128* 6], m6
    mova         [cq+128* 7], m7
    mova         m5, [cq+128*11]
    mova         m6, [cq+128*13]
    mova         m7, [cq+128*15]
    mova         [cq+128* 8], m23
    mova         [cq+128* 9], m22
    mova         [cq+128*10], m21
    mova         [cq+128*11], m20
    mova         [cq+128*12], m19
    mova         [cq+128*13], m18
    mova         [cq+128*14], m17
    mova         [cq+128*15], m16
    ret

; inv_txfm_add_identity_identity_32x32_10bpc(dst, stride, c, eob)
;
; Set-up: m13 = pw_8192, m15 = pixel_10bpc_max, m14 = 0, r6 = stride*9.
; m15 and r6 are not used again in this function's own code; presumably they
; are inputs of the tail-called .main2 of the 16x32 identity function
; (outside this chunk) - confirm there.
;
; Number of .main passes, selected by eob:
;   eob <  136        : 1 pass
;   136 <= eob < 543  : 3 passes
;   eob >= 543        : 4 passes
; Between passes cq is adjusted by the constants shown below, and dst is
; either advanced by 8*stride or reset to the original dst (saved in r4)
; plus 32 bytes.
cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
%undef cmp
    vpbroadcastd m13, [pw_8192]
    vpbroadcastd m15, [pixel_10bpc_max]
    pxor         m14, m14
    lea          r6, [strideq*9]
    cmp          eobd, 136
    jl .main
    mov          r4, dstq
    call .main
    add          cq, 64-128*4
    lea          dstq, [dstq+strideq*8]
    call .main
    add          cq, 128*12-64
    lea          dstq, [r4+32]
    cmp          eobd, 543
    jl .main
    call .main
    add          cq, 64-128*4
    lea          dstq, [dstq+strideq*8]
.main:
    ; .main_internal returns its results in m2, m4, m6, m8 and leaves
    ; m1/m3/m5/m7 alone, so the first set is multiplied by m13 (pmulhrsw)
    ; and parked in the odd registers before the second call.
    call .main_internal
    add          cq, 128*4
    pmulhrsw     m1, m13, m2
    pmulhrsw     m3, m13, m4
    pmulhrsw     m5, m13, m6
    pmulhrsw     m7, m13, m8
    call .main_internal
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
.main_internal:
    ; Load the first 64 bytes of rows 0..3 and pack each with row 8..11
    ; (dword -> word, signed saturation). No scaling is applied here.
    mova         m8, [cq+128* 0]
    packssdw     m8, [cq+128* 8]
    mova         m6, [cq+128* 1]
    packssdw     m6, [cq+128* 9]
    mova         m0, [cq+128* 2]
    packssdw     m0, [cq+128*10]
    mova         m2, [cq+128* 3]
    packssdw     m2, [cq+128*11]
    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 ; clear the rows just consumed
    punpcklwd    m4, m8, m6
    punpckhwd    m8, m6
    punpcklwd    m6, m0, m2
    punpckhwd    m0, m2
    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
    punpckldq    m2, m4, m6 ; 0 1
    punpckhdq    m4, m6 ; 2 3
    punpckldq    m6, m8, m0 ; 4 5
    punpckhdq    m8, m0 ; 6 7
    ret

; inv_txfm_add_dct_dct_16x64_10bpc(dst, stride, c, eob)
;
; The function is first declared with 0 vector registers so that the
; eob == 0 case can branch to .dconly straight away. Every other path goes
; through the PROLOGUE below, which requests 32 vector registers and
; 8*mmsize bytes of stack (used later as [rsp+mmsize*0..7]).
cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
    lea          r5, [o_base]
    test         eobd, eobd
    jz .dconly

    PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
%undef cmp
; (body of inv_txfm_add_dct_dct_16x64_10bpc, directly after its PROLOGUE)
; Dispatch on eob:
;   eob <  36         -> .fast  (reads only 32 bytes of rows 0..7)
;   36 <= eob < 151   -> one .pass1 call, then the fall-through path below
;   eob >= 151        -> .full  (.pass1 on both 64-byte columns)
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    cmp          eobd, 36
    jl .fast
    call .pass1
    cmp          eobd, 151
    jge .full
    lea          r5, [o_base_8bpc] ; o() base for the *_8bpc routines called below

    ; Distribute the eight pass-1 result registers over the inputs of the
    ; 8bpc second-pass routines by duplicating words (punpck?wd x, x). m9 is
    ; built from a zero register and m0 instead, which puts each m0 word in
    ; the upper half of a dword with zero below it.
    punpckhwd    m22, m0, m0
    punpckhwd    m23, m1, m1
    punpckhwd    m24, m2, m2
    punpckhwd    m25, m3, m3
    punpckhwd    m26, m4, m4
    punpckhwd    m27, m5, m5
    punpckhwd    m28, m6, m6
    punpckhwd    m29, m7, m7
    punpcklwd    m21, m1, m1
    punpcklwd    m14, m3, m3
    punpcklwd    m18, m5, m5
    punpcklwd    m15, m7, m7
    pxor         m9, m9
    punpcklwd    m9, m9, m0
    punpcklwd    m8, m2, m2
    punpcklwd    m7, m4, m4
    punpcklwd    m1, m6, m6
    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    ; Park m14..m21 on the stack; .pass2_end reloads them as m1..m8.
    mova         [rsp+mmsize*0], m14
    mova         [rsp+mmsize*1], m15
    mova         [rsp+mmsize*2], m16
    mova         [rsp+mmsize*3], m17
    mova         [rsp+mmsize*4], m18
    mova         [rsp+mmsize*5], m19
    mova         [rsp+mmsize*6], m20
    mova         [rsp+mmsize*7], m21
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast

    ; Clear the first 64 bytes of rows 0..15 (128-byte pitch), four rows per
    ; iteration. r3 (eob) is no longer needed and serves as the counter.
    pxor         m12, m12
    mov          r3d, 64*3
.zero_loop:
    REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
    sub          r3d, 64
    jge .zero_loop

    jmp .pass2_end
.full:
    ; Save the first column's pass-1 results in rows 0..7, run .pass1 on
    ; the second 64-byte column, then read the saved values back.
    mova         [cq+128*0], m0
    mova         [cq+128*1], m1
    mova         [cq+128*2], m2
    mova         [cq+128*3], m3
    mova         [cq+128*4], m4
    mova         [cq+128*5], m5
    mova         [cq+128*6], m6
    mova         [cq+128*7], m7
    add          cq, 64
    call .pass1
    sub          cq, 64
    ; These loads have to precede the stores to [cq+64*8..15] below:
    ; 64*8 == 128*4, so that range overlaps rows 4..7.
    mova         m22, [cq+128*0] ; 0 1
    mova         m23, [cq+128*1] ; 2 3
    mova         m24, [cq+128*2] ; 4 5
    mova         m25, [cq+128*3] ; 6 7
    mova         m26, [cq+128*4] ; 8 9
    mova         m27, [cq+128*5] ; 10 11
    mova         m28, [cq+128*6] ; 12 13
    mova         m29, [cq+128*7] ; 14 15
    mova         [cq+64* 8], m0
    mova         [cq+64* 9], m1
    mova         [cq+64*10], m2
    mova         [cq+64*11], m3
    mova         [cq+64*12], m4
    mova         [cq+64*13], m5
    mova         [cq+64*14], m6
    mova         [cq+64*15], m7
    lea          r5, [o_base_8bpc]

    punpcklwd    m20, m1, m1
    punpcklwd    m16, m3, m3
    punpcklwd    m19, m5, m5
    punpcklwd    m17, m7, m7
    punpcklwd    m8, m24, m24 ; 4
    punpcklwd    m5, m2, m2 ; 20
    punpcklwd    m1, m28, m28 ; 12
    punpcklwd    m7, m26, m26 ; 8
    punpcklwd    m3, m4, m4 ; 24
    punpcklwd    m4, m6, m6 ; 28
    pxor         m9, m9
    punpcklwd    m6, m9, m0 ; __ 16
    mova         m0, m4
    punpcklwd    m9, m9, m22 ; __ 0
    call m(idct_16x16_internal_8bpc).main_fast
    punpcklwd    m21, m23, m23 ; 2
    punpcklwd    m15, m29, m29 ; 14
    punpcklwd    m18, m27, m27 ; 10
    punpcklwd    m14, m25, m25 ; 6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    mova         [rsp+mmsize*0], m14
    mova         [rsp+mmsize*1], m15
    mova         [rsp+mmsize*2], m16
    mova         [rsp+mmsize*3], m17
    mova         [rsp+mmsize*4], m18
    mova         [rsp+mmsize*5], m19
    mova         [rsp+mmsize*6], m20
    mova         [rsp+mmsize*7], m21
    ; Bring back the second column's results saved at [cq+64*8..15]; the
    ; REPX below then keeps the duplicated high words of all sixteen inputs.
    mova         m21, [cq+64*15]
    mova         m14, [cq+64* 8]
    mova         m17, [cq+64*11]
    mova         m18, [cq+64*12]
    mova         m19, [cq+64*13]
    mova         m16, [cq+64*10]
    mova         m15, [cq+64* 9]
    mova         m20, [cq+64*14]
    REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
                           m24, m19, m16, m27, m28, m15, m20, m23
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf

    ; Clear 32 consecutive 64-byte units starting at cq, four per iteration.
    pxor         m12, m12
    mov          r3d, 32*7
.full_zero_loop:
    REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
    sub          r3d, 32
    jge .full_zero_loop

    jmp .pass2_end
.fast:
    ; Smallest case: 32 bytes (ymm) of rows 0..7 are loaded and merged
    ; pairwise with the qword permute in m8.
    mova         ym0, [cq+128*0]
    mova         ym2, [cq+128*4]
    movshdup     m8, [o(permB)]
    mova         ym1, [cq+128*2]
    mova         ym3, [cq+128*6]
    mova         ym4, [cq+128*1]
    mova         ym5, [cq+128*3]
    mova         ym6, [cq+128*5]
    mova         ym7, [cq+128*7]
    vpermt2q     m0, m8, m2 ; 0 4
    vpermt2q     m1, m8, m3 ; 2 6
    vpermt2q     m4, m8, m5 ; 1 3
    vpermt2q     m7, m8, m6 ; 7 5
    call m(idct_8x8_internal_10bpc).main_fast
    call m(idct_16x8_internal_10bpc).main_fast
    vpbroadcastd m11, [o(pd_2)]
    call m(idct_8x16_internal_10bpc).main_end2
    mova         m8, [o(idct8x32p)]
    packssdw     m0, m4
    packssdw     m1, m5
    packssdw     m2, m6
    packssdw     m3, m7
    mova         m6, [dup16_perm]
    vpermb       m0, m8, m0
    vpermb       m2, m8, m2
    vprold       m8, 16 ; rotate the byte-permute indices within each dword for m1/m3
    vpermb       m1, m8, m1
    vpermb       m3, m8, m3
    punpckldq    m4, m0, m2
    punpckhdq    m0, m2
    punpckldq    m2, m1, m3
    punpckhdq    m1, m3
    punpckldq    m21, m4, m2
    punpckhdq    m14, m4, m2
    punpckldq    m18, m0, m1
    punpckhdq    m15, m0, m1
    vpord        m7, m6, [o(pb_32)] {1to16} ; m7 = m6 with bit 5 set in every index byte
    vpermb       m22, m7, m21 ; 1
    pmovzxwd     m9, ym21 ; 0
    vpermb       m8, m6, m18 ; 4
    vpermb       m24, m7, m18 ; 5
    vpermb       m21, m6, m14 ; 2
    vpermb       m23, m7, m14 ; 3
    vpermb       m14, m6, m15 ; 6
    vpermb       m25, m7, m15 ; 7
    lea          r5, [o_base_8bpc]
    pslld        m9, 16 ; words into the upper half of each dword (same layout as the pxor/punpcklwd pair above)

    ; Zero the inputs that have no data on this path.
    pxor         m7, m7
    REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29

    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    mova         [rsp+mmsize*0], m14
    mova         [rsp+mmsize*1], m15
    mova         [rsp+mmsize*2], m16
    mova         [rsp+mmsize*3], m17
    mova         [rsp+mmsize*4], m18
    mova         [rsp+mmsize*5], m19
    mova         [rsp+mmsize*6], m20
    mova         [rsp+mmsize*7], m21

    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast

    ; Clear exactly the 32-byte pieces that were loaded at .fast.
    pxor         m12, m12
    REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
.pass2_end:
    ; Output stage shared by all three paths. Register pairs are permuted
    ; with m30/m31 into m8/m9 and handed to write_16x4 of the 16x8 function
    ; (outside this chunk; it apparently takes its input in m8/m9 and uses
    ; m11, m13 and r6 as set here - confirm). There are 16 such calls.
    movshdup     m30, [permC]
    vpbroadcastd m11, [pw_2048]
    vpbroadcastd m13, [pixel_10bpc_max]
    lea          r6, [strideq*3]
    psrlq        m31, m30, 8 ; second index set: the next byte of every qword of m30
    vpermq       m8, m30, m0
    vpermq       m9, m31, m1
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m2
    vpermq       m9, m31, m3
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m4
    vpermq       m9, m31, m5
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m6
    vpermq       m9, m31, m7
    call m(idct_16x8_internal_10bpc).write_16x4

    ; Reload the eight registers parked on the stack and butterfly them
    ; with m21..m14: sums go to m0..m7, differences replace m21..m14.
    mova         m1, [rsp+mmsize*0]
    mova         m2, [rsp+mmsize*1]
    mova         m3, [rsp+mmsize*2]
    mova         m4, [rsp+mmsize*3]
    mova         m5, [rsp+mmsize*4]
    mova         m6, [rsp+mmsize*5]
    mova         m7, [rsp+mmsize*6]
    mova         m8, [rsp+mmsize*7]

    paddsw       m0, m1, m21
    psubsw       m21, m1, m21
    paddsw       m1, m2, m20
    psubsw       m20, m2, m20
    paddsw       m2, m3, m19
    psubsw       m19, m3, m19
    paddsw       m3, m4, m18
    psubsw       m18, m4, m18
    paddsw       m4, m5, m17
    psubsw       m17, m5, m17
    paddsw       m5, m6, m16
    psubsw       m16, m6, m16
    paddsw       m6, m7, m15
    psubsw       m15, m7, m15
    paddsw       m7, m8, m14
    psubsw       m14, m8, m14

    vpermq       m8, m30, m0
    vpermq       m9, m31, m1
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m2
    vpermq       m9, m31, m3
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m4
    vpermq       m9, m31, m5
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m6
    vpermq       m9, m31, m7
    call m(idct_16x8_internal_10bpc).write_16x4

    vpermq       m8, m30, m14
    vpermq       m9, m31, m15
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m16
    vpermq       m9, m31, m17
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m18
    vpermq       m9, m31, m19
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m20
    vpermq       m9, m31, m21
    call m(idct_16x8_internal_10bpc).write_16x4

    vpermq       m8, m30, m22
    vpermq       m9, m31, m23
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m24
    vpermq       m9, m31, m25
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m26
    vpermq       m9, m31, m27
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq       m8, m30, m28
    vpermq       m9, m31, m29
    call m(idct_16x8_internal_10bpc).write_16x4
    RET
.pass1:
    ; First pass over one 64-byte column: even rows 0..14 into m0..m7, odd
    ; rows 1..15 into m16..m23, each followed by the matching 10bpc main
    ; routine. Ends with a tail jump into idct_16x16_internal_10bpc.
    mova         m0, [cq+128* 0]
    mova         m1, [cq+128* 2]
    mova         m2, [cq+128* 4]
    mova         m3, [cq+128* 6]
    mova         m4, [cq+128* 8]
    mova         m5, [cq+128*10]
    mova         m6, [cq+128*12]
    mova         m7, [cq+128*14]
    call m(idct_8x16_internal_10bpc).main
    mova         m16, [cq+128* 1]
    mova         m17, [cq+128* 3]
    mova         m18, [cq+128* 5]
    mova         m19, [cq+128* 7]
    mova         m20, [cq+128* 9]
    mova         m21, [cq+128*11]
    mova         m22, [cq+128*13]
    mova         m23, [cq+128*15]
    call m(idct_16x16_internal_10bpc).main
    call m(idct_16x16_internal_10bpc).main_end
    jmp m(idct_16x16_internal_10bpc).main_end3
.dconly:
    ; Reached only with eob == 0: the eobd store zeroes the DC coefficient
    ; and "or r3d, 64" leaves r3d = 64 for the shared DC-only code in the
    ; 16x8 function (outside this chunk).
    imul         r6d, [cq], 181
    mov          [cq], eobd
    or           r3d, 64
    add          r6d, 640
    sar          r6d, 10 ; r6d = (dc*181 + 640) >> 10
    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2

; inv_txfm_add_dct_dct_32x64_10bpc(dst, stride, c, eob)
;
; As in the 16x64 function, the eob == 0 test comes before the real
; PROLOGUE, which requests 32 vector registers and 64*40 bytes of stack.
; Dispatch on eob:
;   eob <  136        -> .fast
;   136 <= eob < 543  -> .pass1_fast on the +64 column
;   eob >= 543        -> .pass1      on the +64 column, r3d = 16*28
; r3d (eob is dead by then) is the counter of the coefficient-clearing
; loops; .pass1_fast loads it itself.
cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
    lea          r5, [o_base]
    test         eobd, eobd
    jz .dconly
    PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    cmp          eobd, 136
    jl .fast
    add          cq, 64
    cmp          eobd, 543
    jge .full
    call .pass1_fast ; bottomright 16x16 zero
    jmp .lefthalf
.full:
    call .pass1
    mov          r3d, 16*28
.lefthalf:
    ; Spill the sixteen result registers of that pass-1 call into rows 0..15
    ; of the buffer (cq is still biased by +64), then rerun .pass1 unbiased.
    mova         [cq+128* 0], m27
    mova         [cq+128* 1], m14
    mova         [cq+128* 2], m28
    mova         [cq+128* 3], m15
    mova         [cq+128* 4], m22
    mova         [cq+128* 5], m23
    mova         [cq+128* 6], m24
    mova         [cq+128* 7], m25
    mova         [cq+128* 8], m0
    mova         [cq+128* 9], m26
    mova         [cq+128*10], m20
    mova         [cq+128*11], m21
    mova         [cq+128*12], m18
    mova         [cq+128*13], m16
    mova         [cq+128*14], m17
    mova         [cq+128*15], m3
    sub          cq, 64
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    call .pass1
    call .pass2_start

    ; Clear the +64 half of the rows selected by r3, four rows per
    ; iteration, then hand over to .left_zero_loop with r3d = 16*28.
    pxor         m31, m31
.right_zero_loop:
    REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3
    sub          r3d, 16*4
    jge .right_zero_loop
    mov          r3d, 16*28
    jmp .left_zero_loop
4349.pass2_start: 4350 vpbroadcastd m10, [o(pd_2048)] 4351 lea r5, [o_base_8bpc] 4352 4353 lea r4, [rsp+gprsize] 4354 mova m1, [cq+128*15+64] 4355 mova m2, [cq+128* 8+64] 4356 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 4357 mova m0, m21 4358 mova m1, [cq+128*12+64] 4359 mova m2, [cq+128*11+64] 4360 mova m3, m18 4361 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 4362 mova m0, m20 4363 mova m1, [cq+128*13+64] 4364 mova m2, [cq+128*10+64] 4365 mova m3, m16 4366 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 4367 mova m0, m26 4368 mova m1, [cq+128*14+64] 4369 mova m2, [cq+128* 9+64] 4370 mova m3, m17 4371 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 4372 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 4373 4374 mova m0, m27 4375 mova m1, m28 4376 mova m2, [cq+128* 0+64] 4377 mova m3, [cq+128* 2+64] 4378 mova m16, [cq+128* 1+64] 4379 mova m17, [cq+128* 3+64] 4380 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 4381 mova m26, [cq+128* 4+64] 4382 mova m27, [cq+128* 5+64] 4383 mova m28, [cq+128* 6+64] 4384 mova m29, [cq+128* 7+64] 4385 mova [rsp+64*32+gprsize], m14 4386 mova [rsp+64*33+gprsize], m15 4387 mova [rsp+64*34+gprsize], m16 4388 mova [rsp+64*35+gprsize], m17 4389 mova [rsp+64*36+gprsize], m18 4390 mova [rsp+64*37+gprsize], m19 4391 mova [rsp+64*38+gprsize], m20 4392 mova [rsp+64*39+gprsize], m21 4393 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 4394.fast: ; topleft 16x16 nonzero 4395 cmp eobd, 36 4396 jl .fast2 4397 call .pass1_fast 4398 vpbroadcastd m10, [o(pd_2048)] 4399 call .pass2_fast_start 4400 jmp .end 4401.fast2: ; topleft 8x8 nonzero 4402 movshdup m7, [o(permB)] 4403 mova ym0, [cq+128*0] 4404 mova ym1, [cq+128*4] 4405 mova ym4, [cq+128*2] 4406 mova ym5, [cq+128*6] 4407 mova ym16, [cq+128*1] 4408 mova ym2, [cq+128*5] 4409 mova ym3, [cq+128*3] 4410 mova ym17, [cq+128*7] 4411 mov r3d, 16*4 4412 vpermq m0, m7, m0 ; 0 0 4413 vpermq m1, m7, m1 ; 4 4 4414 vpermt2q m4, m7, m5 ; 2 6 4415 vpermt2q m16, m7, m2 ; 1 5 
4416 vpermt2q m17, m7, m3 ; 7 3 4417 REPX {pmulld x, m12}, m0, m1, m4, m16, m17 4418 REPX {paddd x, m13}, m0, m1, m4, m16, m17 4419 REPX {psrad x, 12 }, m0, m1, m4, m16, m17 4420 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 4421 vpbroadcastd m11, [o(pd_1)] 4422 call m(idct_16x16_internal_10bpc).main_end2 4423 4424 call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 4425 punpcklqdq m27, m0, m2 ; 0 4426 punpckhqdq m0, m2 ; 1 4427 punpcklqdq m22, m3, m4 ; 2 4428 punpckhqdq m26, m3, m4 ; 3 4429 punpcklqdq m14, m5, m7 ; 4 4430 punpckhqdq m20, m5, m7 ; 5 4431 punpcklqdq m23, m6, m8 ; 6 4432 punpckhqdq m21, m6, m8 ; 7 4433 4434 mova m10, m13 4435 call .pass2_fast2_start 4436.end: 4437 4438 pxor m31, m31 4439 4440.left_zero_loop: 4441 REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3 4442 sub r3d, 16*4 4443 jge .left_zero_loop 4444 4445 call .pass2_end 4446 RET 4447.pass2_end: 4448 DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi 4449 vpbroadcastd m30, [pixel_10bpc_max] 4450 vpbroadcastd m13, [pw_2048] 4451 4452 mov stride32q, strideq 4453 shl stride32q, 5 4454 lea stkhiq, [rsp+31*mmsize+gprsize] 4455 lea dst2q, [dstq+stride32q] 4456 lea stkloq, [rsp+gprsize] 4457 sub dst2q, strideq ; dst31 4458 4459 paddsw m8, m0, m29 ; t0[idct32] 4460 psubsw m9, m0, m29 ; t31[idct32] 4461 call .end_sumsub_write 4462 paddsw m8, m1, m28 ; t1[idct32] 4463 psubsw m9, m1, m28 ; t30[idct32] 4464 call .end_sumsub_write 4465 paddsw m8, m2, m27 ; t2[idct32] 4466 psubsw m9, m2, m27 ; t29[idct32] 4467 call .end_sumsub_write 4468 paddsw m8, m3, m26 ; t3[idct32] 4469 psubsw m9, m3, m26 ; t28[idct32] 4470 call .end_sumsub_write 4471 paddsw m8, m4, m25 ; t4[idct32] 4472 psubsw m9, m4, m25 ; t27[idct32] 4473 call .end_sumsub_write 4474 paddsw m8, m5, m24 ; t5[idct32] 4475 psubsw m9, m5, m24 ; t26[idct32] 4476 call .end_sumsub_write 4477 paddsw m8, m6, m23 ; t6[idct32] 4478 psubsw m9, m6, m23 ; t25[idct32] 4479 call .end_sumsub_write 4480 paddsw m8, m7, m22 ; t7[idct32] 4481 psubsw m9, m7, 
m22 ; t24[idct32] 4482 call .end_sumsub_write 4483 mova m0, [rsp+64*32+gprsize] ; reload the eight vectors the .pass2*_start routines spilled to this slot range 4484 mova m1, [rsp+64*33+gprsize] 4485 mova m2, [rsp+64*34+gprsize] 4486 mova m3, [rsp+64*35+gprsize] 4487 mova m4, [rsp+64*36+gprsize] 4488 mova m5, [rsp+64*37+gprsize] 4489 mova m6, [rsp+64*38+gprsize] 4490 mova m7, [rsp+64*39+gprsize] 4491 paddsw m8, m0, m21 ; t8[idct32] 4492 psubsw m9, m0, m21 ; t23[idct32] 4493 call .end_sumsub_write 4494 paddsw m8, m1, m20 ; t9[idct32] 4495 psubsw m9, m1, m20 ; t22[idct32] 4496 call .end_sumsub_write 4497 paddsw m8, m2, m19 ; t10[idct32] 4498 psubsw m9, m2, m19 ; t21[idct32] 4499 call .end_sumsub_write 4500 paddsw m8, m3, m18 ; t11[idct32] 4501 psubsw m9, m3, m18 ; t20[idct32] 4502 call .end_sumsub_write 4503 paddsw m8, m4, m17 ; t12[idct32] 4504 psubsw m9, m4, m17 ; t19[idct32] 4505 call .end_sumsub_write 4506 paddsw m8, m5, m16 ; t13[idct32] 4507 psubsw m9, m5, m16 ; t18[idct32] 4508 call .end_sumsub_write 4509 paddsw m8, m6, m15 ; t14[idct32] 4510 psubsw m9, m6, m15 ; t17[idct32] 4511 call .end_sumsub_write 4512 paddsw m8, m7, m14 ; t15[idct32] 4513 psubsw m9, m7, m14 ; t16[idct32] 4514 ; fall-through: the last pair runs .end_sumsub_write inline, whose ret then leaves .pass2_end 4515.end_sumsub_write: 4516 mova m10, [stkhiq] ; t63-n 4517 mova m12, [stkloq] ; t32+n 4518 psubsw m11, m8, m10 ; out63-n 4519 paddsw m8, m10 ; out0 +n 4520 psubsw m10, m9, m12 ; out32+n 4521 paddsw m9, m12 ; out31-n 4522 REPX {pmulhrsw x, m13}, m11, m8, m10, m9 4523 paddw m8, [dstq] 4524 paddw m9, [dst2q] 4525 paddw m10, [dstq+stride32q] 4526 paddw m11, [dst2q+stride32q] 4527 REPX {pminsw x, m30}, m11, m8, m10, m9 ; upper clamp; .pass2_end loads m30 from pixel_10bpc_max 4528 REPX {pmaxsw x, m31}, m11, m8, m10, m9 ; lower clamp against m31 4529 mova [dstq ], m8 4530 mova [dst2q ], m9 4531 mova [dstq +stride32q], m10 4532 mova [dst2q+stride32q], m11 4533 add stkloq, mmsize ; next t32+n 4534 sub stkhiq, mmsize ; next t63-n 4535 add dstq, strideq 4536 sub dst2q, strideq 4537 ret 4538.pass2_fast_start: 4539 lea r5, [o_base_8bpc] 4540 lea r4, [rsp+gprsize] 4541 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 4542 mova m0, m21 4543 mova m3, m18 4544
call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 4545 mova m0, m20 4546 mova m3, m16 4547 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 4548 mova m0, m26 4549 mova m3, m17 4550 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast 4551 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 4552 4553 mova m0, m27 4554 mova m1, m28 4555 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 4556 mova [rsp+64*32+gprsize], m14 4557 mova [rsp+64*33+gprsize], m15 4558 mova [rsp+64*34+gprsize], m16 4559 mova [rsp+64*35+gprsize], m17 4560 mova [rsp+64*36+gprsize], m18 4561 mova [rsp+64*37+gprsize], m19 4562 mova [rsp+64*38+gprsize], m20 4563 mova [rsp+64*39+gprsize], m21 4564 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 4565.pass2_fast2_start: 4566 lea r5, [o_base_8bpc] 4567 lea r4, [rsp+gprsize] 4568 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 4569 mova m0, m21 4570 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 4571 mova m0, m20 4572 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 4573 mova m0, m26 4574 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 4575 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 4576 4577 mova m0, m27 4578 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3 4579 mova [rsp+64*32+gprsize], m14 4580 mova [rsp+64*33+gprsize], m15 4581 mova [rsp+64*34+gprsize], m16 4582 mova [rsp+64*35+gprsize], m17 4583 mova [rsp+64*36+gprsize], m18 4584 mova [rsp+64*37+gprsize], m19 4585 mova [rsp+64*38+gprsize], m20 4586 mova [rsp+64*39+gprsize], m21 4587 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3 4588.dconly: 4589 DEFINE_ARGS dst, stride, c, eob 4590 imul r6d, [cq], 181 4591 mov [cq], eobd 4592 or r3d, 64 4593 jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3 4594.pass1_fast: 4595 pmulld m0, m12, [cq+128* 0] 4596 pmulld m1, m12, [cq+128* 4] 4597 pmulld m2, m12, [cq+128* 8] 4598 pmulld m3, m12, [cq+128*12] 4599 mov r3d, 16*12 4600 call 
m(idct_8x16_internal_10bpc).main_fast_rect2 4601 pmulld m16, m12, [cq+128* 2] 4602 pmulld m17, m12, [cq+128* 6] 4603 pmulld m18, m12, [cq+128*10] 4604 pmulld m19, m12, [cq+128*14] 4605 call m(idct_16x16_internal_10bpc).main_fast_rect2 4606 call .pass1_load_spill 4607 call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2 4608 jmp .pass1_end 4609.pass1: 4610 pmulld m0, m12, [cq+128* 0] 4611 pmulld m1, m12, [cq+128* 4] 4612 pmulld m2, m12, [cq+128* 8] 4613 pmulld m3, m12, [cq+128*12] 4614 pmulld m4, m12, [cq+128*16] 4615 pmulld m5, m12, [cq+128*20] 4616 pmulld m6, m12, [cq+128*24] 4617 pmulld m7, m12, [cq+128*28] 4618 call m(idct_8x16_internal_10bpc).main_rect2 4619 pmulld m16, m12, [cq+128* 2] 4620 pmulld m17, m12, [cq+128* 6] 4621 pmulld m18, m12, [cq+128*10] 4622 pmulld m19, m12, [cq+128*14] 4623 pmulld m20, m12, [cq+128*18] 4624 pmulld m21, m12, [cq+128*22] 4625 pmulld m22, m12, [cq+128*26] 4626 pmulld m23, m12, [cq+128*30] 4627 call m(idct_16x16_internal_10bpc).main_rect2 4628 call .pass1_load_spill 4629 pmulld m16, m12, [cq+128*17] 4630 pmulld m17, m12, [cq+128*19] 4631 pmulld m18, m12, [cq+128*21] 4632 pmulld m19, m12, [cq+128*23] 4633 pmulld m20, m12, [cq+128*25] 4634 pmulld m21, m12, [cq+128*27] 4635 pmulld m22, m12, [cq+128*29] 4636 pmulld m23, m12, [cq+128*31] 4637 call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2 4638.pass1_end: 4639 vpbroadcastd m11, [o(pd_1)] 4640 lea r4, [cq+128*8] 4641 call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end 4642 punpcklqdq m27, m0, m20 ; 0 4643 punpckhqdq m0, m20 ; 1 4644 punpcklqdq m24, m5, m16 ; 10 4645 punpckhqdq m16, m5, m16 ; 11 4646 punpcklqdq m23, m3, m21 ; 6 4647 punpckhqdq m21, m3, m21 ; 7 4648 punpcklqdq m25, m7, m8 ; 14 4649 punpckhqdq m3, m7, m8 ; 15 4650 punpcklqdq m22, m15, m4 ; 2 4651 punpckhqdq m26, m15, m4 ; 3 4652 punpcklqdq m15, m6, m17 ; 12 4653 punpckhqdq m17, m6, m17 ; 13 4654 punpcklqdq m28, m14, m18 ; 8 4655 punpckhqdq m18, m14, m18 ; 9 4656 punpcklqdq m14, m2, m1 ; 4 4657 punpckhqdq m20, 
m2, m1 ; 5 4658 ret 4659.pass1_load_spill: 4660 call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub 4661 mova [cq+128* 0], m0 4662 pmulld m0, m12, [cq+128* 1] 4663 mova [cq+128* 1], m1 4664 mova [cq+128* 2], m2 4665 pmulld m1, m12, [cq+128* 3] 4666 pmulld m2, m12, [cq+128* 5] 4667 mova [cq+128* 3], m3 4668 mova [cq+128* 4], m4 4669 pmulld m3, m12, [cq+128* 7] 4670 pmulld m4, m12, [cq+128* 9] 4671 mova [cq+128* 5], m5 4672 mova [cq+128* 6], m6 4673 mova [cq+128* 7], m7 4674 pmulld m5, m12, [cq+128*11] 4675 pmulld m6, m12, [cq+128*13] 4676 pmulld m7, m12, [cq+128*15] 4677 mova [cq+128* 8], m23 4678 mova [cq+128* 9], m22 4679 mova [cq+128*10], m21 4680 mova [cq+128*11], m20 4681 mova [cq+128*12], m19 4682 mova [cq+128*13], m18 4683 mova [cq+128*14], m17 4684 mova [cq+128*15], m16 4685 ret 4686 4687cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob 4688%undef cmp 4689 lea r5, [o_base] 4690 test eobd, eobd 4691 jz .dconly ; eob == 0 is handled by .dconly, before PROLOGUE runs 4692 4693 PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob 4694%undef cmp 4695 vpbroadcastd m12, [o(pd_2896)] 4696 vpbroadcastd m13, [o(pd_2048)] 4697 vpbroadcastd m14, [o(clip_18b_min)] 4698 vpbroadcastd m15, [o(clip_18b_max)] 4699 cmp eobd, 36 4700 jl .fast ; 8x8 4701 cmp eobd, 151 4702 jge .full ; 16x16 4703 lea r4, [idct64_mul_16bpc] ; 36 <= eob < 151 falls through to here 4704 lea r6, [rsp+4*64] 4705 mova m0, [cq+64* 1] 4706 mova m3, [cq+64*15] 4707 call .main_part1_fast ; m0 = in1, m3 = in15 4708 mova m0, [cq+64* 7] 4709 mova m3, [cq+64* 9] 4710 call .main_part1_fast ; m0 = in7, m3 = in9 4711 mova m0, [cq+64* 5] 4712 mova m3, [cq+64*11] 4713 call .main_part1_fast ; m0 = in5, m3 = in11 4714 mova m0, [cq+64* 3] 4715 mova m3, [cq+64*13] 4716 call .main_part1_fast ; m0 = in3, m3 = in13 4717 call .main_part2 4718 mova m0, [cq+64* 0] 4719 mova m1, [cq+64* 8] 4720 mova m16, [cq+64* 4] 4721 mova m17, [cq+64*12] 4722 call m(idct_8x16_internal_10bpc).main_fast2 4723 call m(idct_16x16_internal_10bpc).main_fast2 4724 call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub 4725 call .pass1_load_spill 4726 call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2 4727
mov r6d, 12*8 4728 jmp .idct64_end 4729.full: 4730 lea r4, [idct64_mul_16bpc] 4731 lea r6, [rsp+4*64] 4732 mova m0, [cq+64* 1] 4733 mova m1, [cq+64*31] 4734 mova m2, [cq+64*17] 4735 mova m3, [cq+64*15] 4736 call .main_part1 4737 mova m0, [cq+64* 7] 4738 mova m1, [cq+64*25] 4739 mova m2, [cq+64*23] 4740 mova m3, [cq+64* 9] 4741 call .main_part1 4742 mova m0, [cq+64* 5] 4743 mova m1, [cq+64*27] 4744 mova m2, [cq+64*21] 4745 mova m3, [cq+64*11] 4746 call .main_part1 4747 mova m0, [cq+64* 3] 4748 mova m1, [cq+64*29] 4749 mova m2, [cq+64*19] 4750 mova m3, [cq+64*13] 4751 call .main_part1 4752 call .main_part2 4753 mova m0, [cq+64* 0] 4754 mova m1, [cq+64* 8] 4755 mova m2, [cq+64*16] 4756 mova m3, [cq+64*24] 4757 mova m16, [cq+64* 4] 4758 mova m17, [cq+64*12] 4759 mova m18, [cq+64*20] 4760 mova m19, [cq+64*28] 4761 call m(idct_8x16_internal_10bpc).main_fast 4762 call m(idct_16x16_internal_10bpc).main_fast 4763 call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub 4764 call .pass1_load_spill 4765 mova m4, [cq+64*18] 4766 mova m5, [cq+64*22] 4767 mova m6, [cq+64*26] 4768 mova m7, [cq+64*30] 4769 call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast 4770 mov r6d, 28*8 4771 jmp .idct64_end 4772.dconly: 4773 imul r6d, [cq], 181 4774 mov [cq], eobd 4775 or r3d, 16 4776.dconly1: 4777 add r6d, 640 4778 sar r6d, 10 4779.dconly2: 4780 vpbroadcastd m3, [o(dconly_10bpc)] 4781 imul r6d, 181 4782 add r6d, 2176 4783 sar r6d, 12 4784 vpbroadcastw m2, r6d 4785 paddsw m2, m3 4786.dconly_loop: 4787 paddsw m0, m2, [dstq+64*0] 4788 paddsw m1, m2, [dstq+64*1] 4789 psubusw m0, m3 4790 psubusw m1, m3 4791 mova [dstq+64*0], m0 4792 mova [dstq+64*1], m1 4793 add dstq, strideq 4794 dec r3d 4795 jg .dconly_loop 4796 ret 4797.pass1_load_spill: 4798 mova [cq+64* 0], m0 4799 mova m0, [cq+64* 2] 4800 mova [cq+64* 2], m1 4801 mova m1, [cq+64* 6] 4802 mova [cq+64* 4], m2 4803 mova [cq+64* 6], m3 4804 mova m2, [cq+64*10] 4805 mova m3, [cq+64*14] 4806 mova [cq+64* 8], m4 4807 mova [cq+64*10], m5 4808 mova 
[cq+64*12], m6 4809 mova [cq+64*14], m7 4810 mova [cq+64* 1], m23 4811 mova [cq+64* 3], m22 4812 mova [cq+64* 5], m21 4813 mova [cq+64* 7], m20 4814 mova [cq+64* 9], m19 4815 mova [cq+64*11], m18 4816 mova [cq+64*13], m17 4817 mova [cq+64*15], m16 4818 ret 4819ALIGN function_align 4820.main_part1_fast_rect2: 4821 REPX {paddd x, m13}, m0, m3 4822 REPX {psrad x, 12 }, m0, m3 4823.main_part1_fast: 4824 pmulld m7, m0, [r4+4*0]{bcstd} ; t63a 4825 pmulld m0, [r4+4*1]{bcstd} ; t32a 4826 pmulld m4, m3, [r4+4*6]{bcstd} ; t60a 4827 pmulld m3, [r4+4*7]{bcstd} ; t35a 4828 vpbroadcastd m10, [r4+4*8] 4829 vpbroadcastd m11, [r4+4*9] 4830 REPX {paddd x, m13}, m7, m0, m4, m3 4831 REPX {psrad x, 12 }, m7, m0, m4, m3 4832 mova m8, m0 ; t33 (equals t32a: this entry has no t33a term) 4833 mova m1, m7 ; t62 (equals t63a) 4834 mova m6, m3 ; t34 (equals t35a) 4835 mova m2, m4 ; t61 (equals t60a) 4836 jmp .main_part1b ; only m0 and m3 are inputs on this entry; join the shared tail 4837.main_part1_rect2: 4838 REPX {paddd x, m13}, m0, m1, m2, m3 4839 REPX {psrad x, 12 }, m0, m1, m2, m3 ; (x + m13) >> 12 on all four inputs, then fall through 4840.main_part1: ; idct64 steps 1-5 4841 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 4842 ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 4843 ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 4844 ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 4845 pmulld m7, m0, [r4+4*0]{bcstd} ; t63a 4846 pmulld m0, [r4+4*1]{bcstd} ; t32a 4847 pmulld m6, m1, [r4+4*2]{bcstd} ; t62a 4848 pmulld m1, [r4+4*3]{bcstd} ; t33a 4849 pmulld m5, m2, [r4+4*4]{bcstd} ; t61a 4850 pmulld m2, [r4+4*5]{bcstd} ; t34a 4851 pmulld m4, m3, [r4+4*6]{bcstd} ; t60a 4852 pmulld m3, [r4+4*7]{bcstd} ; t35a 4853 vpbroadcastd m10, [r4+4*8] 4854 vpbroadcastd m11, [r4+4*9] 4855 REPX {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3 4856 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 4857 psubd m8, m0, m1 ; t33 4858 paddd m0, m1 ; t32 4859 psubd m1, m7, m6 ; t62 4860 paddd m7, m6 ; t63 4861 psubd m6, m3, m2 ; t34 4862 paddd m3, m2 ; t35 4863 psubd m2, m4, m5 ; t61 4864 paddd m4, m5 ; t60 4865.main_part1b: 4866 REPX {pmaxsd x, m14}, m8, m1, m6, m2 4867 REPX {pminsd x, m15}, m8, m1, m6, m2 4868 ITX_MULSUB_2D 1, 8, 5, 9, _,
13, 10, 11 ; t33a, t62a 4869 ITX_MULSUB_2D 2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a 4870 REPX {pmaxsd x, m14}, m0, m3, m7, m4 4871 REPX {pminsd x, m15}, m0, m3, m7, m4 4872 vpbroadcastd m10, [r4+4*10] 4873 vpbroadcastd m11, [r4+4*11] 4874 psubd m5, m0, m3 ; t35a 4875 paddd m0, m3 ; t32a 4876 psubd m3, m7, m4 ; t60a 4877 paddd m7, m4 ; t63a 4878 psubd m4, m1, m6 ; t34 4879 paddd m1, m6 ; t33 4880 psubd m6, m8, m2 ; t61 4881 paddd m8, m2 ; t62 4882 REPX {pmaxsd x, m14}, m5, m3, m4, m6 4883 REPX {pminsd x, m15}, m5, m3, m4, m6 4884 ITX_MULSUB_2D 3, 5, 2, 9, _, 13, 10, 11 ; t35, t60 4885 ITX_MULSUB_2D 6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a 4886 REPX {pmaxsd x, m14}, m0, m7, m1, m8 4887 REPX {pminsd x, m15}, m0, m7, m1, m8 4888 add r4, 4*12 4889 mova [r6-64*4], m0 4890 mova [r6+64*3], m7 4891 mova [r6-64*3], m1 4892 mova [r6+64*2], m8 4893 mova [r6-64*2], m6 4894 mova [r6+64*1], m4 4895 mova [r6-64*1], m3 4896 mova [r6+64*0], m5 4897 add r6, 64*8 4898 ret 4899.main_part2: ; idct64 steps 6-9 4900 lea r4, [r6+64*3] 4901 sub r6, 64*4 4902 vpbroadcastd m10, [pd_1567] 4903 vpbroadcastd m11, [pd_3784] 4904.main_part2_loop: 4905 mova m0, [r6-64*32] ; t32a 4906 mova m1, [r4-64*24] ; t39a 4907 mova m2, [r4-64*32] ; t63a 4908 mova m3, [r6-64*24] ; t56a 4909 mova m4, [r6-64*16] ; t40a 4910 mova m5, [r4-64* 8] ; t47a 4911 mova m6, [r4-64*16] ; t55a 4912 mova m7, [r6-64* 8] ; t48a 4913 psubd m8, m0, m1 ; t39 4914 paddd m0, m1 ; t32 4915 psubd m1, m2, m3 ; t56 4916 paddd m2, m3 ; t63 4917 psubd m3, m5, m4 ; t40 4918 paddd m5, m4 ; t47 4919 psubd m4, m7, m6 ; t55 4920 paddd m7, m6 ; t48 4921 REPX {pmaxsd x, m14}, m8, m1, m3, m4 4922 REPX {pminsd x, m15}, m8, m1, m3, m4 4923 ITX_MULSUB_2D 1, 8, 6, 9, _, 13, 10, 11 ; t39a, t56a 4924 ITX_MULSUB_2D 4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a 4925 REPX {pmaxsd x, m14}, m0, m2, m5, m7 4926 REPX {pminsd x, m15}, m0, m5, m2, m7 4927 psubd m6, m2, m7 ; t48a 4928 paddd m2, m7 ; t63a 4929 psubd m7, m0, m5 ; t47a 4930 paddd m0, m5 ; t32a 4931 
psubd m5, m8, m4 ; t55 4932 paddd m8, m4 ; t56 4933 psubd m4, m1, m3 ; t40 4934 paddd m1, m3 ; t39 4935 REPX {pmaxsd x, m14}, m6, m7, m5, m4 4936 REPX {pminsd x, m15}, m6, m7, m5, m4 4937 REPX {pmulld x, m12}, m6, m7, m5, m4 4938 REPX {pmaxsd x, m14}, m2, m0, m8, m1 4939 REPX {pminsd x, m15}, m2, m0, m8, m1 4940 paddd m6, m13 4941 paddd m5, m13 4942 psubd m3, m6, m7 ; t47 4943 paddd m6, m7 ; t48 4944 psubd m7, m5, m4 ; t40a 4945 paddd m5, m4 ; t55a 4946 REPX {psrad x, 12}, m3, m6, m7, m5 4947 mova [r4-64* 8], m2 4948 mova [r6-64*32], m0 4949 mova [r6-64* 8], m8 4950 mova [r4-64*32], m1 4951 mova [r4-64*24], m3 4952 mova [r6-64*16], m6 4953 mova [r6-64*24], m7 4954 mova [r4-64*16], m5 4955 add r6, 64 4956 sub r4, 64 4957 cmp r6, r4 4958 jl .main_part2_loop ; r6 steps up and r4 steps down by 64 bytes per pass; stop once they cross 4959 ret 4960.idct64_main_end: 4961%macro IDCT64_PASS1_END 9 4962 mova m%5, [%9+%1*128] ; t0+n [idct32] + idct64 rounding 4963 psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64] 4964 paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64] 4965 REPX {pmaxsd x, m14}, m%6, m%5 4966 REPX {pminsd x, m15}, m%6, m%5 4967 REPX {paddd x, m11}, m%6, m%5 ; m11 is both the addend here and the shift count for vpsravd below 4968 mova m%2, [r3+%3*64] ; t32+n [idct64] 4969 mova m%7, [r3+%4*64] ; t63-n [idct64] 4970 psubd m%8, m%5, m%7 ; out63-n 4971 paddd m%5, m%7 ; out0+n 4972 psubd m%7, m%6, m%2 ; out32+n 4973 paddd m%6, m%2 ; out31-n 4974 REPX {vpsravd x, m11}, m%8, m%5, m%7, m%6 4975%endmacro 4976 4977%macro IDCT64_PASS1_ENDx4 1 4978%assign %%m1 %1 ; t32+n 4979%assign %%m2 (7-%1) ; t39-n 4980%assign %%m3 (8+%1) ; t40+n 4981%assign %%m4 (15-%1) ; t47-n 4982%assign %%m5 (16+%1) ; t48+n 4983%assign %%m6 (23-%1) ; t55-n 4984%assign %%m7 (24+%1) ; t56+n 4985%assign %%m8 (31-%1) ; t63-n 4986 4987%assign %%r1 %1 ; t16+n 4988%assign %%r2 (7-%1) ; t23-n 4989%assign %%r3 (16+%1) ; t24+n 4990%assign %%r4 (23-%1) ; t31-n 4991 4992%assign %%c1 (%1) ; t0/8+n 4993%assign %%c2 (7-%1) ; t7/15-n 4994 4995 IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63 4996 IDCT64_PASS1_END %%c1,
%%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48 4997 packssdw m %+ %%r1, m24, m29 4998 packssdw m %+ %%r4, m28, m25 4999 packssdw m26, m31 5000 packssdw m30, m27 5001 mova [r3+%%m5*mmsize], m26 5002 mova [r3+%%m8*mmsize], m30 5003 IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56 5004 IDCT64_PASS1_END %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55 5005 packssdw m %+ %%r2, m24, m29 5006 packssdw m %+ %%r3, m28, m25 5007 packssdw m26, m31 5008 packssdw m30, m27 5009 mova [r3+%%m6*mmsize], m26 5010 mova [r3+%%m7*mmsize], m30 5011%endmacro 5012 IDCT64_PASS1_ENDx4 0 5013 IDCT64_PASS1_ENDx4 1 5014 IDCT64_PASS1_ENDx4 2 5015 IDCT64_PASS1_ENDx4 3 5016 ret 5017.idct64_end: 5018 vpbroadcastd m11, [o(pd_2)] ; addend and shift count consumed by IDCT64_PASS1_END 5019 lea r4, [cq+64] 5020 mov r3, rsp 5021 lea r5, [o_base_8bpc] 5022 call .idct64_main_end 5023 5024 pxor m12, m12 5025.zero_loop: 5026 REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3 ; clear four 64-byte vectors of cq per pass 5027 sub r6d, 8*4 5028 jge .zero_loop ; r6d was set to 12*8 or 28*8 by the path that jumped to .idct64_end 5029 5030 lea r3, [strideq*3] 5031 mov r4, dstq ; keep the original dstq for the second .pass2 call 5032 call .pass2 5033 mova m0, [rsp+16*mmsize] ; reload the 16 vectors IDCT64_PASS1_ENDx4 stored at r3+(16..31)*mmsize (r3 was rsp) 5034 mova m1, [rsp+17*mmsize] 5035 mova m2, [rsp+18*mmsize] 5036 mova m3, [rsp+19*mmsize] 5037 mova m4, [rsp+20*mmsize] 5038 mova m5, [rsp+21*mmsize] 5039 mova m6, [rsp+22*mmsize] 5040 mova m7, [rsp+23*mmsize] 5041 mova m16, [rsp+24*mmsize] 5042 mova m17, [rsp+25*mmsize] 5043 mova m18, [rsp+26*mmsize] 5044 mova m19, [rsp+27*mmsize] 5045 mova m20, [rsp+28*mmsize] 5046 mova m21, [rsp+29*mmsize] 5047 mova m22, [rsp+30*mmsize] 5048 mova m23, [rsp+31*mmsize] 5049 lea dstq, [r4+64] ; second call writes 64 bytes to the right of the saved dstq 5050 call .pass2 5051 RET 5052.pass2: 5053 psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 5054 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 5055 call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 5056 5057 punpckhqdq m19, m5, m16 ; 11 5058 punpcklqdq m5, m16 ; 10 5059 punpckhqdq m16, m2, m1 ; 5 5060 punpcklqdq m2, m1 ; 4 5061 punpcklqdq m1, m15, m4 ; 2 5062 punpckhqdq m15, m4 ; 3 5063 punpcklqdq m4, m14, m18 ; 8 5064 punpckhqdq m18, m14, m18 ; 9 5065
punpckhqdq m14, m0, m20 ; 1 5066 punpcklqdq m0, m20 ; 0 5067 punpckhqdq m20, m6, m17 ; 13 5068 punpcklqdq m6, m17 ; 12 5069 punpckhqdq m17, m3, m21 ; 7 5070 punpcklqdq m3, m21 ; 6 5071 punpckhqdq m21, m7, m8 ; 15 5072 punpcklqdq m7, m8 ; 14 5073 5074 call m(inv_txfm_add_dct_dct_32x8_8bpc).main 5075 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 5076.write: 5077 vpbroadcastd m11, [pw_2048] 5078 pxor m12, m12 5079 vpbroadcastd m13, [pixel_10bpc_max] 5080 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8 5081 pmulhrsw m0, m11, m14 5082 pmulhrsw m1, m11, m15 5083 pmulhrsw m2, m11, m16 5084 pmulhrsw m3, m11, m17 5085 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 5086 pmulhrsw m0, m11, m18 5087 pmulhrsw m1, m11, m19 5088 pmulhrsw m2, m11, m20 5089 pmulhrsw m3, m11, m21 5090 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 5091.fast: ; 8x8 packed 5092 movshdup m7, [o(permB)] 5093 mova ym0, [cq+64*1] 5094 mova ym2, [cq+64*5] 5095 mova ym3, [cq+64*3] 5096 mova ym1, [cq+64*7] 5097 vpermt2q m0, m7, m2 ; 1 5 5098 vpermt2q m1, m7, m3 ; 7 3 5099 call .main_oddhalf_packed 5100 mova [rsp+ 0*mmsize], m0 5101 mova [rsp+ 1*mmsize], m1 5102 mova [rsp+ 2*mmsize], m2 5103 mova [rsp+ 3*mmsize], m3 5104 mova [rsp+ 4*mmsize], m4 5105 mova [rsp+ 5*mmsize], m5 5106 mova [rsp+ 6*mmsize], m6 5107 mova [rsp+ 7*mmsize], m7 5108 mova [rsp+ 8*mmsize], m16 5109 mova [rsp+ 9*mmsize], m17 5110 mova [rsp+10*mmsize], m18 5111 mova [rsp+11*mmsize], m19 5112 mova [rsp+12*mmsize], m20 5113 mova [rsp+13*mmsize], m21 5114 mova [rsp+14*mmsize], m22 5115 mova [rsp+15*mmsize], m23 5116 5117 movshdup m7, [o(permB)] 5118 mova ym0, [cq+64*0] 5119 mova ym4, [cq+64*4] 5120 mova ym16, [cq+64*2] 5121 mova ym5, [cq+64*6] 5122 vpermt2q m16, m7, m5 ; 2 6 5123 vpermq m0, m7, m0 ; 0 0 5124 vpermq m4, m7, m4 ; 4 4 5125 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 5126 ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data 5127 5128 ; zero input coefs 5129 pxor m12, m12 5130 REPX {mova [cq+x*64], ym12}, 0, 
1, 2, 3, 4, 5, 6, 7 5131 5132 vpbroadcastd m11, [o(pd_2)] 5133 call .main_end 5134 lea r3, [strideq*3] 5135 mov r4, dstq 5136 call .pass2_fast 5137 mova m0, m24 5138 mova m1, m25 5139 mova m2, m26 5140 mova m3, m27 5141 mova m4, m28 5142 mova m5, m29 5143 mova m6, m30 5144 mova m7, m31 5145 lea dstq, [r4+64] 5146 lea r5, [o_base] 5147 call .pass2_fast 5148 RET 5149.pass2_fast: 5150 call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 5151 lea r5, [o_base_8bpc] 5152 punpckhqdq m14, m0, m2 ; 1 5153 punpcklqdq m0, m2 ; 0 5154 punpcklqdq m1, m3, m4 ; 2 5155 punpckhqdq m15, m3, m4 ; 3 5156 punpcklqdq m2, m5, m7 ; 4 5157 punpckhqdq m16, m5, m7 ; 5 5158 punpcklqdq m3, m6, m8 ; 6 5159 punpckhqdq m17, m6, m8 ; 7 5160 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 5161 jmp .write 5162.main_end: 5163 5164%macro IDCT64_PASS1_PACKED_END 7 5165 psubd m%5, m%1, m%2 ; out31-n [idct32] = t31-n [idct64] 5166 paddd m%1, m%2 ; out0+n [idct32] = t0+n [idct64] 5167 REPX {pmaxsd x, m14}, m%5, m%1 5168 REPX {pminsd x, m15}, m%5, m%1 5169 REPX {paddd x, m11}, m%5, m%1 5170 mova m%2, [rsp+%6*64+gprsize] ; t32+n [idct64] 5171 mova m%3, [rsp+%7*64+gprsize] ; t63-n [idct64] 5172 psubd m%4, m%1, m%3 ; out63-n 5173 paddd m%1, m%3 ; out0+n 5174 psubd m%3, m%5, m%2 ; out32+n 5175 paddd m%2, m%5 ; out31-n 5176 REPX {vpsravd x, m11}, m%4, m%1, m%3, m%2 5177%endmacro 5178 5179 IDCT64_PASS1_PACKED_END 0, 22, 24, 10, 12, 0, 15 ; out0/1,31/30,32/33,63/62 5180 IDCT64_PASS1_PACKED_END 7, 9, 31, 13, 12, 7, 8 ; out15/14,16/17,47/46,48/49 5181 packssdw m0, m9 5182 packssdw m7, m22 5183 packssdw m24, m13 5184 packssdw m31, m10 5185 IDCT64_PASS1_PACKED_END 1, 21, 25, 10, 12, 1, 14 ; out3/2,28/29,35/34,60/61 5186 IDCT64_PASS1_PACKED_END 6, 16, 30, 13, 12, 6, 9 ; out12/13,19/18,44/45,51/50 5187 packssdw m1, m16 5188 packssdw m6, m21 5189 packssdw m25, m13 5190 packssdw m30, m10 5191 IDCT64_PASS1_PACKED_END 2, 20, 26, 10, 12, 2, 13 ; out4/5,27/26,36/37,59/58 5192 IDCT64_PASS1_PACKED_END 5, 17, 29, 
13, 12, 5, 10 ; out11/10,20/21,43/42,52/53 5193 packssdw m2, m17 5194 packssdw m5, m20 5195 packssdw m26, m13 5196 packssdw m29, m10 5197 IDCT64_PASS1_PACKED_END 3, 19, 27, 10, 12, 3, 12 ; out7/6,24/25,39/38,56/57 5198 IDCT64_PASS1_PACKED_END 4, 18, 28, 13, 12, 4, 11 ; out8/9,23/22,40/41,55/54 5199 packssdw m3, m18 5200 packssdw m4, m19 5201 packssdw m27, m13 5202 packssdw m28, m10 5203 ret 5204.main_oddhalf_packed_rect2: 5205 REPX {paddd x, m13}, m0, m1 5206 REPX {psrad x, 12 }, m0, m1 5207.main_oddhalf_packed: 5208 ; m0=in1 in5, m1=in7 in3 5209 vbroadcasti32x4 m2, [o(pd_101_501)] 5210 vbroadcasti32x4 m3, [o(pd_m700_m301)] 5211 vbroadcasti32x4 m4, [o(pd_4095_4065)] 5212 vbroadcasti32x4 m5, [o(pd_4036_4085)] 5213 pmulld m2, m0 5214 pmulld m3, m1 5215 pmulld m0, m4 5216 pmulld m1, m5 5217 REPX {paddd x, m13}, m2, m3, m0, m1 5218 REPX {psrad x, 12 }, m2, m3, m0, m1 5219 5220 ; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47 5221 ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49 5222 ; end of step 1-2 5223 5224 vbroadcasti32x4 m10, [o(pd_401_1931)] 5225 vbroadcasti32x4 m11, [o(pd_4076_3612)] 5226 mova m4, m0 5227 mova m5, m2 5228 ITX_MULSUB_2D 4, 5, 8, 9, _, 13, 10, 11 5229 vbroadcasti32x4 m10, [o(pd_3166_3920)] 5230 vbroadcasti32x4 m11, [o(pd_2598_1189)] 5231 mova m6, m3 5232 mova m7, m1 5233 ITX_MULSUB_2D 7, 6, 8, 9, _, 13, 10, 11, 2 5234 5235 ; m4=t33a t41a -> t41/42 t33/34, m5=t63a t54a -> t61/62 t53/54 5236 ; m6=t38a t46a -> t37/38 t45/46, m7=t57a t49a -> t57/58 t49/50 5237 ; and from earlier: 5238 ; m0=t63 t55 -> t60/63a t52/55a, m1=t56 t48 -> t56/59a t48/51a 5239 ; m2=t32 t40 -> t32/35a t40/43a, m3=t39 t47 -> t36/39a t44/47a 5240 ; end of step 3-4 5241 5242 punpcklqdq m22, m2, m4 ; t32a/33 or t35a/34 5243 punpcklqdq m21, m3, m6 ; t36a/37 or t39a/38 5244 punpckhqdq m18, m2, m4 ; t40a/41 or t43a/42 5245 punpckhqdq m17, m3, m6 ; t44a/45 or t47a/46 5246 punpckhqdq m6, m1, m7 ; t48a/49 or t51a/50 5247 punpckhqdq m19, m0, m5 ; t52a/53 or 
t55a/54 5248 punpcklqdq m8, m1, m7 ; t56a/57 or t59a/58 5249 punpcklqdq m23, m0, m5 ; t60a/61 or t63a/62 5250 mova m0, m22 5251 mova m7, m21 5252 mova m3, m18 5253 mova m16, m17 5254 mova m5, m6 5255 mova m4, m19 5256 mova m2, m8 5257 mova m1, m23 5258 ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a] 5259 5260 ; step5 5261 vpbroadcastd m10, [o(pd_799)] 5262 vpbroadcastd m11, [o(pd_4017)] 5263 ITX_MULSUB_2D 1, 22, 20, 9, _, 13, 10, 11 ; t35/34a, t60/61a 5264 ITX_MULSUB_2D 8, 7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a 5265 vpbroadcastd m10, [o(pd_3406)] 5266 vpbroadcastd m11, [o(pd_2276)] 5267 ITX_MULSUB_2D 19, 3, 20, 9, _, 13, 10, 11 ; t43/42a, t52/53a 5268 ITX_MULSUB_2D 5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a 5269 ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a] 5270 ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a] 5271 5272 ; step6 5273 psubd m20, m0, m21 ; t39/38a 5274 paddd m0, m21 ; t32/33a 5275 psubd m21, m1, m7 ; t36a/37 5276 paddd m1, m7 ; t35a/34 5277 REPX {pmaxsd x, m14}, m20, m0, m21, m1 5278 psubd m7, m16, m18 ; t40/41a 5279 paddd m16, m18 ; t47/46a 5280 REPX {pminsd x, m15}, m20, m0, m21, m1 5281 psubd m18, m17, m19 ; t43a/42 5282 paddd m17, m19 ; t44a/45 5283 REPX {pmaxsd x, m14}, m7, m16, m18, m17 5284 psubd m19, m6, m4 ; t55/54a 5285 paddd m6, m4 ; t48/49a 5286 REPX {pminsd x, m15}, m7, m16, m18, m17 5287 psubd m4, m5, m3 ; t52a/53 5288 paddd m5, m3 ; t51a/50 5289 REPX {pmaxsd x, m14}, m19, m6, m4, m5 5290 psubd m3, m23, m2 ; t56/57a 5291 paddd m23, m2 ; t63/62a 5292 REPX {pminsd x, m15}, m19, m6, m4, m5 5293 psubd m2, m22, m8 ; t59a/58 5294 paddd m22, m8 ; t60a/61 5295 REPX {pmaxsd x, m14}, m3, m23, m2, m22 5296 REPX {pminsd x, m15}, m3, m23, m2, m22 5297 ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a] 5298 ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a] 5299 5300 ; step7 5301 vpbroadcastd m10, [o(pd_1567)] 5302 vpbroadcastd m11, [o(pd_3784)] 5303 ITX_MULSUB_2D 2, 21, 8, 9, _, 13, 10, 11 
; t36/37a, t59/58a 5304 ITX_MULSUB_2D 3, 20, 8, 9, _, 13, 10, 11 ; t39a/38, t56a/57 5305 ITX_MULSUB_2D 19, 7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41 5306 ITX_MULSUB_2D 4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a 5307 ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a] 5308 5309 ; step8 5310 psubd m8, m0, m16 ; t47a/46 5311 paddd m0, m16 ; t32a/33 5312 psubd m16, m1, m17 ; t44/45a 5313 paddd m1, m17 ; t35/34a 5314 REPX {pmaxsd x, m14}, m8, m0, m16, m1 5315 psubd m17, m2, m18 ; t43a/42 5316 paddd m2, m18 ; t36a/37 5317 REPX {pminsd x, m15}, m8, m0, m16, m1 5318 psubd m18, m3, m7 ; t40/41a 5319 paddd m3, m7 ; t39/38a 5320 REPX {pmaxsd x, m14}, m17, m2, m18, m3 5321 psubd m7, m23, m6 ; t48a/49 5322 paddd m23, m6 ; t63a/62 5323 REPX {pminsd x, m15}, m17, m2, m18, m3 5324 psubd m6, m22, m5 ; t51/50a 5325 paddd m22, m5 ; t60/61a 5326 REPX {pmaxsd x, m14}, m7, m23, m6, m22 5327 psubd m5, m21, m4 ; t52a/53 5328 paddd m21, m4 ; t59a/58 5329 REPX {pminsd x, m15}, m7, m23, m6, m22 5330 psubd m4, m20, m19 ; t55/54a 5331 paddd m20, m19 ; t56/57a 5332 REPX {pmaxsd x, m14}, m5, m21, m4, m20 5333 REPX {pminsd x, m15}, m5, m21, m4, m20 5334 ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a] 5335 5336 ; step9 5337 REPX {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8 5338 REPX {paddd x, m13}, m4, m5, m6, m7 5339 paddd m19, m4, m18 ; t55a/54 5340 psubd m4, m18 ; t40a/41 5341 paddd m18, m5, m17 ; t52/53a 5342 psubd m5, m17 ; t43/42a 5343 paddd m17, m6, m16 ; t51a/50 5344 psubd m6, m16 ; t44a/45 5345 paddd m16, m7, m8 ; t48/49a 5346 psubd m7, m8 ; t47/46a 5347 REPX {psrad x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7 5348 ; m4-7=t40-47[a], m16-19=t48-55[a] 5349 ret 5350 5351cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob 5352 lea r5, [o_base] 5353 test eobd, eobd 5354 jz .dconly 5355 5356 PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob 5357%undef cmp 5358 vpbroadcastd m12, [o(pd_2896)] 5359 vpbroadcastd m13, 
[o(pd_2048)] 5360 vpbroadcastd m14, [o(clip_18b_min)] 5361 vpbroadcastd m15, [o(clip_18b_max)] 5362 cmp eobd, 136 5363 jl .fast 5364 add cq, 64 5365 cmp eobd, 543 5366 jge .full 5367 call .pass1_fast ; bottomright 16x16 zero 5368 mov r7d, 16*12 5369 jmp .lefthalf 5370.full: 5371 call .pass1 5372 mov r7d, 16*28 5373.lefthalf: 5374 mova [cq+128* 0], m0 5375 mova [cq+128* 1], m1 5376 mova [cq+128* 2], m2 5377 mova [cq+128* 3], m3 5378 mova [cq+128* 4], m14 5379 mova [cq+128* 5], m15 5380 mova [cq+128* 6], m16 5381 mova [cq+128* 7], m17 5382 mova [cq+128* 8], m22 5383 mova [cq+128* 9], m23 5384 mova [cq+128*10], m24 5385 mova [cq+128*11], m25 5386 mova [cq+128*12], m26 5387 mova [cq+128*13], m27 5388 mova [cq+128*14], m28 5389 mova [cq+128*15], m29 5390 sub cq, 64 5391 vpbroadcastd m12, [o(pd_2896)] 5392 vpbroadcastd m13, [o(pd_2048)] 5393 vpbroadcastd m14, [o(clip_18b_min)] 5394 vpbroadcastd m15, [o(clip_18b_max)] 5395 sub rsp, 16*64 5396 call .pass1 5397 add rsp, 16*64 5398 lea r5, [o_base_8bpc] 5399 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start 5400 mov r4, dstq 5401 pxor m12, m12 5402 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end 5403 lea dstq, [r4+64] 5404 mova m0, [rsp+16*mmsize] 5405 mova m1, [rsp+17*mmsize] 5406 mova m2, [rsp+18*mmsize] 5407 mova m3, [rsp+19*mmsize] 5408 mova m4, [rsp+20*mmsize] 5409 mova m5, [rsp+21*mmsize] 5410 mova m6, [rsp+22*mmsize] 5411 mova m7, [rsp+23*mmsize] 5412 mova m16, [rsp+24*mmsize] 5413 mova m17, [rsp+25*mmsize] 5414 mova m18, [rsp+26*mmsize] 5415 mova m19, [rsp+27*mmsize] 5416 mova m20, [rsp+28*mmsize] 5417 mova m21, [rsp+29*mmsize] 5418 mova m22, [rsp+30*mmsize] 5419 mova m23, [rsp+31*mmsize] 5420 call .transpose 5421 mova [cq+128* 0+64], m0 5422 mova [cq+128* 1+64], m1 5423 mova [cq+128* 2+64], m2 5424 mova [cq+128* 3+64], m3 5425 mova [cq+128* 4+64], m14 5426 mova [cq+128* 5+64], m15 5427 mova [cq+128* 6+64], m16 5428 mova [cq+128* 7+64], m17 5429 mova [cq+128* 8+64], m22 5430 mova [cq+128* 9+64], m23 5431 mova 
[cq+128*10+64], m24 5432 mova [cq+128*11+64], m25 5433 mova [cq+128*12+64], m26 5434 mova [cq+128*13+64], m27 5435 mova [cq+128*14+64], m28 5436 mova [cq+128*15+64], m29 5437 mova m0, [rsp+ 0*mmsize] 5438 mova m1, [rsp+ 1*mmsize] 5439 mova m2, [rsp+ 2*mmsize] 5440 mova m3, [rsp+ 3*mmsize] 5441 mova m4, [rsp+ 4*mmsize] 5442 mova m5, [rsp+ 5*mmsize] 5443 mova m6, [rsp+ 6*mmsize] 5444 mova m7, [rsp+ 7*mmsize] 5445 mova m16, [rsp+ 8*mmsize] 5446 mova m17, [rsp+ 9*mmsize] 5447 mova m18, [rsp+10*mmsize] 5448 mova m19, [rsp+11*mmsize] 5449 mova m20, [rsp+12*mmsize] 5450 mova m21, [rsp+13*mmsize] 5451 mova m22, [rsp+14*mmsize] 5452 mova m23, [rsp+15*mmsize] 5453 call .transpose 5454 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start 5455 pxor m12, m12 5456.right_zero_loop: 5457 mova [cq+r7*8+64+128*3], m12 5458 mova [cq+r7*8+64+128*2], m12 5459 mova [cq+r7*8+64+128*1], m12 5460 mova [cq+r7*8+64+128*0], m12 5461 sub r7d, 16*4 5462 jge .right_zero_loop 5463 mov r7d, 16*28 5464 jmp .end 5465.fast: ; topleft 16x16 nonzero 5466 cmp eobd, 36 5467 jl .fast2 5468 call .pass1_fast 5469 lea r5, [o_base_8bpc] 5470 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start 5471 mov r4, dstq 5472 pxor m12, m12 5473 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end 5474 lea dstq, [r4+64] 5475 mova m0, [rsp+16*mmsize] 5476 mova m1, [rsp+17*mmsize] 5477 mova m2, [rsp+18*mmsize] 5478 mova m3, [rsp+19*mmsize] 5479 mova m4, [rsp+20*mmsize] 5480 mova m5, [rsp+21*mmsize] 5481 mova m6, [rsp+22*mmsize] 5482 mova m7, [rsp+23*mmsize] 5483 mova m16, [rsp+24*mmsize] 5484 mova m17, [rsp+25*mmsize] 5485 mova m18, [rsp+26*mmsize] 5486 mova m19, [rsp+27*mmsize] 5487 mova m20, [rsp+28*mmsize] 5488 mova m21, [rsp+29*mmsize] 5489 mova m22, [rsp+30*mmsize] 5490 mova m23, [rsp+31*mmsize] 5491 call .transpose 5492 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start 5493 mov r7d, 16*12 5494 pxor m12, m12 5495 jmp .end 5496.fast2: ; topleft 8x8 nonzero 5497 movshdup m7, [o(permB)] 5498 mova ym0, 
[cq+128*1] 5499 mova ym2, [cq+128*5] 5500 mova ym3, [cq+128*3] 5501 mova ym1, [cq+128*7] 5502 vpermt2q m0, m7, m2 ; 1 5 5503 vpermt2q m1, m7, m3 ; 7 3 5504 REPX {pmulld x, m12}, m0, m1 5505 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2 5506 mova [rsp+ 0*mmsize], m0 5507 mova [rsp+ 1*mmsize], m1 5508 mova [rsp+ 2*mmsize], m2 5509 mova [rsp+ 3*mmsize], m3 5510 mova [rsp+ 4*mmsize], m4 5511 mova [rsp+ 5*mmsize], m5 5512 mova [rsp+ 6*mmsize], m6 5513 mova [rsp+ 7*mmsize], m7 5514 mova [rsp+ 8*mmsize], m16 5515 mova [rsp+ 9*mmsize], m17 5516 mova [rsp+10*mmsize], m18 5517 mova [rsp+11*mmsize], m19 5518 mova [rsp+12*mmsize], m20 5519 mova [rsp+13*mmsize], m21 5520 mova [rsp+14*mmsize], m22 5521 mova [rsp+15*mmsize], m23 5522 5523 movshdup m7, [o(permB)] 5524 pmulld ym0, ym12, [cq+128*0] 5525 pmulld ym4, ym12, [cq+128*4] 5526 mova ym16, [cq+128*2] 5527 mova ym5, [cq+128*6] 5528 REPX {paddd x, ym13}, ym0, ym4 5529 REPX {psrad x, 12 }, ym0, ym4 5530 vpermt2q m16, m7, m5 ; 2 6 5531 vpermq m0, m7, m0 ; 0 0 5532 vpermq m4, m7, m4 ; 4 4 5533 pmulld m16, m12 5534 paddd m16, m13 5535 psrad m16, 12 5536 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 5537 5538 vpbroadcastd m11, [o(pd_1)] 5539 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end 5540 mova [rsp+16*mmsize], m24 5541 mova [rsp+17*mmsize], m25 5542 mova [rsp+18*mmsize], m26 5543 mova [rsp+19*mmsize], m27 5544 mova [rsp+20*mmsize], m28 5545 mova [rsp+21*mmsize], m29 5546 mova [rsp+22*mmsize], m30 5547 mova [rsp+23*mmsize], m31 5548 vpbroadcastd m13, [o(pd_2048)] 5549 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start 5550 mov r7d, 16*4 5551 mov r4, dstq 5552 pxor m12, m12 5553 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end 5554 lea dstq, [r4+64] 5555 mova m0, [rsp+16*mmsize] 5556 mova m1, [rsp+17*mmsize] 5557 mova m2, [rsp+18*mmsize] 5558 mova m3, [rsp+19*mmsize] 5559 mova m4, [rsp+20*mmsize] 5560 mova m5, [rsp+21*mmsize] 5561 mova m6, [rsp+22*mmsize] 5562 mova m7, [rsp+23*mmsize] 5563 
lea r5, [o_base] 5564 vpbroadcastd m13, [o(pd_2048)] 5565 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start 5566 pxor m12, m12 5567.end: 5568 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end 5569.zero_loop: 5570 mova [cq+r7*8+128*3], m12 5571 mova [cq+r7*8+128*2], m12 5572 mova [cq+r7*8+128*1], m12 5573 mova [cq+r7*8+128*0], m12 5574 sub r7d, 16*4 5575 jge .zero_loop 5576 RET 5577.dconly: 5578 imul r6d, [cq], 181 5579 mov [cq], eobd 5580 or r3d, 32 5581 add r6d, 128 5582 sar r6d, 8 5583 imul r6d, 181 5584 add r6d, 384 5585 sar r6d, 9 5586 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 5587.pass1_fast: 5588 lea r4, [idct64_mul_16bpc] 5589 lea r6, [rsp+4*64+gprsize] 5590 pmulld m0, m12, [cq+128* 1] 5591 pmulld m3, m12, [cq+128*15] 5592 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 5593 pmulld m0, m12, [cq+128* 7] 5594 pmulld m3, m12, [cq+128* 9] 5595 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 5596 pmulld m0, m12, [cq+128* 5] 5597 pmulld m3, m12, [cq+128*11] 5598 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 5599 pmulld m0, m12, [cq+128* 3] 5600 pmulld m3, m12, [cq+128*13] 5601 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 5602 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 5603 pmulld m0, m12, [cq+128* 0] 5604 pmulld m1, m12, [cq+128* 8] 5605 pmulld m16, m12, [cq+128* 4] 5606 pmulld m17, m12, [cq+128*12] 5607 call m(idct_8x16_internal_10bpc).main_fast2_rect2 5608 call m(idct_16x16_internal_10bpc).main_fast2_rect2 5609 call .pass1_load_spill 5610 call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2 5611 jmp .pass1_end 5612.pass1: 5613 lea r4, [idct64_mul_16bpc] 5614 lea r6, [rsp+4*64+gprsize] 5615 pmulld m0, m12, [cq+128* 1] 5616 pmulld m1, m12, [cq+128*31] 5617 pmulld m2, m12, [cq+128*17] 5618 pmulld m3, m12, [cq+128*15] 5619 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 5620 pmulld m0, m12, [cq+128* 7] 5621 pmulld m1, m12, [cq+128*25] 5622 pmulld m2, m12, 
[cq+128*23] 5623 pmulld m3, m12, [cq+128* 9] 5624 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 5625 pmulld m0, m12, [cq+128* 5] 5626 pmulld m1, m12, [cq+128*27] 5627 pmulld m2, m12, [cq+128*21] 5628 pmulld m3, m12, [cq+128*11] 5629 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 5630 pmulld m0, m12, [cq+128* 3] 5631 pmulld m1, m12, [cq+128*29] 5632 pmulld m2, m12, [cq+128*19] 5633 pmulld m3, m12, [cq+128*13] 5634 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 5635 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 5636 pmulld m0, m12, [cq+128* 0] 5637 pmulld m1, m12, [cq+128* 8] 5638 pmulld m2, m12, [cq+128*16] 5639 pmulld m3, m12, [cq+128*24] 5640 pmulld m16, m12, [cq+128* 4] 5641 pmulld m17, m12, [cq+128*12] 5642 pmulld m18, m12, [cq+128*20] 5643 pmulld m19, m12, [cq+128*28] 5644 call m(idct_8x16_internal_10bpc).main_fast_rect2 5645 call m(idct_16x16_internal_10bpc).main_fast_rect2 5646 call .pass1_load_spill 5647 pmulld m4, m12, [cq+128*18] 5648 pmulld m5, m12, [cq+128*22] 5649 pmulld m6, m12, [cq+128*26] 5650 pmulld m7, m12, [cq+128*30] 5651 call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2 5652.pass1_end: 5653 vpbroadcastd m11, [o(pd_1)] 5654 lea r3, [rsp+gprsize] 5655 lea r4, [cq+8*128] 5656 call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end 5657 ; transpose one half immediately, we can transpose lower half later 5658.transpose: 5659 ; transpose m0-7,16-23 5660 psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 5661 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 5662 call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 5663 punpckhqdq m22, m0, m20 ; 1 5664 punpcklqdq m0, m20 ; 0 5665 punpckhqdq m24, m2, m1 ; 5 5666 punpcklqdq m1, m2, m1 ; 4 5667 punpcklqdq m2, m14, m18 ; 8 5668 punpckhqdq m26, m14, m18 ; 9 5669 punpcklqdq m14, m15, m4 ; 2 5670 punpckhqdq m23, m15, m4 ; 3 5671 punpckhqdq m25, m3, m21 ; 7 5672 punpcklqdq m15, m3, m21 ; 6 5673 punpckhqdq m28, m6, m17 ; 13 5674 punpcklqdq m3, m6, m17 ; 12 5675 punpckhqdq m27, 
m5, m16 ; 11
    punpcklqdq          m16, m5, m16  ; 10
    punpckhqdq          m29, m7, m8   ; 15
    punpcklqdq          m17, m7, m8   ; 14
    ret
.pass1_load_spill:
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    mova [cq+128* 0], m0
    mova [cq+128* 1], m1
    pmulld               m0, m12, [cq+128* 2]
    pmulld               m1, m12, [cq+128* 6]
    mova [cq+128* 2], m2
    mova [cq+128* 3], m3
    pmulld               m2, m12, [cq+128*10]
    pmulld               m3, m12, [cq+128*14]
    mova [cq+128* 4], m4
    mova [cq+128* 5], m5
    mova [cq+128* 6], m6
    mova [cq+128* 7], m7
    mova [cq+128* 8], m23
    mova [cq+128* 9], m22
    mova [cq+128*10], m21
    mova [cq+128*11], m20
    mova [cq+128*12], m19
    mova [cq+128*13], m18
    mova [cq+128*14], m17
    mova [cq+128*15], m16
    ret

;-----------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x64_10bpc(dst, stride, c, eob)
;
; Arguments use the x86inc names declared below: dst, stride, c (accessed as
; cq with a 128-byte row stride) and eob.
;
; Dispatch on eob:
;   eob == 0   -> .dconly (taken before the full PROLOGUE)
;   eob <  36  -> .fast2  (topleft 8x8 nonzero)
;   eob <  136 -> .fast   (topleft 16x16 nonzero)
;   eob <  543 -> .pass1_fast on cq+64, then .pass1 on cq
;   otherwise  -> .pass1 on cq+64 (.full), then .pass1 on cq
;
; Register roles visible in this function:
;   r5       base used by the o() macro; reloaded from o_base before the
;            second call into the 32x64 pass2 helpers
;   r7d      start offset for the coefficient clearing loops
;   r8       copy of dstq while the first 64 bytes of each row are written
;   m12-m15  pd_2896, pd_2048, clip_18b_min, clip_18b_max on entry to pass 1
;   m31      zero for the clearing loops
;
; rsp is moved by hand: the full (-16-24+32), fast (-24+16) and fast2 (-16+8)
; paths all reach .end with rsp 8*64 bytes below its post-PROLOGUE value, and
; the final "add rsp, 8*64" undoes that before RET.
;-----------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly

    PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob
%undef cmp
    vpbroadcastd        m12, [o(pd_2896)]
    vpbroadcastd        m13, [o(pd_2048)]
    vpbroadcastd        m14, [o(clip_18b_min)]
    vpbroadcastd        m15, [o(clip_18b_max)]
    cmp                eobd, 136
    jl .fast
    add                  cq, 64
    cmp                eobd, 543
    jge .full
    call .pass1_fast ; bottomright 16x16 zero
    mov                 r7d, 16*12
    jmp .lefthalf
.full:
    call .pass1
    mov                 r7d, 16*28
.lefthalf:
    ; park the 16 registers produced by .transpose in rows 0-15 at cq+64
    mova [cq+128* 0], m27
    mova [cq+128* 1], m14
    mova [cq+128* 2], m28
    mova [cq+128* 3], m15
    mova [cq+128* 4], m22
    mova [cq+128* 5], m23
    mova [cq+128* 6], m24
    mova [cq+128* 7], m25
    mova [cq+128* 8], m0
    mova [cq+128* 9], m26
    mova [cq+128*10], m20
    mova [cq+128*11], m21
    mova [cq+128*12], m18
    mova [cq+128*13], m16
    mova [cq+128*14], m17
    mova [cq+128*15], m3
    sub                  cq, 64
    ; m12-m15 were overwritten above (m12/m13 by .transpose), so reload them
    vpbroadcastd        m12, [o(pd_2896)]
    vpbroadcastd        m13, [o(pd_2048)]
    vpbroadcastd        m14, [o(clip_18b_min)]
    vpbroadcastd        m15, [o(clip_18b_max)]
    sub                 rsp, 16*64
    call .pass1
    sub                 rsp, 24*64
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
    mov                  r8, dstq
    pxor                m31, m31
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
    lea                dstq, [r8+64]
    mova                 m0, [rsp+56*mmsize]
    mova                 m1, [rsp+57*mmsize]
    mova                 m2, [rsp+58*mmsize]
    mova                 m3, [rsp+59*mmsize]
    mova                 m4, [rsp+60*mmsize]
    mova                 m5, [rsp+61*mmsize]
    mova                 m6, [rsp+62*mmsize]
    mova                 m7, [rsp+63*mmsize]
    mova                m16, [rsp+64*mmsize]
    mova                m17, [rsp+65*mmsize]
    mova                m18, [rsp+66*mmsize]
    mova                m19, [rsp+67*mmsize]
    mova                m20, [rsp+68*mmsize]
    mova                m21, [rsp+69*mmsize]
    mova                m22, [rsp+70*mmsize]
    mova                m23, [rsp+71*mmsize]
    call .transpose
    mova [cq+128* 0+64], m27
    mova [cq+128* 1+64], m14
    mova [cq+128* 2+64], m28
    mova [cq+128* 3+64], m15
    mova [cq+128* 4+64], m22
    mova [cq+128* 5+64], m23
    mova [cq+128* 6+64], m24
    mova [cq+128* 7+64], m25
    mova [cq+128* 8+64], m0
    mova [cq+128* 9+64], m26
    mova [cq+128*10+64], m20
    mova [cq+128*11+64], m21
    mova [cq+128*12+64], m18
    mova [cq+128*13+64], m16
    mova [cq+128*14+64], m17
    mova [cq+128*15+64], m3
    mova                 m0, [rsp+40*mmsize]
    mova                 m1, [rsp+41*mmsize]
    mova                 m2, [rsp+42*mmsize]
    mova                 m3, [rsp+43*mmsize]
    mova                 m4, [rsp+44*mmsize]
    mova                 m5, [rsp+45*mmsize]
    mova                 m6, [rsp+46*mmsize]
    mova                 m7, [rsp+47*mmsize]
    mova                m16, [rsp+48*mmsize]
    mova                m17, [rsp+49*mmsize]
    mova                m18, [rsp+50*mmsize]
    mova                m19, [rsp+51*mmsize]
    mova                m20, [rsp+52*mmsize]
    mova                m21, [rsp+53*mmsize]
    mova                m22, [rsp+54*mmsize]
    mova                m23, [rsp+55*mmsize]
    add                 rsp, 32*64
    call .transpose
    lea                  r5, [o_base]
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
.right_zero_loop:
    ; clear four rows at cq+64 per iteration, walking r7d down past zero
    REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3
    sub                 r7d, 16*4
    jge .right_zero_loop
    mov                 r7d, 16*28
    jmp .end
.fast: ; topleft 16x16 nonzero
    cmp                eobd, 36
    jl .fast2
    call .pass1_fast
    sub                 rsp, 24*64
    vpbroadcastd        m10, [o(pd_2048)]
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
    mov                  r8, dstq
    pxor                m31, m31
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
    lea                dstq, [r8+64]
    mova                 m0, [rsp+40*mmsize]
    mova                 m1, [rsp+41*mmsize]
    mova                 m2, [rsp+42*mmsize]
    mova                 m3, [rsp+43*mmsize]
    mova                 m4, [rsp+44*mmsize]
    mova                 m5, [rsp+45*mmsize]
    mova                 m6, [rsp+46*mmsize]
    mova                 m7, [rsp+47*mmsize]
    mova                m16, [rsp+48*mmsize]
    mova                m17, [rsp+49*mmsize]
    mova                m18, [rsp+50*mmsize]
    mova                m19, [rsp+51*mmsize]
    mova                m20, [rsp+52*mmsize]
    mova                m21, [rsp+53*mmsize]
    mova                m22, [rsp+54*mmsize]
    mova                m23, [rsp+55*mmsize]
    add                 rsp, 16*64
    call .transpose
    lea                  r5, [o_base]
    vpbroadcastd        m10, [o(pd_2048)]
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
    mov                 r7d, 16*12
    jmp .end
.fast2: ; topleft 8x8 nonzero
    movshdup             m7, [o(permB)]
    mova                ym0, [cq+128*1]
    mova                ym2, [cq+128*5]
    mova                ym3, [cq+128*3]
    mova                ym1, [cq+128*7]
    vpermt2q             m0, m7, m2 ;  1  5
    vpermt2q             m1, m7, m3 ;  7  3
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed
    mova [rsp+ 0*mmsize], m0
    mova [rsp+ 1*mmsize], m1
    mova [rsp+ 2*mmsize], m2
    mova [rsp+ 3*mmsize], m3
    mova [rsp+ 4*mmsize], m4
    mova [rsp+ 5*mmsize], m5
    mova [rsp+ 6*mmsize], m6
    mova [rsp+ 7*mmsize], m7
    mova [rsp+ 8*mmsize], m16
    mova [rsp+ 9*mmsize], m17
    mova [rsp+10*mmsize], m18
    mova [rsp+11*mmsize], m19
    mova [rsp+12*mmsize], m20
    mova [rsp+13*mmsize], m21
    mova [rsp+14*mmsize], m22
    mova [rsp+15*mmsize], m23

    movshdup             m7, [o(permB)]
    mova                ym0, [cq+128*0]
    mova                ym4, [cq+128*4]
    mova               ym16, [cq+128*2]
    mova                ym5, [cq+128*6]
    vpermt2q            m16, m7, m5 ;  2  6
    vpermq               m0, m7, m0 ;  0  0
    vpermq               m4, m7, m4 ;  4  4
    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3

    vpbroadcastd        m11, [o(pd_2)]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
    sub                 rsp, 16*64
    ; keep m24-m31 for the second .pass2_fast2_start call further down
    mova [rsp+40*mmsize], m24
    mova [rsp+41*mmsize], m25
    mova [rsp+42*mmsize], m26
    mova [rsp+43*mmsize], m27
    mova [rsp+44*mmsize], m28
    mova [rsp+45*mmsize], m29
    mova [rsp+46*mmsize], m30
    mova [rsp+47*mmsize], m31
    call .pass2_fast2_start
    mov                 r7d, 16*4
    mov                  r8, dstq
    pxor                m31, m31
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
    lea                dstq, [r8+64]
    mova                 m0, [rsp+40*mmsize]
    mova                 m1, [rsp+41*mmsize]
    mova                 m2, [rsp+42*mmsize]
    mova                 m3, [rsp+43*mmsize]
    mova                 m4, [rsp+44*mmsize]
    mova                 m5, [rsp+45*mmsize]
    mova                 m6, [rsp+46*mmsize]
    mova                 m7, [rsp+47*mmsize]
    add                 rsp, 8*64
    lea                  r5, [o_base]
    call .pass2_fast2_start
.end:
    pxor                m31, m31
.zero_loop:
    ; clear four rows at cq per iteration, walking r7d down past zero
    REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3
    sub                 r7d, 16*4
    jge .zero_loop
    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
    add                 rsp, 8*64 ; FIXME adjust stack_size_padded instead?
    RET
.pass2_fast2_start:
    ; local wrapper: transpose, pair up qwords, reload m10 and tail-jump to
    ; the 32x64 routine of the same name
    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
    punpcklqdq          m27, m0, m2   ;  0
    punpckhqdq           m0, m2       ;  1
    punpcklqdq          m22, m3, m4   ;  2
    punpckhqdq          m26, m3, m4   ;  3
    punpcklqdq          m14, m5, m7   ;  4
    punpckhqdq          m20, m5, m7   ;  5
    punpcklqdq          m23, m6, m8   ;  6
    punpckhqdq          m21, m6, m8   ;  7
    vpbroadcastd        m10, [o(pd_2048)]
    jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start
.dconly:
    ; eob is zero here: r6d = first coefficient * 181, and the coefficient
    ; itself is cleared by storing eobd (zero) over it
    imul                r6d, [cq], 181
    mov                [cq], eobd
    or                  r3d, 64 ; NOTE(review): presumably the row count used by .dconly1 - confirm
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1
.pass1_fast:
    ; pass 1 reading only rows 0-15 of the coefficient buffer
    lea                  r4, [idct64_mul_16bpc]
    lea                  r6, [rsp+4*64+gprsize]
    mova                 m0, [cq+128* 1]
    mova                 m3, [cq+128*15]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
    mova                 m0, [cq+128* 7]
    mova                 m3, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
    mova                 m0, [cq+128* 5]
    mova                 m3, [cq+128*11]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
    mova                 m0, [cq+128* 3]
    mova                 m3, [cq+128*13]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    mova                 m0, [cq+128* 0]
    mova                 m1, [cq+128* 8]
    mova                m16, [cq+128* 4]
    mova                m17, [cq+128*12]
    call m(idct_8x16_internal_10bpc).main_fast2
    call m(idct_16x16_internal_10bpc).main_fast2
    call .pass1_load_spill
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
    jmp .pass1_end
.pass1:
    ; pass 1 reading rows 0-31 of the coefficient buffer
    lea                  r4, [idct64_mul_16bpc]
    lea                  r6, [rsp+4*64+gprsize]
    mova                 m0, [cq+128* 1]
    mova                 m1, [cq+128*31]
    mova                 m2, [cq+128*17]
    mova                 m3, [cq+128*15]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
    mova                 m0, [cq+128* 7]
    mova                 m1, [cq+128*25]
    mova                 m2, [cq+128*23]
    mova                 m3, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
    mova                 m0, [cq+128* 5]
    mova                 m1, [cq+128*27]
    mova                 m2, [cq+128*21]
    mova                 m3, [cq+128*11]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
    mova                 m0, [cq+128* 3]
    mova                 m1, [cq+128*29]
    mova                 m2, [cq+128*19]
    mova                 m3, [cq+128*13]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    mova                 m0, [cq+128* 0]
    mova                 m1, [cq+128* 8]
    mova                 m2, [cq+128*16]
    mova                 m3, [cq+128*24]
    mova                m16, [cq+128* 4]
    mova                m17, [cq+128*12]
    mova                m18, [cq+128*20]
    mova                m19, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main_fast
    call m(idct_16x16_internal_10bpc).main_fast
    call .pass1_load_spill
    mova                 m4, [cq+128*18]
    mova                 m5, [cq+128*22]
    mova                 m6, [cq+128*26]
    mova                 m7, [cq+128*30]
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
.pass1_end:
    vpbroadcastd        m11, [o(pd_2)]
    lea                  r3, [rsp+gprsize]
    lea                  r4, [cq+8*128]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
    ; transpose one half immediately, we can transpose lower half later
.transpose:
    ; transpose m0-7,16-23
    ; permC is addressed directly rather than through o(), so this entry
    ; point does not depend on r5; m12 and m13 are overwritten here
    psrlq               m12, [permC], 24 ; 0 2  8 10  1  3  9 11
    psrlq               m13, m12, 32     ; 4 6 12 14  5  7 13 15
    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
    punpcklqdq          m27, m0, m20  ;  0
    punpckhqdq           m0, m20      ;  1
    punpcklqdq          m24, m5, m16  ; 10
    punpckhqdq          m16, m5, m16  ; 11
    punpcklqdq          m23, m3, m21  ;  6
    punpckhqdq          m21, m3, m21  ;  7
    punpcklqdq          m25, m7, m8   ; 14
    punpckhqdq           m3, m7, m8   ; 15
    punpcklqdq          m22, m15, m4  ;  2
    punpckhqdq          m26, m15, m4  ;  3
    punpcklqdq          m15, m6, m17  ; 12
    punpckhqdq          m17, m6, m17  ; 13
    punpcklqdq          m28, m14, m18 ;  8
    punpckhqdq          m18, m14, m18 ;  9
    punpcklqdq          m14, m2, m1   ;  4
    punpckhqdq          m20, m2, m1   ;  5
    ret
.pass1_load_spill:
    ; write m0-m7 and m23..m16 to rows 0-15 of cq; rows 2, 6, 10 and 14 are
    ; read into m0-m3 before their slots are overwritten
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    mova [cq+128* 0], m0
    mova [cq+128* 1], m1
    mova                 m0, [cq+128* 2]
    mova                 m1, [cq+128* 6]
    mova [cq+128* 2], m2
    mova [cq+128* 3], m3
    mova                 m2, [cq+128*10]
    mova                 m3, [cq+128*14]
    mova [cq+128* 4], m4
    mova [cq+128* 5], m5
    mova [cq+128* 6], m6
    mova [cq+128* 7], m7
    mova [cq+128* 8], m23
    mova [cq+128* 9], m22
    mova [cq+128*10], m21
    mova [cq+128*11], m20
    mova [cq+128*12], m19
    mova [cq+128*13], m18
    mova [cq+128*14], m17
    mova [cq+128*15], m16
    ret

%endif ; ARCH_X86_64