xref: /aosp_15_r20/external/libdav1d/src/x86/itx16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2022-2023, VideoLAN and dav1d authors
2; Copyright © 2022-2023, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 64
32
; Permutation index tables, 64 bytes each.
; idct8x8p, idtx8x8p and idct8x16p are byte indices applied with vpermb by
; the 8x8 and 8x16 code in this file. permB and permC hold indices for
; vpermi2q/vpermt2q/vpermq/vpermd; their users pick different index sets out
; of the same table by shifting it (psrlq, movshdup) or by loading it at a
; byte offset (permB+2, permC+1, permC+2, permC+3), so the bytes of those two
; tables are shared between several overlapping permutations and must not be
; edited for one user in isolation.
; The remaining tables have no users in the part of the file shown here.
33idct8x8p:      db  0,  1,  4,  5,  2,  3,  6,  7, 16, 17, 20, 21, 18, 19, 22, 23
34               db  8,  9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
35               db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
36               db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
37idtx8x8p:      db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
38               db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
39               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
40               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
41idct8x16p:     db 54, 55,  2,  3, 22, 23, 34, 35, 38, 39, 18, 19,  6,  7, 50, 51
42               db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
43               db 52, 53,  4,  5, 20, 21, 36, 37, 32, 33,  0,  1, 48, 49, 16, 17
44               db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41,  8,  9, 56, 57, 24, 25
45iadst8x16p:    db  0,  1, 54, 55, 48, 49,  6,  7, 16, 17, 38, 39, 32, 33, 22, 23
46               db  8,  9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
47               db  4,  5, 50, 51, 52, 53,  2,  3, 20, 21, 34, 35, 36, 37, 18, 19
48               db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
49permA:         db  0,  1,  0,  8,  4,  5,  1,  9,  8,  9,  4, 12, 12, 13,  5, 13
50               db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
51               db  2,  3,  2, 10,  6,  7,  3, 11, 10, 11,  6, 14, 14, 15,  7, 15
52               db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
53permB:         db  4,  2,  1,  8,  0,  0,  1,  0, 12,  3,  3, 10,  8,  1,  3,  2
54               db  5, 10,  5, 12,  1,  8,  5,  4, 13, 11,  7, 14,  9,  9,  7,  6
55               db  6,  6, 13,  4,  2,  4,  4,  5, 14,  7, 15,  6, 10,  5,  6,  7
56               db  7, 14,  9,  0,  3, 12,  0,  1, 15, 15, 11,  2, 11, 13,  2,  3
57permC:         db  0,  9,  0,  0,  0,  1,  4,  4,  2, 11,  2,  2,  2,  3,  6,  6
58               db  1,  8,  1,  8,  4,  5,  5, 12,  3, 10,  3, 10,  6,  7,  7, 14
59               db  9,  1,  8,  1,  1,  0, 12,  5, 11,  3, 10,  3,  3,  2, 14,  7
60               db  8,  0,  9,  9,  5,  4, 13, 13, 10,  2, 11, 11,  7,  6, 15, 15
61idct8x32p:     db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
62               db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
63               db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
64               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
65idct32x8p:     db  2, 18,  0, 16,  3, 19,  1, 17, 10, 26,  8, 24, 11, 27,  9, 25
66               db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
67               db  6, 22,  4, 20,  7, 23,  5, 21, 14, 30, 12, 28, 15, 31, 13, 29
68               db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
69idtx32x8p:     db  0,  8, 16, 24,  4, 12, 20, 28,  2, 10, 18, 26,  6, 14, 22, 30
70               db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
71               db  1,  9, 17, 25,  5, 13, 21, 29,  3, 11, 19, 27,  7, 15, 23, 31
72               db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63
73
; Word multipliers for the final pmulhrsw scaling. Each table is 16 words
; (32 bytes) of a single value. The adst/flipadst second passes load 32 bytes
; from <label>+16, i.e. the last 8 words of one table followed by the first
; 8 words of the next one; that straddling read is what produces the
; mixed-sign vector each label name describes:
;   pw_2048_m2048+16 -> 8 x  2048, 8 x -2048
;   pw_m2048_2048+16 -> 8 x -2048, 8 x  2048
; The three tables must therefore stay adjacent and in exactly this order.
; pw_2048 is also the anchor that o_base is derived from.
74pw_2048_m2048: times 16 dw  2048
75pw_m2048_2048: times 16 dw -2048
76pw_2048:       times 16 dw  2048
77
; COEF_PAIR a, b [, flags]
; Emits four dwords under a label named after both values and defines
; single-value aliases that point at the first and at the third dword:
;   flags 0: pd_a_b   =  a,  a,  b,  b   -> pd_a,  pd_b
;   flags 1: pd_a_mb  =  a,  a, -b, -b   -> pd_a,  pd_mb
;   flags 2: pd_ma_b  = -a, -a,  b,  b   -> pd_ma, pd_b
;   flags 4: pd_ma_mb = -a, -a, -b, -b   -> pd_ma, pd_mb
;   flags 3: same as flags 0, followed by two extra dwords -b, -b; pd_b_mb
;            is defined as pd_b, so a 16-byte read there gives b, b, -b, -b.
; The aliases are %defines, so when a value occurs in several pairs the alias
; refers to the pair that was emitted most recently.
78; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4 = --
79%macro COEF_PAIR 2-3 0 ; a, b, flags
80%if %3 == 1
81pd_%1_m%2: dd %1, %1, -%2, -%2
82%define pd_%1  (pd_%1_m%2 + 4*0)
83%define pd_m%2 (pd_%1_m%2 + 4*2)
84%elif %3 == 2
85pd_m%1_%2: dd -%1, -%1, %2, %2
86%define pd_m%1 (pd_m%1_%2 + 4*0)
87%define pd_%2  (pd_m%1_%2 + 4*2)
88%elif %3 == 4
89pd_m%1_m%2: dd -%1, -%1, -%2, -%2
90%define pd_m%1 (pd_m%1_m%2 + 4*0)
91%define pd_m%2 (pd_m%1_m%2 + 4*2)
92%else
93pd_%1_%2: dd %1, %1, %2, %2
94%define pd_%1  (pd_%1_%2 + 4*0)
95%define pd_%2  (pd_%1_%2 + 4*2)
96%if %3 == 3
97%define pd_%2_m%2 pd_%2
98dd -%2, -%2 ; extends the pair so that pd_%2 is followed by its negation
99%endif
100%endif
101%endmacro
102
; Coefficient pairs referenced as pd_<a>_<b> (16-byte group) or pd_<value>
; (single dword). The optional third argument selects the sign pattern; see
; the flags list on COEF_PAIR.
103COEF_PAIR  101,  501
104COEF_PAIR  201,  601, 1
105COEF_PAIR  201,  995
106COEF_PAIR  401, 1189, 1
107COEF_PAIR  401, 1931
108COEF_PAIR  401, 3920
109COEF_PAIR  401, 4076
110COEF_PAIR  700,  301, 4
111COEF_PAIR  799, 2276, 1
112COEF_PAIR  799, 3406
113COEF_PAIR  799, 4017
114COEF_PAIR 1380,  601
115COEF_PAIR 1751, 2440
116COEF_PAIR 2598, 1189
117COEF_PAIR 2598, 1931, 2
118COEF_PAIR 2598, 3612
119COEF_PAIR 2751, 2106
120COEF_PAIR 2896, 1567, 3
121COEF_PAIR 2896, 3784, 3
122COEF_PAIR 3035, 3513
123COEF_PAIR 3166, 1931
124COEF_PAIR 3166, 3612
125COEF_PAIR 3166, 3920
126COEF_PAIR 3703, 3290
127COEF_PAIR 3857, 4052
128COEF_PAIR 4017, 2276
129COEF_PAIR 4017, 3406
130COEF_PAIR 4036, 4085
131COEF_PAIR 4076, 1189
132COEF_PAIR 4076, 3612
133COEF_PAIR 4076, 3920
134COEF_PAIR 4091, 3973
135COEF_PAIR 4091, 4052
136COEF_PAIR 4095, 4065
137
; Small constants, each 4 bytes wide so it can be fetched with a dword
; broadcast (vpbroadcastd).
138pb_32:           times 4 db 32
139pw_5:            times 2 dw 5
140pw_4096:         times 2 dw 4096
141pw_8192:         times 2 dw 8192
142pw_1697x16:      times 2 dw 1697*16
143pw_2896x8:       times 2 dw 2896*8
144pixel_10bpc_max: times 2 dw 0x03ff ; 1023; upper clamp used when storing pixels
145dconly_10bpc:    times 2 dw 0x7c00 ; 0x7fff - 0x03ff; bias for the saturating clamp in .dconly
146clip_18b_min:    dd -0x20000       ; -(1 << 17), lower bound of a signed 18-bit value
147clip_18b_max:    dd  0x1ffff       ; (1 << 17) - 1, upper bound of a signed 18-bit value
148pd_1:            dd 1
149pd_2:            dd 2
150pd_1448:         dd 1448
151pd_2048:         dd 2048           ; 1 << 11, rounding term added before ">> 12"
152pd_3071:         dd 3071 ; 1024 + 2048 - 1
153pd_3072:         dd 3072 ; 1024 + 2048
154pd_5119:         dd 5119 ; 1024 + 4096 - 1
155pd_5120:         dd 5120 ; 1024 + 4096
156pd_5793:         dd 5793
157
158cextern dup16_perm
159cextern int8_permA
160cextern idct64_mul_16bpc
161cextern idct_8x8_internal_8bpc_avx512icl.main
162cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
163cextern idct_8x16_internal_8bpc_avx512icl.main
164cextern idct_8x16_internal_8bpc_avx512icl.main2
165cextern idct_8x16_internal_8bpc_avx512icl.main_fast
166cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
167cextern iadst_8x16_internal_8bpc_avx512icl.main2
168cextern idct_16x8_internal_8bpc_avx512icl.main
169cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
170cextern idct_16x16_internal_8bpc_avx512icl.main
171cextern idct_16x16_internal_8bpc_avx512icl.main2
172cextern idct_16x16_internal_8bpc_avx512icl.main_fast
173cextern idct_16x16_internal_8bpc_avx512icl.main_fast2
174cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
175cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
176cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
177cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
178cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
179cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf
180cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast
181cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2
182cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
183cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf
184cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast
185cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
186cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3
187cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
188cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
189cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
190cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3
191cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
192cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
193cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1
194cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast
195cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2
196cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2
197
198SECTION .text
199
; Constant addressing helpers.
;   o_base       anchor address for this file's constants; every public entry
;                point loads it into r5 (see INV_TXFM_FN).
;   o_base_8bpc  anchor expected by the 8 bpc routines declared with cextern;
;                r5 is switched to it before calling into them.
;   o(x)         address of constant x expressed relative to r5. Only valid
;                while r5 still holds o_base.
;   m(x)         full symbol name of function x (private prefix + x + SUFFIX).
; NOTE(review): the 4*128 and 64*18 biases are presumably chosen to keep the
; r5-relative displacements small; that cannot be confirmed from this file.
200%define o_base (pw_2048+4*128)
201%define o_base_8bpc (int8_permA+64*18)
202%define o(x) (r5 - o_base + (x))
203%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
204
205INIT_ZMM avx512icl
206
; ITX_MULSUB_2D: multiply-and-combine of a register pair by a coefficient
; pair on packed dwords (all arguments are register numbers unless noted).
;   %1, %2  src1/dst1 and src2/dst2
;   %3, %4  temporaries, always overwritten
;   %5      temporary, overwritten only when coef1 is loaded from memory
;   %6      register holding rnd; pass a non-number (e.g. _) to skip both
;           the rounding add and the final ">> 12"
;   %7, %8  coef1, coef2. A value below 32 names a register that already
;           holds the coefficient. Anything else is loaded from the constant
;           pd_<coef>: as one broadcast dword when the value is below 4096,
;           as a broadcast 128-bit group otherwise. Pair names such as
;           799_3406 end up on the 128-bit path (NASM is expected to read
;           them as a large number, ignoring the underscore).
;   %9      flags, optional, default 0
207; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
208; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
209; flags: 1 = inv_dst1, 2 = inv_dst2
210; skip round/shift if rnd is not a number
; NOTE(review): with flag 1 and a numeric rnd, rnd is added to src1*coef1
; before that sum is subtracted, so rnd enters dst1 with a negative sign.
; NOTE(review): the flag 2 path reads m%6 unconditionally, so it requires a
; numeric rnd.
211%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
212%if %8 < 32
213    pmulld              m%4, m%1, m%8 ; tmp2 = src1 * coef2
214    pmulld              m%3, m%2, m%8 ; tmp1 = src2 * coef2
215%else
216%if %8 < 4096
217    vpbroadcastd        m%3, [o(pd_%8)]
218%else
219    vbroadcasti32x4     m%3, [o(pd_%8)]
220%endif
221    pmulld              m%4, m%1, m%3 ; tmp2 = src1 * coef2
222    pmulld              m%3, m%2      ; tmp1 = src2 * coef2
223%endif
224%if %7 < 32
225    pmulld              m%1, m%7      ; src1 * coef1
226    pmulld              m%2, m%7      ; src2 * coef1
227%else
228%if %7 < 4096
229    vpbroadcastd        m%5, [o(pd_%7)]
230%else
231    vbroadcasti32x4     m%5, [o(pd_%7)]
232%endif
233    pmulld              m%1, m%5      ; src1 * coef1
234    pmulld              m%2, m%5      ; src2 * coef1
235%endif
236%if %9 & 2
237    psubd               m%4, m%6, m%4 ; rnd - src1 * coef2
238    psubd               m%2, m%4, m%2 ; dst2 = rnd - src1*coef2 - src2*coef1
239%else
240%ifnum %6
241    paddd               m%4, m%6
242%endif
243    paddd               m%2, m%4      ; dst2 = src2*coef1 + src1*coef2 (+ rnd)
244%endif
245%ifnum %6
246    paddd               m%1, m%6
247%endif
248%if %9 & 1
249    psubd               m%1, m%3, m%1 ; dst1 = src2*coef2 - (src1*coef1 (+ rnd))
250%else
251    psubd               m%1, m%3      ; dst1 = src1*coef1 (+ rnd) - src2*coef2
252%endif
253%ifnum %6
254    psrad               m%2, 12
255    psrad               m%1, 12
256%endif
257%endmacro
258
; INV_TXFM_FN type1, type2, eob_offset, size
; Declares the public entry point inv_txfm_add_<type1>_<type2>_<size>_10bpc
; (named arguments dst, stride, c, eob; tx2 is a scratch register).
; On entry it sets
;   r5   = o_base, so that o(x) addressing works in the first pass
;   tx2q = address of the .pass2 label of the <type2> internal function
; and then transfers control to the <type1> internal function (first pass).
;  - dct_dct: jumps to the first pass only when eob != 0. With eob == 0
;    execution falls out of the macro into whatever code follows the
;    invocation (the size-specific wrapper macros put their DC-only path
;    there).
;  - every other combination: adds eob_offset to eob when it is non-zero and
;    continues with the first pass.
259%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
260cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
261    %define %%p1 m(i%1_%4_internal_10bpc)
262    lea                  r5, [o_base]
263    ; Jump to the 1st txfm function if we're not taking the fast path, which
264    ; in turn performs an indirect jump to the 2nd txfm function.
265    lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
266%ifidn %1_%2, dct_dct
267    test               eobd, eobd
268    jnz %%p1
269%else
270%if %3
271    add                eobd, %3
272%endif
273    ; jump to the 1st txfm function unless it's located directly after this
    ; The repeat count is bit 31 of (%%end - %%p1): 1 when %%p1 lies beyond
    ; %%end, 0 when both coincide (plain fall-through).
    ; NOTE(review): it is also 0 when %%p1 precedes %%end, so invocations
    ; appear to rely on being placed before the internal function.
274    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
275ALIGN function_align
276%%end:
277%endif
278%endmacro
279
; INV_TXFM_8X8_FN type1, type2 [, eob_offset]
; Emits the 8x8 entry point via INV_TXFM_FN. For dct_dct it additionally
; emits the DC-only path, reached by fall-through when eob == 0. The labels
; .dconly/.dconly2/.dconly_loop are also entered from other block sizes with
;   r6d = partially scaled DC value
;   r3d = number of rows to process (consumed two at a time)
279%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
280    INV_TXFM_FN          %1, %2, %3, 8x8
281%ifidn %1_%2, dct_dct
282    imul                r6d, [cq], 181 ; r6d = c[0] * 181
283    mov                [cq], eobd ; clear c[0] (eob is 0 on this path)
284    or                  r3d, 8     ; r3d (eob, known to be 0) = 8 rows
285.dconly:
286    add                 r6d, 384
287    sar                 r6d, 9     ; (x + 384) >> 9
288.dconly2:
289    vpbroadcastd        ym2, [o(dconly_10bpc)]
290    imul                r6d, 181
291    add                 r6d, 2176
292    sar                 r6d, 12    ; (x * 181 + 2176) >> 12
293    vpbroadcastw        ym1, r6d
294    paddsw              ym1, ym2   ; dc + bias, computed once outside the loop
295.dconly_loop:
    ; Two rows of eight 16-bit pixels per iteration. Adding dc+bias with
    ; signed saturation caps the result at 0x7fff (= 0x03ff + bias) and
    ; removing the bias with unsigned saturation floors it at 0, which
    ; leaves the pixel clamped to [0, 0x03ff].
296    mova                xm0, [dstq+strideq*0]
297    vinserti32x4        ym0, [dstq+strideq*1], 1
298    paddsw              ym0, ym1
299    psubusw             ym0, ym2
300    mova          [dstq+strideq*0], xm0
301    vextracti32x4 [dstq+strideq*1], ym0, 1
302    lea                dstq, [dstq+strideq*2]
303    sub                 r3d, 2
304    jg .dconly_loop
305    RET
306%endif
307%endmacro
309
; Public 8x8 entry points whose first pass is a DCT. They are placed right
; before the internal function so that an entry point ending directly in
; front of it needs no jmp (see INV_TXFM_FN).
310INV_TXFM_8X8_FN dct, dct
311INV_TXFM_8X8_FN dct, adst
312INV_TXFM_8X8_FN dct, flipadst
313INV_TXFM_8X8_FN dct, identity
314
; idct_8x8_internal_10bpc
; Pass 1 (entry): loads the 8x8 block of 32-bit coefficients from cq, runs
; the DCT butterflies, rounds with (x + 1) >> 1, packs to 16 bits and jumps
; to the second pass chosen by the entry point (tx2q).
; .pass2: second pass on the packed 16-bit data, then adds the result to the
; destination pixels.
; The labels .load, .main*, .main_end, .end, .end2 and .write_8x4* are shared
; helpers that other functions in this file call or jump to.
315cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
316    call .load
317    vpermi2q             m1, m0, m2 ; 1 5
318    vpermi2q             m3, m6, m4 ; 7 3
319    vpermt2q             m0, m5, m4 ; 0 2
320    vpermt2q             m2, m5, m6 ; 4 6
321    call .main
322    call .main_end
323    mova                 m4, [o(idct8x8p)]
324    packssdw             m0, m2     ; 0 1 4 5
325    packssdw             m1, m3     ; 3 2 7 6
326    vpermb               m0, m4, m0
327    vprolq               m1, 32
328    vpermb               m2, m4, m1
329    punpckhdq            m1, m0, m2
330    punpckldq            m0, m2
331    jmp                tx2q
332.pass2:
    ; The second pass reuses the 8 bpc implementation. r5 is repointed at
    ; that file's constant base, so from here on constants of this file are
    ; addressed directly instead of through o().
333    lea                  r5, [o_base_8bpc]
334    vextracti32x8       ym2, m0, 1
335    vextracti32x8       ym3, m1, 1
336    call m(idct_8x8_internal_8bpc).main
337    mova                m10, [permC]
338    vpbroadcastd        m12, [pw_2048] ; pmulhrsw by 2048 = (x + 8) >> 4
339.end:
    ; in: m0-m3 = second-pass output, m10 = qword permutation, m12 = scale
340    vpermt2q             m0, m10, m1
341    vpermt2q             m2, m10, m3
342.end2:
    ; in: m0 = rows 0-3, m2 = rows 4-7 (one 16-byte row per 128-bit lane),
    ;     m12 = scale
343    vpbroadcastd        m11, [pixel_10bpc_max]
344    lea                  r6, [strideq*3]
345    pxor                m10, m10
346    pmulhrsw             m8, m12, m0
347    call .write_8x4_start
348    pmulhrsw             m8, m12, m2
349.write_8x4:
    ; Advance to the next four rows, then fall through.
350    lea                dstq, [dstq+strideq*4]
351    add                  cq, 64*2
352.write_8x4_start:
    ; Adds four rows of residuals to the destination and clamps the result.
    ; in: m8 = residuals, m10 = 0, m11 = pixel max, r6 = stride * 3
    ; Also zeroes 128 bytes of the coefficient buffer at cq.
353    mova                xm9, [dstq+strideq*0]
354    vinserti32x4        ym9, [dstq+strideq*1], 1
355    vinserti32x4         m9, [dstq+strideq*2], 2
356    vinserti32x4         m9, [dstq+r6       ], 3
357    mova          [cq+64*0], m10
358    mova          [cq+64*1], m10
359    paddw                m9, m8
360    pmaxsw               m9, m10
361    pminsw               m9, m11
362    mova          [dstq+strideq*0], xm9
363    vextracti32x4 [dstq+strideq*1], ym9, 1
364    vextracti32x4 [dstq+strideq*2], m9, 2
365    vextracti32x4 [dstq+r6       ], m9, 3
366    ret
367ALIGN function_align
368.load:
    ; Loads the four 64-byte coefficient rows and the constants used by the
    ; first pass:
    ;   m0, m4, m2, m6 = coefficients     m1 = m3 = permB
    ;   m5  = permB >> 32 (second set of permutation indices)
    ;   m11 = 1   m12 = 2896   m13 = 2048   m14/m15 = 18-bit clip min/max
369    mova                 m0, [cq+64*0] ; 0 1
370    mova                 m4, [cq+64*1] ; 2 3
371    mova                 m1, [o(permB)]
372    mova                 m2, [cq+64*2] ; 4 5
373    mova                 m6, [cq+64*3] ; 6 7
374    vpbroadcastd        m13, [o(pd_2048)]
375    vpbroadcastd        m14, [o(clip_18b_min)]
376    vpbroadcastd        m15, [o(clip_18b_max)]
377    psrlq                m5, m1, 32
378    vpbroadcastd        m12, [o(pd_2896)]
379    mova                 m3, m1
380    vpbroadcastd        m11, [o(pd_1)]
381    ret
382ALIGN function_align
383.main_fast: ; bottom half is zero
    ; Single multiplies replace the two ITX_MULSUB_2D rotations of .main.
384    vbroadcasti32x4      m3, [o(pd_4017_3406)]
385    vbroadcasti32x4      m8, [o(pd_799_m2276)]
386    vbroadcasti32x4      m2, [o(pd_2896_3784)]
387    vbroadcasti32x4      m9, [o(pd_2896_1567)]
388    pmulld               m3, m1     ; t4a  t5a
389    pmulld               m1, m8     ; t7a  t6a
390    pmulld               m2, m0     ; t0   t3
391    pmulld               m0, m9     ; t1   t2
392    jmp .main2
393.main:
    ; in: m0-m3 = paired inputs (each register carries two of them, as laid
    ;     out by the caller), m12-m15 as set by .load
    ; Rounding is skipped in the macro (rnd = _) and applied in .main2.
394    ITX_MULSUB_2D         1, 3, 8, 9, 10, _,  799_3406, 4017_2276
395    ITX_MULSUB_2D         0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
396.main2:
397    REPX     {paddd x, m13}, m1, m3, m0, m2
398    REPX     {psrad x, 12 }, m1, m3, m0, m2
399    punpcklqdq           m8, m1, m3 ; t4a  t7a
400    punpckhqdq           m1, m3     ; t5a  t6a
401    psubd                m3, m8, m1 ; t5a  t6a
402    paddd                m8, m1     ; t4   t7
403    pmaxsd               m3, m14
404    punpckhqdq           m1, m2, m0 ; t3   t2
405    pminsd               m3, m15
406    punpcklqdq           m2, m0     ; t0   t1
407    pmulld               m3, m12
408    paddd                m0, m2, m1 ; dct4 out0 out1
409    psubd                m2, m1     ; dct4 out3 out2
410    REPX    {pmaxsd x, m14}, m8, m0, m2
411    REPX    {pminsd x, m15}, m8, m0, m2
412.main3:
    ; in: m3 = clipped t5a/t6a pair already multiplied by 2896
413    pshufd               m1, m3, q1032
414    paddd                m3, m13
415    psubd                m9, m3, m1
416    paddd                m3, m1
417    psrad                m9, 12
418    psrad                m3, 12
419    punpckhqdq           m1, m8, m3   ; t7   t6
420    shufpd               m8, m9, 0xaa ; t4   t5
421    ret
422.main_end:
    ; Final butterfly stage. m11 is both the rounding term and the
    ; per-element shift count; with m11 = 1 this yields (x + 1) >> 1.
423    paddd                m0, m11
424    paddd                m2, m11
425    psubd                m3, m0, m1 ; out7 out6
426    paddd                m0, m1     ; out0 out1
427    paddd                m1, m2, m8 ; out3 out2
428    psubd                m2, m8     ; out4 out5
429    REPX   {vpsravd x, m11}, m0, m2, m3, m1
430    ret
431
; Public 8x8 entry points whose first pass is an ADST.
432INV_TXFM_8X8_FN adst, dct
433INV_TXFM_8X8_FN adst, flipadst
434INV_TXFM_8X8_FN adst, identity
435INV_TXFM_8X8_FN adst, adst
436
; iadst_8x8_internal_10bpc
; Pass 1 (entry): ADST on 32-bit coefficients. It reuses the loader of the
; DCT function, so m11-m15 hold the constants documented at its .load label.
; .pass2: second pass via the 8 bpc implementation, output through the
; shared .end code of idct_8x8_internal_10bpc.
437cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
438    call m(idct_8x8_internal_10bpc).load
439    vpermi2q             m1, m6, m2 ; 7 5
440    vpermi2q             m3, m4, m0 ; 3 1
441    vpermt2q             m0, m5, m4 ; 0 2
442    vpermt2q             m2, m5, m6 ; 4 6
443    call .main
444    punpckldq            m1, m2, m4 ;  out4  out6
445    punpckhdq            m2, m0     ; -out5 -out7
446    punpckldq            m0, m3     ;  out0  out2
447    punpckhdq            m4, m3     ; -out1 -out3
    ; m11 = 1. "x + 1" adds the rounding term; "1 - x" undoes the negation
    ; and adds the rounding term in a single subtract.
448    paddd                m1, m11
449    psubd                m3, m11, m2
450    paddd                m0, m11
451    psubd                m4, m11, m4
452.pass1_end:
    ; Shared with the flipadst first pass.
    ; in: m0, m1 = even outputs, m4, m3 = odd outputs, rounding term added
453    REPX       {psrad x, 1}, m1, m0, m3, m4
454    packssdw             m0, m1     ; 0 2 4 6
455    packssdw             m4, m3     ; 1 3 5 7
456    psrlq                m1, [o(permB)], 8 ; each qword of permB shifted right by one byte
457    punpckhwd            m3, m0, m4
458    punpcklwd            m0, m4
459    psrlq                m2, m1, 32
460    vpermi2q             m1, m0, m3
461    vpermt2q             m0, m2, m3
462    jmp                tx2q
463.pass2:
464    call .main_pass2
465    movu                m10, [permC+2]
    ; 8 words of 2048 followed by 8 words of -2048, repeated (the load
    ; straddles two adjacent tables)
466    vbroadcasti32x8     m12, [pw_2048_m2048+16]
467    jmp m(idct_8x8_internal_10bpc).end
468.main_pass2:
    ; Prepares the register layout for the 8 bpc second pass and tail-jumps
    ; into it. r5 is repointed at the 8 bpc constant base, so o() must not
    ; be used after this point.
469    vextracti32x8       ym2, m0, 1
470    vextracti32x8       ym3, m1, 1
471    lea                  r5, [o_base_8bpc]
472    pshufd              ym4, ym0, q1032
473    pshufd              ym5, ym1, q1032
474    jmp m(iadst_8x8_internal_8bpc).main_pass2
475ALIGN function_align
476.main:
    ; in:  m0-m3 = paired inputs as arranged by the caller, m12 = 2896,
    ;      m13 = 2048 (rounding), m14/m15 = clip min/max
    ; out: m0 = out0/-out7, m4 = out6/-out1, m2 = out4/-out5, m3 = out2/-out3
    ; clobbers m1, m5, m6
477    ITX_MULSUB_2D         1, 0, 4, 5, 6, 13,  401_1931, 4076_3612
478    ITX_MULSUB_2D         3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
479    psubd                m4, m0, m2   ; t4  t6
480    paddd                m0, m2       ; t0  t2
481    psubd                m2, m1, m3   ; t5  t7
482    paddd                m1, m3       ; t1  t3
483    REPX    {pmaxsd x, m14}, m4, m2, m0, m1
484    REPX    {pminsd x, m15}, m4, m2, m0, m1
485    pxor                 m5, m5
486    psubd                m5, m4       ; m5 = -m4, supplies -t6 below
487    shufpd               m4, m2, 0xaa ; t4  t7
488    shufpd               m2, m5, 0xaa ; t5 -t6
489    ITX_MULSUB_2D         4, 2, 3, 5, 6, 13, 1567, 3784
490    punpckhqdq           m3, m0, m1
491    punpcklqdq           m0, m1
492    psubd                m1, m0, m3   ; t2  t3
493    paddd                m0, m3       ; out0 -out7
494    punpckhqdq           m3, m4, m2   ; t7a t6a
495    punpcklqdq           m4, m2       ; t5a t4a
496    psubd                m2, m4, m3   ; t7  t6
497    paddd                m4, m3       ; out6 -out1
498    REPX    {pmaxsd x, m14}, m1, m2
499    REPX    {pminsd x, m15}, m1, m2
500    shufpd               m3, m1, m2, 0xaa
501    shufpd               m1, m2, 0x55
502    pmulld               m3, m12
503    pmulld               m1, m12
504    paddd                m3, m13
505    psubd                m2, m3, m1
506    paddd                m3, m1
507    psrad                m2, 12       ; out4 -out5
508    pshufd               m3, m3, q1032
509    psrad                m3, 12       ; out2 -out3
510    ret
511
; Public 8x8 entry points whose first pass is a flipped ADST.
512INV_TXFM_8X8_FN flipadst, dct
513INV_TXFM_8X8_FN flipadst, adst
514INV_TXFM_8X8_FN flipadst, identity
515INV_TXFM_8X8_FN flipadst, flipadst
516
; iflipadst_8x8_internal_10bpc
; Same arithmetic as the ADST (it calls the ADST .main and .main_pass2); only
; the way the outputs are interleaved, signed and ordered differs.
517cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
518    call m(idct_8x8_internal_10bpc).load
519    vpermi2q             m1, m6, m2 ; 7 5
520    vpermi2q             m3, m4, m0 ; 3 1
521    vpermt2q             m0, m5, m4 ; 0 2
522    vpermt2q             m2, m5, m6 ; 4 6
523    call m(iadst_8x8_internal_10bpc).main
524    punpckhdq            m1, m3, m4 ; -out3 -out1
525    punpckldq            m3, m0     ;  out2  out0
526    punpckhdq            m0, m2     ; -out7 -out5
527    punpckldq            m4, m2     ;  out6  out4
    ; m11 = 1: "1 - x" negates and rounds, "x + 1" only rounds
528    psubd                m1, m11, m1
529    paddd                m3, m11
530    psubd                m0, m11, m0
531    paddd                m4, m11
532    jmp m(iadst_8x8_internal_10bpc).pass1_end
533.pass2:
534    call m(iadst_8x8_internal_10bpc).main_pass2
    ; r5 was repointed by .main_pass2, hence the direct constant addresses.
535    movu                m10, [permC+1]
    ; 8 words of -2048 followed by 8 words of 2048, repeated (the load
    ; straddles two adjacent tables)
536    vbroadcasti32x8     m12, [pw_m2048_2048+16]
537    lea                  r6, [strideq*3]
538    vpermt2q             m0, m10, m1 ; 7 6 5 4
539    vpbroadcastd        m11, [pixel_10bpc_max]
540    vpermt2q             m2, m10, m3 ; 3 2 1 0
541    pxor                m10, m10
    ; m2 (rows 3..0) is written first and m0 (rows 7..4) second
542    pmulhrsw             m8, m12, m2
543    call m(idct_8x8_internal_10bpc).write_8x4_start
544    pmulhrsw             m8, m12, m0
545    jmp m(idct_8x8_internal_10bpc).write_8x4
546
; Public 8x8 entry points whose first pass is the identity transform.
547INV_TXFM_8X8_FN identity, dct
548INV_TXFM_8X8_FN identity, adst
549INV_TXFM_8X8_FN identity, flipadst
550INV_TXFM_8X8_FN identity, identity
551
; iidentity_8x8_internal_10bpc
; Pass 1 (entry): performs no arithmetic; the 32-bit coefficients are only
; packed to 16 bits (with signed saturation) and reordered into the layout
; handed to the second pass.
; .pass2: reorders the rows and leaves scaling and output to the shared .end2
; code of idct_8x8_internal_10bpc.
552cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
553    mova                 m1, [cq+64*0]
554    packssdw             m1, [cq+64*2] ; 0 4   1 5
555    mova                 m2, [cq+64*1] ; 2 6   3 7
556    packssdw             m2, [cq+64*3]
557    mova                 m0, [o(idtx8x8p)]
558    vpermb               m1, m0, m1
559    vpermb               m2, m0, m2
560    punpckldq            m0, m1, m2    ; 0 1   4 5
561    punpckhdq            m1, m2        ; 2 3   6 7
562    jmp                tx2q
563.pass2:
    ; o() is used here, so r5 must still hold o_base when this is reached.
564    movu                 m3, [o(permC+2)]
565    vpbroadcastd        m12, [o(pw_4096)] ; pmulhrsw by 4096 = (x + 4) >> 3
566    psrlq                m2, m3, 32
567    vpermi2q             m2, m0, m1
568    vpermt2q             m0, m3, m1
569    jmp m(idct_8x8_internal_10bpc).end2
570
; INV_TXFM_8X16_FN type1, type2 [, eob_offset]
; Emits the 8x16 entry point via INV_TXFM_FN. For dct_dct it additionally
; emits the DC-only path (reached by fall-through when eob == 0): the DC
; value is pre-scaled here and the rest is done by the shared .dconly code of
; the 8x8 dct_dct entry point, with r3d = 16 rows.
571%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
572    INV_TXFM_FN          %1, %2, %3, 8x16
573%ifidn %1_%2, dct_dct
574    imul                r6d, [cq], 181 ; r6d = c[0] * 181
575    mov                [cq], eobd ; clear c[0] (eob is 0 on this path)
576    or                  r3d, 16    ; r3d (eob, known to be 0) = 16 rows
577    add                 r6d, 128
578    sar                 r6d, 8     ; (x + 128) >> 8
579    imul                r6d, 181   ; extra scaling step compared with the 8x8 path
580    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
581%endif
582%endmacro
583
; Public 8x16 entry points whose first pass is a DCT. The third argument is
; added to eob by INV_TXFM_FN before the first pass runs.
584INV_TXFM_8X16_FN dct, dct
585INV_TXFM_8X16_FN dct, identity, 35
586INV_TXFM_8X16_FN dct, flipadst
587INV_TXFM_8X16_FN dct, adst
588
; idct_8x16_internal_10bpc
; Pass 1 (entry): 8-point DCT over the coefficient rows at cq. With eob < 43
; the .fast path is taken, which reads only the low 32 bytes of each 64-byte
; row and reuses the 8x8 butterflies.
; .pass2: 16-point second pass via the 8 bpc implementation, then four
; 8x4 writes to the destination.
; .load/.load2/.round, .main*, .main_end/.main_end2 and .round_input_fast are
; helpers; several of them have no caller in the part of the file shown here.
589cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    ; NOTE(review): presumably drops a macro named cmp (from x86inc) so that
    ; the plain instruction is assembled; confirm against x86inc.asm.
590%undef cmp
591    cmp                eobd, 43
592    jl .fast
593    call .load
594    call .main
595    call .main_end
596.pass1_end:
    ; in: m0-m7 = first-pass outputs as dwords; packs them into m0-m3 as
    ; words and continues with the second pass. Also entered from the adst
    ; first pass.
597    packssdw             m0, m4
598    packssdw             m1, m5
599    packssdw             m2, m6
600    packssdw             m3, m7
601    jmp                tx2q
602.pass2:
603    mova                 m8, [o(idct8x16p)]
604    REPX  {vpermb x, m8, x}, m0, m1, m2, m3
605    punpckhdq            m5, m0, m1
606    punpckldq            m0, m1
607    punpckhdq            m4, m2, m3
608    punpckldq            m2, m3
609    punpcklqdq           m8, m0, m2 ; 15  1
610    punpckhqdq           m0, m2     ;  7  9
611    punpckhqdq           m1, m5, m4 ;  3 13
612    punpcklqdq           m5, m4     ; 11  5
    ; r5 now addresses the 8 bpc constants; constants of this file are
    ; referenced directly below.
613    lea                  r5, [o_base_8bpc]
614    vextracti32x8       ym7, m8, 1  ; 14  2
615    vextracti32x8       ym3, m0, 1  ;  6 10
616    vextracti32x8       ym6, m1, 1  ; 12  4
617    vextracti32x8       ym9, m5, 1  ;  8  0
618    call m(idct_8x16_internal_8bpc).main2
619    mova                 m8, [permC]
620    vpbroadcastd        m12, [pw_2048] ; pmulhrsw by 2048 = (x + 8) >> 4
    ; Register setup for write_8x4 (m10 = 0, m11 = pixel max, r6 = stride*3)
    ; is interleaved with the row permutes.
621    vpermt2q             m0, m8, m1
622    lea                  r6, [strideq*3]
623    vpermt2q             m2, m8, m3
624    vpbroadcastd        m11, [pixel_10bpc_max]
625    vpermt2q             m4, m8, m5
626    pxor                m10, m10
627    vpermt2q             m6, m8, m7
628    pmulhrsw             m8, m12, m0
629    call m(idct_8x8_internal_10bpc).write_8x4_start
630    pmulhrsw             m8, m12, m2
631    call m(idct_8x8_internal_10bpc).write_8x4
632    pmulhrsw             m8, m12, m4
633    call m(idct_8x8_internal_10bpc).write_8x4
634    pmulhrsw             m8, m12, m6
635    jmp m(idct_8x8_internal_10bpc).write_8x4
636.fast:
    ; Only the low 32 bytes of each coefficient row are read. The rows are
    ; loaded as the pairs (0,2) (1,5) (4,6) (7,3), which .round_input_fast
    ; merges into the paired layout taken by the 8x8 .main.
637    mova                ym0, [cq+64*0]
638    mova                ym4, [cq+64*2]
639    mova                ym1, [cq+64*1]
640    mova                ym5, [cq+64*5]
641    mova                ym2, [cq+64*4]
642    mova                ym6, [cq+64*6]
643    mova                ym3, [cq+64*7]
644    mova                ym7, [cq+64*3]
645    call .round_input_fast
646    call m(idct_8x8_internal_10bpc).main
647    call m(idct_8x8_internal_10bpc).main_end
648    movu                 m6, [o(permC+3)]
649    packssdw             m3, m1, m3
650    packssdw             m1, m0, m2
651    vprolq               m3, 32
652    vpermd               m1, m6, m1
653    vpermd               m3, m6, m3
654    mova                ym0, ym1    ; 0 4
655    vextracti32x8       ym1, m1, 1  ; 1 5
656    mova                ym2, ym3    ; 2 6
657    vextracti32x8       ym3, m3, 1  ; 3 7
658    jmp                tx2q
659ALIGN function_align
660.round_input_fast:
    ; Merges the register pairs (m0,m4) (m1,m5) (m2,m6) (m3,m7) into m0-m3,
    ; scales them with (x * 2896 + 2048) >> 12 and sets the constants
    ; m11 = 1, m12 = 2896, m13 = 2048, m14/m15 = clip min/max. Clobbers m8.
661    movshdup             m8, [o(permB)]
662    vpbroadcastd        m12, [o(pd_2896)]
663    vpermt2q             m0, m8, m4
664    vpermt2q             m1, m8, m5
665    vpermt2q             m2, m8, m6
666    vpermt2q             m3, m8, m7
667    vpbroadcastd        m13, [o(pd_2048)]
668    REPX    {pmulld x, m12}, m0, m1, m2, m3
669    vpbroadcastd        m14, [o(clip_18b_min)]
670    vpbroadcastd        m15, [o(clip_18b_max)]
671    REPX    {paddd  x, m13}, m0, m1, m2, m3
672    vpbroadcastd        m11, [o(pd_1)]
673    REPX    {psrad  x, 12 }, m0, m1, m2, m3
674    ret
675ALIGN function_align
676.load:
    ; Loads eight 64-byte coefficient rows into m0-m7, each scaled with
    ; (x * 2896 + 2048) >> 12. Sets m12 = 2896, m13 = 2048 and, when entered
    ; at .load rather than .load2, m14/m15 = clip min/max.
677    vpbroadcastd        m14, [o(clip_18b_min)]
678    vpbroadcastd        m15, [o(clip_18b_max)]
679.load2:
680    vpbroadcastd        m12, [o(pd_2896)]
681    pmulld               m0, m12, [cq+64*0]
682    pmulld               m1, m12, [cq+64*1]
683    pmulld               m2, m12, [cq+64*2]
684    pmulld               m3, m12, [cq+64*3]
685    vpbroadcastd        m13, [o(pd_2048)]
686    pmulld               m4, m12, [cq+64*4]
687    pmulld               m5, m12, [cq+64*5]
688    pmulld               m6, m12, [cq+64*6]
689    pmulld               m7, m12, [cq+64*7]
690.round:
    ; (x + 2048) >> 12 on m0-m7; m13 must hold 2048
691    REPX     {paddd x, m13}, m0, m1, m2, m3
692    REPX     {psrad x, 12 }, m0, m1, m2, m3
693    REPX     {paddd x, m13}, m4, m5, m6, m7
694    REPX     {psrad x, 12 }, m4, m5, m6, m7
695    ret
696ALIGN function_align
697.main_fast2_rect2:
    ; Rounds m0 and m1 with (x + 2048) >> 12 first, then falls through.
698    REPX     {paddd x, m13}, m0, m1
699    REPX     {psrad x, 12 }, m0, m1
700.main_fast2:
    ; Reduced variant that reads only m0 and m1. m1-m3 are returned as copies
    ; of m0, and m4-m6 and m8 carry the odd-half terms.
    ; NOTE(review): presumably for inputs where all other rows are zero;
    ; no caller is visible in this part of the file.
701    pmulld               m0, m12
702    pmulld               m6, m1, [o(pd_4017)] {1to16} ; t7a
703    pmulld               m8, m1, [o(pd_799)] {1to16}  ; t4a
704    REPX    {paddd  x, m13}, m0, m6, m8
705    REPX    {psrad  x, 12 }, m0, m6, m8
706    pmulld               m5, m6, m12
707    pmulld               m1, m8, m12
708    paddd                m5, m13
709    psubd                m4, m5, m1
710    paddd                m5, m1
711    REPX    {psrad  x, 12 }, m4, m5
712    REPX    {mova   x, m0 }, m1, m2, m3
713    ret
714.main_fast_rect2:
    ; Rounds m0-m3 with (x + 2048) >> 12 first, then falls through.
715    REPX     {paddd x, m13}, m0, m1, m2, m3
716    REPX     {psrad x, 12 }, m0, m1, m2, m3
717.main_fast:
    ; Variant that reads only m0-m3: every rotation of .main collapses to a
    ; single multiply per output, and t1 is a copy of t0.
718    pmulld               m0, m12
719    pmulld               m5, m3, [o(pd_2276)] {1to16} ; t5a
720    pmulld               m3, [o(pd_3406)] {1to16}     ; t6a
721    pmulld               m7, m1, [o(pd_4017)] {1to16} ; t7a
722    pmulld               m1, [o(pd_799)] {1to16}      ; t4a
723    pmulld               m6, m2, [o(pd_3784)] {1to16} ; t3
724    pmulld               m2, [o(pd_1567)] {1to16}     ; t2
725    paddd                m0, m13
726    psubd                m5, m13, m5                  ; negates t5a and adds the rounding term
727    psrad                m0, 12                       ; t0
728    mova                 m9, m0                       ; t1
729    jmp .main2
730.main_rect2:
731    call .round
732.main:
    ; in:  m0-m7 = the eight input rows, m12 = 2896, m13 = 2048,
    ;      m14/m15 = clip min/max
    ; out: m0-m3 = dct4 out0-out3 (clipped), m4, m5, m6, m8 = odd-half terms
    ;      consumed by .main_end; clobbers m7, m9, m10
    ; Rounding is skipped in the macros (rnd = _) and applied below.
733    pmulld               m0, m12
734    ITX_MULSUB_2D         5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
735    ITX_MULSUB_2D         1, 7, 8, 9, 10, _,  799, 4017 ; t4a t7a
736    ITX_MULSUB_2D         2, 6, 8, 9, 10, _, 1567, 3784 ; t2  t3
737    pmulld               m4, m12
738    paddd                m0, m13
739    paddd                m5, m13    ; m5 gets its rounding term here, the rest in .main2
740    psubd                m9, m0, m4 ; t1
741    paddd                m0, m4     ; t0
742    psrad                m9, 12
743    psrad                m0, 12
744.main2:
    ; in: m0 = t0, m9 = t1 (both final); m1-m3, m5-m7 = products that still
    ;     need ">> 12"; of those only m5 already includes the rounding term
745    REPX    {paddd  x, m13}, m3, m1, m7
746    REPX    {psrad  x, 12 }, m5, m1, m3, m7
747    paddd                m8, m1, m5 ; t4
748    psubd                m1, m5     ; t5a
749    psubd                m5, m7, m3 ; t6a
750    paddd                m7, m3     ; t7
751    pmaxsd               m5, m14
752    pmaxsd               m1, m14
753    paddd                m2, m13
754    paddd                m6, m13
755    pminsd               m5, m15
756    pminsd               m1, m15
757    pmulld               m5, m12
758    pmulld               m1, m12
759    pmaxsd               m8, m14
760    pmaxsd               m7, m14
761    pminsd               m8, m15
762    paddd                m5, m13
763    psubd                m4, m5, m1
764    paddd                m5, m1
765    REPX    {psrad  x, 12 }, m2, m6, m5, m4
766    paddd                m1, m9, m2 ; dct4 out1
767    psubd                m2, m9, m2 ; dct4 out2
768    psubd                m3, m0, m6 ; dct4 out3
769    paddd                m0, m6     ; dct4 out0
770    pminsd               m6, m15, m7 ; upper clamp of t7, result moved to m6
771    REPX    {pmaxsd x, m14}, m0, m1, m2, m3
772    REPX    {pminsd x, m15}, m0, m1, m2, m3
773    ret
774.main_end:
    ; Final butterfly stage with rounding term m11 = 1, giving (x + 1) >> 1.
775    vpbroadcastd        m11, [o(pd_1)]
776.main_end2:
    ; Same, but the caller supplies m11; it serves as both the rounding term
    ; and the per-element shift count.
777    REPX     {paddd x, m11}, m0, m1, m2, m3
778    psubd                m7, m0, m6 ; out7
779    paddd                m0, m6     ; out0
780    psubd                m6, m1, m5 ; out6
781    paddd                m1, m5     ; out1
782    psubd                m5, m2, m4 ; out5
783    paddd                m2, m4     ; out2
784    psubd                m4, m3, m8 ; out4
785    paddd                m3, m8     ; out3
786    REPX   {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
787    ret
788
789INV_TXFM_8X16_FN adst, dct
790INV_TXFM_8X16_FN adst, identity, 35
791INV_TXFM_8X16_FN adst, flipadst
792INV_TXFM_8X16_FN adst, adst
793
; iadst_8x16_internal_10bpc
; First pass: inverse ADST on the dword coefficients in cq. The full path
; (eob >= 43) hands its results to idct_8x16's .pass1_end (outside this view);
; the reduced path (.fast) packs/permutes the results itself and jumps to the
; second-pass routine through tx2q.
; Second pass (.pass2): runs the 8 bpc iadst_8x16 .main2 kernel on word data,
; then scales, adds to dst and clamps via the idct_8x8 write_8x4 helpers.
; Local entry points reused by other functions in this file: .main,
; .fast_main, .fast_end, .pass2_main, .pass2_end.
794cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
795%undef cmp
    ; eob < 43 selects the reduced-input path
796    cmp                eobd, 43
797    jl .fast
798    call m(idct_8x16_internal_10bpc).load
799    call .main
    ; .main leaves out0/out1/out6/out7 (bias already added) in m0/m1/m10/m11
    ; and the 1448-scaled, biased middle outputs in m2/m3/m8/m9; apply the
    ; final shifts and gather everything into m0-m7.
800    psrad                m0, 1
801    psrad                m1, 1
802    psrad                m6, m10, 1
803    psrad                m7, m11, 1
804    psrad                m2, 12
805    psrad                m3, 12
806    psrad                m4, m8, 12
807    psrad                m5, m9, 12
808    jmp m(idct_8x16_internal_10bpc).pass1_end
809.fast:
810    call .fast_main
    ; Regroup the qword halves so that the non-negated outputs sit in m1/m0 and
    ; the negated ones in m2/m4 (see the per-line comments).
811    punpcklqdq           m1, m2, m4 ;  out4  out6
812    punpckhqdq           m2, m0     ; -out5 -out7
813    punpcklqdq           m0, m3     ;  out0  out2
814    punpckhqdq           m4, m3     ; -out1 -out3
    ; m11 is used as the rounding bias here; it is not set in this function, so
    ; it is presumably left by iadst_8x8's .main -- TODO confirm. Negated
    ; outputs are un-negated by subtracting them from the bias.
815    paddd                m1, m11
816    psubd                m3, m11, m2
817    paddd                m0, m11
818    psubd                m4, m11, m4
819.fast_end:
    ; Shared tail (also entered from iflipadst_8x16): expects even outputs in
    ; m0/m1 and odd outputs in m4/m3. Shift right by 1, pack dwords to words
    ; with signed saturation, permute dwords with the table at permC+3, then
    ; split each 512-bit result into two 256-bit halves (low halves to ym0/ym1,
    ; high halves to ym2/ym3) before jumping to the second pass.
820    movu                 m5, [o(permC+3)]
821    REPX       {psrad x, 1}, m1, m0, m3, m4
822    packssdw             m2, m0, m1 ; 0 2 4 6
823    packssdw             m3, m4, m3 ; 1 3 5 7
824    vpermd               m2, m5, m2
825    vpermd               m3, m5, m3
826    mova                ym0, ym2
827    vextracti32x8       ym2, m2, 1
828    mova                ym1, ym3
829    vextracti32x8       ym3, m3, 1
830    jmp                tx2q
831.pass2:
832    call .pass2_main
    ; Build four qword permutations from a single index table: each psrlq moves
    ; a different byte of every qword into the low position, which is where the
    ; qword-permute instructions read their index from.
833    movu                 m4, [permB+2]
834    vbroadcasti32x8     m12, [pw_2048_m2048+16]
835    psrlq                m7, m4, 8
836    vpermi2q             m4, m0, m3 ;  0  1  2  3
837    psrlq                m5, m7, 24
838    vpermi2q             m7, m0, m3 ; 12 13 14 15
839    psrlq                m6, m5, 8
840    vpermq               m5, m5, m1 ;  4  5  6  7
841    vpermq               m6, m6, m2 ;  8  9 10 11
842.pass2_end:
    ; Shared store tail (also entered from iflipadst_8x16): m4-m7 hold the four
    ; row groups, m12 the per-word multiplier. m11 = pixel_10bpc_max, m10 = 0
    ; and r6 = stride*3 are set up for the idct_8x8 write_8x4 helpers (outside
    ; this view). Each group is scaled with pmulhrsw into m8 before the call;
    ; the last helper is reached with a tail jump, so its ret returns to our
    ; caller.
843    vpbroadcastd        m11, [pixel_10bpc_max]
844    pxor                m10, m10
845    lea                  r6, [strideq*3]
846    pmulhrsw             m8, m12, m4
847    call m(idct_8x8_internal_10bpc).write_8x4_start
848    pmulhrsw             m8, m12, m5
849    call m(idct_8x8_internal_10bpc).write_8x4
850    pmulhrsw             m8, m12, m6
851    call m(idct_8x8_internal_10bpc).write_8x4
852    pmulhrsw             m8, m12, m7
853    jmp m(idct_8x8_internal_10bpc).write_8x4
854ALIGN function_align
855.main:
    ; 8-point inverse ADST on dword lanes.
    ; In:  m0-m7 = inputs, m13 = rounding register handed to ITX_MULSUB_2D,
    ;      m14/m15 = lower/upper clamp bounds (all set up by the caller).
    ; Out: m0, m1, m10, m11 = out0, out1, out6, out7 with the pd_1 bias added
    ;      (caller shifts right by 1);
    ;      m2, m8 = (t6+t7)*1448, (t2-t3)*1448 plus the pd_3072 bias and
    ;      m3, m9 = pd_3071 minus (t2+t3)*1448, (t6-t7)*1448
    ;      (caller shifts right by 12).
    ; Clobbers m4-m7 and m12.
856    ITX_MULSUB_2D         7, 0, 8, 9, 10, 13,  401, 4076 ; t1a, t0a
857    ITX_MULSUB_2D         1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
858    ITX_MULSUB_2D         5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
859    ITX_MULSUB_2D         3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
860    psubd                m8, m2, m6 ; t6
861    paddd                m2, m6     ; t2
862    psubd                m6, m0, m4 ; t4
863    paddd                m0, m4     ; t0
864    psubd                m4, m5, m1 ; t7
865    paddd                m5, m1     ; t3
866    psubd                m1, m7, m3 ; t5
867    paddd                m7, m3     ; t1
    ; clamp every intermediate to [m14, m15] before the next multiply stage
868    REPX    {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
869    REPX    {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
870    vpbroadcastd        m10, [o(pd_1567)]
871    vpbroadcastd        m11, [o(pd_3784)]
872    ITX_MULSUB_2D         6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
873    ITX_MULSUB_2D         4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
874    vpbroadcastd        m12, [o(pd_1448)]
875    psubd                m9, m6, m8 ;  t7
876    paddd                m6, m8     ;  out6
877    psubd                m3, m7, m5 ;  t3
878    paddd                m7, m5     ; -out7
879    psubd                m5, m0, m2 ;  t2
880    paddd                m0, m2     ;  out0
881    psubd                m2, m1, m4 ;  t6
882    paddd                m1, m4     ; -out1
883    REPX    {pmaxsd x, m14}, m5, m3, m2, m9
884    REPX    {pminsd x, m15}, m5, m3, m2, m9
885    REPX    {pmulld x, m12}, m5, m3, m2, m9
886    vpbroadcastd         m4, [o(pd_1)]
887    psubd                m8, m5, m3 ; (t2 - t3) * 1448
888    paddd                m3, m5     ; (t2 + t3) * 1448
889    psubd                m5, m2, m9 ; (t6 - t7) * 1448
890    paddd                m2, m9     ; (t6 + t7) * 1448
891    vpbroadcastd         m9, [o(pd_3072)]
    ; add the pd_1 bias to the outer outputs; the negated ones (-out1, -out7)
    ; are subtracted from the bias instead, which also restores their sign
892    paddd                m0, m4
893    psubd                m1, m4, m1
894    paddd               m10, m6, m4
895    psubd               m11, m4, m7
896    paddd                m2, m9
897    paddd                m8, m9
    ; the two remaining middle terms are subtracted from pd_3071 (bias and sign
    ; flip in one instruction)
898    vpbroadcastd         m9, [o(pd_3071)]
899    psubd                m3, m9, m3
900    psubd                m9, m5
901    ret
902ALIGN function_align
903.fast_main:
    ; Reduced-input first pass: load only the low 256 bits of each 64-byte
    ; coefficient row (in the order the 8x8 ADST expects), round the inputs via
    ; idct_8x16's .round_input_fast and tail-jump to the 8x8 ADST kernel.
    ; Both callees are outside this view.
904    mova                ym0, [cq+64*0]
905    mova                ym4, [cq+64*2]
906    mova                ym1, [cq+64*7]
907    mova                ym5, [cq+64*5]
908    mova                ym2, [cq+64*4]
909    mova                ym6, [cq+64*6]
910    mova                ym3, [cq+64*3]
911    mova                ym7, [cq+64*1]
912    call m(idct_8x16_internal_10bpc).round_input_fast
913    jmp m(iadst_8x8_internal_10bpc).main
914ALIGN function_align
915.pass2_main:
    ; Second-pass core shared with iflipadst_8x16.
    ; In:  m0-m3 = word data produced by the first pass.
    ; Byte-permute the inputs with the iadst8x16p table, interleave them into
    ; the layout noted on the punpck*qdq lines and run the 8 bpc kernel.
    ; r5 is pointed at o_base_8bpc first, presumably because the 8 bpc code
    ; addresses its constants relative to that base -- TODO confirm.
916    mova                 m8, [o(iadst8x16p)]
917    REPX  {vpermb x, m8, x}, m0, m1, m2, m3
918    vpbroadcastd        m10, [o(pw_2896x8)]
919    punpckhdq            m5, m0, m1
920    punpckldq            m0, m1
921    punpckhdq            m1, m2, m3
922    punpckldq            m2, m3
923    lea                  r5, [o_base_8bpc]
924    punpckhqdq           m4, m0, m2 ; 12  3   14  1
925    punpcklqdq           m0, m2     ;  0 15    2 13
926    punpckhqdq           m6, m5, m1 ;  8  7   10  5
927    punpcklqdq           m5, m1     ;  4 11    6  9
928    call m(iadst_8x16_internal_8bpc).main2
    ; final butterfly on m2/m4 with saturating word arithmetic, then scale both
    ; results by m10 (pw_2896x8) with pmulhrsw
929    paddsw               m1, m2, m4
930    psubsw               m2, m4
931    pmulhrsw             m1, m10    ; -out7   out4   out6  -out5
932    pmulhrsw             m2, m10    ;  out8  -out11 -out9   out10
933    ret
934
; Expand INV_TXFM_8X16_FN once for each (type1, type2) pair whose first type
; is flipadst. The macro is defined outside this view; the optional third
; argument is presumably an eob offset -- TODO confirm.
935INV_TXFM_8X16_FN flipadst, dct
936INV_TXFM_8X16_FN flipadst, identity, 35
937INV_TXFM_8X16_FN flipadst, adst
938INV_TXFM_8X16_FN flipadst, flipadst
939
; iflipadst_8x16_internal_10bpc
; Flipped variant of iadst_8x16_internal_10bpc. It reuses that function's
; .main, .fast_main, .fast_end, .pass2_main and .pass2_end and differs only in
; the order in which results are assigned to registers (and, in the second
; pass, in the multiplier constant).
940cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
941%undef cmp
    ; eob < 43 selects the reduced-input path
942    cmp                eobd, 43
943    jl .fast
944    call m(idct_8x16_internal_10bpc).load
945    call m(iadst_8x16_internal_10bpc).main
    ; Same shifts as the non-flipped function, but the results land in m7..m0
    ; instead of m0..m7. Each source register is consumed before it is
    ; overwritten, so the instruction order matters.
946    psrad                m7, m0, 1
947    psrad                m0, m11, 1
948    psrad                m6, m1, 1
949    psrad                m1, m10, 1
950    psrad                m5, m2, 12
951    psrad                m2, m9, 12
952    psrad                m4, m3, 12
953    psrad                m3, m8, 12
954    jmp m(idct_8x16_internal_10bpc).pass1_end
955.fast:
956    call m(iadst_8x16_internal_10bpc).fast_main
    ; regroup the qword halves in reversed order (see per-line comments)
957    punpckhqdq           m1, m3, m4 ; -out3 -out1
958    punpcklqdq           m3, m0     ;  out2  out0
959    punpckhqdq           m0, m2     ; -out7 -out5
960    punpcklqdq           m4, m2     ;  out6  out4
    ; m11 is used as the rounding bias (not set in this function); negated
    ; outputs are subtracted from it, the others have it added
961    psubd                m1, m11, m1
962    paddd                m3, m11
963    psubd                m0, m11, m0
964    paddd                m4, m11
965    jmp m(iadst_8x16_internal_10bpc).fast_end
966.pass2:
967    call m(iadst_8x16_internal_10bpc).pass2_main
    ; Same index-table trick as the non-flipped .pass2 (successive psrlq expose
    ; successive index bytes), with the source operands swapped so the row
    ; groups come out in reverse order, and with pw_m2048_2048 as multiplier.
968    movu                 m7, [permB+2]
969    vbroadcasti32x8     m12, [pw_m2048_2048+16]
970    psrlq                m4, m7, 8
971    vpermi2q             m7, m3, m0 ;  3  2  1  0
972    psrlq                m5, m4, 24
973    vpermi2q             m4, m3, m0 ; 15 14 13 12
974    psrlq                m6, m5, 8
975    vpermq               m5, m5, m2 ; 11 10  9  8
976    vpermq               m6, m6, m1 ;  7  6  5  4
977    jmp m(iadst_8x16_internal_10bpc).pass2_end
978
; Expand INV_TXFM_8X16_FN once for each (type1, type2) pair whose first type
; is identity. The macro is defined outside this view.
979INV_TXFM_8X16_FN identity, dct
980INV_TXFM_8X16_FN identity, adst
981INV_TXFM_8X16_FN identity, flipadst
982INV_TXFM_8X16_FN identity, identity
983
; iidentity_8x16_internal_10bpc
; First pass: load the coefficients with idct_8x16's .load2 and go straight to
; that function's .pass1_end (both outside this view); no transform math is
; done here.
; Second pass (.pass2): scale the word data in m0-m3, transpose it, then add
; it to dst with clamping, four rows per .write_8x4 invocation.
984cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
985    call m(idct_8x16_internal_10bpc).load2
986    jmp m(idct_8x16_internal_10bpc).pass1_end
987.pass2:
    ; x = 2*x + pmulhrsw(x, pw_1697x16), using saturating word adds.
988    vpbroadcastd         m8, [o(pw_1697x16)]
989    pmulhrsw             m4, m8, m0
990    pmulhrsw             m5, m8, m1
991    pmulhrsw             m6, m8, m2
992    pmulhrsw             m7, m8, m3
993    REPX      {paddsw x, x}, m0, m1, m2, m3
994    paddsw               m0, m4
995    paddsw               m1, m5
996    paddsw               m2, m6
997    paddsw               m3, m7
    ; m7 = final pmulhrsw multiplier, m6 = upper pixel clamp, m5 = zero (used
    ; both as lower clamp and to clear the coefficient buffer); the loads are
    ; interleaved with the word/dword/qword transpose.
998    vpbroadcastd         m7, [o(pw_2048)]
999    punpckhwd            m4, m0, m1
1000    punpcklwd            m0, m1
1001    punpckhwd            m1, m2, m3
1002    punpcklwd            m2, m3
1003    vpbroadcastd         m6, [o(pixel_10bpc_max)]
1004    punpckhdq            m3, m0, m2
1005    punpckldq            m0, m2
1006    punpckldq            m2, m4, m1
1007    punpckhdq            m4, m1
1008    pxor                 m5, m5
1009    punpckhqdq           m1, m0, m2 ;  1  5  9 13
1010    punpcklqdq           m0, m2     ;  0  4  8 12
1011    punpcklqdq           m2, m3, m4 ;  2  6 10 14
1012    punpckhqdq           m3, m4     ;  3  7 11 15
    ; r6 = stride*3, so [dstq+r6*4] addresses row 12
1013    lea                  r6, [strideq*3]
1014    pmulhrsw             m0, m7
1015    call .write_8x4_start
1016    pmulhrsw             m0, m7, m1
1017    call .write_8x4
1018    pmulhrsw             m0, m7, m2
1019    call .write_8x4
1020    pmulhrsw             m0, m7, m3
    ; the last group falls through into .write_8x4; its ret returns to our
    ; caller
1021.write_8x4:
    ; advance to the next destination row and the next 128 bytes of cq
1022    add                dstq, strideq
1023    add                  cq, 64*2
1024.write_8x4_start:
    ; In: m0 = four rows of 8 words to add, m5 = 0, m6 = upper clamp,
    ;     r6 = stride*3.
    ; Gather the 16-byte rows at dst + stride*{0,4,8,12} into m4, clear two
    ; 64-byte blocks of the coefficient buffer, add, clamp to [m5, m6] and
    ; scatter the rows back.
1025    mova                xm4, [dstq+strideq*0]
1026    vinserti32x4        ym4, [dstq+strideq*4], 1
1027    vinserti32x4         m4, [dstq+strideq*8], 2
1028    vinserti32x4         m4, [dstq+r6*4     ], 3
1029    mova          [cq+64*0], m5
1030    mova          [cq+64*1], m5
1031    paddw                m4, m0
1032    pmaxsw               m4, m5
1033    pminsw               m4, m6
1034    mova          [dstq+strideq*0], xm4
1035    vextracti32x4 [dstq+strideq*4], ym4, 1
1036    vextracti32x4 [dstq+strideq*8], m4, 2
1037    vextracti32x4 [dstq+r6*4     ], m4, 3
1038    ret
1039
; INV_TXFM_16X8_FN type1, type2[, eob_offset = 0]
; Forwards its arguments, plus the block size 16x8, to INV_TXFM_FN (defined
; outside this view). For the dct_dct pair only, it additionally emits the
; DC-only code below, including the .dconly / .dconly2 / .dconly_loop labels;
; .dconly2 is also jumped to from INV_TXFM_16X16_FN.
1040%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
1041    INV_TXFM_FN          %1, %2, %3, 16x8
1042%ifidn %1_%2, dct_dct
    ; r6d = dc * 181; the dc coefficient in memory is then overwritten with
    ; eobd (0 on this path, per the existing comment)
1043    imul                r6d, [cq], 181
1044    mov                [cq], eobd ; 0
    ; r3d becomes the row counter for .dconly_loop. NOTE(review): this relies on
    ; r3d being eobd and therefore 0 here, giving r3d = 8 -- confirm against
    ; INV_TXFM_FN.
1045    or                  r3d, 8
1046.dconly:
    ; r6d = (((r6d + 128) >> 8) * 181 + 384) >> 9
1047    add                 r6d, 128
1048    sar                 r6d, 8
1049    imul                r6d, 181
1050    add                 r6d, 384
1051    sar                 r6d, 9
1052.dconly2:
    ; r6d = (r6d * 181 + 2176) >> 12, broadcast as words and pre-biased by
    ; dconly_10bpc (m2) using a saturating add
1053    vpbroadcastd         m2, [o(dconly_10bpc)]
1054    imul                r6d, 181
1055    add                 r6d, 2176
1056    sar                 r6d, 12
1057    vpbroadcastw         m1, r6d
1058    paddsw               m1, m2
1059.dconly_loop:
    ; Two 32-byte rows per iteration: signed-saturating add of the biased dc,
    ; then unsigned-saturating subtract of the bias. The saturation performs
    ; the pixel clamp; the exact bounds depend on the value of dconly_10bpc,
    ; which is defined elsewhere.
1060    mova                ym0, [dstq+strideq*0]
1061    vinserti32x8         m0, [dstq+strideq*1], 1
1062    paddsw               m0, m1
1063    psubusw              m0, m2
1064    mova          [dstq+strideq*0], ym0
1065    vextracti32x8 [dstq+strideq*1], m0, 1
1066    lea                dstq, [dstq+strideq*2]
1067    sub                 r3d, 2
1068    jg .dconly_loop
1069    RET
1070%endif
1071%endmacro
1072
; Expand INV_TXFM_16X8_FN once for each (type1, type2) pair whose first type
; is dct. The dct/dct expansion also emits the DC-only path; the dct/identity
; pair passes -21 as eob_offset.
1073INV_TXFM_16X8_FN dct, dct
1074INV_TXFM_16X8_FN dct, identity, -21
1075INV_TXFM_16X8_FN dct, flipadst
1076INV_TXFM_16X8_FN dct, adst
1077
; idct_16x8_internal_10bpc
; First pass: every coefficient is multiplied by pd_2896, rounded with pd_2048
; and shifted right by 12 before the inverse DCT on dword lanes; the
; coefficient buffer is zeroed as it is read. eob < 43 takes a path that only
; reads the first four 64-byte rows. The result is transposed to words and
; control passes to the second pass through tx2q.
; Second pass (.pass2): runs the 8 bpc idct_16x8 .main kernel and writes the
; result to dst.
; Local entry points reused by other functions in this file: .pass1_end2,
; .end, .write_16x4_noround, .main_fast, .transpose_16x8.
1078cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1079%undef cmp
1080    vpbroadcastd        m12, [o(pd_2896)]
1081    pmulld               m4, m12, [cq+64*0] ;  0  1
1082    pmulld               m9, m12, [cq+64*1] ;  2  3
1083    pmulld               m8, m12, [cq+64*2] ;  4  5
1084    pmulld               m7, m12, [cq+64*3] ;  6  7
1085    vpbroadcastd        m13, [o(pd_2048)]
1086    pxor                 m2, m2
    ; m15 temporarily holds the permB qword-permute table; it is reloaded with
    ; the upper clamp bound once the permutes are done
1087    mova                m15, [o(permB)]
1088    REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
    ; m0 (copied to m1/m2/m3 below) = second index set taken from the upper
    ; dword of each permB qword
1089    psrlq                m0, m15, 32
1090    REPX     {paddd x, m13}, m4, m9, m8, m7
1091    vpbroadcastd        m14, [o(clip_18b_min)]
1092    REPX     {psrad x, 12 }, m4, m8, m9, m7
1093    mova                 m1, m0
1094    vpermi2q             m0, m4, m8   ;  0  4
1095    cmp                eobd, 43
1096    jl .fast
    ; full path: scale rows 4-7 the same way and clear them (m2 is still zero)
1097    pmulld               m5, m12, [cq+64*4] ;  8  9
1098    pmulld              m10, m12, [cq+64*5] ; 10 11
1099    pmulld              m11, m12, [cq+64*6] ; 12 13
1100    pmulld               m6, m12, [cq+64*7] ; 14 15
1101    REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
1102    REPX     {paddd x, m13}, m5, m10, m11, m6
1103    REPX     {psrad x, 12 }, m10, m5, m11, m6
1104    mova                 m2, m1
1105    vpermi2q             m1, m9, m10  ;  2 10
1106    mova                 m3, m2
1107    vpermi2q             m2, m5, m11  ;  8 12
1108    vpermi2q             m3, m6, m7   ; 14  6
1109    vpermt2q             m4, m15, m11 ;  1 13
1110    vpermt2q             m6, m15, m9  ; 15  3
1111    vpermt2q             m5, m15, m8  ;  9  5
1112    vpermt2q             m7, m15, m10 ;  7 11
1113    vpbroadcastd        m15, [o(clip_18b_max)]
    ; even half via the 8x8 idct kernel (outside this view), odd half via .main
1114    call m(idct_8x8_internal_10bpc).main
1115    call .main
1116    jmp .pass1_end
1117.fast:
1118    vpermi2q             m1, m9, m7   ;  2  6
1119    vpermt2q             m4, m15, m9  ;  1  3
1120    vpermt2q             m7, m15, m8  ;  7  5
1121    vpbroadcastd        m15, [o(clip_18b_max)]
1122    call m(idct_8x8_internal_10bpc).main_fast
1123    call .main_fast
1124.pass1_end:
1125    call m(idct_8x16_internal_10bpc).main_end
1126    mova                 m8, [o(permA)]
1127    psrlq                m9, m8, 8
1128.pass1_end2:
    ; In: m0-m7 = first-pass results, m8/m9 = dword-permute index vectors.
    ; They are duplicated into m11/m10 because .transpose_16x8 overwrites its
    ; index registers.
1129    mova                m10, m9
1130    mova                m11, m8
1131    call .transpose_16x8
1132    jmp                tx2q
1133.pass2:
    ; r5 is pointed at o_base_8bpc before calling into the 8 bpc kernel,
    ; presumably because that code addresses its constants relative to that
    ; base -- TODO confirm
1134    lea                  r5, [o_base_8bpc]
1135    call m(idct_16x8_internal_8bpc).main
1136    movshdup             m4, [permC]
1137    vpbroadcastd        m11, [pw_2048]
1138    psrlq                m5, m4, 8
1139.end:
    ; In: m0-m3 = word results, m4/m5 = qword-permute indices, m11 = pmulhrsw
    ;     multiplier.
    ; Sets m13 = upper pixel clamp, m12 = 0, r6 = stride*3 and writes two groups
    ; of four rows: the first through a call, the second by falling through into
    ; .write_16x4, whose ret returns to our caller.
1140    vpbroadcastd        m13, [pixel_10bpc_max]
1141    pxor                m12, m12
1142    vpermq               m8, m4, m0
1143    vpermq               m9, m5, m1
1144    lea                  r6, [strideq*3]
1145    call .write_16x4
1146    vpermq               m8, m4, m2
1147    vpermq               m9, m5, m3
1148.write_16x4:
1149    pmulhrsw             m8, m11
1150    pmulhrsw             m9, m11
1151.write_16x4_noround:
    ; In: m8 = rows 0-1, m9 = rows 2-3 (32 bytes per row), m12 = lower clamp,
    ;     m13 = upper clamp, r6 = stride*3.
    ; Adds the destination pixels, clamps, stores and advances dstq by four
    ; rows. Clobbers m10.
1152    mova               ym10, [dstq+strideq*0]
1153    vinserti32x8        m10, [dstq+strideq*1], 1
1154    paddw                m8, m10
1155    mova               ym10, [dstq+strideq*2]
1156    vinserti32x8        m10, [dstq+r6       ], 1
1157    paddw                m9, m10
1158    pmaxsw               m8, m12
1159    pmaxsw               m9, m12
1160    pminsw               m8, m13
1161    pminsw               m9, m13
1162    mova          [dstq+strideq*0], ym8
1163    vextracti32x8 [dstq+strideq*1], m8, 1
1164    mova          [dstq+strideq*2], ym9
1165    vextracti32x8 [dstq+r6       ], m9, 1
1166    lea                dstq, [dstq+strideq*4]
1167    ret
1168ALIGN function_align
1169.main_fast: ; bottom half is zero
    ; With half of the odd inputs known to be zero, each rotation reduces to
    ; one multiply per term; the constant pairs are broadcast per 128-bit lane.
1170    vbroadcasti32x4      m6, [o(pd_4076_3920)]
1171    vbroadcasti32x4      m3, [o(pd_401_m1189)]
1172    vbroadcasti32x4      m5, [o(pd_m2598_1931)]
1173    vbroadcasti32x4      m9, [o(pd_3166_3612)]
1174    pmulld               m6, m4    ; t15a t12a
1175    pmulld               m4, m3    ; t8a  t11a
1176    pmulld               m5, m7    ; t9a  t10a
1177    pmulld               m7, m9    ; t14a t13a
1178    jmp .main2
1179.main:
    ; Odd half of the 16-point inverse DCT (t8..t15, per the comments below).
    ; In:  m4-m7 = odd inputs, m0-m2 and m8 = partial results of the even half,
    ;      m12 = pd_2896, m13 = pd_2048, m14/m15 = clamp bounds.
    ; Rounding is not done inside ITX_MULSUB_2D here (its rounding argument is
    ; _); it is applied at .main2.
1180    ITX_MULSUB_2D         4, 6, 3, 9, 10, _,  401_3920, 4076_1189
1181    ITX_MULSUB_2D         5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
1182.main2:
1183    REPX     {paddd x, m13}, m4, m6, m5, m7
1184    REPX     {psrad x, 12 }, m4, m5, m6, m7
1185    paddd                m9, m4, m5 ; t8   t11
1186    psubd                m4, m5     ; t9   t10
1187    psubd                m5, m6, m7 ; t14  t13
1188    paddd                m6, m7     ; t15  t12
1189    REPX    {pmaxsd x, m14}, m5, m4, m9, m6
1190    REPX    {pminsd x, m15}, m5, m4, m9, m6
1191.main3:
    ; The last butterflies of the even half (dct8 out*) are interleaved with
    ; the odd-half rotations below.
1192    psubd                m3, m0, m1 ; dct8 out7 out6
1193    paddd                m0, m1     ; dct8 out0 out1
1194    vbroadcasti32x4      m7, [o(pd_3784_m3784)]
1195    pmulld               m7, m5
1196    vpmulld              m5, [o(pd_1567)] {1to16}
1197    paddd                m1, m2, m8 ; dct8 out3 out2
1198    psubd                m2, m8     ; dct8 out4 out5
1199    vbroadcasti32x4      m8, [o(pd_1567_m1567)]
1200    pmulld               m8, m4
1201    vpmulld              m4, [o(pd_3784)] {1to16}
1202    REPX    {pmaxsd x, m14}, m0, m1
1203    REPX    {pminsd x, m15}, m0, m1
1204    paddd                m7, m13
1205    paddd                m5, m13
1206    paddd                m7, m8
1207    psubd                m5, m4
1208    psrad                m7, 12     ; t14a t10a
1209    psrad                m5, 12     ; t9a  t13a
1210    punpckhqdq           m4, m9, m7
1211    punpcklqdq           m8, m9, m5
1212    punpckhqdq           m5, m6, m5
1213    punpcklqdq           m6, m7
1214    psubd                m7, m8, m4 ; t11a t10
1215    paddd                m8, m4     ; t8a  t9
1216    psubd                m4, m6, m5 ; t12a t13
1217    paddd                m6, m5     ; t15a t14
1218    REPX    {pmaxsd x, m14}, m4, m7
1219    REPX    {pminsd x, m15}, m4, m7
    ; final rotation: (m4 -/+ m7) * m12, rounded with m13 and shifted by 12
1220    pmulld               m4, m12
1221    pmulld               m7, m12
1222    REPX    {pmaxsd x, m14}, m2, m3, m6, m8
1223    REPX    {pminsd x, m15}, m2, m3, m6, m8
1224    paddd                m4, m13
1225    paddd                m5, m4, m7
1226    psubd                m4, m7
1227    psrad                m4, 12     ; t11 t10a
1228    psrad                m5, 12     ; t12 t13a
1229    ret
1230ALIGN function_align
1231.transpose_16x8:
    ; In:  m0-m7 = dword rows, m8-m11 = dword-permute index vectors.
    ; Out: m0-m3 = word data after pack + permute + interleave.
    ; Packs dword pairs to words with signed saturation, applies the two-source
    ; dword permutes and finishes with word/dword unpacks.
    ; Clobbers m4 and m8-m11.
1232    packssdw             m0, m4
1233    packssdw             m1, m5
1234    packssdw             m2, m6
1235    packssdw             m3, m7
1236    vpermi2d             m8, m0, m2
1237    vpermt2d             m0, m9, m2
1238    vpermi2d            m10, m1, m3
1239    vpermi2d            m11, m1, m3
1240    punpckhwd            m3, m8, m0
1241    punpcklwd            m1, m8, m0
1242    punpckhwd            m4, m10, m11
1243    punpcklwd            m2, m10, m11
1244    punpckldq            m0, m1, m2
1245    punpckhdq            m1, m2
1246    punpckldq            m2, m3, m4
1247    punpckhdq            m3, m4
1248    ret
1249
; Expand INV_TXFM_16X8_FN once for each (type1, type2) pair whose first type
; is adst; the adst/identity pair passes -21 as eob_offset.
1250INV_TXFM_16X8_FN adst, dct
1251INV_TXFM_16X8_FN adst, identity, -21
1252INV_TXFM_16X8_FN adst, flipadst
1253INV_TXFM_16X8_FN adst, adst
1254
; iadst_16x8_internal_10bpc
; First pass: .main_pass1 loads and pre-scales the coefficients and runs the
; 16-point inverse ADST on dword lanes; the results get a pd_1 bias, are
; shifted right by 1 and handed to idct_16x8's .pass1_end2 for transposition.
; Second pass (.pass2): .main_pass2 wraps the 8 bpc kernel; the rows are then
; written with idct_16x8's .write_16x4_noround.
; Local entry points reused by iflipadst_16x8: .main_pass1, .pass1_end,
; .main_pass2.
1255cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1256%undef cmp
1257    call .main_pass1
    ; Add the pd_1 bias and compact the eight result registers
    ; (m0-m3, m5-m8) into m0-m7. Every other register is computed as
    ; (1 - x), i.e. negated and biased in one instruction.
1258    vpbroadcastd         m9, [o(pd_1)]
1259    paddd                m0, m9
1260    psubd                m1, m9, m1
1261    paddd                m2, m9
1262    psubd                m3, m9, m3
1263    paddd                m4, m9, m5
1264    psubd                m5, m9, m6
1265    paddd                m6, m9, m7
1266    psubd                m7, m9, m8
1267.pass1_end:
    ; note: m8/m9 are loaded the opposite way round from idct_16x8's
    ; .pass1_end (here m9 = permA, m8 = permA >> 8)
1268    mova                 m9, [o(permA)]
1269    psrlq                m8, m9, 8
1270    REPX       {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
1271    jmp m(idct_16x8_internal_10bpc).pass1_end2
1272.pass2:
1273    call .main_pass2
    ; m11 = qword-permute indices set up by .main_pass2; the second group is
    ; written through a tail jump
1274    vpermq               m8, m11, m0
1275    vpermq               m9, m11, m1
1276    call m(idct_16x8_internal_10bpc).write_16x4_noround
1277    vpermq               m8, m11, m2
1278    vpermq               m9, m11, m3
1279    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
1280ALIGN function_align
1281.main_pass1:
    ; Load the first four 64-byte coefficient rows, scale them as
    ; (x * pd_2896 + pd_2048) >> 12, clear them in memory and arrange them with
    ; permB-based qword permutes. With eob >= 43 the remaining four rows are
    ; processed the same way before falling into .main; otherwise .main_fast
    ; is used.
1282    vpbroadcastd        m12, [o(pd_2896)]
1283    pmulld               m2, m12, [cq+64*0]
1284    pmulld               m7, m12, [cq+64*1]
1285    pmulld               m1, m12, [cq+64*2]
1286    pmulld               m5, m12, [cq+64*3]
1287    vpbroadcastd        m13, [o(pd_2048)]
1288    pxor                 m4, m4
1289    mova                m10, [o(permB)]
1290    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
1291    REPX     {paddd x, m13}, m2, m7, m1, m5
1292    psrlq                m6, m10, 32
1293    REPX     {psrad x, 12 }, m2, m7, m1, m5
1294    mova                 m0, m6
1295    vpermi2q             m0, m2, m7  ;  0  2
1296    vpermt2q             m7, m10, m2 ;  3  1
1297    mova                 m2, m6
1298    vpermi2q             m2, m1, m5  ;  4  6
1299    vpermt2q             m5, m10, m1 ;  7  5
1300    cmp                eobd, 43
1301    jl .main_fast
1302    pmulld               m8, m12, [cq+64*4]
1303    pmulld               m3, m12, [cq+64*5]
1304    pmulld               m9, m12, [cq+64*6]
1305    pmulld               m1, m12, [cq+64*7]
1306    REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
1307    REPX     {paddd x, m13}, m8, m3, m9, m1
1308    REPX     {psrad x, 12 }, m8, m3, m9, m1
1309    mova                 m4, m6
1310    vpermi2q             m4, m8, m3  ;  8 10
1311    vpermt2q             m3, m10, m8 ; 11  9
1312    vpermi2q             m6, m9, m1  ; 12 14
1313    vpermt2q             m1, m10, m9 ; 15 13
1314.main:
    ; First rotation stage on all 16 inputs; rounding is deferred to .main2
    ; (the rounding argument of ITX_MULSUB_2D is _).
1315    ITX_MULSUB_2D         1, 0, 8, 9, 10, _,  201_995,  4091_3973, 1
1316    ITX_MULSUB_2D         3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
1317    ITX_MULSUB_2D         5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
1318    ITX_MULSUB_2D         7, 6, 8, 9, 10, _, 3857_4052, 1380_601
1319    jmp .main2
1320.main_fast:
    ; Reduced-input variant of the first stage: only m0, m2, m5 and m7 carry
    ; inputs, so each rotation is a pair of plain multiplies by per-lane
    ; constant pairs.
1321    vbroadcasti32x4      m1, [o(pd_4091_3973)]
1322    vbroadcasti32x4      m8, [o(pd_201_995)]
1323    vbroadcasti32x4      m3, [o(pd_3703_3290)]
1324    vbroadcasti32x4      m9, [o(pd_1751_2440)]
1325    vbroadcasti32x4      m4, [o(pd_2751_2106)]
1326    vbroadcasti32x4     m10, [o(pd_3035_3513)]
1327    vbroadcasti32x4      m6, [o(pd_1380_601)]
1328    vbroadcasti32x4     m11, [o(pd_3857_4052)]
1329    pmulld               m1, m0
1330    pmulld               m0, m8
1331    pmulld               m3, m2
1332    pmulld               m2, m9
1333    pmulld               m4, m5
1334    pmulld               m5, m10
1335    pmulld               m6, m7
1336    pmulld               m7, m11
1337.main2:
1338    vpbroadcastd        m14, [o(clip_18b_min)]
1339    vpbroadcastd        m15, [o(clip_18b_max)]
    ; round all first-stage products; m1 and m3 are negated at the same time
    ; (x = m13 - x)
1340    REPX  {psubd x, m13, x}, m1, m3
1341    REPX  {paddd x, m13   }, m0, m2, m4, m5, m6, m7
1342    REPX  {psrad x, 12    }, m0, m4, m1, m5, m2, m6, m3, m7
1343    psubd                m8, m0, m4 ; t8a  t10a
1344    paddd                m0, m4     ; t0a  t2a
1345    psubd                m4, m1, m5 ; t9a  t11a
1346    paddd                m1, m5     ; t1a  t3a
1347    psubd                m5, m2, m6 ; t12a t14a
1348    paddd                m2, m6     ; t4a  t6a
1349    psubd                m6, m3, m7 ; t13a t15a
1350    paddd                m3, m7     ; t5a  t7a
1351    REPX    {pmaxsd x, m14}, m8, m4, m5, m6
1352    REPX    {pminsd x, m15}, m8, m4, m5, m6
1353    vbroadcasti32x4     m11, [o(pd_4017_2276)]
1354    vbroadcasti32x4     m10, [o(pd_799_3406)]
1355    ITX_MULSUB_2D         8, 4, 7, 9, _, 13, 10, 11
1356    ITX_MULSUB_2D         6, 5, 7, 9, _, 13, 11, 10
1357    REPX    {pmaxsd x, m14}, m0, m2, m1, m3
1358    REPX    {pminsd x, m15}, m0, m2, m1, m3
1359    psubd                m7, m0, m2 ; t4   t6
1360    paddd                m0, m2     ; t0   t2
1361    psubd                m2, m1, m3 ; t5   t7
1362    paddd                m1, m3     ; t1   t3
1363    psubd                m3, m4, m6 ; t12a t14a
1364    paddd                m4, m6     ; t8a  t10a
1365    psubd                m6, m8, m5 ; t13a t15a
1366    paddd                m8, m5     ; t9a  t11a
1367    REPX    {pmaxsd x, m14}, m7, m3, m2, m6
1368    REPX    {pminsd x, m15}, m7, m3, m2, m6
    ; regroup qword halves so that the terms sharing a rotation sit in the
    ; same register
1369    punpcklqdq           m5, m3, m7 ; t12a t4
1370    punpckhqdq           m3, m7     ; t14a t6
1371    punpckhqdq           m7, m6, m2 ; t15a t7
1372    punpcklqdq           m6, m2     ; t13a t5
1373    vpbroadcastd        m11, [o(pd_1567)]
1374    vpbroadcastd        m10, [o(pd_3784)]
1375    ITX_MULSUB_2D         7, 3, 2, 9, 10, 13, 10, 11
1376    ITX_MULSUB_2D         5, 6, 2, 9, 10, 13, 11, 10
1377    REPX    {pmaxsd x, m14}, m0, m4, m1, m8
1378    REPX    {pminsd x, m15}, m0, m4, m1, m8
1379    punpckhqdq           m2, m4, m0 ; t10a t2
1380    punpcklqdq           m4, m0     ; t8a  t0
1381    punpckhqdq           m0, m8, m1 ; t11a t3
1382    punpcklqdq           m8, m1     ; t9a  t1
1383    paddd                m1, m6, m7 ;  out2  -out3
1384    psubd                m6, m7     ; t14a t6
1385    paddd                m7, m5, m3 ; -out13  out12
1386    psubd                m5, m3     ; t15a t7
1387    psubd                m3, m8, m0 ; t11  t3a
1388    paddd                m8, m0     ;  out14 -out15
1389    paddd                m0, m4, m2 ; -out1   out0
1390    psubd                m4, m2     ; t10  t2a
1391    REPX    {pmaxsd x, m14}, m6, m5, m3, m4
    ; k1 = 0x3333 selects dword lanes 0-1 of every 128-bit lane, i.e. the low
    ; qword of each lane; it is used below to flip the sign of that half only
1392    mov                 r6d, 0x3333
1393    REPX    {pminsd x, m15}, m6, m5, m3, m4
1394    kmovw                k1, r6d
1395    REPX    {pmulld x, m12}, m6, m5, m3, m4
1396    pxor                 m9, m9
    ; masked x = 0 - x: negate the k1-selected lanes, leave the others as is
1397    REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
1398    paddd                m6, m13
1399    paddd                m4, m13
1400    paddd                m2, m6, m5 ; -out5   out4
1401    psubd                m6, m5     ;  out10 -out11
1402    psubd                m5, m4, m3 ; -out9   out8
1403    paddd                m3, m4     ;  out6  -out7
1404    REPX     {psrad  x, 12}, m2, m3, m5, m6
1405    REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
1406    ret
1407ALIGN function_align
1408.main_pass2:
    ; Second-pass wrapper around the 8 bpc kernel.
    ; m4/m5 = m0/m1 with the two qwords of each 128-bit lane swapped
    ; (pshufd q1032). After the call, m0/m1 are scaled by m6 -- m6 is not set
    ; in this function, so it is presumably provided by the 8 bpc routine --
    ; TODO confirm. Finally the registers the caller needs for
    ; write_16x4_noround are set up: m11 = permute indices, m13 = upper pixel
    ; clamp, m12 = 0, r6 = stride*3.
1409    lea                  r5, [o_base_8bpc]
1410    pshufd               m4, m0, q1032
1411    pshufd               m5, m1, q1032
1412    call m(iadst_16x8_internal_8bpc).main_pass2
1413    movshdup            m11, [permC]
1414    pmulhrsw             m0, m6
1415    pmulhrsw             m1, m6
1416    vpbroadcastd        m13, [pixel_10bpc_max]
1417    pxor                m12, m12
1418    lea                  r6, [strideq*3]
1419    ret
1420
; Expand INV_TXFM_16X8_FN once for each (type1, type2) pair whose first type
; is flipadst; the flipadst/identity pair passes -21 as eob_offset.
1421INV_TXFM_16X8_FN flipadst, dct
1422INV_TXFM_16X8_FN flipadst, identity, -21
1423INV_TXFM_16X8_FN flipadst, adst
1424INV_TXFM_16X8_FN flipadst, flipadst
1425
; iflipadst_16x8_internal_10bpc
; Flipped variant of iadst_16x8_internal_10bpc: it calls the same .main_pass1
; and .main_pass2 routines and only changes which result goes to which
; register (first pass) and the order in which rows are written (second pass).
1426cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1427    call m(iadst_16x8_internal_10bpc).main_pass1
    ; Bias with pd_1 (and negate where (1 - x) is used) while compacting
    ; m0-m3, m5-m8 into m0-m7 in the reverse order of the non-flipped
    ; function. Each source register is read before it is overwritten, so the
    ; instruction order matters.
1428    vpbroadcastd         m9, [o(pd_1)]
1429    psubd                m4, m9, m3
1430    paddd                m3, m9, m5
1431    paddd                m5, m9, m2
1432    psubd                m2, m9, m6
1433    psubd                m6, m9, m1
1434    paddd                m1, m9, m7
1435    paddd                m7, m9, m0
1436    psubd                m0, m9, m8
1437    jmp m(iadst_16x8_internal_10bpc).pass1_end
1438.pass2:
1439    call m(iadst_16x8_internal_10bpc).main_pass2
    ; shift the index vector by one byte to select the alternative permutation,
    ; then write m3, m2, m1, m0 (reverse of the non-flipped order)
1440    psrlq               m11, 8
1441    vpermq               m8, m11, m3
1442    vpermq               m9, m11, m2
1443    call m(idct_16x8_internal_10bpc).write_16x4_noround
1444    vpermq               m8, m11, m1
1445    vpermq               m9, m11, m0
1446    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
1447
; Expand INV_TXFM_16X8_FN once for each (type1, type2) pair whose first type
; is identity (default eob_offset of 0 for all four).
1448INV_TXFM_16X8_FN identity, dct
1449INV_TXFM_16X8_FN identity, adst
1450INV_TXFM_16X8_FN identity, flipadst
1451INV_TXFM_16X8_FN identity, identity
1452
; iidentity_16x8_internal_10bpc
; First pass: load the coefficients via idct_8x16's .load2, multiply them by
; pd_5793, clear the coefficient buffer, round via idct_8x16's .round (with
; m13 = pd_3072; both helpers are outside this view) and transpose to words.
; Second pass (.pass2): only sets up the permutation and the pw_4096
; multiplier, then reuses idct_16x8's .end to scale and store.
1453cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1454    call m(idct_8x16_internal_10bpc).load2
1455    vpbroadcastd         m8, [o(pd_5793)]
1456    vpbroadcastd        m13, [o(pd_3072)]
1457    pxor                m10, m10
1458    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
1459    REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
1460    call m(idct_8x16_internal_10bpc).round
    ; index vectors for .transpose_16x8: permA shifted by two bytes (m8, m10)
    ; and by three bytes (m9, m11)
1461    psrlq                m8, [o(permA)], 16
1462    psrlq                m9, m8, 8
1463    mova                m10, m8
1464    mova                m11, m9
1465    call m(idct_16x8_internal_10bpc).transpose_16x8
1466    jmp                tx2q
1467.pass2:
    ; same permutation for both register pairs (m5 = m4)
1468    movshdup             m4, [o(permC)]
1469    vpbroadcastd        m11, [o(pw_4096)]
1470    mova                 m5, m4
1471    jmp m(idct_16x8_internal_10bpc).end
1472
; INV_TXFM_16X16_FN type1, type2[, eob_offset = 0]
; Forwards its arguments, plus the block size 16x16, to INV_TXFM_FN (defined
; outside this view). For the dct_dct pair only, it additionally emits the
; DC-only prologue below, which finishes in the 16x8 dct_dct function's
; .dconly2 code.
1473%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
1474    INV_TXFM_FN          %1, %2, %3, 16x16
1475%ifidn %1_%2, dct_dct
    ; r6d = dc * 181; the dc coefficient in memory is then overwritten with
    ; eobd (0 on this path, per the existing comment)
1476    imul                r6d, [cq], 181
1477    mov                [cq], eobd ; 0
    ; r3d becomes the row counter consumed by .dconly_loop. NOTE(review): this
    ; relies on r3d being eobd and therefore 0 here, giving r3d = 16 -- confirm
    ; against INV_TXFM_FN.
1478    or                  r3d, 16
    ; r6d = (r6d + 640) >> 10, then continue with the shared final scaling and
    ; store loop
1479    add                 r6d, 640
1480    sar                 r6d, 10
1481    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
1482%endif
1483%endmacro
1484
; Expand INV_TXFM_16X16_FN once for each (type1, type2) pair whose first type
; is dct. The dct/dct expansion also emits the DC-only prologue; the
; dct/identity pair passes 28 as eob_offset.
1485INV_TXFM_16X16_FN dct, dct
1486INV_TXFM_16X16_FN dct, identity, 28
1487INV_TXFM_16X16_FN dct, flipadst
1488INV_TXFM_16X16_FN dct, adst
1489
1490cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1491%undef cmp
1492    vpbroadcastd        m13, [o(pd_2048)]
1493    vpbroadcastd        m12, [o(pd_2896)]
1494    vpbroadcastd        m14, [o(clip_18b_min)]
1495    vpbroadcastd        m15, [o(clip_18b_max)]
1496    cmp                eobd, 36
1497    jl .fast
1498    mova                 m0, [cq+64* 0]
1499    mova                 m1, [cq+64* 2]
1500    mova                 m2, [cq+64* 4]
1501    mova                 m3, [cq+64* 6]
1502    mova                 m4, [cq+64* 8]
1503    mova                 m5, [cq+64*10]
1504    mova                 m6, [cq+64*12]
1505    mova                 m7, [cq+64*14]
1506%if WIN64
1507    movaps        [cq+16*0], xmm6
1508    movaps        [cq+16*1], xmm7
1509%endif
1510    call m(idct_8x16_internal_10bpc).main
1511    mova                m16, [cq+64* 1]
1512    mova                m17, [cq+64* 3]
1513    mova                m18, [cq+64* 5]
1514    mova                m19, [cq+64* 7]
1515    mova                m20, [cq+64* 9]
1516    mova                m21, [cq+64*11]
1517    mova                m22, [cq+64*13]
1518    mova                m23, [cq+64*15]
1519    call .main
1520    call .main_end
1521.pass1_end:
1522%if WIN64
1523    movaps             xmm6, [cq+16*0]
1524    movaps             xmm7, [cq+16*1]
1525%endif
1526    vzeroupper
1527.pass1_end2:
1528    call .main_end3
1529.pass1_end3:
1530    mov                 r6d, 64*12
1531    pxor                 m8, m8
1532.zero_loop:
1533    mova       [cq+r6+64*3], m8
1534    mova       [cq+r6+64*2], m8
1535    mova       [cq+r6+64*1], m8
1536    mova       [cq+r6+64*0], m8
1537    sub                 r6d, 64*4
1538    jge .zero_loop
1539    jmp                tx2q
1540.pass2:
1541    lea                  r5, [o_base_8bpc]
1542    call m(idct_16x16_internal_8bpc).main
1543    movshdup            m12, [permC]
1544    vpbroadcastd        m11, [pw_2048]
1545    psrlq               m13, m12, 8
1546    vpermq               m8, m12, m0
1547    vpermq               m0, m13, m7
1548    vpermq               m7, m13, m1
1549    vpermq               m1, m12, m6
1550    vpermq               m6, m12, m2
1551    vpermq               m2, m13, m5
1552    vpermq               m5, m13, m3
1553    vpermq               m3, m12, m4
1554.pass2_end:
1555    lea                  r6, [strideq*3]
1556    vpbroadcastd        m13, [pixel_10bpc_max]
1557    pxor                m12, m12
1558    pmulhrsw             m8, m11, m8
1559    pmulhrsw             m9, m11, m7
1560    call m(idct_16x8_internal_10bpc).write_16x4_noround
1561    pmulhrsw             m8, m11, m6
1562    pmulhrsw             m9, m11, m5
1563    call m(idct_16x8_internal_10bpc).write_16x4_noround
1564    pmulhrsw             m8, m11, m3
1565    pmulhrsw             m9, m11, m2
1566    call m(idct_16x8_internal_10bpc).write_16x4_noround
1567    pmulhrsw             m8, m11, m1
1568    pmulhrsw             m9, m11, m0
1569    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
1570.fast:
1571    mova                ym0, [cq+64*0]
1572    mova                ym2, [cq+64*4]
1573    movshdup             m8, [o(permB)]
1574    mova                ym1, [cq+64*2]
1575    mova                ym3, [cq+64*6]
1576    mova                ym4, [cq+64*1]
1577    mova                ym5, [cq+64*3]
1578    mova                ym6, [cq+64*5]
1579    mova                ym7, [cq+64*7]
1580    vpermt2q             m0, m8, m2 ; 0 4
1581    vpermt2q             m1, m8, m3 ; 2 6
1582    vpermt2q             m4, m8, m5 ; 1 3
1583    vpermt2q             m7, m8, m6 ; 7 5
1584    call m(idct_8x8_internal_10bpc).main_fast
1585    call m(idct_16x8_internal_10bpc).main_fast
1586    vpbroadcastd        m11, [o(pd_2)]
1587    call m(idct_8x16_internal_10bpc).main_end2
1588    mova                 m8, [o(permA)]
1589    psrlq                m9, m8, 8
1590    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
1591ALIGN function_align
1592.main_fast2_rect2:
1593    REPX     {paddd x, m13}, m16, m17
1594    REPX     {psrad x, 12 }, m16, m17
1595.main_fast2:
1596    pmulld              m22, m16, [o(pd_4076)] {1to16} ; t15a
1597    pmulld               m9, m16, [o(pd_401)] {1to16}  ; t8a
1598    pmulld              m18, m17, [o(pd_1189)] {1to16} ; t11a
1599    pmulld              m17, [o(pd_3920)] {1to16}      ; t12a
1600    psubd               m18, m13, m18
1601    REPX    {paddd  x, m13}, m22, m9, m17
1602    REPX    {psrad  x, 12 }, m18, m22, m9, m17
1603
1604    mova                m20, m9
1605    mova                m16, m18
1606    mova                m23, m22
1607    mova                m19, m17
1608    jmp .main3
1609.main_fast_rect2:
1610    REPX     {paddd x, m13}, m16, m17, m18, m19
1611    REPX     {psrad x, 12 }, m16, m17, m18, m19
1612.main_fast:
1613    pmulld              m23, m16, [o(pd_4076)] {1to16} ; t15a
1614    pmulld              m16, [o(pd_401)] {1to16}       ; t8a
1615    pmulld              m20, m19, [o(pd_2598)] {1to16} ; t9a
1616    pmulld              m19, [o(pd_3166)] {1to16}      ; t14a
1617    pmulld              m22, m17, [o(pd_1189)] {1to16} ; t11a
1618    pmulld              m17, [o(pd_3920)] {1to16}      ; t12a
1619    pmulld              m21, m18, [o(pd_3612)] {1to16} ; t13a
1620    pmulld              m18, [o(pd_1931)] {1to16}      ; t10a
1621    psubd               m20, m13, m20
1622    psubd               m22, m13, m22
1623    call .round2
1624    jmp .main2
1625.main_rect2:
1626    call .round
1627.main:
1628    ITX_MULSUB_2D        16, 23, 7, 9, 10, _,  401, 4076 ; t8a,  t15a
1629    ITX_MULSUB_2D        20, 19, 7, 9, 10, _, 3166, 2598 ; t9a,  t14a
1630    ITX_MULSUB_2D        22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
1631    ITX_MULSUB_2D        18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
1632    call .round
1633.main2:
1634    paddd                m9, m20, m16 ; t8
1635    psubd               m20, m16, m20 ; t9
1636    psubd               m16, m22, m18 ; t10
1637    paddd               m18, m22      ; t11
1638    paddd               m22, m23, m19 ; t15
1639    psubd               m23, m19      ; t14
1640    psubd               m19, m17, m21 ; t13
1641    paddd               m17, m21      ; t12
1642    REPX    {pmaxsd x, m14}, m20, m23, m16, m19
1643    REPX    {pminsd x, m15}, m20, m23, m16, m19
1644    REPX    {pmaxsd x, m14}, m9, m18, m22, m17
1645    REPX    {pminsd x, m15}, m9, m18, m22, m17
1646.main3:
1647    vpbroadcastd        m11, [o(pd_3784)]
1648    vpbroadcastd        m10, [o(pd_1567)]
1649    ITX_MULSUB_2D        23, 20, 21, 7, _, 13, 10, 11
1650    ITX_MULSUB_2D        19, 16, 21, 7, _, 13, 10, 11, 2
1651    paddd               m21, m20, m19 ; t14
1652    psubd               m20, m19      ; t13
1653    psubd               m19, m9, m18  ; t11a
1654    paddd                m9, m18      ; t8a
1655    psubd               m18, m23, m16 ; t10
1656    paddd               m16, m23      ; t9
1657    psubd               m23, m22, m17 ; t12a
1658    paddd               m22, m17      ; t15a
1659    REPX    {pmaxsd x, m14}, m20, m23, m18, m19
1660    REPX    {pminsd x, m15}, m20, m23, m18, m19
1661    REPX    {pmulld x, m12}, m20, m23, m18, m19
1662    psubd                m7, m0, m6   ; dct8 out7
1663    paddd                m0, m6       ; dct8 out0
1664    psubd                m6, m1, m5   ; dct8 out6
1665    paddd                m1, m5       ; dct8 out1
1666    REPX    {pmaxsd x, m14}, m7, m0, m6, m1
1667    psubd                m5, m2, m4   ; dct8 out5
1668    paddd                m2, m4       ; dct8 out2
1669    REPX    {pminsd x, m15}, m7, m0, m6, m1
1670    psubd                m4, m3, m8   ; dct8 out4
1671    paddd                m3, m8       ; dct8 out3
1672    REPX    {pmaxsd x, m14}, m5, m2, m4, m3
1673    paddd               m20, m13
1674    paddd               m23, m13
1675    REPX    {pminsd x, m15}, m5, m2, m4, m3
1676    psubd               m17, m20, m18 ; t10a
1677    paddd               m20, m18      ; t13a
1678    REPX    {pmaxsd x, m14}, m22, m21, m16, m9
1679    psubd               m18, m23, m19 ; t11
1680    paddd               m19, m23      ; t12
1681    REPX    {pminsd x, m15}, m22, m21, m16, m9
1682    REPX    {psrad  x, 12 }, m20, m19, m18, m17
1683    ret
1684.main_end:
1685    vpbroadcastd        m11, [o(pd_2)]
1686.main_end2:
1687    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
1688    psubd               m23, m0, m22 ; out15
1689    paddd                m0, m22     ; out0
1690    psubd               m22, m1, m21 ; out14
1691    paddd                m1, m21     ; out1
1692    psubd               m21, m2, m20 ; out13
1693    paddd                m2, m20     ; out2
1694    psubd               m20, m3, m19 ; out12
1695    paddd                m3, m19     ; out3
1696    psubd               m19, m4, m18 ; out11
1697    paddd                m4, m18     ; out4
1698    psubd               m18, m5, m17 ; out10
1699    paddd                m5, m17     ; out5
1700    psubd               m17, m6, m16 ; out9
1701    paddd                m6, m16     ; out6
1702    psubd               m16, m7, m9  ; out8
1703    paddd                m7, m9      ; out7
1704    REPX   {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
1705                             m4, m20, m5, m21, m6, m22, m7, m23
1706    packssdw             m0, m16
1707    packssdw             m1, m17
1708    packssdw             m2, m18
1709    packssdw             m3, m19
1710    packssdw             m4, m20
1711    packssdw             m5, m21
1712    packssdw             m6, m22
1713    packssdw             m7, m23
1714    ret
1715.main_end3:
1716    punpckhwd            m8, m0, m1
1717    punpcklwd            m0, m1
1718    punpckhwd            m1, m2, m3
1719    punpcklwd            m2, m3
1720    punpckhwd            m3, m4, m5
1721    punpcklwd            m4, m5
1722    punpcklwd            m5, m6, m7
1723    punpckhwd            m6, m7
1724    punpckhdq            m7, m0, m2
1725    punpckldq            m0, m2
1726    punpckhdq            m2, m8, m1
1727    punpckldq            m8, m1
1728    punpckhdq            m1, m4, m5
1729    punpckldq            m4, m5
1730    punpckhdq            m5, m3, m6
1731    punpckldq            m3, m6
1732    vshufi32x4           m6, m0, m4, q3232
1733    vinserti32x8         m0, ym4, 1
1734    vinserti32x8         m4, m8, ym3, 1
1735    vshufi32x4           m8, m3, q3232
1736    vinserti32x8         m3, m7, ym1, 1
1737    vshufi32x4           m7, m1, q3232
1738    vshufi32x4           m1, m2, m5, q3232
1739    vinserti32x8         m2, ym5, 1
1740    vshufi32x4           m5, m7, m1, q2020 ; 10 11
1741    vshufi32x4           m7, m1, q3131     ; 14 15
1742    vshufi32x4           m1, m3, m2, q2020 ;  2  3
1743    vshufi32x4           m3, m2, q3131     ;  6  7
1744    vshufi32x4           m2, m0, m4, q3131 ;  4  5
1745    vshufi32x4           m0, m4, q2020     ;  0  1
1746    vshufi32x4           m4, m6, m8, q2020 ;  8  9
1747    vshufi32x4           m6, m8, q3131     ; 12 13
1748    ret
1749ALIGN function_align
1750.round:
1751    paddd               m20, m13
1752    paddd               m22, m13
1753.round2:
1754    paddd               m16, m13
1755    paddd               m18, m13
1756.round3:
1757    REPX     {psrad x, 12 }, m16, m18, m20, m22
1758    REPX     {paddd x, m13}, m17, m19, m21, m23
1759    REPX     {psrad x, 12 }, m17, m19, m21, m23
1760    ret
1761
; Instantiate INV_TXFM_16X16_FN once per (adst, <second type>) pairing.
; NOTE(review): the macro is defined outside this chunk; presumably each
; invocation emits an entry point that dispatches into
; iadst_16x16_internal_10bpc below -- confirm against the macro definition.
1762INV_TXFM_16X16_FN adst, dct
1763INV_TXFM_16X16_FN adst, flipadst
1764INV_TXFM_16X16_FN adst, adst
1765
; iadst_16x16_internal_10bpc -- internal two-pass routine for the 16x16 ADST.
; The first pass starts at the cglobal label, works on 32-bit coefficients
; read from cq, packs the results to 16-bit words and leaves through tx2q
; (or through the shared idct_16x16 pass-1 tail). The second pass is entered
; at .pass2.
; NOTE(review): the o()/r5 base, cq, eobd and tx2q are set up by a caller
; that is outside this chunk.
1766cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1767%undef cmp
    ; eob < 36 selects the reduced path: .main_pass1_fast only loads the low
    ; 256 bits of coefficient rows 0-7.
1768    cmp                eobd, 36
1769    jl .fast
1770    call .main_pass1
    ; .main_pass1 returns 32-bit results in m0-m3 and m5-m8 (out0-out7; m4
    ; holds the pd_2 constant) and in m16-m23 (out8-out15). Pack each low/high
    ; pair to 16-bit words with signed saturation, compacting into m0-m7.
1771    packssdw             m0, m16
1772    packssdw             m1, m17
1773    packssdw             m2, m18
1774    packssdw             m3, m19
1775    packssdw             m4, m5, m20
1776    packssdw             m5, m6, m21
1777    packssdw             m6, m7, m22
1778    packssdw             m7, m8, m23
1779    jmp m(idct_16x16_internal_10bpc).pass1_end
1780.fast:
1781    call .main_pass1_fast
    ; m9 = 2 is the rounding bias for the ">> 2" in .pass1_fast_end.
    ; "paddd x, m9[, y]" produces y + 2, "psubd x, m9, y" produces 2 - y, so
    ; the latter negates and biases in a single instruction. The eight results
    ; (arriving in m0-m3 and m5-m8) are compacted into m0-m7 on the way.
    ; NOTE(review): which inputs arrive negated is decided by
    ; iadst_16x8_internal_10bpc.main_fast, which is outside this chunk.
1782    vpbroadcastd         m9, [o(pd_2)]
1783    paddd                m0, m9
1784    psubd                m1, m9, m1
1785    paddd                m2, m9
1786    psubd                m3, m9, m3
1787    paddd                m4, m9, m5
1788    psubd                m5, m9, m6
1789    paddd                m6, m9, m7
1790    psubd                m7, m9, m8
    ; Shared tail, also jumped to by iflipadst_16x16_internal_10bpc.fast.
    ; m9 = permA table, m8 = the same table with every qword shifted right by
    ; 8 bits; then finish the rounding started above with an arithmetic >> 2.
1791.pass1_fast_end:
1792    mova                 m9, [o(permA)]
1793    psrlq                m8, m9, 8
1794    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
    ; Entry point used by idct_16x16_internal_10bpc.fast as well, which
    ; arrives with the two tables loaded the other way round (m8 = permA,
    ; m9 = shifted copy). m10/m11 receive copies of m9/m8 before the call --
    ; presumably the permutation inputs of transpose_16x8 (not in this chunk).
1795.pass1_fast_end2:
1796    mova                m10, m9
1797    mova                m11, m8
1798    call m(idct_16x8_internal_10bpc).transpose_16x8
    ; Zero m4-m7, then clear the low 32 bytes of coefficient rows 0-7, i.e.
    ; exactly the bytes the fast-path loaders read from cq.
1799    pxor                 m4, m4
1800    REPX       {mova x, m4}, m5, m6, m7
1801    REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
1802    jmp                tx2q
    ; Second pass. r5 is re-pointed at the 8bpc constant base and the 8bpc
    ; kernel main_pass2b is called (presumably operating on the packed 16-bit
    ; intermediates -- its body is outside this chunk).
1803.pass2:
1804    lea                  r5, [o_base_8bpc]
1805    call m(iadst_16x16_internal_8bpc).main_pass2b
    ; m12/m13 are the qword permutation index vectors (permC with odd dwords
    ; duplicated, and that value shifted right by 8 bits). m11 is the word
    ; multiplier that idct_16x16_internal_10bpc.pass2_end applies with
    ; pmulhrsw; NOTE(review): judging by its name it alternates +2048/-2048.
1806    movshdup            m12, [permC]
1807    mova                m11, [pw_2048_m2048]
1808    psrlq               m13, m12, 8
    ; Permute qwords and rotate the registers into the order pass2_end reads
    ; them: (m8, m7), (m6, m5), (m3, m2), (m1, m0).
1809    vpermq               m8, m13, m0
1810    vpermq               m0, m12, m7
1811    vpermq               m7, m13, m1
1812    vpermq               m1, m12, m6
1813    vpermq               m6, m13, m2
1814    vpermq               m2, m12, m5
1815    vpermq               m5, m13, m3
1816    vpermq               m3, m12, m4
1817    jmp m(idct_16x16_internal_10bpc).pass2_end
; .main_pass1 -- full first-pass ADST over all 16 coefficient rows
; (cq+64*0 .. cq+64*15), computed on 32-bit lanes.
; Results on return (already biased and shifted):
;   m0-m3, m5-m8 = out0-out7, m16-m23 = out8-out15.
; Intermediates are clamped to [clip_18b_min, clip_18b_max] (m14/m15) after
; each add/sub stage. Values whose comment carries a leading '-' are held
; negated until the final "psubd x, bias, x" step flips the sign.
; Used by both the adst and flipadst 16x16 first passes.
1818ALIGN function_align
1819.main_pass1:
1820    mova                 m0, [cq+64* 0]
    ; WIN64 only: xmm6/xmm7 (callee-saved in the Microsoft x64 ABI) are
    ; stashed in the first 32 bytes of the coefficient buffer, after row 0 has
    ; been loaded into m0 above.
    ; NOTE(review): the matching restore is not in this chunk.
1821%if WIN64
1822    movaps        [cq+16*0], xmm6
1823    movaps        [cq+16*1], xmm7
1824%endif
1825    mova                m23, [cq+64*15]
    ; m13 = rounding constant passed to every ITX_MULSUB_2D below.
1826    vpbroadcastd        m13, [o(pd_2048)]
    ; Stage 1: eight rotations, each pairing row i with row 15-i.
1827    ITX_MULSUB_2D        23,  0, 8, 9, 10, 13,  201, 4091 ; t1  t0
1828    mova                 m7, [cq+64* 7]
1829    mova                m16, [cq+64* 8]
1830    ITX_MULSUB_2D         7, 16, 8, 9, 10, 13, 3035, 2751 ; t9  t8
1831    mova                 m2, [cq+64* 2]
1832    mova                m21, [cq+64*13]
1833    ITX_MULSUB_2D        21,  2, 8, 9, 10, 13,  995, 3973 ; t3  t2
1834    mova                 m5, [cq+64* 5]
1835    mova                m18, [cq+64*10]
1836    ITX_MULSUB_2D         5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
1837    mova                 m4, [cq+64* 4]
1838    mova                m19, [cq+64*11]
1839    ITX_MULSUB_2D        19,  4, 8, 9, 10, 13, 1751, 3703 ; t5  t4
1840    mova                 m3, [cq+64* 3]
1841    mova                m20, [cq+64*12]
1842    ITX_MULSUB_2D         3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
1843    mova                 m6, [cq+64* 6]
1844    mova                m17, [cq+64* 9]
1845    ITX_MULSUB_2D        17,  6, 8, 9, 10, 13, 2440, 3290 ; t7  t6
1846    mova                 m1, [cq+64* 1]
1847    mova                m22, [cq+64*14]
1848    ITX_MULSUB_2D         1, 22, 8, 9, 10, 13, 4052,  601 ; t15 t14
    ; Stage 2: butterflies between t(i) and t(i+8); every group of four
    ; results is clamped with a pmaxsd/pminsd pair, interleaved with the next
    ; group's arithmetic.
1849    vpbroadcastd        m14, [o(clip_18b_min)]
1850    vpbroadcastd        m15, [o(clip_18b_max)]
1851    psubd                m9, m23, m7  ; t9a
1852    paddd               m23, m7       ; t1a
1853    psubd                m7, m2, m18  ; t10a
1854    paddd               m18, m2       ; t2a
1855    REPX    {pmaxsd x, m14}, m9, m23, m7, m18
1856    psubd                m2, m17, m1  ; t15a
1857    paddd               m17, m1       ; t7a
1858    REPX    {pminsd x, m15}, m9, m23, m7, m18
1859    psubd                m1, m21, m5  ; t11a
1860    paddd               m21, m5       ; t3a
1861    REPX    {pmaxsd x, m14}, m2, m17, m1, m21
1862    psubd                m5, m4, m20  ; t12a
1863    paddd                m4, m20      ; t4a
1864    REPX    {pminsd x, m15}, m2, m17, m1, m21
1865    psubd               m20, m19, m3  ; t13a
1866    paddd               m19, m3       ; t5a
1867    REPX    {pmaxsd x, m14}, m5, m4, m20, m19
1868    psubd                m8, m6, m22  ; t14a
1869    paddd                m6, m22      ; t6a
1870    REPX    {pminsd x, m15}, m5, m4, m20, m19
1871    psubd               m22, m0, m16  ; t8a
1872    paddd               m16, m0       ; t0a
1873    REPX    {pmaxsd x, m14}, m8, m6, m22, m16
    ; Rotations of the t8a..t15a half; coefficients come from registers
    ; (m10/m11) instead of immediates, swapped for the second of each pair.
1874    vpbroadcastd        m11, [o(pd_4017)]
1875    vpbroadcastd        m10, [o(pd_799)]
1876    REPX    {pminsd x, m15}, m8, m6, m22, m16
1877    ITX_MULSUB_2D        22,  9, 0, 3, _, 13, 10, 11 ; t9  t8
1878    ITX_MULSUB_2D        20,  5, 0, 3, _, 13, 11, 10 ; t12 t13
1879    vpbroadcastd        m11, [o(pd_2276)]
1880    vpbroadcastd        m10, [o(pd_3406)]
1881    ITX_MULSUB_2D         7,  1, 0, 3, _, 13, 10, 11 ; t11 t10
1882    ITX_MULSUB_2D         2,  8, 0, 3, _, 13, 11, 10 ; t14 t15
    ; Stage 3: butterflies between t(i) and t(i+4), clamped as before.
1883    paddd                m0, m16, m4  ; t0
1884    psubd               m16, m4       ; t4
1885    psubd                m3, m23, m19 ; t5
1886    paddd               m23, m19      ; t1
1887    REPX    {pmaxsd x, m14}, m0, m16, m3, m23
1888    psubd               m19, m18, m6  ; t6
1889    paddd               m18, m6       ; t2
1890    REPX    {pminsd x, m15}, m0, m16, m3, m23
1891    psubd                m6, m21, m17 ; t7
1892    paddd               m21, m17      ; t3
1893    REPX    {pmaxsd x, m14}, m19, m18, m6, m21
1894    paddd               m17, m9, m20  ; t8a
1895    psubd                m9, m20      ; t12a
1896    REPX    {pminsd x, m15}, m19, m18, m6, m21
1897    psubd               m20, m22, m5  ; t13a
1898    paddd               m22, m5       ; t9a
1899    REPX    {pmaxsd x, m14}, m17, m9, m20, m22
1900    psubd                m5, m1, m2   ; t14a
1901    paddd                m1, m2       ; t10a
1902    REPX    {pminsd x, m15}, m17, m9, m20, m22
1903    psubd                m2, m7, m8   ; t15a
1904    paddd                m7, m8       ; t11a
1905    REPX    {pmaxsd x, m14}, m5, m1, m2, m7
1906    vpbroadcastd        m11, [o(pd_3784)]
1907    vpbroadcastd        m10, [o(pd_1567)]
1908    REPX    {pminsd x, m15}, m5, m1, m2, m7
1909    ITX_MULSUB_2D        16,  3, 4, 8, _, 13, 10, 11 ; t5a t4a
1910    ITX_MULSUB_2D         6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
1911    ITX_MULSUB_2D         9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
1912    ITX_MULSUB_2D         2,  5, 4, 8, _, 13, 11, 10 ; t14 t15
    ; Stage 4: the sums are final outputs (some still negated), the
    ; differences feed the last multiply stage and are clamped first.
1913    psubd                m8, m0, m18  ; t2a
1914    paddd                m0, m18      ;  out0
1915    psubd               m18, m23, m21 ; t3a
1916    paddd               m23, m21      ; -out15
1917    paddd               m21, m9, m5   ; -out13
1918    psubd                m9, m5       ; t15a
1919    psubd                m5, m3, m6   ; t6
1920    paddd                m3, m6       ; -out3
1921    REPX    {pmaxsd x, m14}, m8, m18, m9, m5
1922    psubd                m6, m20, m2  ; t14a
1923    paddd                m2, m20      ;  out2
1924    paddd               m20, m16, m19 ;  out12
1925    psubd               m16, m19      ; t7
1926    REPX    {pminsd x, m15}, m8, m18, m9, m5
1927    psubd               m19, m22, m7  ; t11
1928    paddd               m22, m7       ;  out14
1929    psubd                m7, m17, m1  ; t10
1930    paddd                m1, m17      ; -out1
1931    REPX    {pmaxsd x, m14}, m6, m16, m19, m7
    ; Constants for the output stage: m12 scales the middle outputs, m4 is
    ; the bias for the ">> 2" group, m10/m11 are the biases for the ">> 13"
    ; group (m11 is used in the negating "bias - x" form).
1932    vpbroadcastd        m12, [o(pd_1448)]
1933    vpbroadcastd         m4, [o(pd_2)]
1934    vpbroadcastd        m10, [o(pd_5120)]
1935    vpbroadcastd        m11, [o(pd_5119)]
1936    REPX    {pminsd x, m15}, m6, m16, m19, m7
1937    psubd               m17, m7, m19  ; -out9
1938    paddd                m7, m19      ;  out6
1939    psubd               m19, m5, m16  ; -out11
1940    paddd                m5, m16      ;  out4
1941    REPX    {pmulld x, m12}, m17, m7, m19, m5
1942    psubd               m16, m8, m18  ;  out8
1943    paddd                m8, m18      ; -out7
1944    psubd               m18, m6, m9   ;  out10
1945    paddd                m6, m9       ; -out5
1946    REPX    {pmulld x, m12}, m16, m8, m18, m6
    ; Bias and shift. "psubd x, c, x" computes c - x, which both applies the
    ; bias and undoes the negation of the '-outN' values.
1947    REPX  {paddd x, m4    }, m0, m2, m20, m22
1948    REPX  {psubd x, m4,  x}, m1, m3, m21, m23
1949    REPX  {paddd x, m10   }, m7, m5, m16, m18
1950    REPX  {psubd x, m11, x}, m17, m19, m8, m6
1951    REPX      {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
1952    REPX      {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
1953    ret
; .main_pass1_fast -- reduced first pass used when eob < 36.
; Loads only the low 256 bits of coefficient rows 0-7, merges row pairs into
; single zmm registers with vpermt2q (index vector m8 derived from permB),
; sets m13 = pd_2048 and m12 = pd_2896 and tail-jumps into the 16x8 ADST
; kernel, whose ret returns to this routine's caller.
; NOTE(review): the output register layout is defined by
; iadst_16x8_internal_10bpc.main_fast, which is outside this chunk.
1954ALIGN function_align
1955.main_pass1_fast:
1956    mova                ym0, [cq+64*0]
1957    mova                ym1, [cq+64*2]
1958    movshdup             m8, [o(permB)]
1959    mova                ym6, [cq+64*1]
1960    mova                ym7, [cq+64*3]
1961    mova                ym2, [cq+64*4]
1962    mova                ym3, [cq+64*6]
1963    mova                ym4, [cq+64*5]
1964    mova                ym5, [cq+64*7]
1965    vpermt2q             m0, m8, m1 ; 0 2
1966    vpermt2q             m7, m8, m6 ; 3 1
1967    vpermt2q             m2, m8, m3 ; 4 6
1968    vpermt2q             m5, m8, m4 ; 7 5
1969    vpbroadcastd        m13, [o(pd_2048)]
1970    vpbroadcastd        m12, [o(pd_2896)]
1971    jmp m(iadst_16x8_internal_10bpc).main_fast
1972
; Instantiate INV_TXFM_16X16_FN once per (flipadst, <second type>) pairing.
; NOTE(review): the macro is defined outside this chunk; presumably each
; invocation emits an entry point that dispatches into
; iflipadst_16x16_internal_10bpc below -- confirm against the macro definition.
1973INV_TXFM_16X16_FN flipadst, dct
1974INV_TXFM_16X16_FN flipadst, adst
1975INV_TXFM_16X16_FN flipadst, flipadst
1976
; iflipadst_16x16_internal_10bpc -- internal two-pass routine for the 16x16
; flipped ADST. It reuses the ADST kernels of iadst_16x16_internal_10bpc and
; differs only in how the results are ordered: outputs are packed/stored in
; reverse order relative to the adst variant.
; NOTE(review): the o()/r5 base, cq, eobd and tx2q are set up by a caller
; that is outside this chunk.
1977cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1978%undef cmp
    ; eob < 36 selects the reduced path (low 256 bits of rows 0-7 only).
1979    cmp                eobd, 36
1980    jl .fast
1981    call m(iadst_16x16_internal_10bpc).main_pass1
    ; main_pass1 leaves out0-out7 in m0-m3, m5-m8 and out8-out15 in m16-m23.
    ; Pack to 16-bit words in reversed order: m0 = (out15, out7),
    ; m1 = (out14, out6), ... m7 = (out8, out0). The instruction order matters
    ; because destinations overwrite registers that are still live sources.
1982    packssdw             m4, m19, m3
1983    packssdw             m3, m20, m5
1984    packssdw             m5, m18, m2
1985    packssdw             m2, m21, m6
1986    packssdw             m6, m17, m1
1987    packssdw             m1, m22, m7
1988    packssdw             m7, m16, m0
1989    packssdw             m0, m23, m8
1990    jmp m(idct_16x16_internal_10bpc).pass1_end
1991.fast:
1992    call m(iadst_16x16_internal_10bpc).main_pass1_fast
    ; m9 = 2 is the rounding bias for the ">> 2" done in .pass1_fast_end.
    ; paddd yields src + 2, psubd yields 2 - src (negate and bias at once).
    ; The eight results (in m0-m3, m5-m8) are written to m0-m7 in reversed
    ; order compared with the adst .fast path.
1993    vpbroadcastd         m9, [o(pd_2)]
1994    psubd                m4, m9, m3
1995    paddd                m3, m9, m5
1996    paddd                m5, m9, m2
1997    psubd                m2, m9, m6
1998    psubd                m6, m9, m1
1999    paddd                m1, m9, m7
2000    paddd                m7, m9, m0
2001    psubd                m0, m9, m8
2002    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
    ; Second pass: same 8bpc kernel as the adst variant, with a different
    ; multiplier table and register permutation.
2003.pass2:
2004    lea                  r5, [o_base_8bpc]
2005    call m(iadst_16x16_internal_8bpc).main_pass2b
    ; m12/m13 are the qword permutation index vectors; m11 is the word
    ; multiplier applied with pmulhrsw in idct_16x16_internal_10bpc.pass2_end.
    ; It is loaded with movu (unaligned load).
    ; NOTE(review): presumably pw_m2048_2048 is not 64-byte aligned in the
    ; constant section -- confirm against its definition.
2006    movshdup            m12, [permC]
2007    movu                m11, [pw_m2048_2048]
2008    psrlq               m13, m12, 8
    ; pass2_end only reads m8, m7, m6, m5, m3, m2, m1 and m0, so m4 itself
    ; is consumed as a source (into m5) but never rewritten.
2009    vpermq               m8, m13, m7
2010    vpermq               m7, m13, m6
2011    vpermq               m6, m13, m5
2012    vpermq               m5, m13, m4
2013    vpermq               m3, m12, m3
2014    vpermq               m2, m12, m2
2015    vpermq               m1, m12, m1
2016    vpermq               m0, m12, m0
2017    jmp m(idct_16x16_internal_10bpc).pass2_end
2018
; Instantiate INV_TXFM_16X16_FN for the identity first-pass pairings.
; NOTE(review): the macro is defined outside this chunk, so the meaning of
; the optional third argument (-92) cannot be confirmed from here.
2019INV_TXFM_16X16_FN identity, dct, -92
2020INV_TXFM_16X16_FN identity, identity
2021
; iidentity_16x16_internal_10bpc -- internal two-pass routine for the 16x16
; identity transform. Pass 1 scales the 32-bit coefficients as
; (x * pd_5793 + pd_5120) >> 13 and packs them to 16-bit words; pass 2
; (.pass2) computes 2*x + pmulhrsw(x, pw_1697x16) and hands the rows to the
; 16x8 write routine.
; NOTE(review): the o()/r5 base, cq, eobd and tx2q are set up by a caller
; that is outside this chunk.
2022cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
2023%undef cmp
    ; m10 = multiplier, m11 = rounding bias, r6 = running coefficient pointer
    ; advanced by the .pass1_main* helpers.
2024    vpbroadcastd        m10, [o(pd_5793)]
2025    vpbroadcastd        m11, [o(pd_5120)]
2026    mov                  r6, cq
    ; eob < 36 selects the reduced path (low 256 bits of selected rows only).
2027    cmp                eobd, 36
2028    jl .fast
    ; Four helper calls, each scaling rows (k, k+1) into m6/m7 and rows
    ; (k+8, k+9) into m8/m9 for k = 0, 2, 4, 6; pack low/high row pairs.
    ; The last call packs in place because m6/m7 are the final destinations.
2029    call .pass1_main
2030    packssdw             m0, m6, m8
2031    packssdw             m1, m7, m9
2032    call .pass1_main
2033    packssdw             m2, m6, m8
2034    packssdw             m3, m7, m9
2035    call .pass1_main
2036    packssdw             m4, m6, m8
2037    packssdw             m5, m7, m9
2038    call .pass1_main
2039    packssdw             m6, m8
2040    packssdw             m7, m9
2041    jmp m(idct_16x16_internal_10bpc).pass1_end2
2042.fast:
    ; Four helper calls, one per coefficient row 0-3 (r6 advances by 64).
2043    call .pass1_main_fast
2044    packssdw             m0, m6, m7
2045    call .pass1_main_fast
2046    packssdw             m1, m6, m7
2047    call .pass1_main_fast
2048    packssdw             m2, m6, m7
2049    call .pass1_main_fast
2050    packssdw             m3, m6, m7
    ; Transpose the packed words: word interleave, dword interleave, then
    ; 128-bit lane shuffles. m4-m7 are zeroed before joining the shared tail.
2051    punpckhwd            m4, m0, m1
2052    punpcklwd            m0, m1
2053    punpckhwd            m1, m2, m3
2054    punpcklwd            m2, m3
2055    punpckldq            m3, m4, m1
2056    punpckhdq            m4, m1
2057    punpckhdq            m1, m0, m2
2058    punpckldq            m0, m2
2059    pxor                 m7, m7
2060    vshufi32x4           m2, m0, m3, q3131
2061    vshufi32x4           m0, m3, q2020
2062    vshufi32x4           m3, m1, m4, q3131
2063    vshufi32x4           m1, m4, q2020
2064    REPX       {mova x, m7}, m4, m5, m6
2065    jmp m(idct_16x16_internal_10bpc).pass1_end3
2066.pass2:
    ; Constants for the second pass: m14 = qword permutation indices,
    ; m15 = identity scale factor, r6 = stride*3, m12 = 0,
    ; m13 = pixel_10bpc_max. m11 = pw_2048 is not used in the visible code;
    ; NOTE(review): presumably m11-m13 and r6 are inputs of write_16x4,
    ; which is outside this chunk.
2067    movshdup            m14, [o(permC)]
2068    vpbroadcastd        m15, [o(pw_1697x16)]
2069    lea                  r6, [strideq*3]
2070    vpbroadcastd        m11, [o(pw_2048)]
2071    pxor                m12, m12
2072    vpbroadcastd        m13, [pixel_10bpc_max]
    ; Process two registers per .pass2_main invocation; the fourth one falls
    ; through instead of being called, so the final return leaves the function.
2073    vpermq               m8, m14, m0
2074    vpermq               m9, m14, m1
2075    call .pass2_main
2076    vpermq               m8, m14, m2
2077    vpermq               m9, m14, m3
2078    call .pass2_main
2079    vpermq               m8, m14, m4
2080    vpermq               m9, m14, m5
2081    call .pass2_main
2082    vpermq               m8, m14, m6
2083    vpermq               m9, m14, m7
    ; .pass2_main: m8/m9 = 2*x + pmulhrsw(x, m15), using saturating word
    ; adds; m0/m1 are clobbered as temporaries. Tail-jumps to write_16x4.
2084.pass2_main:
2085    pmulhrsw             m0, m15, m8
2086    pmulhrsw             m1, m15, m9
2087    paddsw               m8, m8
2088    paddsw               m9, m9
2089    paddsw               m8, m0
2090    paddsw               m9, m1
2091    jmp m(idct_16x8_internal_10bpc).write_16x4
; .pass1_main -- scale four full 64-byte coefficient rows.
; In:  r6 = coefficient pointer, m10 = multiplier, m11 = rounding bias.
; Out: m6/m7 = rows at r6+64*0 / r6+64*1, m8/m9 = rows at r6+64*8 / r6+64*9,
;      each computed as (x * m10 + m11) >> 13 on 32-bit lanes;
;      r6 advanced by two rows (128 bytes).
2092ALIGN function_align
2093.pass1_main:
2094    pmulld               m6, m10, [r6+64*0]
2095    pmulld               m7, m10, [r6+64*1]
2096    pmulld               m8, m10, [r6+64*8]
2097    pmulld               m9, m10, [r6+64*9]
2098    add                  r6, 64*2
2099    REPX    {paddd  x, m11}, m6, m7, m8, m9
2100    REPX    {psrad  x, 13 }, m6, m8, m7, m9
2101    ret
; .pass1_main_fast -- reduced variant of .pass1_main for small eob.
; In:  r6 = coefficient pointer, m10 = multiplier, m11 = rounding bias.
; Out: m6 = low 256 bits of the rows at r6+64*0 (lower half) and r6+64*4
;      (upper half), m7 = likewise for r6+64*8 and r6+64*12, each scaled as
;      (x * m10 + m11) >> 13 on 32-bit lanes; r6 advanced by one row (64 bytes).
2102ALIGN function_align
2103.pass1_main_fast:
2104    mova                ym6, [r6+64* 0]
2105    vinserti32x8         m6, [r6+64* 4], 1
2106    mova                ym7, [r6+64* 8]
2107    vinserti32x8         m7, [r6+64*12], 1
2108    add                  r6, 64
2109    REPX    {pmulld x, m10}, m6, m7
2110    REPX    {paddd  x, m11}, m6, m7
2111    REPX    {psrad  x, 13 }, m6, m7
2112    ret
2113
; inv_txfm_add_dct_dct_8x32_10bpc(dst, stride, c, eob)
; 8x32 DCT/DCT inverse transform that adds the result to dst and clamps it
; to [0, pixel_10bpc_max]. Three first-pass variants are selected by eob:
;   eob == 0   -> .dconly (DC-only shortcut)
;   eob <  43  -> .fast   (low 256 bits of eight 128-byte-strided rows)
;   eob < 107  -> one .pass1_main call (first 64 bytes of each 128-byte row)
;   otherwise  -> .full   (second .pass1_main call on the other 64 bytes)
; The second pass is delegated to the 8bpc kernels (r5 = o_base_8bpc).
2114cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
2115%undef cmp
2116    lea                  r5, [o_base]
2117    test               eobd, eobd
2118    jz .dconly
    ; Constants shared by the first-pass paths: m11 = 2 (consumed by
    ; idct_8x16 main_end2 as bias/shift count), m20 = byte permutation used
    ; after packing, m21 = zero used to clear the coefficients that were read.
    ; NOTE(review): m12-m15 are presumably inputs of the idct_8x8/idct_8x16
    ; kernels called below, which are outside this chunk.
2119    vpbroadcastd        m12, [o(pd_2896)]
2120    vpbroadcastd        m13, [o(pd_2048)]
2121    vpbroadcastd        m14, [o(clip_18b_min)]
2122    vpbroadcastd        m15, [o(clip_18b_max)]
2123    vpbroadcastd        m11, [o(pd_2)]
2124    mova                m20, [o(idct8x32p)]
2125    pxor                m21, m21
2126    cmp                eobd, 43
2127    jl .fast
2128    call .pass1_main
    ; Word-interleave the first-half results; kept in m16-m19 because the
    ; .full path calls .pass1_main again, which overwrites m0-m7.
2129    punpcklwd           m16, m0, m1
2130    punpcklwd           m17, m2, m3
2131    punpckhwd           m18, m0, m1
2132    punpckhwd           m19, m2, m3
2133    cmp                eobd, 107
2134    jge .full
2135    punpckldq            m0, m16, m17 ;  0  2
2136    punpckhdq            m1, m16, m17 ;  4  6
2137    punpckldq            m2, m18, m19 ;  8 10
2138    punpckhdq            m3, m18, m19 ; 12 14
    ; Switch to the 8bpc constant base and split each zmm: the upper 256 bits
    ; go to ym14-ym17, the lower 256 bits stay in place.
2139    lea                  r5, [o_base_8bpc]
2140    vextracti32x8      ym14, m0, 1
2141    vextracti32x8      ym15, m1, 1
2142    vextracti32x8      ym16, m2, 1
2143    vextracti32x8      ym17, m3, 1
2144    call m(idct_8x16_internal_8bpc).main_fast
2145    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
2146    jmp .end
2147.full:
    ; Second first-pass call on the other 64 bytes of every 128-byte row.
2148    add                  cq, 64
2149    call .pass1_main
2150    punpcklwd            m5, m0, m1
2151    punpcklwd            m6, m2, m3
2152    punpckhwd            m7, m0, m1
2153    punpckhwd            m8, m2, m3
2154    punpckldq            m0, m16, m17 ;  0  2
2155    punpckhdq            m1, m16, m17 ;  4  6
2156    punpckldq            m2, m18, m19 ;  8 10
2157    punpckhdq            m3, m18, m19 ; 12 14
2158    punpckldq            m4, m5, m6   ; 16 18
2159    punpckhdq            m5, m6       ; 20 22
2160    punpckldq            m6, m7, m8   ; 24 26
2161    punpckhdq            m7, m8       ; 28 30
    ; Same split as above for all eight registers; m20/m21 are free to be
    ; overwritten here since no further .pass1_main call follows.
2162    lea                  r5, [o_base_8bpc]
2163    vextracti32x8      ym14, m0, 1
2164    vextracti32x8      ym15, m1, 1
2165    vextracti32x8      ym16, m2, 1
2166    vextracti32x8      ym17, m3, 1
2167    vextracti32x8      ym18, m4, 1
2168    vextracti32x8      ym19, m5, 1
2169    vextracti32x8      ym20, m6, 1
2170    vextracti32x8      ym21, m7, 1
2171    call m(idct_8x16_internal_8bpc).main
    ; Swap the 64-bit halves of every 128-bit lane before the 8x32 kernel.
2172    REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
2173    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
2174    jmp .end
2175.fast:
    ; Load the low 256 bits of the eight rows at 128-byte stride and merge
    ; row pairs into single zmm registers (index vector m8 from permB).
2176    movshdup             m8, [o(permB)]
2177    mova                ym1, [cq+128*1]
2178    mova                ym5, [cq+128*5]
2179    mova                ym7, [cq+128*3]
2180    mova                ym3, [cq+128*7]
2181    mova                ym0, [cq+128*0]
2182    mova                ym4, [cq+128*2]
2183    mova                ym2, [cq+128*4]
2184    mova                ym6, [cq+128*6]
2185    vpermt2q             m1, m8, m5 ; 1 5
2186    vpermt2q             m3, m8, m7 ; 7 3
2187    vpermt2q             m0, m8, m4 ; 0 2
2188    vpermt2q             m2, m8, m6 ; 4 6
    ; Clear exactly the 32-byte chunks that were just read (m21 is zero).
2189    mova         [cq+128*0], ym21
2190    REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
2191    call m(idct_8x8_internal_10bpc).main
2192    call m(idct_8x8_internal_10bpc).main_end
2193    packssdw             m0, m2
2194    packssdw             m1, m3
    ; Byte-permute with m20, then rotate every dword of m20 by 16 bits to
    ; obtain the index vector for the second register.
2195    vpermb               m0, m20, m0
2196    vprold              m20, 16
2197    vpermb               m2, m20, m1
2198    punpckhdq            m1, m0, m2
2199    punpckldq            m0, m2
2200    lea                  r5, [o_base_8bpc]
2201    vextracti32x8      ym14, m0, 1
2202    vextracti32x8      ym15, m1, 1
2203    call m(idct_8x16_internal_8bpc).main_fast2
2204    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
2205.end:
2206    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
    ; Output setup: r3 = dst + 16*stride (second destination pointer),
    ; r6 = 3*stride, m11 = 0 and m12 = pixel_10bpc_max are the clamp bounds.
    ; NOTE(review): m10 (the pmulhrsw multiplier) is not loaded anywhere in
    ; this function; presumably it is provided by the 8bpc main_end -- confirm.
2207    lea                  r3, [strideq*2]
2208    vpbroadcastd        m12, [pixel_10bpc_max]
2209    lea                  r6, [strideq*3]
2210    pxor                m11, m11
2211    lea                  r3, [dstq+r3*8]
    ; Four write rounds, each consuming one register pair in m0/m1. The last
    ; round falls through into .write_8x4x2, whose ret leaves the function.
2212    pmulhrsw             m0, m10
2213    pmulhrsw             m1, m10
2214    call .write_8x4x2
2215    pmulhrsw             m0, m10, m2
2216    pmulhrsw             m1, m10, m3
2217    call .write_8x4x2
2218    pmulhrsw             m0, m10, m4
2219    pmulhrsw             m1, m10, m5
2220    call .write_8x4x2
2221    pmulhrsw             m0, m10, m6
2222    pmulhrsw             m1, m10, m7
    ; .write_8x4x2: add m0 to the four 16-byte rows at dstq and m1 to the four
    ; rows at r3, clamp to [m11, m12] and store back. Rows at dstq map to
    ; 128-bit lanes 0..3 in order; rows at r3 map to lanes 3..0 (reversed).
    ; Both pointers advance by four rows. Clobbers m8, m9.
2223.write_8x4x2:
2224    mova                xm8, [dstq+strideq*0]
2225    vinserti32x4        ym8, [dstq+strideq*1], 1
2226    vinserti32x4         m8, [dstq+strideq*2], 2
2227    vinserti32x4         m8, [dstq+r6       ], 3
2228    mova                xm9, [r3  +r6       ]
2229    vinserti32x4        ym9, [r3  +strideq*2], 1
2230    vinserti32x4         m9, [r3  +strideq*1], 2
2231    vinserti32x4         m9, [r3  +strideq*0], 3
2232    paddw                m8, m0
2233    paddw                m9, m1
2234    pmaxsw               m8, m11
2235    pmaxsw               m9, m11
2236    pminsw               m8, m12
2237    pminsw               m9, m12
2238    mova          [dstq+strideq*0], xm8
2239    vextracti32x4 [dstq+strideq*1], ym8, 1
2240    vextracti32x4 [dstq+strideq*2], m8, 2
2241    vextracti32x4 [dstq+r6       ], m8, 3
2242    lea                dstq, [dstq+strideq*4]
2243    vextracti32x4 [r3  +strideq*0], m9, 3
2244    vextracti32x4 [r3  +strideq*1], m9, 2
2245    vextracti32x4 [r3  +strideq*2], ym9, 1
2246    mova          [r3  +r6       ], xm9
2247    lea                  r3, [r3+strideq*4]
2248    ret
    ; DC-only shortcut, reached only when eobd == 0. The DC coefficient is
    ; scaled as (dc * 181 + 640) >> 10 into r6d, and the coefficient is
    ; cleared by storing eobd (known to be zero here).
    ; NOTE(review): r3d is set to 32 via "or" -- presumably the row count
    ; expected by the shared dconly2 tail (r3 should be the eob register,
    ; hence zero beforehand); confirm against the 8x8 routine.
2249.dconly:
2250    imul                r6d, [cq], 181
2251    mov                [cq], eobd
2252    or                  r3d, 32
2253    add                 r6d, 640
2254    sar                 r6d, 10
2255    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
; .pass1_main -- first pass over one 64-byte column block of the 8x32 input.
; Loads eight zmm rows at 128-byte stride from cq, clears them in memory
; (m21 must be zero), runs the 10bpc idct_8x16 kernel (main + main_end2),
; packs the 32-bit results to 16-bit words with signed saturation and applies
; the byte permutation held in m20.
; Out: m0-m3.
; NOTE(review): the register contract of the idct_8x16 kernel (m11-m15
; inputs, m4-m7 outputs used by the packs) is outside this chunk.
2256ALIGN function_align
2257.pass1_main:
2258    mova                 m0, [cq+128*0]
2259    mova                 m1, [cq+128*1]
2260    mova                 m2, [cq+128*2]
2261    mova                 m3, [cq+128*3]
2262    mova                 m4, [cq+128*4]
2263    mova                 m5, [cq+128*5]
2264    mova                 m6, [cq+128*6]
2265    mova                 m7, [cq+128*7]
2266    REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
2267    call m(idct_8x16_internal_10bpc).main
2268    call m(idct_8x16_internal_10bpc).main_end2
2269    packssdw             m0, m4
2270    packssdw             m1, m5
2271    packssdw             m2, m6
2272    packssdw             m3, m7
2273    REPX {vpermb x, m20, x}, m0, m1, m2, m3
2274    ret
2275
; inv_txfm_add_identity_identity_8x32_10bpc(dst, stride, c, eob)
; 8x32 identity/identity inverse transform, added to dst and clamped to
; [0, pixel_10bpc_max]. Each loop iteration handles 16 destination rows:
; it packs eight 64-byte coefficient rows (128-byte stride) to 16-bit words,
; computes (x + 5) >> 3 with a saturating add, transposes, and accumulates
; into dst. The coefficients that were read are zeroed.
; The loop runs once when eob < 107 and twice otherwise.
2276cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
    ; m9 = rounding bias (5), m10 = 0, m11 = pixel max;
    ; r4 = 3*stride, r5 = 5*stride, r6 = 7*stride.
2277    vpbroadcastd         m9, [pw_5]
2278    lea                  r4, [strideq*3]
2279    pxor                m10, m10
2280    lea                  r5, [strideq*5]
2281    vpbroadcastd        m11, [pixel_10bpc_max]
    ; After the subtraction bit 31 of eobd is set iff eob < 107; it acts as
    ; the loop counter (see the end of the loop).
2282    sub                eobd, 107
2283    lea                  r6, [strideq+r4*2]
2284.loop:
    ; Load and pack pairs of 32-bit rows to 16-bit words (signed saturation).
2285    mova                 m0, [cq+128*0]
2286    packssdw             m0, [cq+128*1]
2287    mova                 m1, [cq+128*2]
2288    packssdw             m1, [cq+128*3]
2289    mova                 m2, [cq+128*4]
2290    packssdw             m2, [cq+128*5]
2291    mova                 m3, [cq+128*6]
2292    packssdw             m3, [cq+128*7]
    ; r7 = dst + 8 rows; the zeroing stores are interleaved with arithmetic.
2293    lea                  r7, [dstq+strideq*8]
2294    REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
2295    REPX     {paddsw x, m9}, m0, m1, m2, m3
2296    REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
2297    REPX     {psraw  x, 3 }, m0, m1, m2, m3
    ; Advance to the other 64-byte half of each row for the next iteration.
2298    add                  cq, 64
    ; Gather 16 destination rows of 16 bytes into m4-m7 (register k gets rows
    ; k, k+4, k+8, k+12 in lanes 0..3), interleaved with the word/qword
    ; unpack steps that transpose the coefficients into matching order.
2299    mova                xm4, [dstq+strideq*0]
2300    mova                xm5, [dstq+strideq*1]
2301    mova                xm6, [dstq+strideq*2]
2302    mova                xm7, [dstq+r4     *1]
2303    punpckhwd            m8, m0, m1
2304    vinserti32x4        ym4, [dstq+strideq*4], 1
2305    punpcklwd            m0, m1
2306    vinserti32x4        ym5, [dstq+r5     *1], 1
2307    punpckhwd            m1, m2, m3
2308    vinserti32x4        ym6, [dstq+r4     *2], 1
2309    punpcklwd            m2, m3
2310    vinserti32x4        ym7, [dstq+r6     *1], 1
2311    punpckhwd            m3, m0, m8
2312    vinserti32x4         m4, [r7  +strideq*0], 2
2313    punpcklwd            m0, m8
2314    vinserti32x4         m5, [r7  +strideq*1], 2
2315    punpckhwd            m8, m2, m1
2316    vinserti32x4         m6, [r7  +strideq*2], 2
2317    punpcklwd            m2, m1
2318    vinserti32x4         m7, [r7  +r4     *1], 2
2319    punpckhqdq           m1, m0, m2
2320    vinserti32x4         m4, [r7  +strideq*4], 3
2321    punpcklqdq           m0, m2
2322    vinserti32x4         m5, [r7  +r5     *1], 3
2323    punpcklqdq           m2, m3, m8
2324    vinserti32x4         m6, [r7  +r4     *2], 3
2325    punpckhqdq           m3, m8
2326    vinserti32x4         m7, [r7  +r6     *1], 3
    ; Add residual to pixels and clamp to [0, pixel max].
2327    paddw                m0, m4
2328    paddw                m1, m5
2329    paddw                m2, m6
2330    paddw                m3, m7
2331    REPX    {pmaxsw x, m10}, m0, m1, m2, m3
2332    REPX    {pminsw x, m11}, m0, m1, m2, m3
    ; Scatter the 16 rows back using the same row/lane mapping as the loads.
2333    mova          [dstq+strideq*0], xm0
2334    mova          [dstq+strideq*1], xm1
2335    mova          [dstq+strideq*2], xm2
2336    mova          [dstq+r4     *1], xm3
2337    vextracti32x4 [dstq+strideq*4], ym0, 1
2338    vextracti32x4 [dstq+r5     *1], ym1, 1
2339    vextracti32x4 [dstq+r4     *2], ym2, 1
2340    vextracti32x4 [dstq+r6     *1], ym3, 1
    ; dstq = r7 + 8 rows, i.e. 16 rows below its value at loop entry.
2341    lea                dstq, [r7+strideq*8]
2342    vextracti32x4 [r7  +strideq*0], m0, 2
2343    vextracti32x4 [r7  +strideq*1], m1, 2
2344    vextracti32x4 [r7  +strideq*2], m2, 2
2345    vextracti32x4 [r7  +r4     *1], m3, 2
2346    vextracti32x4 [r7  +strideq*4], m0, 3
2347    vextracti32x4 [r7  +r5     *1], m1, 3
2348    vextracti32x4 [r7  +r4     *2], m2, 3
2349    vextracti32x4 [r7  +r6     *1], m3, 3
    ; Adding 0x80000000 toggles bit 31 and carries out iff it was already
    ; set: with eob < 107 the carry is produced on the first pass (one
    ; iteration), otherwise on the second (two iterations).
2350    add                eobd, 0x80000000
2351    jnc .loop
2352    RET
2353
; inv_txfm_add_dct_dct_32x8_10bpc(dst, stride, c, eob)
;
; Dispatches on eob:
;   eob == 0   -> .dconly
;   eob <  43  -> .fast  (only the coefficients at cq+64*0..3 are used)
;   eob <  107 -> half path (cq+64*0..7)
;   otherwise  -> .full  (cq+64*0..15)
; The first pass works on dwords (pmulld/psrad helpers); the second pass
; reuses m(inv_txfm_add_dct_dct_32x8_8bpc).main after .transpose_8x32.
; r6d is set per path to 0, 64*1 or 64*3 and drives .zero_loop, which
; clears 256, 512 or 1024 bytes of the coefficient buffer respectively.
2354cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
2355%undef cmp
2356    lea                  r5, [o_base]
2357    test               eobd, eobd
2358    jz .dconly
2359    mova                m11, [o(permB)]
2360    mova                 m0, [cq+64* 0] ;  0  1
2361    mova                 m4, [cq+64* 1] ;  2  3
2362    mova                 m1, [cq+64* 2] ;  4  5
2363    mova                 m8, [cq+64* 3] ;  6  7
2364    vpbroadcastd        m12, [o(pd_2896)]
2365    vpbroadcastd        m13, [o(pd_2048)]
2366    vpbroadcastd        m14, [o(clip_18b_min)]
2367    vpbroadcastd        m15, [o(clip_18b_max)]
2368    psrlq               m10, m11, 32
2369%if WIN64
        ; xmm6/xmm7 are callee-saved in the Microsoft x64 ABI. They are parked
        ; in the first 32 bytes of the coefficient buffer, which has already
        ; been loaded into m0 above, and are restored at .end.
2370    movaps        [cq+16*0], xmm6
2371    movaps        [cq+16*1], xmm7
2372%endif
2373    mova                m16, m11
2374    vpermi2q            m16, m0, m1     ;  1  5
2375    mova                m17, m11
2376    vpermi2q            m17, m8, m4     ;  7  3
2377    cmp                eobd, 43
2378    jl .fast
2379    mova                m18, [cq+64* 4] ;  8  9
2380    mova                m20, [cq+64* 5] ; 10 11
2381    mova                 m6, [cq+64* 6] ; 12 13
2382    mova                 m7, [cq+64* 7] ; 14 15
2383    vpermt2q             m0, m10, m18   ;  0  8
2384    vpermt2q            m18, m11, m6    ;  9 13
2385    mova                m19, m11
2386    vpermi2q            m19, m7, m20    ; 15 11
2387    cmp                eobd, 107
2388    jge .full
2389    vpermt2q             m1, m10, m6    ;  4 12
2390    vpermt2q             m4, m10, m8    ;  2  6
2391    vpermt2q             m7, m10, m20   ; 14 10
2392    mov                 r6d, 64*1       ; .zero_loop clears 512 bytes
2393    call m(idct_8x8_internal_10bpc).main_fast
2394    call m(idct_16x8_internal_10bpc).main_fast
2395    call .main_fast
2396    call m(idct_16x16_internal_10bpc).main_end
2397    jmp .end
2398.full:
2399    mova                 m2, [cq+64* 8] ; 16 17
2400    mova                 m5, [cq+64* 9] ; 18 19
2401    mova                 m9, [cq+64*10] ; 20 21
2402    mova                m21, [cq+64*11] ; 22 23
2403    vpermt2q             m1, m10, m9    ;  4 20
2404    vpermt2q             m7, m10, m21   ; 14 22
2405    vpermt2q            m21, m11, m5    ; 23 19
2406    vpermt2q             m5, m10, m20   ; 18 10
2407    mova                m20, m11
2408    vpermi2q            m20, m2, m9     ; 17 21
2409    mova                m22, [cq+64*12] ; 24 25
2410    mova                 m9, [cq+64*13] ; 26 27
2411    mova                 m3, [cq+64*14] ; 28 29
2412    mova                m23, [cq+64*15] ; 30 31
2413    vpermt2q             m2, m10, m22   ; 16 24
2414    vpermt2q            m22, m11, m3    ; 25 29
2415    vpermt2q             m3, m10, m6    ; 28 12
2416    vpermt2q             m4, m10, m9    ;  2 26
2417    mova                 m6, m10
2418    vpermi2q             m6, m23, m8    ; 30  6
2419    vpermt2q            m23, m11, m9    ; 31 27
2420    mov                 r6d, 64*3       ; .zero_loop clears 1024 bytes
2421    call m(idct_8x8_internal_10bpc).main
2422    call m(idct_16x8_internal_10bpc).main
2423    call .main
2424    call m(idct_16x16_internal_10bpc).main_end
2425    jmp .end
2426.fast:
2427    vpermq               m0, m10, m0    ;  0  0
2428    vpermq               m1, m10, m1    ;  4  4
2429    vpermt2q             m4, m10, m8    ;  2  6
2430    xor                 r6d, r6d        ; .zero_loop clears 256 bytes
2431    call .main_fast2
2432    call m(idct_16x16_internal_10bpc).main_end
2433.end:
2434%if WIN64
2435    movaps             xmm6, [cq+16*0]
2436    movaps             xmm7, [cq+16*1]
2437%endif
2438    vzeroupper
2439    call .transpose_8x32
2440    pxor                m14, m14
2441.zero_loop:
        ; Each iteration clears 256 bytes at cq+r6*4; runs while r6d >= 0.
2442    mova     [cq+r6*4+64*3], m14
2443    mova     [cq+r6*4+64*2], m14
2444    mova     [cq+r6*4+64*1], m14
2445    mova     [cq+r6*4+64*0], m14
2446    sub                 r6d, 64
2447    jge .zero_loop
2448    lea                  r5, [o_base_8bpc] ; constant base for the 8bpc routine called below
2449    punpckhqdq           m1, m0, m2
2450    punpcklqdq           m0, m2
2451    punpcklqdq           m2, m3, m4
2452    punpckhqdq           m3, m4
2453    punpcklqdq           m4, m5, m7
2454    punpckhqdq           m5, m7
2455    punpckhqdq           m7, m6, m8
2456    punpcklqdq           m6, m8
2457    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
2458    pxor                m12, m12
2459.write_32x8_start:
        ; m11 = rounding multiplier, m12 = 0 (lower clamp, set above),
        ; m13 = upper pixel clamp, r3 = stride*3.
2460    vpbroadcastd        m11, [pw_2048]
2461    vpbroadcastd        m13, [pixel_10bpc_max]
2462    lea                  r3, [strideq*3]
2463.write_32x8:
        ; Scales m0-m7 with pmulhrsw by m11 and adds them to eight rows of dst.
        ; The first four rows go through a call to .write_32x4; the last four
        ; fall through into it, so its ret returns to this routine's caller.
2464    pmulhrsw             m0, m11
2465    pmulhrsw             m1, m11
2466    pmulhrsw             m2, m11
2467    pmulhrsw             m3, m11
2468    call .write_32x4
2469    pmulhrsw             m0, m11, m4
2470    pmulhrsw             m1, m11, m5
2471    pmulhrsw             m2, m11, m6
2472    pmulhrsw             m3, m11, m7
2473.write_32x4:
        ; dst[row 0..3] = clamp(dst + m0..m3, m12, m13); dstq advances 4 rows.
2474    paddw                m0, [dstq+strideq*0]
2475    paddw                m1, [dstq+strideq*1]
2476    paddw                m2, [dstq+strideq*2]
2477    paddw                m3, [dstq+r3       ]
2478    REPX    {pmaxsw x, m12}, m0, m1, m2, m3
2479    REPX    {pminsw x, m13}, m0, m1, m2, m3
2480    mova   [dstq+strideq*0], m0
2481    mova   [dstq+strideq*1], m1
2482    mova   [dstq+strideq*2], m2
2483    mova   [dstq+r3       ], m3
2484    lea                dstq, [dstq+strideq*4]
2485    ret
2486.dconly:
        ; DC-only path: r6d = (c[0]*181 + 640) >> 10. eobd is zero here, so the
        ; store clears c[0]. r3d becomes 8 (presumably the row count expected
        ; by .dconly2 - confirm against that routine).
2487    imul                r6d, [cq], 181
2488    mov                [cq], eobd
2489    or                  r3d, 8
2490    add                 r6d, 640
2491    sar                 r6d, 10
2492    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
; First-pass helpers for the odd half of the 32-point inverse DCT.
; All entry points converge on .main3/.main4 and return via the single ret
; at the end of .main4. m12 (pd_2896), m13 (pd_2048, rounding) and m14/m15
; (clip_18b_min/max) are expected to be set up by the caller.
;   .main_fast3: inputs as described by the comment at its top
;   .main_fast2: bottom three-quarters of the coefficients are zero
;   .main_fast:  bottom half of the coefficients is zero
;   .main:       all coefficients present
2493ALIGN function_align
2494.main_fast3:
2495    ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
2496    vbroadcasti32x4      m5, [o(pd_401_4076)]
2497    pmulld               m3, m0, m12
2498    pmulld               m4, m5
2499    REPX    {paddd  x, m13}, m3, m4
2500    REPX    {psrad  x, 12 }, m3, m4     ; m3=idct8:t0-7, m4=t8a t15a
2501
2502    ; t8a t15a -> t8/9 t14/15
2503
2504    vbroadcasti32x4      m5, [o(pd_3784_m3784)]
2505    pshufd               m7, m4, q1032
2506    pmulld               m6, m4, [o(pd_1567)]{bcstd}
2507    pmulld               m5, m7
2508    paddd                m6, m13
2509    paddd                m5, m6
2510    psrad                m5, 12         ; m5=t9a t14a
2511
2512    ; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4]
2513
2514    shufps               m6, m4, m5, q1032     ; t12  t13
2515    shufps               m8, m4, m5, q3210     ; t11a t10
2516    pmulld               m9, m6, m12
2517    pmulld               m7, m8, m12
2518    paddd                m9, m13
2519    paddd                m5, m9, m7     ; t12 t13a
2520    psubd                m4, m9, m7     ; t11 t10a
2521    REPX    {psrad  x, 12 }, m5, m4
2522
2523    psubd                m7, m3, m6   ; dct16 out15 out14
2524    paddd                m0, m3, m6   ; dct16 out0  out1
2525    psubd                m6, m3, m5   ; dct16 out12 out13
2526    paddd                m1, m3, m5   ; dct16 out3  out2
2527    psubd                m5, m3, m4   ; dct16 out11 out10
2528    paddd                m2, m3, m4   ; dct16 out4  out5
2529    psubd                m4, m3, m8   ; dct16 out8  out9
2530    paddd                m3, m8       ; dct16 out7  out6
2531    REPX    {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
2532    REPX    {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
2533
2534    ; idct32_bottomhalf
2535    vbroadcasti32x4     m18, [o(pd_201_m601)]
2536    vbroadcasti32x4     m19, [o(pd_4091_4052)]
2537    pmulld              m17, m16, m19
2538    pmulld              m16, m18
2539    REPX    {paddd  x, m13}, m17, m16
2540    REPX    {psrad  x, 12 }, m17, m16
2541
2542    ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]
2543
2544    vbroadcasti32x4     m10, [o(pd_799_m2276)]
2545    vbroadcasti32x4     m11, [o(pd_4017_3406)]
2546    pmulld              m18, m17, m10
2547    pmulld              m19, m17, m11
2548    pmulld               m8, m16, m11
2549    pmulld               m9, m16, m10
2550    REPX    {paddd  x, m13}, m18, m19
2551    psubd               m18, m8
2552    paddd               m19, m9
2553    REPX    {psrad  x, 12 }, m18, m19
2554
2555    ; m17=t31  t24  -> t28/31a t24/27a, m16=t16  t23  -> t16/19a t20/23a
2556    ; m18=t17a t22a -> t17/18  t21/22,  m19=t30a t25a -> t29/30  t25/26
2557
2558    punpckhqdq          m23, m17, m19   ; t24a t25 [or t27a t26]
2559    punpcklqdq          m20, m16, m18   ; t16a t17 [or t19a t18]
2560    punpckhqdq          m22, m16, m18   ; t23a t22 [or t20a t21]
2561    punpcklqdq          m16, m17, m19   ; t28a t29 [or t31a t30]
2562    mova                m21, m23
2563    mova                m18, m20
2564    mova                m17, m22
2565    mova                m19, m16
2566
2567    jmp .main4
2568.main_fast2: ; bottom three-quarters are zero
2569    vbroadcasti32x4      m8, [o(pd_799_4017)]
2570    pmulld               m8, m1     ; t4  t7
2571    vpmulld              m0, [o(pd_2896)] {1to16} ; t0 t1
2572    REPX     {paddd x, m13}, m8, m0
2573    REPX     {psrad x, 12 }, m8, m0
2574    pmulld               m3, m8, m12
2575    mova                 m2, m0       ;  t3   t2
2576    call m(idct_8x8_internal_10bpc).main3
2577    vbroadcasti32x4      m6, [o(pd_4076_3920)]
2578    vbroadcasti32x4      m3, [o(pd_401_m1189)]
2579    pmulld               m6, m4       ;  t15  t12
2580    pmulld               m4, m3       ;  t9   t10
2581    REPX     {paddd x, m13}, m6, m4
2582    REPX     {psrad x, 12 }, m6, m4
2583    mova                 m5, m6       ;  t14  t13
2584    mova                 m9, m4       ;  t8   t11
2585    call m(idct_16x8_internal_10bpc).main3
2586    vbroadcasti32x4     m23, [o(pd_4091_3973)]
2587    vbroadcasti32x4      m7, [o(pd_201_995)]
2588    vbroadcasti32x4     m22, [o(pd_1380_601)]
2589    vbroadcasti32x4      m9, [o(pd_3857_4052)]
2590    pmulld              m23, m16      ;  t31  t27
2591    pmulld              m16, m7       ;  t16  t20
2592    pmulld              m22, m17      ; -t19 -t23
2593    pmulld              m17, m9       ;  t28  t24
2594    REPX    {paddd  x, m13}, m23, m16, m17
2595    psubd               m22, m13, m22 ; negate the product while adding the rounding bias
2596    REPX    {psrad  x, 12 }, m23, m16, m22, m17
2597    mova                m20, m23      ;  t30  t26
2598    mova                 m9, m16      ;  t17  t21
2599    mova                m19, m22      ;  t18  t22
2600    mova                m18, m17      ;  t29  t25
2601    jmp .main3
2602.main_fast: ; bottom half is zero
2603    vbroadcasti32x4     m23, [o(pd_4091_3973)]
2604    vbroadcasti32x4      m7, [o(pd_201_995)]
2605    vbroadcasti32x4     m20, [o(pd_2751_2106)]
2606    vbroadcasti32x4      m9, [o(pd_3035_3513)]
2607    vbroadcasti32x4     m21, [o(pd_3703_3290)]
2608    vbroadcasti32x4     m10, [o(pd_1751_2440)]
2609    vbroadcasti32x4     m22, [o(pd_1380_601)]
2610    vbroadcasti32x4     m11, [o(pd_3857_4052)]
2611    pmulld              m23, m16      ;  t31a  t27a
2612    pmulld              m16, m7       ;  t16a  t20a
2613    pmulld              m20, m19      ; -t17a -t21a
2614    pmulld              m19, m9       ;  t30a  t26a
2615    pmulld              m21, m18      ;  t29a  t25a
2616    pmulld              m18, m10      ;  t18a  t22a
2617    pmulld              m22, m17      ; -t19a -t23a
2618    pmulld              m17, m11      ;  t28a  t24a
2619    psubd               m20, m13, m20 ; negate and add the rounding bias in one step
2620    psubd               m22, m13, m22
2621    jmp .main2
2622.main:
2623    ITX_MULSUB_2D        16, 23, 7, 9, 10, _,  201_995,  4091_3973
2624    ITX_MULSUB_2D        20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
2625    ITX_MULSUB_2D        18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
2626    ITX_MULSUB_2D        22, 17, 7, 9, 10, _, 3857_4052, 1380_601
2627    paddd               m20, m13
2628    paddd               m22, m13
2629.main2:
        ; m20 and m22 already carry the rounding bias on entry (see .main and
        ; .main_fast), so only the remaining registers get m13 added here.
2630    REPX    {paddd  x, m13}, m16, m23, m19
2631    REPX    {psrad  x, 12 }, m16, m20, m23, m19
2632    psubd                m9, m16, m20 ; t17  t21
2633    paddd               m16, m20      ; t16  t20
2634    psubd               m20, m23, m19 ; t30  t26
2635    paddd               m23, m19      ; t31  t27
2636    REPX    {pmaxsd x, m14}, m9, m16, m20, m23
2637    REPX    {paddd  x, m13}, m21, m18, m17
2638    REPX    {psrad  x, 12 }, m18, m22, m21, m17
2639    psubd               m19, m22, m18 ; t18  t22
2640    paddd               m22, m18      ; t19  t23
2641    psubd               m18, m17, m21 ; t29  t25
2642    paddd               m17, m21      ; t28  t24
2643    REPX    {pmaxsd x, m14}, m19, m22, m18, m17
2644    REPX    {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
2645.main3:
        ; The dct16 output butterflies (m0-m8) are interleaved with the
        ; idct32 odd-half rotations (m16-m23).
2646    vbroadcasti32x4     m11, [o(pd_4017_2276)]
2647    vbroadcasti32x4     m10, [o(pd_799_3406)]
2648    psubd                m7, m0, m6   ; dct16 out15 out14
2649    paddd                m0, m6       ; dct16 out0  out1
2650    psubd                m6, m1, m5   ; dct16 out12 out13
2651    paddd                m1, m5       ; dct16 out3  out2
2652    psubd                m5, m2, m4   ; dct16 out11 out10
2653    paddd                m2, m4       ; dct16 out4  out5
2654    psubd                m4, m3, m8   ; dct16 out8  out9
2655    paddd                m3, m8       ; dct16 out7  out6
2656    ITX_MULSUB_2D        20,  9, 8, 21, _, 13, 10, 11
2657    ITX_MULSUB_2D        18, 19, 8, 21, _, 13, 10, 11, 2
2658    REPX    {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
2659    punpckhqdq          m21, m16, m20 ; t20  t21a
2660    punpcklqdq          m16, m20      ; t16  t17a
2661    punpcklqdq          m20, m22, m19 ; t19  t18a
2662    punpckhqdq          m22, m19      ; t23  t22a
2663    REPX    {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
2664    punpcklqdq          m19, m23, m9  ; t31  t30a
2665    punpckhqdq          m23, m9       ; t27  t26a
2666    punpckhqdq           m9, m17, m18 ; t24  t25a
2667    punpcklqdq          m17, m18      ; t28  t29a
2668    psubd               m18, m16, m20 ; t19a t18
2669    paddd               m20, m16      ; t16a t17
2670    psubd               m16, m19, m17 ; t28a t29
2671    paddd               m19, m17      ; t31a t30
2672    psubd               m17, m22, m21 ; t20a t21
2673    paddd               m22, m21      ; t23a t22
2674    psubd               m21, m9, m23  ; t27a t26
2675    paddd               m23, m9       ; t24a t25
2676    REPX    {pmaxsd x, m14}, m18, m16, m17, m21
2677    REPX    {pminsd x, m15}, m16, m18, m21, m17
2678    REPX    {pmaxsd x, m14}, m20, m22, m19, m23
2679    REPX    {pminsd x, m15}, m20, m22, m19, m23
2680.main4:
2681    vpbroadcastd        m11, [o(pd_3784)]
2682    vpbroadcastd        m10, [o(pd_1567)]
2683    ITX_MULSUB_2D        16, 18, 8, 9, _, 13, 10, 11
2684    ITX_MULSUB_2D        21, 17, 8, 9, _, 13, 10, 11, 2
2685    paddd                m9, m20, m22 ; t16  t17a
2686    psubd               m20, m22      ; t23  t22a
2687    paddd               m22, m19, m23 ; t31  t30a
2688    psubd               m19, m23      ; t24  t25a
2689    psubd               m23, m16, m17 ; t20a t21
2690    paddd               m16, m17      ; t19a t18
2691    psubd               m17, m18, m21 ; t27a t26
2692    paddd               m21, m18      ; t28a t29
2693    REPX    {pmaxsd x, m14}, m20, m19, m23, m17
2694    REPX    {pminsd x, m15}, m19, m20, m17, m23
2695    REPX    {pmulld x, m12}, m19, m20, m17, m23
2696    REPX    {pmaxsd x, m14}, m22, m21, m16, m9
2697    paddd               m19, m13
2698    paddd               m17, m13
2699    REPX    {pminsd x, m15}, m22, m21, m16, m9
2700    psubd               m18, m19, m20 ; t23a t22
2701    paddd               m19, m20      ; t24a t25
2702    paddd               m20, m17, m23 ; t27  t26a
2703    psubd               m17, m23      ; t20  t21a
2704    REPX    {psrad  x, 12 }, m20, m19, m18, m17
2705    ret
; Rearranges the 16-bit words held in m0-m7 between the two passes.
; Word indices come from the idct32x8p table: m10 holds the table as loaded,
; m8/m9 its words shifted right by 8 bits, and vprold by 16 swaps the two
; words of every dword to form the remaining index sets.
; Results are left in m0 and m2-m8; m1, m9 and m10 are clobbered.
2706.transpose_8x32:
2707    mova                m10, [o(idct32x8p)]
2708    psrlw                m8, m10, 8
2709    mova                 m9, m8
2710    vpermi2w             m8, m1, m5
2711    vpermt2w             m1, m10, m5
2712    vprold               m5, m9, 16
2713    vpermi2w             m9, m3, m7
2714    vpermt2w             m3, m10, m7
2715    vprold              m10, 16
2716    mova                 m7, m5
2717    vpermi2w             m5, m0, m4
2718    vpermt2w             m0, m10, m4
2719    vpermi2w             m7, m2, m6
2720    vpermt2w             m2, m10, m6
2721    punpckhdq            m6, m5, m8
2722    punpckldq            m5, m8
2723    punpckhdq            m8, m7, m9
2724    punpckldq            m7, m9
2725    punpckhdq            m4, m2, m3
2726    punpckldq            m2, m3
2727    punpckhdq            m3, m0, m1
2728    punpckldq            m0, m1
2729    ret
2730
; inv_txfm_add_identity_identity_32x8_10bpc(dst, stride, c, eob)
;
; Each .loop iteration consumes 64*8 bytes of coefficients, packs them to
; words, scales them with pmulhrsw by pw_4096, shuffles them with the
; idtx32x8p table (m6) and its high bytes (m7), and adds the result to a
; 16-pixel-wide, 8-row strip of dst, clamped to [0, pixel_10bpc_max].
; The coefficients that were read are zeroed as they are consumed.
; Row offsets: r4 = stride*3, r5 = stride*5, r6 = stride*7.
2731cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
2732    vpbroadcastd         m5, [pw_4096]
2733    lea                  r4, [strideq*3]
2734    mova                 m6, [idtx32x8p]
2735    lea                  r5, [strideq*5]
2736    vpbroadcastd         m9, [pixel_10bpc_max]
2737    lea                  r6, [strideq+r4*2]
2738    pxor                 m8, m8
2739    sub                eobd, 107 ; sign bit of eobd now encodes eob < 107 (see loop exit below)
2740    psrlw                m7, m6, 8
2741.loop:
2742    mova                 m0, [cq+64*0]
2743    packssdw             m0, [cq+64*1] ; 02 13
2744    mova                 m1, [cq+64*2]
2745    packssdw             m1, [cq+64*3] ; 46 57
2746    mova                 m2, [cq+64*4]
2747    packssdw             m2, [cq+64*5] ; 8a 9b
2748    mova                 m3, [cq+64*6]
2749    packssdw             m3, [cq+64*7] ; ce df
2750    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
2751    REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
2752    mova                 m4, m6
2753    vpermi2w             m4, m1, m3
2754    vpermt2w             m1, m7, m3
2755    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
2756    mova                 m3, m7
2757    vpermi2w             m3, m0, m2
2758    vpermt2w             m0, m6, m2
2759    add                  cq, 64*8
2760    punpcklqdq           m2, m3, m1 ; 4 5
2761    punpckhqdq           m3, m1     ; 6 7
2762    punpckhqdq           m1, m0, m4 ; 2 3
2763    punpcklqdq           m0, m4     ; 0 1
2764    mova                ym4, [dstq+strideq*0]
2765    vinserti32x8         m4, [dstq+strideq*1], 1
2766    paddw                m0, m4
2767    mova                ym4, [dstq+strideq*2]
2768    vinserti32x8         m4, [dstq+r4     *1], 1
2769    paddw                m1, m4
2770    mova                ym4, [dstq+strideq*4]
2771    vinserti32x8         m4, [dstq+r5     *1], 1
2772    paddw                m2, m4
2773    mova                ym4, [dstq+r4     *2]
2774    vinserti32x8         m4, [dstq+r6     *1], 1
2775    paddw                m3, m4
2776    REPX     {pmaxsw x, m8}, m0, m1, m2, m3
2777    REPX     {pminsw x, m9}, m0, m1, m2, m3
2778    mova          [dstq+strideq*0], ym0
2779    vextracti32x8 [dstq+strideq*1], m0, 1
2780    mova          [dstq+strideq*2], ym1
2781    vextracti32x8 [dstq+r4     *1], m1, 1
2782    mova          [dstq+strideq*4], ym2
2783    vextracti32x8 [dstq+r5     *1], m2, 1
2784    mova          [dstq+r4     *2], ym3
2785    vextracti32x8 [dstq+r6     *1], m3, 1
2786    add                dstq, 32 ; next strip: 32 bytes to the right
        ; Loop control: adding 0x80000000 carries only when bit 31 of eobd is
        ; set. If eob < 107 it is set on entry, so the loop runs once;
        ; otherwise the first add sets it and the second add carries, giving
        ; two iterations.
2787    add                eobd, 0x80000000
2788    jnc .loop
2789    RET
2790
; inv_txfm_add_dct_dct_16x32_10bpc(dst, stride, c, eob)
;
; Dispatches on eob:
;   eob == 0   -> .dconly
;   eob <  36  -> .fast  (loads 32 bytes from every other coefficient row)
;   eob <  151 -> one .pass1 call, then the *_fast 8bpc second-pass routines
;   otherwise  -> .full: two .pass1 calls (cq and cq+64), then the full
;                 8bpc second-pass routines
; All paths clear the coefficients they consumed and meet at .pass2_end,
; which writes the result through m(idct_16x8_internal_10bpc).write_16x4.
2791cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
2792%undef cmp
2793    lea                  r5, [o_base]
2794    test               eobd, eobd
2795    jz .dconly
2796    vpbroadcastd        m12, [o(pd_2896)]
2797    vpbroadcastd        m13, [o(pd_2048)]
2798    vpbroadcastd        m14, [o(clip_18b_min)]
2799    vpbroadcastd        m15, [o(clip_18b_max)]
2800%if WIN64
        ; xmm6/xmm7 are callee-saved in the Microsoft x64 ABI; they are kept
        ; at [rsp+8] and [rsp+24] (presumably the caller-provided shadow
        ; space - this relies on rsp being unchanged since entry) and
        ; restored in .pass2_end.
2801    movaps         [rsp+ 8], xmm6
2802    movaps         [rsp+24], xmm7
2803%endif
2804    cmp                eobd, 36
2805    jl .fast
2806    call .pass1
2807    cmp                eobd, 151
2808    jge .full
2809    lea                  r5, [o_base_8bpc]
2810    pxor                 m9, m9
2811    punpcklwd            m8, m1, m1 ;  2
2812    punpckhwd           m14, m1, m1 ;  3
2813    punpcklwd            m1, m3, m3 ;  6
2814    punpckhwd           m15, m3, m3 ;  7
2815    punpcklwd            m3, m6, m6 ; 12
2816    punpckhwd           m19, m6, m6 ; 13
2817    punpcklwd            m6, m9, m4 ; __  8
2818    punpckhwd           m20, m4, m4 ;  9
2819    punpckhwd           m16, m5, m5 ; 11
2820    punpcklwd            m5, m5     ; 10
2821    punpcklwd            m9, m0     ; __  0
2822    punpckhwd           m21, m0, m0 ;  1
2823    punpcklwd            m0, m7, m7 ; 14
2824    punpckhwd           m17, m7, m7 ; 15
2825    punpcklwd            m7, m2, m2 ;  4
2826    punpckhwd           m18, m2, m2 ;  5
2827    call m(idct_16x16_internal_8bpc).main_fast
2828    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
2829    mov                 r6d, 64*3
2830    pxor                 m8, m8
2831.zero_loop:
        ; Clears the first 64 bytes of each 128-byte coefficient row for
        ; 16 rows (four rows per iteration).
2832    REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
2833    sub                 r6d, 64
2834    jge .zero_loop
2835    jmp .pass2_end
2836.full:
        ; Park the first .pass1 result in the coefficient rows it was read
        ; from, then run .pass1 again on the other 64-byte half of each row.
2837    mova         [cq+128*0], m0
2838    mova         [cq+128*1], m1
2839    mova         [cq+128*2], m2
2840    mova         [cq+128*3], m3
2841    mova         [cq+128*4], m4
2842    mova         [cq+128*5], m5
2843    mova         [cq+128*6], m6
2844    mova         [cq+128*7], m7
2845    add                  cq, 64
2846    call .pass1
        ; Reload the parked results; cq was advanced by 64, hence the odd
        ; multiples of 64 (and -64 for the first one).
2847    mova                 m9, [cq-64* 1] ;  0  1
2848    mova                m14, [cq+64* 1] ;  2  3
2849    mova                m18, [cq+64* 3] ;  4  5
2850    mova                m15, [cq+64* 5] ;  6  7
2851    mova                m20, [cq+64* 7] ;  8  9
2852    mova                m16, [cq+64* 9] ; 10 11
2853    mova                m22, [cq+64*11] ; 12 13
2854    mova                m19, [cq+64*13] ; 14 15
2855    lea                  r5, [o_base_8bpc]
2856    punpcklwd            m8, m7, m14   ; 30  2
2857    punpckhwd           m21, m7, m9    ; 31  1
2858    punpcklwd            m7, m6, m18   ; 28  4
2859    punpckhwd           m14, m6        ;  3 29
2860    punpcklwd            m9, m0, m9    ; 16  0
2861    punpckhwd           m17, m19, m0   ; 15 17
2862    punpcklwd            m0, m19, m1   ; 14 18
2863    punpckhwd           m19, m1, m22   ; 19 13
2864    punpcklwd            m1, m15, m5   ;  6 26
2865    punpckhwd           m18, m5, m18   ; 27  5
2866    punpcklwd            m6, m4, m20   ; 24  8
2867    punpckhwd           m15, m4        ;  7 25
2868    punpcklwd            m5, m3, m16   ; 22 10
2869    punpckhwd           m20, m3, m20   ; 23  9
2870    punpcklwd            m3, m22, m2   ; 12 20
2871    punpckhwd           m16, m2        ; 11 21
2872    call m(idct_16x16_internal_8bpc).main2
2873    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
2874    mov                 r6d, 32*7
2875    pxor                 m8, m8
2876.full_zero_loop:
        ; Clears 256 bytes per iteration starting at cq-64 (cq was advanced
        ; by 64 above); eight iterations cover 2048 bytes.
2877    REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
2878    sub                 r6d, 32
2879    jge .full_zero_loop
2880    jmp .pass2_end
2881.fast:
2882    mova                ym0, [cq+128*0]
2883    mova                ym2, [cq+128*4]
2884    movshdup             m8, [o(permB)]
2885    mova                ym1, [cq+128*2]
2886    mova                ym3, [cq+128*6]
2887    mova                ym4, [cq+128*1]
2888    mova                ym5, [cq+128*3]
2889    mova                ym6, [cq+128*5]
2890    mova                ym7, [cq+128*7]
2891    vpermt2q             m0, m8, m2 ; 0 4
2892    vpermt2q             m1, m8, m3 ; 2 6
2893    vpermt2q             m4, m8, m5 ; 1 3
2894    vpermt2q             m7, m8, m6 ; 7 5
2895    REPX    {pmulld x, m12}, m0, m1, m4, m7
2896    pxor               ym16, ym16
        ; Clear the eight 32-byte row fragments loaded above.
2897    mova         [cq+128*0], ym16
2898    REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
2899    REPX    {paddd  x, m13}, m0, m1, m4, m7
2900    REPX    {psrad  x, 12 }, m0, m1, m4, m7
2901    call m(idct_8x8_internal_10bpc).main_fast
2902    call m(idct_16x8_internal_10bpc).main_fast
2903    vpbroadcastd        m11, [o(pd_1)]
2904    call m(idct_8x16_internal_10bpc).main_end2
2905    mova                 m8, [o(idct8x32p)]
2906    packssdw             m0, m4
2907    packssdw             m1, m5
2908    packssdw             m2, m6
2909    packssdw             m3, m7
2910    mova                 m6, [dup16_perm]
2911    vpermb               m0, m8, m0
2912    vpermb               m2, m8, m2
2913    vprold               m8, 16
2914    vpermb               m1, m8, m1
2915    vpermb               m3, m8, m3
2916    punpckldq            m4, m0, m2
2917    punpckhdq            m0, m2
2918    punpckldq            m2, m1, m3
2919    punpckhdq            m1, m3
2920    punpckldq           m21, m4, m2
2921    punpckhdq           m14, m4, m2
2922    punpckldq           m18, m0, m1
2923    punpckhdq           m15, m0, m1
2924    vpermb               m8, m6, m14 ; 2
2925    vpermb               m1, m6, m15 ; 6
2926    vpermb               m7, m6, m18 ; 4
2927    pmovzxwd             m9, ym21    ; 0
2928    vpord                m6, [o(pb_32)] {1to16}
2929    lea                  r5, [o_base_8bpc]
2930    vpermb              m21, m6, m21 ; 1
2931    vpermb              m15, m6, m15 ; 7
2932    vpermb              m18, m6, m18 ; 5
2933    vpermb              m14, m6, m14 ; 3
2934    pslld                m9, 16
2935    call m(idct_16x16_internal_8bpc).main_fast2
2936    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
2937.pass2_end:
        ; Output stage: the second-pass registers are permuted in pairs into
        ; m8/m9 (qword permutes with m22/m23) and handed to write_16x4 eight
        ; times. m11, m12, m13 and r6 are presumably the rounding multiplier,
        ; zero, pixel maximum and stride*3 that write_16x4 expects - confirm
        ; against that routine. The last invocation is a tail jump, so its
        ; return goes straight to this function's caller.
2938    movshdup            m22, [permC]
2939    vpbroadcastd        m11, [pw_2048]
2940    vpbroadcastd        m13, [pixel_10bpc_max]
2941    lea                  r6, [strideq*3]
2942    pxor                m12, m12
2943    psrlq               m23, m22, 8
2944    vpermq               m8, m22, m0
2945    vpermq               m9, m23, m1
2946    call m(idct_16x8_internal_10bpc).write_16x4
2947    vpermq               m8, m22, m2
2948    vpermq               m9, m23, m3
2949    call m(idct_16x8_internal_10bpc).write_16x4
2950    vpermq               m8, m22, m4
2951    vpermq               m9, m23, m5
2952    call m(idct_16x8_internal_10bpc).write_16x4
2953    vpermq               m8, m22, m6
2954    vpermq               m9, m23, m7
2955    call m(idct_16x8_internal_10bpc).write_16x4
2956    vpermq               m8, m22, m14
2957    vpermq               m9, m23, m15
2958    call m(idct_16x8_internal_10bpc).write_16x4
2959    vpermq               m8, m22, m16
2960    vpermq               m9, m23, m17
2961    call m(idct_16x8_internal_10bpc).write_16x4
2962    vpermq               m8, m22, m18
2963    vpermq               m9, m23, m19
2964    call m(idct_16x8_internal_10bpc).write_16x4
2965    vpermq               m8, m22, m20
2966    vpermq               m9, m23, m21
2967%if WIN64
2968    movaps             xmm6, [rsp+ 8]
2969    movaps             xmm7, [rsp+24]
2970%endif
2971    vzeroupper
2972    jmp m(idct_16x8_internal_10bpc).write_16x4
; First pass over sixteen 64-byte coefficient vectors at cq+128*0..15.
; Even-numbered vectors are pre-multiplied by m12 (pd_2896) into m0-m7 and
; odd-numbered ones into m16-m23, each group followed by the matching
; *_rect2 routine. The routine finishes with main_end2 (m11 = pd_1) and
; tail-jumps into main_end3, which returns on behalf of .pass1.
2973.pass1:
2974    pmulld               m0, m12, [cq+128* 0]
2975    pmulld               m1, m12, [cq+128* 2]
2976    pmulld               m2, m12, [cq+128* 4]
2977    pmulld               m3, m12, [cq+128* 6]
2978    pmulld               m4, m12, [cq+128* 8]
2979    pmulld               m5, m12, [cq+128*10]
2980    pmulld               m6, m12, [cq+128*12]
2981    pmulld               m7, m12, [cq+128*14]
2982    call m(idct_8x16_internal_10bpc).main_rect2
2983    pmulld              m16, m12, [cq+128* 1]
2984    pmulld              m17, m12, [cq+128* 3]
2985    pmulld              m18, m12, [cq+128* 5]
2986    pmulld              m19, m12, [cq+128* 7]
2987    pmulld              m20, m12, [cq+128* 9]
2988    pmulld              m21, m12, [cq+128*11]
2989    pmulld              m22, m12, [cq+128*13]
2990    pmulld              m23, m12, [cq+128*15]
2991    call m(idct_16x16_internal_10bpc).main_rect2
2992    vpbroadcastd        m11, [o(pd_1)]
2993    call m(idct_16x16_internal_10bpc).main_end2
2994    jmp m(idct_16x16_internal_10bpc).main_end3
2995.dconly:
        ; DC-only path: r6d = c[0]*181. eobd is zero here, so the store clears
        ; c[0]. r3d becomes 32 (presumably the row count used by the shared
        ; dconly code - confirm there).
2996    imul                r6d, [cq], 181
2997    mov                [cq], eobd
2998    or                  r3d, 32
2999    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
3000
; inv_txfm_add_identity_identity_16x32_10bpc(dst, stride, c, eob)
;
; Register setup: m10 = pw_2896x8, m11 = pw_1697x16, m12 = pw_16384 (m13+m13),
; m13 = pw_8192, m14 = 0, m15 = pixel_10bpc_max, r6 = stride*9.
; One run of .main covers 16 destination rows: .write_16x2x2 stores rows
; 0, 1, 8 and 9 relative to dstq and is invoked four times, advancing dstq
; by two rows each time. When eob >= 151, .main is run twice: first via
; call, then by fall-through with cq moved to the other 64-byte half of the
; rows and dstq moved 8 further rows down.
3001cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
3002%undef cmp
3003    vpbroadcastd        m10, [pw_2896x8]
3004    vpbroadcastd        m11, [pw_1697x16]
3005    vpbroadcastd        m13, [pw_8192]
3006    vpbroadcastd        m15, [pixel_10bpc_max]
3007    lea                  r6, [strideq*9]
3008    pxor                m14, m14
3009    paddw               m12, m13, m13 ; pw_16384
3010    cmp                eobd, 151
3011    jl .main
3012    call .main
3013    add                  cq, 64-128*4 ; undo .main's +128*4 and step to the other 64-byte half
3014    lea                dstq, [dstq+strideq*8] ; skip the 8 rows already written as rows 8..15
3015.main:
3016    call .main_internal
3017    add                  cq, 128*4
        ; Scale the first batch into the odd registers so that the even
        ; registers are free for the second .main_internal call.
3018    pmulhrsw             m1, m13, m2
3019    pmulhrsw             m3, m13, m4
3020    pmulhrsw             m5, m13, m6
3021    pmulhrsw             m7, m13, m8
3022    call .main_internal
3023.main2:
3024    pmulhrsw             m2, m13
3025    pmulhrsw             m4, m13
3026    pmulhrsw             m6, m13
3027    pmulhrsw             m8, m13
3028    punpcklqdq           m0, m1, m2 ;  0  8
3029    punpckhqdq           m1, m2     ;  1  9
3030    call .write_16x2x2
3031    punpcklqdq           m0, m3, m4 ;  2 10
3032    punpckhqdq           m1, m3, m4 ;  3 11
3033    call .write_16x2x2
3034    punpcklqdq           m0, m5, m6 ;  4 12
3035    punpckhqdq           m1, m5, m6 ;  5 13
3036    call .write_16x2x2
3037    punpcklqdq           m0, m7, m8 ;  6 14
3038    punpckhqdq           m1, m7, m8 ;  7 15
        ; Falls through: the final store's ret returns to .main's caller.
3039.write_16x2x2:
        ; Adds m0 to dst rows 0 and 8 and m1 to dst rows 1 and 9 (low/high
        ; 256-bit halves), clamps to [m14, m15] and advances dstq by two rows.
        ; Clobbers m2 and m9.
3040    mova                ym2, [dstq+strideq*0]
3041    vinserti32x8         m2, [dstq+strideq*8], 1
3042    mova                ym9, [dstq+strideq*1]
3043    vinserti32x8         m9, [dstq+r6       ], 1
3044    paddw                m0, m2
3045    paddw                m1, m9
3046    pmaxsw               m0, m14
3047    pmaxsw               m1, m14
3048    pminsw               m0, m15
3049    pminsw               m1, m15
3050    mova          [dstq+strideq*0], ym0
3051    vextracti32x8 [dstq+strideq*8], m0, 1
3052    mova          [dstq+strideq*1], ym1
3053    vextracti32x8 [dstq+r6       ], m1, 1
3054    lea                dstq, [dstq+strideq*2]
3055    ret
3056.main_internal:
        ; Loads coefficient rows 0-3 and 8-11 (128-byte pitch), packs each
        ; pair to words, scales by m10, then computes
        ; x + pmulhrsw(pmulhrsw(x, m11), m12) with saturation, zeroes the rows
        ; it read, and interleaves the result into m2, m4, m6 and m8.
        ; Clobbers m0 and m9.
3057    mova                 m8, [cq+128* 0]
3058    packssdw             m8, [cq+128* 8]
3059    mova                 m6, [cq+128* 1]
3060    packssdw             m6, [cq+128* 9]
3061    mova                 m0, [cq+128* 2]
3062    packssdw             m0, [cq+128*10]
3063    mova                 m2, [cq+128* 3]
3064    packssdw             m2, [cq+128*11]
3065    REPX  {pmulhrsw x, m10}, m8, m6, m0, m2
3066    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
3067    pmulhrsw             m4, m11, m8
3068    pmulhrsw             m9, m11, m6
3069    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
3070    pmulhrsw             m4, m12
3071    pmulhrsw             m9, m12
3072    paddsw               m8, m4
3073    paddsw               m6, m9
3074    pmulhrsw             m4, m11, m0
3075    pmulhrsw             m9, m11, m2
3076    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
3077    pmulhrsw             m4, m12
3078    pmulhrsw             m9, m12
3079    paddsw               m0, m4
3080    paddsw               m2, m9
3081    punpcklwd            m4, m8, m6
3082    punpckhwd            m8, m6
3083    punpcklwd            m6, m0, m2
3084    punpckhwd            m0, m2
3085    punpckldq            m2, m4, m6 ; 0 1
3086    punpckhdq            m4, m6     ; 2 3
3087    punpckldq            m6, m8, m0 ; 4 5
3088    punpckhdq            m8, m0     ; 6 7
3089    ret
3090
; Inverse DCT_DCT 32x16 transform, added to the destination, 10 bpc (AVX-512).
; Arguments (named by cglobal): dst, stride, c (coefficient buffer), eob.
; cglobal requests 7 GPRs and 0 automatically saved vector registers.
; Dispatch on eob:
;   eob == 0   -> .dconly (DC coefficient only)
;   eob <  36  -> .fast   (reads only the low 256 bits of 64-byte slots 0-7)
;   eob <  151 -> first pass through the *_fast_rect2 variants (slots 0-15)
;   otherwise  -> .full   (all 32 slots)
; The first pass works on 32-bit data clamped to clip_18b_min/max; its output
; is packed to 16 bits and the second pass reuses the 8bpc routines (r5 is
; switched to o_base_8bpc before calling them).
; r6d is the counter for .zero_loop, which clears the coefficient buffer.
3091cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
3092%undef cmp
3093    lea                  r5, [o_base]
3094    test               eobd, eobd
3095    jz .dconly
3096    vpbroadcastd        m12, [o(pd_2896)]
3097    vpbroadcastd        m13, [o(pd_2048)]
3098    vpbroadcastd        m14, [o(clip_18b_min)]
3099    vpbroadcastd        m15, [o(clip_18b_max)]
; NOTE(review): physical xmm6/xmm7 are saved by hand because cglobal was asked
; to save no vector registers. [rsp+8..rsp+39] is presumably the caller's
; WIN64 shadow space, and x86inc's AVX-512 register permutation presumably
; maps m22/m23 onto xmm6/xmm7 -- confirm against x86inc.asm.
3100%if WIN64
3101    movaps         [rsp+ 8], xmm6
3102    movaps         [rsp+24], xmm7
3103%endif
; r6 = 8*12: .zero_loop will clear 64-byte slots 0-15 (1024 bytes).
3104    mov                 r6d, 8*12
3105    cmp                eobd, 36
3106    jl .fast
; Coefficients are pre-multiplied by 2896; the rounding and shift are not done
; here (presumably in the *_rect2 routines called below -- TODO confirm).
3107    pmulld               m0, m12, [cq+64* 0]
3108    pmulld               m1, m12, [cq+64* 4]
3109    pmulld               m2, m12, [cq+64* 8]
3110    pmulld               m3, m12, [cq+64*12]
3111    pmulld              m16, m12, [cq+64* 2]
3112    pmulld              m17, m12, [cq+64* 6]
3113    pmulld              m18, m12, [cq+64*10]
3114    pmulld              m19, m12, [cq+64*14]
3115    cmp                eobd, 151
3116    jge .full
; Medium path: only slots 0-15 are read.
3117    call m(idct_8x16_internal_10bpc).main_fast_rect2
3118    call m(idct_16x16_internal_10bpc).main_fast_rect2
3119    call .idct16_sumsub
3120    call .pass1_load_spill
3121    call .main_fast_rect2
3122    jmp .pass1_end
3123.full:
3124    pmulld               m4, m12, [cq+64*16]
3125    pmulld               m5, m12, [cq+64*20]
3126    pmulld               m6, m12, [cq+64*24]
3127    pmulld               m7, m12, [cq+64*28]
3128    pmulld              m20, m12, [cq+64*18]
3129    pmulld              m21, m12, [cq+64*22]
3130    pmulld              m22, m12, [cq+64*26]
3131    pmulld              m23, m12, [cq+64*30]
; r6 = 8*28: .zero_loop will clear all 32 slots (2048 bytes).
3132    add                 r6d, 8*16
3133    call m(idct_8x16_internal_10bpc).main_rect2
3134    call m(idct_16x16_internal_10bpc).main_rect2
3135    call .idct16_sumsub
3136    call .pass1_load_spill
; .pass1_load_spill loaded odd slots 1-15 into m0-m7; load odd slots 17-31 here.
3137    pmulld              m16, m12, [cq+64*17]
3138    pmulld              m17, m12, [cq+64*19]
3139    pmulld              m18, m12, [cq+64*21]
3140    pmulld              m19, m12, [cq+64*23]
3141    pmulld              m20, m12, [cq+64*25]
3142    pmulld              m21, m12, [cq+64*27]
3143    pmulld              m22, m12, [cq+64*29]
3144    pmulld              m23, m12, [cq+64*31]
3145    call .main_rect2
3146.pass1_end:
; m11 = 1 is used by .idct32_pass1_end both as rounding bias and shift count.
; r4 points at the odd 64-byte slots written by .pass1_load_spill.
3147    vpbroadcastd        m11, [o(pd_1)]
3148    lea                  r4, [cq+64]
3149    call .idct32_pass1_end
; From here on the 8bpc routines are used, so r5 must be their table base.
3150    lea                  r5, [o_base_8bpc]
3151    punpckhqdq          m19, m5, m16  ; 11
3152    punpcklqdq           m5, m16      ; 10
3153    punpckhqdq          m16, m2, m1   ;  5
3154    punpcklqdq           m2, m1       ;  4
3155    punpcklqdq           m1, m15, m4  ;  2
3156    punpckhqdq          m15, m4       ;  3
3157    punpcklqdq           m4, m14, m18 ;  8
3158    punpckhqdq          m18, m14, m18 ;  9
3159    punpckhqdq          m14, m0, m20  ;  1
3160    punpcklqdq           m0, m20      ;  0
3161    punpckhqdq          m20, m6, m17  ; 13
3162    punpcklqdq           m6, m17      ; 12
3163    punpckhqdq          m17, m3, m21  ;  7
3164    punpcklqdq           m3, m21      ;  6
3165    punpckhqdq          m21, m7, m8   ; 15
3166    punpcklqdq           m7, m8       ; 14
3167    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
3168    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
3169    jmp .end
3170.fast:
; Only the low 256 bits of slots 0-7 are read. Slots 0 and 4 are scaled as
; (x*2896 + 2048) >> 12 in ymm form before being rearranged by vpermq; the
; other inputs are combined with vpermt2q first and scaled afterwards.
3171    pmulld              ym0, ym12, [cq+64*0]
3172    pmulld              ym1, ym12, [cq+64*4]
3173    movshdup             m7, [o(permB)]
3174    mova                ym4, [cq+64*2]
3175    mova                ym5, [cq+64*6]
3176    mova               ym16, [cq+64*1]
3177    mova                ym2, [cq+64*5]
3178    mova                ym3, [cq+64*3]
3179    mova               ym17, [cq+64*7]
3180    vpermt2q             m4, m7, m5 ;  2  6
3181    vpermt2q            m16, m7, m2 ;  1  5
3182    vpermt2q            m17, m7, m3 ;  7  3
3183    paddd               ym0, ym13
3184    paddd               ym1, ym13
3185    psrad               ym0, 12
3186    psrad               ym1, 12
3187    vpermq               m0, m7, m0 ;  0  0
3188    vpermq               m1, m7, m1 ;  4  4
3189    REPX    {pmulld x, m12}, m4, m16, m17
3190    REPX    {paddd  x, m13}, m4, m16, m17
3191    REPX    {psrad  x, 12 }, m4, m16, m17
3192    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
3193    vpbroadcastd        m11, [o(pd_1)]
3194    call m(idct_16x16_internal_10bpc).main_end2
3195    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
3196    lea                  r5, [o_base_8bpc]
3197    punpckhqdq          m14, m0, m2 ; 1
3198    punpcklqdq           m0, m2     ; 0
3199    punpcklqdq           m1, m3, m4 ; 2
3200    punpckhqdq          m15, m3, m4 ; 3
3201    punpcklqdq           m2, m5, m7 ; 4
3202    punpckhqdq          m16, m5, m7 ; 5
3203    punpcklqdq           m3, m6, m8 ; 6
3204    punpckhqdq          m17, m6, m8 ; 7
3205    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
3206.end:
3207%if WIN64
3208    movaps             xmm6, [rsp+ 8]
3209    movaps             xmm7, [rsp+24]
3210%endif
3211    pxor                m12, m12
; Clear the coefficient buffer: each iteration zeroes four 64-byte slots and
; steps r6 down by 8*4 (256 bytes, since the index is scaled by 8). The loop
; runs until r6 goes negative, so the slots at offset 0 are included.
3212.zero_loop:
3213    mova     [cq+r6*8+64*3], m12
3214    mova     [cq+r6*8+64*2], m12
3215    mova     [cq+r6*8+64*1], m12
3216    mova     [cq+r6*8+64*0], m12
3217    sub                 r6d, 8*4
3218    jge .zero_loop
; NOTE(review): m11 is used below as the output scale but is not set in this
; function after the 8bpc calls; presumably write_32x8_start loads it -- confirm.
3219    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
3220    pmulhrsw             m0, m11, m14
3221    pmulhrsw             m1, m11, m15
3222    pmulhrsw             m2, m11, m16
3223    pmulhrsw             m3, m11, m17
3224    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3225    pmulhrsw             m0, m11, m18
3226    pmulhrsw             m1, m11, m19
3227    pmulhrsw             m2, m11, m20
3228    pmulhrsw             m3, m11, m21
; NOTE(review): vzeroupper runs while m0-m3 are still live for the tail call.
; This is presumably safe because x86inc maps m0-m15 onto zmm16-zmm31 for
; AVX-512, which vzeroupper does not touch -- confirm against x86inc.asm.
3229    vzeroupper
3230    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 ; tail call; its return leaves this function
3231.dconly:
; DC-only path. eobd is zero here (the jz above was taken), so storing it
; clears the DC coefficient, and OR-ing r3d (the eob register) with 16 turns
; it into the row count for .dconly_loop.
3232    imul                r6d, [cq], 181
3233    mov                [cq], eobd
3234    or                  r3d, 16
; .dconly3: expects r6d = dc*181 and r3d = row count.
; Computes ((r6d + 128) >> 8) * 181, then (x + 384) >> 9.
3235.dconly3:
3236    add                 r6d, 128
3237    sar                 r6d, 8
3238    imul                r6d, 181
3239    add                 r6d, 384
3240    sar                 r6d, 9
; .dconly2: expects r6d = intermediate DC value and r3d = row count.
; Computes (r6d*181 + 2176) >> 12 and broadcasts it as a word.
3241.dconly2:
3242    vpbroadcastd         m3, [o(dconly_10bpc)]
3243    imul                r6d, 181
3244    add                 r6d, 2176
3245    sar                 r6d, 12
3246    vpbroadcastw         m2, r6d
3247    paddsw               m2, m3
; Two 64-byte rows per iteration. The pixel gets (dc + m3) added with signed
; saturation, then m3 subtracted with unsigned saturation.
; NOTE(review): dconly_10bpc is presumably a bias chosen so these two
; saturating steps clamp the result to the valid pixel range -- confirm.
3248.dconly_loop:
3249    paddsw               m0, m2, [dstq+strideq*0]
3250    paddsw               m1, m2, [dstq+strideq*1]
3251    psubusw              m0, m3
3252    psubusw              m1, m3
3253    mova   [dstq+strideq*0], m0
3254    mova   [dstq+strideq*1], m1
3255    lea                dstq, [dstq+strideq*2]
3256    sub                 r3d, 2
3257    jg .dconly_loop
3258    RET
3259ALIGN function_align
; .idct16_sumsub: final butterfly stage of the 16-point idct.
; Inputs:  m0-m7 paired with m22, m21, m20, m19, m18, m17, m16, m9 in that order.
; Outputs: sums t0-t7 in m0-m7, differences t15-t8 in m23-m16.
; Every result is clamped to [m14, m15] with signed dword max/min. The function
; entry loads those registers with clip_18b_min / clip_18b_max.
; The clamps for one group of four are interleaved with the add/sub of the
; next group.
3260.idct16_sumsub:
3261    psubd               m23, m0, m22 ; t15
3262    paddd                m0, m22     ; t0
3263    psubd               m22, m1, m21 ; t14
3264    paddd                m1, m21     ; t1
3265    REPX    {pmaxsd x, m14}, m23, m0, m22, m1
3266    psubd               m21, m2, m20 ; t13
3267    paddd                m2, m20     ; t2
3268    REPX    {pminsd x, m15}, m23, m0, m22, m1
3269    psubd               m20, m3, m19 ; t12
3270    paddd                m3, m19     ; t3
3271    REPX    {pmaxsd x, m14}, m21, m2, m20, m3
3272    psubd               m19, m4, m18 ; t11
3273    paddd                m4, m18     ; t4
3274    REPX    {pminsd x, m15}, m21, m2, m20, m3
3275    psubd               m18, m5, m17 ; t10
3276    paddd                m5, m17     ; t5
3277    REPX    {pmaxsd x, m14}, m19, m4, m18, m5
3278    psubd               m17, m6, m16 ; t9
3279    paddd                m6, m16     ; t6
3280    REPX    {pminsd x, m15}, m19, m4, m18, m5
3281    psubd               m16, m7, m9  ; t8
3282    paddd                m7, m9      ; t7
3283    REPX    {pmaxsd x, m14}, m17, m6, m16, m7
3284    REPX    {pminsd x, m15}, m17, m6, m16, m7
3285    ret
; .idct32_pass1_end: combine the spilled idct16 half with the idct32 odd half
; still in registers, round, shift and pack to 16 bits. It then falls through
; into .transpose_16x32.
; Expects: r4 = cq+64, and m11 acting as both rounding bias (paddd) and
; per-element shift count (vpsravd). The caller in this function loads m11
; with pd_1, giving (x + 1) >> 1.
; m12/m13 are overwritten with qword permutation indices derived from permC;
; they are consumed by .transpose_16x32.
3286.idct32_pass1_end:
3287    psrlq               m12, [o(permC)], 24 ;  0  2  8 10  1  3  9 11
3288    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
; IDCT32_PASS1_END low, high
;   Slot n = %1 of the spill area is read at two addresses: [cq+128*n] and
;   [r4+128*n]. .pass1_load_spill stores m_n at the former and m(23-n) at the
;   latter when r4 = cq+64.
;   m%1 and m%2 are the odd-half registers combined with those two values.
;   Results: m%1 = packed (out 0+n, out 16+n), m%2 = packed (out 15-n, out 31-n).
;   Clobbers m8, m9, m10.
3289%macro IDCT32_PASS1_END 2 ; low, high
3290    paddd                m8, m11, [r4+128*%1]
3291    paddd                m9, m11, [cq+128*%1]
3292    psubd               m10, m8, m%1  ; out 16+n
3293    paddd                m8, m%1      ; out 15-n
3294    paddd               m%1, m9, m%2  ; out  0+n
3295    psubd                m9, m%2      ; out 31-n
3296    REPX   {vpsravd x, m11}, m10, m%1, m8, m9
3297    packssdw            m%1, m10      ;  0+n 16+n
3298    packssdw            m%2, m8, m9   ; 15-n 31-n
3299%endmacro
3300    IDCT32_PASS1_END      0, 23       ;  0 16, 15 31
3301    IDCT32_PASS1_END      7, 16       ;  7 23,  8 24
3302    IDCT32_PASS1_END      1, 22       ;  1 17, 14 30
3303    IDCT32_PASS1_END      6, 17       ;  6 22,  9 25
3304    IDCT32_PASS1_END      2, 21       ;  2 18, 13 29
3305    IDCT32_PASS1_END      5, 18       ;  5 21, 10 26
3306    IDCT32_PASS1_END      3, 20       ;  3 19, 12 28
3307    IDCT32_PASS1_END      4, 19       ;  4 20, 11 27
; .transpose_16x32: transpose the packed 16-bit data in m0-m7 / m16-m23.
; Step 1: for each pair (m_n, m_(16+n)), build two two-source qword permutes,
;         using m12 as the index for the result kept in m_n and a copy of m13
;         as the index for the result written to m14-m21.
; Step 2: interleave words (punpck[lh]wd).
; Step 3: interleave dwords (punpck[lh]dq).
; The trailing comments name the elements held by each register (letter = row,
; number = column, as annotated by the original authors).
; Expects m12/m13 to hold the permutation indices set up in .idct32_pass1_end.
3308.transpose_16x32:
3309    mova                m14, m13
3310    vpermi2q            m14, m0, m16
3311    vpermt2q             m0, m12, m16
3312    mova                m15, m13
3313    vpermi2q            m15, m1, m17
3314    vpermt2q             m1, m12, m17
3315    mova                m16, m13
3316    vpermi2q            m16, m2, m18
3317    vpermt2q             m2, m12, m18
3318    mova                m17, m13
3319    vpermi2q            m17, m3, m19
3320    vpermt2q             m3, m12, m19
3321    mova                m18, m13
3322    vpermi2q            m18, m4, m20
3323    vpermt2q             m4, m12, m20
3324    mova                m19, m13
3325    vpermi2q            m19, m5, m21
3326    vpermt2q             m5, m12, m21
3327    mova                m20, m13
3328    vpermi2q            m20, m6, m22
3329    vpermt2q             m6, m12, m22
3330    mova                m21, m13
3331    vpermi2q            m21, m7, m23
3332    vpermt2q             m7, m12, m23
3333    punpckhwd            m8, m2, m3   ; c04 d04 c05 d05 c06 d06 c07 d07
3334    punpcklwd            m2, m3       ; c00 d00 c01 d01 c02 d02 c03 d03
3335    punpckhwd            m3, m0, m1   ; a04 b04 a05 b05 a06 b06 a07 b07
3336    punpcklwd            m0, m1       ; a00 b00 a01 b01 a02 b02 a03 b03
3337    punpckhwd            m1, m4, m5   ; e04 f04 e05 f05 e06 f06 e07 f07
3338    punpcklwd            m4, m5       ; e00 f00 e01 f01 e02 f02 e03 f03
3339    punpckhwd            m5, m6, m7   ; g04 h04 g05 h05 g06 h06 g07 h07
3340    punpcklwd            m6, m7       ; g00 h00 g01 h01 g02 h02 g03 h03
3341    punpckhwd            m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15
3342    punpcklwd           m14, m15      ; a08 b08 a09 b09 a10 b10 a11 b11
3343    punpckhwd           m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15
3344    punpcklwd           m16, m17      ; c08 d08 c09 d09 c10 d10 c11 d11
3345    punpckhwd           m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15
3346    punpcklwd           m18, m19      ; e08 f08 e09 f09 e10 f10 e11 f11
3347    punpckhwd           m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15
3348    punpcklwd           m20, m21      ; g08 h08 g09 h09 g10 h10 g11 h11
3349    punpckhdq           m21, m1, m5   ; e06 f06 g06 h06 e07 f07 g07 h07
3350    punpckldq            m1, m5       ; e04 f04 g04 h04 e05 f05 g05 h05
3351    punpckhdq            m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11
3352    punpckldq           m14, m16      ; a08 b08 c08 d08 a09 b09 c09 d09
3353    punpckhdq           m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11
3354    punpckldq           m18, m20      ; e08 f08 g08 h08 e09 f09 g09 h09
3355    punpckldq           m20, m4, m6   ; e00 f00 g00 h00 e01 f01 g01 h01
3356    punpckhdq            m4, m6       ; e02 f02 g02 h02 e03 f03 g03 h03
3357    punpckldq            m6, m7, m15  ; a12 b12 c12 d12 a13 b13 c13 d13
3358    punpckhdq            m7, m15      ; a14 b14 c14 d14 a15 b15 c15 d15
3359    punpckhdq           m15, m0, m2   ; a02 b02 c02 d02 a03 b03 c03 d03
3360    punpckldq            m0, m2       ; a00 b00 c00 d00 a01 b01 c01 d01
3361    punpckldq            m2, m3, m8   ; a04 b04 c04 d04 a05 b05 c05 d05
3362    punpckhdq            m3, m8       ; a06 b06 c06 d06 a07 b07 c07 d07
3363    punpckhdq            m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15
3364    punpckldq           m17, m19      ; e12 f12 g12 h12 e13 f13 g13 h13
3365    ret
; .pass1_load_spill: swap the finished idct16 results out to the coefficient
; buffer and bring in the next batch of inputs, reusing the buffer as scratch.
;   1. Store m0-m7 to the even 64-byte slots 0, 2, ..., 14 (already consumed).
;   2. Load the odd slots 1, 3, ..., 15 into m0-m7, multiplied by m12.
;   3. Store m23, m22, ..., m16 to those odd slots 1, 3, ..., 15.
; Step 3 must follow step 2 because it overwrites the inputs step 2 reads.
; Expects m12 to hold the multiplier (pd_2896 as loaded at function entry).
3366.pass1_load_spill:
3367    mova         [cq+64* 0], m0
3368    mova         [cq+64* 2], m1
3369    mova         [cq+64* 4], m2
3370    mova         [cq+64* 6], m3
3371    mova         [cq+64* 8], m4
3372    mova         [cq+64*10], m5
3373    mova         [cq+64*12], m6
3374    mova         [cq+64*14], m7
3375    pmulld               m0, m12, [cq+64* 1]
3376    pmulld               m1, m12, [cq+64* 3]
3377    pmulld               m2, m12, [cq+64* 5]
3378    pmulld               m3, m12, [cq+64* 7]
3379    pmulld               m4, m12, [cq+64* 9]
3380    pmulld               m5, m12, [cq+64*11]
3381    pmulld               m6, m12, [cq+64*13]
3382    pmulld               m7, m12, [cq+64*15]
3383    mova         [cq+64* 1], m23
3384    mova         [cq+64* 3], m22
3385    mova         [cq+64* 5], m21
3386    mova         [cq+64* 7], m20
3387    mova         [cq+64* 9], m19
3388    mova         [cq+64*11], m18
3389    mova         [cq+64*13], m17
3390    mova         [cq+64*15], m16
3391    ret
; Odd half of the 32-point inverse DCT (t16..t31), on 32-bit elements.
; Entry points, all ending at the single ret below:
;   .main_fast2_rect2  rounds m0-m3 as (x + m13) >> 12, then falls into .main_fast2
;   .main_fast2        only the inputs in m0-m3 are used
;   .main_fast_rect2   calls idct_8x16's .round, then falls into .main_fast
;   .main_fast         the inputs in m0-m7 are used
;   .main_rect2        calls both .round helpers, then falls into .main
;   .main              full set of inputs (m0-m7 and m16-m23)
;   .main2 / .main3    shared later stages
; Expects m12 = pd_2896, m13 = pd_2048 (rounding bias), m14/m15 = clamp range.
; m8-m11 are used as scratch.
; The existing trailing comments name the intermediate (tNN) held by each
; register after the instruction.
3392.main_fast2_rect2:
3393    REPX     {paddd x, m13}, m0, m1, m2, m3
3394    REPX     {psrad x, 12 }, m0, m1, m2, m3
3395.main_fast2: ; bottom 3/4 is zero
3396    pmulld              m23, m0, [o(pd_4091)] {1to16} ; t31a
3397    pmulld               m0, [o(pd_201)] {1to16}      ; t16a
3398    pmulld              m20, m3, [o(pd_1380)] {1to16} ; t19a
3399    pmulld               m3, [o(pd_3857)] {1to16}     ; t28a
3400    pmulld              m21, m2, [o(pd_3973)] {1to16} ; t27a
3401    pmulld               m2, [o(pd_995)] {1to16}      ; t20a
3402    pmulld               m6, m1, [o(pd_601)] {1to16}  ; t23a
3403    pmulld              m17, m1, [o(pd_4052)] {1to16} ; t24a
; psubd x, m13, x computes (bias - x): these two products enter negated.
3404    REPX  {psubd x, m13, x}, m20, m6
3405    REPX    {paddd  x, m13}, m23, m0, m3, m21, m2, m17
3406    REPX    {psrad  x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17
; With only four inputs, the first butterfly stage has nothing to add or
; subtract, so each value is simply copied into its partner register and the
; code jumps straight to .main3.
3407    mova                 m8, m0
3408    mova                m16, m23
3409    mova                 m7, m20
3410    mova                 m4, m3
3411    mova                m19, m2
3412    mova                m18, m21
3413    mova                 m5, m6
3414    mova                m22, m17
3415    jmp .main3
3416.main_fast_rect2:
3417    call m(idct_8x16_internal_10bpc).round
3418.main_fast: ; bottom half is zero
3419    pmulld              m23, m0, [o(pd_4091)] {1to16} ; t31a
3420    pmulld               m0, [o(pd_201)] {1to16}      ; t16a
3421    pmulld              m16, m7, [o(pd_2751)] {1to16} ; t17a
3422    pmulld               m7, [o(pd_3035)] {1to16}     ; t30a
3423    pmulld              m19, m4, [o(pd_3703)] {1to16} ; t29a
3424    pmulld               m4, [o(pd_1751)] {1to16}     ; t18a
3425    pmulld              m20, m3, [o(pd_1380)] {1to16} ; t19a
3426    pmulld               m3, [o(pd_3857)] {1to16}     ; t28a
3427    pmulld              m21, m2, [o(pd_3973)] {1to16} ; t27a
3428    pmulld               m2, [o(pd_995)] {1to16}      ; t20a
3429    pmulld              m18, m5, [o(pd_2106)] {1to16} ; t21a
3430    pmulld               m5, [o(pd_3513)] {1to16}     ; t26a
3431    pmulld              m17, m6, [o(pd_3290)] {1to16} ; t25a
3432    pmulld               m6, [o(pd_2440)] {1to16}     ; t22a
3433    pmulld              m22, m1, [o(pd_601)] {1to16}  ; t23a
3434    pmulld               m1, [o(pd_4052)] {1to16}     ; t24a
; Same (bias - x) form as in .main_fast2 for the four negated products; the
; remaining rounding is left to the .round3 / .round helpers.
3435    REPX  {psubd x, m13, x}, m16, m20, m18, m22
3436    call m(idct_16x16_internal_10bpc).round3
3437    jmp .main2
3438.main_rect2:
3439    call m(idct_8x16_internal_10bpc).round
3440    call m(idct_16x16_internal_10bpc).round
3441.main:
3442    ITX_MULSUB_2D         0, 23,  8,  9, 10, _,  201, 4091 ; t16a, t31a
3443    ITX_MULSUB_2D        16,  7,  8,  9, 10, _, 3035, 2751 ; t17a, t30a
3444    ITX_MULSUB_2D         4, 19,  8,  9, 10, _, 1751, 3703 ; t18a, t29a
3445    ITX_MULSUB_2D        20,  3,  8,  9, 10, _, 3857, 1380 ; t19a, t28a
3446    ITX_MULSUB_2D         2, 21,  8,  9, 10, _,  995, 3973 ; t20a, t27a
3447    ITX_MULSUB_2D        18,  5,  8,  9, 10, _, 3513, 2106 ; t21a, t26a
3448    ITX_MULSUB_2D         6, 17,  8,  9, 10, _, 2440, 3290 ; t22a, t25a
3449    ITX_MULSUB_2D        22,  1,  8,  9, 10, _, 4052,  601 ; t23a, t24a
3450    call m(idct_16x16_internal_10bpc).round
3451.main2:
3452    call m(idct_8x16_internal_10bpc).round
; First butterfly stage; each group of four results is clamped to [m14, m15],
; with the clamps interleaved between the next group's add/sub.
3453    psubd                m8, m0, m16  ; t17
3454    paddd                m0, m16      ; t16
3455    psubd               m16, m23, m7  ; t30
3456    paddd               m23, m7       ; t31
3457    REPX    {pmaxsd x, m14}, m8, m0, m16, m23
3458    paddd                m7, m20, m4  ; t19
3459    psubd               m20, m4       ; t18
3460    REPX    {pminsd x, m15}, m8, m0, m16, m23
3461    paddd                m4, m3, m19  ; t28
3462    psubd                m3, m19      ; t29
3463    REPX    {pmaxsd x, m14}, m7, m20, m4, m3
3464    psubd               m19, m2, m18  ; t21
3465    paddd                m2, m18      ; t20
3466    REPX    {pminsd x, m15}, m7, m20, m4, m3
3467    psubd               m18, m21, m5  ; t26
3468    paddd               m21, m5       ; t27
3469    REPX    {pmaxsd x, m14}, m19, m2, m18, m21
3470    psubd                m5, m22, m6  ; t22
3471    paddd                m6, m22      ; t23
3472    REPX    {pminsd x, m15}, m19, m2, m18, m21
3473    psubd               m22, m1, m17  ; t25
3474    paddd               m17, m1       ; t24
3475    REPX    {pmaxsd x, m14}, m5, m6, m22, m17
3476    REPX    {pminsd x, m15}, m5, m6, m22, m17
3477.main3:
; Rotations with the constant pair (799, 4017), passed in m10/m11.
3478    vpbroadcastd        m11, [o(pd_4017)]
3479    vpbroadcastd        m10, [o(pd_799)]
3480    ITX_MULSUB_2D        16,  8, 9, 1, _, 13, 10, 11    ; t17a, t30a
3481    ITX_MULSUB_2D         3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
; Rotations with the constant pair (3406, 2276).
3482    vpbroadcastd        m11, [o(pd_2276)]
3483    vpbroadcastd        m10, [o(pd_3406)]
3484    ITX_MULSUB_2D        18, 19, 9, 1, _, 13, 10, 11    ; t21a, t26a
3485    ITX_MULSUB_2D        22,  5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a
3486    paddd                m1, m6, m2   ; t23a
3487    psubd                m6, m2       ; t20a
3488    psubd                m2, m17, m21 ; t27a
3489    paddd               m17, m21      ; t24a
3490    REPX    {pmaxsd x, m14}, m1, m6, m2, m17
3491    psubd               m21, m23, m4  ; t28a
3492    paddd               m23, m4       ; t31a
3493    REPX    {pminsd x, m15}, m1, m6, m2, m17
3494    psubd                m4, m16, m20 ; t18
3495    paddd               m16, m20      ; t17
3496    REPX    {pmaxsd x, m14}, m21, m23, m4, m16
3497    psubd               m20, m0, m7   ; t19a
3498    paddd                m0, m7       ; t16a
3499    REPX    {pminsd x, m15}, m21, m23, m4, m16
3500    psubd                m7, m8, m3   ; t29
3501    paddd                m3, m8       ; t30
3502    REPX    {pmaxsd x, m14}, m20, m0, m7, m3
3503    paddd                m8, m5, m18  ; t22
3504    psubd                m5, m18      ; t21
3505    REPX    {pminsd x, m15}, m20, m0, m7, m3
3506    psubd               m18, m22, m19 ; t26
3507    paddd               m22, m19      ; t25
3508    REPX    {pmaxsd x, m14}, m8, m5, m18, m22
; Rotations with the constant pair (1567, 3784); the constants are loaded
; between the max and min clamps of the previous group.
3509    vpbroadcastd        m11, [o(pd_3784)]
3510    vpbroadcastd        m10, [o(pd_1567)]
3511    REPX    {pminsd x, m15}, m8, m5, m18, m22
3512    ITX_MULSUB_2D        21, 20, 9, 19, _, 13, 10, 11    ; t19,  t28
3513    ITX_MULSUB_2D         2,  6, 9, 19, _, 13, 10, 11, 2 ; t27,  t20
3514    ITX_MULSUB_2D         7,  4, 9, 19, _, 13, 10, 11    ; t18a, t29a
3515    ITX_MULSUB_2D        18,  5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
3516    psubd               m19, m0, m1   ; t23
3517    paddd                m0, m1       ; t16
3518    paddd                m1, m8, m16  ; t17a
3519    psubd                m8, m16, m8  ; t22a
3520    REPX    {pmaxsd x, m14}, m19, m0, m1, m8
3521    psubd               m16, m23, m17 ; t24
3522    paddd               m23, m17      ; t31
3523    REPX    {pminsd x, m15}, m19, m0, m1, m8
3524    psubd               m17, m3, m22  ; t25a
3525    paddd               m22, m3       ; t30a
3526    REPX    {pmaxsd x, m14}, m16, m23, m17, m22
3527    paddd                m3, m6, m21  ; t19a
3528    psubd                m6, m21, m6  ; t20a
3529    REPX    {pminsd x, m15}, m16, m23, m17, m22
3530    paddd               m21, m18, m4  ; t29
3531    psubd               m18, m4, m18  ; t26
3532    REPX    {pmaxsd x, m14}, m3, m6, m21, m18
3533    psubd                m4, m20, m2  ; t27a
3534    paddd               m20, m2       ; t28a
3535    REPX    {pminsd x, m15}, m3, m6, m21, m18
3536    paddd                m2, m7, m5   ; t18
3537    psubd                m7, m5       ; t21
3538    REPX    {pmaxsd x, m14}, m4, m20, m2, m7
3539    REPX    {pminsd x, m15}, m4, m20, m2, m7
; Final stage: eight values are multiplied by m12 (2896). The rounding bias
; m13 is added to only one operand of each add/sub pair, so every sum and
; difference carries the bias exactly once before the >> 12.
3540    REPX    {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
3541    REPX    {paddd  x, m13}, m18, m16, m4, m17
3542    psubd                m5, m18, m7  ; t21a
3543    paddd               m18, m7       ; t26a
3544    psubd                m7, m16, m19 ; t23a
3545    paddd               m16, m19      ; t24a
3546    REPX    {psrad  x, 12 }, m5, m18, m7, m16
3547    paddd               m19, m4, m6   ; t27
3548    psubd                m4, m6       ; t20
3549    psubd                m6, m17, m8  ; t22
3550    paddd               m17, m8       ; t25
3551    REPX    {psrad  x, 12 }, m19, m4, m6, m17
3552    ret
3553
; Inverse IDENTITY_IDENTITY 32x16 transform, added to the destination, 10 bpc.
; Arguments (named by cglobal): dst, stride, c (coefficient buffer), eob.
; Register setup shared with the 16x32 identity's .main2 / .write_16x2x2,
; which this function tail-jumps into:
;   m10 = pw_2896x8, m11 = pw_1697x16, m13 = pw_2048,
;   m14 = 0 (used to clear coefficients), m15 = pixel_10bpc_max,
;   r6 = stride*9.
; One .main pass consumes 64-byte coefficient slots 0-15 and writes one
; 32-byte-wide column strip. With eob >= 151 a second pass handles the next 16
; slots and writes 32 bytes further right in dst.
3554cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
3555%undef cmp
3556    vpbroadcastd        m10, [pw_2896x8]
3557    vpbroadcastd        m11, [pw_1697x16]
3558    vpbroadcastd        m13, [pw_2048]
3559    vpbroadcastd        m15, [pixel_10bpc_max]
3560    lea                  r6, [strideq*9]
3561    pxor                m14, m14
3562    cmp                eobd, 151
3563    jl .main
; Two passes: remember dst, because the write routine advances dstq.
3564    mov                  r4, dstq
3565    call .main
; .main already advanced cq by 64*4; adding 64*12 brings it to slot 16.
3566    add                  cq, 64*12
3567    lea                dstq, [r4+32]
3568.main:
3569    call .main_internal
3570    add                  cq, 64*4
; Scale the first batch by m13 and park it in m1/m3/m5/m7. The second batch is
; returned in m2/m4/m6/m8 and is scaled by the .main2 this jumps to.
3571    pmulhrsw             m1, m13, m2
3572    pmulhrsw             m3, m13, m4
3573    pmulhrsw             m5, m13, m6
3574    pmulhrsw             m7, m13, m8
3575    call .main_internal
3576    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
; .main_internal: load slots cq+64*{0..3} packed with cq+64*{8..11} into
; saturated 16-bit words, apply the identity scaling, clear the consumed slots
; and transpose.
; Scaling per element: x = pmulhrsw(x, m10); x *= 2; then
;                      x = 2*x + pmulhrsw(x, m11)   (all saturating adds).
; Output in m2, m4, m6, m8. Clobbers m0, m9.
3577.main_internal:
3578    mova                 m8, [cq+64* 0]
3579    packssdw             m8, [cq+64* 8]
3580    mova                 m6, [cq+64* 1]
3581    packssdw             m6, [cq+64* 9]
3582    mova                 m0, [cq+64* 2]
3583    packssdw             m0, [cq+64*10]
3584    mova                 m2, [cq+64* 3]
3585    packssdw             m2, [cq+64*11]
3586    REPX  {pmulhrsw x, m10}, m8, m6, m0, m2
3587    REPX  {paddsw   x, x  }, m8, m6, m0, m2
; packssdw works per 128-bit lane; q3120 swaps the middle qwords of each
; 256-bit half so the two packed sources become contiguous again.
3588    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
3589    pmulhrsw             m4, m11, m8
3590    pmulhrsw             m9, m11, m6
3591    paddsw               m8, m8
3592    paddsw               m6, m6
3593    REPX {mova [cq+64*x], m14}, 0, 1, 2, 3
3594    paddsw               m8, m4
3595    paddsw               m6, m9
3596    pmulhrsw             m4, m11, m0
3597    pmulhrsw             m9, m11, m2
3598    paddsw               m0, m0
3599    paddsw               m2, m2
3600    REPX {mova [cq+64*x], m14}, 8, 9, 10, 11
3601    paddsw               m0, m4
3602    paddsw               m2, m9
; Transpose: interleave words, then dwords.
3603    punpcklwd            m4, m8, m6
3604    punpckhwd            m8, m6
3605    punpcklwd            m6, m0, m2
3606    punpckhwd            m0, m2
3607    punpckldq            m2, m4, m6 ; 0 1
3608    punpckhdq            m4, m6     ; 2 3
3609    punpckldq            m6, m8, m0 ; 4 5
3610    punpckhdq            m8, m0     ; 6 7
3611    ret
3612
; Inverse DCT_DCT 32x32 transform, added to the destination, 10 bpc (AVX-512).
; Arguments (named by cglobal): dst, stride, c (coefficient buffer), eob.
; Dispatch on eob:
;   eob == 0   -> .dconly
;   eob <  136 -> .fast (which itself branches to .fast2 when eob < 36)
;   eob <  543 -> .pass1_fast on the cq+64 half, then the common .lefthalf code
;   otherwise  -> .pass1 on the cq+64 half (.full), then .lefthalf
; The coefficient buffer is addressed in 128-byte rows; cq+64 selects the
; second 64-byte half of each row.
; NOTE(review): .pass1 and .pass1_fast are defined outside this view; which
; registers they return results in is inferred from the stores below.
3613cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
3614%undef cmp
3615    lea                  r5, [o_base]
3616    test               eobd, eobd
3617    jz .dconly
3618    vpbroadcastd        m12, [o(pd_2896)]
3619    vpbroadcastd        m13, [o(pd_2048)]
3620    vpbroadcastd        m14, [o(clip_18b_min)]
3621    vpbroadcastd        m15, [o(clip_18b_max)]
3622    WIN64_SPILL_XMM      30
3623    cmp                eobd, 136
3624    jl .fast
3625    add                  cq, 64
3626    cmp                eobd, 543
3627    jge .full
3628    call .pass1_fast ; bottomright 16x16 zero
; r6 bounds .right_zero_loop: 16*12 covers rows 0-15 of the cq+64 half.
3629    mov                 r6d, 16*12
3630    jmp .lefthalf
3631.full:
3632    call .pass1
; r6 = 16*28 covers all 32 rows of the cq+64 half.
3633    mov                 r6d, 16*28
3634.lefthalf:
; Park the first-pass output for the cq+64 half back into that half of the
; buffer (cq is still offset by 64 here).
3635    mova        [cq+128* 0], m0
3636    mova        [cq+128* 1], m1
3637    mova        [cq+128* 2], m2
3638    mova        [cq+128* 3], m3
3639    mova        [cq+128* 4], m14
3640    mova        [cq+128* 5], m15
3641    mova        [cq+128* 6], m16
3642    mova        [cq+128* 7], m17
3643    mova        [cq+128* 8], m22
3644    mova        [cq+128* 9], m23
3645    mova        [cq+128*10], m24
3646    mova        [cq+128*11], m25
3647    mova        [cq+128*12], m26
3648    mova        [cq+128*13], m27
3649    mova        [cq+128*14], m28
3650    mova        [cq+128*15], m29
3651    sub                  cq, 64
; m14/m15 held results (stored above), so the constants are reloaded before
; the second .pass1 call. m12/m13 are reloaded as well (presumably clobbered
; by the first pass -- TODO confirm).
3652    vpbroadcastd        m12, [o(pd_2896)]
3653    vpbroadcastd        m13, [o(pd_2048)]
3654    vpbroadcastd        m14, [o(clip_18b_min)]
3655    vpbroadcastd        m15, [o(clip_18b_max)]
3656    call .pass1
; The second pass uses the 8bpc routines, which need the 8bpc table base.
3657    lea                  r5, [o_base_8bpc]
3658    call .pass2_start
3659    pxor                m12, m12
; Zero the cq+64 half of the buffer, four 128-byte rows per iteration
; (r6 steps by 16*4, scaled by 8 = 512 bytes).
3660.right_zero_loop:
3661    mova [cq+r6*8+64+128*3], m12
3662    mova [cq+r6*8+64+128*2], m12
3663    mova [cq+r6*8+64+128*1], m12
3664    mova [cq+r6*8+64+128*0], m12
3665    sub                 r6d, 16*4
3666    jge .right_zero_loop
; The half at offset 0 is cleared for all 32 rows by .zero_loop; m12 is
; already zero, so jump to .end2, which skips the pxor at .end.
3667    mov                 r6d, 16*28
3668    jmp .end2
; .pass2_start: begin the second pass (full-size path) using the 8bpc code.
;   1. Load rows 0-7 of the cq+64 half (saved by .lefthalf) into m4-m7 and
;      m18-m21, then call the 8bpc 32x8 .main and 32x16 .main_oddhalf.
;   2. Spill m14-m21 to rows 0-7 at offset 0 of the buffer; .pass2_end reloads
;      them from there.
;   3. Load rows 8-15 of the cq+64 half into m14-m21 and tail-jump to the 8bpc
;      32x32 .main_oddhalf, whose ret returns to this routine's caller.
; Expects r5 = o_base_8bpc (set by the caller before the call).
3669.pass2_start:
3670    mova                 m4, [cq+64+128* 0]
3671    mova                 m5, [cq+64+128* 1]
3672    mova                 m6, [cq+64+128* 2]
3673    mova                 m7, [cq+64+128* 3]
3674    mova                m18, [cq+64+128* 4]
3675    mova                m19, [cq+64+128* 5]
3676    mova                m20, [cq+64+128* 6]
3677    mova                m21, [cq+64+128* 7]
3678    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
3679    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
3680    mova         [cq+128*0], m14
3681    mova         [cq+128*1], m15
3682    mova         [cq+128*2], m16
3683    mova         [cq+128*3], m17
3684    mova         [cq+128*4], m18
3685    mova         [cq+128*5], m19
3686    mova         [cq+128*6], m20
3687    mova         [cq+128*7], m21
3688    mova                m14, [cq+64+128* 8]
3689    mova                m15, [cq+64+128* 9]
3690    mova                m16, [cq+64+128*10]
3691    mova                m17, [cq+64+128*11]
3692    mova                m18, [cq+64+128*12]
3693    mova                m19, [cq+64+128*13]
3694    mova                m20, [cq+64+128*14]
3695    mova                m21, [cq+64+128*15]
3696    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
; .fast: reached when eob < 136. Delegates to .fast2 when eob < 36; otherwise
; it runs .pass1_fast once on the unshifted cq, then the reduced second pass.
; NOTE(review): r6 (the .zero_loop counter) is not set on this path here;
; presumably .pass1_fast sets it -- confirm, since .pass1_fast is outside
; this view.
3697.fast: ; topleft 16x16 nonzero
3698    cmp                eobd, 36
3699    jl .fast2
3700    call .pass1_fast
3701    lea                  r5, [o_base_8bpc] ; the 8bpc routines need their own table base
3702    call .pass2_fast_start
3703    jmp .end
; .pass2_fast_start: second-pass start for the .fast path.
; Calls the 8bpc 32x16 .main_oddhalf_fast and spills m14-m21 to rows 0-7 at
; offset 0 of the coefficient buffer (.pass2_end reloads them from there).
; It then tail-jumps to the 8bpc 32x32 .main_oddhalf_fast, whose ret returns
; to this routine's caller.
3704.pass2_fast_start:
3705    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
3706    mova         [cq+128*0], m14
3707    mova         [cq+128*1], m15
3708    mova         [cq+128*2], m16
3709    mova         [cq+128*3], m17
3710    mova         [cq+128*4], m18
3711    mova         [cq+128*5], m19
3712    mova         [cq+128*6], m20
3713    mova         [cq+128*7], m21
3714    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
; .fast2: smallest non-DC path (eob < 36). Only the low 256 bits of rows 0-7
; are read. Unlike the 32x16 .fast path, no 2896 scaling is applied to the
; inputs. This block also contains the common epilogue (.end / .end2 /
; .zero_loop) shared by all non-DC paths.
3715.fast2: ; topleft 8x8 nonzero
3716    movshdup             m7, [o(permB)]
3717    mova                ym0, [cq+128*0]
3718    mova                ym1, [cq+128*4]
3719    mova                ym4, [cq+128*2]
3720    mova                ym5, [cq+128*6]
3721    mova               ym16, [cq+128*1]
3722    mova                ym2, [cq+128*5]
3723    mova                ym3, [cq+128*3]
3724    mova               ym17, [cq+128*7]
; r6 = 16*4: .zero_loop will clear the first 64 bytes of rows 0-7.
3725    mov                 r6d, 16*4
3726    vpermq               m0, m7, m0 ;  0  0
3727    vpermq               m1, m7, m1 ;  4  4
3728    vpermt2q             m4, m7, m5 ;  2  6
3729    vpermt2q            m16, m7, m2 ;  1  5
3730    vpermt2q            m17, m7, m3 ;  7  3
3731    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
3732    call m(idct_16x16_internal_10bpc).main_end
3733    call .pass2_fast2_start
; .end:  entry for paths that have not yet zeroed m12.
; .end2: entry for the full path, which zeroed m12 for .right_zero_loop.
3734.end:
3735    pxor                m12, m12
3736.end2:
3737    call .pass2_end
; Clear the first 64 bytes of each 128-byte row, four rows per iteration,
; from row r6/16 + 3 down to row 0.
; NOTE(review): relies on m12 still being zero after .pass2_end -- confirm
; that the write routines it calls preserve m12.
3738.zero_loop:
3739    mova    [cq+r6*8+128*3], m12
3740    mova    [cq+r6*8+128*2], m12
3741    mova    [cq+r6*8+128*1], m12
3742    mova    [cq+r6*8+128*0], m12
3743    sub                 r6d, 16*4
3744    jge .zero_loop
3745    WIN64_RESTORE_XMM
3746    vzeroupper
3747    ret
; .pass2_fast2_start: second-pass start for the .fast2 path.
; Transposes the first-pass output, regroups it by qword into the registers
; the 8bpc routines expect (the trailing numbers are the authors' row labels)
; and calls the 8bpc 32x16 .main_oddhalf_fast2.
; m14-m21 are spilled to rows 0-7 at offset 0 of the coefficient buffer
; (.pass2_end reloads them from there), then the routine tail-jumps to the
; 8bpc 32x32 .main_oddhalf_fast2, whose ret returns to this routine's caller.
3748.pass2_fast2_start:
3749    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
3750    lea                  r5, [o_base_8bpc]
3751    punpckhqdq          m22, m0, m2 ; 1
3752    punpcklqdq           m0, m2     ; 0
3753    punpcklqdq           m1, m5, m7 ; 4
3754    punpckhqdq          m24, m5, m7 ; 5
3755    punpcklqdq          m14, m3, m4 ; 2
3756    punpckhqdq          m23, m3, m4 ; 3
3757    punpcklqdq          m15, m6, m8 ; 6
3758    punpckhqdq          m25, m6, m8 ; 7
; NOTE(review): m13 is copied into m10 before the 8bpc call; presumably that
; routine expects a constant in m10 -- confirm against the 8bpc source.
3759    mova                m10, m13
3760    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
3761    mova         [cq+128*0], m14
3762    mova         [cq+128*1], m15
3763    mova         [cq+128*2], m16
3764    mova         [cq+128*3], m17
3765    mova         [cq+128*4], m18
3766    mova         [cq+128*5], m19
3767    mova         [cq+128*6], m20
3768    mova         [cq+128*7], m21
3769    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
; .pass2_end: final butterflies of the second pass and output of all 32 rows.
; Inputs:  m0-m7 and m22-m29 in registers, m14-m21 in registers, plus eight
;          vectors previously spilled to rows 0-7 at offset 0 of the buffer.
; Stage 1: m0-m7 +/- m29..m22 -> out0-7 (m0-m7) and out31-24 (m9, m29..m23).
;          out0-7 are handed to write_32x8_start.
; Stage 2: spilled vectors +/- m21..m14 -> out8-15 (m0-m7) and out23-16
;          (m22..m15). out8-15 are handed to write_32x8.
; Stage 3: out16-31 are multiplied by m11 (pmulhrsw), four rows at a time, and
;          handed to write_32x4. The last call is a tail jump.
; All adds and subtracts are saturating 16-bit.
; NOTE(review): out0-15 are passed on without a visible pmulhrsw by m11;
; presumably write_32x8_start / write_32x8 scale them and set m11 -- confirm.
3770.pass2_end:
3771    psubsw               m9, m0, m29 ; out31
3772    paddsw               m0, m29     ; out0
3773    psubsw              m29, m1, m28 ; out30
3774    paddsw               m1, m28     ; out1
3775    psubsw              m28, m2, m27 ; out29
3776    paddsw               m2, m27     ; out2
3777    psubsw              m27, m3, m26 ; out28
3778    paddsw               m3, m26     ; out3
3779    psubsw              m26, m4, m25 ; out27
3780    paddsw               m4, m25     ; out4
3781    psubsw              m25, m5, m24 ; out26
3782    paddsw               m5, m24     ; out5
3783    psubsw              m24, m6, m23 ; out25
3784    paddsw               m6, m23     ; out6
3785    psubsw              m23, m7, m22 ; out24
3786    paddsw               m7, m22     ; out7
3787    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
; Reload the eight vectors spilled by the .pass2_*start routine.
3788    mova                 m0, [cq+128*0]
3789    mova                 m1, [cq+128*1]
3790    mova                 m2, [cq+128*2]
3791    mova                 m3, [cq+128*3]
3792    mova                 m4, [cq+128*4]
3793    mova                 m5, [cq+128*5]
3794    mova                 m6, [cq+128*6]
3795    mova                 m7, [cq+128*7]
3796    psubsw              m22, m0, m21 ; out23
3797    paddsw               m0, m21     ; out8
3798    psubsw              m21, m1, m20 ; out22
3799    paddsw               m1, m20     ; out9
3800    psubsw              m20, m2, m19 ; out21
3801    paddsw               m2, m19     ; out10
3802    psubsw              m19, m3, m18 ; out20
3803    paddsw               m3, m18     ; out11
3804    psubsw              m18, m4, m17 ; out19
3805    paddsw               m4, m17     ; out12
3806    psubsw              m17, m5, m16 ; out18
3807    paddsw               m5, m16     ; out13
3808    psubsw              m16, m6, m15 ; out17
3809    paddsw               m6, m15     ; out14
3810    psubsw              m15, m7, m14 ; out16
3811    paddsw               m7, m14     ; out15
3812    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
; out16-19
3813    pmulhrsw             m0, m11, m15
3814    pmulhrsw             m1, m11, m16
3815    pmulhrsw             m2, m11, m17
3816    pmulhrsw             m3, m11, m18
3817    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
; out20-23
3818    pmulhrsw             m0, m11, m19
3819    pmulhrsw             m1, m11, m20
3820    pmulhrsw             m2, m11, m21
3821    pmulhrsw             m3, m11, m22
3822    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
; out24-27
3823    pmulhrsw             m0, m11, m23
3824    pmulhrsw             m1, m11, m24
3825    pmulhrsw             m2, m11, m25
3826    pmulhrsw             m3, m11, m26
3827    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
; out28-31
3828    pmulhrsw             m0, m11, m27
3829    pmulhrsw             m1, m11, m28
3830    pmulhrsw             m2, m11, m29
3831    pmulhrsw             m3, m11, m9
3832    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3833.dconly:
3834    imul                r6d, [cq], 181
3835    mov                [cq], eobd
3836    or                  r3d, 32
3837    add                 r6d, 640
3838    sar                 r6d, 10
3839    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
3840.pass1_fast:
3841    mova                 m0, [cq+128* 0]
3842    mova                 m1, [cq+128* 4]
3843    mova                 m2, [cq+128* 8]
3844    mova                 m3, [cq+128*12]
3845    mov                 r6d, 16*12
3846    call m(idct_8x16_internal_10bpc).main_fast
3847    mova                m16, [cq+128* 2]
3848    mova                m17, [cq+128* 6]
3849    mova                m18, [cq+128*10]
3850    mova                m19, [cq+128*14]
3851    call m(idct_16x16_internal_10bpc).main_fast
3852    call .pass1_load_spill
3853    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
3854    jmp .pass1_end
3855.pass1:
3856    mova                 m0, [cq+128* 0]
3857    mova                 m1, [cq+128* 4]
3858    mova                 m2, [cq+128* 8]
3859    mova                 m3, [cq+128*12]
3860    mova                 m4, [cq+128*16]
3861    mova                 m5, [cq+128*20]
3862    mova                 m6, [cq+128*24]
3863    mova                 m7, [cq+128*28]
3864    call m(idct_8x16_internal_10bpc).main
3865    mova                m16, [cq+128* 2]
3866    mova                m17, [cq+128* 6]
3867    mova                m18, [cq+128*10]
3868    mova                m19, [cq+128*14]
3869    mova                m20, [cq+128*18]
3870    mova                m21, [cq+128*22]
3871    mova                m22, [cq+128*26]
3872    mova                m23, [cq+128*30]
3873    call m(idct_16x16_internal_10bpc).main
3874    call .pass1_load_spill
3875    mova                m16, [cq+128*17]
3876    mova                m17, [cq+128*19]
3877    mova                m18, [cq+128*21]
3878    mova                m19, [cq+128*23]
3879    mova                m20, [cq+128*25]
3880    mova                m21, [cq+128*27]
3881    mova                m22, [cq+128*29]
3882    mova                m23, [cq+128*31]
3883    call m(inv_txfm_add_dct_dct_32x16_10bpc).main
3884.pass1_end:
3885    vpbroadcastd        m11, [o(pd_2)]
3886    lea                  r4, [cq+128*8]
3887    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
3888    punpckhqdq          m22, m0, m20  ;  1
3889    punpcklqdq           m0, m20      ;  0
3890    punpckhqdq          m24, m2, m1   ;  5
3891    punpcklqdq           m1, m2, m1   ;  4
3892    punpcklqdq           m2, m14, m18 ;  8
3893    punpckhqdq          m26, m14, m18 ;  9
3894    punpcklqdq          m14, m15, m4  ;  2
3895    punpckhqdq          m23, m15, m4  ;  3
3896    punpckhqdq          m25, m3, m21  ;  7
3897    punpcklqdq          m15, m3, m21  ;  6
3898    punpckhqdq          m28, m6, m17  ; 13
3899    punpcklqdq           m3, m6, m17  ; 12
3900    punpckhqdq          m27, m5, m16  ; 11
3901    punpcklqdq          m16, m5, m16  ; 10
3902    punpckhqdq          m29, m7, m8   ; 15
3903    punpcklqdq          m17, m7, m8   ; 14
3904    ret
3905.pass1_load_spill:
3906    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
3907    mova        [cq+128* 0], m0
3908    mova                 m0, [cq+128* 1]
3909    mova        [cq+128* 1], m1
3910    mova        [cq+128* 2], m2
3911    mova                 m1, [cq+128* 3]
3912    mova                 m2, [cq+128* 5]
3913    mova        [cq+128* 3], m3
3914    mova        [cq+128* 4], m4
3915    mova                 m3, [cq+128* 7]
3916    mova                 m4, [cq+128* 9]
3917    mova        [cq+128* 5], m5
3918    mova        [cq+128* 6], m6
3919    mova        [cq+128* 7], m7
3920    mova                 m5, [cq+128*11]
3921    mova                 m6, [cq+128*13]
3922    mova                 m7, [cq+128*15]
3923    mova        [cq+128* 8], m23
3924    mova        [cq+128* 9], m22
3925    mova        [cq+128*10], m21
3926    mova        [cq+128*11], m20
3927    mova        [cq+128*12], m19
3928    mova        [cq+128*13], m18
3929    mova        [cq+128*14], m17
3930    mova        [cq+128*15], m16
3931    ret
3932
; inv_txfm_add_identity_identity_32x32_10bpc(dst, stride, c, eob)
;
; Driver for the 32x32 identity/identity case. All real work is done by
; repeated runs of .main, each of which calls .main_internal twice and then
; tail-jumps into inv_txfm_add_identity_identity_16x32_10bpc.main2 (not in
; this part of the file). The number of .main runs depends on eob:
;   eob <  136 : one run
;   eob <  543 : three runs
;   otherwise  : four runs
; The last run is always reached by jump/fall-through, so whatever return
; .main2 performs is what leaves this function.
;
; Constants set up here:
;   m13 = pw_8192 (pmulhrsw multiplier used in .main)
;   m14 = 0       (stored over coefficients after they are loaded)
;   m15 = pixel_10bpc_max, r6 = stride*9
;         (not used in the visible code; presumably consumed by .main2)
3933cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
3934%undef cmp
3935    vpbroadcastd        m13, [pw_8192]
3936    vpbroadcastd        m15, [pixel_10bpc_max]
3937    pxor                m14, m14
3938    lea                  r6, [strideq*9]
3939    cmp                eobd, 136
3940    jl .main
; r4 keeps the original dst so the later runs can restart from dst+32 bytes.
3941    mov                  r4, dstq
3942    call .main
; NOTE(review): the cq/dstq adjustments below are relative to wherever the
; previous .main run (including the out-of-view .main2) left those
; registers -- confirm against .main2 before changing any offset.
3943    add                  cq, 64-128*4
3944    lea                dstq, [dstq+strideq*8]
3945    call .main
3946    add                  cq, 128*12-64
3947    lea                dstq, [r4+32]
3948    cmp                eobd, 543
3949    jl .main
3950    call .main
3951    add                  cq, 64-128*4
3952    lea                dstq, [dstq+strideq*8]
; One run: two .main_internal calls 128*4 bytes apart in the coefficient
; buffer. The first call's results (m2/m4/m6/m8) are scaled by m13 and moved
; into the odd registers m1/m3/m5/m7 so the second call can reuse the even
; registers for its own output.
3953.main:
3954    call .main_internal
3955    add                  cq, 128*4
3956    pmulhrsw             m1, m13, m2
3957    pmulhrsw             m3, m13, m4
3958    pmulhrsw             m5, m13, m6
3959    pmulhrsw             m7, m13, m8
3960    call .main_internal
3961    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
; Loads the 64-byte vectors at cq+128*{0..3} and cq+128*{8..11}, packs each
; pair of dword vectors to words with signed saturation, fixes up the
; qword order, overwrites the loaded coefficients with zero (m14), and
; interleaves words then dwords.
; Out: m2, m4, m6, m8 (see the trailing comments). Clobbers m0.
3962.main_internal:
3963    mova                 m8, [cq+128* 0]
3964    packssdw             m8, [cq+128* 8]
3965    mova                 m6, [cq+128* 1]
3966    packssdw             m6, [cq+128* 9]
3967    mova                 m0, [cq+128* 2]
3968    packssdw             m0, [cq+128*10]
3969    mova                 m2, [cq+128* 3]
3970    packssdw             m2, [cq+128*11]
; packssdw works per 128-bit lane; q3120 swaps the two middle qwords of
; every 256-bit half.
3971    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
3972    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
3973    punpcklwd            m4, m8, m6
3974    punpckhwd            m8, m6
3975    punpcklwd            m6, m0, m2
3976    punpckhwd            m0, m2
3977    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
3978    punpckldq            m2, m4, m6 ; 0 1
3979    punpckhdq            m4, m6     ; 2 3
3980    punpckldq            m6, m8, m0 ; 4 5
3981    punpckhdq            m8, m0     ; 6 7
3982    ret
3983
; inv_txfm_add_dct_dct_16x64_10bpc(dst, stride, c, eob)
;
; Dispatches on eob:
;   eob == 0   : .dconly (taken before PROLOGUE, so with no vector
;                registers or stack declared by the cglobal line)
;   eob <  36  : .fast   (ymm-sized loads from the first 8 coefficient rows)
;   eob <  151 : one .pass1 call, then the reduced second pass inline below
;   otherwise  : .full   (two .pass1 calls, the second at cq+64)
; Every non-DC path ends in .pass2_end, which writes the output through
; sixteen calls to idct_16x8_internal_10bpc.write_16x4.
;
; The second pass reuses the 8bpc helper routines, so r5 is switched from
; o_base to o_base_8bpc before they are called.
; Stack: PROLOGUE reserves 8*mmsize bytes; m14-m21 are spilled there after
; the 16x32 odd-half helper and reloaded in .pass2_end.
3984cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
3985    lea                  r5, [o_base]
3986    test               eobd, eobd
3987    jz .dconly
3988
3989    PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
3990%undef cmp
; Constants in m12-m15 for the 10bpc first-pass helpers (their exact use is
; inside those helpers, outside this block).
3991    vpbroadcastd        m12, [o(pd_2896)]
3992    vpbroadcastd        m13, [o(pd_2048)]
3993    vpbroadcastd        m14, [o(clip_18b_min)]
3994    vpbroadcastd        m15, [o(clip_18b_max)]
3995    cmp                eobd, 36
3996    jl .fast
3997    call .pass1
3998    cmp                eobd, 151
3999    jge .full
4000    lea                  r5, [o_base_8bpc]
4001
; 36 <= eob < 151: only one first pass was run. Its results in m0-m7 are
; expanded by duplicating every word (punpck{l,h}wd x, x) into the registers
; the 8bpc second-pass helpers read. The one exception is m9, which gets the
; low words of m0 interleaved with zero (the value lands in the upper half
; of each dword).
4002    punpckhwd           m22, m0, m0
4003    punpckhwd           m23, m1, m1
4004    punpckhwd           m24, m2, m2
4005    punpckhwd           m25, m3, m3
4006    punpckhwd           m26, m4, m4
4007    punpckhwd           m27, m5, m5
4008    punpckhwd           m28, m6, m6
4009    punpckhwd           m29, m7, m7
4010    punpcklwd           m21, m1, m1
4011    punpcklwd           m14, m3, m3
4012    punpcklwd           m18, m5, m5
4013    punpcklwd           m15, m7, m7
4014    pxor                 m9, m9
4015    punpcklwd            m9, m9, m0
4016    punpcklwd            m8, m2, m2
4017    punpcklwd            m7, m4, m4
4018    punpcklwd            m1, m6, m6
4019    call m(idct_16x16_internal_8bpc).main_fast2
4020    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
; Spill m14-m21 for .pass2_end before the next helper is called.
4021    mova     [rsp+mmsize*0], m14
4022    mova     [rsp+mmsize*1], m15
4023    mova     [rsp+mmsize*2], m16
4024    mova     [rsp+mmsize*3], m17
4025    mova     [rsp+mmsize*4], m18
4026    mova     [rsp+mmsize*5], m19
4027    mova     [rsp+mmsize*6], m20
4028    mova     [rsp+mmsize*7], m21
4029    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
4030
; Clear the coefficient buffer: 4 iterations, each storing 64 zero bytes at
; cq + r3*8 + 128*{0..3}. r3 (the eob register) is free to reuse here because
; eob is not read again on this path.
4031    pxor                m12, m12
4032    mov                 r3d, 64*3
4033.zero_loop:
4034    REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
4035    sub                 r3d, 64
4036    jge .zero_loop
4037
4038    jmp .pass2_end
; eob >= 151: park the first .pass1 results (m0-m7) in the already-consumed
; coefficient rows cq+128*{0..7}, run .pass1 again on the data at cq+64, then
; reload the first results into m22-m29.
4039.full:
4040    mova         [cq+128*0], m0
4041    mova         [cq+128*1], m1
4042    mova         [cq+128*2], m2
4043    mova         [cq+128*3], m3
4044    mova         [cq+128*4], m4
4045    mova         [cq+128*5], m5
4046    mova         [cq+128*6], m6
4047    mova         [cq+128*7], m7
4048    add                  cq, 64
4049    call .pass1
4050    sub                  cq, 64
4051    mova                m22, [cq+128*0] ;  0  1
4052    mova                m23, [cq+128*1] ;  2  3
4053    mova                m24, [cq+128*2] ;  4  5
4054    mova                m25, [cq+128*3] ;  6  7
4055    mova                m26, [cq+128*4] ;  8  9
4056    mova                m27, [cq+128*5] ; 10 11
4057    mova                m28, [cq+128*6] ; 12 13
4058    mova                m29, [cq+128*7] ; 14 15
; Keep a copy of the second .pass1 results: m0-m7 are overwritten below, and
; the high words are needed again for main_oddhalf.
4059    mova         [cq+64* 8], m0
4060    mova         [cq+64* 9], m1
4061    mova         [cq+64*10], m2
4062    mova         [cq+64*11], m3
4063    mova         [cq+64*12], m4
4064    mova         [cq+64*13], m5
4065    mova         [cq+64*14], m6
4066    mova         [cq+64*15], m7
4067    lea                  r5, [o_base_8bpc]
4068
4069    punpcklwd           m20, m1, m1
4070    punpcklwd           m16, m3, m3
4071    punpcklwd           m19, m5, m5
4072    punpcklwd           m17, m7, m7
4073    punpcklwd            m8, m24, m24 ;  4
4074    punpcklwd            m5, m2, m2   ; 20
4075    punpcklwd            m1, m28, m28 ; 12
4076    punpcklwd            m7, m26, m26 ;  8
4077    punpcklwd            m3, m4, m4   ; 24
4078    punpcklwd            m4, m6, m6   ; 28
4079    pxor                 m9, m9
4080    punpcklwd            m6, m9, m0   ; __ 16
; m0 could not be written directly above because it was still a source
; operand; the value built in m4 is moved into it now.
4081    mova                 m0, m4
4082    punpcklwd            m9, m9, m22  ; __  0
4083    call m(idct_16x16_internal_8bpc).main_fast
4084    punpcklwd           m21, m23, m23 ;  2
4085    punpcklwd           m15, m29, m29 ; 14
4086    punpcklwd           m18, m27, m27 ; 10
4087    punpcklwd           m14, m25, m25 ;  6
4088    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
; Spill m14-m21 for .pass2_end.
4089    mova     [rsp+mmsize*0], m14
4090    mova     [rsp+mmsize*1], m15
4091    mova     [rsp+mmsize*2], m16
4092    mova     [rsp+mmsize*3], m17
4093    mova     [rsp+mmsize*4], m18
4094    mova     [rsp+mmsize*5], m19
4095    mova     [rsp+mmsize*6], m20
4096    mova     [rsp+mmsize*7], m21
; Reload the saved second-pass-1 results, then duplicate the high words of
; all sixteen inputs in place for main_oddhalf.
4097    mova                m21, [cq+64*15]
4098    mova                m14, [cq+64* 8]
4099    mova                m17, [cq+64*11]
4100    mova                m18, [cq+64*12]
4101    mova                m19, [cq+64*13]
4102    mova                m16, [cq+64*10]
4103    mova                m15, [cq+64* 9]
4104    mova                m20, [cq+64*14]
4105    REPX   {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
4106                             m24, m19, m16, m27, m28, m15, m20, m23
4107    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
4108
; Clear the coefficient buffer: 8 iterations, each storing 256 contiguous
; zero bytes at cq + r3*8 (2048 bytes in total).
4109    pxor                m12, m12
4110    mov                 r3d, 32*7
4111.full_zero_loop:
4112    REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
4113    sub                 r3d, 32
4114    jge .full_zero_loop
4115
4116    jmp .pass2_end
; eob < 36: only the first 32 bytes of coefficient rows 0-7 are loaded.
; Row pairs are merged into single zmm registers with vpermt2q (index taken
; from permB via movshdup), the 10bpc first pass is run inline, and the
; result is packed to words and permuted into the second-pass layout.
4117.fast:
4118    mova                ym0, [cq+128*0]
4119    mova                ym2, [cq+128*4]
4120    movshdup             m8, [o(permB)]
4121    mova                ym1, [cq+128*2]
4122    mova                ym3, [cq+128*6]
4123    mova                ym4, [cq+128*1]
4124    mova                ym5, [cq+128*3]
4125    mova                ym6, [cq+128*5]
4126    mova                ym7, [cq+128*7]
4127    vpermt2q             m0, m8, m2 ; 0 4
4128    vpermt2q             m1, m8, m3 ; 2 6
4129    vpermt2q             m4, m8, m5 ; 1 3
4130    vpermt2q             m7, m8, m6 ; 7 5
4131    call m(idct_8x8_internal_10bpc).main_fast
4132    call m(idct_16x8_internal_10bpc).main_fast
4133    vpbroadcastd        m11, [o(pd_2)]
4134    call m(idct_8x16_internal_10bpc).main_end2
4135    mova                 m8, [o(idct8x32p)]
4136    packssdw             m0, m4
4137    packssdw             m1, m5
4138    packssdw             m2, m6
4139    packssdw             m3, m7
4140    mova                 m6, [dup16_perm]
4141    vpermb               m0, m8, m0
4142    vpermb               m2, m8, m2
; Rotating every dword of the byte-permute index by 16 bits gives the
; index used for m1/m3.
4143    vprold               m8, 16
4144    vpermb               m1, m8, m1
4145    vpermb               m3, m8, m3
4146    punpckldq            m4, m0, m2
4147    punpckhdq            m0, m2
4148    punpckldq            m2, m1, m3
4149    punpckhdq            m1, m3
4150    punpckldq           m21, m4, m2
4151    punpckhdq           m14, m4, m2
4152    punpckldq           m18, m0, m1
4153    punpckhdq           m15, m0, m1
; m7 = dup16_perm with each dword OR-ed with the broadcast pb_32 constant;
; m6 and m7 are the two byte-permute indices used below.
4154    vpord                m7, m6, [o(pb_32)] {1to16}
4155    vpermb              m22, m7, m21 ; 1
4156    pmovzxwd             m9, ym21    ; 0
4157    vpermb               m8, m6, m18 ; 4
4158    vpermb              m24, m7, m18 ; 5
4159    vpermb              m21, m6, m14 ; 2
4160    vpermb              m23, m7, m14 ; 3
4161    vpermb              m14, m6, m15 ; 6
4162    vpermb              m25, m7, m15 ; 7
4163    lea                  r5, [o_base_8bpc]
; Together with the pmovzxwd above this moves each word into the upper
; half of its dword, the same layout as "punpcklwd m9, zero, src".
4164    pslld                m9, 16
4165
; Clear m7 and the remaining registers listed here before the 8bpc helpers
; run (m18/m15 have already been consumed by the vpermb's above).
4166    pxor                 m7, m7
4167    REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
4168
4169    call m(idct_16x16_internal_8bpc).main_fast2
4170    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
; Spill m14-m21 for .pass2_end.
4171    mova     [rsp+mmsize*0], m14
4172    mova     [rsp+mmsize*1], m15
4173    mova     [rsp+mmsize*2], m16
4174    mova     [rsp+mmsize*3], m17
4175    mova     [rsp+mmsize*4], m18
4176    mova     [rsp+mmsize*5], m19
4177    mova     [rsp+mmsize*6], m20
4178    mova     [rsp+mmsize*7], m21
4179
4180    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
4181
; Clear exactly what was loaded: the first 32 bytes of rows 0-7.
4182    pxor                m12, m12
4183    REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
; Common output stage. On entry m0-m7 and m14-m29 hold second-pass values
; and [rsp+mmsize*0..7] holds the spilled m14-m21 from the 16x32 odd half.
; Each write_16x4 call takes two vectors in m8/m9, produced here by
; vpermq with indices m30 (from permC) and m31 (m30 shifted right by 8 bits
; per qword). m11 = pw_2048, m13 = pixel_10bpc_max and r6 = stride*3 are
; set up for write_16x4 (their use is inside that routine, not shown here).
4184.pass2_end:
4185    movshdup            m30, [permC]
4186    vpbroadcastd        m11, [pw_2048]
4187    vpbroadcastd        m13, [pixel_10bpc_max]
4188    lea                  r6, [strideq*3]
4189    psrlq               m31, m30, 8
4190    vpermq               m8, m30, m0
4191    vpermq               m9, m31, m1
4192    call m(idct_16x8_internal_10bpc).write_16x4
4193    vpermq               m8, m30, m2
4194    vpermq               m9, m31, m3
4195    call m(idct_16x8_internal_10bpc).write_16x4
4196    vpermq               m8, m30, m4
4197    vpermq               m9, m31, m5
4198    call m(idct_16x8_internal_10bpc).write_16x4
4199    vpermq               m8, m30, m6
4200    vpermq               m9, m31, m7
4201    call m(idct_16x8_internal_10bpc).write_16x4
4202
; Reload the spilled values and combine them with the current m14-m21:
; sums go to m0-m7, differences back to m21..m14 (saturating word math).
4203    mova                 m1, [rsp+mmsize*0]
4204    mova                 m2, [rsp+mmsize*1]
4205    mova                 m3, [rsp+mmsize*2]
4206    mova                 m4, [rsp+mmsize*3]
4207    mova                 m5, [rsp+mmsize*4]
4208    mova                 m6, [rsp+mmsize*5]
4209    mova                 m7, [rsp+mmsize*6]
4210    mova                 m8, [rsp+mmsize*7]
4211
4212    paddsw               m0, m1, m21
4213    psubsw              m21, m1, m21
4214    paddsw               m1, m2, m20
4215    psubsw              m20, m2, m20
4216    paddsw               m2, m3, m19
4217    psubsw              m19, m3, m19
4218    paddsw               m3, m4, m18
4219    psubsw              m18, m4, m18
4220    paddsw               m4, m5, m17
4221    psubsw              m17, m5, m17
4222    paddsw               m5, m6, m16
4223    psubsw              m16, m6, m16
4224    paddsw               m6, m7, m15
4225    psubsw              m15, m7, m15
4226    paddsw               m7, m8, m14
4227    psubsw              m14, m8, m14
4228
; Write the sums (m0-m7) ...
4229    vpermq               m8, m30, m0
4230    vpermq               m9, m31, m1
4231    call m(idct_16x8_internal_10bpc).write_16x4
4232    vpermq               m8, m30, m2
4233    vpermq               m9, m31, m3
4234    call m(idct_16x8_internal_10bpc).write_16x4
4235    vpermq               m8, m30, m4
4236    vpermq               m9, m31, m5
4237    call m(idct_16x8_internal_10bpc).write_16x4
4238    vpermq               m8, m30, m6
4239    vpermq               m9, m31, m7
4240    call m(idct_16x8_internal_10bpc).write_16x4
4241
; ... then the differences (m14-m21) ...
4242    vpermq               m8, m30, m14
4243    vpermq               m9, m31, m15
4244    call m(idct_16x8_internal_10bpc).write_16x4
4245    vpermq               m8, m30, m16
4246    vpermq               m9, m31, m17
4247    call m(idct_16x8_internal_10bpc).write_16x4
4248    vpermq               m8, m30, m18
4249    vpermq               m9, m31, m19
4250    call m(idct_16x8_internal_10bpc).write_16x4
4251    vpermq               m8, m30, m20
4252    vpermq               m9, m31, m21
4253    call m(idct_16x8_internal_10bpc).write_16x4
4254
; ... and finally m22-m29, which are passed through unchanged.
4255    vpermq               m8, m30, m22
4256    vpermq               m9, m31, m23
4257    call m(idct_16x8_internal_10bpc).write_16x4
4258    vpermq               m8, m30, m24
4259    vpermq               m9, m31, m25
4260    call m(idct_16x8_internal_10bpc).write_16x4
4261    vpermq               m8, m30, m26
4262    vpermq               m9, m31, m27
4263    call m(idct_16x8_internal_10bpc).write_16x4
4264    vpermq               m8, m30, m28
4265    vpermq               m9, m31, m29
4266    call m(idct_16x8_internal_10bpc).write_16x4
4267    RET
; First pass over sixteen 64-byte coefficient vectors at cq+128*{0..15}:
; even-numbered ones go to m0-m7 for idct_8x16 .main, odd-numbered ones to
; m16-m23 for idct_16x16 .main, followed by that routine's .main_end and a
; tail-jump into .main_end3, whose ret returns to this routine's caller.
; The callers above read the results from m0-m7.
4268.pass1:
4269    mova                 m0, [cq+128* 0]
4270    mova                 m1, [cq+128* 2]
4271    mova                 m2, [cq+128* 4]
4272    mova                 m3, [cq+128* 6]
4273    mova                 m4, [cq+128* 8]
4274    mova                 m5, [cq+128*10]
4275    mova                 m6, [cq+128*12]
4276    mova                 m7, [cq+128*14]
4277    call m(idct_8x16_internal_10bpc).main
4278    mova                m16, [cq+128* 1]
4279    mova                m17, [cq+128* 3]
4280    mova                m18, [cq+128* 5]
4281    mova                m19, [cq+128* 7]
4282    mova                m20, [cq+128* 9]
4283    mova                m21, [cq+128*11]
4284    mova                m22, [cq+128*13]
4285    mova                m23, [cq+128*15]
4286    call m(idct_16x16_internal_10bpc).main
4287    call m(idct_16x16_internal_10bpc).main_end
4288    jmp m(idct_16x16_internal_10bpc).main_end3
; DC-only path (eob == 0). r6d = (c[0]*181 + 640) >> 10. Storing eobd, which
; is zero on this path, clears c[0]. r3d (the eob register) is then set to
; 64 -- presumably the row count expected by the shared .dconly2 tail;
; confirm against inv_txfm_add_dct_dct_16x8_10bpc.
4289.dconly:
4290    imul                r6d, [cq], 181
4291    mov                [cq], eobd
4292    or                  r3d, 64
4293    add                 r6d, 640
4294    sar                 r6d, 10
4295    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
4296
4297cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
4298    lea                  r5, [o_base]
4299    test               eobd, eobd
4300    jz .dconly
4301    PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob
4302%undef cmp
4303    vpbroadcastd        m12, [o(pd_2896)]
4304    vpbroadcastd        m13, [o(pd_2048)]
4305    vpbroadcastd        m14, [o(clip_18b_min)]
4306    vpbroadcastd        m15, [o(clip_18b_max)]
4307    cmp                eobd, 136
4308    jl .fast
4309    add                  cq, 64
4310    cmp                eobd, 543
4311    jge .full
4312    call .pass1_fast ; bottomright 16x16 zero
4313    jmp .lefthalf
4314.full:
4315    call .pass1
4316    mov                 r3d, 16*28
4317.lefthalf:
4318    mova        [cq+128* 0], m27
4319    mova        [cq+128* 1], m14
4320    mova        [cq+128* 2], m28
4321    mova        [cq+128* 3], m15
4322    mova        [cq+128* 4], m22
4323    mova        [cq+128* 5], m23
4324    mova        [cq+128* 6], m24
4325    mova        [cq+128* 7], m25
4326    mova        [cq+128* 8], m0
4327    mova        [cq+128* 9], m26
4328    mova        [cq+128*10], m20
4329    mova        [cq+128*11], m21
4330    mova        [cq+128*12], m18
4331    mova        [cq+128*13], m16
4332    mova        [cq+128*14], m17
4333    mova        [cq+128*15], m3
4334    sub                  cq, 64
4335    vpbroadcastd        m12, [o(pd_2896)]
4336    vpbroadcastd        m13, [o(pd_2048)]
4337    vpbroadcastd        m14, [o(clip_18b_min)]
4338    vpbroadcastd        m15, [o(clip_18b_max)]
4339    call .pass1
4340    call .pass2_start
4341
4342    pxor                m31, m31
4343.right_zero_loop:
4344    REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3
4345    sub                 r3d, 16*4
4346    jge .right_zero_loop
4347    mov                 r3d, 16*28
4348    jmp .left_zero_loop
4349.pass2_start:
4350    vpbroadcastd        m10, [o(pd_2048)]
4351    lea                  r5, [o_base_8bpc]
4352
4353    lea                  r4, [rsp+gprsize]
4354    mova                 m1, [cq+128*15+64]
4355    mova                 m2, [cq+128* 8+64]
4356    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4357    mova                 m0, m21
4358    mova                 m1, [cq+128*12+64]
4359    mova                 m2, [cq+128*11+64]
4360    mova                 m3, m18
4361    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4362    mova                 m0, m20
4363    mova                 m1, [cq+128*13+64]
4364    mova                 m2, [cq+128*10+64]
4365    mova                 m3, m16
4366    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4367    mova                 m0, m26
4368    mova                 m1, [cq+128*14+64]
4369    mova                 m2, [cq+128* 9+64]
4370    mova                 m3, m17
4371    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4372    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
4373
4374    mova                 m0, m27
4375    mova                 m1, m28
4376    mova                 m2, [cq+128* 0+64]
4377    mova                 m3, [cq+128* 2+64]
4378    mova                m16, [cq+128* 1+64]
4379    mova                m17, [cq+128* 3+64]
4380    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
4381    mova                m26, [cq+128* 4+64]
4382    mova                m27, [cq+128* 5+64]
4383    mova                m28, [cq+128* 6+64]
4384    mova                m29, [cq+128* 7+64]
4385    mova        [rsp+64*32+gprsize], m14
4386    mova        [rsp+64*33+gprsize], m15
4387    mova        [rsp+64*34+gprsize], m16
4388    mova        [rsp+64*35+gprsize], m17
4389    mova        [rsp+64*36+gprsize], m18
4390    mova        [rsp+64*37+gprsize], m19
4391    mova        [rsp+64*38+gprsize], m20
4392    mova        [rsp+64*39+gprsize], m21
4393    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
4394.fast: ; topleft 16x16 nonzero
4395    cmp                eobd, 36
4396    jl .fast2
4397    call .pass1_fast
4398    vpbroadcastd        m10, [o(pd_2048)]
4399    call .pass2_fast_start
4400    jmp .end
4401.fast2: ; topleft 8x8 nonzero
4402    movshdup             m7, [o(permB)]
4403    mova                ym0, [cq+128*0]
4404    mova                ym1, [cq+128*4]
4405    mova                ym4, [cq+128*2]
4406    mova                ym5, [cq+128*6]
4407    mova               ym16, [cq+128*1]
4408    mova                ym2, [cq+128*5]
4409    mova                ym3, [cq+128*3]
4410    mova               ym17, [cq+128*7]
4411    mov                 r3d, 16*4
4412    vpermq               m0, m7, m0 ;  0  0
4413    vpermq               m1, m7, m1 ;  4  4
4414    vpermt2q             m4, m7, m5 ;  2  6
4415    vpermt2q            m16, m7, m2 ;  1  5
4416    vpermt2q            m17, m7, m3 ;  7  3
4417    REPX    {pmulld x, m12}, m0, m1, m4, m16, m17
4418    REPX    {paddd  x, m13}, m0, m1, m4, m16, m17
4419    REPX    {psrad  x, 12 }, m0, m1, m4, m16, m17
4420    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
4421    vpbroadcastd        m11, [o(pd_1)]
4422    call m(idct_16x16_internal_10bpc).main_end2
4423
4424    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
4425    punpcklqdq          m27, m0, m2 ; 0
4426    punpckhqdq           m0, m2     ; 1
4427    punpcklqdq          m22, m3, m4 ; 2
4428    punpckhqdq          m26, m3, m4 ; 3
4429    punpcklqdq          m14, m5, m7 ; 4
4430    punpckhqdq          m20, m5, m7 ; 5
4431    punpcklqdq          m23, m6, m8 ; 6
4432    punpckhqdq          m21, m6, m8 ; 7
4433
4434    mova                m10, m13
4435    call .pass2_fast2_start
4436.end:
4437
4438    pxor                m31, m31
4439
4440.left_zero_loop:
4441    REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3
4442    sub                 r3d, 16*4
4443    jge .left_zero_loop
4444
4445    call .pass2_end
4446    RET
4447.pass2_end:
4448    DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi
4449    vpbroadcastd        m30, [pixel_10bpc_max]
4450    vpbroadcastd        m13, [pw_2048]
4451
4452    mov           stride32q, strideq
4453    shl           stride32q, 5
4454    lea              stkhiq, [rsp+31*mmsize+gprsize]
4455    lea               dst2q, [dstq+stride32q]
4456    lea              stkloq, [rsp+gprsize]
4457    sub               dst2q, strideq    ; dst31
4458
4459    paddsw               m8, m0, m29    ; t0[idct32]
4460    psubsw               m9, m0, m29    ; t31[idct32]
4461    call .end_sumsub_write
4462    paddsw               m8, m1, m28    ; t1[idct32]
4463    psubsw               m9, m1, m28    ; t30[idct32]
4464    call .end_sumsub_write
4465    paddsw               m8, m2, m27    ; t2[idct32]
4466    psubsw               m9, m2, m27    ; t29[idct32]
4467    call .end_sumsub_write
4468    paddsw               m8, m3, m26    ; t3[idct32]
4469    psubsw               m9, m3, m26    ; t28[idct32]
4470    call .end_sumsub_write
4471    paddsw               m8, m4, m25    ; t4[idct32]
4472    psubsw               m9, m4, m25    ; t27[idct32]
4473    call .end_sumsub_write
4474    paddsw               m8, m5, m24    ; t5[idct32]
4475    psubsw               m9, m5, m24    ; t26[idct32]
4476    call .end_sumsub_write
4477    paddsw               m8, m6, m23    ; t6[idct32]
4478    psubsw               m9, m6, m23    ; t25[idct32]
4479    call .end_sumsub_write
4480    paddsw               m8, m7, m22    ; t7[idct32]
4481    psubsw               m9, m7, m22    ; t24[idct32]
4482    call .end_sumsub_write
4483    mova                 m0, [rsp+64*32+gprsize]
4484    mova                 m1, [rsp+64*33+gprsize]
4485    mova                 m2, [rsp+64*34+gprsize]
4486    mova                 m3, [rsp+64*35+gprsize]
4487    mova                 m4, [rsp+64*36+gprsize]
4488    mova                 m5, [rsp+64*37+gprsize]
4489    mova                 m6, [rsp+64*38+gprsize]
4490    mova                 m7, [rsp+64*39+gprsize]
4491    paddsw               m8, m0, m21    ; t8[idct32]
4492    psubsw               m9, m0, m21    ; t23[idct32]
4493    call .end_sumsub_write
4494    paddsw               m8, m1, m20    ; t9[idct32]
4495    psubsw               m9, m1, m20    ; t22[idct32]
4496    call .end_sumsub_write
4497    paddsw               m8, m2, m19    ; t10[idct32]
4498    psubsw               m9, m2, m19    ; t21[idct32]
4499    call .end_sumsub_write
4500    paddsw               m8, m3, m18    ; t11[idct32]
4501    psubsw               m9, m3, m18    ; t20[idct32]
4502    call .end_sumsub_write
4503    paddsw               m8, m4, m17    ; t12[idct32]
4504    psubsw               m9, m4, m17    ; t19[idct32]
4505    call .end_sumsub_write
4506    paddsw               m8, m5, m16    ; t13[idct32]
4507    psubsw               m9, m5, m16    ; t18[idct32]
4508    call .end_sumsub_write
4509    paddsw               m8, m6, m15    ; t14[idct32]
4510    psubsw               m9, m6, m15    ; t17[idct32]
4511    call .end_sumsub_write
4512    paddsw               m8, m7, m14    ; t15[idct32]
4513    psubsw               m9, m7, m14    ; t16[idct32]
4514    ; fall-through
4515.end_sumsub_write:
4516    mova                m10, [stkhiq]   ; t63-n
4517    mova                m12, [stkloq]   ; t32+n
4518    psubsw              m11, m8, m10    ; out63-n
4519    paddsw               m8, m10        ; out0 +n
4520    psubsw              m10, m9, m12    ; out32+n
4521    paddsw               m9, m12        ; out32-n
4522    REPX  {pmulhrsw x, m13}, m11, m8, m10, m9
4523    paddw                m8, [dstq]
4524    paddw                m9, [dst2q]
4525    paddw               m10, [dstq+stride32q]
4526    paddw               m11, [dst2q+stride32q]
4527    REPX  {pminsw   x, m30}, m11, m8, m10, m9
4528    REPX  {pmaxsw   x, m31}, m11, m8, m10, m9
4529    mova  [dstq           ], m8
4530    mova  [dst2q          ], m9
4531    mova  [dstq +stride32q], m10
4532    mova  [dst2q+stride32q], m11
4533    add              stkloq, mmsize
4534    sub              stkhiq, mmsize
4535    add                dstq, strideq
4536    sub               dst2q, strideq
4537    ret
4538.pass2_fast_start:
4539    lea                  r5, [o_base_8bpc]
4540    lea                  r4, [rsp+gprsize]
4541    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
4542    mova                 m0, m21
4543    mova                 m3, m18
4544    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
4545    mova                 m0, m20
4546    mova                 m3, m16
4547    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
4548    mova                 m0, m26
4549    mova                 m3, m17
4550    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
4551    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
4552
4553    mova                 m0, m27
4554    mova                 m1, m28
4555    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
4556    mova        [rsp+64*32+gprsize], m14
4557    mova        [rsp+64*33+gprsize], m15
4558    mova        [rsp+64*34+gprsize], m16
4559    mova        [rsp+64*35+gprsize], m17
4560    mova        [rsp+64*36+gprsize], m18
4561    mova        [rsp+64*37+gprsize], m19
4562    mova        [rsp+64*38+gprsize], m20
4563    mova        [rsp+64*39+gprsize], m21
4564    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
4565.pass2_fast2_start:
    ; Same sequence as .pass2_fast_start but with the "fast2"/"fast3" helper
    ; variants, which are fed a single input register (m0) per call instead
    ; of an m0/m3 pair (and only m0 for the 32x16 odd-half helper).
    ; NOTE(review): what m20-m27 hold on entry is decided by the caller,
    ; which is outside this view.
4566    lea                  r5, [o_base_8bpc]   ; presumably the constant base the 8bpc helpers address through r5
4567    lea                  r4, [rsp+gprsize]   ; presumably stack scratch for the helpers
    ; first call uses m0 exactly as left by the caller
4568    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
4569    mova                 m0, m21
4570    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
4571    mova                 m0, m20
4572    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
4573    mova                 m0, m26
4574    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
4575    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
4576
4577    mova                 m0, m27
4578    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3
    ; spill m14-m21 to stack slots 32-39 (64 bytes each) before the tail call
4579    mova        [rsp+64*32+gprsize], m14
4580    mova        [rsp+64*33+gprsize], m15
4581    mova        [rsp+64*34+gprsize], m16
4582    mova        [rsp+64*35+gprsize], m17
4583    mova        [rsp+64*36+gprsize], m18
4584    mova        [rsp+64*37+gprsize], m19
4585    mova        [rsp+64*38+gprsize], m20
4586    mova        [rsp+64*39+gprsize], m21
    ; tail call: the callee's ret returns directly to our caller
4587    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3
4588.dconly:
    ; DC-only shortcut: scales the first coefficient and hands off to the
    ; shared 32x16 DC-only code.
    ; NOTE(review): the branch that reaches this label is outside this view;
    ; presumably it is taken only when eob == 0.
4589    DEFINE_ARGS dst, stride, c, eob
4590    imul                r6d, [cq], 181       ; r6d = first coefficient * 181
4591    mov                [cq], eobd            ; overwrite that coefficient with eob (clears it if eob == 0)
4592    or                  r3d, 64              ; r3 is the eob argument; yields 64 if eob == 0 (presumably the row count)
4593    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3
4594.pass1_fast:
    ; First pass, reduced-input variant: only coefficient rows 0-15
    ; (128-byte stride) are read. Every row is multiplied by m12 on load and
    ; fed to the *_rect2 helper entry points.
    ; NOTE(review): m12 is loaded outside this view; presumably pd_2896,
    ; the rectangular-transform scale.
4595    pmulld               m0, m12, [cq+128* 0]
4596    pmulld               m1, m12, [cq+128* 4]
4597    pmulld               m2, m12, [cq+128* 8]
4598    pmulld               m3, m12, [cq+128*12]
4599    mov                 r3d, 16*12               ; NOTE(review): consumer of r3d is not visible in this view
4600    call m(idct_8x16_internal_10bpc).main_fast_rect2
4601    pmulld              m16, m12, [cq+128* 2]
4602    pmulld              m17, m12, [cq+128* 6]
4603    pmulld              m18, m12, [cq+128*10]
4604    pmulld              m19, m12, [cq+128*14]
4605    call m(idct_16x16_internal_10bpc).main_fast_rect2
    ; spills the results so far into cq and loads the odd rows 1..15
4606    call .pass1_load_spill
4607    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
4608    jmp .pass1_end
4609.pass1:
    ; First pass, full variant: reads all 32 coefficient rows (128-byte
    ; stride), multiplying each by m12 on load. Rows that are multiples of 4
    ; go to the 8x16 helper, the remaining even rows to the 16x16 helper,
    ; odd rows 1-15 are loaded by .pass1_load_spill and odd rows 17-31 here.
    ; Execution falls through into .pass1_end.
    ; NOTE(review): m12 is loaded outside this view; presumably pd_2896.
4610    pmulld               m0, m12, [cq+128* 0]
4611    pmulld               m1, m12, [cq+128* 4]
4612    pmulld               m2, m12, [cq+128* 8]
4613    pmulld               m3, m12, [cq+128*12]
4614    pmulld               m4, m12, [cq+128*16]
4615    pmulld               m5, m12, [cq+128*20]
4616    pmulld               m6, m12, [cq+128*24]
4617    pmulld               m7, m12, [cq+128*28]
4618    call m(idct_8x16_internal_10bpc).main_rect2
4619    pmulld              m16, m12, [cq+128* 2]
4620    pmulld              m17, m12, [cq+128* 6]
4621    pmulld              m18, m12, [cq+128*10]
4622    pmulld              m19, m12, [cq+128*14]
4623    pmulld              m20, m12, [cq+128*18]
4624    pmulld              m21, m12, [cq+128*22]
4625    pmulld              m22, m12, [cq+128*26]
4626    pmulld              m23, m12, [cq+128*30]
4627    call m(idct_16x16_internal_10bpc).main_rect2
    ; spills the results so far into cq rows 0-15 and loads odd rows 1..15 into m0-m7
4628    call .pass1_load_spill
    ; remaining odd rows 17..31
4629    pmulld              m16, m12, [cq+128*17]
4630    pmulld              m17, m12, [cq+128*19]
4631    pmulld              m18, m12, [cq+128*21]
4632    pmulld              m19, m12, [cq+128*23]
4633    pmulld              m20, m12, [cq+128*25]
4634    pmulld              m21, m12, [cq+128*27]
4635    pmulld              m22, m12, [cq+128*29]
4636    pmulld              m23, m12, [cq+128*31]
4637    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2
4638.pass1_end:
    ; Common tail of .pass1 and .pass1_fast: runs the shared idct32
    ; first-pass finisher, then interleaves 64-bit lanes of register pairs.
    ; The trailing number on each punpck line is the index the original
    ; author assigned to that result.
4639    vpbroadcastd        m11, [o(pd_1)]       ; m11 = 1 in every dword (presumably rounding/shift for the helper)
4640    lea                  r4, [cq+128*8]      ; r4 = address of coefficient row 8
4641    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
4642    punpcklqdq          m27, m0, m20  ;  0
4643    punpckhqdq           m0, m20      ;  1
4644    punpcklqdq          m24, m5, m16  ; 10
4645    punpckhqdq          m16, m5, m16  ; 11
4646    punpcklqdq          m23, m3, m21  ;  6
4647    punpckhqdq          m21, m3, m21  ;  7
4648    punpcklqdq          m25, m7, m8   ; 14
4649    punpckhqdq           m3, m7, m8   ; 15
4650    punpcklqdq          m22, m15, m4  ;  2
4651    punpckhqdq          m26, m15, m4  ;  3
4652    punpcklqdq          m15, m6, m17  ; 12
4653    punpckhqdq          m17, m6, m17  ; 13
4654    punpcklqdq          m28, m14, m18 ;  8
4655    punpckhqdq          m18, m14, m18 ;  9
4656    punpcklqdq          m14, m2, m1   ;  4
4657    punpckhqdq          m20, m2, m1   ;  5
4658    ret
4659.pass1_load_spill:
    ; After the idct16 sum/sub helper, swaps data between registers and the
    ; coefficient buffer in place:
    ;   - m0-m7 are stored to rows 0-7 and m23..m16 to rows 8-15
    ;   - odd rows 1,3,...,15 are loaded (times m12) into m0-m7
    ; The loads and stores are interleaved so that every odd row is read
    ; before its slot is overwritten.
4660    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
4661    mova        [cq+128* 0], m0
4662    pmulld               m0, m12, [cq+128* 1]    ; read row 1 before the store below replaces it
4663    mova        [cq+128* 1], m1
4664    mova        [cq+128* 2], m2
4665    pmulld               m1, m12, [cq+128* 3]
4666    pmulld               m2, m12, [cq+128* 5]
4667    mova        [cq+128* 3], m3
4668    mova        [cq+128* 4], m4
4669    pmulld               m3, m12, [cq+128* 7]
4670    pmulld               m4, m12, [cq+128* 9]
4671    mova        [cq+128* 5], m5
4672    mova        [cq+128* 6], m6
4673    mova        [cq+128* 7], m7
4674    pmulld               m5, m12, [cq+128*11]
4675    pmulld               m6, m12, [cq+128*13]
4676    pmulld               m7, m12, [cq+128*15]
    ; rows 8-15 receive m23 down to m16 (note the descending register order)
4677    mova        [cq+128* 8], m23
4678    mova        [cq+128* 9], m22
4679    mova        [cq+128*10], m21
4680    mova        [cq+128*11], m20
4681    mova        [cq+128*12], m19
4682    mova        [cq+128*13], m18
4683    mova        [cq+128*14], m17
4684    mova        [cq+128*15], m16
4685    ret
4686
4687cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
    ; Entry point. Arguments: dst, stride, c (coefficient buffer), eob.
    ; Dispatch on eob:
    ;   eob == 0   -> .dconly (taken before any vector registers or stack
    ;                 space are reserved; the real PROLOGUE comes after)
    ;   eob <  36  -> .fast
    ;   eob >= 151 -> .full
    ;   otherwise  -> the medium path that follows, reading rows 0-15
    ;                 (64-byte stride) of the coefficient buffer
4688%undef cmp
4689    lea                  r5, [o_base]
4690    test               eobd, eobd
4691    jz .dconly
4692
    ; re-declare the function with 32 vector registers and 64*32 bytes of stack
4693    PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob
4694%undef cmp
    ; constants kept live across the helpers:
    ; m12 = 2896, m13 = 2048 (rounding for >>12), m14/m15 = 18-bit clip range
4695    vpbroadcastd        m12, [o(pd_2896)]
4696    vpbroadcastd        m13, [o(pd_2048)]
4697    vpbroadcastd        m14, [o(clip_18b_min)]
4698    vpbroadcastd        m15, [o(clip_18b_max)]
4699    cmp                eobd, 36
4700    jl .fast ; 8x8
4701    cmp                eobd, 151
4702    jge .full ; 16x16
    ; medium path: r4 walks the idct64 multiplier table, r6 walks stack scratch
    ; (.main_part1_fast advances both on every call)
4703    lea                  r4, [idct64_mul_16bpc]
4704    lea                  r6, [rsp+4*64]
    ; odd coefficients, two per call: (1,15) (7,9) (5,11) (3,13)
4705    mova                 m0, [cq+64* 1]
4706    mova                 m3, [cq+64*15]
4707    call .main_part1_fast
4708    mova                 m0, [cq+64* 7]
4709    mova                 m3, [cq+64* 9]
4710    call .main_part1_fast
4711    mova                 m0, [cq+64* 5]
4712    mova                 m3, [cq+64*11]
4713    call .main_part1_fast
4714    mova                 m0, [cq+64* 3]
4715    mova                 m3, [cq+64*13]
4716    call .main_part1_fast
4717    call .main_part2
    ; even coefficients: 0,8 for the 8x16 helper and 4,12 for the 16x16 helper;
    ; 2,6,10,14 are loaded by .pass1_load_spill
4718    mova                 m0, [cq+64* 0]
4719    mova                 m1, [cq+64* 8]
4720    mova                m16, [cq+64* 4]
4721    mova                m17, [cq+64*12]
4722    call m(idct_8x16_internal_10bpc).main_fast2
4723    call m(idct_16x16_internal_10bpc).main_fast2
4724    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
4725    call .pass1_load_spill
4726    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
    ; r6*8 is the byte offset where .zero_loop starts clearing: 12*8*8 = row 12,
    ; so rows 0-15 get zeroed
4727    mov                 r6d, 12*8
4728    jmp .idct64_end
4729.full:
    ; Full path (eob >= 151): same structure as the medium path, but reads
    ; all 32 coefficient rows (64-byte stride) and uses the four-input
    ; .main_part1 instead of the two-input .main_part1_fast.
4730    lea                  r4, [idct64_mul_16bpc]  ; multiplier table, advanced by .main_part1
4731    lea                  r6, [rsp+4*64]          ; stack scratch, advanced by .main_part1
    ; odd coefficients, four per call:
    ; (1,31,17,15) (7,25,23,9) (5,27,21,11) (3,29,19,13)
4732    mova                 m0, [cq+64* 1]
4733    mova                 m1, [cq+64*31]
4734    mova                 m2, [cq+64*17]
4735    mova                 m3, [cq+64*15]
4736    call .main_part1
4737    mova                 m0, [cq+64* 7]
4738    mova                 m1, [cq+64*25]
4739    mova                 m2, [cq+64*23]
4740    mova                 m3, [cq+64* 9]
4741    call .main_part1
4742    mova                 m0, [cq+64* 5]
4743    mova                 m1, [cq+64*27]
4744    mova                 m2, [cq+64*21]
4745    mova                 m3, [cq+64*11]
4746    call .main_part1
4747    mova                 m0, [cq+64* 3]
4748    mova                 m1, [cq+64*29]
4749    mova                 m2, [cq+64*19]
4750    mova                 m3, [cq+64*13]
4751    call .main_part1
4752    call .main_part2
    ; even coefficients: multiples of 8 for the 8x16 helper,
    ; 4,12,20,28 for the 16x16 helper
4753    mova                 m0, [cq+64* 0]
4754    mova                 m1, [cq+64* 8]
4755    mova                 m2, [cq+64*16]
4756    mova                 m3, [cq+64*24]
4757    mova                m16, [cq+64* 4]
4758    mova                m17, [cq+64*12]
4759    mova                m18, [cq+64*20]
4760    mova                m19, [cq+64*28]
4761    call m(idct_8x16_internal_10bpc).main_fast
4762    call m(idct_16x16_internal_10bpc).main_fast
4763    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    ; .pass1_load_spill loads rows 2,6,10,14 into m0-m3; rows 18-30 follow here
4764    call .pass1_load_spill
4765    mova                 m4, [cq+64*18]
4766    mova                 m5, [cq+64*22]
4767    mova                 m6, [cq+64*26]
4768    mova                 m7, [cq+64*30]
4769    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
    ; r6*8 is the byte offset where .zero_loop starts clearing: 28*8*8 = row 28,
    ; so rows 0-31 get zeroed
4770    mov                 r6d, 28*8
4771    jmp .idct64_end
4772.dconly:
    ; DC-only path, reached from the entry point when eob == 0.
    ; Scales the single DC coefficient and adds the result to 16 rows of
    ; 64 pixels. .dconly1 and .dconly2 are additional entry labels.
    ; NOTE(review): any callers of .dconly1/.dconly2 are outside this view;
    ; they would need r6d, r3d (row count) and r5 set up the same way.
4773    imul                r6d, [cq], 181       ; r6d = dc * 181
4774    mov                [cq], eobd            ; eob is 0 on this path, so this clears the coefficient
4775    or                  r3d, 16              ; r3 is the eob register (0 here) -> row count 16
4776.dconly1:
4777    add                 r6d, 640
4778    sar                 r6d, 10              ; r6d = (r6d + 640) >> 10
4779.dconly2:
4780    vpbroadcastd         m3, [o(dconly_10bpc)]
4781    imul                r6d, 181
4782    add                 r6d, 2176
4783    sar                 r6d, 12              ; r6d = (r6d * 181 + 2176) >> 12
4784    vpbroadcastw         m2, r6d             ; replicate the low 16 bits into every word
4785    paddsw               m2, m3              ; fold the bias constant into the DC value
4786.dconly_loop:
    ; Per row: signed-saturating add of (dc + bias), then unsigned-saturating
    ; subtract of the bias. Two 64-byte vectors cover one row of 64 words.
    ; NOTE(review): dconly_10bpc is defined outside this view; presumably it
    ; is chosen so that this add/subtract pair clamps to the 10-bit range.
4787    paddsw               m0, m2, [dstq+64*0]
4788    paddsw               m1, m2, [dstq+64*1]
4789    psubusw              m0, m3
4790    psubusw              m1, m3
4791    mova        [dstq+64*0], m0
4792    mova        [dstq+64*1], m1
4793    add                dstq, strideq
4794    dec                 r3d
4795    jg .dconly_loop
4796    ret
4797.pass1_load_spill:
    ; Swaps data between registers and the coefficient buffer in place
    ; (64-byte slots):
    ;   - m0-m7 are stored to the even slots 0,2,...,14
    ;   - m23..m16 are stored to the odd slots 1,3,...,15
    ;   - input rows 2,6,10,14 are loaded into m0-m3
    ; Each of rows 2,6,10,14 is read before its slot is overwritten.
4798    mova         [cq+64* 0], m0
4799    mova                 m0, [cq+64* 2]      ; read row 2 before the next store replaces it
4800    mova         [cq+64* 2], m1
4801    mova                 m1, [cq+64* 6]      ; read row 6 before it is overwritten two lines down
4802    mova         [cq+64* 4], m2
4803    mova         [cq+64* 6], m3
4804    mova                 m2, [cq+64*10]
4805    mova                 m3, [cq+64*14]
4806    mova         [cq+64* 8], m4
4807    mova         [cq+64*10], m5
4808    mova         [cq+64*12], m6
4809    mova         [cq+64*14], m7
    ; odd slots receive m23 down to m16 (descending register order)
4810    mova         [cq+64* 1], m23
4811    mova         [cq+64* 3], m22
4812    mova         [cq+64* 5], m21
4813    mova         [cq+64* 7], m20
4814    mova         [cq+64* 9], m19
4815    mova         [cq+64*11], m18
4816    mova         [cq+64*13], m17
4817    mova         [cq+64*15], m16
4818    ret
4819ALIGN function_align
4820.main_part1_fast_rect2:
    ; rect2 entry: round (m13) and shift m0/m3 right by 12 first, then fall
    ; through. Presumably the caller has already multiplied them by 2896.
4821    REPX     {paddd x, m13}, m0, m3
4822    REPX     {psrad x, 12 }, m0, m3
4823.main_part1_fast:
    ; Two-input variant of .main_part1: only m0 and m3 carry data, the m1/m2
    ; inputs of the full version are treated as zero. Uses table entries
    ; 0,1,6,7 for the multiplies and 8,9 for the first rotation, then joins
    ; the shared code at .main_part1b.
4824    pmulld               m7, m0, [r4+4*0]{bcstd}    ; t63a
4825    pmulld               m0, [r4+4*1]{bcstd}        ; t32a
4826    pmulld               m4, m3, [r4+4*6]{bcstd}    ; t60a
4827    pmulld               m3, [r4+4*7]{bcstd}        ; t35a
4828    vpbroadcastd        m10, [r4+4*8]
4829    vpbroadcastd        m11, [r4+4*9]
4830    REPX     {paddd x, m13}, m7, m0, m4, m3
4831    REPX     {psrad x, 12 }, m7, m0, m4, m3
    ; With the other two inputs zero, the differences the full version
    ; computes (t33, t62, t34, t61) reduce to plain copies.
4832    mova                 m8, m0
4833    mova                 m1, m7
4834    mova                 m6, m3
4835    mova                 m2, m4
4836    jmp .main_part1b
4837.main_part1_rect2:
    ; rect2 entry: round (m13) and shift the four inputs right by 12 first,
    ; then fall through. Presumably the caller has already multiplied them
    ; by 2896.
4838    REPX     {paddd x, m13}, m0, m1, m2, m3
4839    REPX     {psrad x, 12 }, m0, m1, m2, m3
4840.main_part1: ; idct64 steps 1-5
    ; Inputs:  m0-m3 = four odd coefficients (one line of the table below)
    ;          r4    = current position in the multiplier table
    ;          r6    = stack scratch pointer
    ;          m13   = rounding constant, m14/m15 = clip min/max
    ; Outputs: eight vectors stored at [r6-64*4 .. r6+64*3]
    ; On return r4 has advanced by 12 dwords and r6 by 8 vectors, so the
    ; routine can be called back-to-back for each line of the table.
    ; Clobbers m0-m11.
4841    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
4842    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
4843    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
4844    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
4845    pmulld               m7, m0, [r4+4*0]{bcstd}    ; t63a
4846    pmulld               m0, [r4+4*1]{bcstd}        ; t32a
4847    pmulld               m6, m1, [r4+4*2]{bcstd}    ; t62a
4848    pmulld               m1, [r4+4*3]{bcstd}        ; t33a
4849    pmulld               m5, m2, [r4+4*4]{bcstd}    ; t61a
4850    pmulld               m2, [r4+4*5]{bcstd}        ; t34a
4851    pmulld               m4, m3, [r4+4*6]{bcstd}    ; t60a
4852    pmulld               m3, [r4+4*7]{bcstd}        ; t35a
    ; coefficient pair for the two rotations right after .main_part1b
4853    vpbroadcastd        m10, [r4+4*8]
4854    vpbroadcastd        m11, [r4+4*9]
    ; (x + 2048) >> 12 on all eight products
4855    REPX     {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3
4856    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
4857    psubd                m8, m0, m1 ; t33
4858    paddd                m0, m1     ; t32
4859    psubd                m1, m7, m6 ; t62
4860    paddd                m7, m6     ; t63
4861    psubd                m6, m3, m2 ; t34
4862    paddd                m3, m2     ; t35
4863    psubd                m2, m4, m5 ; t61
4864    paddd                m4, m5     ; t60
4865.main_part1b:
    ; shared with .main_part1_fast: clip to [m14, m15] before each rotation
4866    REPX    {pmaxsd x, m14}, m8, m1, m6, m2
4867    REPX    {pminsd x, m15}, m8, m1, m6, m2
4868    ITX_MULSUB_2D         1, 8, 5, 9, _, 13, 10, 11    ; t33a, t62a
4869    ITX_MULSUB_2D         2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a
4870    REPX    {pmaxsd x, m14}, m0, m3, m7, m4
4871    REPX    {pminsd x, m15}, m0, m3, m7, m4
    ; coefficient pair for the second set of rotations
4872    vpbroadcastd        m10, [r4+4*10]
4873    vpbroadcastd        m11, [r4+4*11]
4874    psubd                m5, m0, m3 ; t35a
4875    paddd                m0, m3     ; t32a
4876    psubd                m3, m7, m4 ; t60a
4877    paddd                m7, m4     ; t63a
4878    psubd                m4, m1, m6 ; t34
4879    paddd                m1, m6     ; t33
4880    psubd                m6, m8, m2 ; t61
4881    paddd                m8, m2     ; t62
4882    REPX    {pmaxsd x, m14}, m5, m3, m4, m6
4883    REPX    {pminsd x, m15}, m5, m3, m4, m6
4884    ITX_MULSUB_2D         3, 5, 2, 9, _, 13, 10, 11 ; t35,  t60
4885    ITX_MULSUB_2D         6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a
4886    REPX    {pmaxsd x, m14}, m0, m7, m1, m8
4887    REPX    {pminsd x, m15}, m0, m7, m1, m8
4888    add                  r4, 4*12       ; 12 table dwords consumed per call
    ; store the eight results around r6 (four slots below, four at/above)
4889    mova          [r6-64*4], m0
4890    mova          [r6+64*3], m7
4891    mova          [r6-64*3], m1
4892    mova          [r6+64*2], m8
4893    mova          [r6-64*2], m6
4894    mova          [r6+64*1], m4
4895    mova          [r6-64*1], m3
4896    mova          [r6+64*0], m5
4897    add                  r6, 64*8       ; next call writes the following 8 slots
4898    ret
4899.main_part2: ; idct64 steps 6-9
    ; Combines the 32 vectors written by four .main_part1[_fast] calls,
    ; in place. On entry r6 is as left by the last of those calls (just
    ; past the 32 stored slots, plus four). r6 then walks upward and r4
    ; downward, one slot per iteration, until they cross. The initial gap
    ; is 7 slots and shrinks by 2 each time, so the loop runs 4 iterations.
    ; Uses m12 (multiplier), m13 (rounding), m14/m15 (clip); clobbers m0-m11.
4900    lea                  r4, [r6+64*3]
4901    sub                  r6, 64*4
4902    vpbroadcastd        m10, [pd_1567]
4903    vpbroadcastd        m11, [pd_3784]
4904.main_part2_loop:
    ; the negative offsets reach back into the four 8-slot groups
4905    mova                 m0, [r6-64*32] ; t32a
4906    mova                 m1, [r4-64*24] ; t39a
4907    mova                 m2, [r4-64*32] ; t63a
4908    mova                 m3, [r6-64*24] ; t56a
4909    mova                 m4, [r6-64*16] ; t40a
4910    mova                 m5, [r4-64* 8] ; t47a
4911    mova                 m6, [r4-64*16] ; t55a
4912    mova                 m7, [r6-64* 8] ; t48a
4913    psubd                m8, m0, m1 ; t39
4914    paddd                m0, m1     ; t32
4915    psubd                m1, m2, m3 ; t56
4916    paddd                m2, m3     ; t63
4917    psubd                m3, m5, m4 ; t40
4918    paddd                m5, m4     ; t47
4919    psubd                m4, m7, m6 ; t55
4920    paddd                m7, m6     ; t48
4921    REPX    {pmaxsd x, m14}, m8, m1, m3, m4
4922    REPX    {pminsd x, m15}, m8, m1, m3, m4
4923    ITX_MULSUB_2D         1, 8, 6, 9, _, 13, 10, 11    ; t39a, t56a
4924    ITX_MULSUB_2D         4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a
4925    REPX    {pmaxsd x, m14}, m0, m2, m5, m7
4926    REPX    {pminsd x, m15}, m0, m5, m2, m7
4927    psubd                m6, m2, m7 ; t48a
4928    paddd                m2, m7     ; t63a
4929    psubd                m7, m0, m5 ; t47a
4930    paddd                m0, m5     ; t32a
4931    psubd                m5, m8, m4 ; t55
4932    paddd                m8, m4     ; t56
4933    psubd                m4, m1, m3 ; t40
4934    paddd                m1, m3     ; t39
4935    REPX    {pmaxsd x, m14}, m6, m7, m5, m4
4936    REPX    {pminsd x, m15}, m6, m7, m5, m4
    ; final stage: multiply by m12, add rounding once per pair, then take
    ; sum and difference and shift right by 12
4937    REPX    {pmulld x, m12}, m6, m7, m5, m4
4938    REPX    {pmaxsd x, m14}, m2, m0, m8, m1
4939    REPX    {pminsd x, m15}, m2, m0, m8, m1
4940    paddd                m6, m13
4941    paddd                m5, m13
4942    psubd                m3, m6, m7 ; t47
4943    paddd                m6, m7     ; t48
4944    psubd                m7, m5, m4 ; t40a
4945    paddd                m5, m4     ; t55a
4946    REPX      {psrad x, 12}, m3, m6, m7, m5
    ; write the results back over the slots that were read (permuted)
4947    mova         [r4-64* 8], m2
4948    mova         [r6-64*32], m0
4949    mova         [r6-64* 8], m8
4950    mova         [r4-64*32], m1
4951    mova         [r4-64*24], m3
4952    mova         [r6-64*16], m6
4953    mova         [r6-64*24], m7
4954    mova         [r4-64*16], m5
4955    add                  r6, 64
4956    sub                  r4, 64
4957    cmp                  r6, r4
4958    jl .main_part2_loop
4959    ret
4960.idct64_main_end:
    ; Final butterflies of the first pass. Combines the idct32 results
    ; (part in registers m0-m7/m16-m23, part in memory at cq and r4) with
    ; the idct64 odd-half values stored at r3, producing all 64 outputs
    ; packed to 16 bits.
    ; Inputs:  cq, r4 = bases of the two interleaved idct32 halves
    ;          r3     = base of the 32 idct64 odd-half slots (64 bytes each)
    ;          m11    = used both as rounding addend and per-lane shift count
    ;          m14/m15 = clip min/max
    ; Outputs: m0-m7 and m16-m23 hold one packed half of the results; the
    ;          other half is written to slots 16-31 at r3.
    ; Clobbers m24-m31.
    ;
    ; IDCT64_PASS1_END arguments:
    ;   1 = slot index at the base pointer (128-byte stride)
    ;   2 = register holding the idct32 odd-half value (in), out31-n (out)
    ;   3, 4 = slot indices at r3 for t32+n and t63-n
    ;   5, 6, 7, 8 = result registers for out0+n, out31-n, out32+n, out63-n
    ;   9 = base pointer (cq or r4)
4961%macro IDCT64_PASS1_END 9
4962    mova                m%5, [%9+%1*128]    ; t0+n [idct32] + idct64 rounding
4963    psubd               m%6, m%5, m%2       ; out31-n [idct32] = t31-n [idct64]
4964    paddd               m%5, m%2            ; out0+n [idct32] = t0+n [idct64]
4965    REPX    {pmaxsd x, m14}, m%6, m%5
4966    REPX    {pminsd x, m15}, m%6, m%5
4967    REPX    {paddd  x, m11}, m%6, m%5
4968    mova                m%2, [r3+%3*64]     ; t32+n [idct64]
4969    mova                m%7, [r3+%4*64]     ; t63-n [idct64]
4970    psubd               m%8, m%5, m%7       ; out63-n
4971    paddd               m%5, m%7            ; out0+n
4972    psubd               m%7, m%6, m%2       ; out32+n
4973    paddd               m%6, m%2            ; out31-n
4974    REPX   {vpsravd x, m11}, m%8, m%5, m%7, m%6
4975%endmacro
4976
    ; IDCT64_PASS1_ENDx4 n: four IDCT64_PASS1_END butterflies for index n
    ; (0-3). The m-numbers are slot indices at r3, the r-numbers are register
    ; numbers, and the c-numbers are slot indices at cq/r4. Results are packed
    ; to words in pairs: one half stays in the r-registers, the other half
    ; goes back to the r3 slots.
4977%macro IDCT64_PASS1_ENDx4 1
4978%assign %%m1 %1         ; t32+n
4979%assign %%m2 (7-%1)     ; t39-n
4980%assign %%m3 (8+%1)     ; t40+n
4981%assign %%m4 (15-%1)    ; t47-n
4982%assign %%m5 (16+%1)    ; t48+n
4983%assign %%m6 (23-%1)    ; t55-n
4984%assign %%m7 (24+%1)    ; t56+n
4985%assign %%m8 (31-%1)    ; t63-n
4986
4987%assign %%r1 %1         ; t16+n
4988%assign %%r2 (7-%1)     ; t23-n
4989%assign %%r3 (16+%1)    ; t24-n
4990%assign %%r4 (23-%1)    ; t31-n
4991
4992%assign %%c1 (%1)       ; t0/8+n
4993%assign %%c2 (7-%1)     ; t7/15-n
4994
4995    IDCT64_PASS1_END   %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63
4996    IDCT64_PASS1_END   %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48
4997    packssdw      m %+ %%r1, m24, m29
4998    packssdw      m %+ %%r4, m28, m25
4999    packssdw            m26, m31
5000    packssdw            m30, m27
5001    mova   [r3+%%m5*mmsize], m26
5002    mova   [r3+%%m8*mmsize], m30
5003    IDCT64_PASS1_END   %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56
5004    IDCT64_PASS1_END   %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55
5005    packssdw      m %+ %%r2, m24, m29
5006    packssdw      m %+ %%r3, m28, m25
5007    packssdw            m26, m31
5008    packssdw            m30, m27
5009    mova   [r3+%%m6*mmsize], m26
5010    mova   [r3+%%m7*mmsize], m30
5011%endmacro
    ; n = 0..3 covers all 8 slots of each group (n and 7-n)
5012    IDCT64_PASS1_ENDx4    0
5013    IDCT64_PASS1_ENDx4    1
5014    IDCT64_PASS1_ENDx4    2
5015    IDCT64_PASS1_ENDx4    3
5016    ret
5017.idct64_end:
    ; Common tail of the medium and full paths. Finishes the first pass,
    ; clears the coefficient buffer, then runs the second pass twice: once
    ; on the register-resident half of the data and once on the half that
    ; .idct64_main_end spilled to stack slots 16-31.
    ; On entry r6d holds the starting index for the clearing loop.
5018    vpbroadcastd        m11, [o(pd_2)]       ; rounding addend and shift count for .idct64_main_end
5019    lea                  r4, [cq+64]         ; odd 64-byte slots written by .pass1_load_spill
5020    mov                  r3, rsp             ; idct64 odd-half slots live at the bottom of the stack area
5021    lea                  r5, [o_base_8bpc]   ; presumably for the 8bpc helpers called from .pass2
5022    call .idct64_main_end
5023
5024    pxor                m12, m12
5025.zero_loop:
    ; clear 4 x 64 bytes at cq + r6*8, stepping down 256 bytes per iteration
    ; until the index goes negative
5026    REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3
5027    sub                 r6d, 8*4
5028    jge .zero_loop
5029
5030    lea                  r3, [strideq*3]
5031    mov                  r4, dstq            ; keep the original dst for the second half
5032    call .pass2
    ; reload the half of the first-pass output that was spilled to slots 16-31
5033    mova                 m0, [rsp+16*mmsize]
5034    mova                 m1, [rsp+17*mmsize]
5035    mova                 m2, [rsp+18*mmsize]
5036    mova                 m3, [rsp+19*mmsize]
5037    mova                 m4, [rsp+20*mmsize]
5038    mova                 m5, [rsp+21*mmsize]
5039    mova                 m6, [rsp+22*mmsize]
5040    mova                 m7, [rsp+23*mmsize]
5041    mova                m16, [rsp+24*mmsize]
5042    mova                m17, [rsp+25*mmsize]
5043    mova                m18, [rsp+26*mmsize]
5044    mova                m19, [rsp+27*mmsize]
5045    mova                m20, [rsp+28*mmsize]
5046    mova                m21, [rsp+29*mmsize]
5047    mova                m22, [rsp+30*mmsize]
5048    mova                m23, [rsp+31*mmsize]
5049    lea                dstq, [r4+64]         ; 64 bytes (32 16-bit pixels) to the right of the first half
5050    call .pass2
5051    RET
5052.pass2:
    ; Second pass for one 32-pixel-wide half: transposes the packed
    ; first-pass output, reorders it with 64-bit interleaves, runs the 8bpc
    ; 16-point column transform helpers, then falls through into .write.
    ; The trailing number on each punpck line is the index the original
    ; author assigned to that result.
5053    psrlq               m12, [permC], 24    ;  0  2  8 10  1  3  9 11
5054    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
5055    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
5056
5057    punpckhqdq          m19, m5, m16  ; 11
5058    punpcklqdq           m5, m16      ; 10
5059    punpckhqdq          m16, m2, m1   ;  5
5060    punpcklqdq           m2, m1       ;  4
5061    punpcklqdq           m1, m15, m4  ;  2
5062    punpckhqdq          m15, m4       ;  3
5063    punpcklqdq           m4, m14, m18 ;  8
5064    punpckhqdq          m18, m14, m18 ;  9
5065    punpckhqdq          m14, m0, m20  ;  1
5066    punpcklqdq           m0, m20      ;  0
5067    punpckhqdq          m20, m6, m17  ; 13
5068    punpcklqdq           m6, m17      ; 12
5069    punpckhqdq          m17, m3, m21  ;  7
5070    punpcklqdq           m3, m21      ;  6
5071    punpckhqdq          m21, m7, m8   ; 15
5072    punpcklqdq           m7, m8       ; 14
5073
5074    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
5075    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
5076.write:
    ; Output stage, also entered from .pass2_fast. Writes 16 rows in three
    ; steps: 8 rows through write_32x8, then two groups of 4 rows through
    ; write_32x4. pmulhrsw by 2048 is a rounded arithmetic shift right by 4.
    ; m12 = 0 and m13 = pixel_10bpc_max are presumably the clamp bounds used
    ; by the write helpers (their code is outside this view).
5077    vpbroadcastd        m11, [pw_2048]
5078    pxor                m12, m12
5079    vpbroadcastd        m13, [pixel_10bpc_max]
5080    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
5081    pmulhrsw             m0, m11, m14
5082    pmulhrsw             m1, m11, m15
5083    pmulhrsw             m2, m11, m16
5084    pmulhrsw             m3, m11, m17
5085    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
5086    pmulhrsw             m0, m11, m18
5087    pmulhrsw             m1, m11, m19
5088    pmulhrsw             m2, m11, m20
5089    pmulhrsw             m3, m11, m21
    ; tail call: write_32x4 returns directly to the caller of .pass2/.write
5090    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
5091.fast: ; 8x8 packed
    ; Path for eob < 36: only the low 32 bytes of coefficient rows 0-7 are
    ; read. Pairs of rows are packed into one zmm register so that two
    ; transform inputs are processed per instruction.
5092    movshdup             m7, [o(permB)]
5093    mova                ym0, [cq+64*1]
5094    mova                ym2, [cq+64*5]
5095    mova                ym3, [cq+64*3]
5096    mova                ym1, [cq+64*7]
5097    vpermt2q             m0, m7, m2 ;  1  5
5098    vpermt2q             m1, m7, m3 ;  7  3
5099    call .main_oddhalf_packed
    ; spill the idct64 odd-half results to stack slots 0-15; .main_end reads
    ; them back from there
5100    mova    [rsp+ 0*mmsize], m0
5101    mova    [rsp+ 1*mmsize], m1
5102    mova    [rsp+ 2*mmsize], m2
5103    mova    [rsp+ 3*mmsize], m3
5104    mova    [rsp+ 4*mmsize], m4
5105    mova    [rsp+ 5*mmsize], m5
5106    mova    [rsp+ 6*mmsize], m6
5107    mova    [rsp+ 7*mmsize], m7
5108    mova    [rsp+ 8*mmsize], m16
5109    mova    [rsp+ 9*mmsize], m17
5110    mova    [rsp+10*mmsize], m18
5111    mova    [rsp+11*mmsize], m19
5112    mova    [rsp+12*mmsize], m20
5113    mova    [rsp+13*mmsize], m21
5114    mova    [rsp+14*mmsize], m22
5115    mova    [rsp+15*mmsize], m23
5116
    ; even rows: 2 and 6 are packed together, 0 and 4 are each permuted
    ; on their own (m7 was overwritten above, so the permutation is reloaded)
5117    movshdup             m7, [o(permB)]
5118    mova                ym0, [cq+64*0]
5119    mova                ym4, [cq+64*4]
5120    mova               ym16, [cq+64*2]
5121    mova                ym5, [cq+64*6]
5122    vpermt2q            m16, m7, m5 ;  2  6
5123    vpermq               m0, m7, m0 ;  0  0
5124    vpermq               m4, m7, m4 ;  4  4
5125    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
5126    ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data
5127
5128    ; zero input coefs
    ; (exactly the 32-byte regions of rows 0-7 that were read above)
5129    pxor                m12, m12
5130    REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
5131
5132    vpbroadcastd        m11, [o(pd_2)]       ; rounding addend and shift count for .main_end
5133    call .main_end
5134    lea                  r3, [strideq*3]
5135    mov                  r4, dstq            ; keep the original dst for the second half
5136    call .pass2_fast
    ; second half: .main_end left its other eight packed results in m24-m31
5137    mova                 m0, m24
5138    mova                 m1, m25
5139    mova                 m2, m26
5140    mova                 m3, m27
5141    mova                 m4, m28
5142    mova                 m5, m29
5143    mova                 m6, m30
5144    mova                 m7, m31
5145    lea                dstq, [r4+64]         ; 64 bytes (32 16-bit pixels) to the right of the first half
    ; .pass2_fast switched r5 to o_base_8bpc; restore it before calling again
    ; (presumably the transpose helper it calls first needs o_base)
5146    lea                  r5, [o_base]
5147    call .pass2_fast
5148    RET
5149.pass2_fast:
    ; Second pass for one 32-pixel-wide half on the .fast path: transposes
    ; the packed first-pass output, splits it into eight inputs with 64-bit
    ; interleaves, runs the reduced 8bpc odd-half helper and finishes in the
    ; shared .write code.
    ; The trailing number on each punpck line is the index the original
    ; author assigned to that result.
5150    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
5151    lea                  r5, [o_base_8bpc]   ; switch the constant base for the 8bpc helper below
5152    punpckhqdq          m14, m0, m2 ; 1
5153    punpcklqdq           m0, m2     ; 0
5154    punpcklqdq           m1, m3, m4 ; 2
5155    punpckhqdq          m15, m3, m4 ; 3
5156    punpcklqdq           m2, m5, m7 ; 4
5157    punpckhqdq          m16, m5, m7 ; 5
5158    punpcklqdq           m3, m6, m8 ; 6
5159    punpckhqdq          m17, m6, m8 ; 7
5160    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
5161    jmp .write
5162.main_end:
    ; Packed counterpart of .idct64_main_end, used by the .fast path.
    ; Combines the dct32 data in m0-m7, m9, m16-m22 with the idct64 odd-half
    ; values spilled to stack slots 0-15, and packs the 32-bit results to
    ; 16 bits.
    ; Inputs:  m11 = used both as rounding addend and per-lane shift count
    ;          m14/m15 = clip min/max
    ; Outputs: m0-m7 and m24-m31 (packed words).
    ; Clobbers m9, m10, m12, m13, m16-m22.
5163
    ; IDCT64_PASS1_PACKED_END arguments:
    ;   1 = register with t0+n (in), out0+n (out)
    ;   2 = register with the idct32 odd-half value (in), out31-n (out)
    ;   3 = result register for out32+n
    ;   4 = result register for out63-n
    ;   5 = scratch register
    ;   6, 7 = stack slot indices for t32+n and t63-n. The gprsize term in
    ;          the addresses skips the return address pushed by the call
    ;          into this routine, so they match the caller's [rsp+N*mmsize].
5164%macro IDCT64_PASS1_PACKED_END 7
5165    psubd               m%5, m%1, m%2       ; out31-n [idct32] = t31-n [idct64]
5166    paddd               m%1, m%2            ; out0+n [idct32] = t0+n [idct64]
5167    REPX    {pmaxsd x, m14}, m%5, m%1
5168    REPX    {pminsd x, m15}, m%5, m%1
5169    REPX    {paddd  x, m11}, m%5, m%1
5170    mova                m%2, [rsp+%6*64+gprsize]    ; t32+n [idct64]
5171    mova                m%3, [rsp+%7*64+gprsize]    ; t63-n [idct64]
5172    psubd               m%4, m%1, m%3       ; out63-n
5173    paddd               m%1, m%3            ; out0+n
5174    psubd               m%3, m%5, m%2       ; out32+n
5175    paddd               m%2, m%5            ; out31-n
5176    REPX   {vpsravd x, m11}, m%4, m%1, m%3, m%2
5177%endmacro
5178
    ; Each pair of macro invocations is followed by four packssdw that merge
    ; the two sets of 32-bit results into word vectors.
5179    IDCT64_PASS1_PACKED_END  0, 22, 24, 10, 12, 0, 15   ; out0/1,31/30,32/33,63/62
5180    IDCT64_PASS1_PACKED_END  7,  9, 31, 13, 12, 7,  8   ; out15/14,16/17,47/46,48/49
5181    packssdw             m0, m9
5182    packssdw             m7, m22
5183    packssdw            m24, m13
5184    packssdw            m31, m10
5185    IDCT64_PASS1_PACKED_END  1, 21, 25, 10, 12, 1, 14   ; out3/2,28/29,35/34,60/61
5186    IDCT64_PASS1_PACKED_END  6, 16, 30, 13, 12, 6,  9   ; out12/13,19/18,44/45,51/50
5187    packssdw             m1, m16
5188    packssdw             m6, m21
5189    packssdw            m25, m13
5190    packssdw            m30, m10
5191    IDCT64_PASS1_PACKED_END  2, 20, 26, 10, 12, 2, 13   ; out4/5,27/26,36/37,59/58
5192    IDCT64_PASS1_PACKED_END  5, 17, 29, 13, 12, 5, 10   ; out11/10,20/21,43/42,52/53
5193    packssdw             m2, m17
5194    packssdw             m5, m20
5195    packssdw            m26, m13
5196    packssdw            m29, m10
5197    IDCT64_PASS1_PACKED_END  3, 19, 27, 10, 12, 3, 12   ; out7/6,24/25,39/38,56/57
5198    IDCT64_PASS1_PACKED_END  4, 18, 28, 13, 12, 4, 11   ; out8/9,23/22,40/41,55/54
5199    packssdw             m3, m18
5200    packssdw             m4, m19
5201    packssdw            m27, m13
5202    packssdw            m28, m10
5203    ret
5204.main_oddhalf_packed_rect2:
5205    REPX    {paddd  x, m13}, m0, m1
5206    REPX    {psrad  x, 12 }, m0, m1
5207.main_oddhalf_packed:
5208    ; m0=in1 in5, m1=in7 in3
5209    vbroadcasti32x4      m2, [o(pd_101_501)]
5210    vbroadcasti32x4      m3, [o(pd_m700_m301)]
5211    vbroadcasti32x4      m4, [o(pd_4095_4065)]
5212    vbroadcasti32x4      m5, [o(pd_4036_4085)]
5213    pmulld               m2, m0
5214    pmulld               m3, m1
5215    pmulld               m0, m4
5216    pmulld               m1, m5
5217    REPX    {paddd  x, m13}, m2, m3, m0, m1
5218    REPX    {psrad  x, 12 }, m2, m3, m0, m1
5219
5220    ; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47
5221    ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49
5222    ; end of step 1-2
5223
5224    vbroadcasti32x4     m10, [o(pd_401_1931)]
5225    vbroadcasti32x4     m11, [o(pd_4076_3612)]
5226    mova                 m4, m0
5227    mova                 m5, m2
5228    ITX_MULSUB_2D         4, 5, 8, 9, _, 13, 10, 11
5229    vbroadcasti32x4     m10, [o(pd_3166_3920)]
5230    vbroadcasti32x4     m11, [o(pd_2598_1189)]
5231    mova                 m6, m3
5232    mova                 m7, m1
5233    ITX_MULSUB_2D         7, 6, 8, 9, _, 13, 10, 11, 2
5234
5235    ; m4=t33a t41a -> t41/42  t33/34,  m5=t63a t54a -> t61/62  t53/54
5236    ; m6=t38a t46a -> t37/38  t45/46,  m7=t57a t49a -> t57/58  t49/50
5237    ; and from earlier:
5238    ; m0=t63  t55  -> t60/63a t52/55a, m1=t56  t48  -> t56/59a t48/51a
5239    ; m2=t32  t40  -> t32/35a t40/43a, m3=t39  t47  -> t36/39a t44/47a
5240    ; end of step 3-4
5241
5242    punpcklqdq          m22, m2, m4     ; t32a/33 or t35a/34
5243    punpcklqdq          m21, m3, m6     ; t36a/37 or t39a/38
5244    punpckhqdq          m18, m2, m4     ; t40a/41 or t43a/42
5245    punpckhqdq          m17, m3, m6     ; t44a/45 or t47a/46
5246    punpckhqdq           m6, m1, m7     ; t48a/49 or t51a/50
5247    punpckhqdq          m19, m0, m5     ; t52a/53 or t55a/54
5248    punpcklqdq           m8, m1, m7     ; t56a/57 or t59a/58
5249    punpcklqdq          m23, m0, m5     ; t60a/61 or t63a/62
5250    mova                 m0, m22
5251    mova                 m7, m21
5252    mova                 m3, m18
5253    mova                m16, m17
5254    mova                 m5, m6
5255    mova                 m4, m19
5256    mova                 m2, m8
5257    mova                 m1, m23
5258    ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a]
5259
5260    ; step5
5261    vpbroadcastd        m10, [o(pd_799)]
5262    vpbroadcastd        m11, [o(pd_4017)]
5263    ITX_MULSUB_2D         1, 22, 20, 9, _, 13, 10, 11    ; t35/34a, t60/61a
5264    ITX_MULSUB_2D         8,  7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a
5265    vpbroadcastd        m10, [o(pd_3406)]
5266    vpbroadcastd        m11, [o(pd_2276)]
5267    ITX_MULSUB_2D        19,  3, 20, 9, _, 13, 10, 11    ; t43/42a, t52/53a
5268    ITX_MULSUB_2D         5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a
5269    ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a]
5270    ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a]
5271
5272    ; step6
5273    psubd               m20, m0, m21    ; t39/38a
5274    paddd                m0, m21        ; t32/33a
5275    psubd               m21, m1, m7     ; t36a/37
5276    paddd                m1, m7         ; t35a/34
5277    REPX    {pmaxsd x, m14}, m20, m0, m21, m1
5278    psubd                m7, m16, m18   ; t40/41a
5279    paddd               m16, m18        ; t47/46a
5280    REPX    {pminsd x, m15}, m20, m0, m21, m1
5281    psubd               m18, m17, m19   ; t43a/42
5282    paddd               m17, m19        ; t44a/45
5283    REPX    {pmaxsd x, m14}, m7, m16, m18, m17
5284    psubd               m19, m6, m4     ; t55/54a
5285    paddd                m6, m4         ; t48/49a
5286    REPX    {pminsd x, m15}, m7, m16, m18, m17
5287    psubd                m4, m5, m3     ; t52a/53
5288    paddd                m5, m3         ; t51a/50
5289    REPX    {pmaxsd x, m14}, m19, m6, m4, m5
5290    psubd                m3, m23, m2    ; t56/57a
5291    paddd               m23, m2         ; t63/62a
5292    REPX    {pminsd x, m15}, m19, m6, m4, m5
5293    psubd                m2, m22, m8    ; t59a/58
5294    paddd               m22, m8         ; t60a/61
5295    REPX    {pmaxsd x, m14}, m3, m23, m2, m22
5296    REPX    {pminsd x, m15}, m3, m23, m2, m22
5297    ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a]
5298    ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a]
5299
5300    ; step7
5301    vpbroadcastd        m10, [o(pd_1567)]
5302    vpbroadcastd        m11, [o(pd_3784)]
5303    ITX_MULSUB_2D         2, 21, 8, 9, _, 13, 10, 11    ; t36/37a, t59/58a
5304    ITX_MULSUB_2D         3, 20, 8, 9, _, 13, 10, 11    ; t39a/38, t56a/57
5305    ITX_MULSUB_2D        19,  7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41
5306    ITX_MULSUB_2D         4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a
5307    ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a]
5308
5309    ; step8
5310    psubd                m8, m0, m16    ; t47a/46
5311    paddd                m0, m16        ; t32a/33
5312    psubd               m16, m1, m17    ; t44/45a
5313    paddd                m1, m17        ; t35/34a
5314    REPX    {pmaxsd x, m14}, m8, m0, m16, m1
5315    psubd               m17, m2, m18    ; t43a/42
5316    paddd                m2, m18        ; t36a/37
5317    REPX    {pminsd x, m15}, m8, m0, m16, m1
5318    psubd               m18, m3, m7     ; t40/41a
5319    paddd                m3, m7         ; t39/38a
5320    REPX    {pmaxsd x, m14}, m17, m2, m18, m3
5321    psubd                m7, m23, m6    ; t48a/49
5322    paddd               m23, m6         ; t63a/62
5323    REPX    {pminsd x, m15}, m17, m2, m18, m3
5324    psubd                m6, m22, m5    ; t51/50a
5325    paddd               m22, m5         ; t60/61a
5326    REPX    {pmaxsd x, m14}, m7, m23, m6, m22
5327    psubd                m5, m21, m4    ; t52a/53
5328    paddd               m21, m4         ; t59a/58
5329    REPX    {pminsd x, m15}, m7, m23, m6, m22
5330    psubd                m4, m20, m19   ; t55/54a
5331    paddd               m20, m19        ; t56/57a
5332    REPX    {pmaxsd x, m14}, m5, m21, m4, m20
5333    REPX    {pminsd x, m15}, m5, m21, m4, m20
5334    ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a]
5335
5336    ; step9
5337    REPX    {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8
5338    REPX    {paddd  x, m13}, m4, m5, m6, m7
5339    paddd               m19, m4, m18    ; t55a/54
5340    psubd                m4, m18        ; t40a/41
5341    paddd               m18, m5, m17    ; t52/53a
5342    psubd                m5, m17        ; t43/42a
5343    paddd               m17, m6, m16    ; t51a/50
5344    psubd                m6, m16        ; t44a/45
5345    paddd               m16, m7, m8     ; t48/49a
5346    psubd                m7, m8         ; t47/46a
5347    REPX    {psrad  x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7
5348    ; m4-7=t40-47[a], m16-19=t48-55[a]
5349    ret
5350
; inv_txfm_add_dct_dct_64x32_10bpc(dst, stride, c, eob)
;
; Path selection, as coded below:
;   eob == 0   -> .dconly (taken before PROLOGUE sets up the stack frame)
;   eob <  36  -> .fast2
;   eob <  136 -> .fast
;   eob <  543 -> .pass1_fast on cq+64, then .pass1 on cq (.lefthalf)
;   otherwise  -> .pass1      on cq+64, then .pass1 on cq (.lefthalf)
; r7d carries the start offset for the coefficient zeroing loops; every
; non-.dconly path finishes through .end -> .zero_loop -> RET.
5351cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
5352    lea                  r5, [o_base]
5353    test               eobd, eobd
5354    jz .dconly
5355
    ; Second prologue: 8 GPRs (r7 is used below), 32 vector registers and a
    ; 64*32 stack-size argument (see x86inc for the meaning of a negative size).
5356    PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob
5357%undef cmp
    ; m12 = pd_2896, m13 = pd_2048, m14/m15 = 18-bit clip min/max
5358    vpbroadcastd        m12, [o(pd_2896)]
5359    vpbroadcastd        m13, [o(pd_2048)]
5360    vpbroadcastd        m14, [o(clip_18b_min)]
5361    vpbroadcastd        m15, [o(clip_18b_max)]
5362    cmp                eobd, 136
5363    jl .fast
    ; First pass-1 run reads the coefficients at cq+64; undone by the
    ; "sub cq, 64" in .lefthalf.
5364    add                  cq, 64
5365    cmp                eobd, 543
5366    jge .full
5367    call .pass1_fast ; bottomright 16x16 zero
    ; r7d = start offset for .right_zero_loop (r7*8 = 128*12 bytes)
5368    mov                 r7d, 16*12
5369    jmp .lefthalf
5370.full:
5371    call .pass1
    ; r7d = start offset for .right_zero_loop (r7*8 = 128*28 bytes)
5372    mov                 r7d, 16*28
5373.lefthalf:
    ; Store the 16 registers returned by .transpose (reached by falling
    ; through from .pass1/.pass1_fast) into the coefficient buffer.
    ; cq still points 64 bytes past its entry value here.
5374    mova        [cq+128* 0], m0
5375    mova        [cq+128* 1], m1
5376    mova        [cq+128* 2], m2
5377    mova        [cq+128* 3], m3
5378    mova        [cq+128* 4], m14
5379    mova        [cq+128* 5], m15
5380    mova        [cq+128* 6], m16
5381    mova        [cq+128* 7], m17
5382    mova        [cq+128* 8], m22
5383    mova        [cq+128* 9], m23
5384    mova        [cq+128*10], m24
5385    mova        [cq+128*11], m25
5386    mova        [cq+128*12], m26
5387    mova        [cq+128*13], m27
5388    mova        [cq+128*14], m28
5389    mova        [cq+128*15], m29
5390    sub                  cq, 64
    ; .transpose overwrote m12-m15, so reload the constants before the
    ; second pass-1 run.
5391    vpbroadcastd        m12, [o(pd_2896)]
5392    vpbroadcastd        m13, [o(pd_2048)]
5393    vpbroadcastd        m14, [o(clip_18b_min)]
5394    vpbroadcastd        m15, [o(clip_18b_max)]
    ; .pass1 derives its scratch pointers (r3, r6) from rsp, so lowering rsp
    ; by 16*64 for the duration of the call shifts that scratch area down.
5395    sub                 rsp, 16*64
5396    call .pass1
5397    add                 rsp, 16*64
    ; Switch the constant base register to the 8bpc tables for the 32x32
    ; second-pass routines (defined outside this view).
5398    lea                  r5, [o_base_8bpc]
5399    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
    ; Keep the original dst in r4 so the second output half can be addressed
    ; as r4+64 afterwards.
5400    mov                  r4, dstq
    ; m12 = 0 (presumably required by pass2_end - confirm against 32x32)
5401    pxor                m12, m12
5402    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5403    lea                dstq, [r4+64]
    ; Reload stack slots 16..31 into m0-7/m16-23 and transpose them.
5404    mova                 m0, [rsp+16*mmsize]
5405    mova                 m1, [rsp+17*mmsize]
5406    mova                 m2, [rsp+18*mmsize]
5407    mova                 m3, [rsp+19*mmsize]
5408    mova                 m4, [rsp+20*mmsize]
5409    mova                 m5, [rsp+21*mmsize]
5410    mova                 m6, [rsp+22*mmsize]
5411    mova                 m7, [rsp+23*mmsize]
5412    mova                m16, [rsp+24*mmsize]
5413    mova                m17, [rsp+25*mmsize]
5414    mova                m18, [rsp+26*mmsize]
5415    mova                m19, [rsp+27*mmsize]
5416    mova                m20, [rsp+28*mmsize]
5417    mova                m21, [rsp+29*mmsize]
5418    mova                m22, [rsp+30*mmsize]
5419    mova                m23, [rsp+31*mmsize]
5420    call .transpose
    ; Store the transposed registers to the +64 column of cq, using the same
    ; register order as the stores in .lefthalf.
5421    mova     [cq+128* 0+64], m0
5422    mova     [cq+128* 1+64], m1
5423    mova     [cq+128* 2+64], m2
5424    mova     [cq+128* 3+64], m3
5425    mova     [cq+128* 4+64], m14
5426    mova     [cq+128* 5+64], m15
5427    mova     [cq+128* 6+64], m16
5428    mova     [cq+128* 7+64], m17
5429    mova     [cq+128* 8+64], m22
5430    mova     [cq+128* 9+64], m23
5431    mova     [cq+128*10+64], m24
5432    mova     [cq+128*11+64], m25
5433    mova     [cq+128*12+64], m26
5434    mova     [cq+128*13+64], m27
5435    mova     [cq+128*14+64], m28
5436    mova     [cq+128*15+64], m29
    ; Reload stack slots 0..15, transpose, and start the second pass-2 run.
5437    mova                 m0, [rsp+ 0*mmsize]
5438    mova                 m1, [rsp+ 1*mmsize]
5439    mova                 m2, [rsp+ 2*mmsize]
5440    mova                 m3, [rsp+ 3*mmsize]
5441    mova                 m4, [rsp+ 4*mmsize]
5442    mova                 m5, [rsp+ 5*mmsize]
5443    mova                 m6, [rsp+ 6*mmsize]
5444    mova                 m7, [rsp+ 7*mmsize]
5445    mova                m16, [rsp+ 8*mmsize]
5446    mova                m17, [rsp+ 9*mmsize]
5447    mova                m18, [rsp+10*mmsize]
5448    mova                m19, [rsp+11*mmsize]
5449    mova                m20, [rsp+12*mmsize]
5450    mova                m21, [rsp+13*mmsize]
5451    mova                m22, [rsp+14*mmsize]
5452    mova                m23, [rsp+15*mmsize]
5453    call .transpose
5454    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
    ; m12 = 0: source register for both zeroing loops below
5455    pxor                m12, m12
    ; Store zeros to four 64-byte slots (128 bytes apart) in the +64 column
    ; of cq per iteration, stepping r7d down by 16*4 until it goes negative.
5456.right_zero_loop:
5457    mova [cq+r7*8+64+128*3], m12
5458    mova [cq+r7*8+64+128*2], m12
5459    mova [cq+r7*8+64+128*1], m12
5460    mova [cq+r7*8+64+128*0], m12
5461    sub                 r7d, 16*4
5462    jge .right_zero_loop
    ; The +0 column always went through the full .pass1 on this path, so
    ; .zero_loop starts from offset 16*28.
5463    mov                 r7d, 16*28
5464    jmp .end
5465.fast: ; topleft 16x16 nonzero
5466    cmp                eobd, 36
5467    jl .fast2
5468    call .pass1_fast
5469    lea                  r5, [o_base_8bpc]
5470    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
    ; Save dst, zero m12, finish the first output half, then move dst by 64.
5471    mov                  r4, dstq
5472    pxor                m12, m12
5473    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5474    lea                dstq, [r4+64]
    ; Reload stack slots 16..31 for the second half.
5475    mova                 m0, [rsp+16*mmsize]
5476    mova                 m1, [rsp+17*mmsize]
5477    mova                 m2, [rsp+18*mmsize]
5478    mova                 m3, [rsp+19*mmsize]
5479    mova                 m4, [rsp+20*mmsize]
5480    mova                 m5, [rsp+21*mmsize]
5481    mova                 m6, [rsp+22*mmsize]
5482    mova                 m7, [rsp+23*mmsize]
5483    mova                m16, [rsp+24*mmsize]
5484    mova                m17, [rsp+25*mmsize]
5485    mova                m18, [rsp+26*mmsize]
5486    mova                m19, [rsp+27*mmsize]
5487    mova                m20, [rsp+28*mmsize]
5488    mova                m21, [rsp+29*mmsize]
5489    mova                m22, [rsp+30*mmsize]
5490    mova                m23, [rsp+31*mmsize]
5491    call .transpose
5492    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
    ; .zero_loop start offset for this path, and m12 = 0 for its stores.
5493    mov                 r7d, 16*12
5494    pxor                m12, m12
5495    jmp .end
5496.fast2: ; topleft 8x8 nonzero
    ; Load the low 256 bits of rows 1, 5, 3 and 7, pack them pairwise with
    ; the permB-derived index in m7, and multiply by m12 (pd_2896).
5497    movshdup             m7, [o(permB)]
5498    mova                ym0, [cq+128*1]
5499    mova                ym2, [cq+128*5]
5500    mova                ym3, [cq+128*3]
5501    mova                ym1, [cq+128*7]
5502    vpermt2q             m0, m7, m2 ;  1  5
5503    vpermt2q             m1, m7, m3 ;  7  3
5504    REPX    {pmulld x, m12}, m0, m1
5505    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2
    ; Spill the 16 result registers to stack slots 0..15.
5506    mova    [rsp+ 0*mmsize], m0
5507    mova    [rsp+ 1*mmsize], m1
5508    mova    [rsp+ 2*mmsize], m2
5509    mova    [rsp+ 3*mmsize], m3
5510    mova    [rsp+ 4*mmsize], m4
5511    mova    [rsp+ 5*mmsize], m5
5512    mova    [rsp+ 6*mmsize], m6
5513    mova    [rsp+ 7*mmsize], m7
5514    mova    [rsp+ 8*mmsize], m16
5515    mova    [rsp+ 9*mmsize], m17
5516    mova    [rsp+10*mmsize], m18
5517    mova    [rsp+11*mmsize], m19
5518    mova    [rsp+12*mmsize], m20
5519    mova    [rsp+13*mmsize], m21
5520    mova    [rsp+14*mmsize], m22
5521    mova    [rsp+15*mmsize], m23
5522
    ; Rows 0, 4, 2 and 6: each is scaled as (x*m12 + m13) >> 12, i.e.
    ; (x*2896 + 2048) >> 12. Rows 0 and 4 are scaled before being permuted,
    ; rows 2/6 after being packed into m16.
5523    movshdup             m7, [o(permB)]
5524    pmulld              ym0, ym12, [cq+128*0]
5525    pmulld              ym4, ym12, [cq+128*4]
5526    mova               ym16, [cq+128*2]
5527    mova                ym5, [cq+128*6]
5528    REPX    {paddd x, ym13}, ym0, ym4
5529    REPX    {psrad x, 12  }, ym0, ym4
5530    vpermt2q            m16, m7, m5 ;  2  6
5531    vpermq               m0, m7, m0 ;  0  0
5532    vpermq               m4, m7, m4 ;  4  4
5533    pmulld              m16, m12
5534    paddd               m16, m13
5535    psrad               m16, 12
5536    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
5537
    ; m11 = pd_1 is set up before main_end (the same constant .pass1_end
    ; loads before idct64_main_end).
5538    vpbroadcastd        m11, [o(pd_1)]
5539    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
    ; Park m24-m31 in stack slots 16..23; they are reloaded into m0-m7 for
    ; the second output half below.
5540    mova    [rsp+16*mmsize], m24
5541    mova    [rsp+17*mmsize], m25
5542    mova    [rsp+18*mmsize], m26
5543    mova    [rsp+19*mmsize], m27
5544    mova    [rsp+20*mmsize], m28
5545    mova    [rsp+21*mmsize], m29
5546    mova    [rsp+22*mmsize], m30
5547    mova    [rsp+23*mmsize], m31
5548    vpbroadcastd        m13, [o(pd_2048)]
5549    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
    ; .zero_loop start offset for this path
5550    mov                 r7d, 16*4
5551    mov                  r4, dstq
5552    pxor                m12, m12
5553    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5554    lea                dstq, [r4+64]
5555    mova                 m0, [rsp+16*mmsize]
5556    mova                 m1, [rsp+17*mmsize]
5557    mova                 m2, [rsp+18*mmsize]
5558    mova                 m3, [rsp+19*mmsize]
5559    mova                 m4, [rsp+20*mmsize]
5560    mova                 m5, [rsp+21*mmsize]
5561    mova                 m6, [rsp+22*mmsize]
5562    mova                 m7, [rsp+23*mmsize]
    ; Reset r5 to o_base and reload m13 before the second pass2_fast2_start
    ; call (NOTE(review): presumably the first call left r5 pointing at the
    ; 8bpc base - confirm against the 32x32 routine).
5563    lea                  r5, [o_base]
5564    vpbroadcastd        m13, [o(pd_2048)]
5565    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
5566    pxor                m12, m12
    ; Common tail: finish the second output half, then store zeros (m12) to
    ; four 64-byte slots of cq per iteration, from offset r7*8 downwards,
    ; until r7d goes negative.
5567.end:
5568    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5569.zero_loop:
5570    mova    [cq+r7*8+128*3], m12
5571    mova    [cq+r7*8+128*2], m12
5572    mova    [cq+r7*8+128*1], m12
5573    mova    [cq+r7*8+128*0], m12
5574    sub                 r7d, 16*4
5575    jge .zero_loop
5576    RET
; eob == 0 path of the 64x32 transform, entered before PROLOGUE.
; Computes r6d from c[0], clears c[0], sets r3d to 32 and tail-jumps into the
; shared .dconly2 code of the 64x16 transform (outside this view).
5577.dconly:
    ; r6d = c[0] * 181
5578    imul                r6d, [cq], 181
    ; eobd is zero here (this label is only reached via jz), so this clears c[0]
5579    mov                [cq], eobd
    ; eob is the fourth named argument (r3); being zero, r3d becomes 32.
    ; NOTE(review): presumably the row count consumed by .dconly2 - confirm.
5580    or                  r3d, 32
    ; r6d = (r6d + 128) >> 8
5581    add                 r6d, 128
5582    sar                 r6d, 8
    ; r6d = (r6d * 181 + 384) >> 9
5583    imul                r6d, 181
5584    add                 r6d, 384
5585    sar                 r6d, 9
5586    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
; Reduced first pass of the 64x32 transform: only coefficient rows 0..15
; (128-byte stride from cq) are read. Every row is multiplied by m12 when it
; is loaded (here, or inside .pass1_load_spill for rows 2/6/10/14).
; Ends by jumping to .pass1_end, so the final ret is the one in .transpose.
5587.pass1_fast:
    ; r4 = address of idct64_mul_16bpc; r6 = scratch pointer derived from rsp
    ; (both are consumed by the 64x16 helpers, defined outside this view)
5588    lea                  r4, [idct64_mul_16bpc]
5589    lea                  r6, [rsp+4*64+gprsize]
    ; Four main_part1_fast_rect2 calls, fed with row pairs
    ; (1,15), (7,9), (5,11), (3,13) in m0/m3.
5590    pmulld               m0, m12, [cq+128* 1]
5591    pmulld               m3, m12, [cq+128*15]
5592    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5593    pmulld               m0, m12, [cq+128* 7]
5594    pmulld               m3, m12, [cq+128* 9]
5595    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5596    pmulld               m0, m12, [cq+128* 5]
5597    pmulld               m3, m12, [cq+128*11]
5598    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5599    pmulld               m0, m12, [cq+128* 3]
5600    pmulld               m3, m12, [cq+128*13]
5601    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5602    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    ; Rows 0, 8 -> m0, m1 and rows 4, 12 -> m16, m17 for the 8x16/16x16 helpers.
5603    pmulld               m0, m12, [cq+128* 0]
5604    pmulld               m1, m12, [cq+128* 8]
5605    pmulld              m16, m12, [cq+128* 4]
5606    pmulld              m17, m12, [cq+128*12]
5607    call m(idct_8x16_internal_10bpc).main_fast2_rect2
5608    call m(idct_16x16_internal_10bpc).main_fast2_rect2
    ; .pass1_load_spill stores the current results to cq and loads rows
    ; 2, 6, 10, 14 into m0-m3 for the 32x16 helper.
5609    call .pass1_load_spill
5610    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2
5611    jmp .pass1_end
; Full first pass of the 64x32 transform: reads coefficient rows 0..31
; (128-byte stride from cq), each multiplied by m12 when loaded.
; Control falls through .pass1_end into .transpose, so the ret at the end of
; .transpose returns to the caller of .pass1 / .pass1_fast.
5612.pass1:
    ; r4 = address of idct64_mul_16bpc; r6 = scratch pointer derived from rsp
5613    lea                  r4, [idct64_mul_16bpc]
5614    lea                  r6, [rsp+4*64+gprsize]
    ; Four main_part1_rect2 calls, fed with row quads (1,31,17,15),
    ; (7,25,23,9), (5,27,21,11), (3,29,19,13) in m0-m3.
5615    pmulld               m0, m12, [cq+128* 1]
5616    pmulld               m1, m12, [cq+128*31]
5617    pmulld               m2, m12, [cq+128*17]
5618    pmulld               m3, m12, [cq+128*15]
5619    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5620    pmulld               m0, m12, [cq+128* 7]
5621    pmulld               m1, m12, [cq+128*25]
5622    pmulld               m2, m12, [cq+128*23]
5623    pmulld               m3, m12, [cq+128* 9]
5624    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5625    pmulld               m0, m12, [cq+128* 5]
5626    pmulld               m1, m12, [cq+128*27]
5627    pmulld               m2, m12, [cq+128*21]
5628    pmulld               m3, m12, [cq+128*11]
5629    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5630    pmulld               m0, m12, [cq+128* 3]
5631    pmulld               m1, m12, [cq+128*29]
5632    pmulld               m2, m12, [cq+128*19]
5633    pmulld               m3, m12, [cq+128*13]
5634    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5635    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    ; Rows 0, 8, 16, 24 -> m0-m3 and rows 4, 12, 20, 28 -> m16-m19.
5636    pmulld               m0, m12, [cq+128* 0]
5637    pmulld               m1, m12, [cq+128* 8]
5638    pmulld               m2, m12, [cq+128*16]
5639    pmulld               m3, m12, [cq+128*24]
5640    pmulld              m16, m12, [cq+128* 4]
5641    pmulld              m17, m12, [cq+128*12]
5642    pmulld              m18, m12, [cq+128*20]
5643    pmulld              m19, m12, [cq+128*28]
5644    call m(idct_8x16_internal_10bpc).main_fast_rect2
5645    call m(idct_16x16_internal_10bpc).main_fast_rect2
    ; .pass1_load_spill stores the current results to cq and loads rows
    ; 2, 6, 10, 14 into m0-m3; rows 18, 22, 26, 30 are then loaded into m4-m7.
5646    call .pass1_load_spill
5647    pmulld               m4, m12, [cq+128*18]
5648    pmulld               m5, m12, [cq+128*22]
5649    pmulld               m6, m12, [cq+128*26]
5650    pmulld               m7, m12, [cq+128*30]
5651    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
    ; Shared tail of .pass1 and .pass1_fast: m11 = pd_1, r3 = rsp-derived
    ; scratch pointer, r4 = cq + 8 rows, then the 64x16 idct64_main_end helper.
    ; Note that r3 (eob) is overwritten here.
5652.pass1_end:
5653    vpbroadcastd        m11, [o(pd_1)]
5654    lea                  r3, [rsp+gprsize]
5655    lea                  r4, [cq+8*128]
5656    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
5657    ; transpose one half immediately, we can transpose lower half later
    ; .transpose is also called directly from the main body. It overwrites
    ; m12/m13 with permutation indices derived from permC and, per the
    ; trailing index comments, returns outputs 0..15 in
    ; m0, m22, m14, m23, m1, m24, m15, m25, m2, m26, m16, m27, m3, m28, m17, m29.
5658.transpose:
5659    ; transpose m0-7,16-23
5660    psrlq               m12, [permC], 24    ;  0  2  8 10  1  3  9 11
5661    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
5662    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
5663    punpckhqdq          m22, m0, m20  ;  1
5664    punpcklqdq           m0, m20      ;  0
5665    punpckhqdq          m24, m2, m1   ;  5
5666    punpcklqdq           m1, m2, m1   ;  4
5667    punpcklqdq           m2, m14, m18 ;  8
5668    punpckhqdq          m26, m14, m18 ;  9
5669    punpcklqdq          m14, m15, m4  ;  2
5670    punpckhqdq          m23, m15, m4  ;  3
5671    punpckhqdq          m25, m3, m21  ;  7
5672    punpcklqdq          m15, m3, m21  ;  6
5673    punpckhqdq          m28, m6, m17  ; 13
5674    punpcklqdq           m3, m6, m17  ; 12
5675    punpckhqdq          m27, m5, m16  ; 11
5676    punpcklqdq          m16, m5, m16  ; 10
5677    punpckhqdq          m29, m7, m8   ; 15
5678    punpcklqdq          m17, m7, m8   ; 14
5679    ret
; Helper for .pass1 / .pass1_fast: after idct16_sumsub, store m0-m7 to rows
; 0..7 of cq and m23..m16 (descending) to rows 8..15, while loading rows
; 2, 6, 10 and 14 (multiplied by m12) into m0-m3. Each of those four rows is
; loaded before its slot is overwritten, so the load/store interleaving
; below is order-sensitive.
5680.pass1_load_spill:
5681    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
5682    mova        [cq+128* 0], m0
5683    mova        [cq+128* 1], m1
    ; m0/m1 are free now: fetch rows 2 and 6 before rows 2..7 are stored
5684    pmulld               m0, m12, [cq+128* 2]
5685    pmulld               m1, m12, [cq+128* 6]
5686    mova        [cq+128* 2], m2
5687    mova        [cq+128* 3], m3
    ; m2/m3 are free now: fetch rows 10 and 14 before rows 8..15 are stored
5688    pmulld               m2, m12, [cq+128*10]
5689    pmulld               m3, m12, [cq+128*14]
5690    mova        [cq+128* 4], m4
5691    mova        [cq+128* 5], m5
5692    mova        [cq+128* 6], m6
5693    mova        [cq+128* 7], m7
5694    mova        [cq+128* 8], m23
5695    mova        [cq+128* 9], m22
5696    mova        [cq+128*10], m21
5697    mova        [cq+128*11], m20
5698    mova        [cq+128*12], m19
5699    mova        [cq+128*13], m18
5700    mova        [cq+128*14], m17
5701    mova        [cq+128*15], m16
5702    ret
5703
; inv_txfm_add_dct_dct_64x64_10bpc(dst, stride, c, eob)
;
; Path selection, as coded below:
;   eob == 0   -> .dconly (taken before PROLOGUE sets up the stack frame)
;   eob <  36  -> .fast2
;   eob <  136 -> .fast
;   eob <  543 -> .pass1_fast on cq+64, then .pass1 on cq (.lefthalf)
;   otherwise  -> .pass1      on cq+64, then .pass1 on cq (.lefthalf)
; The second pass is delegated to the 32x64 routines (outside this view).
; Stack discipline: every path arrives at .end with rsp lowered by a net
; 8*64 bytes relative to the post-PROLOGUE value; the "add rsp, 8*64"
; before RET undoes that.
5704cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
5705    lea                  r5, [o_base]
5706    test               eobd, eobd
5707    jz .dconly
5708
    ; Second prologue: 9 GPRs (r7 and r8 are used below), 32 vector registers
    ; and a 64*32 stack-size argument (see x86inc for the meaning of a
    ; negative size).
5709    PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob
5710%undef cmp
    ; m12 = pd_2896, m13 = pd_2048, m14/m15 = 18-bit clip min/max
5711    vpbroadcastd        m12, [o(pd_2896)]
5712    vpbroadcastd        m13, [o(pd_2048)]
5713    vpbroadcastd        m14, [o(clip_18b_min)]
5714    vpbroadcastd        m15, [o(clip_18b_max)]
5715    cmp                eobd, 136
5716    jl .fast
    ; First pass-1 run reads the coefficients at cq+64; undone by the
    ; "sub cq, 64" in .lefthalf.
5717    add                  cq, 64
5718    cmp                eobd, 543
5719    jge .full
5720    call .pass1_fast ; bottomright 16x16 zero
    ; r7d = start offset for .right_zero_loop (r7*8 = 128*12 bytes)
5721    mov                 r7d, 16*12
5722    jmp .lefthalf
5723.full:
5724    call .pass1
    ; r7d = start offset for .right_zero_loop (r7*8 = 128*28 bytes)
5725    mov                 r7d, 16*28
5726.lefthalf:
    ; Store the 16 registers returned by .transpose into the coefficient
    ; buffer. Using the index comments in .transpose, the register order
    ; below corresponds to outputs 0,4,8,12, 2,6,10,14, 1,3,5,7, 9,11,13,15.
5727    mova        [cq+128* 0], m27
5728    mova        [cq+128* 1], m14
5729    mova        [cq+128* 2], m28
5730    mova        [cq+128* 3], m15
5731    mova        [cq+128* 4], m22
5732    mova        [cq+128* 5], m23
5733    mova        [cq+128* 6], m24
5734    mova        [cq+128* 7], m25
5735    mova        [cq+128* 8], m0
5736    mova        [cq+128* 9], m26
5737    mova        [cq+128*10], m20
5738    mova        [cq+128*11], m21
5739    mova        [cq+128*12], m18
5740    mova        [cq+128*13], m16
5741    mova        [cq+128*14], m17
5742    mova        [cq+128*15], m3
5743    sub                  cq, 64
    ; .transpose overwrote m12-m15, so reload the constants before the
    ; second pass-1 run.
5744    vpbroadcastd        m12, [o(pd_2896)]
5745    vpbroadcastd        m13, [o(pd_2048)]
5746    vpbroadcastd        m14, [o(clip_18b_min)]
5747    vpbroadcastd        m15, [o(clip_18b_max)]
    ; rsp is lowered by 16*64 for the second .pass1 and by a further 24*64
    ; afterwards (40*64 in total) before the 32x64 second pass runs.
5748    sub                 rsp, 16*64
5749    call .pass1
5750    sub                 rsp, 24*64
5751    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
    ; Keep the original dst in r8 so the second output half can be addressed
    ; as r8+64 afterwards.
5752    mov                  r8, dstq
    ; m31 = 0 (also the source register of .right_zero_loop below)
5753    pxor                m31, m31
5754    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5755    lea                dstq, [r8+64]
    ; Slots 56..71 from the current rsp. Assuming mmsize == 64 (the INIT
    ; directive is outside this view), these are slots 16..31 relative to
    ; the post-PROLOGUE rsp.
5756    mova                 m0, [rsp+56*mmsize]
5757    mova                 m1, [rsp+57*mmsize]
5758    mova                 m2, [rsp+58*mmsize]
5759    mova                 m3, [rsp+59*mmsize]
5760    mova                 m4, [rsp+60*mmsize]
5761    mova                 m5, [rsp+61*mmsize]
5762    mova                 m6, [rsp+62*mmsize]
5763    mova                 m7, [rsp+63*mmsize]
5764    mova                m16, [rsp+64*mmsize]
5765    mova                m17, [rsp+65*mmsize]
5766    mova                m18, [rsp+66*mmsize]
5767    mova                m19, [rsp+67*mmsize]
5768    mova                m20, [rsp+68*mmsize]
5769    mova                m21, [rsp+69*mmsize]
5770    mova                m22, [rsp+70*mmsize]
5771    mova                m23, [rsp+71*mmsize]
5772    call .transpose
    ; Store the transposed registers to the +64 column of cq, using the same
    ; register order as the stores in .lefthalf.
5773    mova     [cq+128* 0+64], m27
5774    mova     [cq+128* 1+64], m14
5775    mova     [cq+128* 2+64], m28
5776    mova     [cq+128* 3+64], m15
5777    mova     [cq+128* 4+64], m22
5778    mova     [cq+128* 5+64], m23
5779    mova     [cq+128* 6+64], m24
5780    mova     [cq+128* 7+64], m25
5781    mova     [cq+128* 8+64], m0
5782    mova     [cq+128* 9+64], m26
5783    mova     [cq+128*10+64], m20
5784    mova     [cq+128*11+64], m21
5785    mova     [cq+128*12+64], m18
5786    mova     [cq+128*13+64], m16
5787    mova     [cq+128*14+64], m17
5788    mova     [cq+128*15+64], m3
    ; Slots 40..55 from the current rsp (slots 0..15 relative to the
    ; post-PROLOGUE rsp under the same mmsize assumption).
5789    mova                 m0, [rsp+40*mmsize]
5790    mova                 m1, [rsp+41*mmsize]
5791    mova                 m2, [rsp+42*mmsize]
5792    mova                 m3, [rsp+43*mmsize]
5793    mova                 m4, [rsp+44*mmsize]
5794    mova                 m5, [rsp+45*mmsize]
5795    mova                 m6, [rsp+46*mmsize]
5796    mova                 m7, [rsp+47*mmsize]
5797    mova                m16, [rsp+48*mmsize]
5798    mova                m17, [rsp+49*mmsize]
5799    mova                m18, [rsp+50*mmsize]
5800    mova                m19, [rsp+51*mmsize]
5801    mova                m20, [rsp+52*mmsize]
5802    mova                m21, [rsp+53*mmsize]
5803    mova                m22, [rsp+54*mmsize]
5804    mova                m23, [rsp+55*mmsize]
    ; Raise rsp by 32*64: net adjustment is now -8*64.
5805    add                 rsp, 32*64
5806    call .transpose
    ; Reset the constant base register before the second pass2_start call.
5807    lea                  r5, [o_base]
5808    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
    ; Store m31 to four 64-byte slots (128 bytes apart) in the +64 column of
    ; cq per iteration, stepping r7d down by 16*4 until it goes negative.
    ; NOTE(review): m31 was last zeroed before the first pass2_end call; this
    ; relies on the intervening 32x64 calls leaving it zero - confirm.
5809.right_zero_loop:
5810    REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3
5811    sub                 r7d, 16*4
5812    jge .right_zero_loop
    ; The +0 column always went through the full .pass1 on this path, so
    ; .zero_loop starts from offset 16*28.
5813    mov                 r7d, 16*28
5814    jmp .end
5815.fast: ; topleft 16x16 nonzero
5816    cmp                eobd, 36
5817    jl .fast2
5818    call .pass1_fast
    ; Lower rsp by 24*64 for the 32x64 second pass; m10 = pd_2048 is set up
    ; before each pass2_fast_start call.
5819    sub                 rsp, 24*64
5820    vpbroadcastd        m10, [o(pd_2048)]
5821    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
5822    mov                  r8, dstq
5823    pxor                m31, m31
5824    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5825    lea                dstq, [r8+64]
    ; Slots 40..55 from the current rsp (slots 16..31 relative to the
    ; post-PROLOGUE rsp, assuming mmsize == 64).
5826    mova                 m0, [rsp+40*mmsize]
5827    mova                 m1, [rsp+41*mmsize]
5828    mova                 m2, [rsp+42*mmsize]
5829    mova                 m3, [rsp+43*mmsize]
5830    mova                 m4, [rsp+44*mmsize]
5831    mova                 m5, [rsp+45*mmsize]
5832    mova                 m6, [rsp+46*mmsize]
5833    mova                 m7, [rsp+47*mmsize]
5834    mova                m16, [rsp+48*mmsize]
5835    mova                m17, [rsp+49*mmsize]
5836    mova                m18, [rsp+50*mmsize]
5837    mova                m19, [rsp+51*mmsize]
5838    mova                m20, [rsp+52*mmsize]
5839    mova                m21, [rsp+53*mmsize]
5840    mova                m22, [rsp+54*mmsize]
5841    mova                m23, [rsp+55*mmsize]
    ; Raise rsp by 16*64: net adjustment is now -8*64.
5842    add                 rsp, 16*64
5843    call .transpose
5844    lea                  r5, [o_base]
5845    vpbroadcastd        m10, [o(pd_2048)]
5846    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
    ; .zero_loop start offset for this path
5847    mov                 r7d, 16*12
5848    jmp .end
5849.fast2: ; topleft 8x8 nonzero
    ; Load the low 256 bits of rows 1, 5, 3 and 7 and pack them pairwise
    ; with the permB-derived index in m7. Unlike the 64x32 variant, no
    ; multiplication by m12 is applied here.
5850    movshdup             m7, [o(permB)]
5851    mova                ym0, [cq+128*1]
5852    mova                ym2, [cq+128*5]
5853    mova                ym3, [cq+128*3]
5854    mova                ym1, [cq+128*7]
5855    vpermt2q             m0, m7, m2 ;  1  5
5856    vpermt2q             m1, m7, m3 ;  7  3
5857    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed
    ; Spill the 16 result registers to stack slots 0..15.
5858    mova    [rsp+ 0*mmsize], m0
5859    mova    [rsp+ 1*mmsize], m1
5860    mova    [rsp+ 2*mmsize], m2
5861    mova    [rsp+ 3*mmsize], m3
5862    mova    [rsp+ 4*mmsize], m4
5863    mova    [rsp+ 5*mmsize], m5
5864    mova    [rsp+ 6*mmsize], m6
5865    mova    [rsp+ 7*mmsize], m7
5866    mova    [rsp+ 8*mmsize], m16
5867    mova    [rsp+ 9*mmsize], m17
5868    mova    [rsp+10*mmsize], m18
5869    mova    [rsp+11*mmsize], m19
5870    mova    [rsp+12*mmsize], m20
5871    mova    [rsp+13*mmsize], m21
5872    mova    [rsp+14*mmsize], m22
5873    mova    [rsp+15*mmsize], m23
5874
    ; Rows 0, 4, 2 and 6, loaded unscaled and permuted for main_fast3.
5875    movshdup             m7, [o(permB)]
5876    mova                ym0, [cq+128*0]
5877    mova                ym4, [cq+128*4]
5878    mova               ym16, [cq+128*2]
5879    mova                ym5, [cq+128*6]
5880    vpermt2q            m16, m7, m5 ;  2  6
5881    vpermq               m0, m7, m0 ;  0  0
5882    vpermq               m4, m7, m4 ;  4  4
5883    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
5884
    ; m11 = pd_2 is set up before main_end (the same constant .pass1_end
    ; loads before idct64_main_end).
5885    vpbroadcastd        m11, [o(pd_2)]
5886    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
    ; Lower rsp by 16*64, then park m24-m31 in slots 40..47 of the new rsp;
    ; they are reloaded into m0-m7 for the second output half below.
5887    sub                 rsp, 16*64
5888    mova    [rsp+40*mmsize], m24
5889    mova    [rsp+41*mmsize], m25
5890    mova    [rsp+42*mmsize], m26
5891    mova    [rsp+43*mmsize], m27
5892    mova    [rsp+44*mmsize], m28
5893    mova    [rsp+45*mmsize], m29
5894    mova    [rsp+46*mmsize], m30
5895    mova    [rsp+47*mmsize], m31
5896    call .pass2_fast2_start
    ; .zero_loop start offset for this path
5897    mov                 r7d, 16*4
5898    mov                  r8, dstq
5899    pxor                m31, m31
5900    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5901    lea                dstq, [r8+64]
5902    mova                 m0, [rsp+40*mmsize]
5903    mova                 m1, [rsp+41*mmsize]
5904    mova                 m2, [rsp+42*mmsize]
5905    mova                 m3, [rsp+43*mmsize]
5906    mova                 m4, [rsp+44*mmsize]
5907    mova                 m5, [rsp+45*mmsize]
5908    mova                 m6, [rsp+46*mmsize]
5909    mova                 m7, [rsp+47*mmsize]
    ; Raise rsp by 8*64: net adjustment is now -8*64.
5910    add                 rsp, 8*64
5911    lea                  r5, [o_base]
5912    call .pass2_fast2_start
    ; Common tail: zero m31, store it to four 64-byte slots of cq per
    ; iteration from offset r7*8 downwards until r7d goes negative, finish
    ; the second output half, then release the extra 8*64 bytes of stack.
    ; Here the zeroing runs before pass2_end; in the 64x32 variant it runs
    ; after.
5913.end:
5914    pxor                m31, m31
5915.zero_loop:
5916    REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3
5917    sub                 r7d, 16*4
5918    jge .zero_loop
5919    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5920    add                 rsp, 8*64  ; FIXME adjust stack_size_padded instead?
5921    RET
; Helper for the .fast2 path of the 64x64 transform: runs the 32x8
; transpose_8x32 helper, interleaves its results at qword granularity into
; the registers listed below (outputs 0..7 per the trailing comments, using
; the same registers as outputs 0..7 of this function's .transpose), sets
; m10 = pd_2048 and tail-jumps into the 32x64 pass2_fast2_start, whose ret
; therefore returns to this helper's caller.
5922.pass2_fast2_start:
5923    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
5924    punpcklqdq          m27, m0, m2 ; 0
5925    punpckhqdq           m0, m2     ; 1
5926    punpcklqdq          m22, m3, m4 ; 2
5927    punpckhqdq          m26, m3, m4 ; 3
5928    punpcklqdq          m14, m5, m7 ; 4
5929    punpckhqdq          m20, m5, m7 ; 5
5930    punpcklqdq          m23, m6, m8 ; 6
5931    punpckhqdq          m21, m6, m8 ; 7
5932    vpbroadcastd        m10, [o(pd_2048)]
5933    jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start
; eob == 0 path of the 64x64 transform, entered before PROLOGUE.
; Computes r6d = c[0]*181, clears c[0], sets r3d to 64 and tail-jumps into
; the shared .dconly1 code of the 64x16 transform (outside this view).
5934.dconly:
5935    imul                r6d, [cq], 181
    ; eobd is zero here (this label is only reached via jz), so this clears c[0]
5936    mov                [cq], eobd
    ; eob is the fourth named argument (r3); being zero, r3d becomes 64.
    ; NOTE(review): presumably the row count consumed by .dconly1 - confirm.
5937    or                  r3d, 64
5938    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1
; Reduced first pass of the 64x64 transform: only coefficient rows 0..15
; (128-byte stride from cq) are read, loaded with plain mova (the 64x32
; variant multiplies by m12 instead and calls the *_rect2 helpers).
; Ends by jumping to .pass1_end, so the final ret is the one in .transpose.
5939.pass1_fast:
    ; r4 = address of idct64_mul_16bpc; r6 = scratch pointer derived from rsp
    ; (both are consumed by the 64x16 helpers, defined outside this view)
5940    lea                  r4, [idct64_mul_16bpc]
5941    lea                  r6, [rsp+4*64+gprsize]
    ; Four main_part1_fast calls, fed with row pairs
    ; (1,15), (7,9), (5,11), (3,13) in m0/m3.
5942    mova                 m0, [cq+128* 1]
5943    mova                 m3, [cq+128*15]
5944    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5945    mova                 m0, [cq+128* 7]
5946    mova                 m3, [cq+128* 9]
5947    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5948    mova                 m0, [cq+128* 5]
5949    mova                 m3, [cq+128*11]
5950    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5951    mova                 m0, [cq+128* 3]
5952    mova                 m3, [cq+128*13]
5953    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5954    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    ; Rows 0, 8 -> m0, m1 and rows 4, 12 -> m16, m17 for the 8x16/16x16 helpers.
5955    mova                 m0, [cq+128* 0]
5956    mova                 m1, [cq+128* 8]
5957    mova                m16, [cq+128* 4]
5958    mova                m17, [cq+128*12]
5959    call m(idct_8x16_internal_10bpc).main_fast2
5960    call m(idct_16x16_internal_10bpc).main_fast2
    ; .pass1_load_spill stores the current results to cq and loads rows
    ; 2, 6, 10, 14 into m0-m3 for the 32x16 helper.
5961    call .pass1_load_spill
5962    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
5963    jmp .pass1_end
; Full first pass of the 64x64 transform: reads coefficient rows 0..31
; (128-byte stride from cq) with plain mova loads.
; Control falls through .pass1_end into .transpose, so the ret at the end of
; .transpose returns to the caller of .pass1 / .pass1_fast.
5964.pass1:
    ; r4 = address of idct64_mul_16bpc; r6 = scratch pointer derived from rsp
5965    lea                  r4, [idct64_mul_16bpc]
5966    lea                  r6, [rsp+4*64+gprsize]
    ; Four main_part1 calls, fed with row quads (1,31,17,15), (7,25,23,9),
    ; (5,27,21,11), (3,29,19,13) in m0-m3.
5967    mova                 m0, [cq+128* 1]
5968    mova                 m1, [cq+128*31]
5969    mova                 m2, [cq+128*17]
5970    mova                 m3, [cq+128*15]
5971    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5972    mova                 m0, [cq+128* 7]
5973    mova                 m1, [cq+128*25]
5974    mova                 m2, [cq+128*23]
5975    mova                 m3, [cq+128* 9]
5976    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5977    mova                 m0, [cq+128* 5]
5978    mova                 m1, [cq+128*27]
5979    mova                 m2, [cq+128*21]
5980    mova                 m3, [cq+128*11]
5981    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5982    mova                 m0, [cq+128* 3]
5983    mova                 m1, [cq+128*29]
5984    mova                 m2, [cq+128*19]
5985    mova                 m3, [cq+128*13]
5986    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5987    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    ; Rows 0, 8, 16, 24 -> m0-m3 and rows 4, 12, 20, 28 -> m16-m19.
5988    mova                 m0, [cq+128* 0]
5989    mova                 m1, [cq+128* 8]
5990    mova                 m2, [cq+128*16]
5991    mova                 m3, [cq+128*24]
5992    mova                m16, [cq+128* 4]
5993    mova                m17, [cq+128*12]
5994    mova                m18, [cq+128*20]
5995    mova                m19, [cq+128*28]
5996    call m(idct_8x16_internal_10bpc).main_fast
5997    call m(idct_16x16_internal_10bpc).main_fast
    ; .pass1_load_spill stores the current results to cq and loads rows
    ; 2, 6, 10, 14 into m0-m3; rows 18, 22, 26, 30 are then loaded into m4-m7.
5998    call .pass1_load_spill
5999    mova                 m4, [cq+128*18]
6000    mova                 m5, [cq+128*22]
6001    mova                 m6, [cq+128*26]
6002    mova                 m7, [cq+128*30]
6003    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
    ; Shared tail of .pass1 and .pass1_fast: m11 = pd_2 (the 64x32 variant
    ; uses pd_1), r3 = rsp-derived scratch pointer, r4 = cq + 8 rows, then the
    ; 64x16 idct64_main_end helper. Note that r3 (eob) is overwritten here.
6004.pass1_end:
6005    vpbroadcastd        m11, [o(pd_2)]
6006    lea                  r3, [rsp+gprsize]
6007    lea                  r4, [cq+8*128]
6008    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
6009    ; transpose one half immediately, we can transpose lower half later
    ; .transpose is also called directly from the main body. It overwrites
    ; m12/m13 with permutation indices derived from permC and, per the
    ; trailing index comments, returns outputs 0..15 in
    ; m27, m0, m22, m26, m14, m20, m23, m21, m28, m18, m24, m16, m15, m17, m25, m3.
6010.transpose:
6011    ; transpose m0-7,16-23
6012    psrlq               m12, [permC], 24 ;  0  2  8 10  1  3  9 11
6013    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
6014    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
6015    punpcklqdq          m27, m0, m20  ;  0
6016    punpckhqdq           m0, m20      ;  1
6017    punpcklqdq          m24, m5, m16  ; 10
6018    punpckhqdq          m16, m5, m16  ; 11
6019    punpcklqdq          m23, m3, m21  ;  6
6020    punpckhqdq          m21, m3, m21  ;  7
6021    punpcklqdq          m25, m7, m8   ; 14
6022    punpckhqdq           m3, m7, m8   ; 15
6023    punpcklqdq          m22, m15, m4  ;  2
6024    punpckhqdq          m26, m15, m4  ;  3
6025    punpcklqdq          m15, m6, m17  ; 12
6026    punpckhqdq          m17, m6, m17  ; 13
6027    punpcklqdq          m28, m14, m18 ;  8
6028    punpckhqdq          m18, m14, m18 ;  9
6029    punpcklqdq          m14, m2, m1   ;  4
6030    punpckhqdq          m20, m2, m1   ;  5
6031    ret
; Helper for .pass1 / .pass1_fast: after idct16_sumsub, store m0-m7 to rows
; 0..7 of cq and m23..m16 (descending) to rows 8..15, while loading rows
; 2, 6, 10 and 14 into m0-m3. Each of those four rows is loaded before its
; slot is overwritten, so the load/store interleaving below is
; order-sensitive.
6032.pass1_load_spill:
6033    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
6034    mova        [cq+128* 0], m0
6035    mova        [cq+128* 1], m1
    ; m0/m1 are free now: fetch rows 2 and 6 before rows 2..7 are stored
6036    mova                 m0, [cq+128* 2]
6037    mova                 m1, [cq+128* 6]
6038    mova        [cq+128* 2], m2
6039    mova        [cq+128* 3], m3
    ; m2/m3 are free now: fetch rows 10 and 14 before rows 8..15 are stored
6040    mova                 m2, [cq+128*10]
6041    mova                 m3, [cq+128*14]
6042    mova        [cq+128* 4], m4
6043    mova        [cq+128* 5], m5
6044    mova        [cq+128* 6], m6
6045    mova        [cq+128* 7], m7
6046    mova        [cq+128* 8], m23
6047    mova        [cq+128* 9], m22
6048    mova        [cq+128*10], m21
6049    mova        [cq+128*11], m20
6050    mova        [cq+128*12], m19
6051    mova        [cq+128*13], m18
6052    mova        [cq+128*14], m17
6053    mova        [cq+128*15], m16
6054    ret
6055
6056%endif ; ARCH_X86_64
6057