xref: /aosp_15_r20/external/libdav1d/src/x86/itx_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2020-2023, VideoLAN and dav1d authors
; Copyright © 2020-2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
; 8-bit inverse transforms, AVX-512 (icelake-client) implementation.
; NASM syntax; x86inc.asm provides cglobal/INIT_*/register abstractions.
%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64
30
; 64-byte-aligned read-only data: byte permutation/shuffle index tables
; (consumed by vpermb/pshufb-style instructions in the transforms below).
SECTION_RODATA 64
const \
dup16_perm,  db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
             db  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
             db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
             db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
const \
int8_permA,  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
int8_permB:  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
int16_perm:  db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
             db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
             db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
             db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
idtx_16x4p:  db  0,  1,  4,  5, 16, 17, 20, 21,  2,  3,  6,  7, 18, 19, 22, 23
             db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
             db  8,  9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
             db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
idct_8x32p:  db 60, 61,  4,  5, 32, 33,  0,  1, 28, 29, 36, 37, 56, 57,  8,  9
             db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
             db 62, 63,  2,  3,  6,  7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
             db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
idct_16x32p: db  6,  7, 58, 59, 38, 39, 26, 27, 32, 33,  0,  1, 30, 31, 34, 35
             db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
             db 62, 63,  2,  3, 48, 49, 16, 17, 56, 57,  8,  9, 14, 15, 50, 51
             db 54, 55, 10, 11, 60, 61,  4,  5, 12, 13, 52, 53, 28, 29, 36, 37
end_16x32p:  db  0, 32,  1, 48,  2, 36,  3, 52, 16, 40, 17, 56, 18, 44, 19, 60
             db  4, 33,  5, 49,  6, 37,  7, 53, 20, 41, 21, 57, 22, 45, 23, 61
             db  8, 35,  9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
             db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
66
; packed 4-bit qword shuffle indices
permA:       dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
             dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
             dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
             dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
permB:       dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
             dq 0xc824352d56128751, 0xd906171e74301e15
             dq 0x6271604b03472d62, 0x735342782165b426
             dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
permC:       dq 0x9d409d041551c2e0, 0xbf62bf263773a486
             dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
             dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
             dq 0x5115049dd9045b79, 0x733726bffb263d1f
permD:       dq 0x0cda098800041504, 0x0edb09b2028c3726
             dq 0x0f11fa9c01150415, 0x0988f326039d2637
             dq 0x05640f1108269d8c, 0x05290edb0aaebfae
             dq 0x0005000509378c9d, 0xffffffff0bbfaebf

; dword index tables for vpgatherdd-style addressing and scan order
pd_0to15:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
gather8a:    dd  0,  2,  1,  3,  8, 10,  9, 11
gather8b:    dd  0,  1,  4,  5,  8,  9, 12, 13
gather8c:    dd  0,  4,  2,  6, 12,  8, 14, 10
gather8d:    dd  0, 19,  1, 18,  2, 17,  3, 16

; 16-byte in-lane word shuffles (pshufb masks)
int_shuf1:   db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
int_shuf2:   db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
int_shuf3:   db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
int_shuf4:   db  8,  9,  0,  1, 12, 13,  4,  5, 10, 11,  2,  3, 14, 15,  6,  7
deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
; per-qword shift counts used with vpmultishiftqb (see ITX_MUL2X_PACK flag 4)
int_mshift:  db 12, 20,  0,  0, 44, 52,  0,  0
97
pb_32:           times 4 db 32
pw_2048:         times 2 dw 2048
pw_4096:         times 2 dw 4096
pw_8192:         times 2 dw 8192
pw_16384:        times 2 dw 16384
pw_1697x16:      times 2 dw 1697*16
pw_1697x8:       times 2 dw 1697*8
pw_2896x8:       times 2 dw 2896*8
pd_2048:         dd  2048

; The following constants happen to occur as byte patterns inside permD,
; so they are aliased into it instead of being stored a second time.
%define pw_5          (permD+52)
%define pd_m1         (permD+60)
%define pw_3803_1321  (permD+44)
%define pw_2482_3803  (permD+12)
%define pw_2440_3290  (permD+ 4)
%define pw_m3290_2440 (permD+28)
%define pw_3857_1380  (permD+36)
%define pw_m1380_3857 (permD+20)

pw_8192_m8192:   dw   8192,  -8192
pw_m8192_8192:   dw  -8192,   8192
pw_16384_m16384: dw  16384, -16384
pw_m16384_16384: dw -16384,  16384

pw_m1321_2482:   dw  -1321,  2482
pw_m3344_3344:   dw  -3344,  3344
pw_2482_3344:    dw   2482,  3344
pw_m3803_3344:   dw  -3803,  3344
pd_3344:         dd   3344
pw_m1321_m3344:  dw  -1321, -3344
pw_2896_m2896:   dw   2896, -2896

pw_1567_m3784:   dw   1567, -3784
pw_3784_m1567:   dw   3784, -1567
pw_4017_m799:    dw   4017,  -799
pw_2276_m3406:   dw   2276, -3406
pw_m799_m4017:   dw   -799, -4017
pw_m3406_m2276:  dw  -3406, -2276
136
; Emit an interleaved coefficient pair for vpdpwssd rotations:
;   pw_A_B  = {A, B}, pw_mB_A = {-B, A}; with a non-zero third argument
;   also pw_mA_mB = {-A, -B} (needed where both signs are negated).
%macro COEF_PAIR 2-3 0
pw_%1_%2:   dw  %1,  %2
pw_m%2_%1:  dw -%2,  %1
%if %3
pw_m%1_m%2: dw -%1, -%2
%endif
%endmacro
144
COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784, 1
COEF_PAIR 3784, 1567
COEF_PAIR  201, 4091
COEF_PAIR  995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3035, 2751
COEF_PAIR 3513, 2106
COEF_PAIR 4052,  601
COEF_PAIR 3166, 2598, 1
COEF_PAIR 3920, 1189, 1
COEF_PAIR 2276, 3406
COEF_PAIR 4017,  799
158
; Emit each coefficient duplicated as a {c*8, c*8} word pair
; (pre-scaled by 8 for use with pmulhrsw).
%macro COEF_X8 1-*
%rep %0
    dw %1*8, %1*8
    %rotate 1
%endrep
%endmacro
165
pw_m2276x8: COEF_X8 -2276
pw_3406x8:  COEF_X8  3406
pw_4017x8:  COEF_X8  4017
pw_799x8:   COEF_X8   799
pw_3784x8:  COEF_X8  3784
pw_1567x8:  COEF_X8  1567

pw_4076x8:  COEF_X8  4076
pw_401x8:   COEF_X8   401
pw_m2598x8: COEF_X8 -2598
pw_3166x8:  COEF_X8  3166
pw_3612x8:  COEF_X8  3612
pw_1931x8:  COEF_X8  1931
pw_m1189x8: COEF_X8 -1189
pw_3920x8:  COEF_X8  3920

pw_4091x8:  COEF_X8  4091
pw_201x8:   COEF_X8   201
pw_m2751x8: COEF_X8 -2751
pw_3035x8:  COEF_X8  3035
pw_3703x8:  COEF_X8  3703
pw_1751x8:  COEF_X8  1751
pw_m1380x8: COEF_X8 -1380
pw_3857x8:  COEF_X8  3857
pw_3973x8:  COEF_X8  3973
pw_995x8:   COEF_X8   995
pw_m2106x8: COEF_X8 -2106
pw_3513x8:  COEF_X8  3513
pw_3290x8:  COEF_X8  3290
pw_2440x8:  COEF_X8  2440
pw_m601x8:  COEF_X8  -601
pw_4052x8:  COEF_X8  4052

pw_401_4076x8:   dw   401*8, 4076*8
pw_m2598_3166x8: dw -2598*8, 3166*8
pw_1931_3612x8:  dw  1931*8, 3612*8
pw_m1189_3920x8: dw -1189*8, 3920*8
pw_799_4017x8:   dw   799*8, 4017*8
pw_m2276_3406x8: dw -2276*8, 3406*8

pw_201_4091x8:   dw   201*8, 4091*8
pw_m601_4052x8:  dw  -601*8, 4052*8
pw_995_3973x8:   dw   995*8, 3973*8
pw_m1380_3857x8: dw -1380*8, 3857*8
pw_1751_3703x8:  dw  1751*8, 3703*8
pw_m2106_3513x8: dw -2106*8, 3513*8
pw_2440_3290x8:  dw  2440*8, 3290*8
pw_m2751_3035x8: dw -2751*8, 3035*8

pw_101_4095x8:   dw   101*8, 4095*8
pw_m2824_2967x8: dw -2824*8, 2967*8
pw_1660_3745x8:  dw  1660*8, 3745*8
pw_m1474_3822x8: dw -1474*8, 3822*8
pw_897_3996x8:   dw   897*8, 3996*8
pw_m2191_3461x8: dw -2191*8, 3461*8
pw_2359_3349x8:  dw  2359*8, 3349*8
pw_m700_4036x8:  dw  -700*8, 4036*8
pw_501_4065x8:   dw   501*8, 4065*8
pw_m2520_3229x8: dw -2520*8, 3229*8
pw_2019_3564x8:  dw  2019*8, 3564*8
pw_m1092_3948x8: dw -1092*8, 3948*8
pw_1285_3889x8:  dw  1285*8, 3889*8
pw_m1842_3659x8: dw -1842*8, 3659*8
pw_2675_3102x8:  dw  2675*8, 3102*8
pw_m301_4085x8:  dw  -301*8, 4085*8

; idct64 coefficient table; the interleaving of COEF_X8/COEF_PAIR/dw rows
; defines a fixed memory layout that the idct64 code indexes into.
idct64_mul: COEF_X8  4095,   101,  2967, -2824,  3745,  1660,  3822, -1474
COEF_PAIR  401, 4076, 1
COEF_PAIR  799, 4017
            COEF_X8  -700,  4036,  2359,  3349, -2191,  3461,   897,  3996
dw    -2598, -3166,  3166, -2598,  2598,  3166, -4017,  -799,   799, -4017
            COEF_X8  4065,   501,  3229, -2520,  3564,  2019,  3948, -1092
COEF_PAIR 1931, 3612, 1
COEF_PAIR 3406, 2276
            COEF_X8  -301,  4085,  2675,  3102, -1842,  3659,  1285,  3889
dw    -1189, -3920,  3920, -1189,  1189,  3920, -2276, -3406,  3406, -2276
242
SECTION .text

; r5 holds a base pointer into the rodata above; o(x) rebases a constant's
; address relative to it, m(x) expands to the mangled name of a function.
%define o_base int8_permA+64*18
%define o(x) (r5 - (o_base) + (x))
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
248
; Butterfly rotation of packed 16-bit pairs via vpdpwssd, rounded (>>12)
; and repacked into words.
; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
;        16 = special_mul1, 32 = special_mul2
%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
    mova                m%2, m%4
%if %7 & 16
    vpdpwssd            m%2, m%1, [o(pw_%5)] {bcstd}
    mova                m%3, m%4
%if %7 & 32
    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
%else
    vpdpwssd            m%3, m%1, m%6
%endif
%elif %7 & 32
    vpdpwssd            m%2, m%1, m%5
    mova                m%3, m%4
    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
%elif %6 < 32 ; coefficients passed as register numbers rather than literals
    vpdpwssd            m%2, m%1, m%5
    mova                m%3, m%4
    vpdpwssd            m%3, m%1, m%6
%elif %7 & 1
    vpdpwssd            m%2, m%1, [o(pw_%5_%6)] {bcstd}
    mova                m%3, m%4
    vpdpwssd            m%3, m%1, [o(pw_m%6_%5)] {bcstd}
%else
    vpdpwssd            m%2, m%1, [o(pw_m%6_%5)] {bcstd}
    mova                m%3, m%4
    vpdpwssd            m%3, m%1, [o(pw_%5_%6)] {bcstd}
%endif
%if %7 & 2
    psrld               m%2, 12
    pslld               m%3, 4
    vpshrdd             m%1, m%3, m%2, 16
%elif %7 & 4
    ; compared to using shifts (as above) this has better throughput,
    ; but worse latency and requires setting up the opmask/index
    ; registers, so only use this method for the larger transforms
    pslld               m%1, m%2, 4
    vpmultishiftqb  m%1{k7}, m13, m%3
%else
    psrad               m%2, 12
    psrad               m%3, 12
%if %7 & 8 == 0
    packssdw            m%1, m%3, m%2
%endif
%endif
%endmacro
296
; Like ITX_MUL2X_PACK but with two coefficient pairs selected per lane
; through opmask k1, then delegating to ITX_MUL2X_PACK.
; flags: same as ITX_MUL2X_PACK
%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
%if %11 & 1
    vpbroadcastd        m%4, [o(pw_%9_%10)]
    vpbroadcastd    m%4{k1}, [o(pw_%7_%8)]
    vpbroadcastd        m%5, [o(pw_m%10_%9)]
    vpbroadcastd    m%5{k1}, [o(pw_m%8_%7)]
%else
    vpbroadcastd        m%4, [o(pw_m%10_%9)]
    vpbroadcastd    m%4{k1}, [o(pw_m%8_%7)]
    vpbroadcastd        m%5, [o(pw_%9_%10)]
    vpbroadcastd    m%5{k1}, [o(pw_%7_%8)]
%endif
    ITX_MUL2X_PACK       %1, %2, %3, %6, %4, %5, %11
%endmacro
312
; Non-packed butterfly rotation on two full word vectors:
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
    punpcklwd           m%3, m%2, m%1
    punpckhwd           m%2, m%1
%if %7 < 32 ; coefficients passed as register numbers rather than literals
    mova                m%1, m%5
    vpdpwssd            m%1, m%3, m%7
    mova                m%4, m%5
    vpdpwssd            m%4, m%2, m%7
%else
    mova                m%1, m%5
    vpdpwssd            m%1, m%3, [o(pw_m%7_%6)] {bcstd}
    mova                m%4, m%5
    vpdpwssd            m%4, m%2, [o(pw_m%7_%6)] {bcstd}
%endif
    psrad               m%1, 12
    psrad               m%4, 12
    packssdw            m%1, m%4
    mova                m%4, m%5
%if %7 < 32
    vpdpwssd            m%4, m%2, m%6
    mova                m%2, m%5
    vpdpwssd            m%2, m%3, m%6
%else
    vpdpwssd            m%4, m%2, [o(pw_%6_%7)] {bcstd}
    mova                m%2, m%5
    vpdpwssd            m%2, m%3, [o(pw_%6_%7)] {bcstd}
%endif
    psrad               m%4, 12
    psrad               m%2, 12
%if %0 == 8 ; optional separate second destination
    packssdw            m%8, m%2, m%4
%else
    packssdw            m%2, m%4
%endif
%endmacro
350
; Execute the wrapped statement with XMM-width register definitions,
; then restore the previous mm-register permutation state.
%macro WRAP_XMM 1+
    %xdefine %%reset RESET_MM_PERMUTATION
    INIT_XMM cpuname
    DEFINE_MMREGS xmm
    AVX512_MM_PERMUTATION
    %1
    %%reset
%endmacro
359
; Execute the wrapped statement at YMM width, then return to ZMM width.
%macro WRAP_YMM 1+
    INIT_YMM cpuname
    %1
    INIT_ZMM cpuname
%endmacro
365
; Final 4x4 stage: optionally round m0/m1 by rnd (pmulhrsw), add the four
; 4-pixel rows (in the order given by row[1-4]) to dst, clamp to bytes and
; store them back. Returns to the caller with ret.
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
    vpbroadcastd         m2, [o(pw_%5)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
%endif
    lea                  r2, [dstq+strideq*2]
%assign %%i 1
%rep 4
    ; rows 2/3 live at r2 (dst+2*stride); bit 0 selects the odd line
    %if %1 & 2
        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
    %else
        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    %endif
    %assign %%i %%i + 1
    %rotate 1
%endrep
    movd                 m2, [%%row_adr1] ; load dst pixels
    pinsrd               m2, [%%row_adr2], 1
    movd                 m3, [%%row_adr3]
    pinsrd               m3, [%%row_adr4], 1
    pmovzxbw             m2, m2
    pmovzxbw             m3, m3
    paddw                m0, m2 ; add residual to prediction
    paddw                m1, m3
    packuswb             m0, m1
    movd       [%%row_adr1], m0
    pextrd     [%%row_adr2], m0, 1
    pextrd     [%%row_adr3], m0, 2
    pextrd     [%%row_adr4], m0, 3
    ret
%endmacro
398
; Entry-point generator: declares inv_txfm_add_<type1>_<type2>_<size>_8bpc
; and dispatches into the pass-1 function, with tx2q pointing at pass 2.
%macro INV_TXFM_FN 3 ; type1, type2, size
cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
    %define %%p1 m(i%1_%3_internal_8bpc)
    lea               baseq, [o_base]
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
%ifidn %1_%2, dct_dct
    test               eobd, eobd
    jnz %%p1
%else
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro
416
; 4x4 entry points; dct_dct additionally gets an inline DC-only fast path
; (broadcast DC, scale twice by 2896/4096, share the common store tail).
%macro INV_TXFM_4X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 4x4
%ifidn %1_%2, dct_dct
    vpbroadcastw         m0, [cq]
    vpbroadcastd         m1, [o(pw_2896x8)]
    pmulhrsw             m0, m1
    mov                [cq], eobd ; clear the coefficient buffer flag
    pmulhrsw             m0, m1
    mova                 m1, m0
    jmp m(iadst_4x4_internal_8bpc).end2
%endif
%endmacro
429
; 1-D 4-point inverse DCT on two packed rows (m0/m1 in, m0/m1 out).
%macro IDCT4_1D_PACKED 0
    vpbroadcastd         m4, [o(pd_2048)]
    punpckhwd            m2, m1, m0
    punpcklwd            m1, m0
    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
    paddsw               m0, m1, m2 ; out0 out1
    psubsw               m1, m2     ; out3 out2
%endmacro
439
; 1-D 4-point inverse ADST on two packed rows; .main2 is a re-entry point
; for callers that have already interleaved the inputs into m4/m5.
%macro IADST4_1D_PACKED 0
    punpcklwd            m4, m1, m0 ; in2 in0
    punpckhwd            m5, m1, m0 ; in3 in1
.main2:
    vpbroadcastd         m3, [o(pd_2048)]
    mova                 m0, m3
    vpdpwssd             m0, m4, [o(pw_3803_1321)] {bcstd}
    mova                 m2, m3
    vpdpwssd             m2, m4, [o(pw_m1321_2482)] {bcstd}
    mova                 m1, m3
    vpdpwssd             m1, m4, [o(pw_m3344_3344)] {bcstd}
    vpdpwssd             m3, m4, [o(pw_2482_3803)] {bcstd}
    vpdpwssd             m0, m5, [o(pw_2482_3344)] {bcstd}
    vpdpwssd             m2, m5, [o(pw_m3803_3344)] {bcstd}
    vpdpwssd             m1, m5, [o(pd_3344)] {bcstd}
    vpdpwssd             m3, m5, [o(pw_m1321_m3344)] {bcstd}
    REPX      {psrad x, 12}, m0, m2, m1, m3
    packssdw             m0, m2 ; out0 out1
    packssdw             m1, m3 ; out2 out3
%endmacro
460
INIT_XMM avx512icl
INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
INV_TXFM_4X4_FN dct, identity

; 4x4 inverse DCT: pass 1 transforms + transposes, pass 2 transforms,
; clears cq and stores via ITX4_END.
cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    IDCT4_1D_PACKED
    mova                 m2, [o(deint_shuf)]
    shufps               m3, m0, m1, q1331
    shufps               m0, m0, m1, q0220
    pshufb               m0, m2
    pshufb               m1, m3, m2
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    pxor              ymm16, ymm16
    mova               [cq], ymm16 ; zero the coefficient buffer
    ITX4_END              0, 1, 3, 2
482
INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

; 4x4 inverse ADST; .end/.end2 are shared store tails (also used by the
; dct_dct fast path and iidentity).
cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    call .main
    punpckhwd            m3, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call .main
.end:
    pxor              ymm16, ymm16
    mova               [cq], ymm16 ; zero the coefficient buffer
.end2:
    ITX4_END              0, 1, 2, 3
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret
508
INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

; 4x4 flipped ADST: reuses the iadst main body, flips by reversing the
; transpose interleave in pass 1 and the row order at the store in pass 2.
cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    call m(iadst_4x4_internal_8bpc).main
    punpcklwd            m2, m1, m0
    punpckhwd            m1, m0
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    jmp                tx2q
.pass2:
    call m(iadst_4x4_internal_8bpc).main
.end:
    pxor              ymm16, ymm16
    mova               [cq], ymm16 ; zero the coefficient buffer
.end2:
    ITX4_END              3, 2, 1, 0 ; rows stored in reverse order
530
INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

; 4x4 identity transform: x += (x * 1697*8) rounded (pmulhrsw), i.e. a
; fixed up-scaling per pass, plus a transpose in pass 1.
cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    jmp m(iadst_4x4_internal_8bpc).end
556
; 4x8 entry points; dct_dct gets a DC-only fast path (three 2896/4096
; scales + 2048 rounding, broadcast, shared store tail).
%macro INV_TXFM_4X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 4x8
%ifidn %1_%2, dct_dct
    movd               xmm1, [o(pw_2896x8)]
    pmulhrsw           xmm0, xmm1, [cq]
    movd               xmm2, [o(pw_2048)]
    pmulhrsw           xmm0, xmm1
    pmulhrsw           xmm0, xmm1
    pmulhrsw           xmm0, xmm2
    vpbroadcastw        ym0, xmm0
    mova                ym1, ym0
    jmp m(iadst_4x8_internal_8bpc).end3
%endif
%endmacro
571
; 1-D 8-point inverse DCT on packed rows in m0-m3; .main2 re-enters after
; the input interleave for callers that pre-arranged m2/m3/m4/m5.
%macro IDCT8_1D_PACKED 0
    punpckhwd            m5, m3, m0 ; in7 in1
    punpckhwd            m4, m1, m2 ; in3 in5
    punpcklwd            m3, m1     ; in6 in2
    punpcklwd            m2, m0     ; in4 in0
.main2:
    vpbroadcastd         m6, [o(pd_2048)]
    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
    paddsw               m4, m5     ; t4  t7  (interleaved)
    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
    ITX_MUL2X_PACK        0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
%if mmsize > 16
    vbroadcasti32x4      m1, [o(deint_shuf)]
    pshufb               m4, m1
%else
    pshufb               m4, [o(deint_shuf)]
%endif
    psubsw               m1, m2, m3 ; tmp3 tmp2
    paddsw               m3, m2     ; tmp0 tmp1
    punpckhqdq           m2, m4, m0 ; t7 t6
    punpcklqdq           m4, m0     ; t4 t5
    paddsw               m0, m3, m2 ; out0 out1
    psubsw               m3, m2     ; out7 out6
    psubsw               m2, m1, m4 ; out4 out5
    paddsw               m1, m4     ; out3 out2
%endmacro
601
; 1-D 8-point inverse ADST on packed rows; the pass argument selects
; between the pass-1 variant (inputs pre-interleaved by the caller) and
; the pass-2 variant (interleaves its own inputs from m2-m5).
%macro IADST8_1D_PACKED 1 ; pass
    vpbroadcastd         m6, [o(pd_2048)]
%if %1 == 1
    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
    psubsw               m4, m0, m2 ; t5 t4
    paddsw               m0, m2     ; t1 t0
    psubsw               m5, m1, m3 ; t6 t7
    paddsw               m1, m3     ; t2 t3
    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
%if mmsize > 16
    vbroadcasti32x4      m2, [o(deint_shuf)]
%else
    mova                 m2, [o(deint_shuf)]
%endif
    vprord               m1, 16
    psubsw               m3, m0, m1 ; t3 t2
    paddsw               m0, m1     ; -out7  out0
    psubsw               m1, m4, m5 ; t7 t6
    paddsw               m4, m5     ;  out6 -out1
    pshufb               m0, m2
    pshufb               m4, m2
    mova                 m2, m6
    vpdpwssd             m2, m3, [o(pw_m2896_2896)] {bcstd}
    mova                 m5, m6
    vpdpwssd             m5, m1, [o(pw_m2896_2896)] {bcstd}
    psrad                m2, 12
    psrad                m5, 12
    packssdw             m2, m5     ; out4 -out5
    mova                 m5, m6
    vpdpwssd             m5, m3, [o(pw_2896_2896)] {bcstd}
    mova                 m3, m6
    vpdpwssd             m3, m1, [o(pw_2896_2896)] {bcstd}
    psrad                m5, 12
    psrad                m3, 12
    packssdw             m1, m3, m5 ; out2 -out3
%else
    punpckhwd            m0, m4, m3 ; 0 7
    punpckhwd            m1, m5, m2 ; 2 5
    punpcklwd            m2, m5     ; 4 3
    punpcklwd            m3, m4     ; 6 1
    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
    psubsw               m4, m0, m2 ; t4 t5
    paddsw               m0, m2     ; t0 t1
    psubsw               m5, m1, m3 ; t6 t7
    paddsw               m1, m3     ; t2 t3
    shufps               m2, m5, m4, q1032
    punpckhwd            m4, m2
    punpcklwd            m5, m2
    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784    ; t4a t5a
    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
    psubsw               m2, m0, m1 ; t2 t3
    paddsw               m0, m1     ; out0 -out7
    psubsw               m1, m4, m5 ; t6 t7
    paddsw               m4, m5     ; -out1 out6
    vpbroadcastd         m5, [o(pw_2896x8)]
    punpckhqdq           m3, m2, m1 ; t3 t7
    punpcklqdq           m2, m1     ; t2 t6
    paddsw               m1, m2, m3 ; t2+t3 t6+t7
    psubsw               m2, m3     ; t2-t3 t6-t7
    punpckhqdq           m3, m4, m0 ; out6 -out7
    punpcklqdq           m0, m4     ; out0 -out1
    pmulhrsw             m2, m5     ; out4 -out5
    pshufd               m1, m1, q1032
    pmulhrsw             m1, m5     ; out2 -out3
%endif
%endmacro
675
INIT_YMM avx512icl
INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst

; 4x8 inverse DCT: pass 1 is a scaled 4-point DCT + transpose on ymm rows;
; pass 2 runs the 8-point DCT on xmm halves via WRAP_XMM.
cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120
    vpermq               m1, [cq+32*1], q3120
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    IDCT4_1D_PACKED
    vbroadcasti32x4      m2, [o(deint_shuf)]
    shufps               m3, m0, m1, q1331
    shufps               m0, m0, m1, q0220
    pshufb               m0, m2
    pshufb               m1, m3, m2
    jmp                tx2q
.pass2:
    vextracti32x4       xm2, m0, 1
    vextracti32x4       xm3, m1, 1
    call .main
    vpbroadcastd         m4, [o(pw_2048)]
    vinserti32x4         m0, m0, xm2, 1
    vinserti32x4         m1, m1, xm3, 1
    pshufd               m1, m1, q1032
    jmp m(iadst_4x8_internal_8bpc).end2
ALIGN function_align
.main:
    WRAP_XMM IDCT8_1D_PACKED
    ret
708
INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

; 4x8 inverse ADST. .end/.end2/.end3 form the shared gather/scatter store
; tail reused by the other 4x8 functions (and the dct_dct fast path).
cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120
    vpermq               m1, [cq+32*1], q3120
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    call m(iadst_8x4_internal_8bpc).main
    punpckhwd            m3, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    vextracti32x4       xm2, m0, 1
    vextracti32x4       xm3, m1, 1
    pshufd              xm4, xm0, q1032
    pshufd              xm5, xm1, q1032
    call .main_pass2
    vpbroadcastd         m4, [o(pw_2048)]
    vinserti32x4         m0, xm2, 1
    vinserti32x4         m1, xm3, 1
    pxor                 m5, m5
    psubw                m5, m4 ; negated rounding for the odd outputs
.end:
    punpcklqdq           m4, m5
.end2:
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
.end3:
    ; gather dst rows through a stride*index vector, add, scatter back
    vpbroadcastd         m3, strided
    pmulld               m5, m3, [o(pd_0to15)]
    kxnorb               k1, k1, k1
    kmovb                k2, k1
    vpgatherdd       m3{k1}, [dstq+m5]
    pxor                 m4, m4
    mova               [cq], zmm20 ; zero the coefficient buffer
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    paddw                m0, m2
    paddw                m1, m3
    packuswb             m0, m1
    vpscatterdd [dstq+m5]{k2}, m0
    RET
ALIGN function_align
.main_pass1:
    punpckhwd           xm0, xm4, xm3 ; 0 7
    punpckhwd           xm1, xm5, xm2 ; 2 5
    punpcklwd           xm2, xm5      ; 4 3
    punpcklwd           xm3, xm4      ; 6 1
    WRAP_XMM IADST8_1D_PACKED 1
    punpcklqdq          xm3, xm4, xm0 ; out6 -out7
    punpckhqdq          xm0, xm4      ; out0 -out1
    ret
ALIGN function_align
.main_pass2:
    WRAP_XMM IADST8_1D_PACKED 2
    ret
771
INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

; 4x8 flipped ADST: shares the iadst main bodies; the flip is achieved by
; reversed interleaving in pass 1 and swapped/negated rounding in pass 2.
cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120
    vpermq               m1, [cq+32*1], q3120
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    call m(iadst_8x4_internal_8bpc).main
    punpcklwd            m3, m1, m0
    punpckhwd            m1, m0
    punpcklwd            m0, m1, m3
    punpckhwd            m1, m3
    jmp                tx2q
.pass2:
    vextracti32x4       xm2, m0, 1
    vextracti32x4       xm3, m1, 1
    pshufd              xm4, xm0, q1032
    pshufd              xm5, xm1, q1032
    call m(iadst_4x8_internal_8bpc).main_pass2
    vpbroadcastd         m5, [o(pw_2048)]
    vinserti32x4         m3, xm1, 1
    vinserti32x4         m2, xm0, 1
    pxor                 m4, m4
    psubw                m4, m5 ; negated rounding for the odd outputs
    pshufd               m0, m3, q1032
    pshufd               m1, m2, q1032
    jmp m(iadst_4x8_internal_8bpc).end
803
INIT_ZMM avx512icl
INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

; 4x8 identity: pass 1 does the scale + transpose in one zmm register
; (vpermb with int8_permB), pass 2 only applies pw_4096 rounding via the
; shared iadst store tail.
cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd         m0, [o(pw_2896x8)]
    pmulhrsw             m0, [cq]
    mova                 m1, [o(int8_permB)]
    vpbroadcastd         m2, [o(pw_1697x8)]
    vpermb               m0, m1, m0
    pmulhrsw             m2, m0
    paddsw               m0, m2
    vextracti32x8       ym1, m0, 1
    jmp                tx2q
.pass2:
    vpbroadcastd        ym4, [o(pw_4096)]
    jmp m(iadst_4x8_internal_8bpc).end2
823
; 4x16 entry points; dct_dct gets a scalar DC-only fast path:
; two rounds of x = (x*181 + rnd) >> shift (181/256 ~= 2896/4096),
; then broadcast and jump to the shared store tail.
%macro INV_TXFM_4X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 4x16
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    imul                r6d, 181
    add                 r6d, 128+256
    sar                 r6d, 8+1
    imul                r6d, 181
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    vpbroadcastw         m0, r6d
    mova                 m1, m0
    jmp m(iadst_4x16_internal_8bpc).end3
%endif
%endmacro
840
; One-dimensional 16-point inverse DCT with two 16-bit values packed per
; dword lane (pairs interleaved via punpck[lh]wd, butterflies done with
; the ITX_MUL2X_PACK rotation helper).  Inputs in m0-m7, outputs in
; m0-m7; m8-m13 and mask k7 are clobbered as temporaries.
; .main2/.main3/.main4/.main5 are alternative entry points used by the
; reduced-input (*_fast*) variants elsewhere in the file.
%macro IDCT16_1D_PACKED 0
    punpckhwd            m8, m7, m0 ; dct16 in15 in1
    punpcklwd            m9, m4, m0 ; dct4  in2  in0
    punpckhwd            m0, m3, m4 ; dct16 in7  in9
    punpcklwd            m7, m1     ; dct8  in7  in1
    punpckhwd            m1, m6     ; dct16 in3  in13
    punpcklwd            m3, m5     ; dct8  in3  in5
    punpckhwd            m5, m2     ; dct16 in11 in5
    punpcklwd            m6, m2     ; dct4  in3  in1
cglobal_label .main2
    vpbroadcastd        m10, [o(pd_2048)]
.main3:
    ; k7 selects alternating words for the packed rotations below
    vpbroadcastq        m13, [o(int_mshift)]
    vpcmpub              k7, m13, m10, 6 ; 0x33...
    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 5 ; t8a  t15a
    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 5 ; t9a  t14a
    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 5 ; t4a  t7a
    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 5 ; t5a  t6a
.main4:
    psubsw               m2, m8, m0 ; t9  t14
    paddsw               m8, m0     ; t8  t15
    psubsw               m4, m1, m5 ; t10 t13
    paddsw               m1, m5     ; t11 t12
    ITX_MUL2X_PACK        6, 0, 5, 10, 1567,  3784    ; t3   t2
    psubsw               m0, m8, m1 ; t11a t12a
    paddsw               m8, m1     ; t8a  t15a
    psubsw               m1, m7, m3 ; t5a  t6a
    paddsw               m7, m3     ; t4   t7
.main5:
    ITX_MUL2X_PACK        2, 3, 5, 10, 1567,  3784, 5 ; t9a  t14a
    ITX_MUL2X_PACK        4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
%if mmsize > 16
    vbroadcasti32x4      m5, [o(deint_shuf)]
%else
    mova                 m5, [o(deint_shuf)]
%endif
    vpbroadcastd        m11, [o(pw_m2896_2896)]
    vpbroadcastd        m12, [o(pw_2896_2896)]
    paddsw               m3, m2, m4 ; t9   t14
    psubsw               m2, m4     ; t10  t13
    pshufb               m8, m5     ; de-interleave packed pairs
    pshufb               m7, m5
    pshufb               m3, m5
    ITX_MUL2X_PACK        9, 4,  5, 10, 11, 12    ; t0   t1
    ITX_MUL2X_PACK        1, 4,  5, 10, 12, 11    ; t5   t6
    ITX_MUL2X_PACK        0, 4,  5, 10, 11, 12, 8 ; t11  t12
    ITX_MUL2X_PACK        2, 0, 11, 10, 11, 12, 8 ; t10a t13a
    punpckhqdq           m2, m7, m1 ; t7 t6
    punpcklqdq           m7, m1     ; t4 t5
    psubsw               m1, m9, m6 ; dct4 out3 out2
    paddsw               m9, m6     ; dct4 out0 out1
    packssdw             m5, m11    ; t12  t13a
    packssdw             m4, m0     ; t11  t10a
    punpckhqdq           m0, m8, m3 ; t15a t14
    punpcklqdq           m8, m3     ; t8a  t9
    psubsw               m3, m9, m2 ; dct8 out7 out6
    paddsw               m9, m2     ; dct8 out0 out1
    psubsw               m2, m1, m7 ; dct8 out4 out5
    paddsw               m1, m7     ; dct8 out3 out2
    ; final stage: combine the dct8 half with the odd-coefficient half
    psubsw               m7, m9, m0 ; out15 out14
    paddsw               m0, m9     ; out0  out1
    psubsw               m6, m1, m5 ; out12 out13
    paddsw               m1, m5     ; out3  out2
    psubsw               m5, m2, m4 ; out11 out10
    paddsw               m2, m4     ; out4  out5
    psubsw               m4, m3, m8 ; out8  out9
    paddsw               m3, m8     ; out7  out6
%endmacro
911
INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst

; 4x16 inverse DCT, 8bpc.
; Pass 1: 4-point DCT over all 16 columns at once, with the columns
; interleaved by int16_perm so ITX_MUL2X_PACK can process pairs.
; Pass 2: the 16-point DCT (IDCT16_1D_PACKED run on xmm registers).
cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                ym1, [cq+32*2]
    vinserti32x8         m1, [cq+32*0], 1
    mova                 m0, [o(int16_perm)]
    mova                ym2, [cq+32*3]
    vinserti32x8         m2, [cq+32*1], 1
    vpbroadcastd         m4, [o(pd_2048)]
    vpermb               m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
    vpermb               m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896, 2
    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784, 2
    vpbroadcastd         m4, [o(pw_16384)]
    psubsw               m3, m1, m2
    paddsw               m1, m2     ; out0 out1
    vprord               m3, 16     ; out2 out3
    punpckldq            m0, m1, m3
    punpckhdq            m1, m3
    pmulhrsw             m0, m4     ; inter-pass scaling
    pmulhrsw             m1, m4
    jmp                tx2q
.pass2:
    ; spread the 16 rows over xm0-xm7 for the xmm-width 16-point DCT
    vextracti32x4       xm2, ym0, 1
    vextracti32x4       xm3, ym1, 1
    vextracti32x4       xm4, m0, 2
    vextracti32x4       xm5, m1, 2
    vextracti32x4       xm6, m0, 3
    vextracti32x4       xm7, m1, 3
    call .main
    ; re-pack the 8 xmm outputs into two zmm registers
    vinserti32x4        ym0, xm2, 1
    vinserti32x4        ym1, xm3, 1
    vinserti32x4        ym4, xm6, 1
    vinserti32x4        ym5, xm7, 1
    vinserti32x8         m0, ym4, 1
    vinserti32x8         m1, ym5, 1
    vpbroadcastd         m5, [o(pw_2048)]
    pshufd               m1, m1, q1032
    jmp m(iadst_4x16_internal_8bpc).end2
ALIGN function_align
.main:
    WRAP_XMM IDCT16_1D_PACKED
    ret
958
INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

; 4x16 inverse ADST, 8bpc.
; Pass 1 reuses the 16x4 ADST main loop (same 1-D transform, transposed
; data layout).  Pass 2 is the 16-point ADST in .main below, which is
; shared with iflipadst_4x16 (the two differ only in the output
; permutation selected via m10 and the sign mask built from m8/pw_2048).
cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m1, [o(permB)]
    vpermq               m0, m1, [cq+64*0]
    vpermq               m1, m1, [cq+64*1]
    call m(iadst_16x4_internal_8bpc).main
    vpbroadcastd         m3, [o(pw_16384)]
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    pmulhrsw             m2, m3
    pmulhrsw             m0, m3
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    jmp                tx2q
.pass2:
    call .main
    vpbroadcastd         m5, [o(pw_2048)]
    psrlq               m10, 4          ; select the adst output order
    psubw                m6, m8, m5     ; m8 = 0 (set in .main), so m6 = -2048
.end:
    vpbroadcastd         m7, [o(pw_2896x8)]
    paddsw              ym1, ym2, ym4
    psubsw              ym2, ym4
    vinserti32x8         m1, ym2, 1
    pmulhrsw             m1, m7      ; -out7   out4   out6  -out5   out8  -out11 -out9   out10
    psrlq                m0, m10, 4
    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
    punpcklqdq           m5, m6      ; interleave +/- scaling factors to fix output signs
.end2:
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
.end3:
    ; add to destination: gather 4-pixel rows with a stride table,
    ; add the residual, pack and scatter back
    vpbroadcastd         m3, strided
    pmulld               m5, m3, [o(pd_0to15)]
    kxnorw               k1, k1, k1
    kmovw                k2, k1
    vpgatherdd       m3{k1}, [dstq+m5]
    pxor                 m4, m4
    mova          [cq+64*0], m4     ; clear the coefficient buffer
    mova          [cq+64*1], m4
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    paddw                m0, m2
    paddw                m1, m3
    packuswb             m0, m1
    vpscatterdd [dstq+m5]{k2}, m0
    RET
ALIGN function_align
.main:
    movu                 m3, [o(permB+1)]
    psrlq               m10, m3, 4
.main2:
    vpermi2q             m3, m0, m1  ; in15 in12 in13 in14 in11 in8  in9  in10
    vpermt2q             m0, m10, m1 ; in0  in3  in2  in1  in4  in7  in6  in5
    vpbroadcastd         m9, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    kxnorb               k1, k1, k1
    punpckhwd            m4, m3, m0  ; in12 in3  in14 in1
    punpcklwd            m0, m3      ; in0  in15 in2  in13
    kshiftrb             k1, k1, 4
    vextracti32x8       ym3, m4, 1   ; in8  in7  in10 in5
    vextracti32x8       ym1, m0, 1   ; in4  in11 in6  in9
INIT_YMM avx512icl
    vpcmpub              k7, m13, m9, 6 ; 0x33...
    pxor                 m8, m8
    ITX_MUL4X_PACK        0, 2, 5, 6, 7, 9,  201, 4091,  995, 3973, 5
    ITX_MUL4X_PACK        1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
    ITX_MUL4X_PACK        3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK        4, 2, 5, 6, 7, 9, 3857, 1380, 4052,  601, 5
    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
    paddsw               m4, m1     ; t5a  t4a  t7a  t6a
    ITX_MUL4X_PACK        2, 1, 5, 6, 7, 9,  799, 4017, 3406, 2276, 5
    psubw                m7, m8, m7
    ITX_MUL2X_PACK        3, 1, 5, 9, 7, 6, 4
    vpbroadcastd         m6, [o(pw_3784_m1567)]
    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
    psubsw               m1, m0, m4 ; t5   t4   t7   t6
    paddsw               m0, m4     ; t1   t0   t3   t2
    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
    vbroadcasti32x4      m5, [o(deint_shuf)]
    pshufb               m0, m5
    pshufb               m2, m5
    vshufi32x4           m3, m0, m2, 0x03  ; t3   t2   t11a t10a
    vinserti32x4         m0, xm2, 1        ; t1   t0   t9a  t8a
    vshufi32x4           m2, m1, m4, 0x03  ; t7a  t6a  t15  t14
    vinserti32x4         m1, xm4, 1        ; t4a  t5a  t12  t13
    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
    psubsw               m4, m0, m3        ; t3a t2a t11 t10
    paddsw               m0, m3            ; -out15  out0   out14 -out1
    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
    psubsw               m1, m2            ; t7 t6 t15a t14a
    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
INIT_ZMM avx512icl
    vinserti32x8         m3, ym0, 1        ; out12 -out3  -out13  out2  -out15  out0   out14 -out1
    ret
1065
INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

; 4x16 inverse flipped ADST, 8bpc.  Same transform as iadst_4x16; the
; flip is achieved by reversing the pass-1 word interleave and, in pass
; 2, picking a different output permutation (m10 shift) and swapping
; which of m5/m6 holds the negated scaling constant.
cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m1, [o(permB)]
    vpermq               m0, m1, [cq+64*0]
    vpermq               m1, m1, [cq+64*1]
    call m(iadst_16x4_internal_8bpc).main
    vpbroadcastd         m3, [o(pw_16384)]
    punpcklwd            m2, m1, m0
    punpckhwd            m1, m0
    pmulhrsw             m2, m3
    pmulhrsw             m1, m3
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    jmp                tx2q
.pass2:
    call m(iadst_4x16_internal_8bpc).main
    vpbroadcastd         m6, [o(pw_2048)]
    psrlq               m10, 12         ; flipadst output order
    psubw                m5, m8, m6     ; m8 = 0, so m5 = -2048 (vs adst's m6)
    jmp m(iadst_4x16_internal_8bpc).end
1090
INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

; 4x16 identity transform, 8bpc.
; Pass 1 applies x += x*1697/32768 using pavgw (avg of x and
; x*1697/16384 halves the sum), with a mask fixup for the x == -1 case
; described in the inline comments below.
cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m2, [o(int16_perm)]
    vpermb               m1, m2, [cq+64*0]
    vpermb               m2, m2, [cq+64*1]
    vpbroadcastd         m4, [o(pw_1697x8)]
    vpbroadcastd         m0, [o(pd_m1)]
    pmulhrsw             m3, m4, m1    ; we want to do a signed avg, but pavgw is
    vpcmpw               k1, m1, m0, 4 ; unsigned. as long as both signs are equal
    pmulhrsw             m4, m2        ; it still works, but if the input is -1 the
    vpcmpw               k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
    vpavgw        m1{k1}{z}, m3        ; pavgw to output -32768 instead of 0 unless
    vpavgw        m2{k2}{z}, m4        ; we explicitly deal with that case here.
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    jmp                tx2q
.pass2:
    ; pass 2 identity scale for 16-tall blocks: x = 2*x + x*1697/16384
    vpbroadcastd         m3, [o(pw_1697x16)]
    vpbroadcastd         m5, [o(pw_2048)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m0
    paddsw               m1, m1
    paddsw               m0, m2
    paddsw               m1, m3
    jmp m(iadst_4x16_internal_8bpc).end2
1121
; Add two registers of 16-bit residuals (coefs[1-2]) to four 8-pixel
; destination rows and store the clamped result.  tmp[1-2] are scratch
; register numbers; off[1-3] are the row offsets (default strideq*1,
; strideq*2, r6 = strideq*3).  coefs may be given as register numbers or
; as full operands.  Note the row order: after packuswb, lane 0 of the
; packed result holds rows 0 and 2 (offsets 0 and %6) and lane 1 holds
; rows 1 and 3 (offsets %5 and %7), hence the swapped store offsets.
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
    movq               xm%3, [dstq   ]
    movhps             xm%3, [dstq+%5]
    movq               xm%4, [dstq+%6]
    movhps             xm%4, [dstq+%7]
    pmovzxbw            m%3, xm%3
    pmovzxbw            m%4, xm%4
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vextracti32x4      xm%4, m%3, 1
    movq          [dstq   ], xm%3
    movhps        [dstq+%6], xm%3
    movq          [dstq+%5], xm%4
    movhps        [dstq+%7], xm%4
%endmacro
1146
; Declare the inv_txfm_add entry point for an 8x4 type1/type2 pair.
; For dct_dct, emit the dc-only fast path: scale the DC coefficient
; (three pw_2896x8 multiplies cover both rect2 scalings and the 1-D
; transform gains, then pw_2048 applies the output rounding), broadcast
; it and reuse the shared iadst_8x4 store tail.
%macro INV_TXFM_8X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x4
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_2048)]
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    jmp m(iadst_8x4_internal_8bpc).end3
%endif
%endmacro
1161
INIT_YMM avx512icl
INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
INV_TXFM_8X4_FN dct, identity

; 8x4 inverse DCT, 8bpc (ymm registers).
; Pass 1: rect2 scaling, then the 8-point DCT shared with idct_4x8,
; followed by a shuffle-based transpose into pass-2 layout.
; Pass 2: packed 4-point DCT.
cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd        xm3, [o(pw_2896x8)]
    pmulhrsw            xm0, xm3, [cq+16*0]
    pmulhrsw            xm1, xm3, [cq+16*1]
    pmulhrsw            xm2, xm3, [cq+16*2]
    pmulhrsw            xm3,      [cq+16*3]
    call m(idct_4x8_internal_8bpc).main
    ; transpose 8x4 -> 4x8 via shufps + byte shuffle
    vbroadcasti32x4      m4, [o(deint_shuf)]
    vinserti32x4         m3, m1, xm3, 1
    vinserti32x4         m1, m0, xm2, 1
    shufps               m0, m1, m3, q0220
    shufps               m1, m3, q1331
    pshufb               m0, m4
    pshufb               m1, m4
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    jmp m(iadst_8x4_internal_8bpc).end2
1188
INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

; 8x4 inverse ADST, 8bpc (ymm registers).
; Pass 1 reuses the 4x8 ADST main routine on rect2-scaled input, then
; transposes (with sign fixups) into pass-2 layout.  Pass 2 is the
; packed 4-point ADST.  .end/.end2/.end3 are shared tails also jumped to
; by the dct, flipadst and identity 8x4 variants.
cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pshufd              xm4,      [cq+16*0], q1032
    pmulhrsw            xm3, xm0, [cq+16*3]
    pshufd              xm5,      [cq+16*1], q1032
    pmulhrsw            xm2, xm0, [cq+16*2]
    pmulhrsw            xm4, xm0
    pmulhrsw            xm5, xm0
    call m(iadst_4x8_internal_8bpc).main_pass1
    vinserti32x4         m0, xm2, 1
    vinserti32x4         m1, xm3, 1
    pxor                 m3, m3
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    psubsw               m3, m2     ; negate the odd outputs
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call .main
.end:
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q3120
.end2:
    vpbroadcastd         m2, [o(pw_2048)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
.end3:
    pxor                 m2, m2
    ; Clear all 64 bytes of the coefficient buffer.  pxor above zeroes
    ; ymm2, and its VEX encoding also zeroes bits 511:256, so storing
    ; the full zmm2 writes 64 zero bytes.  (Was zmm18, which is never
    ; written on this path and would store garbage over cq.)
    mova               [cq], zmm2
    lea                  r6, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret
1231
INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

; 8x4 inverse flipped ADST, 8bpc.  Identical math to iadst_8x4; the flip
; is folded into the pass-1 interleave order and the pass-2 output
; permutation (q2031 with swapped registers).
cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pshufd              xm4,      [cq+16*0], q1032
    pmulhrsw            xm3, xm0, [cq+16*3]
    pshufd              xm5,      [cq+16*1], q1032
    pmulhrsw            xm2, xm0, [cq+16*2]
    pmulhrsw            xm4, xm0
    pmulhrsw            xm5, xm0
    call m(iadst_4x8_internal_8bpc).main_pass1
    vinserti32x4         m3, m3, xm1, 1
    vinserti32x4         m2, m2, xm0, 1
    punpckhwd            m1, m3, m2
    punpcklwd            m3, m2
    pxor                 m0, m0
    psubsw               m0, m1     ; negate the odd outputs
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call m(iadst_8x4_internal_8bpc).main
    mova                 m2, m1
    vpermq               m1, m0, q2031
    vpermq               m0, m2, q2031
    jmp m(iadst_8x4_internal_8bpc).end2
1261
INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; 8x4 identity transform, 8bpc.
; Pass 1: rect2 scaling folded into the transpose interleaves, then the
; 4-tall identity scale x *= 2 (paddsw x, x).
; Pass 2: 8-wide identity scale x += x*1697/16384.
cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm2, [cq+16*0]
    mova                xm0, [cq+16*1]
    vinserti32x4         m2, [cq+16*2], 1
    vinserti32x4         m0, [cq+16*3], 1
    vpbroadcastd         m3, [o(pw_2896x8)]
    punpcklwd            m1, m2, m0
    punpckhwd            m2, m0
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    paddsw               m0, m0     ; x *= 2
    paddsw               m1, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    jmp m(iadst_8x4_internal_8bpc).end
1289
; Declare the inv_txfm_add entry point for an 8x8 type1/type2 pair.
; For dct_dct, emit the dc-only path.  .dconly/.dconly2 are shared by
; taller 8xN blocks (they jump in with r3d = height and partially
; prepared r6d).  The loop gathers 8 rows of 8 pixels with vpgatherdq,
; adds the broadcast DC value, and scatters them back; since gather and
; scatter both consume (zero) their mask register, k1/k2 are copied back
; and forth each iteration.
%macro INV_TXFM_8X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x8
%ifidn %1_%2, dct_dct
INIT_ZMM avx512icl
    movsx               r6d, word [cq]
    mov                [cq], eobd       ; overwrite DC so cq is clean
.dconly:
    imul                r6d, 181
    add                 r6d, 128+256
    sar                 r6d, 8+1
.dconly2:
    vpbroadcastd        ym2, strided
    imul                r6d, 181
    pmulld              ym5, ym2, [o(pd_0to15)] ; per-row byte offsets
    kxnorb               k1, k1, k1
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    pxor                 m3, m3
    vpbroadcastw         m4, r6d
.dconly_loop:
    kmovb                k2, k1
    vpgatherdq       m2{k1}, [dstq+ym5]
    punpcklbw            m0, m2, m3
    punpckhbw            m1, m2, m3
    paddw                m0, m4
    paddw                m1, m4
    packuswb             m0, m1
    kmovb                k1, k2
    vpscatterdq [dstq+ym5]{k2}, m0
    lea                dstq, [dstq+strideq*8]
    sub                 r3d, 8
    jg .dconly_loop
    RET
INIT_YMM avx512icl
%endif
%endmacro
1326
INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

; 8x8 inverse DCT, 8bpc (ymm, two rows per register).
; Both passes use the packed 8-point DCT in .main; between them the data
; is transposed with shufps/pshufb and scaled by 16384/32768.
cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120 ; 0 1
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    vpermq               m1, [cq+32*1], q3120 ; 2 3
    call .main
    ; transpose + inter-pass scaling
    shufps               m4, m0, m1, q0220
    shufps               m5, m0, m1, q1331
    shufps               m1, m2, m3, q0220
    shufps               m3, m2, m3, q1331
    vbroadcasti32x4      m0, [o(deint_shuf)]
    vpbroadcastd         m2, [o(pw_16384)]
    REPX   {pshufb   x, m0}, m4, m5, m1, m3
    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
    vinserti32x4         m0, m4, xm1, 1
    vshufi32x4           m2, m4, m1, 0x03
    vinserti32x4         m1, m5, xm3, 1
    vshufi32x4           m3, m5, m3, 0x03
    jmp                tx2q
.pass2:
    call .main
    vpbroadcastd         m4, [o(pw_2048)]
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    vpermq               m2, m2, q3120
    vpermq               m3, m3, q2031
    jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret
1363
INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

; 8x8 inverse ADST, 8bpc (ymm, two rows per register).
; The ADST produces alternating-sign outputs; the signs are folded into
; the inter-pass scale (pw_16384_m16384) in pass 1 and into the mixed
; +/-2048 rounding constant in pass 2.  .end/.end2/.end3/.end4 are
; shared store tails used by the other 8x8 variants.
cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call .main_pass1
    vpbroadcastd         m5, [o(pw_16384_m16384)]
    punpcklwd            m4, m0, m1
    punpckhwd            m0, m1
    punpcklwd            m1, m2, m3
    punpckhwd            m2, m3
    punpcklwd            m3, m4, m0
    punpckhwd            m4, m0
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    REPX   {pmulhrsw x, m5}, m3, m4, m0, m1 ; scale + sign fixup
    vshufi32x4           m2, m3, m0, 0x03
    vinserti32x4         m0, m3, xm0, 1
    vshufi32x4           m3, m4, m1, 0x03
    vinserti32x4         m1, m4, xm1, 1
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call .main_pass2
    vpbroadcastd         m5, [o(pw_2048)]
    vpbroadcastd        xm4, [o(pw_4096)]
    psubw                m4, m5 ; lower half = 2048, upper half = -2048
.end:
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
.end2:
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
.end3:
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
.end4:
    ; clear coefficients, then add all 8 rows to the destination
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
    lea                  r6, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             2, 3, 4, 5
    RET
ALIGN function_align
.main_pass1:
    punpckhwd            m0, m4, m3 ; 0 7
    punpckhwd            m1, m5, m2 ; 2 5
    punpcklwd            m2, m5     ; 4 3
    punpcklwd            m3, m4     ; 6 1
    IADST8_1D_PACKED 1
    punpcklqdq           m3, m4, m0        ; out6 -out7
    punpckhqdq           m0, m4            ; out0 -out1
    ret
ALIGN function_align
cglobal_label .main_pass2
    IADST8_1D_PACKED 2
    ret
1430
INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

; 8x8 inverse flipped ADST, 8bpc.  Shares iadst_8x8's main routines; the
; row flip is folded into the transpose interleave order (pass 1) and a
; register rotation plus negated rounding constant (pass 2).
cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call m(iadst_8x8_internal_8bpc).main_pass1
    vpbroadcastd         m5, [o(pw_m16384_16384)]
    punpckhwd            m4, m3, m2
    punpcklwd            m3, m2
    punpckhwd            m2, m1, m0
    punpcklwd            m1, m0
    punpckhwd            m0, m4, m3
    punpcklwd            m4, m3
    punpckhwd            m3, m2, m1
    punpcklwd            m2, m1
    REPX   {pmulhrsw x, m5}, m0, m4, m3, m2 ; scale + sign fixup
    vinserti32x4         m1, m0, xm3, 1
    vshufi32x4           m3, m0, m3, 0x03
    vinserti32x4         m0, m4, xm2, 1
    vshufi32x4           m2, m4, m2, 0x03
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call m(iadst_8x8_internal_8bpc).main_pass2
    vpbroadcastd         m4, [o(pw_2048)]
    vpbroadcastd        xm5, [o(pw_4096)]
    psubw                m4, m5 ; lower half = -2048, upper half = 2048
    ; rotate registers to reverse the row order
    vpermq               m5, m3, q2031
    vpermq               m3, m0, q2031
    vpermq               m0, m2, q2031
    vpermq               m2, m1, q2031
    pmulhrsw             m1, m0, m4
    pmulhrsw             m0, m5, m4
    jmp m(iadst_8x8_internal_8bpc).end3
1471
INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; 8x8 identity transform, 8bpc.  No scaling in either dimension beyond
; the final pw_4096 rounding (applied via the shared iadst_8x8 tail);
; pass 1 is just a transpose.
cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm3, [cq+16*0]
    mova                xm2, [cq+16*1]
    vinserti32x4         m3, [cq+16*4], 1
    vinserti32x4         m2, [cq+16*5], 1
    mova                xm4, [cq+16*2]
    mova                xm0, [cq+16*3]
    vinserti32x4         m4, [cq+16*6], 1
    vinserti32x4         m0, [cq+16*7], 1
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_4096)]
    jmp m(iadst_8x8_internal_8bpc).end
1498
; Declare the inv_txfm_add entry point for an 8x16 type1/type2 pair.
; The dct_dct dc-only case pre-scales the DC term once, sets r3d = 16
; (block height; eobd/r3d is known here, and the or sets the height bits
; for the shared loop counter) and tail-calls the 8x8 dc-only path,
; which loops r3d/8 times.
%macro INV_TXFM_8X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x16
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 16
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro
1511
; Load the 8x16 coefficient block into m0-m7 with rect2 scaling
; (pmulhrsw by 2896/32768).  cq is advanced by 32*4 so both halves can
; be addressed with short +/- displacements.
%macro ITX_8X16_LOAD_COEFS 0
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m0, m4, [cq+32*0]
    add                  cq, 32*4
    pmulhrsw             m7, m4, [cq+32*3]
    pmulhrsw             m1, m4, [cq-32*3]
    pmulhrsw             m6, m4, [cq+32*2]
    pmulhrsw             m2, m4, [cq-32*2]
    pmulhrsw             m5, m4, [cq+32*1]
    pmulhrsw             m3, m4, [cq-32*1]
    pmulhrsw             m4,     [cq+32*0]
%endmacro
1524
INIT_ZMM avx512icl
INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

; 8x16 inverse DCT, 8bpc.
; Pass 1: rect2 scaling + the 16x8 DCT main routine, then a word/dword
; interleave transpose with 16384/32768 inter-pass scaling.
; Pass 2: the 16-point DCT (IDCT16_1D_PACKED at ymm width via .main),
; then gather/scatter-based add-to-destination using a per-row offset
; table (gather8a scaled by the stride).
cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m3, [o(permB)]
    vpermq               m0, m3, [cq+64*0]
    vpbroadcastd         m4, [o(pw_2896x8)]
    vpermq               m1, m3, [cq+64*1]
    vpermq               m2, m3, [cq+64*2]
    vpermq               m3, m3, [cq+64*3]
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    call m(idct_16x8_internal_8bpc).main
    vpbroadcastd         m5, [o(pw_16384)]
    punpckhwd            m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd            m0, m2     ; a0 e0 a1 e1 a2 e2 a3 e3
    punpckhwd            m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
    punpcklwd            m1, m3     ; d0 h0 d1 h1 d2 h2 d3 h3
    REPX   {pmulhrsw x, m5}, m4, m0, m2, m1
    punpckhwd            m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
    punpcklwd            m0, m4     ; a0 b0 e0 f0 a1 b1 e1 f1
    punpckhwd            m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
    punpcklwd            m2, m1     ; c0 d0 g0 h0 c1 d1 g1 h1
    punpckhdq            m1, m0, m2 ;  1  5  9 13
    punpckldq            m0, m2     ;  0  4  8 12
    punpckldq            m2, m3, m4 ;  2  6 10 14
    punpckhdq            m3, m4     ;  3  7 11 15
    jmp                tx2q
.pass2:
    ; reorder rows into the input order expected by IDCT16_1D_PACKED
    vprord               m5, [o(int16_perm)], 16
    vshufi32x4           m2, m2, q1320     ;  2 10 14  6
    vshufi32x4           m4, m1, m3, q2310 ;  1  5 15 11
    vshufi32x4           m1, m3, q0132     ;  9 13  7  3
    vpermb               m9, m5, m0
    vpermb               m7, m5, m2
    vpermb               m8, m5, m4
    vpermb               m0, m5, m1
    vextracti32x8       ym6, m9, 1
    vextracti32x8       ym3, m7, 1
    vextracti32x8       ym5, m8, 1
    vextracti32x8       ym1, m0, 1
    call .main2
    ; merge the ymm outputs back into zmm registers in store order
    mova                ym8, [o(gather8a)]
    lea                  r3, [dstq+strideq*4]
    pmovzxdq             m9, ym8
    pshufd              ym8, ym8, q1230
    vpermt2q             m0, m9, m4
    vpermt2q             m1, m9, m5
    vpermt2q             m2, m9, m6
    vpermt2q             m3, m9, m7
.end:
    vpbroadcastd         m7, [o(pw_2048)]
.end2:
    pmulhrsw             m0, m7
    pmulhrsw             m1, m7
.end3:
    pmulhrsw             m2, m7
    pmulhrsw             m3, m7
.end4:
    ; gather 16 destination rows (8 via dstq, 8 via r3), add residuals,
    ; scatter back; gather/scatter consume their masks, hence the
    ; k1/k2 copy dance
    vpbroadcastd        ym6, strided
    kxnorb               k1, k1, k1
    pxor                 m4, m4
    pmulld              ym8, ym6
    kmovb                k2, k1
    vpgatherdq       m6{k1}, [dstq+ym8]
    kmovb                k1, k2
    vpgatherdq       m7{k2}, [r3+ym8]
    mova          [cq+64*0], m4
    mova          [cq+64*1], m4
    kmovb                k2, k1
    mova          [cq+64*2], m4
    mova          [cq+64*3], m4
    punpcklbw            m5, m6, m4
    punpckhbw            m6, m4
    paddw                m0, m5
    paddw                m1, m6
    packuswb             m0, m1
    vpscatterdq [dstq+ym8]{k1}, m0
    punpcklbw            m6, m7, m4
    punpckhbw            m7, m4
    paddw                m2, m6
    paddw                m3, m7
    packuswb             m2, m3
    vpscatterdq [r3+ym8]{k2}, m2
    RET
ALIGN function_align
cglobal_label .main_fast2 ; bottom three-quarters are zero
    ; with only in0/in1/in2/in3 present the first butterfly stages
    ; collapse to single multiplies by folded *x8 constants
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    vpbroadcastd        ym3, [o(pw_401_4076x8)]
    vpbroadcastd        ym5, [o(pw_799_4017x8)]
    vpbroadcastd        ym4, [o(pw_m1189_3920x8)]
    pxor                ym6, ym6
    punpckhwd           ym2, ym0, ym0
    pmulhrsw            ym2, ym3      ; t8a  t15a
    punpcklwd           ym7, ym1, ym1
    pmulhrsw            ym7, ym5      ; t4a  t7a
    punpckhwd           ym1, ym1
    pmulhrsw            ym4, ym1      ; t11a t12a
    vpcmpub              k7, ym13, ym10, 6
    punpcklwd           ym9, ym6, ym0
    psubsw              ym0, ym2, ym4 ; t11a t12a
    paddsw              ym8, ym2, ym4 ; t8a  t15a
    mova                ym1, ym7
    jmp .main5
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    pxor                ym6, ym6
    punpckhwd           ym8, ym0, ym0
    punpckhwd           ym4, ym3, ym3
    punpckhwd           ym5, ym2, ym2
    punpcklwd           ym7, ym1, ym1
    punpckhwd           ym1, ym1
    punpcklwd           ym3, ym3
    punpcklwd           ym9, ym6, ym0
    punpcklwd           ym6, ym2
    vpbroadcastd        ym2, [o(pw_401_4076x8)]
    vpbroadcastd        ym0, [o(pw_m2598_3166x8)]
    vpbroadcastd       ym11, [o(pw_1931_3612x8)]
    vpbroadcastd       ym12, [o(pw_m1189_3920x8)]
    pmulhrsw            ym8, ym2  ; t8a  t15a
    vpbroadcastd        ym2, [o(pw_799_4017x8)]
    pmulhrsw            ym0, ym4  ; t9a  t14a
    vpbroadcastd        ym4, [o(pw_m2276_3406x8)]
    pmulhrsw            ym5, ym11 ; t10a t13a
    pmulhrsw            ym1, ym12 ; t11a t12a
    pmulhrsw            ym7, ym2  ; t4a  t7a
    pmulhrsw            ym3, ym4  ; t5a  t6a
    vpcmpub              k7, ym13, ym10, 6
    jmp .main4
ALIGN function_align
cglobal_label .main
    WRAP_YMM IDCT16_1D_PACKED
    ret
1663
; 8x16 inverse ADST: per-type dispatch stubs (type1=adst, type2 varies)
INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity

;---------------------------------------------------------------------
; 8x16 inverse ADST, 8bpc.
; Pass 1 reuses iadst_16x8's pass-1 main on the transposed layout,
; then transposes/scales the result; pass 2 runs the packed 16-point
; ADST. .pass1_end and .pass2_end are shared entry points used by
; iflipadst_8x16, which arrives with different sign/permute constants
; (m7/m6 scale, m10 permute) already set up.
;---------------------------------------------------------------------
cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf1)]
    vpbroadcastd         m7, [o(pw_16384_m16384)]
    punpckhwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpcklwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m6     ; e0 f0 e1 f1 e2 f2 e3 f3
.pass1_end: ; shared with iflipadst_8x16 (m7 holds the per-type scale)
    REPX   {pmulhrsw x, m7}, m3, m5, m4, m2
    punpckldq            m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m5     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckhdq            m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m2, m4     ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m5
    punpckhqdq           m3, m5
    jmp                tx2q
.pass2:
    call .main_pass2
    vpbroadcastd         m6, [o(pw_2048)]
    psrlq               m10, 4
    psubw                m7, m8, m6  ; m7 = -m6 (negated rounding scale)
.pass2_end: ; shared with iflipadst_8x16 (m6/m7 and m10 differ per type)
    vpbroadcastd         m5, [o(pw_2896x8)]
    paddsw               m1, m2, m4
    psubsw               m2, m4
    pmulhrsw             m1, m5      ; -out7   out4   out6  -out5
    pmulhrsw             m5, m2      ;  out8  -out11 -out9   out10
    mova                ym8, [o(gather8c)]
    lea                  r3, [dstq+strideq]
    psrlq                m2, m10, 4
    vpermi2q             m2, m0, m3  ;  1  3 13 15
    vpermt2q             m0, m10, m3 ;  0  2 12 14
    psrlq                m3, m10, 8
    vpermi2q             m3, m1, m5  ;  5  7  9 11
    psrlq               m10, 12
    vpermt2q             m1, m10, m5 ;  4  6  8 10
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp m(idct_8x16_internal_8bpc).end3
ALIGN function_align
.main_pass1:
    ; Load the four 64-byte coefficient rows, pre-scale by 2896/32768*8
    ; (pw_2896x8), and permute into the packed input order for .main.
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m5, m2, [cq+64*0]
    pmulhrsw             m3, m2, [cq+64*3]
    pmulhrsw             m1, m2, [cq+64*1]
    pmulhrsw             m2,     [cq+64*2]
    movu                 m4, [o(permA+3)]
    psrlq               m10, m4, 4
    mova                 m6, m4
    vpermi2q             m4, m5, m3  ; in0  in12 in2  in14
    vpermt2q             m5, m10, m3 ; in15 in3  in13 in1
    vpermi2q             m6, m1, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m2 ; in11 in7  in9  in5
    jmp .main
ALIGN function_align
.main_pass2:
    ; Same input permutation as .main_pass1 but from registers m0-m3
    ; (pass-1 output) via permC instead of from memory.
    mova                 m4, [o(permC)]
    psrlq                m5, m4, 4
    vpermi2q             m4, m0, m2  ; in0  in12 in2  in14
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3  ; in15 in3  in13 in1
    psrlq               m10, m6, 4
    vpermi2q             m6, m0, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m3 ; in11 in7  in9  in5
.main:
    punpcklwd            m0, m4, m5  ; in0  in15 in2  in13
    punpckhwd            m4, m5      ; in12 in3  in14 in1
    punpcklwd            m5, m6, m1  ; in4  in11 in6  in9
    punpckhwd            m6, m1      ; in8  in7  in10 in5
cglobal_label .main2
    ; Packed 16-point ADST butterfly network; the ITX_MUL4X_PACK
    ; constants are the adst16 twiddle factors.
    vpbroadcastd         m9, [o(pd_2048)]
    vpbroadcastq        m13, [o(int_mshift)]
    kxnorb               k1, k1, k1
    vpcmpub              k7, m13, m9, 6 ; 0x33...
    pxor                 m8, m8
    ITX_MUL4X_PACK        0, 1, 2, 3, 7, 9,  201, 4091,  995, 3973, 5
    ITX_MUL4X_PACK        6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK        4, 1, 2, 3, 7, 9, 3857, 1380, 4052,  601, 5
    ITX_MUL4X_PACK        5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
    psubsw               m2, m0, m6 ; t9a  t8a  t11a t10a
    paddsw               m0, m6     ; t1a  t0a  t3a  t2a
    psubsw               m3, m5, m4 ; t13a t12a t15a t14a
    paddsw               m5, m4     ; t5a  t4a  t7a  t6a
    ITX_MUL4X_PACK        2, 4, 1, 6, 7, 9,  799, 4017, 3406, 2276, 5
    psubw                m7, m8, m7 ; negate the odd-lane multipliers
    ITX_MUL2X_PACK        3, 4, 1, 9, 7, 6, 4
    vpbroadcastd         m6, [o(pw_3784_m1567)]
    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
    psubsw               m1, m0, m5 ; t5   t4   t7   t6
    paddsw               m0, m5     ; t1   t0   t3   t2
    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
    vbroadcasti32x4      m5, [o(deint_shuf)]
    pshufb               m0, m5
    pshufb               m2, m5
    vshufi32x4           m3, m0, m2, q3232 ; t3   t2   t11a t10a
    vinserti32x8         m0, ym2, 1        ; t1   t0   t9a  t8a
    vshufi32x4           m2, m1, m4, q3232 ; t6a  t7a  t14  t15
    vinserti32x8         m1, ym4, 1        ; t5a  t4a  t13  t12
    pshufd               m2, m2, q1032     ; t7a  t6a  t15  t14
    psubsw               m4, m0, m3        ; t3a t2a t11 t10
    paddsw               m0, m3            ; -out15  out0   out14 -out1
    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
    psubsw               m1, m2            ; t7 t6 t15a t14a
    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
    ret
1781
; 8x16 inverse flip-ADST: per-type dispatch stubs
INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

;---------------------------------------------------------------------
; 8x16 inverse flip-ADST, 8bpc. Implemented on top of iadst_8x16:
; same butterflies, but with negated scale constants (pw_m16384_16384,
; m8-m7) and a shifted output permutation (psrlq m10, 36) to produce
; the flipped row order. Jumps into iadst_8x16's shared tails.
;---------------------------------------------------------------------
cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf2)]
    vpbroadcastd         m7, [o(pw_m16384_16384)]
    punpcklwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
    jmp m(iadst_8x16_internal_8bpc).pass1_end
.pass2:
    call m(iadst_8x16_internal_8bpc).main_pass2
    vpbroadcastd         m7, [o(pw_2048)]
    psrlq               m10, 36
    psubw                m6, m8, m7  ; m6 = -m7 (sign swapped vs. adst)
    jmp m(iadst_8x16_internal_8bpc).pass2_end
1802
; 8x16 inverse identity: per-type dispatch stubs
INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

;---------------------------------------------------------------------
; 8x16 inverse identity transform, 8bpc.
; Pass 1: transpose the 8x16 coefficients via int16_perm + dq/qdq
; interleaves, scaling by pw_2896x8. Pass 2: identity-16 scaling,
; out = 2*x + pmulhrsw(x, 1697*16), then the shared idct_8x16 store.
;---------------------------------------------------------------------
cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [o(int16_perm)]
    vpermb               m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
    vpermb               m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
    vpermb               m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
    vpermb               m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
    vpbroadcastd         m5, [o(pw_2896x8)]
    punpckldq            m1, m3, m2        ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m2            ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m2, m4, m0        ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m4, m0            ; e2 f2 g2 h2 e3 f3 g3 h3
    REPX   {pmulhrsw x, m5}, m1, m2, m3, m4
    punpcklqdq           m0, m1, m2        ; a0 b0 c0 d0 e0 f0 g0 h0
    punpckhqdq           m1, m2            ; a1 b1 c1 d1 e1 f1 g1 h1
    punpcklqdq           m2, m3, m4        ; a2 b2 c2 d2 e2 f2 g2 h2
    punpckhqdq           m3, m4            ; a3 b3 c3 d3 e3 f3 g3 h3
    jmp                tx2q
.pass2:
    vpbroadcastd         m7, [o(pw_1697x16)]
    mova                ym8, [o(gather8b)]
    lea                  r3, [dstq+strideq*2]
    pmulhrsw             m4, m7, m0
    pmulhrsw             m5, m7, m1
    pmulhrsw             m6, m7, m2
    pmulhrsw             m7, m3
    REPX      {paddsw x, x}, m0, m1, m2, m3 ; x *= 2
    paddsw               m0, m4
    paddsw               m1, m5
    paddsw               m2, m6
    paddsw               m3, m7
    jmp m(idct_8x16_internal_8bpc).end
1839
; Add two rows of 16 residuals to dst and store with clamping.
; %1/%2 may be either register numbers (%ifnum) or full operands.
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    pmovzxbw            m%3, [dstq+%5]  ; widen 16 dst pixels to words
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
    pmovzxbw            m%4, [dstq+%6]
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4        ; clamp to 0..255
    vpermq              m%3, m%3, q3120 ; undo packuswb's lane interleave
    mova          [dstq+%5], xm%3
    vextracti32x4 [dstq+%6], m%3, 1
%endmacro
1858
; Declare a 16x4 inverse-transform entry point. For dct_dct, emit the
; DC-only fast path: r6d = DC coefficient, the coefficient is
; overwritten with eobd, then control jumps to the shared 16x8
; dc-only scaling/store code.
%macro INV_TXFM_16X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x4
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
%endif
%endmacro
1867
INIT_ZMM avx512icl
; 16x4 inverse DCT: per-type dispatch stubs
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

;---------------------------------------------------------------------
; 16x4 inverse DCT, 8bpc. Pass 1 loads 8 xmm coefficient rows and
; reuses the 4x16 idct main, then gathers the results into zmm regs,
; scales by pw_16384 and transposes. Pass 2 is the packed 4-point
; IDCT, stored via the shared iadst_16x4 .end path.
;---------------------------------------------------------------------
cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm0, [cq+16*0]
    mova                xm1, [cq+16*1]
    mova                xm2, [cq+16*2]
    mova                xm3, [cq+16*3]
    mova                xm4, [cq+16*4]
    mova                xm5, [cq+16*5]
    mova                xm6, [cq+16*6]
    mova                xm7, [cq+16*7]
    call m(idct_4x16_internal_8bpc).main
    vpbroadcastd         m8, [o(pw_16384)]
    vinserti32x4        ym1, xm3, 1 ; 3 2   7 6
    vinserti32x4        ym5, xm7, 1 ; b a   f e
    vinserti32x4        ym0, xm2, 1 ; 0 1   4 5
    vinserti32x4        ym4, xm6, 1 ; 8 9   c d
    vinserti32x8         m1, ym5, 1 ; 3 2   7 6   b a   f e
    vinserti32x8         m0, ym4, 1 ; 0 1   4 5   8 9   c d
    pmulhrsw             m1, m8
    pmulhrsw             m0, m8
    pshufd               m1, m1, q1032
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    mova                 m2, [o(permA)]
    jmp m(iadst_16x4_internal_8bpc).end
1903
; 16x4 inverse ADST: per-type dispatch stubs
INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

;---------------------------------------------------------------------
; 16x4 inverse ADST, 8bpc. Pass 1 reuses iadst_4x16's main on the
; packed layout, finishes the 2896-scaled middle outputs with
; vpdpwssd, permutes and transposes. .pass1_end is shared with
; iflipadst_16x4 (m6/m0/m10 set up per type). .end/.end2/.end3 are
; the common scale+permute+store tail used by all 16x4 transforms.
;---------------------------------------------------------------------
cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_16384_m16384)]
    psrlq                m0, m10, 4
    psrlq               m10, 8
.pass1_end: ; shared with iflipadst_16x4
    punpcklwd           ym5, ym4, ym2
    punpckhwd           ym4, ym2
    vinserti32x8         m5, ym4, 1
    mova                 m1, m9
    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
    mova                 m4, m9
    vpdpwssd             m4, m5, [o(pw_2896_2896)] {1to16}
    psrad                m1, 12
    psrad                m4, 12
    packssdw             m1, m4 ;  out8  -out7  -out9   out6  -out11  out4   out10 -out5
    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp                tx2q
.pass2:
    call .main
    movu                 m2, [o(permA+1)]
.end: ; common 16x4 tail: round by pw_2048, then fall through
    vpbroadcastd         m3, [o(pw_2048)]
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
.end2: ; split rows into the two store registers via permA variants
    psrlq                m3, m2, 4
    vpermi2q             m2, m0, m1
    vpermi2q             m3, m0, m1
.end3: ; add residue to the 4 dst rows, clear coefficients, store
    lea                  r3, [dstq+strideq*2]
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [r3  +strideq*0], 2
    vinserti32x4         m1, [r3  +strideq*1], 3
    pxor                 m4, m4
    mova          [cq+64*0], m4
    mova          [cq+64*1], m4
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    paddw                m0, m2
    paddw                m1, m3
    packuswb             m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret
1972
; 16x4 inverse flip-ADST: per-type dispatch stubs
INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

;---------------------------------------------------------------------
; 16x4 inverse flip-ADST, 8bpc. Same datapath as iadst_16x4 with the
; sign-swapped scale (pw_m16384_16384) and shifted permute selectors
; (psrlq 12/16 instead of 4/8) for the flipped output order; the
; pass-2 store uses permA+2 instead of permA+1.
;---------------------------------------------------------------------
cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_m16384_16384)]
    psrlq                m0, m10, 12
    psrlq               m10, 16
    jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
    call m(iadst_16x4_internal_8bpc).main
    movu                m2, [o(permA+2)]
    jmp m(iadst_16x4_internal_8bpc).end
1992
; 16x4 inverse identity: per-type dispatch stubs
INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

;---------------------------------------------------------------------
; 16x4 inverse identity transform, 8bpc.
; Pass 1: x += pmulhrsw(pmulhrsw(x, 1697*16), 16384), then transpose
; via the idtx_16x4p byte permutation.
; Pass 2: x += pmulhrsw(x, 1697*8), stored via the shared iadst_16x4
; .end path.
;---------------------------------------------------------------------
cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m1, [cq+64*0]
    mova                 m2, [cq+64*1]
    vpbroadcastd         m3, [o(pw_1697x16)]
    vpbroadcastd         m4, [o(pw_16384)]
    mova                 m5, [o(idtx_16x4p)]
    shufps               m0, m1, m2, q2020
    shufps               m1, m2, q3131
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddsw               m0, m2
    paddsw               m1, m3
    vpermb               m0, m5, m0
    vpermb               m1, m5, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    movu                 m2, [o(permA+1)]
    jmp m(iadst_16x4_internal_8bpc).end
2023
; Declare a 16x8 inverse-transform entry point. For dct_dct, emit the
; DC-only path: r6d = DC coefficient, scaled in up to three stages
; (each imul by 181 with sar 8 is a multiply by 181/256 ~= 1/sqrt(2)),
; then broadcast and added to dst 16 pixels x 4 rows per iteration.
; The .dconly/.dconly2/.dconly3 labels are shared entry points used
; by other block sizes (e.g. 16x4 and 16x16 jump into them).
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x8
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd   ; overwrite DC coef with eobd
    or                  r3d, 8      ; r3d = row count
.dconly:
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
.dconly2:
    imul                r6d, 181
    add                 r6d, 128+256
    sar                 r6d, 8+1
.dconly3:
    imul                r6d, 181
    lea                  r2, [strideq*3]
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    pxor                 m2, m2
    vpbroadcastw         m3, r6d    ; broadcast DC value to all words
.dconly_loop:
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [dstq+strideq*2], 2
    vinserti32x4         m1, [dstq+r2       ], 3
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    paddw                m0, m3
    paddw                m1, m3
    packuswb             m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                 r3d, 4
    jg .dconly_loop
    RET
%endif
%endmacro
2065
; Load the eight 32-byte coefficient rows of a 16x8 block into m0-m7,
; pre-scaled by pw_2896x8. %1 selects the vpermq lane order applied
; to the odd-numbered rows (the even rows always use q3120).
%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpermq               m0, [cq+32*0], q3120
    add                  cq, 32*4
    vpermq               m7, [cq+32*3], q%1
    vpermq               m1, [cq-32*3], q%1
    vpermq               m6, [cq+32*2], q3120
    vpermq               m2, [cq-32*2], q3120
    vpermq               m5, [cq+32*1], q%1
    vpermq               m3, [cq-32*1], q%1
    vpermq               m4, [cq+32*0], q3120
    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro
2079
; 16x8 inverse DCT: per-type dispatch stubs
INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

;---------------------------------------------------------------------
; 16x8 inverse DCT, 8bpc. Pass 1 scales the coefficients by
; pw_2896x8, runs the 8x16 idct main, then shuffles/transposes and
; scales by pw_16384. Pass 2 is the packed 8-point IDCT followed by
; a permC-based row permutation. .end/.end2 are the shared
; round+store tail used by the other 16x8 transforms.
;---------------------------------------------------------------------
cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd         m1, [o(pw_2896x8)]
    vpermq               m0, [cq+64*0], q3120
    vpermq               m2, [cq+64*1], q3120
    vpermq               m4, [cq+64*2], q3120
    vpermq               m6, [cq+64*3], q3120
    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6
    vextracti32x8       ym1, m0, 1
    vextracti32x8       ym3, m2, 1
    vextracti32x8       ym5, m4, 1
    vextracti32x8       ym7, m6, 1
    call m(idct_8x16_internal_8bpc).main
    vbroadcasti32x4      m8, [o(int_shuf1)]
    vbroadcasti32x4      m9, [o(int_shuf2)]
    vinserti32x8         m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
    vinserti32x8         m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
    vinserti32x8         m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
    vinserti32x8         m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
    vpbroadcastd         m2, [o(pw_16384)]
    pshufb               m0, m8     ; a0 b0 a1 b1 a2 b2 a3 b3
    pshufb               m1, m9     ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
    pshufb               m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
    REPX   {pmulhrsw x, m2}, m0, m1, m6, m7
    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
    jmp                tx2q
.pass2:
    vshufi32x4           m0, m2, m4, q2020 ; 0 1
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020 ; 2 3
    vshufi32x4           m3, m5, q3131     ; 6 7
    call .main
    movshdup             m4, [o(permC)]
    psrlq                m6, m4, 4
    vpermq               m5, m4, q1032
    vpermi2q             m4, m0, m2 ; a2 a3   b2 b3   e2 e3   f2 f3
    vpermt2q             m0, m6, m2 ; a0 a1   b0 b1   e0 e1   f0 f1
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3 ; c2 c3   d2 d3   g2 g3   h2 h3
    vpermt2q             m1, m6, m3 ; c0 c1   d0 d1   g0 g1   h0 h1
    vpbroadcastd         m6, [o(pw_2048)]
.end: ; shared tail: round by m6, then add-to-dst/store below
    REPX   {pmulhrsw x, m6}, m0, m4, m1, m5
.end2: ; add m0/m4 and m1/m5 to the 8 dst rows, clear coefficients
    lea                  r3, [dstq+strideq*4]
    lea                  r4, [strideq*3]
    mova                xm3, [dstq+strideq*0]
    mova                xm6, [dstq+strideq*2]
    vinserti32x4        ym3, [dstq+strideq*1], 1
    vinserti32x4        ym6, [dstq+r4       ], 1
    vinserti32x4         m3, [r3  +strideq*0], 2
    vinserti32x4         m6, [r3  +strideq*2], 2
    vinserti32x4         m3, [r3  +strideq*1], 3
    vinserti32x4         m6, [r3  +r4       ], 3
    pxor                 m7, m7
    mova          [cq+64*0], m7
    mova          [cq+64*1], m7
    mova          [cq+64*2], m7
    mova          [cq+64*3], m7
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    paddw                m0, m2
    paddw                m4, m3
    packuswb             m0, m4
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    punpcklbw            m3, m6, m7
    punpckhbw            m6, m7
    paddw                m1, m3
    paddw                m5, m6
    packuswb             m1, m5
    mova          [dstq+strideq*2], xm1
    vextracti32x4 [dstq+r4       ], ym1, 1
    vextracti32x4 [r3  +strideq*2], m1, 2
    vextracti32x4 [r3  +r4       ], m1, 3
    RET
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret
2170
; 16x8 inverse ADST: per-type dispatch stubs
INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

;---------------------------------------------------------------------
; 16x8 inverse ADST, 8bpc. .main_pass1 (also used by the 8x16 adst/
; flipadst functions) loads and packs the coefficients and runs
; IADST8_1D_PACKED pass 1; .pass1_end finishes the 2896-scaled middle
; outputs with vpdpwssd and transposes, and is shared with
; iflipadst_16x8 (m7/m10 set up per type). .main_pass2 runs
; IADST8_1D_PACKED pass 2 and builds the +/- rounding constant in m6.
;---------------------------------------------------------------------
cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd         m7, [o(pw_16384_m16384)]
    psrlq               m10, 4
.pass1_end: ; shared with iflipadst_16x8
    punpcklwd            m5, m4, m2
    punpckhwd            m4, m2
    mova                 m1, m9
    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
    mova                 m6, m9
    vpdpwssd             m6, m5, [o(pw_2896_2896)] {1to16}
    mova                 m2, m9
    vpdpwssd             m2, m4, [o(pw_m2896_2896)] {1to16}
    vpdpwssd             m9, m4, [o(pw_2896_2896)] {1to16}
    psrad                m1, 12
    psrad                m6, 12
    packssdw             m1, m6 ;  out8  -out7  -out9   out6
    psrad                m2, 12
    psrad                m9, 12
    packssdw             m2, m9 ; -out11  out4   out10 -out5
    psrlq                m4, m10, 4
    vpermi2q             m4, m0, m2
    vpermt2q             m0, m10, m2
    psrlq                m5, m10, 8
    vpermi2q             m5, m1, m3
    psrlq               m10, 12
    vpermt2q             m1, m10, m3
    punpcklwd            m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
    punpckhwd            m4, m5     ; b0 d0 b1 d1 b2 d2 b3 d3
    punpcklwd            m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
    punpckhwd            m1, m0     ; j0 l0 j1 l1 j2 l2 j3 l3
    punpcklwd            m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhwd            m3, m4     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpcklwd            m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhwd            m5, m1     ; i2 j2 k2 l2 i3 j3 k3 l3
    REPX   {pmulhrsw x, m7}, m2, m3, m4, m5
    jmp                tx2q
.pass2:
    vshufi32x4           m0, m2, m4, q2020
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020
    vshufi32x4           m3, m5, q3131     ; 6 7
    pshufd               m4, m0, q1032     ; 1 0
    pshufd               m5, m1, q1032     ; 3 2
    call .main_pass2
    movshdup             m4, [o(permC)]
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    psrlq                m6, m4, 4
    mova                 m5, m4
    vpermi2q             m4, m0, m2
    vpermt2q             m0, m6, m2
    vpermi2q             m5, m1, m3
    vpermt2q             m1, m6, m3
    jmp m(idct_16x8_internal_8bpc).end2
ALIGN function_align
.main_pass1:
    ; Load the four coefficient rows scaled by pw_2896x8 and blend/
    ; permute them into the packed row pairs expected by IADST8.
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m3, m4, [cq+64*0]
    pmulhrsw             m1, m4, [cq+64*3]
    pmulhrsw             m2, m4, [cq+64*1]
    pmulhrsw             m4, [cq+64*2]
    mova                 m5, [o(int16_perm)]
    kxnorb               k1, k1, k1
    vpblendmd        m0{k1}, m1, m3 ; 0 7
    vmovdqa32        m3{k1}, m1     ; 6 1
    vpblendmd        m1{k1}, m4, m2 ; 2 5
    vmovdqa32        m2{k1}, m4     ; 4 3
    REPX  {vpermb x, m5, x}, m0, m1, m2, m3
    IADST8_1D_PACKED 1
    ret
ALIGN function_align
cglobal_label .main_pass2
    IADST8_1D_PACKED 2
    ; m6 = packed {round, -round} used to apply alternating signs.
    pxor                 m5, m5
    psubd                m5, m6
    packssdw             m6, m5
    pmulhrsw             m2, m6
    pmulhrsw             m3, m6
    ret
2256
; 16x8 inverse flip-ADST: per-type dispatch stubs
INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

;---------------------------------------------------------------------
; 16x8 inverse flip-ADST, 8bpc. Reuses iadst_16x8's pass-1 tail with
; the sign-swapped scale (pw_m16384_16384) and a shifted permute
; selector (psrlq m10, 20). Pass 2 runs the same main but swaps the
; register pairing and permC shifts to emit rows in flipped order.
;---------------------------------------------------------------------
cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd         m7, [o(pw_m16384_16384)]
    psrlq               m10, 20
    jmp m(iadst_16x8_internal_8bpc).pass1_end
.pass2:
    vshufi32x4           m0, m2, m4, q2020
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020
    vshufi32x4           m3, m5, q3131     ; 6 7
    pshufd               m4, m0, q1032     ; 1 0
    pshufd               m5, m1, q1032     ; 3 2
    call m(iadst_16x8_internal_8bpc).main_pass2
    movshdup             m4, [o(permC)]
    pmulhrsw             m5, m6, m0
    pmulhrsw             m0, m6, m1
    psrlq                m1, m4, 12
    psrlq                m4, 8
    mova                 m7, m4
    vpermi2q             m4, m0, m3
    vpermt2q             m0, m1, m3
    vpermi2q             m1, m5, m2
    vpermt2q             m5, m7, m2
    jmp m(idct_16x8_internal_8bpc).end2
2286
; 16x8 inverse identity: per-type dispatch stubs
INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

;---------------------------------------------------------------------
; 16x8 inverse identity transform, 8bpc.
; Pass 1: x += pmulhrsw(pmulhrsw(x, 1697*16), 16384) on 2896-scaled
; coefficients, then transpose via int8_permA. Pass 2: permute rows
; with permB and finish in idct_16x8's shared .end with a pw_4096
; scale.
;---------------------------------------------------------------------
cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd         m0, [o(pw_2896x8)]
    pmulhrsw             m3, m0, [cq+64*0]
    pmulhrsw             m4, m0, [cq+64*1]
    pmulhrsw             m5, m0, [cq+64*2]
    pmulhrsw             m0,     [cq+64*3]
    vpbroadcastd         m7, [o(pw_1697x16)]
    vpbroadcastd         m8, [o(pw_16384)]
    shufps               m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
    shufps               m3, m4, q3131     ; a2 a3 a6 a7 e2 e3 e6 e7
    shufps               m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
    shufps               m5, m0, q3131     ; i2 i3 i6 i7 m2 m3 m6 m7
    mova                 m9, [o(int8_permA)]
    pmulhrsw             m0, m7, m2
    pmulhrsw             m1, m7, m3
    pmulhrsw             m6, m7, m4
    pmulhrsw             m7, m5
    REPX   {pmulhrsw x, m8}, m0, m1, m6, m7
    paddsw               m2, m0
    paddsw               m3, m1
    paddsw               m4, m6
    paddsw               m5, m7
    REPX  {vpermb x, m9, x}, m2, m3, m4, m5
    jmp                tx2q
.pass2:
    mova                 m7, [o(permB)]
    vpbroadcastd         m6, [o(pw_4096)]
    vpermq               m0, m7, m2
    vpermq               m4, m7, m4
    vpermq               m1, m7, m3
    vpermq               m5, m7, m5
    jmp m(idct_16x8_internal_8bpc).end
2324
; Declare a 16x16 inverse-transform entry point. For dct_dct, emit
; the DC-only path: pre-scale the DC coefficient once by 181/256
; (~1/sqrt(2)) with extra rounding, then reuse the shared 16x8
; dconly3 scale/store loop with r3d = 16 rows.
%macro INV_TXFM_16X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x16
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd   ; overwrite DC coef with eobd
    or                  r3d, 16     ; r3d = row count
    imul                r6d, 181
    add                 r6d, 128+512
    sar                 r6d, 8+2
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
%endif
%endmacro
2337
; 16x16 inverse DCT: per-type dispatch stubs
INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
2342
2343cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2344    mova                 m7, [o(permB)]
2345    vpermq               m0, m7, [cq+64*0]
2346    vpermq               m1, m7, [cq+64*1]
2347    vpermq               m2, m7, [cq+64*2]
2348    vpermq               m3, m7, [cq+64*3]
2349    vpermq               m4, m7, [cq+64*4]
2350    vpermq               m5, m7, [cq+64*5]
2351    vpermq               m6, m7, [cq+64*6]
2352    vpermq               m7, m7, [cq+64*7]
2353    call .main
2354    vbroadcasti32x4     m12, [o(int_shuf1)]
2355    vbroadcasti32x4     m11, [o(int_shuf2)]
2356    vpbroadcastd        m13, [o(pw_8192)]
2357    pshufb               m0, m12
2358    pshufb               m8, m1, m11
2359    pshufb               m2, m12
2360    pshufb               m9, m3, m11
2361    pshufb               m4, m12
2362    pshufb              m10, m5, m11
2363    pshufb               m6, m12
2364    pshufb              m11, m7, m11
2365    REPX  {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
2366    punpckhdq            m1, m0, m8
2367    punpckldq            m0, m8
2368    punpckhdq            m3, m2, m9
2369    punpckldq            m2, m9
2370    punpckhdq            m5, m4, m10
2371    punpckldq            m4, m10
2372    punpckhdq            m7, m6, m11
2373    punpckldq            m6, m11
2374    jmp                tx2q
2375.pass2:
2376    vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
2377    vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
2378    vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
2379    vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
2380    vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
2381    vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
2382    vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
2383    vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
2384    vshufi32x4           m2, m0, m4, q3131 ;  4  5
2385    vshufi32x4           m0, m4, q2020     ;  0  1
2386    vshufi32x4           m4, m6, m8, q2020 ;  8  9
2387    vshufi32x4           m6, m8, q3131     ; 12 13
2388    vshufi32x4           m3, m1, m5, q3131 ;  6  7
2389    vshufi32x4           m1, m5, q2020     ;  2  3
2390    vshufi32x4           m5, m7, m9, q2020 ; 10 11
2391    vshufi32x4           m7, m9, q3131     ; 14 15
2392    call .main
2393    mova                  m8, [o(permD)]
2394    psrlq                m12, m8, 4
2395    psrlq                 m9, m8, 8
2396    psrlq                m13, m8, 12
2397    mova                 m10, m8
2398    vpermi2q              m8, m0, m2 ;  0  1  4  5
2399    vpermt2q              m0, m12, m2
2400    mova                 m11, m9
2401    vpermi2q              m9, m1, m3 ;  2  3  6  7
2402    vpermt2q              m1, m13, m3
2403    vpermi2q             m10, m4, m6 ;  8  9 12 13
2404    vpermt2q              m4, m12, m6
2405    vpermi2q             m11, m5, m7 ; 10 11 14 15
2406    vpermt2q              m5, m13, m7
2407.end:
2408    vpbroadcastd        m12, [o(pw_2048)]
2409.end2:
2410    REPX  {pmulhrsw x, m12}, m0, m1, m4, m5
2411.end3:
2412    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11
2413    lea                  r3, [strideq*3]
2414    lea                  r4, [dstq+strideq*4]
2415    lea                  r5, [dstq+strideq*8]
2416    lea                  r6, [r4  +strideq*8]
2417    mova                xm3, [dstq+strideq*0]
2418    mova                xm6, [dstq+strideq*2]
2419    vinserti32x4        ym3, [dstq+strideq*1], 1
2420    vinserti32x4        ym6, [dstq+r3       ], 1
2421    vinserti32x4         m3, [r4+strideq*0], 2
2422    vinserti32x4         m6, [r4+strideq*2], 2
2423    vinserti32x4         m3, [r4+strideq*1], 3
2424    vinserti32x4         m6, [r4+r3       ], 3
2425    mova               xm12, [r5+strideq*0]
2426    mova               xm13, [r5+strideq*2]
2427    vinserti32x4       ym12, [r5+strideq*1], 1
2428    vinserti32x4       ym13, [r5+r3       ], 1
2429    vinserti32x4        m12, [r6+strideq*0], 2
2430    vinserti32x4        m13, [r6+strideq*2], 2
2431    vinserti32x4        m12, [r6+strideq*1], 3
2432    vinserti32x4        m13, [r6+r3       ], 3
2433    pxor                 m7, m7
2434    REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
2435    punpcklbw            m2, m3, m7
2436    punpckhbw            m3, m7
2437    paddw                m0, m2
2438    paddw                m8, m3
2439    packuswb             m0, m8
2440    punpcklbw            m2, m6, m7
2441    punpckhbw            m6, m7
2442    paddw                m1, m2
2443    paddw                m9, m6
2444    packuswb             m1, m9
2445    punpcklbw            m2, m12, m7
2446    punpckhbw           m12, m7
2447    paddw                m2, m4
2448    paddw               m10, m12
2449    packuswb             m2, m10
2450    punpcklbw            m3, m13, m7
2451    punpckhbw           m13, m7
2452    paddw                m3, m5
2453    paddw               m11, m13
2454    packuswb             m3, m11
2455    mova          [dstq+strideq*0], xm0
2456    vextracti32x4 [dstq+strideq*1], ym0, 1
2457    mova          [dstq+strideq*2], xm1
2458    vextracti32x4 [dstq+r3       ], ym1, 1
2459    vextracti32x4 [r4+strideq*0], m0, 2
2460    vextracti32x4 [r4+strideq*1], m0, 3
2461    vextracti32x4 [r4+strideq*2], m1, 2
2462    vextracti32x4 [r4+r3       ], m1, 3
2463    mova          [r5+strideq*0], xm2
2464    vextracti32x4 [r5+strideq*1], ym2, 1
2465    mova          [r5+strideq*2], xm3
2466    vextracti32x4 [r5+r3       ], ym3, 1
2467    vextracti32x4 [r6+strideq*0], m2, 2
2468    vextracti32x4 [r6+strideq*1], m2, 3
2469    vextracti32x4 [r6+strideq*2], m3, 2
2470    vextracti32x4 [r6+r3       ], m3, 3
2471    RET
2472ALIGN function_align
2473cglobal_label .main_fast2 ; bottom three-quarters are zero
2474    vpbroadcastd        m10, [o(pd_2048)]
2475    vpbroadcastq        m13, [o(int_mshift)]
2476    vpcmpub              k7, m13, m10, 6
2477.main_fast4:
2478    vpbroadcastd         m2, [o(pw_401_4076x8)]
2479    vpbroadcastd         m4, [o(pw_m1189_3920x8)]
2480    vpbroadcastd         m3, [o(pw_799_4017x8)]
2481    pmulhrsw             m2, m8     ; t8a  t15a
2482    pmulhrsw             m4, m1     ; t11a t12a
2483    pmulhrsw             m7, m3     ; t4a  t7a
2484    pxor                 m6, m6
2485    psubsw               m0, m2, m4 ; t11a t12a
2486    paddsw               m8, m2, m4 ; t8a  t15a
2487    mova                 m1, m7
2488    jmp .main5
2489ALIGN function_align
2490cglobal_label .main_fast ; bottom half is zero
2491    vpbroadcastd        m10, [o(pd_2048)]
2492.main_fast3:
2493    vpbroadcastq        m13, [o(int_mshift)]
2494    vpcmpub              k7, m13, m10, 6
2495.main_fast5:
2496    vpbroadcastd         m2, [o(pw_401_4076x8)]
2497    vpbroadcastd         m4, [o(pw_m2598_3166x8)]
2498    vpbroadcastd        m11, [o(pw_1931_3612x8)]
2499    vpbroadcastd        m12, [o(pw_m1189_3920x8)]
2500    pmulhrsw             m8, m2  ; t8a  t15a
2501    vpbroadcastd         m2, [o(pw_799_4017x8)]
2502    pmulhrsw             m0, m4  ; t9a  t14a
2503    vpbroadcastd         m4, [o(pw_m2276_3406x8)]
2504    pmulhrsw             m5, m11 ; t10a t13a
2505    pmulhrsw             m1, m12 ; t11a t12a
2506    pmulhrsw             m7, m2  ; t4a  t7a
2507    pmulhrsw             m3, m4  ; t5a  t6a
2508    jmp .main4
2509ALIGN function_align
2510cglobal_label .main
2511    IDCT16_1D_PACKED
2512    ret
2513
; Instantiate the 16x16 adst entry points: first-pass adst paired with each
; supported second-pass transform (INV_TXFM_16X16_FN is defined earlier in
; this file).
INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst
2517
; 16x16 inverse ADST, 8bpc. Pass 1 runs the packed transform (.main_pass1),
; then transposes 4x4 word groups with punpck{l,h}wd; the pw_8192_m8192
; rounding constant has alternating signs, which folds the adst output
; negations into the rounding multiply.
cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call .main_pass1
    vpbroadcastd        m10, [o(pw_8192_m8192)]
    punpcklwd            m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
    punpckhwd            m0, m1     ; a0 c0 a1 c1 a2 c2 a3 c3
    punpckhwd            m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpcklwd            m0, m8     ; a0 b0 c0 d0 a1 b1 c1 d1
    punpcklwd            m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
    punpckhwd            m2, m3     ; e0 g0 e1 g1 e2 g2 e3 g3
    punpckhwd            m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
    punpcklwd            m2, m8     ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhwd            m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3
    punpcklwd            m4, m5     ; j0 l0 j1 l1 j2 l2 j3 l3
    punpckhwd            m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
    punpcklwd            m4, m8     ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhwd            m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3
    punpcklwd            m6, m7     ; n0 p0 n1 p1 n2 p2 n3 p3
    punpckhwd            m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
    punpcklwd            m6, m8     ; m0 n0 o0 p0 m1 n1 o1 p1
.pass1_end: ; shared with iflipadst: round/negate and continue to pass 2
    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    jmp                tx2q
.pass2:
    call .main_pass2
    ; Gather output rows into 128-bit-lane order using qword permutes
    ; derived from permD by shifting (each shift selects a rotated index set).
    mova                m10, [o(permD)]
    psrlq                m8, m10, 8
    psrlq               m12, m10, 12
    psrlq               m13, m10, 4
    mova                 m9, m8
    vpermi2q             m8, m0, m2 ;  0  1  4  5
    vpermt2q             m0, m12, m2
    vpermi2q             m9, m1, m3 ;  2  3  6  7
    vpermt2q             m1, m12, m3
    vpbroadcastd        m12, [o(pw_2048)]
    mov                 r3d, 0xff00ff00
    mova                m11, m10
    vpermi2q            m10, m4, m6 ;  8  9 12 13
    vpermt2q             m4, m13, m6
    kmovd                k1, r3d
    vpermi2q            m11, m5, m7 ; 10 11 14 15
    vpermt2q             m5, m13, m7
    pxor                 m7, m7
    vpsubw          m12{k1}, m7, m12 ; negate rounding constant in masked words
    jmp m(idct_16x16_internal_8bpc).end2
ALIGN function_align
; Pass-1 worker: load all 8 coefficient rows in permuted order (permB and its
; shifted variant), run the packed 16-point ADST (.main), then finish
; out4..out11 with 2896-scale dot products. m10 holds pd_2048, so each
; vpdpwssd accumulates rounding + a*2896 +/- b*2896 before the >>12.
.main_pass1:
    mova                 m4, [o(permB)]
    psrlq                m3, m4, 4
    vpermq               m0, m4, [cq+64*0]
    vpermq               m7, m3, [cq+64*7]
    vpermq               m6, m4, [cq+64*6]
    vpermq               m1, m3, [cq+64*1]
    vpermq               m2, m4, [cq+64*2]
    vpermq               m5, m3, [cq+64*5]
    vpermq               m4, m4, [cq+64*4]
    vpermq               m3, m3, [cq+64*3]
    call .main
    vpbroadcastd        m13, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    mova                 m2, m10
    vpdpwssd             m2, m5, m13       ; -out5
    mova                 m8, m10
    vpdpwssd             m8, m11, m13      ;  out4
    mova                 m9, m10
    vpdpwssd             m9, m5, m12       ;  out10
    mova                 m5, m10
    vpdpwssd             m5, m11, m12      ; -out11
    mova                m11, m10
    vpdpwssd            m11, m3, m13       ; -out7
    mova                m14, m10
    vpdpwssd            m14, m4, m13       ;  out6
    mova                m13, m10
    vpdpwssd            m13, m3, m12       ;  out8
    vpdpwssd            m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
    REPX      {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
    packssdw             m2, m8            ; -out5   out4
    packssdw             m5, m9, m5        ;  out10 -out11
    packssdw             m3, m11, m14      ; -out7   out6
    packssdw             m4, m13, m10      ;  out8  -out9
    ret
ALIGN function_align
; Pass-2 worker: rearrange the pass-1 transpose layout (128-bit lanes) into
; row-pair order, then fall into .main_pass2b which swaps word pairs and runs
; the packed ADST. out4..out11 are derived from the t-values with a single
; pw_2896x8 rounding multiply instead of the dot products used in pass 1.
.main_pass2:
    vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
    vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
    vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
    vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
    vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
    vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
    vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
    vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
    vshufi32x4           m2, m0, m4, q3131 ;  4  5
    vshufi32x4           m0, m4, q2020     ;  0  1
    vshufi32x4           m4, m6, m8, q2020 ;  8  9
    vshufi32x4           m6, m8, q3131     ; 12 13
    vshufi32x4           m3, m1, m5, q3131 ;  6  7
    vshufi32x4           m1, m5, q2020     ;  2  3
    vshufi32x4           m5, m7, m9, q2020 ; 10 11
    vshufi32x4           m7, m9, q3131     ; 14 15
cglobal_label .main_pass2b ; entry used when lanes are already in row order
    REPX {pshufd x, x, q1032}, m1, m3, m5, m7
    call .main
    vpbroadcastd         m8, [o(pw_2896x8)]
    pshufb               m2, m11, m12
    pshufb               m5, m12
    pshufb               m3, m12
    pshufb               m4, m12
    punpcklqdq           m9, m5, m2        ;  t15a   t7
    punpckhqdq           m5, m2            ;  t14a   t6
    shufps               m2, m3, m4, q1032 ;  t2a    t10
    shufps               m3, m4, q3210     ;  t3a    t11
    psubsw               m4, m2, m3        ;  out8  -out9
    paddsw               m3, m2            ; -out7   out6
    paddsw               m2, m5, m9        ; -out5   out4
    psubsw               m5, m9            ;  out10 -out11
    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
    ret
ALIGN function_align
; Packed 16-point ADST core. Inputs in0..in15 arrive two-per-register; the
; initial punpck stage pairs each low input with its mirrored high input.
; ITX_MUL2X_PACK performs the paired rotations (constants are the AV1 adst
; cosine values). On return, out4..out11 are still in t-value form (m3/m4/m5
; plus m11/m13/m14, with m10 = pd_2048); the callers finish them.
.main:
    vpbroadcastd        m10, [o(pd_2048)]
    vpbroadcastq        m13, [o(int_mshift)]
    punpckhwd            m8, m7, m0 ; in14 in1
    punpcklwd            m0, m7     ; in0  in15
    punpcklwd            m7, m6, m1 ; in12 in3
    punpckhwd            m1, m6     ; in2  in13
    punpckhwd            m6, m5, m2 ; in10 in5
    punpcklwd            m2, m5     ; in4  in11
    punpcklwd            m5, m4, m3 ; in8  in7
    punpckhwd            m3, m4     ; in6  in9
    vpcmpub              k7, m13, m10, 6 ; 0x33...
    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 5 ; t0  t1
    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 5 ; t2  t3
    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 5 ; t4  t5
    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 5 ; t6  t7
    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 5 ; t8  t9
    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 5 ; t14 t15
    psubsw               m4, m0, m5 ; t9a  t8a
    paddsw               m0, m5     ; t1a  t0a
    psubsw               m5, m1, m6 ; t11a t10a
    paddsw               m1, m6     ; t3a  t2a
    psubsw               m6, m2, m7 ; t13a t12a
    paddsw               m2, m7     ; t5a  t4a
    psubsw               m7, m3, m8 ; t15a t14a
    paddsw               m3, m8     ; t7a  t6a
    ITX_MUL2X_PACK        4, 8, 9, 10, 799,       4017,        4 ; t8  t9
    ITX_MUL2X_PACK        6, 8, 9, 10, 799_4017,  4017_m799,  52 ; t12 t13
    ITX_MUL2X_PACK        5, 8, 9, 10, 3406,      2276,        4 ; t10 t11
    ITX_MUL2X_PACK        7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
    psubsw               m8, m1, m3 ; t7   t6
    paddsw               m1, m3     ; t3   t2
    psubsw               m3, m0, m2 ; t5   t4
    paddsw               m0, m2     ; t1   t0
    psubsw               m2, m5, m7 ; t14a t15a
    paddsw               m7, m5     ; t10a t11a
    psubsw               m5, m4, m6 ; t12a t13a
    paddsw               m4, m6     ; t8a  t9a
    ITX_MUL2X_PACK        3, 6, 9, 10, 1567,       3784,        5 ; t5a t4a
    ITX_MUL2X_PACK        8, 6, 9, 10, 3784_m1567, 1567_3784,  52 ; t7a t6a
    ITX_MUL2X_PACK        2, 6, 9, 10, 3784,       1567,        4 ; t15 t14
    ITX_MUL2X_PACK        5, 6, 9, 10, 3784_1567,  1567_m3784, 52 ; t13 t12
    vbroadcasti32x4     m12, [o(deint_shuf)]
    paddsw               m6, m4, m7        ; -out1  out14
    psubsw               m4, m7            ;  t10    t11
    psubsw              m11, m3, m8        ;  t7     t6
    paddsw               m8, m3            ;  out12 -out3
    psubsw               m3, m0, m1        ;  t3a    t2a
    paddsw               m0, m1            ; -out15  out0
    paddsw               m1, m2, m5        ; -out13  out2
    psubsw               m5, m2            ;  t15a   t14a
    pshufb               m0, m12
    pshufb               m6, m12
    pshufb               m8, m12
    pshufb               m1, m12
    shufps               m7, m6, m0, q1032 ;  out14 -out15
    shufps               m0, m6, m0, q3210 ; -out1   out0
    punpcklqdq           m6, m8, m1        ;  out12 -out13
    punpckhqdq           m1, m8, m1        ; -out3   out2
    ret
2697
; Instantiate the 16x16 flipadst entry points (first-pass flipadst paired
; with each supported second-pass type).
INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst
2701
; 16x16 inverse flip-ADST, 8bpc. Reuses the adst workers; the flip is
; implemented by transposing with the row order reversed (note the mirrored
; register pairing vs. iadst) and by using sign-swapped rounding constants
; (pw_m8192_8192, and mask 0x00ff00ff instead of adst's 0xff00ff00 in pass 2).
cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x16_internal_8bpc).main_pass1
    vpbroadcastd        m10, [o(pw_m8192_8192)]
    punpcklwd            m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
    punpckhwd            m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
    punpckhwd            m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
    punpcklwd            m7, m6     ; b0 d0 b1 d1 b2 d2 b3 d3
    punpcklwd            m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhwd            m1, m7     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpcklwd            m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
    punpckhwd            m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
    punpcklwd            m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
    punpckhwd            m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
    punpckhwd            m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
    punpcklwd            m5, m4     ; f0 h0 f1 h1 f2 h2 f3 h3
    punpcklwd            m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhwd            m3, m5     ; e2 f2 g2 h2 e3 f3 g3 h3
    punpcklwd            m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhwd            m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
    jmp m(iadst_16x16_internal_8bpc).pass1_end
.pass2:
    call m(iadst_16x16_internal_8bpc).main_pass2
    mova                m10, [o(permD)]
    psrlq                m8, m10, 8
    psrlq               m12, m10, 12
    psrlq               m13, m10, 4
    mova                 m9, m8
    vpermi2q             m8, m7, m5 ;  0  1  4  5
    vpermt2q             m7, m12, m5
    vpermi2q             m9, m6, m4 ;  2  3  6  7
    vpermt2q             m6, m12, m4
    vpbroadcastd        m12, [o(pw_2048)]
    mov                 r3d, 0x00ff00ff
    mova                m11, m10
    vpermi2q            m10, m3, m1 ;  8  9 12 13
    vpermt2q             m3, m13, m1
    kmovd                k1, r3d
    vpermi2q            m11, m2, m0 ; 10 11 14 15
    vpermt2q             m2, m13, m0
    pxor                 m0, m0
    vpsubw          m12{k1}, m0, m12 ; negate rounding constant in masked words
    pmulhrsw             m0, m7, m12
    pmulhrsw             m1, m6, m12
    pmulhrsw             m4, m3, m12
    pmulhrsw             m5, m2, m12
    jmp m(idct_16x16_internal_8bpc).end3
2748
; Instantiate the 16x16 identity entry points.
INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity
2751
; 16x16 identity transform, 8bpc, pass 1. Loads rows pre-interleaved via
; int16_perm, then applies the identity-16 scaling: y = x + round(x*1697/32768)
; averaged back in (pmulhrsw by pw_1697x16, psraw 1, pavgw) — i.e. the scale
; plus the pass-1 rounding folded together. Ends with a dword-transpose step.
cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m8, [o(int16_perm)]
    vpermb               m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
    vpermb               m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
    vpbroadcastd         m0, [o(pw_1697x16)]
    vpermb               m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
    vpermb               m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
    vpermb               m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
    vpermb               m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
    vpermb               m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
    vpermb               m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
    pmulhrsw             m9, m0, m1
    pmulhrsw            m10, m0, m2
    pmulhrsw            m11, m0, m3
    pmulhrsw            m12, m0, m4
    pmulhrsw            m13, m0, m5
    pmulhrsw            m14, m0, m6
    pmulhrsw            m15, m0, m7
    pmulhrsw             m0, m8
    REPX       {psraw x, 1}, m9, m10, m11, m12
    pavgw                m1, m9
    pavgw                m2, m10
    pavgw                m3, m11
    pavgw                m4, m12
    REPX       {psraw x, 1}, m13, m14, m15, m0
    pavgw                m5, m13
    pavgw                m6, m14
    pavgw                m7, m15
    pavgw                m8, m0
    punpckldq            m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m1, m2     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m3, m4     ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m5, m6     ; i2 j2 k2 l2 i3 j3 k3 l3
    punpckldq            m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
    punpckhdq            m7, m8     ; m2 n2 o2 p2 m3 n3 o3 p3
    jmp                tx2q
ALIGN function_align
; Identity pass 2: y = 2*x + round(x*1697/32768) (paddsw x,x doubles, then the
; pmulhrsw product is added), followed by the permD lane reordering shared
; with the idct path; exits through the common idct store code.
.pass2:
    vpbroadcastd        m11, [o(pw_1697x16)]
    pmulhrsw            m12, m11, m0
    pmulhrsw            m13, m11, m1
    pmulhrsw            m14, m11, m2
    pmulhrsw            m15, m11, m3
    pmulhrsw             m8, m11, m4
    pmulhrsw             m9, m11, m5
    pmulhrsw            m10, m11, m6
    pmulhrsw            m11, m7
    REPX      {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    paddsw               m0, m12
    paddsw               m1, m13
    paddsw               m2, m14
    paddsw               m3, m15
    paddsw               m8, m4
    movu                 m4, [o(permD+2)]
    paddsw               m9, m5
    paddsw               m6, m10
    paddsw               m7, m11
    psrlq               m12, m4, 4
    mova                 m5, m4
    mova                m10, m4
    mova                m11, m4
    vpermi2q             m4, m0, m2  ;  8  9 12 13
    vpermt2q             m0, m12, m2 ;  0  1  4  5
    vpermi2q             m5, m1, m3  ; 10 11 14 15
    vpermt2q             m1, m12, m3 ;  2  3  6  7
    vpermi2q            m10, m8, m6
    vpermt2q             m8, m12, m6
    vpermi2q            m11, m9, m7
    vpermt2q             m9, m12, m7
    jmp m(idct_16x16_internal_8bpc).end
2824
; Duplicate each word of src into both halves of a dword (punpck{l,h}wd with
; itself), then rounding-multiply the low half by the pw_%5_%6x8 coefficient
; pair and the high half by pw_%7_%8x8. Used by the 8x32 fast paths to seed
; two t-value pairs from a single input register.
%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
    vpbroadcastd        m%4, [o(pw_%5_%6x8)]
    punpcklwd           m%1, m%3, m%3
    pmulhrsw            m%1, m%4
    vpbroadcastd        m%4, [o(pw_%7_%8x8)]
    punpckhwd           m%2, m%3, m%3
    pmulhrsw            m%2, m%4
%endmacro
2833
; 8x32 inverse DCT+add, 8bpc. Three paths chosen by eob:
;   eob == 0            -> .dconly (DC-only shortcut)
;   eob <  107          -> .fast   (right half of coefficients is zero)
;   otherwise           -> full path below: shared 8-point row transform,
;     vpermb/punpck transpose into column order, 16-point idct for the even
;     half, then .main2 for the 32-point odd half.
cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
%undef cmp
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly
    cmp                eobd, 107
    jb .fast
    mova                 m5, [cq+64*5]
    mova                 m3, [cq+64*3]
    mova                 m1, [cq+64*1]
    mova                 m7, [cq+64*7]
    mova                 m2, [cq+64*2]
    mova                 m6, [cq+64*6]
    mova                 m0, [cq+64*0]
    mova                 m4, [cq+64*4]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    mova                 m8, [o(idct_8x32p)]
    vpbroadcastd         m9, [o(pw_8192)]
    REPX  {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
    punpckldq            m8, m0, m1 ; ab
    punpckhdq            m0, m1
    punpckldq            m1, m2, m3 ; cd
    punpckhdq            m2, m3
    punpckldq            m3, m4, m5 ; ef
    punpckhdq            m4, m5
    punpckldq            m5, m6, m7 ; gh
    punpckhdq            m6, m7
    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
    punpcklqdq          m18, m8, m1 ; 30  2    6 26   31  1   23  9
    punpckhqdq          m14, m8, m1 ; 16  0   12 20    3 29   11 21
    punpcklqdq          m21, m0, m2 ; 14 18   22 10   27  5   19 13
    punpckhqdq          m15, m0, m2 ; 18  4   24  8    7 25   15 17
    punpcklqdq          m20, m3, m5
    punpckhqdq          m16, m3, m5
    punpcklqdq          m19, m4, m6
    punpckhqdq          m17, m4, m6
    vinserti32x4        ym8, ym18, xm20, 1
    vshufi32x4          ym1, ym18, ym20, 0x03
    vinserti32x4        ym9, ym14, xm16, 1
    vshufi32x4          ym3, ym14, ym16, 0x03
    vinserti32x4        ym0, ym21, xm19, 1
    vshufi32x4          ym5, ym21, ym19, 0x03
    vinserti32x4        ym7, ym15, xm17, 1
    vshufi32x4          ym6, ym15, ym17, 0x03
    call m(idct_8x16_internal_8bpc).main2
    ; Split odd-half inputs into ymm halves for the INIT_YMM .main2 below.
    psrlq               m12, [o(permB)], 60
    vpermt2q            m14, m12, m16
    vpermt2q            m21, m12, m19
    vpermt2q            m15, m12, m17
    vpermi2q            m12, m18, m20
    vextracti32x8      ym16, m14, 1
    vextracti32x8      ym19, m21, 1
    vextracti32x8      ym17, m15, 1
    vextracti32x8      ym20, m12, 1
    call .main2
    jmp .end
.fast: ; right half is zero
    ; Only the left 8x16 quadrant of coefficients is nonzero: run a 16x8 row
    ; transform, transpose with int_shuf3/4 + punpck, then use the _fast
    ; column variants (which assume the missing inputs are zero).
    mova                 m0, [o(int16_perm)]
    mova                ym2, [cq+64*4]
    vinserti32x8         m2, [cq+64*0], 1
    mova                ym3, [cq+64*6]
    vinserti32x8         m3, [cq+64*2], 1
    mova                ym4, [cq+64*3]
    vinserti32x8         m4, [cq+64*5], 1
    mova                ym5, [cq+64*7]
    vinserti32x8         m5, [cq+64*1], 1
    REPX  {vpermb x, m0, x}, m2, m3, m4, m5
    call m(idct_16x8_internal_8bpc).main2
    vbroadcasti32x4      m4, [o(int_shuf3)]
    vbroadcasti32x4      m5, [o(int_shuf4)]
    pshufb               m2, m4     ; e0 f0 e2 f2 e1 f1 e3 f3
    pshufb               m3, m5     ; g0 h0 g2 h2 g1 h1 g3 h3
    pshufb               m0, m4     ; a0 b0 a2 b2 a1 b1 a3 b3
    pshufb               m1, m5     ; c0 d0 c2 d2 c1 d1 c3 d3
    vpbroadcastd         m4, [o(pw_8192)]
    psrlq                m5, [o(permB)], 60
    punpckldq            m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
    punpckhdq           m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
    punpckhdq           m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
    REPX   {pmulhrsw x, m4}, m6, m17, m2, m16
    vinserti32x4        ym0, ym2, xm6, 1      ;  0  2
    vshufi32x4          ym1, ym2, ym6, 0x03   ;  4  6
    vinserti32x4       ym14, ym16, xm17, 1    ;  1  3
    vshufi32x4         ym15, ym16, ym17, 0x03 ;  5  7
    vpermt2q             m2, m5, m6           ;  8 10
    vpermt2q            m16, m5, m17          ;  9 11
    vextracti32x8       ym3, m2, 1            ; 12 14
    vextracti32x8      ym17, m16, 1           ; 13 15
    call m(idct_8x16_internal_8bpc).main_fast
    call .main_fast
.end:
    ; Final store: gather 8 destination rows at a time with vpgatherdq (the
    ; k-mask must be refreshed before every gather/scatter since each use
    ; consumes it), add the rounded residual, clamp with packuswb, and
    ; scatter back. Also zeroes the coefficient buffer.
    vpbroadcastd        ym8, strided
    pmulld              ym8, [o(gather8d)]
    call .main_end
    lea                  r3, [dstq+strideq*4]
    kxnorb               k1, k1, k1
    lea                  r4, [dstq+strideq*8]
    pxor                 m9, m9
    lea                  r1, [r3+strideq*8]
    kmovb                k2, k1
    vpgatherdq      m12{k1}, [r0+ym8]
    kmovb                k1, k2
    vpgatherdq      m13{k2}, [r3+ym8]
    kmovb                k2, k1
    vpgatherdq      m14{k1}, [r4+ym8]
    kmovb                k1, k2
    vpgatherdq      m15{k2}, [r1+ym8]
    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7
    punpcklbw           m11, m12, m9
    punpckhbw           m12, m9
    paddw                m0, m11
    paddw                m1, m12
    packuswb             m0, m1
    kmovb                k2, k1
    vpscatterdq [r0+ym8]{k1}, m0
    punpcklbw           m12, m13, m9
    punpckhbw           m13, m9
    paddw                m2, m12
    paddw                m3, m13
    packuswb             m2, m3
    kmovb                k1, k2
    vpscatterdq [r3+ym8]{k2}, m2
    punpcklbw           m13, m14, m9
    punpckhbw           m14, m9
    paddw                m4, m13
    paddw                m5, m14
    packuswb             m4, m5
    kmovb                k2, k1
    vpscatterdq [r4+ym8]{k1}, m4
    punpcklbw           m14, m15, m9
    punpckhbw           m15, m9
    paddw                m6, m14
    paddw                m7, m15
    packuswb             m6, m7
    vpscatterdq [r1+ym8]{k2}, m6
    RET
.dconly:
    ; DC-only: scale the single DC coefficient by 181/256 (~1/sqrt(2)) with
    ; rounding, set the row count to 32, and reuse the shared 8x8 dconly tail.
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 32
    imul                r6d, 181
    add                 r6d, 128+512
    sar                 r6d, 8+2
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
; The 32-point odd-half workers below operate on 256-bit registers.
INIT_YMM avx512icl
ALIGN function_align
cglobal_label .main_fast2 ; bottom three-quarters are zero
    ; Only in1/in3/in5/in7 are nonzero: seed the t16a..t31a values directly
    ; from the inputs and duplicate them for the shared butterfly tail.
    ITX_UNPACK_MULHRSW   12, 14, 14, 8,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
    ITX_UNPACK_MULHRSW   21, 20, 15, 8,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
    mova                m11, m12
    mova                m17, m20
    mova                m15, m21
    mova                m16, m14
    jmp .main4
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    ; Odd inputs in1..in15 only: the paired rotations collapse to single
    ; rounding multiplies, then join the full path at .main3.
    ITX_UNPACK_MULHRSW   12, 14, 14, 8,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
    ITX_UNPACK_MULHRSW   21, 15, 15, 8,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
    ITX_UNPACK_MULHRSW   20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
    ITX_UNPACK_MULHRSW   19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
    jmp .main3
ALIGN function_align
; 32-point idct odd half (t16..t31) in ymm registers. .main interleaves the
; raw odd inputs; .main2 is the entry used when they are already paired;
; .main3/.main4 are the shared butterfly stages joined by the fast variants.
; Returns with INIT_ZMM restored and m15 = permA for .main_end.
cglobal_label .main
    punpcklwd           m12, m21, m14 ; in31 in1
    punpckhwd           m14, m21      ; in3  in29
    punpcklwd           m21, m20, m15 ; in27 in5
    punpckhwd           m15, m20      ; in7  in25
    punpcklwd           m20, m19, m16 ; in23 in9
    punpckhwd           m16, m19      ; in11 in21
    punpcklwd           m19, m18, m17 ; in19 in13
    punpckhwd           m17, m18      ; in15 in17
.main2:
    ITX_MUL2X_PACK       12, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
    ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
    ITX_MUL2X_PACK       21, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
    ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
    ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
    ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
    ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
    ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
.main3:
    psubsw              m11, m12, m17 ; t17 t30
    paddsw              m12, m17      ; t16 t31
    psubsw              m17, m15, m20 ; t18 t29
    paddsw              m20, m15      ; t19 t28
    psubsw              m15, m21, m16 ; t21 t26
    paddsw              m21, m16      ; t20 t27
    psubsw              m16, m14, m19 ; t22 t25
    paddsw              m14, m19      ; t23 t24
.main4:
    ITX_MUL2X_PACK       11, 18, 19, 10,   799, 4017, 5 ; t17a t30a
    ITX_MUL2X_PACK       17, 18, 19, 10, m4017,  799, 5 ; t18a t29a
    ITX_MUL2X_PACK       15, 18, 19, 10,  3406, 2276, 5 ; t21a t26a
    ITX_MUL2X_PACK       16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
    vpbroadcastd         m8, [o(pw_m3784_1567)]
    psubsw              m19, m12, m20 ; t19a t28a
    paddsw              m20, m12      ; t16a t31a
    psubsw              m12, m14, m21 ; t20a t27a
    paddsw              m14, m21      ; t23a t24a
    psubsw              m21, m11, m17 ; t18  t29
    paddsw              m11, m17      ; t17  t30
    psubsw              m17, m16, m15 ; t21  t26
    paddsw              m16, m15      ; t22  t25
    ITX_MUL2X_PACK       21, 18, 15, 10, 1567_3784, 8,   20 ; t18a t29a
    ITX_MUL2X_PACK       19, 18, 15, 10, 1567_3784, 8,   20 ; t19  t28
    ITX_MUL2X_PACK       12, 18, 15, 10, 8, m1567_m3784, 36 ; t20  t27
    ITX_MUL2X_PACK       17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
    vbroadcasti32x4     m18, [o(deint_shuf)]
    vpbroadcastd         m8, [o(pw_m2896_2896)]
    vpbroadcastd         m9, [o(pw_2896_2896)]
    psubsw              m15, m20, m14 ; t23  t24
    paddsw              m20, m14      ; t16  t31
    psubsw              m14, m11, m16 ; t22a t25a
    paddsw              m11, m16      ; t17a t30a
    psubsw              m16, m21, m17 ; t21  t26
    paddsw              m21, m17      ; t18  t29
    psubsw              m17, m19, m12 ; t20a t27a
    paddsw              m19, m12      ; t19a t28a
    REPX    {pshufb x, m18}, m20, m11, m21, m19
    ITX_MUL2X_PACK       15, 18, 12, 10, 8, 9, 8 ; t23a t22a
    ITX_MUL2X_PACK       14, 13, 15, 10, 8, 9, 8 ; t22  t25
    packssdw            m18, m13      ; t23a t22
    packssdw            m12, m15      ; t24a t25
    ITX_MUL2X_PACK       16, 13, 15, 10, 8, 9, 8 ; t21a t26a
    ITX_MUL2X_PACK       17, 16, 14, 10, 8, 9, 8 ; t20  t27
    packssdw            m16, m13      ; t20  t21a
    packssdw            m14, m15      ; t27  t26a
    punpcklqdq          m13, m19, m21 ; t19a t18
    punpckhqdq          m19, m21      ; t28a t29
    punpcklqdq          m21, m20, m11 ; t16  t17a
    punpckhqdq          m20, m11      ; t31  t30a
INIT_ZMM avx512icl
    mova                m15, [o(permA)]
    ret
;-----------------------------------------------------------------------
; .main_end: final stage of the 8x32 inverse DCT.
; In:    m0-m7  = even-half intermediates, m12/m13/m14/m16/m18/m19/m20/
;        m21 = odd-half intermediates, m15 = permA (set by .main)
; Out:   m0-m7  = out0..out31, four rows per zmm (see per-line notes)
;        m10    = pw_2048, preloaded for the caller's rounding step
;        (not used inside this routine itself)
; The vpermt2q pass interleaves even/odd quadwords via m15, then the
; saturating add/sub butterflies produce the 32 outputs.
; NOTE: no vzeroupper here -- the outputs are returned in zmm0-7, and
; VZEROUPPER zeroes bits [MAXVL-1:128] of zmm0-15, which would corrupt
; them before the caller reads them. x86inc's RET macro performs any
; required AVX state cleanup at the real function exit.
;-----------------------------------------------------------------------
3070cglobal_label .main_end
3071    vpbroadcastd        m10, [o(pw_2048)]
3072    vpermt2q             m0, m15, m1  ; t0   t1   t2   t3
3073    vpermt2q            m20, m15, m19 ; t31  t30a t29  t28a
3074    vpermt2q             m2, m15, m3  ; t4   t5   t6   t7
3075    vpermt2q            m14, m15, m12 ; t27  t26a t25  t24a
3076    vpermt2q             m4, m15, m5  ; t8   t9   t10  t11
3077    vpermt2q            m18, m15, m16 ; t23a t22  t21a t20
3078    vpermt2q             m6, m15, m7  ; t12  t13  t14  t15
3079    vpermt2q            m13, m15, m21 ; t19a t18  t17a t16
3080    psubsw               m7, m0, m20  ; out31 out30 out29 out28
3081    paddsw               m0, m20      ; out0  out1  out2  out3
3082    psubsw               m5, m2, m14  ; out27 out26 out25 out24
3083    paddsw               m2, m14      ; out4  out5  out6  out7
3084    psubsw               m3, m4, m18  ; out23 out22 out21 out20
3085    paddsw               m4, m18      ; out8  out9  out10 out11
3086    psubsw               m1, m6, m13  ; out19 out18 out17 out16
3087    paddsw               m6, m13      ; out12 out13 out14 out15
3089    ret
3090
; LOAD_PACKED_16X2 dst, row1, row2
; Loads two 16-byte coefficient rows from cq and packs them into ym%1.
; Each row is first broadcast into both 128-bit lanes; shufpd with
; imm8 = 0x0c then selects alternating quadwords, leaving row %2 in
; the even qword slots and row %3 in the odd qword slots of ym%1.
; Clobbers ym8 (fixed temporary).
3091%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
3092    vbroadcasti32x4    ym%1, [cq+16*%2]
3093    vbroadcasti32x4     ym8, [cq+16*%3]
3094    shufpd             ym%1, ym8, 0x0c
3095%endmacro
3096
;-----------------------------------------------------------------------
; void inv_txfm_add_dct_dct_32x8_8bpc(pixel *dst, ptrdiff_t stride,
;                                     coef *c, int eob)
; 32x8 inverse DCT-DCT, 8-bit. First pass reuses the shared 8x16/8x32
; column helpers on qword-packed rows; second pass runs the 8-point
; row DCT (.main below), then the pw_2048-rounded result is added to
; dst. Coefficients in cq are cleared as they are consumed.
;-----------------------------------------------------------------------
3097cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
3098%undef cmp
3099    test               eobd, eobd
3100    jz .dconly
3101    lea                  r5, [o_base]
3102    LOAD_PACKED_16X2      0,  0,  2 ; in0  in2
3103    LOAD_PACKED_16X2      1,  4,  6 ; in4  in6
3104    LOAD_PACKED_16X2      2,  8, 10 ; in8  in10
3105    LOAD_PACKED_16X2      3, 12, 14 ; in12 in14
3106    LOAD_PACKED_16X2     14,  1,  3 ; in1  in3
3107    LOAD_PACKED_16X2     15,  5,  7 ; in5  in7
3108    LOAD_PACKED_16X2     16,  9, 11 ; in9  in11
3109    LOAD_PACKED_16X2     17, 13, 15 ; in13 in15
3110    pxor                 m4, m4
3111    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
; eob < 107 implies coefficient rows 16-31 are all zero -> fast path
3112    cmp                eobd, 107
3113    jb .fast
3114    LOAD_PACKED_16X2      4, 16, 18 ; in16 in18
3115    LOAD_PACKED_16X2      5, 20, 22 ; in20 in22
3116    LOAD_PACKED_16X2      6, 24, 26 ; in24 in26
3117    LOAD_PACKED_16X2      7, 28, 30 ; in28 in30
3118    call m(idct_8x16_internal_8bpc).main
3119    LOAD_PACKED_16X2     18, 19, 17 ; in19 in17
3120    LOAD_PACKED_16X2     19, 23, 21 ; in23 in21
3121    LOAD_PACKED_16X2     20, 27, 25 ; in27 in25
3122    LOAD_PACKED_16X2     21, 31, 29 ; in31 in29
3123    pxor                 m8, m8
3124    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
3125    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
3126    jmp .pass2
3127.fast: ; bottom half is zero
3128    mova                ym5, ym4
3129    mova                ym6, ym4
3130    mova                ym7, ym4
3131    call m(idct_8x16_internal_8bpc).main
3132    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
3133.pass2:
; m15 = permA here, loaded by the 8x32 .main/.main_fast helpers just
; before they return; used to interleave even/odd columns for pass 2.
3134    vpbroadcastd        m10, [o(pw_8192)]
3135    vpermt2q             m0, m15, m4       ; t0   t1   t9   t8
3136    vpermt2q            m20, m15, m18      ; t31  t30a t23a t22
3137    vpermt2q             m3, m15, m7       ; t7   t6   t14  t15
3138    vpermt2q            m12, m15, m21      ; t25  t24a t17a t16
3139    vpermt2q             m2, m15, m6       ; t4   t5   t13  t12
3140    vpermt2q            m14, m15, m13      ; t23a t22  t21a t20
3141    vpermt2q             m1, m15, m5       ; t3   t2   t10  t11
3142    vpermt2q            m19, m15, m16      ; t27  t26a t19a t18
3143    psubsw               m8, m0, m20       ; out31 out30 out22 out23
3144    paddsw               m0, m20           ; out0  out1  out9  out8
3145    paddsw               m6, m3, m12       ; out7  out6  out14 out15
3146    psubsw               m3, m12           ; out24 out25 out17 out16
3147    psubsw               m5, m2, m14       ; out27 out26 out18 out19
3148    paddsw               m4, m2, m14       ; out4  out5  out13 out12
3149    psubsw               m7, m1, m19       ; out28 out29 out21 out20
3150    paddsw               m2, m1, m19       ; out3  out2  out10 out11
; (no vzeroupper here: m0-m8 are live zmm registers -- VZEROUPPER would
; zero their upper 384 bits; x86inc's RET below does any needed cleanup)
3152    vshufi32x4           m1, m0, m3, q1221 ; out1  out9  out17 out25
3153    vshufi32x4           m0, m3, q0330     ; out0  out8  out16 out24
3154    vshufi32x4           m3, m2, m5, q0330 ; out3  out11 out19 out27
3155    vshufi32x4           m2, m5, q1221     ; out2  out10 out18 out26
3156    vshufi32x4           m5, m4, m7, q1221 ; out5  out13 out21 out29
3157    vshufi32x4           m4, m7, q0330     ; out4  out12 out20 out28
3158    vshufi32x4           m7, m6, m8, q0330 ; out7  out15 out23 out31
3159    vshufi32x4           m6, m8, q1221     ; out6  out14 out22 out30
3160    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
3161    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
3162    call .main
3163    vpbroadcastd         m8, [o(pw_2048)]
3164    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
; Add the 8 result rows to dst (rows 0-3 via dstq, rows 4-7 via r3).
3165    lea                  r2, [strideq*3]
3166    lea                  r3, [dstq+strideq*4]
3167    movshdup            m12, [o(permD)]
3168    pmovzxbw             m8, [dstq+strideq*0]
3169    pmovzxbw             m9, [dstq+strideq*1]
3170    pmovzxbw            m10, [dstq+strideq*2]
3171    pmovzxbw            m11, [dstq+r2       ]
3172    paddw                m0, m8
3173    paddw                m1, m9
3174    paddw                m2, m10
3175    paddw                m3, m11
3176    pmovzxbw             m8, [r3+strideq*0]
3177    pmovzxbw             m9, [r3+strideq*1]
3178    pmovzxbw            m10, [r3+strideq*2]
3179    pmovzxbw            m11, [r3+r2       ]
3180    paddw                m4, m8
3181    paddw                m5, m9
3182    paddw                m6, m10
3183    paddw                m7, m11
3184    packuswb             m0, m1
3185    packuswb             m2, m3
3186    vpermq               m0, m12, m0
3187    vpermq               m2, m12, m2
3188    mova          [dstq+strideq*0], ym0
3189    vextracti32x8 [dstq+strideq*1], m0, 1
3190    mova          [dstq+strideq*2], ym2
3191    vextracti32x8 [dstq+r2       ], m2, 1
3192    packuswb             m4, m5
3193    packuswb             m6, m7
3194    vpermq               m4, m12, m4
3195    vpermq               m6, m12, m6
3196    mova          [r3+strideq*0], ym4
3197    vextracti32x8 [r3+strideq*1], m4, 1
3198    mova          [r3+strideq*2], ym6
3199    vextracti32x8 [r3+r2       ], m6, 1
3200    RET
3201.dconly:
; DC-only path: eobd == 0 here (reached via jz), so r3 (the eob arg
; register) is 0 and the OR simply sets the row count for .dconly_loop.
3202    movsx               r6d, word [cq]
3203    mov                [cq], eobd
3204    or                  r3d, 8
3205.dconly2:
; Two rounds of x = (x*181 + bias) >> shift: 181/256 ~ 2896/4096, the
; 1/sqrt(2) scale, with the per-pass rounding shifts folded in.
3206    imul                r6d, 181
3207    add                 r6d, 128+512
3208    sar                 r6d, 8+2
3209.dconly3:
3210    imul                r6d, 181
3211    add                 r6d, 128+2048
3212    sar                 r6d, 8+4
3213    pxor                 m2, m2
3214    vpbroadcastw         m3, r6d
3215.dconly_loop:
; Add the broadcast DC value to two rows of dst per iteration.
3216    mova                ym1, [dstq+strideq*0]
3217    vinserti32x8         m1, [dstq+strideq*1], 1
3218    punpcklbw            m0, m1, m2
3219    punpckhbw            m1, m2
3220    paddw                m0, m3
3221    paddw                m1, m3
3222    packuswb             m0, m1
3223    mova          [dstq+strideq*0], ym0
3224    vextracti32x8 [dstq+strideq*1], m0, 1
3225    lea                dstq, [dstq+strideq*2]
3226    sub                 r3d, 2
3227    jg .dconly_loop
3228    RET
;-----------------------------------------------------------------------
; .main: 8-point inverse DCT on packed 16-bit words.
; In:    m0-m7 = in0-in7; from .main2 on, m10 must hold pd_2048
; Out:   m0-m7 = out0-out7
; Clobbers m8, m9, m11, m12.
;-----------------------------------------------------------------------
3229ALIGN function_align
3230cglobal_label .main
3231    vpbroadcastd       m10, [o(pd_2048)]
3232.main2:
3233    ITX_MULSUB_2W        5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
3234    ITX_MULSUB_2W        1, 7, 8, 9, 10,  799, 4017 ; t4a, t7a
3235    ITX_MULSUB_2W        2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
3236    vpbroadcastd       m11, [o(pw_2896_2896)]
3237    vpbroadcastd       m12, [o(pw_m2896_2896)]
3238    ITX_MULSUB_2W        0, 4, 8, 9, 10, 11, 12 ; t1, t0
3239.main3:
3240    paddsw              m8, m1, m5 ; t4
3241    psubsw              m1, m5     ; t5a
3242    paddsw              m9, m7, m3 ; t7
3243    psubsw              m7, m3     ; t6a
3244    ITX_MULSUB_2W        7, 1, 3, 5, 10, 11, 12 ; t5, t6
3245    psubsw              m5, m0, m2 ; dct4 out2
3246    paddsw              m2, m0     ; dct4 out1
3247    paddsw              m0, m4, m6 ; dct4 out0
3248    psubsw              m4, m6     ; dct4 out3
3249    psubsw              m6, m2, m1 ; out6
3250    paddsw              m1, m2     ; out1
3251    paddsw              m2, m5, m7 ; out2
3252    psubsw              m5, m7     ; out5
3253    psubsw              m7, m0, m9 ; out7
3254    paddsw              m0, m9     ; out0
3255    paddsw              m3, m4, m8 ; out3
3256    psubsw              m4, m8     ; out4
3257    ret
3258
;-----------------------------------------------------------------------
; void inv_txfm_add_identity_identity_8x32_8bpc(pixel *dst,
;                                     ptrdiff_t stride, coef *c)
; 8x32 identity/identity: coefficients get +5 bias then >>3 (rounded
; scale), an 8x8 transpose, and are added to dst. dst rows are accessed
; with qword gathers/scatters: ym14 = stride * {0..7}, used with scale
; 4, so each base pointer (r0/r3/r4/r1 = dst + 0/1/2/3 * stride) covers
; rows n, n+4, n+8, ..., n+28. Gathers/scatters zero their mask
; register on completion, hence the k1/k2 kmovb ping-pong that keeps a
; full 0xff mask available for every access.
;-----------------------------------------------------------------------
3259cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
3260    vpbroadcastd         m7, [pw_5]
3261    paddsw               m0, m7, [cq+64*0]
3262    paddsw               m1, m7, [cq+64*1]
3263    vpbroadcastd        ym9, strided ; ym9 = stride in every dword
3264    paddsw               m2, m7, [cq+64*2]
3265    paddsw               m3, m7, [cq+64*3]
3266    paddsw               m4, m7, [cq+64*4]
3267    paddsw               m5, m7, [cq+64*5]
3268    paddsw               m6, m7, [cq+64*6]
3269    paddsw               m7,     [cq+64*7]
3270    pmulld             ym14, ym9, [pd_0to15] ; gather indices stride*{0..7}
3271    lea                  r3, [dstq+strideq*1]
3272    lea                  r4, [dstq+strideq*2]
3273    kxnorb               k1, k1, k1 ; k1 = 0xff
3274    pxor                m13, m13
3275    add                  r1, r4 ; dstq+strideq*3 (r1 held strideq)
3276    kmovb                k2, k1
3277    vpgatherdq       m9{k1}, [r0+ym14*4]
3278    kmovb                k1, k2
3279    vpgatherdq      m10{k2}, [r3+ym14*4]
3280    kmovb                k2, k1
3281    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
3282    REPX       {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
3283    vpgatherdq      m11{k1}, [r4+ym14*4]
3284    kmovb                k1, k2
3285    vpgatherdq      m12{k2}, [r1+ym14*4]
3286    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
3287    punpcklbw            m8, m9, m13  ;  0  8 16 24
3288    punpckhbw            m9, m13      ;  4 12 20 28
3289    paddw                m0, m8
3290    paddw                m4, m9
3291    packuswb             m0, m4
3292    kmovb                k2, k1
3293    vpscatterdq [r0+ym14*4]{k1}, m0
3294    punpcklbw            m8, m10, m13 ;  1  9 17 25
3295    punpckhbw           m10, m13      ;  5 13 21 29
3296    paddw                m1, m8
3297    paddw                m5, m10
3298    packuswb             m1, m5
3299    kmovb                k1, k2
3300    vpscatterdq [r3+ym14*4]{k2}, m1
3301    punpcklbw            m8, m11, m13 ;  2 10 18 26
3302    punpckhbw           m11, m13      ;  6 14 22 30
3303    paddw                m2, m8
3304    paddw                m6, m11
3305    packuswb             m2, m6
3306    kmovb                k2, k1
3307    vpscatterdq [r4+ym14*4]{k1}, m2
3308    punpcklbw            m8, m12, m13 ;  3 11 19 27
3309    punpckhbw           m12, m13      ;  7 15 23 31
3310    paddw                m3, m8
3311    paddw                m7, m12
3312    packuswb             m3, m7
3313    vpscatterdq [r1+ym14*4]{k2}, m3
3314    RET
3315
;-----------------------------------------------------------------------
; void inv_txfm_add_identity_identity_32x8_8bpc(pixel *dst,
;                                     ptrdiff_t stride, coef *c)
; 32x8 identity/identity: coefficients scaled by pw_4096 (pmulhrsw),
; transposed 32x8 -> 8x32 via dword/qword unpacks plus the int8_permA
; byte permutation, then added to dst (rows 0-3 via dstq, rows 4-7 via
; r4 = dst + 4*stride). cq is cleared along the way (m13 = zero).
;-----------------------------------------------------------------------
3316cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
3317    vpbroadcastd         m0, [pw_4096]
3318    pmulhrsw             m3, m0, [cq+64*0]
3319    pmulhrsw             m4, m0, [cq+64*4]
3320    pmulhrsw             m6, m0, [cq+64*1]
3321    pmulhrsw             m5, m0, [cq+64*5]
3322    pmulhrsw             m7, m0, [cq+64*2]
3323    pmulhrsw             m2, m0, [cq+64*6]
3324    pmulhrsw             m8, m0, [cq+64*3]
3325    pmulhrsw             m0,     [cq+64*7]
3326    mova                m13, [int8_permA]
3327    lea                  r3, [strideq*3]
3328    lea                  r4, [dstq+strideq*4]
3329    punpckldq            m1, m3, m4
3330    punpckhdq            m3, m4
3331    punpckldq            m4, m6, m5
3332    punpckhdq            m6, m5
3333    punpckldq            m5, m7, m2
3334    punpckhdq            m7, m2
3335    punpckldq            m2, m8, m0
3336    punpckhdq            m8, m0
; Load the 8 destination rows, two per zmm.
3337    mova                ym9, [dstq+strideq*0]
3338    vinserti32x8         m9, [dstq+strideq*2], 1
3339    mova               ym10, [dstq+strideq*1]
3340    vinserti32x8        m10, [dstq+r3       ], 1
3341    mova               ym11, [r4+strideq*0]
3342    vinserti32x8        m11, [r4+strideq*2], 1
3343    mova               ym12, [r4+strideq*1]
3344    vinserti32x8        m12, [r4+r3       ], 1
3345    REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
3346    pxor                m13, m13
3347    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
3348    punpcklqdq           m0, m1, m4 ; a0 a2   c0 c2
3349    punpckhqdq           m1, m4     ; b0 b2   d0 d2
3350    punpcklqdq           m4, m5, m2 ; a1 a3   c1 c3
3351    punpckhqdq           m5, m2     ; b1 b3   d1 d3
3352    punpcklqdq           m2, m3, m6 ; e0 e2   g0 g2
3353    punpckhqdq           m3, m6     ; f0 f2   h0 h2
3354    punpcklqdq           m6, m7, m8 ; e1 e3   g1 g3
3355    punpckhqdq           m7, m8     ; f1 f3   h1 h3
; Widen dst bytes to words (m13 = zero), add, pack and store back.
3356    punpcklbw            m8, m9, m13
3357    punpckhbw            m9, m13
3358    paddw                m0, m8
3359    paddw                m4, m9
3360    packuswb             m0, m4
3361    mova          [dstq+strideq*0], ym0
3362    vextracti32x8 [dstq+strideq*2], m0, 1
3363    punpcklbw            m8, m10, m13
3364    punpckhbw           m10, m13
3365    paddw                m1, m8
3366    paddw                m5, m10
3367    packuswb             m1, m5
3368    mova          [dstq+strideq*1], ym1
3369    vextracti32x8 [dstq+r3       ], m1, 1
3370    punpcklbw            m8, m11, m13
3371    punpckhbw           m11, m13
3372    paddw                m2, m8
3373    paddw                m6, m11
3374    packuswb             m2, m6
3375    mova          [r4+strideq*0], ym2
3376    vextracti32x8 [r4+strideq*2], m2, 1
3377    punpcklbw            m8, m12, m13
3378    punpckhbw           m12, m13
3379    paddw                m3, m8
3380    paddw                m7, m12
3381    packuswb             m3, m7
3382    mova          [r4+strideq*1], ym3
3383    vextracti32x8 [r4+r3       ], m3, 1
3384    RET
3385
; IDCT_16x32_END src1, src2, row
; Emits four finished rows of the 16x32 idct: rounds m%1/m%2 by m10
; (pw_2048 at the call sites in this file), loads four dst rows and
; deinterleaves them with the m11 byte permutation (end_16x32p),
; clears the two matching coefficient cache lines (m13 = zero), adds,
; packs to bytes and stores through the m12 dword permutation
; (m12 = end_16x32p >> 8, set up by the caller).
; Advances dstq by 4 rows except after the final pair (%1 == 20).
3386%macro IDCT_16x32_END 3 ; src[1-2], row
3387    mova                xm8, [dstq+strideq*0]
3388    vinserti32x4        ym8, [dstq+strideq*1], 1
3389    mova                xm9, [dstq+r3       ]
3390    vinserti32x4        ym9, [dstq+strideq*2], 1
3391    pmulhrsw            m%1, m10
3392    pmulhrsw            m%2, m10
3393    vpermb               m8, m11, m8
3394    vpermb               m9, m11, m9
3395    mova   [cq+64*(%3*2+0)], m13
3396    mova   [cq+64*(%3*2+1)], m13
3397    paddw                m8, m%1
3398    paddw                m9, m%2
3399    packuswb             m8, m9
3400    vpermd               m8, m12, m8
3401    mova          [dstq+strideq*0], xm8
3402    vextracti32x4 [dstq+strideq*1], ym8, 1
3403    vextracti32x4 [dstq+strideq*2], m8, 2
3404    vextracti32x4 [dstq+r3       ], m8, 3
3405%if %1 != 20
3406    lea                dstq, [dstq+strideq*4]
3407%endif
3408%endmacro
3409
;-----------------------------------------------------------------------
; void inv_txfm_add_dct_dct_16x32_8bpc(pixel *dst, ptrdiff_t stride,
;                                      coef *c, int eob)
; 16x32 inverse DCT-DCT, 8-bit. Inputs are pre-scaled by pw_2896x8
; (rectangular 2:1 scale). The slow path (eob >= 151) runs a full
; 16-point first pass plus the 32-point odd half; the fast path
; assumes the right half of the coefficients is zero and uses the
; *_fast helpers. Both paths meet at .pass2, which writes out via the
; IDCT_16x32_END macro (m10 = pw_2048 rounding, m11/m12 = output
; permutations, m13 = zero for clearing cq).
;-----------------------------------------------------------------------
3410cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
3411%undef cmp
3412    lea                  r5, [o_base]
3413    test               eobd, eobd
3414    jz .dconly
3415    vpbroadcastd        m15, [o(pw_2896x8)]
3416    cmp                eobd, 151
3417    jb .fast
; Even input rows -> 16-point first pass (via the 32x8 row helper).
3418    pmulhrsw             m5, m15, [cq+64*10]
3419    pmulhrsw             m3, m15, [cq+64* 6]
3420    pmulhrsw             m1, m15, [cq+64* 2]
3421    pmulhrsw             m7, m15, [cq+64*14]
3422    pmulhrsw             m2, m15, [cq+64* 4]
3423    pmulhrsw             m6, m15, [cq+64*12]
3424    pmulhrsw             m0, m15, [cq+64* 0]
3425    pmulhrsw             m4, m15, [cq+64* 8]
3426    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
; Odd input rows -> odd half of the 32-point transform.
3427    pmulhrsw            m14, m15, [cq+64* 1]
3428    pmulhrsw            m21, m15, [cq+64*15]
3429    pmulhrsw            m18, m15, [cq+64* 9]
3430    pmulhrsw            m17, m15, [cq+64* 7]
3431    pmulhrsw            m16, m15, [cq+64* 5]
3432    pmulhrsw            m19, m15, [cq+64*11]
3433    pmulhrsw            m20, m15, [cq+64*13]
3434    pmulhrsw            m15,      [cq+64* 3]
3435    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
; Transpose all 16 vectors and round by pw_16384 between the passes.
3436    mova                 m8, [o(idct_16x32p)]
3437    vpbroadcastd         m9, [o(pw_16384)]
3438    REPX {vpermb x, m8, x}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3439                            m14, m15, m16, m17, m18, m19, m20, m21
3440    punpckldq            m8, m0, m1
3441    punpckhdq            m0, m1
3442    punpckldq            m1, m2, m3
3443    punpckhdq            m2, m3
3444    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2
3445    punpckldq            m3, m4, m5
3446    punpckhdq            m4, m5
3447    punpckldq            m5, m6, m7
3448    punpckhdq            m6, m7
3449    REPX   {pmulhrsw x, m9}, m3, m4, m5, m6
3450    punpckldq            m7, m14, m15
3451    punpckhdq           m14, m15
3452    punpckldq           m15, m16, m17
3453    punpckhdq           m16, m17
3454    REPX   {pmulhrsw x, m9}, m7, m14, m15, m16
3455    punpckldq           m17, m18, m19
3456    punpckhdq           m18, m19
3457    punpckldq           m19, m20, m21
3458    punpckhdq           m20, m21
3459    REPX   {pmulhrsw x, m9}, m17, m18, m19, m20
3460    punpcklqdq          m21, m8, m1
3461    punpckhqdq           m8, m1
3462    punpcklqdq           m1, m0, m2
3463    punpckhqdq           m0, m2
3464    punpcklqdq           m2, m3, m5
3465    punpckhqdq           m3, m5
3466    punpcklqdq           m5, m4, m6
3467    punpckhqdq           m4, m6
3468    punpcklqdq           m6, m7, m15
3469    punpckhqdq           m7, m15
3470    punpcklqdq          m15, m14, m16
3471    punpckhqdq          m14, m16
3472    punpcklqdq          m16, m17, m19
3473    punpckhqdq          m17, m19
3474    punpcklqdq          m19, m18, m20
3475    punpckhqdq          m18, m20
3476    vinserti32x8        m20, m21, ym2, 1
3477    vshufi32x4          m21, m2, q3232
3478    vinserti32x8         m2, m8, ym3, 1
3479    vshufi32x4           m8, m3, q3232
3480    vinserti32x8         m3, m1, ym5, 1
3481    vshufi32x4           m1, m5, q3232
3482    vinserti32x8         m5, m0, ym4, 1
3483    vshufi32x4           m0, m4, q3232
3484    vinserti32x8         m4, m6, ym16, 1
3485    vshufi32x4           m6, m16, q3232
3486    vinserti32x8        m16, m7, ym17, 1
3487    vshufi32x4           m7, m17, q3232
3488    vinserti32x8        m17, m15, ym19, 1
3489    vshufi32x4          m15, m19, q3232
3490    vinserti32x8        m19, m14, ym18, 1
3491    vshufi32x4          m14, m18, q3232
; Final 128-bit lane shuffles; comments give the input-row pairs.
3492    vshufi32x4          m18, m21, m6, q3131 ; 27  5
3493    vshufi32x4          m21, m6, q2020      ; 31  1
3494    vshufi32x4           m6, m8, m7, q2020  ; 24  8
3495    vshufi32x4           m8, m7, q3131      ; 30  2
3496    vshufi32x4           m7, m1, m15, q2020 ; 28  4
3497    vshufi32x4           m1, m15, q3131     ;  6 26
3498    vshufi32x4          m15, m0, m14, q2020 ;  7 25
3499    vshufi32x4           m0, m14, q3131     ; 14 18
3500    vshufi32x4          m14, m20, m4, q2020 ;  3 29
3501    vshufi32x4          m20, m4, q3131      ; 23  9
3502    vshufi32x4           m9, m3, m17, q2020 ; 16  0
3503    vshufi32x4           m3, m17, q3131     ; 12 20
3504    vshufi32x4          m17, m5, m19, q2020 ; 15 17
3505    vshufi32x4           m5, m19, q3131     ; 22 10
3506    vshufi32x4          m19, m2, m16, q2020 ; 19 13
3507    vshufi32x4          m16, m2, m16, q3131 ; 11 21
3508    call m(idct_16x16_internal_8bpc).main3
3509    call .main_oddhalf
3510    jmp .pass2
3511.fast: ; right half is zero
3512    mova                ym8, [cq+64*15]
3513    vinserti32x8         m8, [cq+64* 1], 1
3514    mova                 m2, [o(int16_perm)]
3515    mova                ym9, [cq+64* 8]
3516    vinserti32x8         m9, [cq+64* 0], 1
3517    mova                ym0, [cq+64* 7]
3518    vinserti32x8         m0, [cq+64* 9], 1
3519    mova                ym7, [cq+64*14]
3520    vinserti32x8         m7, [cq+64* 2], 1
3521    mova                ym1, [cq+64* 3]
3522    vinserti32x8         m1, [cq+64*13], 1
3523    mova                ym3, [cq+64* 6]
3524    vinserti32x8         m3, [cq+64*10], 1
3525    mova                ym5, [cq+64*11]
3526    vinserti32x8         m5, [cq+64* 5], 1
3527    mova                ym6, [cq+64*12]
3528    vinserti32x8         m6, [cq+64* 4], 1
3529    REPX  {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
3530    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
3531    call m(idct_16x16_internal_8bpc).main2
3532    vbroadcasti32x4      m8, [o(int_shuf3)]
3533    vbroadcasti32x4      m9, [o(int_shuf4)]
3534    vpbroadcastd        m11, [o(pw_16384)] ; inter-pass rounding
3535    pshufb               m0, m8
3536    pshufb               m1, m9
3537    pshufb               m2, m8
3538    pshufb               m3, m9
3539    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
3540    pshufb               m4, m8
3541    pshufb               m5, m9
3542    pshufb               m6, m8
3543    pshufb               m7, m9
3544    REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
3545    punpckhdq           m17, m0, m1
3546    punpckldq            m0, m1
3547    punpckhdq           m16, m2, m3
3548    punpckldq            m2, m3
3549    punpckhdq           m18, m4, m5
3550    punpckldq            m4, m5
3551    punpckhdq            m5, m6, m7
3552    punpckldq            m6, m7
3553    vinserti32x8         m1, m0, ym2, 1
3554    vshufi32x4           m3, m0, m2, q3232
3555    vinserti32x8         m2, m4, ym6, 1
3556    vshufi32x4           m4, m6, q3232
3557    vinserti32x8        m15, m17, ym16, 1
3558    vshufi32x4          m17, m16, q3232
3559    vinserti32x8        m16, m18, ym5, 1
3560    vshufi32x4          m18, m5, q3232
3561    vshufi32x4           m0, m1, m2, q2020   ;  0  2
3562    vshufi32x4           m1, m2, q3131       ;  4  6
3563    vshufi32x4           m2, m3, m4, q2020   ;  8 10
3564    vshufi32x4           m3, m4, q3131       ; 12 14
3565    vshufi32x4          m14, m15, m16, q2020 ;  1  3
3566    vshufi32x4          m15, m16, q3131      ;  5  7
3567    vshufi32x4          m16, m17, m18, q2020 ;  9 11
3568    vshufi32x4          m17, m18, q3131      ; 13 15
; Duplicate words into position for the half-width second pass.
3569    pxor                 m6, m6
3570    punpckhwd            m8, m0, m0
3571    punpcklwd            m9, m6, m0
3572    punpckhwd            m0, m3, m3
3573    punpckhwd            m5, m2, m2
3574    punpcklwd            m7, m1, m1
3575    punpckhwd            m1, m1
3576    punpcklwd            m3, m3
3577    punpcklwd            m6, m2
3578    call m(idct_16x16_internal_8bpc).main_fast5
3579    punpcklwd           m21, m14, m14
3580    punpckhwd           m14, m14
3581    punpcklwd           m18, m15, m15
3582    punpckhwd           m15, m15
3583    punpcklwd           m20, m16, m16
3584    punpckhwd           m16, m16
3585    punpcklwd           m19, m17, m17
3586    punpckhwd           m17, m17
3587    call .main_oddhalf_fast
3588.pass2:
3589    vpbroadcastd        m10, [o(pw_2048)]
3590    mova                m11, [o(end_16x32p)]
3591    lea                  r3, [strideq*3]
3592    pxor                m13, m13
3593    psrld               m12, m11, 8 ; dword variant of the output perm
3594    IDCT_16x32_END        0,  1,  0
3595    IDCT_16x32_END        2,  3,  1
3596    IDCT_16x32_END        4,  5,  2
3597    IDCT_16x32_END        6,  7,  3
3598    IDCT_16x32_END       14, 15,  4
3599    IDCT_16x32_END       16, 17,  5
3600    IDCT_16x32_END       18, 19,  6
3601    IDCT_16x32_END       20, 21,  7
3602    RET
3603ALIGN function_align
3604.dconly:
; eobd == 0 here, so r3 (eob register) is 0; set row count to 32 and
; share the DC-only tail with the 16x8 function (same 2:1 rect scale).
3605    movsx               r6d, word [cq]
3606    mov                [cq], eobd
3607    or                  r3d, 32
3608    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
; Odd half of the 32-point DCT when only the top quarter of the
; coefficients can be nonzero (inputs in1/in3/in5/in7 in
; m21/m14/m18/m15). The first butterfly stage degenerates into single
; pmulhrsw multiplies by x8-scaled cosine pairs, and the second-stage
; partners (t17/t18/t21/t22 side) are plain copies of their
; counterparts, so this just sets them up and jumps into .main3.
3609ALIGN function_align
3610cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
3611    vpbroadcastd         m8, [o(pw_201_4091x8)]
3612    vpbroadcastd        m20, [o(pw_m1380_3857x8)]
3613    vpbroadcastd         m9, [o(pw_995_3973x8)]
3614    vpbroadcastd        m16, [o(pw_m601_4052x8)]
3615    pmulhrsw            m21, m8  ; t16a, t31a
3616    pmulhrsw            m20, m15 ; t19a, t28a
3617    pmulhrsw            m18, m9  ; t20a, t27a
3618    pmulhrsw            m14, m16 ; t23a, t24a
3619    mova                 m8, m21
3620    mova                m17, m20
3621    mova                m15, m18
3622    mova                m16, m14
3623    jmp .main3
; Odd half of the 32-point DCT when the bottom half of the
; coefficients is zero (odd inputs in1..in15 in m21/m14/m18/m17/m16/
; m20/m19/m15). Each ITX_MUL2X_PACK of the full .main_oddhalf
; collapses into one pmulhrsw with an x8-scaled constant pair,
; then execution continues at .main2.
3624ALIGN function_align
3625cglobal_label .main_oddhalf_fast ; bottom half is zero
3626    vpbroadcastd         m8, [o(pw_201_4091x8)]
3627    vpbroadcastd         m9, [o(pw_m2751_3035x8)]
3628    vpbroadcastd        m11, [o(pw_1751_3703x8)]
3629    vpbroadcastd        m12, [o(pw_m1380_3857x8)]
3630    pmulhrsw            m21, m8  ; t16a, t31a
3631    vpbroadcastd         m8, [o(pw_995_3973x8)]
3632    pmulhrsw            m17, m9  ; t17a, t30a
3633    vpbroadcastd         m9, [o(pw_m2106_3513x8)]
3634    pmulhrsw            m20, m11 ; t18a, t29a
3635    vpbroadcastd        m11, [o(pw_2440_3290x8)]
3636    pmulhrsw            m15, m12 ; t19a, t28a
3637    vpbroadcastd        m12, [o(pw_m601_4052x8)]
3638    pmulhrsw            m18, m8  ; t20a, t27a
3639    pmulhrsw            m16, m9  ; t21a, t26a
3640    pmulhrsw            m19, m11 ; t22a, t25a
3641    pmulhrsw            m14, m12 ; t23a, t24a
3642    jmp .main2
; Full odd half of the 32-point inverse DCT (t16..t31), operating on
; qword-packed pairs so each vector carries a tNa/tMa pair.
; In:   m14-m21 = odd input rows, m10 = pd_2048, m0-m7 = the even
;       half's out0..out15 (combined into the final outputs below)
; Out:  m0-m7 and m14-m21 = out0..out31 (see per-line comments)
; Clobbers m8, m9, m11, m12.
3643ALIGN function_align
3644cglobal_label .main_oddhalf
3645    ITX_MUL2X_PACK       21, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
3646    ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
3647    ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
3648    ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
3649    ITX_MUL2X_PACK       18, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
3650    ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
3651    ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
3652    ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
3653.main2: ; entry from .main_oddhalf_fast
3654    psubsw               m8, m21, m17 ; t17 t30
3655    paddsw              m21, m17      ; t16 t31
3656    psubsw              m17, m15, m20 ; t18 t29
3657    paddsw              m20, m15      ; t19 t28
3658    psubsw              m15, m18, m16 ; t21 t26
3659    paddsw              m18, m16      ; t20 t27
3660    psubsw              m16, m14, m19 ; t22 t25
3661    paddsw              m14, m19      ; t23 t24
3662.main3: ; entry from .main_oddhalf_fast2
3663    ITX_MUL2X_PACK        8, 9, 19, 10,   799, 4017, 5 ; t17a t30a
3664    ITX_MUL2X_PACK       17, 9, 19, 10, m4017,  799, 5 ; t18a t29a
3665    ITX_MUL2X_PACK       15, 9, 19, 10,  3406, 2276, 5 ; t21a t26a
3666    ITX_MUL2X_PACK       16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
3667    vpbroadcastd        m11, [o(pw_m3784_1567)]
3668    psubsw              m19, m21, m20 ; t19a t28a
3669    paddsw              m21, m20      ; t16a t31a
3670    psubsw              m20, m14, m18 ; t20a t27a
3671    paddsw              m14, m18      ; t23a t24a
3672    psubsw              m18, m8, m17  ; t18  t29
3673    paddsw               m8, m17      ; t17  t30
3674    psubsw              m17, m16, m15 ; t21  t26
3675    paddsw              m15, m16      ; t22  t25
3676    ITX_MUL2X_PACK       18, 9, 16, 10, 1567_3784, 11,   20 ; t18a t29a
3677    ITX_MUL2X_PACK       19, 9, 16, 10, 1567_3784, 11,   20 ; t19  t28
3678    ITX_MUL2X_PACK       20, 9, 16, 10, 11, m1567_m3784, 36 ; t20  t27
3679    ITX_MUL2X_PACK       17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
3680    vbroadcasti32x4      m9, [o(deint_shuf)]
3681    psubsw              m16, m21, m14 ; t23  t24
3682    paddsw              m14, m21      ; t16  t31
3683    psubsw              m21, m8, m15  ; t22a t25a
3684    paddsw              m15, m8       ; t17a t30a
3685    psubsw               m8, m18, m17 ; t21  t26
3686    paddsw              m18, m17      ; t18  t29
3687    paddsw              m17, m19, m20 ; t19a t28a
3688    psubsw              m19, m20      ; t20a t27a
; Last rotation stage: +-2896/2896 (1/sqrt(2)) on t20..t27, computed
; with explicit vpdpwssd (dot-product) + psrad 12 against m10=pd_2048.
3689    vpbroadcastd        m11, [o(pw_m2896_2896)]
3690    vpbroadcastd        m12, [o(pw_2896_2896)]
3691    REPX     {pshufb x, m9}, m14, m15, m18, m17
3692    mova                 m9, m10
3693    vpdpwssd             m9, m16, m11
3694    mova                m20, m10
3695    vpdpwssd            m20, m21, m11
3696    psrad                m9, 12
3697    psrad               m20, 12
3698    packssdw             m9, m20      ; t23a t22
3699    mova                m20, m10
3700    vpdpwssd            m20, m16, m12
3701    mova                m16, m10
3702    vpdpwssd            m16, m21, m12
3703    psrad               m20, 12
3704    psrad               m16, 12
3705    packssdw            m16, m20, m16 ; t24a t25
3706    ITX_MUL2X_PACK        8, 21, 20, 10, 11, 12, 8 ; t21a t26a
3707    ITX_MUL2X_PACK       19,  8, 11, 10, 11, 12, 8 ; t20  t27
3708    packssdw            m11, m20      ; t27  t26a
3709    packssdw             m8, m21      ; t20  t21a
3710    punpcklqdq          m20, m14, m15 ; t16  t17a
3711    punpckhqdq          m14, m15      ; t31  t30a
3712    punpckhqdq          m15, m17, m18 ; t28a t29
3713    punpcklqdq          m17, m18      ; t19a t18
; Combine with the even half (m0-m7) into the 32 outputs.
3714    psubsw              m21, m0, m14  ; out31 out30
3715    paddsw               m0, m14      ; out0  out1
3716    psubsw              m14, m7, m20  ; out16 out17
3717    paddsw               m7, m20      ; out15 out14
3718    psubsw              m20, m1, m15  ; out28 out29
3719    paddsw               m1, m15      ; out3  out2
3720    psubsw              m15, m6, m17  ; out19 out18
3721    paddsw               m6, m17      ; out12 out13
3722    psubsw              m17, m4, m9   ; out23 out22
3723    paddsw               m4, m9       ; out8  out9
3724    psubsw              m18, m3, m16  ; out24 out25
3725    paddsw               m3, m16      ; out7  out6
3726    psubsw              m16, m5, m8   ; out20 out21
3727    paddsw               m5, m8       ; out11 out10
3728    psubsw              m19, m2, m11  ; out27 out26
3729    paddsw               m2, m11      ; out4  out5
3730    ret
3731
; Inverse 32x16 DCT+DCT transform with add-to-destination, 8 bpc, AVX-512.
; Args (per x86inc cglobal): dst, stride, c (coefficient buffer, zeroed here
; as it is consumed), eob. r5 = rodata base for the o() addressing macro.
cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
%undef cmp
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly                              ; eob == 0: only a DC coefficient
    ; Pass 1 load: permB interleaves two 32-coef rows per zmm register;
    ; pw_2896x8 prescales by 2896/4096 (~1/sqrt(2)) for the rectangular size.
    mova                m21, [o(permB)]
    vpermq               m1, m21, [cq+64* 0] ;  0  1
    vpermq              m14, m21, [cq+64* 1] ;  2  3
    vpermq              m20, m21, [cq+64* 2] ;  4  5
    vpermq              m15, m21, [cq+64* 3] ;  6  7
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpermq               m2, m21, [cq+64* 4] ;  8  9
    vpermq              m16, m21, [cq+64* 5] ; 10 11
    vpermq               m3, m21, [cq+64* 6] ; 12 13
    vpermq              m17, m21, [cq+64* 7] ; 14 15
    REPX   {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
    pxor                m12, m12
    REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7  ; clear consumed coefs
    cmp                eobd, 151            ; past this eob, rows 16-31 may be nonzero
    jb .fast
    vpermq               m9, m21, [cq+64* 8] ; 16 17
    vpermq              m19, m21, [cq+64* 9] ; 18 19
    vpermq               m4, m21, [cq+64*10] ; 20 21
    vpermq               m5, m21, [cq+64*11] ; 22 23
    vpermq               m6, m21, [cq+64*12] ; 24 25
    vpermq              m18, m21, [cq+64*13] ; 26 27
    vpermq               m7, m21, [cq+64*14] ; 28 29
    vpermq              m21, m21, [cq+64*15] ; 30 31
    REPX   {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
    REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
    ; Pair rows (comments give the two row indices per register) as expected
    ; by the 16x16 idct main routine and the 16x32 odd-half routine.
    punpcklwd            m8, m21, m14 ; 30  2
    punpckhwd           m21, m1       ; 31  1
    punpcklwd            m0, m17, m19 ; 14 18
    punpckhwd           m17, m9       ; 15 17
    punpcklwd            m9, m1       ; 16  0
    punpckhwd           m14, m7       ;  3 29
    punpcklwd            m1, m15, m18 ;  6 26
    punpckhwd           m15, m6       ;  7 25
    punpcklwd            m6, m2       ; 24  8
    punpckhwd           m19, m3       ; 19 13
    punpcklwd            m3, m4       ; 12 20
    punpckhwd           m18, m20      ; 27  5
    punpcklwd            m7, m20      ; 28  4
    punpckhwd           m20, m5, m2   ; 23  9
    punpcklwd            m5, m16      ; 22 10
    punpckhwd           m16, m4       ; 11 21
    call m(idct_16x16_internal_8bpc).main2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    jmp .pass2
.fast: ; bottom half zero
    ; Same pairing, but the second row of each pair is zero (or duplicated),
    ; feeding the cheaper *_fast variants.
    punpcklwd            m8, m14, m14 ;  2
    punpcklwd            m0, m17, m17 ; 14
    punpcklwd            m5, m16, m16 ; 10
    punpcklwd            m9, m12, m1  ; __  0
    punpckhwd           m21, m1, m1   ;  1
    punpcklwd            m1, m15, m15 ;  6
    punpcklwd            m7, m20, m20 ;  4
    punpckhwd           m19, m3, m3   ; 13
    punpcklwd            m3, m3       ; 12
    punpcklwd            m6, m12, m2  ; __  8
    punpckhwd           m18, m20, m20 ;  5
    punpckhwd           m20, m2, m2   ;  9
    call m(idct_16x16_internal_8bpc).main_fast
    punpckhwd           m15, m15      ;  7
    punpckhwd           m14, m14      ;  3
    punpckhwd           m16, m16      ; 11
    punpckhwd           m17, m17      ; 15
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
.pass2:
    ; Inter-pass transpose; pw_16384 via pmulhrsw is a rounded >>1 rescale.
    vpbroadcastd         m9, [o(pw_16384)]
    call .transpose_round
    ; Redistribute 128-bit lanes so each register holds one full 32-wide row
    ; (trailing comment = row index) for the pass-2 32-point idct.
    vshufi32x4          m16, m14, m2, q3131 ;  5
    vshufi32x4          m14, m2, q2020      ;  1
    vshufi32x4           m2, m0, m3, q3131  ;  4
    vshufi32x4           m0, m3, q2020      ;  0
    vshufi32x4           m3, m1, m18, q3131 ;  6
    vshufi32x4           m1, m18, q2020     ;  2
    vshufi32x4          m18, m20, m6, q2020 ;  9
    vshufi32x4          m20, m6, q3131      ; 13
    vshufi32x4           m6, m21, m4, q3131 ; 12
    vshufi32x4           m4, m21, m4, q2020 ;  8
    vshufi32x4          m21, m19, m7, q3131 ; 15
    vshufi32x4          m19, m7, q2020      ; 11
    vshufi32x4           m7, m5, m15, q3131 ; 14
    vshufi32x4           m5, m15, q2020     ; 10
    vshufi32x4          m15, m17, m9, q2020 ;  3
    vshufi32x4          m17, m9, q3131      ;  7
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
    call .main_oddhalf
    ; Output: pw_2048 via pmulhrsw = rounded >>4 final scale, then add to
    ; the zero-extended destination pixels and pack back to bytes.
    vpbroadcastd        m12, [o(pw_2048)]
    movshdup            m13, [o(permD)]
    lea                  r2, [strideq*3]
    pmovzxbw             m8, [dstq+strideq*0]
    pmovzxbw             m9, [dstq+strideq*1]
    pmovzxbw            m10, [dstq+strideq*2]
    pmovzxbw            m11, [dstq+r2       ]
    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3
    lea                  r3, [dstq+strideq*4]
    paddw                m0, m8
    paddw                m1, m9
    paddw                m2, m10
    paddw                m3, m11
    pmovzxbw             m8, [r3+strideq*0]
    pmovzxbw             m9, [r3+strideq*1]
    pmovzxbw            m10, [r3+strideq*2]
    pmovzxbw            m11, [r3+r2       ]
    REPX  {pmulhrsw x, m12}, m4, m5, m6, m7
    lea                  r4, [dstq+strideq*8]
    packuswb             m0, m1
    paddw                m4, m8
    paddw                m5, m9
    packuswb             m2, m3
    paddw                m6, m10
    paddw                m7, m11
    pmovzxbw             m8, [r4+strideq*0]
    pmovzxbw             m9, [r4+strideq*1]
    pmovzxbw            m10, [r4+strideq*2]
    pmovzxbw            m11, [r4+r2       ]
    REPX  {pmulhrsw x, m12}, m14, m15, m16, m17
    lea                  r5, [r3+strideq*8]  ; o_base no longer needed; reuse r5 as row ptr
    packuswb             m4, m5
    paddw               m14, m8
    paddw               m15, m9
    packuswb             m6, m7
    paddw               m16, m10
    paddw               m17, m11
    pmovzxbw             m8, [r5+strideq*0]
    pmovzxbw             m9, [r5+strideq*1]
    pmovzxbw            m10, [r5+strideq*2]
    pmovzxbw            m11, [r5+r2       ]
    REPX  {pmulhrsw x, m12}, m18, m19, m20, m21
    packuswb            m14, m15
    paddw               m18, m8
    paddw               m19, m9
    packuswb            m16, m17
    paddw               m20, m10
    paddw               m21, m11
    packuswb            m18, m19
    packuswb            m20, m21
    ; permD restores row order after packuswb's in-lane interleaving.
    REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym2
    vextracti32x8 [dstq+r2       ], m2, 1
    mova          [r3+strideq*0], ym4
    vextracti32x8 [r3+strideq*1], m4, 1
    mova          [r3+strideq*2], ym6
    vextracti32x8 [r3+r2       ], m6, 1
    mova          [r4+strideq*0], ym14
    vextracti32x8 [r4+strideq*1], m14, 1
    mova          [r4+strideq*2], ym16
    vextracti32x8 [r4+r2       ], m16, 1
    mova          [r5+strideq*0], ym18
    vextracti32x8 [r5+strideq*1], m18, 1
    mova          [r5+strideq*2], ym20
    vextracti32x8 [r5+r2       ], m20, 1
    RET
ALIGN function_align
.dconly:
    ; DC-only shortcut (eob == 0 on entry, so storing eobd zeroes the DC coef).
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 16                 ; r3d = row count for the shared store loop
    ; Two rounds of *181 (+128, >>8) apply 2896/4096 (~1/sqrt(2)) twice;
    ; the extra +256 / >>1 on the second round folds in the final rounding.
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
    imul                r6d, 181
    add                 r6d, 128+256
    sar                 r6d, 8+1
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3  ; shared splat-and-add tail
ALIGN function_align
; 16-point odd-half idct specialization: only inputs 0 (m0) and 1 (m14) are
; nonzero, so every twiddle collapses to a single pmulhrsw by a pre-scaled
; constant. Produces out0-out15 in m0-m7/m14-m21.
cglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpbroadcastd         m4, [o(pw_4076x8)]
    vpbroadcastd         m3, [o(pw_401x8)]
    pmulhrsw             m8, m0  ; t0
    pmulhrsw             m4, m14 ; t15a
    pmulhrsw             m3, m14 ; t8a
    ; Widening dot-products (vpdpwssd with a broadcast coefficient pair)
    ; rotate t8a/t15a by the 1567/3784 angle at 32-bit precision; m10 holds
    ; the pd_2048 rounding bias added via its initial accumulator value.
    punpcklwd            m9, m3, m4
    punpckhwd            m5, m3, m4
    mova                 m2, m10
    vpdpwssd             m2, m9, [o(pw_m3784_1567)] {bcstd}
    mova                 m1, m10
    vpdpwssd             m1, m5, [o(pw_m3784_1567)] {bcstd}
    mova                 m6, m10
    vpdpwssd             m6, m5, [o(pw_1567_3784)] {bcstd}
    mova                 m5, m10
    vpdpwssd             m5, m9, [o(pw_1567_3784)] {bcstd}
    vpbroadcastd        m11, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    psubsw              m21, m8, m4 ; out15
    paddsw               m0, m8, m4 ; out0
    psubsw              m14, m8, m3 ; out8
    paddsw               m7, m8, m3 ; out7
    REPX      {psrad x, 12}, m2, m1, m6, m5
    packssdw             m2, m1     ; t9a
    packssdw             m5, m6     ; t14a
    ITX_MULSUB_2W         4, 3, 16, 17, 10, 11, 12 ; t11,  t12
    psubsw              m20, m8, m5 ; out14
    paddsw               m1, m8, m5 ; out1
    psubsw              m15, m8, m2 ; out9
    paddsw               m6, m8, m2 ; out6
    ITX_MULSUB_2W         5, 2, 16, 17, 10, 11, 12 ; t10a, t13a
    psubsw              m18, m8, m3 ; out12
    paddsw               m3, m8     ; out3
    psubsw              m17, m8, m4 ; out11
    paddsw               m4, m8     ; out4
    psubsw              m19, m8, m2 ; out13
    paddsw               m2, m8     ; out2
    psubsw              m16, m8, m5 ; out10
    paddsw               m5, m8     ; out5
    ret
; Specialization with only coefficients 0-3 nonzero (m0, m1, m14, m15 live):
; each butterfly input reduces to one pmulhrsw by a pre-scaled cosine, the
; idct8 even part is finished inline, then control merges into .main3.
cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    vpbroadcastd         m9, [o(pw_2896x8)]
    vpbroadcastd         m2, [o(pw_4017x8)]
    vpbroadcastd         m3, [o(pw_799x8)]
    vpbroadcastd        m18, [o(pw_4076x8)]
    vpbroadcastd        m19, [o(pw_401x8)]
    vpbroadcastd        m20, [o(pw_m1189x8)]
    vpbroadcastd        m16, [o(pw_3920x8)]
    pmulhrsw             m9, m0  ; t0
    pmulhrsw             m2, m1  ; t7a
    pmulhrsw             m1, m3  ; t4a
    pmulhrsw            m18, m14 ; t15a
    pmulhrsw            m14, m19 ; t8a
    pmulhrsw            m20, m15 ; t11a
    pmulhrsw            m15, m16 ; t12a
    psubsw               m7, m9, m2 ; idct8 out7
    paddsw               m0, m9, m2 ; idct8 out0
    psubsw               m4, m9, m1 ; idct8 out4
    paddsw               m3, m9, m1 ; idct8 out3
    ITX_MULSUB_2W         2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
    ; With the lower inputs zero, the t8..t15 sums/differences reduce to
    ; plain copies of the scaled inputs expected on entry to .main3.
    mova                m21, m18
    mova                m19, m14
    mova                m16, m15
    mova                 m8, m20
    psubsw               m6, m9, m1 ; idct8 out6
    paddsw               m1, m9     ; idct8 out1
    psubsw               m5, m9, m2 ; idct8 out5
    paddsw               m2, m9     ; idct8 out2
    jmp .main3
ALIGN function_align
; Specialization with only coefficients 0-7 nonzero: the even idct8 part is
; computed via the shared 32x8 .main3 helper, and the eight odd twiddles each
; collapse to one pmulhrsw; then merges into the generic .main2.
cglobal_label .main_oddhalf_fast ; bottom half is zero
    vpbroadcastd         m5, [o(pw_m2276x8)]
    vpbroadcastd        m11, [o(pw_3406x8)]
    vpbroadcastd         m7, [o(pw_4017x8)]
    vpbroadcastd        m12, [o(pw_799x8)]
    vpbroadcastd         m6, [o(pw_3784x8)]
    vpbroadcastd        m10, [o(pw_1567x8)]
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m5, m3  ; t5a
    pmulhrsw             m3, m11 ; t6a
    pmulhrsw             m7, m1  ; t7a
    pmulhrsw             m1, m12 ; t4a
    pmulhrsw             m6, m2  ; t3
    pmulhrsw             m2, m10 ; t2
    pmulhrsw             m4, m0  ; t0
    vpbroadcastd        m11, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    vpbroadcastd        m10, [o(pd_2048)]        ; rounding bias for ITX_MULSUB_2W
    mova                 m0, m4  ; t1
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
    vpbroadcastd        m21, [o(pw_4076x8)]
    vpbroadcastd         m8, [o(pw_401x8)]
    vpbroadcastd        m18, [o(pw_m2598x8)]
    vpbroadcastd         m9, [o(pw_3166x8)]
    vpbroadcastd        m19, [o(pw_3612x8)]
    vpbroadcastd        m11, [o(pw_1931x8)]
    vpbroadcastd        m20, [o(pw_m1189x8)]
    vpbroadcastd        m12, [o(pw_3920x8)]
    pmulhrsw            m21, m14 ; t15a
    pmulhrsw            m14, m8  ; t8a
    pmulhrsw            m18, m17 ; t9a
    pmulhrsw            m17, m9  ; t14a
    pmulhrsw            m19, m16 ; t13a
    pmulhrsw            m16, m11 ; t10a
    pmulhrsw            m20, m15 ; t11a
    pmulhrsw            m15, m12 ; t12a
    jmp .main2
ALIGN function_align
; Generic 16-point odd-half stage for the 32-point idct. Inputs: odd-index
; coefficients in m14-m21, even idct8 results in m0-m7, pd_2048 bias in m10.
; Outputs out0-out15 across m0-m7 (low) and m14-m21 (high, reversed).
; .main2/.main3 are merge points for the *_fast entry points above.
cglobal_label .main_oddhalf
    ITX_MULSUB_2W        14, 21, 8, 9, 10,  401, 4076 ; t8a,  t15a
    ITX_MULSUB_2W        18, 17, 8, 9, 10, 3166, 2598 ; t9a,  t14a
    ITX_MULSUB_2W        16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
    ITX_MULSUB_2W        20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
.main2:
    ; First butterfly stage on the t8..t15 values.
    paddsw               m8, m20, m16 ; t11
    psubsw              m20, m16      ; t10
    paddsw              m16, m15, m19 ; t12
    psubsw              m15, m19      ; t13
    psubsw              m19, m14, m18 ; t9
    paddsw              m14, m18      ; t8
    psubsw              m18, m21, m17 ; t14
    paddsw              m21, m17      ; t15
.main3:
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    ITX_MULSUB_2W        18, 19, 9, 17, 10, 11, 12 ; t9a,  t14a
    vpbroadcastd        m11, [o(pw_m1567_m3784)]
    ITX_MULSUB_2W        15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
    vpbroadcastd        m11, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    psubsw              m17, m14, m8  ; t11a
    paddsw               m8, m14      ; t8a
    paddsw              m14, m18, m15 ; t9
    psubsw              m18, m15      ; t10
    psubsw              m15, m19, m20 ; t13
    paddsw              m19, m20      ; t14
    paddsw              m20, m21, m16 ; t15a
    psubsw              m16, m21, m16 ; t12a
    ITX_MULSUB_2W        15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
    ITX_MULSUB_2W        16, 17, 9, 21, 10, 11, 12 ; t11,  t12
    ; Final stage: combine with the even idct8 half (m0-m7) into outputs.
    psubsw              m21, m0, m20 ; out15
    paddsw               m0, m20     ; out0
    psubsw              m20, m1, m19 ; out14
    paddsw               m1, m19     ; out1
    psubsw              m19, m2, m18 ; out13
    paddsw               m2, m18     ; out2
    psubsw              m18, m3, m17 ; out12
    paddsw               m3, m17     ; out3
    psubsw              m17, m4, m16 ; out11
    paddsw               m4, m16     ; out4
    psubsw              m16, m5, m15 ; out10
    paddsw               m5, m15     ; out5
    psubsw              m15, m6, m14 ; out9
    paddsw               m6, m14     ; out6
    psubsw              m14, m7, m8  ; out8
    paddsw               m7, m8      ; out7
    ret
; Transpose of four 16x4 sub-blocks (a/b/c/d in the trailing comments) with an
; interleaved rounded rescale: pmulhrsw by m9 (pw_16384 at the call site above,
; i.e. a rounded >>1) is spread between shuffle stages to hide latency.
; The punpck stages go word -> dword -> lane granularity; the final
; vshufi32x4/vinserti32x8 stage pairs up 128-bit lanes across registers.
.transpose_round:
    punpcklwd            m8, m0, m2
    punpckhwd            m0, m2
    punpcklwd            m2, m1, m3
    punpckhwd            m1, m3
    punpcklwd            m3, m4, m6
    punpckhwd            m4, m6
    punpcklwd            m6, m5, m7
    punpckhwd            m5, m7
    punpcklwd            m7, m14, m16
    punpckhwd           m14, m16
    punpcklwd           m16, m15, m17
    punpckhwd           m15, m17
    punpcklwd           m17, m19, m21
    punpckhwd           m19, m21
    punpckhwd           m21, m18, m20
    punpcklwd           m18, m20
    punpcklwd           m20, m8, m1
    punpckhwd            m8, m1
    punpcklwd            m1, m0, m2
    punpckhwd            m0, m2
    punpcklwd            m2, m3, m5
    punpckhwd            m3, m5
    punpcklwd            m5, m4, m6
    punpckhwd            m4, m6
    REPX   {pmulhrsw x, m9}, m20, m8, m1, m0
    punpcklwd            m6, m7, m15
    punpckhwd            m7, m15
    punpcklwd           m15, m14, m16
    punpckhwd           m14, m16
    REPX   {pmulhrsw x, m9}, m2, m3, m5, m4
    punpckhwd           m16, m18, m19
    punpcklwd           m18, m19
    punpcklwd           m19, m21, m17
    punpckhwd           m21, m17
    REPX   {pmulhrsw x, m9}, m6, m7, m15, m14
    punpcklwd           m17, m8, m0         ; a2   a6   aa   ae
    punpckhwd            m8, m0             ; a3   a7   ab   af
    punpcklwd            m0, m20, m1        ; a0   a4   a8   ac
    punpckhwd           m20, m1             ; a1   a5   a9   ad
    REPX   {pmulhrsw x, m9}, m16, m18, m19, m21
    punpcklwd            m1, m2, m5         ; b0   b4   b8   bc
    punpckhwd            m2, m5             ; b1   b5   b9   bd
    punpcklwd            m5, m3, m4         ; b2   b6   ba   be
    punpckhwd            m3, m4             ; b3   b7   bb   bf
    punpcklwd            m4, m6, m15        ; c0   c4   c8   cc
    punpckhwd            m6, m15            ; c1   c5   c9   cd
    punpcklwd           m15, m7, m14        ; c2   c6   ca   ce
    punpckhwd            m7, m14            ; c3   c7   cb   cf
    punpcklwd           m14, m18, m19       ; d0   d4   d8   dc
    punpckhwd           m18, m19            ; d1   d5   d9   dd
    punpcklwd            m9, m16, m21       ; d2   d6   da   de
    punpckhwd           m16, m21            ; d3   d7   db   df
    vshufi32x4          m21, m0, m1, q3232  ; a8   ac   b8   bc
    vinserti32x8         m0, ym1, 1         ; a0   a4   b0   b4
    vinserti32x8         m1, m17, ym5, 1    ; a2   a6   b2   b6
    vshufi32x4           m5, m17, m5, q3232 ; aa   ae   ba   be
    vinserti32x8        m17, m8, ym3, 1     ; a3   a7   b3   b7
    vshufi32x4          m19, m8, m3, q3232  ; ab   af   bb   bf
    vinserti32x8         m3, m4, ym14, 1    ; c0   c4   d0   d4
    vshufi32x4           m4, m14, q3232     ; c8   cc   d8   dc
    vinserti32x8        m14, m20, ym2, 1    ; a1   a5   b1   b5
    vshufi32x4          m20, m2, q3232      ; a9   ad   b9   bd
    vinserti32x8         m2, m6, ym18, 1    ; c1   c5   d1   d5
    vshufi32x4           m6, m18, q3232     ; c9   cd   d9   dd
    vinserti32x8        m18, m15, ym9, 1    ; c2   c6   d2   d6
    vshufi32x4          m15, m9, q3232      ; ca   ce   da   de
    vinserti32x8         m9, m7, ym16, 1    ; c3   c7   d3   d7
    vshufi32x4           m7, m16, q3232     ; cb   cf   db   df
    ret
4130
; Identity-transform round for four 16-wide coefficient rows.
; Loads rows %1-%4 from cq, scales by m15 (pw_2896x8) and applies the
; identity-16 multiplier: x += (x * pw_1697x16 * m17-rounding), leaving
; the results in m%1-m%4. Clobbers m18-m21 as scratch.
%macro IDTX_16x32 4 ; src/dst[1-4]
    pmulhrsw            m%1, m15, [cq+64*%1]
    pmulhrsw            m%2, m15, [cq+64*%2]
    pmulhrsw            m%3, m15, [cq+64*%3]
    pmulhrsw            m%4, m15, [cq+64*%4]
    pmulhrsw            m18, m16, m%1
    pmulhrsw            m19, m16, m%2
    pmulhrsw            m20, m16, m%3
    pmulhrsw            m21, m16, m%4
    REPX  {pmulhrsw x, m17}, m18, m19, m20, m21  ; m17 = pw_16384 rounding factor
    paddsw              m%1, m18
    paddsw              m%2, m19
    paddsw              m%3, m20
    paddsw              m%4, m21
%endmacro
4146
; Store helper for identity 16x32: gathers four 16-pixel destination rows
; (stride r3, plus r4 = 3*stride for the last quarter) into one zmm, adds the
; two transformed registers m%1/m%2 split into low/high bytes, saturates and
; writes back. Also zeroes the consumed cq rows with m18 (caller sets m18 = 0).
; Advances dstq except after the final (%1 == 7) invocation.
%macro IDTX_16x32_STORE 2 ; src[1-2]
    mova               xm17, [dstq+r3*0]
    vinserti128        ym17, [dstq+r3*4], 1
    vinserti32x4        m17, [dstq+r3*8], 2
    vinserti32x4        m17, [dstq+r4*8], 3
    mova   [cq+64*(%1*2+0)], m18
    mova   [cq+64*(%1*2+1)], m18
    punpcklbw           m16, m17, m18       ; zero-extend bytes to words
    punpckhbw           m17, m18
    paddw               m16, m%1
    paddw               m17, m%2
    packuswb            m16, m17
    mova          [dstq+r3*0], xm16
    vextracti128  [dstq+r3*4], ym16, 1
    vextracti32x4 [dstq+r3*8], m16, 2
    vextracti32x4 [dstq+r4*8], m16, 3
%if %1 != 7
    add                dstq, strideq
%endif
%endmacro
4167
; 16x32 identity+identity transform with add, 8 bpc. Both passes are scalar
; multiplies, so the whole transform is: scale, identity multiplier
; (IDTX_16x32), transpose with a pw_8192 (rounded >>2) rescale, then
; add-to-destination stores that also clear the coefficient buffer.
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
    vpbroadcastd        m15, [pw_2896x8]
    vpbroadcastd        m16, [pw_1697x16]
    vpbroadcastd        m17, [pw_16384]
    IDTX_16x32            0,  1,  2,  3
    IDTX_16x32            4,  5,  6,  7
    IDTX_16x32            8,  9, 10, 11
    IDTX_16x32           12, 13, 14, 15
    vpbroadcastd        m16, [pw_8192]
    call .transpose_2x8x8_round
    lea                  r3, [strideq*2]
    lea                  r4, [strideq*3]
    pxor                m18, m18            ; zero register for cq clearing / unpack
    IDTX_16x32_STORE      0,  8
    IDTX_16x32_STORE      1,  9
    IDTX_16x32_STORE      2, 10
    IDTX_16x32_STORE      3, 11
    IDTX_16x32_STORE      4, 12
    IDTX_16x32_STORE      5, 13
    IDTX_16x32_STORE      6, 14
    IDTX_16x32_STORE      7, 15
    RET
ALIGN function_align
; Two independent 8x8 word transposes (m0-m7 and m8-m15) with a rounding
; rescale by m16 folded in after the dword stage of each half. Uses m17 as
; the only extra scratch register. Word -> dword -> qword unpack sequence.
.transpose_2x8x8_round:
    punpckhwd           m17, m4, m5
    punpcklwd            m4, m5
    punpckhwd            m5, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m6, m7
    punpcklwd            m6, m7
    punpckhwd            m7, m2, m3
    punpcklwd            m2, m3
    punpckhdq            m3, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m4, m6
    punpckhdq            m4, m6
    punpckhdq            m6, m5, m7
    punpckldq            m5, m7
    punpckldq            m7, m17, m1
    punpckhdq           m17, m1
    REPX  {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    punpcklqdq           m4, m5, m7
    punpckhqdq           m5, m7
    punpckhqdq           m7, m6, m17
    punpcklqdq           m6, m17
    ; Second 8x8 block, same pattern on m8-m15.
    punpckhwd           m17, m12, m13
    punpcklwd           m12, m13
    punpckhwd           m13, m8, m9
    punpcklwd            m8, m9
    punpckhwd            m9, m14, m15
    punpcklwd           m14, m15
    punpckhwd           m15, m10, m11
    punpcklwd           m10, m11
    punpckhdq           m11, m8, m10
    punpckldq            m8, m10
    punpckldq           m10, m12, m14
    punpckhdq           m12, m14
    punpckhdq           m14, m13, m15
    punpckldq           m13, m15
    punpckldq           m15, m17, m9
    punpckhdq           m17, m9
    REPX  {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
    punpckhqdq           m9, m8, m10
    punpcklqdq           m8, m10
    punpcklqdq          m10, m11, m12
    punpckhqdq          m11, m12
    punpcklqdq          m12, m13, m15
    punpckhqdq          m13, m15
    punpckhqdq          m15, m14, m17
    punpcklqdq          m14, m17
    ret
4243
; Identity-transform round for two 32-wide rows (each split in 32-byte
; halves at cq+32*(n+0)/cq+32*(n+16)). Scales by m12 (pw_2896x8), doubles,
; recombines halves via the m14/m16 permute tables, then applies the
; identity-32 multiplier: x = 2*x + round(x * pw_1697x16).
; Note: m14 doubles as a permute table, so the %if %3 != 14 guard skips the
; copy on the last invocation where m14 itself is the destination.
%macro IDTX_32x16 4 ; dst[1-4]
    pmulhrsw            m%2, m12, [cq+32*(%1+ 0)]
    pmulhrsw            m18, m12, [cq+32*(%1+16)]
    pmulhrsw            m%4, m12, [cq+32*(%3+ 0)]
    pmulhrsw            m19, m12, [cq+32*(%3+16)]
    REPX      {paddsw x, x}, m%2, m18, m%4, m19
    mova                m%1, m14
    vpermi2q            m%1, m%2, m18
    vpermt2q            m%2, m16, m18
%if %3 != 14
    mova                m%3, m14
%endif
    vpermi2q            m%3, m%4, m19
    vpermt2q            m%4, m16, m19
    pmulhrsw            m18, m17, m%1
    pmulhrsw            m19, m17, m%2
    pmulhrsw            m20, m17, m%3
    pmulhrsw            m21, m17, m%4
    REPX      {paddsw x, x}, m%1, m%2, m%3, m%4
    paddsw              m%1, m18
    paddsw              m%2, m19
    paddsw              m%3, m20
    paddsw              m%4, m21
%endmacro
4268
; Store helper for identity 32x16 (and reused for 32x32 when %3 is set):
; loads two 32-pixel destination rows, adds m%1/m%2 split into low/high
; bytes, saturates and writes back. When %3 == 0 also clears the consumed
; cq rows with m20 (caller sets m20 = 0). Advances dstq except after the
; final row in 32x16 mode.
%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
    mova               ym19, [dstq+strideq*0]
    vinserti32x8        m19, [dstq+strideq*8], 1
%if %3 == 0
    mova   [cq+64*(%1*2+0)], m20
    mova   [cq+64*(%1*2+1)], m20
%endif
    punpcklbw           m18, m19, m20       ; zero-extend bytes to words
    punpckhbw           m19, m20
    paddw               m18, m%1
    paddw               m19, m%2
    packuswb            m18, m19
    mova          [dstq+strideq*0], ym18
    vextracti32x8 [dstq+strideq*8], m18, 1
%if %3 || %1 != 7
    add                dstq, strideq
%endif
%endmacro
4287
; 32x16 identity+identity transform with add, 8 bpc.
; m14 and m16 (= m14 >> 4, a shifted view of the same permB-derived table)
; are the qword-permute tables used by IDTX_32x16; m16 is then repurposed
; as the pw_2048 (rounded >>4) factor for the shared transpose once the
; IDTX rounds are done.
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
    vpbroadcastd        m12, [pw_2896x8]
    movu                m14, [permB+7]
    vpbroadcastd        m17, [pw_1697x16]
    psrlq               m16, m14, 4
    IDTX_32x16            0,  1,  2,  3
    IDTX_32x16            4,  5,  6,  7
    IDTX_32x16            8,  9, 10, 11
    IDTX_32x16           12, 13, 14, 15
    vpbroadcastd        m16, [pw_2048]
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    pxor                m20, m20            ; zero register for cq clearing / unpack
    IDTX_32x16_STORE      0,  8
    IDTX_32x16_STORE      1,  9
    IDTX_32x16_STORE      2, 10
    IDTX_32x16_STORE      3, 11
    IDTX_32x16_STORE      4, 12
    IDTX_32x16_STORE      5, 13
    IDTX_32x16_STORE      6, 14
    IDTX_32x16_STORE      7, 15
    RET
4309
; Final butterfly + store for one output-row pair of the 32x32 idct.
; %1 = register with the odd-half value, %2 = even-half source: a register
; index when < 8, otherwise a spilled row reloaded from cq (which is then
; zeroed; m0 is recycled as the zero register from %2 == 8 onward).
; Sum/difference are scaled by m12, added to the destination rows at
; dstq+%3 (top) and r3+%4 (mirrored bottom), packed and stored; m13 holds
; the post-packuswb lane-fix permute. Every 4th invocation steps dstq
; forward and r3 backward to walk the mirrored row pairs.
%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
    pmovzxbw            m10, [dstq+%3]
    pmovzxbw            m11, [r3  +%4]
%if %2 < 8
    paddsw               m8, m%2, m%1
    psubsw               m9, m%2, m%1
%else
    mova                 m9, [cq+64*(%2*2-16)]
    paddsw               m8, m9, m%1
    psubsw               m9, m%1
%endif
    pmulhrsw             m8, m12
    pmulhrsw             m9, m12
%if %2 >= 8
%if %2 == 8
    pxor                 m0, m0
%endif
    mova  [cq+64*(%2*2-16)], m0
    mova  [cq+64*(%2*2-15)], m0
%endif
    paddw                m8, m10
    paddw                m9, m11
    packuswb             m8, m9
    vpermq               m8, m13, m8
    mova          [dstq+%3], ym8
    vextracti32x8 [r3  +%4], m8, 1
%if %2 == 3 || %2 == 7 || %2 == 11
    add                dstq, r5
    sub                  r3, r5
%endif
%endmacro
4341
4342cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
4343%undef cmp
4344    lea                  r5, [o_base]
4345    test               eobd, eobd
4346    jz .dconly
4347    WIN64_SPILL_XMM      30
4348    cmp                eobd, 136
4349    jb .fast
4350    mova                 m5, [cq+64*20]
4351    mova                 m3, [cq+64*12]
4352    mova                 m1, [cq+64* 4]
4353    mova                 m7, [cq+64*28]
4354    mova                 m2, [cq+64* 8]
4355    mova                 m6, [cq+64*24]
4356    mova                 m0, [cq+64* 0]
4357    mova                 m4, [cq+64*16]
4358    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4359    mova                m14, [cq+64* 2]
4360    mova                m21, [cq+64*30]
4361    mova                m18, [cq+64*18]
4362    mova                m17, [cq+64*14]
4363    mova                m16, [cq+64*10]
4364    mova                m19, [cq+64*22]
4365    mova                m20, [cq+64*26]
4366    mova                m15, [cq+64* 6]
4367    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4368    mova         [cq+64* 0], m14
4369    mova         [cq+64* 2], m15
4370    mova         [cq+64* 4], m16
4371    mova         [cq+64* 6], m17
4372    mova         [cq+64* 8], m18
4373    mova         [cq+64*10], m19
4374    mova         [cq+64*12], m20
4375    mova         [cq+64*14], m21
4376    mova                m22, [cq+64* 1]
4377    mova                m21, [cq+64*31]
4378    mova                m14, [cq+64*17]
4379    mova                m29, [cq+64*15]
4380    mova                m26, [cq+64* 9]
4381    mova                m17, [cq+64*23]
4382    mova                m18, [cq+64*25]
4383    mova                m25, [cq+64* 7]
4384    mova                m24, [cq+64* 5]
4385    mova                m19, [cq+64*27]
4386    mova                m16, [cq+64*21]
4387    mova                m27, [cq+64*11]
4388    mova                m28, [cq+64*13]
4389    mova                m15, [cq+64*19]
4390    mova                m20, [cq+64*29]
4391    mova                m23, [cq+64* 3]
4392    call .main_oddhalf
4393    vpbroadcastd        m10, [o(pw_8192)]
4394    psubsw              m13, m0, m29 ; 31
4395    paddsw               m0, m29     ;  0
4396    psubsw              m29, m1, m28 ; 30
4397    paddsw               m1, m28     ;  1
4398    psubsw              m28, m2, m27 ; 29
4399    paddsw               m2, m27     ;  2
4400    psubsw              m27, m3, m26 ; 28
4401    paddsw               m3, m26     ;  3
4402    psubsw              m26, m4, m25 ; 27
4403    paddsw               m4, m25     ;  4
4404    psubsw              m25, m5, m24 ; 26
4405    paddsw               m5, m24     ;  5
4406    psubsw              m24, m6, m23 ; 25
4407    paddsw               m6, m23     ;  6
4408    psubsw              m23, m7, m22 ; 24
4409    paddsw               m7, m22     ;  7
4410    pxor                 m9, m9
4411    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
4412    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
4413    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
4414    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
4415    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
4416    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
4417    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
4418    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
4419    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
4420    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
4421    punpckhwd            m3, m23, m24
4422    punpcklwd           m23, m24
4423    punpckhwd           m24, m25, m26
4424    punpcklwd           m25, m26
4425    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
4426    punpckhwd           m26, m27, m28
4427    punpcklwd           m27, m28
4428    punpckhwd           m28, m29, m13
4429    punpcklwd           m29, m13
4430    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
4431    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
4432    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
4433    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
4434    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
4435    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
4436    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
4437    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
4438    punpckldq           m22, m5      ; e4 f4 g4 h5 e5 f5 g5 h5
4439    REPX  {pmulhrsw x, m10}, m0, m4, m8, m22
4440    punpckhdq           m13, m23, m25
4441    punpckldq           m23, m25
4442    punpckhdq           m25, m27, m29
4443    punpckldq           m27, m29
4444    REPX  {pmulhrsw x, m10}, m13, m23, m25, m27
4445    punpckhdq            m9, m3, m24
4446    punpckldq            m3, m24
4447    punpckhdq           m24, m26, m28
4448    punpckldq           m26, m28
4449    punpcklqdq           m5, m23, m27 ; d00 d08 d16 d24
4450    punpckhqdq          m23, m27      ; d01 d09 d17 d25
4451    punpckhqdq          m27, m13, m25 ; d03 d11 d19 d27
4452    punpcklqdq          m13, m25      ; d02 d10 d18 d26
4453    punpckhqdq          m25, m3, m26  ; d05 d13 d21 d29
4454    punpcklqdq           m3, m26      ; d04 d12 d20 d28
4455    punpckhqdq          m26, m9, m24  ; d07 d15 d23 d31
4456    punpcklqdq           m9, m24      ; d06 d14 d22 d30
4457    REPX  {pmulhrsw x, m10}, m25, m3, m26
4458    mova         [cq+64* 9], m23
4459    mova         [cq+64*11], m27
4460    mova         [cq+64*13], m25
4461    mova         [cq+64*15], m26
4462    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
4463    punpcklqdq           m8, m22      ; a04 a12 a20 a28
4464    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
4465    punpcklqdq           m0, m4       ; a00 a08 a16 a24
4466    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
4467    punpcklqdq           m7, m2       ; a02 a10 a18 a26
4468    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
4469    punpcklqdq           m6, m1       ; a06 a14 a22 a30
4470    mova                 m2, [cq+64* 0]
4471    mova                m11, [cq+64* 2]
4472    mova                m12, [cq+64* 4]
4473    mova                m29, [cq+64* 6]
4474    mova                m27, [cq+64* 8]
4475    mova                m26, [cq+64*10]
4476    mova                 m4, [cq+64*12]
4477    mova                m28, [cq+64*14]
4478    psubsw               m1, m2, m21  ; 23
4479    paddsw               m2, m21      ;  8
4480    psubsw              m21, m11, m20 ; 22
4481    paddsw              m11, m20      ;  9
4482    psubsw              m20, m12, m19 ; 21
4483    paddsw              m12, m19      ; 10
4484    psubsw              m19, m29, m18 ; 20
4485    paddsw              m29, m18      ; 11
4486    psubsw              m18, m27, m17 ; 19
4487    paddsw              m27, m17      ; 12
4488    psubsw              m17, m26, m16 ; 18
4489    paddsw              m26, m16      ; 13
4490    paddsw              m16, m4, m15  ; 14
4491    psubsw               m4, m15      ; 17
4492    pmulhrsw            m15, m6, m10
4493    psubsw               m6, m28, m14 ; 16
4494    paddsw              m28, m14      ; 15
4495    pmulhrsw            m14, m7, m10
4496    punpcklwd            m7, m6, m4
4497    punpckhwd            m6, m4
4498    punpckhwd            m4, m17, m18
4499    punpcklwd           m17, m18
4500    punpckhwd           m18, m19, m20
4501    punpcklwd           m19, m20
4502    punpckhwd           m20, m21, m1
4503    punpcklwd           m21, m1
4504    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
4505    punpcklwd            m2, m11      ; i0 j1 i1 j1 i2 j2 i3 j3
4506    punpckhwd           m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
4507    punpcklwd           m12, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
4508    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
4509    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
4510    punpckhwd           m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
4511    punpcklwd           m16, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
4512    pmulhrsw            m23, m10
4513    pmulhrsw            m25, m10
4514    punpckhdq           m28, m2, m12  ; i2 j2 k2 l2 i3 j3 k3 l3
4515    punpckldq            m2, m12      ; i0 j0 k0 l0 i1 j1 k1 l1
4516    punpckhdq           m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
4517    punpckldq           m27, m16      ; m0 n0 o0 p0 m1 n1 o1 p1
4518    REPX  {pmulhrsw x, m10}, m28, m2, m12, m27
4519    punpckhdq           m16, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
4520    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
4521    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
4522    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
4523    REPX  {pmulhrsw x, m10}, m16, m1, m11, m29
4524    punpckhdq           m26, m19, m21
4525    punpckldq           m19, m21
4526    punpckhdq           m21, m6, m4
4527    punpckldq            m6, m4
4528    REPX  {pmulhrsw x, m10}, m26, m19, m21, m6
4529    punpckhdq            m4, m18, m20
4530    punpckldq           m18, m20
4531    punpckhdq           m20, m7, m17
4532    punpckldq            m7, m17
4533    REPX  {pmulhrsw x, m10}, m4, m18, m20, m7
4534    punpcklqdq          m17, m28, m12 ; b02 b10 b18 b26
4535    punpckhqdq          m28, m12      ; b03 b11 b19 b27
4536    punpckhqdq          m12, m2, m27  ; b01 b09 b17 b25
4537    punpcklqdq           m2, m27      ; b00 b08 b16 b24
4538    punpckhqdq          m27, m1, m29  ; b05 b13 b21 b29
4539    punpcklqdq           m1, m29      ; b04 b12 b20 b28
4540    punpckhqdq          m29, m16, m11 ; b07 b15 b23 b31
4541    punpcklqdq          m16, m11      ; b06 b14 b22 b30
4542    mova         [cq+64* 1], m12
4543    mova         [cq+64* 3], m28
4544    mova         [cq+64* 5], m27
4545    mova         [cq+64* 7], m29
4546    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
4547    punpcklqdq          m20, m26      ; c02 c10 c18 c26
4548    punpckhqdq          m26, m7, m19  ; c01 c09 c17 c25
4549    punpcklqdq           m7, m19      ; c00 c08 c16 c24
4550    punpckhqdq          m28, m6, m18  ; c05 c13 c21 c29
4551    punpcklqdq           m6, m18      ; c04 c12 c20 c28
4552    punpckhqdq          m29, m21, m4  ; c07 c15 c23 c31
4553    punpcklqdq          m21, m4       ; c06 c14 c22 c30
4554    pmulhrsw            m19, m9, m10
4555    vshufi32x4           m4, m0, m2, q3232   ; a16 a24 b16 b24
4556    vinserti32x8         m0, ym2, 1          ; a00 a08 b00 b08
4557    vshufi32x4           m2, m7, m5, q3232   ; c16 c24 d16 d24
4558    vinserti32x8         m7, ym5, 1          ; c00 c08 d00 d08
4559    vshufi32x4           m5, m8, m1, q3232   ; a20 a28 b20 b28
4560    vinserti32x8         m1, m8, ym1, 1      ; a04 a12 b04 b12
4561    vshufi32x4           m8, m6, m3, q3232   ; c20 c28 d20 d28
4562    vinserti32x8         m6, ym3, 1          ; c04 c12 d04 d12
4563    vshufi32x4           m3, m1, m6, q3131   ; 12
4564    vshufi32x4           m1, m6, q2020       ;  4
4565    vshufi32x4           m6, m4, m2, q3131   ; 24
4566    vshufi32x4           m4, m2, q2020       ; 16
4567    vshufi32x4           m2, m0, m7, q3131   ;  8
4568    vshufi32x4           m0, m7, q2020       ;  0
4569    vshufi32x4           m7, m5, m8, q3131   ; 28
4570    vshufi32x4           m5, m8, q2020       ; 20
4571    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4572    vshufi32x4          m18, m14, m17, q3232 ; a18 a26 b18 b26
4573    vinserti32x8        m14, ym17, 1         ; a02 a10 b02 b10
4574    vshufi32x4          m17, m20, m13, q3232 ; c18 c26 d18 d26
4575    vinserti32x8        m20, ym13, 1         ; c02 c10 d02 d10
4576    vshufi32x4          m13, m21, m19, q3232 ; c22 c30 d22 d30
4577    vinserti32x8        m21, ym19, 1         ; c06 c14 d06 d14
4578    vshufi32x4          m19, m15, m16, q3232 ; a22 a30 b22 b30
4579    vinserti32x8        m15, ym16, 1         ; a06 a14 b06 b14
4580    vshufi32x4          m16, m14, m20, q3131 ; 10
4581    vshufi32x4          m14, m20, q2020      ;  2
4582    vshufi32x4          m20, m18, m17, q3131 ; 26
4583    vshufi32x4          m18, m17, q2020      ; 18
4584    vshufi32x4          m17, m15, m21, q3131 ; 14
4585    vshufi32x4          m15, m21, q2020      ;  6
4586    vshufi32x4          m21, m19, m13, q3131 ; 30
4587    vshufi32x4          m19, m13, q2020      ; 22
4588    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4589    mova         [cq+64* 0], m14
4590    mova         [cq+64* 2], m15
4591    mova         [cq+64* 4], m16
4592    mova         [cq+64* 6], m17
4593    mova         [cq+64* 8], m18
4594    mova         [cq+64*10], m19
4595    mova         [cq+64*12], m20
4596    mova         [cq+64*14], m21
4597    mova                m15, [cq+64* 1]
4598    mova                m16, [cq+64* 3]
4599    mova                m17, [cq+64* 5]
4600    mova                m19, [cq+64* 7]
4601    mova                m20, [cq+64* 9]
4602    mova                m21, [cq+64*11]
4603    mova                m13, [cq+64*13]
4604    mova                m18, [cq+64*15]
4605    vshufi32x4          m14, m22, m15, q3232 ; a17 a25 b17 b25
4606    vinserti32x8        m22, ym15, 1         ; a01 a09 b01 b09
4607    vshufi32x4          m15, m23, m16, q3232 ; a19 a27 b19 b27
4608    vinserti32x8        m23, ym16, 1         ; a03 a11 b03 b11
4609    vshufi32x4          m16, m24, m17, q3232 ; a21 a29 b21 b29
4610    vinserti32x8        m24, ym17, 1         ; a05 a13 b05 b13
4611    vshufi32x4          m17, m25, m19, q3232 ; a23 a31 b23 b31
4612    vinserti32x8        m25, ym19, 1         ; a07 a15 b07 b15
4613    vinserti32x8         m8, m26, ym20, 1    ; c01 c09 d01 d09
4614    vshufi32x4          m26, m20, q3232      ; c17 c25 d17 d25
4615    vinserti32x8         m9, m27, ym21, 1    ; c03 c11 d03 d11
4616    vshufi32x4          m27, m21, q3232      ; c19 c27 d19 d27
4617    vinserti32x8        m11, m28, ym13, 1    ; c05 c13 d05 d13
4618    vshufi32x4          m28, m13, q3232      ; c21 c29 d21 d29
4619    vinserti32x8        m12, m29, ym18, 1    ; c07 c15 d07 d15
4620    vshufi32x4          m29, m18, q3232      ; c23 c31 d23 d31
4621    vshufi32x4          m18, m14, m26, q3131 ; 25
4622    vshufi32x4          m14, m26, q2020      ; 17
4623    vshufi32x4          m19, m15, m27, q3131 ; 27
4624    vshufi32x4          m15, m27, q2020      ; 19
4625    vshufi32x4          m20, m16, m28, q3131 ; 29
4626    vshufi32x4          m16, m28, q2020      ; 21
4627    vshufi32x4          m21, m17, m29, q3131 ; 31
4628    vshufi32x4          m17, m29, q2020      ; 23
4629    vshufi32x4          m26, m22, m8, q3131  ;  9
4630    vshufi32x4          m22, m8, q2020       ;  1
4631    vshufi32x4          m27, m23, m9, q3131  ; 11
4632    vshufi32x4          m23, m9, q2020       ;  3
4633    vshufi32x4          m28, m24, m11, q3131 ; 13
4634    vshufi32x4          m24, m11, q2020      ;  5
4635    vshufi32x4          m29, m25, m12, q3131 ; 15
4636    vshufi32x4          m25, m12, q2020      ;  7
4637    call .main_oddhalf
4638    jmp .end
4639.fast: ; bottom/right halves are zero
4640    mova                m14, [o(dup16_perm)]
4641    pmovzxwd             m9,       [cq+64* 0]
4642    pmovzxwd             m6,       [cq+64* 8]
4643    vpermb               m8, m14,  [cq+64* 2]
4644    vpermb              ym0, ym14, [cq+64*14]
4645    vpermb              ym5, ym14, [cq+64*10]
4646    vpermb               m1, m14,  [cq+64* 6]
4647    vpermb               m7, m14,  [cq+64* 4]
4648    vpermb              ym3, ym14, [cq+64*12]
4649    pslld                m9, 16
4650    pslld                m6, 16
4651    call m(idct_16x16_internal_8bpc).main_fast
4652    vpermb              m21, m14,  [cq+64* 1]
4653    vpermb             ym17, ym14, [cq+64*15]
4654    vpermb             ym20, ym14, [cq+64* 9]
4655    vpermb              m15, m14,  [cq+64* 7]
4656    vpermb              m18, m14,  [cq+64* 5]
4657    vpermb             ym16, ym14, [cq+64*11]
4658    vpermb             ym19, ym14, [cq+64*13]
4659    vpermb              m14, m14,  [cq+64* 3]
4660    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4661    vpbroadcastd         m9, [o(pw_8192)]
4662    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
4663    vshufi32x4          m22, m14, m2, q2020 ;  1
4664    vshufi32x4          m24, m14, m2, q3131 ;  5
4665    vshufi32x4          m23, m17, m9, q2020 ;  3
4666    vshufi32x4          m25, m17, m9, q3131 ;  7
4667    vshufi32x4          m16, m5, m15, q2020 ; 10
4668    vshufi32x4          m17, m5, m15, q3131 ; 14
4669    vshufi32x4          m14, m1, m18, q2020 ;  2
4670    vshufi32x4          m15, m1, m18, q3131 ;  6
4671    vshufi32x4           m1, m0, m3, q3131  ;  4
4672    vshufi32x4           m0, m3, q2020      ;  0
4673    vshufi32x4           m3, m21, m4, q3131 ; 12
4674    vshufi32x4           m2, m21, m4, q2020 ;  8
4675    vshufi32x4          m26, m20, m6, q2020 ;  9
4676    vshufi32x4          m28, m20, m6, q3131 ; 13
4677    vshufi32x4          m27, m19, m7, q2020 ; 11
4678    vshufi32x4          m29, m19, m7, q3131 ; 15
4679    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
4680    mova         [cq+64* 0], m14
4681    mova         [cq+64* 2], m15
4682    mova         [cq+64* 4], m16
4683    mova         [cq+64* 6], m17
4684    mova         [cq+64* 8], m18
4685    mova         [cq+64*10], m19
4686    mova         [cq+64*12], m20
4687    mova         [cq+64*14], m21
4688    call .main_oddhalf_fast
4689.end:
4690    lea                  r4, [strideq*3]
4691    vpbroadcastd        m12, [o(pw_2048)]
4692    movshdup            m13, [o(permD)]
4693    lea                  r3, [dstq+r4*8]
4694    lea                  r5, [strideq+r4] ; stride*4
4695    add                  r3, r5           ; dst+stride*28
4696    IDCT_32x32_END       29,  0, strideq*0, r4
4697    IDCT_32x32_END       28,  1, strideq*1, strideq*2
4698    IDCT_32x32_END       27,  2, strideq*2, strideq*1
4699    IDCT_32x32_END       26,  3, r4       , strideq*0
4700    IDCT_32x32_END       25,  4, strideq*0, r4
4701    IDCT_32x32_END       24,  5, strideq*1, strideq*2
4702    IDCT_32x32_END       23,  6, strideq*2, strideq*1
4703    IDCT_32x32_END       22,  7, r4       , strideq*0
4704    IDCT_32x32_END       21,  8, strideq*0, r4
4705    IDCT_32x32_END       20,  9, strideq*1, strideq*2
4706    IDCT_32x32_END       19, 10, strideq*2, strideq*1
4707    IDCT_32x32_END       18, 11, r4       , strideq*0
4708    IDCT_32x32_END       17, 12, strideq*0, r4
4709    IDCT_32x32_END       16, 13, strideq*1, strideq*2
4710    IDCT_32x32_END       15, 14, strideq*2, strideq*1
4711    IDCT_32x32_END       14, 15, r4       , strideq*0
4712    RET
4713.dconly:
4714    movsx               r6d, word [cq]
4715    mov                [cq], eobd
4716    or                  r3d, 32
4717    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
ALIGN function_align
; Odd half of the 32-point inverse DCT for the case where only the top
; eighth of the coefficients can be nonzero: the only live odd-row inputs
; are m22 (row 1) and m23 (row 3); see the register/row comments at the
; call sites above.  Stage 1 collapses into single pmulhrsw multiplies by
; pre-doubled constants (pw_*x8), the stage-2/3 add/sub butterflies with
; zero partners collapse into plain copies, and the two stage-3 rotations
; that still matter are computed here explicitly with vpdpwssd dot
; products before jumping into the shared .main4 tail.
cglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
    vpbroadcastd        m21, [o(pw_4091x8)]
    vpbroadcastd         m8, [o(pw_201x8)]
    vpbroadcastd        m24, [o(pw_m601x8)]
    vpbroadcastd        m12, [o(pw_4052x8)]
    pmulhrsw            m21, m22 ; t31a
    pmulhrsw            m22, m8  ; t16a
    pmulhrsw            m24, m23 ; t23a
    pmulhrsw            m23, m12 ; t24a

    ; Rotate the (t16a, t31a) pair by the 799/4017 coefficient pair,
    ; producing t17a (m15) and t30a (m8); equivalent to the first
    ; ITX_MULSUB_2W in .main3.  m10 supplies the dword rounding bias for
    ; the >>12 (set up by the caller -- presumably 1 << 11; confirm).
    punpcklwd            m9, m22, m21
    punpckhwd            m8, m22, m21
    mova                m15, m10
    vpdpwssd            m15, m9, [o(pw_m4017_799)] {bcstd}
    mova                m17, m10
    vpdpwssd            m17, m8, [o(pw_m4017_799)] {bcstd}
    REPX      {psrad x, 12}, m15, m17
    packssdw            m15, m17
    mova                m17, m10
    vpdpwssd            m17, m8, [o(pw_799_4017)] {bcstd}
    mova                 m8, m10
    vpdpwssd             m8, m9, [o(pw_799_4017)] {bcstd}
    REPX      {psrad x, 12}, m17, m8
    packssdw             m8, m17

    ; Rotate the (t23a, t24a) pair by the 3406/2276 coefficient pair,
    ; producing t22a (m20) and t25a (m16); same scheme as above.
    punpcklwd            m9, m24, m23
    punpckhwd           m16, m24, m23
    mova                m20, m10
    vpdpwssd            m20, m9, [o(pw_m3406_m2276)] {bcstd}
    mova                m17, m10
    vpdpwssd            m17, m16, [o(pw_m3406_m2276)] {bcstd}
    REPX      {psrad x, 12}, m20, m17
    packssdw            m20, m17
    mova                m17, m10
    vpdpwssd            m17, m16, [o(pw_m2276_3406)] {bcstd}
    mova                m16, m10
    vpdpwssd            m16, m9, [o(pw_m2276_3406)] {bcstd}
    REPX      {psrad x, 12}, m17, m16
    packssdw            m16, m17

    ; With every other input zero the .main2/.main3 butterflies reduce to
    ; copies, so just replicate the eight live values into the register
    ; layout .main4 expects (e.g. t28a = t31a, t19a = t16a, ...).
    mova                m17, m21
    mova                m27, m15
    mova                m25, m20
    mova                m29, m8
    mova                m18, m22
    mova                m14, m24
    mova                m28, m16
    mova                m26, m23
    jmp .main4
; Odd half of the 32-point inverse DCT when only the top quarter of the
; coefficients can be nonzero: live odd-row inputs are m22/m23/m24/m25
; (rows 1/3/5/7).  Stage 1 collapses into single pmulhrsw multiplies by
; pre-doubled constants, and the stage-2 butterflies in .main2 would only
; copy these values (their add/sub partners are zero), so the results are
; replicated directly into the register layout .main3 expects and .main2
; is skipped.
cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    vpbroadcastd        m21, [o(pw_4091x8)]
    vpbroadcastd         m8, [o(pw_201x8)]
    vpbroadcastd        m18, [o(pw_m1380x8)]
    vpbroadcastd         m9, [o(pw_3857x8)]
    vpbroadcastd        m19, [o(pw_3973x8)]
    vpbroadcastd        m11, [o(pw_995x8)]
    vpbroadcastd        m28, [o(pw_m601x8)]
    vpbroadcastd        m12, [o(pw_4052x8)]
    pmulhrsw            m21, m22 ; t31a
    pmulhrsw            m22, m8  ; t16a
    pmulhrsw            m18, m25 ; t19a
    pmulhrsw            m25, m9 ; t28a
    pmulhrsw            m19, m24 ; t27a
    pmulhrsw            m24, m11 ; t20a
    pmulhrsw            m28, m23 ; t23a
    pmulhrsw            m23, m12 ; t24a
    ; Degenerate .main2 butterflies: t17 = t16a, t30 = t31a, etc.
    mova                m15, m21
    mova                 m8, m22
    mova                m14, m18
    mova                m27, m25
    mova                m29, m19
    mova                m26, m24
    mova                m16, m28
    mova                m20, m23
    jmp .main3
ALIGN function_align
; Odd half of the 32-point inverse DCT when the bottom half of the
; coefficients is zero: all eight odd-row inputs (rows 1,3,...,15 in
; m22-m29) are live, but each one's stage-1 rotation partner is zero, so
; every rotation collapses into two independent pmulhrsw multiplies by a
; pre-doubled constant (pw_*x8).  The constant loads are interleaved with
; the multiplies.  Continues with the full stage-2 path at .main2.
cglobal_label .main_oddhalf_fast ; bottom half is zero
    vpbroadcastd        m21, [o(pw_4091x8)]
    vpbroadcastd         m8, [o(pw_201x8)]
    vpbroadcastd        m14, [o(pw_m2751x8)]
    vpbroadcastd         m9, [o(pw_3035x8)]
    vpbroadcastd        m17, [o(pw_3703x8)]
    vpbroadcastd        m11, [o(pw_1751x8)]
    vpbroadcastd        m18, [o(pw_m1380x8)]
    vpbroadcastd        m12, [o(pw_3857x8)]
    pmulhrsw            m21, m22 ; t31a
    vpbroadcastd        m19, [o(pw_3973x8)]
    pmulhrsw            m22, m8  ; t16a
    vpbroadcastd         m8, [o(pw_995x8)]
    pmulhrsw            m14, m29 ; t30a
    vpbroadcastd        m16, [o(pw_m2106x8)]
    pmulhrsw            m29, m9  ; t17a
    vpbroadcastd         m9, [o(pw_3513x8)]
    pmulhrsw            m17, m26 ; t29a
    vpbroadcastd        m15, [o(pw_3290x8)]
    pmulhrsw            m26, m11 ; t18a
    vpbroadcastd        m11, [o(pw_2440x8)]
    pmulhrsw            m18, m25 ; t19a
    vpbroadcastd        m20, [o(pw_m601x8)]
    pmulhrsw            m25, m12 ; t28a
    vpbroadcastd        m12, [o(pw_4052x8)]
    pmulhrsw            m19, m24 ; t27a
    pmulhrsw            m24, m8  ; t20a
    pmulhrsw            m16, m27 ; t21a
    pmulhrsw            m27, m9  ; t26a
    pmulhrsw            m15, m28 ; t25a
    pmulhrsw            m28, m11 ; t22a
    pmulhrsw            m20, m23 ; t23a
    pmulhrsw            m23, m12 ; t24a
    jmp .main2
ALIGN function_align
; Full odd half of the 32-point inverse DCT.  The sixteen odd-indexed
; input rows arrive in m14-m29 (see the row comments at the call sites
; above, e.g. m22 = row 1, m23 = row 3, ... m21 = row 31); m10 is the
; rounding term passed to ITX_MULSUB_2W (set by the caller).  The partial
; fast paths above jump into the intermediate labels .main2/.main3/.main4.
cglobal_label .main_oddhalf
    ; stage 1: rotate the eight input pairs into t16a..t31a
    ITX_MULSUB_2W        22, 21,  8,  9, 10,  201, 4091 ; t16a, t31a
    ITX_MULSUB_2W        14, 29,  8,  9, 10, 3035, 2751 ; t17a, t30a
    ITX_MULSUB_2W        26, 17,  8,  9, 10, 1751, 3703 ; t18a, t29a
    ITX_MULSUB_2W        18, 25,  8,  9, 10, 3857, 1380 ; t19a, t28a
    ITX_MULSUB_2W        24, 19,  8,  9, 10,  995, 3973 ; t20a, t27a
    ITX_MULSUB_2W        16, 27,  8,  9, 10, 3513, 2106 ; t21a, t26a
    ITX_MULSUB_2W        28, 15,  8,  9, 10, 2440, 3290 ; t22a, t25a
    ITX_MULSUB_2W        20, 23,  8,  9, 10, 4052,  601 ; t23a, t24a
.main2: ; stage 2: add/sub butterflies on adjacent t16a..t31a pairs
    psubsw               m8, m22, m14 ; t17
    paddsw              m22, m14      ; t16
    paddsw              m14, m18, m26 ; t19
    psubsw              m18, m26      ; t18
    psubsw              m26, m24, m16 ; t21
    paddsw              m24, m16      ; t20
    psubsw              m16, m20, m28 ; t22
    paddsw              m28, m20      ; t23
    psubsw              m20, m23, m15 ; t25
    paddsw              m23, m15      ; t24
    psubsw              m15, m21, m29 ; t30
    paddsw              m21, m29      ; t31
    psubsw              m29, m19, m27 ; t26
    paddsw              m19, m27      ; t27
    paddsw              m27, m25, m17 ; t28
    psubsw              m25, m17      ; t29
.main3: ; stage 3: rotate the inner pairs, then another butterfly layer
    ITX_MULSUB_2W        15,  8,  9, 17, 10,   799, 4017 ; t17a, t30a
    ITX_MULSUB_2W        25, 18,  9, 17, 10, m4017,  799 ; t18a, t29a
    ITX_MULSUB_2W        29, 26,  9, 17, 10,  3406, 2276 ; t21a, t26a
    ITX_MULSUB_2W        20, 16,  9, 17, 10, m2276, 3406 ; t22a, t25a
    psubsw              m17, m21, m27 ; t28a
    paddsw              m21, m27      ; t31a
    psubsw              m27, m15, m25 ; t18
    paddsw              m15, m25      ; t17
    psubsw              m25, m20, m29 ; t21
    paddsw              m20, m29      ; t22
    psubsw              m29, m8, m18  ; t29
    paddsw               m8, m18      ; t30
    psubsw              m18, m22, m14 ; t19a
    paddsw              m22, m14      ; t16a
    psubsw              m14, m28, m24 ; t20a
    paddsw              m24, m28      ; t23a
    paddsw              m28, m16, m26 ; t25
    psubsw              m16, m26      ; t26
    psubsw              m26, m23, m19 ; t27a
    paddsw              m23, m19      ; t24a
.main4: ; stages 4+: 1567/3784 rotations, butterflies, final 2896 rotations
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    vpbroadcastd        m11, [o(pw_1567_3784)]
    ITX_MULSUB_2W        29, 27,  9, 19, 10, 11, 12 ; t18a, t29a
    ITX_MULSUB_2W        17, 18,  9, 19, 10, 11, 12 ; t19,  t28
    vpbroadcastd        m11, [o(pw_m1567_m3784)]
    ITX_MULSUB_2W        16, 25,  9, 19, 10, 12, 11 ; t21a, t26a
    ITX_MULSUB_2W        26, 14,  9, 19, 10, 12, 11 ; t20,  t27
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    vpbroadcastd        m11, [o(pw_2896_2896)]
    psubsw              m19, m27, m25 ; t26
    paddsw              m27, m25      ; t29
    psubsw              m25, m17, m26 ; t20a
    paddsw              m17, m26      ; t19a
    paddsw              m26, m18, m14 ; t28a
    psubsw              m18, m14      ; t27a
    paddsw              m14, m22, m24 ; t16
    psubsw              m22, m24      ; t23
    psubsw              m24, m29, m16 ; t21
    paddsw              m16, m29      ; t18
    paddsw              m29, m21, m23 ; t31
    psubsw              m21, m23      ; t24
    psubsw              m23, m15, m20 ; t22a
    paddsw              m15, m20      ; t17a
    psubsw              m20, m8, m28  ; t25a
    paddsw              m28, m8       ; t30a
    ; final 2896-based rotations of the middle terms; the odd-half outputs
    ; are left spread across m14-m29 as annotated on each instruction
    ITX_MULSUB_2W        18, 25,  8,  9, 10, 11, 12 ; t20,  t27
    ITX_MULSUB_2W        19, 24,  8,  9, 10, 11, 12 ; t21a, t26a
    ITX_MULSUB_2W        21, 22,  8,  9, 10, 11, 12 ; t23a, t24a
    ITX_MULSUB_2W        20, 23,  8,  9, 10, 11, 12 ; t22,  t25
    ret
4908
; Load two rows (%1 and %2) of coefficients for the 32x32 identity
; transform.  Each row's data is split into two ymm halves stored 16
; registers (16*64 bytes) apart in cq; vpermt2q combines the two halves
; into one zmm, with the exact qword ordering given by the table in m21
; (loaded from permB+7 by the caller).
%macro IDTX_32x32 2 ; dst[1-2]
    vmovdqa32           ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
    vmovdqa32           ym17, [cq+64*(%1+16)] ; reduces code size due to
    vmovdqa32           ym%2, [cq+64*(%2+ 0)] ; compressed displacements
    vmovdqa32           ym18, [cq+64*(%2+16)]
    vpermt2q             m%1, m21, m17
    vpermt2q             m%2, m21, m18
%endmacro
4917
; 32x32 identity "transform": no butterflies, just load/permute, a
; transpose+round in a shared helper, and stores back to dst.  The block
; is processed as two 32x16 halves, one per pass of .loop, after which the
; whole coefficient buffer is cleared.
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
    movu                 m21, [permB+7] ; qword-merge table used by IDTX_32x32
    vpbroadcastd         m16, [pw_8192] ; NOTE(review): not referenced directly
                                        ; below; presumably the rounding factor
                                        ; consumed by the callees -- confirm
    pxor                 m20, m20       ; zero, for stores / clearing cq
.loop:
    IDTX_32x32            0,  1
    IDTX_32x32            2,  3
    IDTX_32x32            4,  5
    IDTX_32x32            6,  7
    IDTX_32x32            8,  9
    IDTX_32x32           10, 11
    IDTX_32x32           12, 13
    IDTX_32x32           14, 15
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    IDTX_32x16_STORE      0,  8, 1
    IDTX_32x16_STORE      1,  9, 1
    IDTX_32x16_STORE      2, 10, 1
    IDTX_32x16_STORE      3, 11, 1
    IDTX_32x16_STORE      4, 12, 1
    IDTX_32x16_STORE      5, 13, 1
    IDTX_32x16_STORE      6, 14, 1
    IDTX_32x16_STORE      7, 15, 1
    lea                dstq, [dstq+strideq*8]
    btc                  cq, 5 ; toggle bit 5 (byte offset 32) of the coef
                               ; pointer so pass 2 reads the other ymm half of
                               ; each 64-byte row; CF = previous bit value:
                               ; 0 on pass 1 (cq assumed 64-byte aligned --
                               ; confirm), 1 on pass 2, ending the loop
    jnc .loop
    mov                 r0d, 8
.zero_loop: ; clear 8 * 4 * 64 = 2048 bytes = the full 32x32 int16 buffer
    mova          [cq+64*0], m20
    mova          [cq+64*1], m20
    mova          [cq+64*2], m20
    mova          [cq+64*3], m20
    add                  cq, 64*4
    dec                 r0d
    jg .zero_loop
    RET
4953
4954cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
4955%undef cmp
4956    lea                  r5, [o_base]
4957    test               eobd, eobd
4958    jz .dconly
4959    WIN64_SPILL_XMM      30
4960    cmp                eobd, 151
4961    jb .fast
4962    mova                 m5, [cq+64*10]
4963    mova                 m3, [cq+64* 6]
4964    mova                 m1, [cq+64* 2]
4965    mova                 m7, [cq+64*14]
4966    mova                 m2, [cq+64* 4]
4967    mova                 m6, [cq+64*12]
4968    mova                 m0, [cq+64* 0]
4969    mova                 m4, [cq+64* 8]
4970    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4971    mova                m14, [cq+64* 1]
4972    mova                m21, [cq+64*15]
4973    mova                m18, [cq+64* 9]
4974    mova                m17, [cq+64* 7]
4975    mova                m16, [cq+64* 5]
4976    mova                m19, [cq+64*11]
4977    mova                m20, [cq+64*13]
4978    mova                m15, [cq+64* 3]
4979    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4980    vpbroadcastd         m9, [o(pw_8192)]
4981%macro TRANSPOSE_8x4_ROUND 4
4982    punpckhwd            m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
4983    punpcklwd           m%3, m%4      ; c0 d0 c1 d1 c2 d2 c3 d3
4984    punpckhwd           m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
4985    punpcklwd           m%1, m%2      ; a0 b0 a1 b1 a2 b2 a3 b3
4986    punpckhdq           m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
4987    punpckldq           m%1, m%3      ; a0 b0 c0 d0 a1 b1 c1 d1
4988    punpckldq           m%3, m%4, m8  ; a4 b4 c4 d4 a5 b5 c5 d5
4989    punpckhdq           m%4, m8       ; a6 b6 c6 d6 a7 b7 c7 d7
4990    REPX   {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
4991%endmacro
4992    TRANSPOSE_8x4_ROUND   0,  1,  2,  3
4993    TRANSPOSE_8x4_ROUND   4,  5,  6,  7
4994    TRANSPOSE_8x4_ROUND  14, 15, 16, 17
4995    TRANSPOSE_8x4_ROUND  18, 19, 20, 21
4996    vinserti32x8        m26, m0, ym4, 1     ; a0  a4  b0  b4
4997    vshufi32x4           m0, m4, q3232      ; a8  a12 b8  b12
4998    vinserti32x8        m27, m1, ym5, 1     ; a1  a5  b1  b5
4999    vshufi32x4           m1, m5, q3232      ; a9  a13 b9  b13
5000    vinserti32x8        m28, m2, ym6, 1     ; a2  a6  b2  b6
5001    vshufi32x4           m2, m6, q3232      ; a10 a14 b10 b14
5002    vinserti32x8        m29, m3, ym7, 1     ; a3  a7  b3  b7
5003    vshufi32x4           m8, m3, m7, q3232  ; a11 a15 b11 b15
5004    vinserti32x8         m4, m14, ym18, 1   ; c0  c4  d0  d4
5005    vshufi32x4          m14, m18, q3232     ; c8  c12 d8  d12
5006    vinserti32x8         m5, m15, ym19, 1   ; c1  c5  d1  d5
5007    vshufi32x4          m15, m19, q3232     ; c9  c13 d9  d13
5008    vinserti32x8         m6, m16, ym20, 1   ; c2  c6  d2  d6
5009    vshufi32x4          m16, m20, q3232     ; c10 c14 d10 d14
5010    vinserti32x8         m7, m17, ym21, 1   ; c3  c7  d3  d7
5011    vshufi32x4          m17, m21, q3232     ; c11 c15 d11 d15
5012    vshufi32x4          m22, m26, m4, q2020 ;  0  1
5013    vshufi32x4          m26, m4, q3131      ;  8  9
5014    vshufi32x4          m23, m27, m5, q2020 ;  2  3
5015    vshufi32x4          m27, m5, q3131      ; 10 11
5016    vshufi32x4          m24, m28, m6, q2020 ;  4  5
5017    vshufi32x4          m28, m6, q3131      ; 12 13
5018    vshufi32x4          m25, m29, m7, q2020 ;  6  7
5019    vshufi32x4          m29, m7, q3131      ; 14 15
5020    vshufi32x4           m4, m0, m14, q2020 ; 16 17
5021    vshufi32x4           m3, m0, m14, q3131 ; 24 25
5022    vshufi32x4          m20, m1, m15, q2020 ; 18 19
5023    vshufi32x4          m19, m1, m15, q3131 ; 26 27
5024    vshufi32x4           m5, m2, m16, q2020 ; 20 21
5025    vshufi32x4           m0, m2, m16, q3131 ; 28 29
5026    vshufi32x4          m16, m8, m17, q2020 ; 22 23
5027    vshufi32x4          m17, m8, m17, q3131 ; 30 31
5028    pxor                 m6, m6
5029    mova         [cq+64* 0], m4
5030    mova         [cq+64* 2], m5
5031    mova         [cq+64* 4], m3
5032    mova         [cq+64* 6], m0
5033    punpcklwd            m8, m24, m24 ;  4
5034    punpcklwd            m0, m0       ; 28
5035    punpcklwd            m5, m5       ; 20
5036    punpcklwd            m1, m28, m28 ; 12
5037    punpcklwd            m7, m26, m26 ;  8
5038    punpcklwd            m3, m3       ; 24
5039    punpcklwd            m9, m6, m22  ; __  0
5040    punpcklwd            m6, m4       ; __ 16
5041    call m(idct_16x16_internal_8bpc).main_fast3
5042    mova         [cq+64* 1], m20
5043    mova         [cq+64* 3], m16
5044    mova         [cq+64* 5], m19
5045    mova         [cq+64* 7], m17
5046    punpcklwd           m21, m23, m23 ;  2
5047    punpcklwd           m17, m17      ; 30
5048    punpcklwd           m20, m20      ; 18
5049    punpcklwd           m15, m29, m29 ; 14
5050    punpcklwd           m18, m27, m27 ; 10
5051    punpcklwd           m16, m16      ; 22
5052    punpcklwd           m19, m19      ; 26
5053    punpcklwd           m14, m25, m25 ;  6
5054    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5055    mova         [cq+64* 8], m14
5056    mova         [cq+64* 9], m15
5057    mova         [cq+64*10], m16
5058    mova         [cq+64*11], m17
5059    mova         [cq+64*12], m18
5060    mova         [cq+64*13], m19
5061    mova         [cq+64*14], m20
5062    mova         [cq+64*15], m21
5063    mova                m21, [cq+64* 7]
5064    mova                m14, [cq+64* 0]
5065    mova                m17, [cq+64* 3]
5066    mova                m18, [cq+64* 4]
5067    mova                m19, [cq+64* 5]
5068    mova                m16, [cq+64* 2]
5069    mova                m15, [cq+64* 1]
5070    mova                m20, [cq+64* 6]
5071    REPX   {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
5072                             m24, m19, m16, m27, m28, m15, m20, m23
5073    call .main_oddhalf
5074    jmp .end
5075.fast: ; right half is zero
5076    mova                ym8, [cq+64*15]
5077    vinserti32x8         m8, [cq+64* 1], 1
5078    mova                 m2, [o(int16_perm)]
5079    mova                ym9, [cq+64* 8]
5080    vinserti32x8         m9, [cq+64* 0], 1
5081    mova                ym0, [cq+64* 7]
5082    vinserti32x8         m0, [cq+64* 9], 1
5083    mova                ym7, [cq+64*14]
5084    vinserti32x8         m7, [cq+64* 2], 1
5085    mova                ym1, [cq+64* 3]
5086    vinserti32x8         m1, [cq+64*13], 1
5087    mova                ym3, [cq+64* 6]
5088    vinserti32x8         m3, [cq+64*10], 1
5089    mova                ym5, [cq+64*11]
5090    vinserti32x8         m5, [cq+64* 5], 1
5091    mova                ym6, [cq+64*12]
5092    vinserti32x8         m6, [cq+64* 4], 1
5093    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
5094    call m(idct_16x16_internal_8bpc).main2
5095    vbroadcasti32x4      m8, [o(int_shuf3)]
5096    vbroadcasti32x4      m9, [o(int_shuf4)]
5097    vpbroadcastd        m11, [o(pw_8192)]
5098    pshufb               m0, m8
5099    pshufb               m1, m9
5100    pshufb               m2, m8
5101    pshufb               m3, m9
5102    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
5103    pshufb               m4, m8
5104    pshufb               m5, m9
5105    pshufb               m6, m8
5106    pshufb               m7, m9
5107    REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
5108    punpckhdq           m28, m0, m1
5109    punpckldq            m0, m1
5110    punpckhdq           m27, m2, m3
5111    punpckldq            m2, m3
5112    punpckhdq           m22, m4, m5
5113    punpckldq            m4, m5
5114    punpckhdq           m23, m6, m7
5115    punpckldq            m6, m7
5116    vinserti32x8        m14, m0, ym2, 1
5117    vshufi32x4          m15, m0, m2, q3232
5118    vinserti32x8         m2, m4, ym6, 1
5119    vshufi32x4           m4, m6, q3232
5120    vshufi32x4          m21, m14, m2, q2020 ;  0  2
5121    vshufi32x4          m14, m2, q3131      ;  4  6
5122    vshufi32x4          m18, m15, m4, q2020 ;  8 10
5123    vshufi32x4          m15, m4, q3131      ; 12 14
5124    pxor                 m9, m9
5125    punpcklwd            m8, m14, m14 ;  4
5126    punpcklwd            m1, m15, m15 ; 12
5127    punpcklwd            m7, m18, m18 ;  8
5128    punpcklwd            m9, m21      ; __  0
5129    call m(idct_16x16_internal_8bpc).main_fast4
5130    punpckhwd           m21, m21      ;  2
5131    punpckhwd           m15, m15      ; 14
5132    punpckhwd           m18, m18      ; 10
5133    punpckhwd           m14, m14      ;  6
5134    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
5135    vinserti32x8        m24, m28, ym27, 1
5136    vshufi32x4          m28, m27, q3232
5137    vinserti32x8        m27, m22, ym23, 1
5138    vshufi32x4          m22, m23, q3232
5139    vshufi32x4          m23, m24, m27, q2020 ;  1  3
5140    vshufi32x4          m24, m27, q3131      ;  5  7
5141    vshufi32x4          m27, m28, m22, q2020 ;  9 11
5142    vshufi32x4          m28, m22, q3131      ; 13 15
5143    punpcklwd           m22, m23, m23 ;  1
5144    punpckhwd           m29, m28, m28 ; 15
5145    punpcklwd           m26, m27, m27 ;  9
5146    punpckhwd           m25, m24, m24 ;  7
5147    mova         [cq+64* 8], m14
5148    mova         [cq+64* 9], m15
5149    mova         [cq+64*10], m16
5150    mova         [cq+64*11], m17
5151    punpcklwd           m24, m24      ;  5
5152    punpckhwd           m27, m27      ; 11
5153    punpcklwd           m28, m28      ; 13
5154    punpckhwd           m23, m23      ;  3
5155    mova         [cq+64*12], m18
5156    mova         [cq+64*13], m19
5157    mova         [cq+64*14], m20
5158    mova         [cq+64*15], m21
5159    call .main_oddhalf_fast
5160.end:
5161    imul                 r6, strideq, 60
5162    mova                m10, [o(end_16x32p)]
5163    vpbroadcastd        m11, [o(pw_2048)]
5164    lea                  r3, [strideq*3]
5165    pxor                m12, m12
5166    add                  r6, dstq         ; dst+stride*60
5167    psrldq              m13, m10, 1
5168    lea                  r4, [strideq+r3] ; stride*4
5169%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
5170%if %1 & 1
5171    %define %%s0 r3
5172    %define %%s1 strideq*2
5173    %define %%s2 strideq*1
5174    %define %%s3 strideq*0
5175%else
5176    %define %%s0 strideq*0
5177    %define %%s1 strideq*1
5178    %define %%s2 strideq*2
5179    %define %%s3 r3
5180%if %1
5181    add                dstq, r4
5182    sub                  r6, r4
5183%endif
5184%endif
5185%if %1 < 8
5186    pmulhrsw             m8, m11, m%1
5187    pmulhrsw             m9, m11, m%2
5188%else
5189    mova                 m9, [cq+64*%1]
5190    paddsw               m8, m9, m%2 ; out  0+n,  1+n
5191    psubsw               m9, m%2     ; out 63-n, 62-n
5192    pmulhrsw             m8, m11
5193    pmulhrsw             m9, m11
5194%endif
5195    mova               xm29, [dstq+%%s0]
5196    vinserti128        ym29, [dstq+%%s1], 1
5197    mova               xm%3, [r6  +%%s3]
5198    vinserti128        ym%3, [r6  +%%s2], 1
5199    vpermb              m29, m10, m29
5200    vpermb              m%3, m10, m%3
5201    mova         [cq+64*%1], m12
5202    paddw               m29, m8
5203    paddw               m%3, m9
5204    packuswb            m29, m%3
5205    vpermd              m29, m13, m29
5206    mova          [dstq+%%s0], xm29
5207    vextracti128  [dstq+%%s1], ym29, 1
5208    vextracti32x4 [r6  +%%s2], m29, 2
5209    vextracti32x4 [r6  +%%s3], m29, 3
5210%endmacro
5211    IDCT_16x64_END        0, 29,  0
5212    IDCT_16x64_END        1, 28, 28
5213    IDCT_16x64_END        2, 27, 28
5214    IDCT_16x64_END        3, 26, 28
5215    IDCT_16x64_END        4, 25, 28
5216    IDCT_16x64_END        5, 24, 28
5217    IDCT_16x64_END        6, 23, 28
5218    IDCT_16x64_END        7, 22, 28
5219    IDCT_16x64_END        8, 21, 28
5220    IDCT_16x64_END        9, 20, 28
5221    IDCT_16x64_END       10, 19, 28
5222    IDCT_16x64_END       11, 18, 28
5223    IDCT_16x64_END       12, 17, 28
5224    IDCT_16x64_END       13, 16, 28
5225    IDCT_16x64_END       14, 15, 28
5226    IDCT_16x64_END       15, 14, 28
5227    RET
5228.dconly:
5229    movsx               r6d, word [cq]
5230    mov                [cq], eobd
5231    or                  r3d, 64
5232    imul                r6d, 181
5233    add                 r6d, 128+512
5234    sar                 r6d, 8+2
5235    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
;-----------------------------------------------------------------------------
; Odd half of the 64-point inverse DCT stage (odd-index input coefficients
; 1, 3, ..., 63), on 16-bit coefficients packed two values per dword lane.
; In:   m14-m29 = odd-index input coefficients (exact mapping is given by
;       the per-line comments), m0-m7 = partial results from the even half
;       that are combined with the odd half at the end; m10 is a rounding
;       constant consumed by ITX_MUL2X_PACK (NOTE(review): inferred from
;       usage — confirm against the callers).
; Out:  m0-m7 = out0-out15 in packed pairs, m22-m29 = out48-out63 in packed
;       pairs (exact pairing per the trailing per-line comments).
;
; .main_oddhalf_fast is entered when only the top quarter of the input can
; be non-zero: each input rotation then reduces to a single pmulhrsw by a
; packed constant, and since the partner term of the first butterfly stage
; is zero, that add/sub stage degenerates into plain register copies before
; joining the common code at .main_oddhalf2.
;-----------------------------------------------------------------------------
ALIGN function_align
cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
    vpbroadcastd         m8, [o(pw_101_4095x8)]
    vpbroadcastd        m21, [o(pw_m1474_3822x8)]
    vpbroadcastd        m14, [o(pw_897_3996x8)]
    vpbroadcastd        m17, [o(pw_m700_4036x8)]
    vpbroadcastd        m18, [o(pw_501_4065x8)]
    vpbroadcastd        m19, [o(pw_m1092_3948x8)]
    vpbroadcastd        m16, [o(pw_1285_3889x8)]
    vpbroadcastd        m15, [o(pw_m301_4085x8)]
    pmulhrsw             m8, m22 ; t32a t63a
    pmulhrsw            m21, m29 ; t35a t60a
    pmulhrsw            m14, m26 ; t36a t59a
    pmulhrsw            m17, m25 ; t39a t56
    pmulhrsw            m18, m24 ; t40a t55a
    pmulhrsw            m19, m27 ; t43a t52a
    pmulhrsw            m16, m28 ; t44a t51a
    pmulhrsw            m15, m23 ; t47a t48a
    ; first butterfly stage collapses to copies (the other operand is zero)
    mova                m22, m8
    mova                m29, m21
    mova                m26, m14
    mova                m25, m17
    mova                m24, m18
    mova                m27, m19
    mova                m28, m16
    mova                m20, m15
    jmp .main_oddhalf2
ALIGN function_align
cglobal_label .main_oddhalf
    ; full input: scale all 16 packed odd coefficients by their cos/sin pair
    vpbroadcastd         m8, [o(pw_101_4095x8)]
    vpbroadcastd         m9, [o(pw_m2824_2967x8)]
    vpbroadcastd        m11, [o(pw_1660_3745x8)]
    vpbroadcastd        m12, [o(pw_m1474_3822x8)]
    pmulhrsw            m22, m8       ; t32a t63a
    vpbroadcastd         m8, [o(pw_897_3996x8)]
    pmulhrsw            m21, m9       ; t33a t62a
    vpbroadcastd         m9, [o(pw_m2191_3461x8)]
    pmulhrsw            m14, m11      ; t34a t61a
    vpbroadcastd        m11, [o(pw_2359_3349x8)]
    pmulhrsw            m29, m12      ; t35a t60a
    vpbroadcastd        m12, [o(pw_m700_4036x8)]
    pmulhrsw            m26, m8       ; t36a t59a
    vpbroadcastd         m8, [o(pw_501_4065x8)]
    pmulhrsw            m17, m9       ; t37a t58a
    vpbroadcastd         m9, [o(pw_m2520_3229x8)]
    pmulhrsw            m18, m11      ; t38a t57a
    vpbroadcastd        m11, [o(pw_2019_3564x8)]
    pmulhrsw            m25, m12      ; t39a t56a
    vpbroadcastd        m12, [o(pw_m1092_3948x8)]
    pmulhrsw            m24, m8       ; t40a t55a
    vpbroadcastd         m8, [o(pw_1285_3889x8)]
    pmulhrsw            m19, m9       ; t41a t54a
    vpbroadcastd         m9, [o(pw_m1842_3659x8)]
    pmulhrsw            m16, m11      ; t42a t53a
    vpbroadcastd        m11, [o(pw_2675_3102x8)]
    pmulhrsw            m27, m12      ; t43a t52a
    vpbroadcastd        m12, [o(pw_m301_4085x8)]
    pmulhrsw            m28, m8       ; t44a t51a
    pmulhrsw            m15, m9       ; t45a t50a
    pmulhrsw            m20, m11      ; t46a t49a
    pmulhrsw            m23, m12      ; t47a t48a
    ; first add/sub butterfly stage
    psubsw               m8, m22, m21 ; t33  t62
    paddsw              m22, m21      ; t32  t63
    psubsw              m21, m29, m14 ; t34  t61
    paddsw              m29, m14      ; t35  t60
    psubsw              m14, m26, m17 ; t37  t58
    paddsw              m26, m17      ; t36  t59
    psubsw              m17, m25, m18 ; t38  t57
    paddsw              m25, m18      ; t39  t56
    psubsw              m18, m24, m19 ; t41  t54
    paddsw              m24, m19      ; t40  t55
    psubsw              m19, m27, m16 ; t42  t53
    paddsw              m27, m16      ; t43  t52
    psubsw              m16, m28, m15 ; t45  t50
    paddsw              m28, m15      ; t44  t51
    psubsw              m15, m23, m20 ; t46  t49
    paddsw              m20, m23      ; t47  t48
.main_oddhalf2:
    ; common tail for both entry points: remaining rotation/butterfly stages
    ITX_MUL2X_PACK        8, 9, 23, 10,   401, 4076, 5 ; t33a t62a
    ITX_MUL2X_PACK       21, 9, 23, 10, m4076,  401, 5 ; t34a t61a
    ITX_MUL2X_PACK       14, 9, 23, 10,  3166, 2598, 5 ; t37a t58a
    ITX_MUL2X_PACK       17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
    ITX_MUL2X_PACK       18, 9, 23, 10,  1931, 3612, 5 ; t41a t54a
    ITX_MUL2X_PACK       19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
    ITX_MUL2X_PACK       16, 9, 23, 10,  3920, 1189, 5 ; t45a t50a
    ITX_MUL2X_PACK       15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
    vpbroadcastd        m11, [o(pw_m4017_799)]
    psubsw              m23, m25, m26 ; t36a t59a
    paddsw              m25, m26      ; t39a t56a
    psubsw              m26, m24, m27 ; t43a t52a
    paddsw              m27, m24      ; t40a t55a
    psubsw              m24, m20, m28 ; t44a t51a
    paddsw              m20, m28      ; t47a t48a
    psubsw              m28, m8, m21  ; t34  t61
    paddsw               m8, m21      ; t33  t62
    psubsw              m21, m17, m14 ; t37  t58
    paddsw              m17, m14      ; t38  t57
    psubsw              m14, m18, m19 ; t42  t53
    paddsw              m18, m19      ; t41  t54
    psubsw              m19, m15, m16 ; t45  t50
    paddsw              m15, m16      ; t46  t49
    psubsw              m16, m22, m29 ; t35a t60a
    paddsw              m22, m29      ; t32a t63a
    ITX_MUL2X_PACK       16, 9, 29, 10, 799_4017, 11,    20 ; t35  t60
    ITX_MUL2X_PACK       28, 9, 29, 10, 799_4017, 11,    20 ; t34a t61a
    ITX_MUL2X_PACK       23, 9, 29, 10, 11, m799_m4017,  36 ; t36  t59
    ITX_MUL2X_PACK       21, 9, 29, 10, 11, m799_m4017,  36 ; t37a t58a
    vpbroadcastd        m11, [o(pw_m2276_3406)]
    ITX_MUL2X_PACK       26, 9, 29, 10, 3406_2276, 11,   20 ; t43  t52
    ITX_MUL2X_PACK       14, 9, 29, 10, 3406_2276, 11,   20 ; t42a t53a
    ITX_MUL2X_PACK       24, 9, 29, 10, 11, m3406_m2276, 36 ; t44  t51
    ITX_MUL2X_PACK       19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    psubsw              m29, m22, m25 ; t39  t56
    paddsw              m22, m25      ; t32  t63
    psubsw              m25, m20, m27 ; t40  t55
    paddsw              m20, m27      ; t47  t48
    psubsw              m27, m8, m17  ; t38a t57a
    paddsw               m8, m17      ; t33a t62a
    psubsw              m17, m15, m18 ; t41a t54a
    paddsw              m15, m18      ; t46a t49a
    paddsw              m18, m16, m23 ; t35a t60a
    psubsw              m16, m23      ; t36a t59a
    psubsw              m23, m24, m26 ; t43a t52a
    paddsw              m24, m26      ; t44a t51a
    paddsw              m26, m28, m21 ; t34  t61
    psubsw              m28, m21      ; t37  t58
    psubsw              m21, m19, m14 ; t42  t53
    paddsw              m19, m14      ; t45  t50
    ITX_MUL2X_PACK       29, 9, 14, 10, 11, 12, 4 ; t39a t56a
    ITX_MUL2X_PACK       27, 9, 14, 10, 11, 12, 4 ; t38  t57
    ITX_MUL2X_PACK       16, 9, 14, 10, 11, 12, 4 ; t36  t59
    ITX_MUL2X_PACK       28, 9, 14, 10, 11, 12, 4 ; t37a t58a
    vpbroadcastd        m11, [o(pw_m1567_m3784)]
    ITX_MUL2X_PACK       25, 9, 14, 10, 12, 11, 4 ; t40a t55a
    ITX_MUL2X_PACK       17, 9, 14, 10, 12, 11, 4 ; t41  t54
    ITX_MUL2X_PACK       23, 9, 14, 10, 12, 11, 4 ; t43  t52
    ITX_MUL2X_PACK       21, 9, 14, 10, 12, 11, 4 ; t42a t53a
    ; final stage: +/-2896 rotations plus deinterleave of the packed pairs
    vbroadcasti32x4     m13, [o(deint_shuf)]
    vpbroadcastd        m11, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    paddsw              m14, m22, m20 ; t32a t63a
    psubsw              m22, m20      ; t47a t48a
    psubsw              m20, m8, m15  ; t46  t49
    paddsw               m8, m15      ; t33  t62
    paddsw              m15, m18, m24 ; t35  t60
    psubsw              m18, m24      ; t44  t51
    psubsw              m24, m26, m19 ; t45a t50a
    paddsw              m26, m19      ; t34a t61a
    REPX    {pshufb x, m13}, m14, m8, m15, m26
    psubsw              m19, m29, m25 ; t40  t55
    paddsw              m25, m29      ; t39  t56
    psubsw              m29, m27, m17 ; t41a t54a
    paddsw              m27, m17      ; t38a t57a
    psubsw              m17, m16, m23 ; t43a t52a
    paddsw              m16, m23      ; t36a t59a
    psubsw               m9, m28, m21 ; t42  t53
    paddsw              m28, m21      ; t37  t58
    REPX    {pshufb x, m13}, m25, m27, m16, m28
    ITX_MUL2X_PACK       22, 13, 21, 10, 11, 12, 8 ; t47  t48
    ITX_MUL2X_PACK       20, 23, 22, 10, 11, 12, 8 ; t46a t49a
    packssdw            m21, m22      ; t47  t46a
    packssdw            m13, m23      ; t48  t49a
    ITX_MUL2X_PACK       18, 22, 20, 10, 11, 12, 8 ; t44a t51a
    ITX_MUL2X_PACK       24, 23, 18, 10, 11, 12, 8 ; t45  t50
    packssdw            m20, m18      ; t44a t45
    packssdw            m22, m23      ; t51a t50
    ITX_MUL2X_PACK       19, 24, 18, 10, 11, 12, 8 ; t40a t55a
    ITX_MUL2X_PACK       29, 23, 19, 10, 11, 12, 8 ; t41  t54
    packssdw            m18, m19      ; t40a t41
    packssdw            m24, m23      ; t55a t54
    ITX_MUL2X_PACK       17, 23, 19, 10, 11, 12, 8 ; t43  t52
    ITX_MUL2X_PACK        9, 29, 17, 10, 11, 12, 8 ; t42a t53a
    packssdw            m19, m17      ; t43  t42a
    packssdw            m23, m29      ; t52  t53a
    punpcklqdq          m17, m25, m27 ; t39  t38a
    punpckhqdq          m25, m27      ; t56  t57a
    punpckhqdq          m27, m15, m26 ; t60  t61a
    punpcklqdq          m15, m26      ; t35  t34a
    punpckhqdq          m26, m16, m28 ; t59a t58
    punpcklqdq          m16, m28      ; t36a t37
    punpckhqdq          m28, m14, m8  ; t63a t62
    punpcklqdq          m14, m8       ; t32a t33
    ; combine odd-half t-values with the even-half partials in m0-m7
    psubsw              m29, m0, m28  ; out63 out62
    paddsw               m0, m28      ; out0  out1
    psubsw              m28, m1, m27  ; out60 out61
    paddsw               m1, m27      ; out3  out2
    psubsw              m27, m2, m26  ; out59 out58
    paddsw               m2, m26      ; out4  out5
    psubsw              m26, m3, m25  ; out56 out57
    paddsw               m3, m25      ; out7  out6
    psubsw              m25, m4, m24  ; out55 out54
    paddsw               m4, m24      ; out8  out9
    psubsw              m24, m5, m23  ; out52 out53
    paddsw               m5, m23      ; out11 out10
    psubsw              m23, m6, m22  ; out51 out50
    paddsw               m6, m22      ; out12 out13
    psubsw              m22, m7, m13  ; out48 out49
    paddsw               m7, m13      ; out15 out14
    ret
5437
;-----------------------------------------------------------------------------
; void inv_txfm_add_dct_dct_64x16_8bpc(pixel *dst, ptrdiff_t stride,
;                                      coef *coeff, int eob)
; 64x16 inverse DCT-DCT transform + add to dst, 8 bpc, AVX-512.
; eob == 0 takes the DC-only shortcut; otherwise pass 1 runs the first 1-D
; transform (built from the shared 16/32/64-point idct helper stages, with
; the bottom half of the input skipped when eob < 151) and pass 2 runs the
; second 1-D transform via the shared 32x8/32x16 row helpers, writing the
; result back to 16 rows of 64 pixels each.
;-----------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea                  r5, [o_base]
    test               eobd, eobd
    jnz .normal
    ; DC-only: propagate the single DC coefficient to all output pixels
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 16          ; 16 rows to process
.dconly:
    ; round(dc * 181 / 256) twice — 181/256 approximates 1/sqrt(2)
    imul                r6d, 181
    add                 r6d, 128+512
    sar                 r6d, 8+2
.dconly2:
    imul                r6d, 181
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    pxor                 m2, m2
    vpbroadcastw         m3, r6d         ; broadcast the DC offset
.dconly_loop:
    ; add the DC offset to one 64-pixel row per iteration, with clamping
    mova                 m1, [dstq]
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    paddw                m0, m3
    paddw                m1, m3
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, strideq
    dec                 r3d
    jg .dconly_loop
    RET
.normal:
    WIN64_SPILL_XMM      31
    ; dup16_perm duplicates each word of the low half of a register;
    ; m30 = same permute with indices offset by 32, i.e. the high half
    mova                m19, [o(dup16_perm)]
    mova                m24, [cq+64* 2]
    mova                m28, [cq+64* 6]
    mova                m26, [cq+64* 4]
    mova                m22, [cq+64* 0]
    mova                m23, [cq+64* 1]
    mova                m29, [cq+64* 7]
    mova                m27, [cq+64* 5]
    mova                m25, [cq+64* 3]
    vpermb               m8, m19, m24        ;  4
    vpermb               m1, m19, m28        ; 12
    vpermb               m7, m19, m26        ;  8
    vpermb               m9, m19, m22        ; __  0
    vpermb              m21, m19, m23        ;  2
    vpermb              m15, m19, m29        ; 14
    vpermb              m18, m19, m27        ; 10
    vpermb              m14, m19, m25        ;  6
    pslld                m9, 16
    vpord               m30, m19, [o(pb_32)] {1to16}
    REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
    cmp                eobd, 151
    jb .fast
    ; full path: bottom-half input rows may be non-zero as well
    vpermb               m0, m19, [cq+64*14] ; 28
    vpermb               m5, m19, [cq+64*10] ; 20
    vpermb               m3, m19, [cq+64*12] ; 24
    vpermb               m6, m19, [cq+64* 8] ; __ 16
    pslld                m6, 16
    call m(idct_16x16_internal_8bpc).main_fast
    vpermb              m17, m19, [cq+64*15] ; 30
    vpermb              m20, m19, [cq+64* 9] ; 18
    vpermb              m16, m19, [cq+64*11] ; 22
    vpermb              m19, m19, [cq+64*13] ; 26
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; spill the 16x32 stage results before loading the 64-point odd inputs
    mova         [cq+64* 0], m14
    mova         [cq+64* 1], m15
    mova         [cq+64* 2], m16
    mova         [cq+64* 3], m17
    mova         [cq+64* 4], m18
    mova         [cq+64* 5], m19
    mova         [cq+64* 6], m20
    mova         [cq+64* 7], m21
    vpermb              m21, m30, [cq+64*15]
    vpermb              m14, m30, [cq+64* 8]
    vpermb              m17, m30, [cq+64*11]
    vpermb              m18, m30, [cq+64*12]
    vpermb              m19, m30, [cq+64*13]
    vpermb              m16, m30, [cq+64*10]
    vpermb              m15, m30, [cq+64* 9]
    vpermb              m20, m30, [cq+64*14]
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
    jmp .end
.fast: ; bottom half is zero
    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    mova         [cq+64* 0], m14
    mova         [cq+64* 1], m15
    mova         [cq+64* 2], m16
    mova         [cq+64* 3], m17
    mova         [cq+64* 4], m18
    mova         [cq+64* 5], m19
    mova         [cq+64* 6], m20
    mova         [cq+64* 7], m21
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
.end:
    ; spill the remaining pass-1 outputs, then round/transpose and run the
    ; second pass on both register halves
    mova         [cq+64* 8], m4
    mova         [cq+64* 9], m5
    mova         [cq+64*10], m6
    mova         [cq+64*11], m7
    mova         [cq+64*12], m26
    mova         [cq+64*13], m27
    mova         [cq+64*14], m28
    mova         [cq+64*15], m29
    vpbroadcastd        m13, [o(pw_8192)]   ; pass-1 rounding (1/2 scale)
    call .pass1_end
    call .pass2
    mova         [cq+64* 0], m0
    mova         [cq+64* 1], m1
    mova         [cq+64* 2], m2
    mova         [cq+64* 3], m3
    mova         [cq+64* 4], m4
    mova         [cq+64* 5], m5
    mova         [cq+64* 6], m6
    mova         [cq+64* 7], m7
    pmulhrsw             m0, m13, [cq+64* 8]
    pmulhrsw             m1, m13, [cq+64* 9]
    pmulhrsw             m2, m13, [cq+64*10]
    pmulhrsw             m3, m13, [cq+64*11]
    vpbroadcastd        m30, [o(pw_2048)]   ; final output rounding
    pmulhrsw             m4, m13, m22
    pmulhrsw             m5, m13, m23
    pmulhrsw             m6, m13, m24
    pmulhrsw             m7, m13, m25
    pmulhrsw            m22, m30, m14
    pmulhrsw            m14, m13, m26
    pmulhrsw            m23, m30, m15
    pmulhrsw            m15, m13, m27
    pmulhrsw            m24, m30, m16
    pmulhrsw            m16, m13, m28
    pmulhrsw            m25, m30, m17
    pmulhrsw            m17, m13, m29
    pmulhrsw            m26, m30, m18
    pmulhrsw            m18, m13, [cq+64*12]
    pmulhrsw            m27, m30, m19
    pmulhrsw            m19, m13, [cq+64*13]
    pmulhrsw            m28, m30, m20
    pmulhrsw            m20, m13, [cq+64*14]
    pmulhrsw            m29, m30, m21
    pmulhrsw            m21, m13, [cq+64*15]
    call .transpose_round
    call .pass2
    pxor                m10, m10
    lea                  r3, [strideq*3]
; Store one 64-pixel output row and clear its coefficient slot.
; %1 = row index / coef slot, %2/%3 = registers holding the two halves of
; the row (NOTE(review): exact half-to-register mapping inferred from the
; paddw pairing — verify), %4 = dst offset within the current 4-row group.
%macro IDCT_64x16_END 4
    mova                 m9, [dstq+%4]
%if %1 < 8
    pmulhrsw            m%3, m30, [cq+64*%1]
%endif
    pmulhrsw            m%2, m30
    mova         [cq+64*%1], m10
    punpcklbw            m8, m9, m10
    punpckhbw            m9, m10
    paddw                m8, m%3
    paddw                m9, m%2
    packuswb             m8, m9
    mova          [dstq+%4], m8
%if %1 == 3 || %1 == 7 || %1 == 11
    lea                dstq, [dstq+strideq*4]
%endif
%endmacro
    IDCT_64x16_END        0,  0, 11, strideq*0
    IDCT_64x16_END        1,  1, 11, strideq*1
    IDCT_64x16_END        2,  2, 11, strideq*2
    IDCT_64x16_END        3,  3, 11, r3
    IDCT_64x16_END        4,  4, 11, strideq*0
    IDCT_64x16_END        5,  5, 11, strideq*1
    IDCT_64x16_END        6,  6, 11, strideq*2
    IDCT_64x16_END        7,  7, 11, r3
    IDCT_64x16_END        8, 14, 22, strideq*0
    IDCT_64x16_END        9, 15, 23, strideq*1
    IDCT_64x16_END       10, 16, 24, strideq*2
    IDCT_64x16_END       11, 17, 25, r3
    IDCT_64x16_END       12, 18, 26, strideq*0
    IDCT_64x16_END       13, 19, 27, strideq*1
    IDCT_64x16_END       14, 20, 28, strideq*2
    IDCT_64x16_END       15, 21, 29, r3
    RET
ALIGN function_align
.pass1_end:
    ; combine the spilled even-stage results with the odd-stage results
    ; (out16-out47 butterflies) and apply the pass-1 rounding in m13;
    ; falls through into .transpose_round
    mova                 m4, [cq+64* 0]
    mova                 m5, [cq+64* 1]
    mova                 m6, [cq+64* 2]
    mova                 m7, [cq+64* 3]
    mova                 m8, [cq+64* 4]
    mova                 m9, [cq+64* 5]
    mova                m11, [cq+64* 6]
    mova                m12, [cq+64* 7]
    psubsw              m29, m4, m21  ; out47 out46
    paddsw               m4, m21      ; out16 out17
    psubsw              m28, m5, m20  ; out44 out45
    paddsw               m5, m20      ; out19 out18
    REPX  {pmulhrsw x, m13}, m0, m1, m2, m3
    psubsw              m27, m6, m19  ; out43 out42
    paddsw               m6, m19      ; out20 out21
    psubsw              m26, m7, m18  ; out40 out41
    paddsw               m7, m18      ; out23 out22
    pmulhrsw            m18, m13, m22
    pmulhrsw            m19, m13, m23
    pmulhrsw            m20, m13, m24
    pmulhrsw            m21, m13, m25
    paddsw              m25, m12, m14 ; out31 out30
    psubsw              m14, m12, m14 ; out32 out33
    paddsw              m24, m11, m15 ; out28 out29
    psubsw              m15, m11, m15 ; out35 out34
    REPX  {pmulhrsw x, m13}, m4, m5, m6, m7
    paddsw              m23, m9, m16  ; out27 out26
    psubsw              m16, m9, m16  ; out36 out37
    paddsw              m22, m8, m17  ; out24 out25
    psubsw              m17, m8, m17  ; out39 out38
    REPX  {pmulhrsw x, m13}, m14, m15, m16, m17
.transpose_round:
    ; word-level transpose of four packed 8x4 groups, then 128-bit lane
    ; shuffles to assemble the pass-2 input ordering
%macro TRANSPOSE_8x4_PACKED 4
    punpckhwd            m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd           m%1, m%3      ; a0 e0 a1 e1 a2 e2 a3 e3
    punpcklwd           m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
    punpckhwd           m%2, m%4      ; c0 g0 c1 g1 c2 g2 c3 g3
    punpckhwd           m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
    punpcklwd           m%1, m%2      ; a0 c0 e0 g0 a1 c1 e1 g1
    punpckhwd           m%2, m8, m%3  ; b2 d2 f2 h2 b3 d3 f3 h3
    punpcklwd            m8, m%3      ; b0 d0 f0 h0 b1 d1 f1 h1
    punpcklwd           m%3, m%4, m%2 ; 2
    punpckhwd           m%4, m%2      ; 3
    punpckhwd           m%2, m%1, m8  ; 1
    punpcklwd           m%1, m8       ; 0
%endmacro
    TRANSPOSE_8x4_PACKED  0,  1,  2,  3
    TRANSPOSE_8x4_PACKED 18, 19, 20, 21
    TRANSPOSE_8x4_PACKED  4,  5,  6,  7
    TRANSPOSE_8x4_PACKED 14, 15, 16, 17
    vshufi32x4           m8, m0, m4, q3232   ; a02 a03 b02 b03
    vinserti32x8         m0, ym4, 1          ; a00 a01 b00 b01
    vshufi32x4           m4, m1, m5, q3232   ; a12 a13 b12 b13
    vinserti32x8         m9, m1, ym5, 1      ; a10 a11 b10 b11
    vshufi32x4           m5, m2, m6, q3232   ; a22 a23 b22 b23
    vinserti32x8         m1, m2, ym6, 1      ; a20 a21 b20 b21
    vshufi32x4           m6, m3, m7, q3232   ; a32 a33 b32 b33
    vinserti32x8        m11, m3, ym7, 1      ; a30 a31 b30 b31
    vshufi32x4           m2, m14, m18, q3232 ; c02 c03 d02 d03
    vinserti32x8         m3, m14, ym18, 1    ; c00 c01 d00 d01
    vshufi32x4          m18, m15, m19, q3232 ; c12 c13 d12 d13
    vinserti32x8        m15, ym19, 1         ; c10 c11 d10 d11
    vshufi32x4          m19, m16, m20, q3232 ; c22 c23 d22 d23
    vinserti32x8        m16, ym20, 1         ; c20 c21 d20 d21
    vshufi32x4          m20, m17, m21, q3232 ; c32 c33 d32 d33
    vinserti32x8        m17, ym21, 1         ; c30 c31 d30 d31
    ret
.pass2:
    ; gather rows into transform order (see per-line row indices) and
    ; tail-call the shared 32-point row transform helpers
    vshufi32x4           m7, m5, m19, q3131  ; 14
    vshufi32x4           m5, m19, q2020      ; 10
    vshufi32x4          m21, m6, m20, q3131  ; 15
    vshufi32x4          m19, m6, m20, q2020  ; 11
    vshufi32x4          m20, m4, m18, q3131  ; 13
    vshufi32x4          m18, m4, m18, q2020  ;  9
    vshufi32x4           m6, m8, m2, q3131   ; 12
    vshufi32x4           m4, m8, m2, q2020   ;  8
    vshufi32x4           m2, m0, m3, q3131   ;  4
    vshufi32x4           m0, m3, q2020       ;  0
    vshufi32x4           m3, m1, m16, q3131  ;  6
    vshufi32x4           m1, m16, q2020      ;  2
    vshufi32x4          m16, m9, m15, q3131  ;  5
    vshufi32x4          m14, m9, m15, q2020  ;  1
    vshufi32x4          m15, m11, m17, q2020 ;  3
    vshufi32x4          m17, m11, m17, q3131 ;  7
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
5704
5705cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
5706    lea                  r5, [o_base]
5707    test               eobd, eobd
5708    jz .dconly
5709    PROLOGUE              0, 9, 30, 64*32, dst, stride, c, eob
5710    vpbroadcastd        m23, [o(pw_2896x8)]
5711%undef cmp
5712    cmp                eobd, 136
5713    jb .fast
5714    pmulhrsw             m5, m23, [cq+64*20]
5715    pmulhrsw             m3, m23, [cq+64*12]
5716    pmulhrsw             m1, m23, [cq+64* 4]
5717    pmulhrsw             m7, m23, [cq+64*28]
5718    pmulhrsw             m2, m23, [cq+64* 8]
5719    pmulhrsw             m6, m23, [cq+64*24]
5720    pmulhrsw             m0, m23, [cq+64* 0]
5721    pmulhrsw             m4, m23, [cq+64*16]
5722    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
5723    pmulhrsw            m14, m23, [cq+64* 2]
5724    pmulhrsw            m21, m23, [cq+64*30]
5725    pmulhrsw            m18, m23, [cq+64*18]
5726    pmulhrsw            m17, m23, [cq+64*14]
5727    pmulhrsw            m16, m23, [cq+64*10]
5728    pmulhrsw            m19, m23, [cq+64*22]
5729    pmulhrsw            m20, m23, [cq+64*26]
5730    pmulhrsw            m15, m23, [cq+64* 6]
5731    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
5732    mova         [cq+64* 0], m14
5733    mova         [cq+64* 2], m15
5734    mova         [cq+64* 4], m16
5735    mova         [cq+64* 6], m17
5736    mova         [cq+64* 8], m18
5737    mova         [cq+64*10], m19
5738    mova         [cq+64*12], m20
5739    mova         [cq+64*14], m21
5740    pmulhrsw            m22, m23, [cq+64* 1]
5741    pmulhrsw            m21, m23, [cq+64*31]
5742    pmulhrsw            m14, m23, [cq+64*17]
5743    pmulhrsw            m29, m23, [cq+64*15]
5744    pmulhrsw            m26, m23, [cq+64* 9]
5745    pmulhrsw            m17, m23, [cq+64*23]
5746    pmulhrsw            m18, m23, [cq+64*25]
5747    pmulhrsw            m25, m23, [cq+64* 7]
5748    pmulhrsw            m24, m23, [cq+64* 5]
5749    pmulhrsw            m19, m23, [cq+64*27]
5750    pmulhrsw            m16, m23, [cq+64*21]
5751    pmulhrsw            m27, m23, [cq+64*11]
5752    pmulhrsw            m28, m23, [cq+64*13]
5753    pmulhrsw            m15, m23, [cq+64*19]
5754    pmulhrsw            m20, m23, [cq+64*29]
5755    pmulhrsw            m23,      [cq+64* 3]
5756    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
5757    vpbroadcastd        m12, [o(pw_16384)]
5758    psubsw              m13, m0, m29 ; 31
5759    paddsw               m0, m29     ;  0
5760    psubsw              m29, m1, m28 ; 30
5761    paddsw               m1, m28     ;  1
5762    psubsw              m28, m2, m27 ; 29
5763    paddsw               m2, m27     ;  2
5764    psubsw              m27, m3, m26 ; 28
5765    paddsw               m3, m26     ;  3
5766    psubsw              m26, m4, m25 ; 27
5767    paddsw               m4, m25     ;  4
5768    psubsw              m25, m5, m24 ; 26
5769    paddsw               m5, m24     ;  5
5770    psubsw              m24, m6, m23 ; 25
5771    paddsw               m6, m23     ;  6
5772    psubsw              m23, m7, m22 ; 24
5773    paddsw               m7, m22     ;  7
5774    pxor                 m9, m9
5775    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
5776    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
5777    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
5778    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
5779    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
5780    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
5781    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
5782    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
5783    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
5784    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
5785    punpckhwd            m3, m23, m24
5786    punpcklwd           m23, m24
5787    punpckhwd           m24, m25, m26
5788    punpcklwd           m25, m26
5789    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
5790    punpckhwd           m26, m27, m28
5791    punpcklwd           m27, m28
5792    punpckhwd           m28, m29, m13
5793    punpcklwd           m29, m13
    ; Pass-1 continuation (inv_txfm_add_dct_dct_32x64): finish the 16x16
    ; word transposes of the four 8x32 quadrants, scaling each result by
    ; the pmulhrsw rounding factor in m12 (set before this chunk;
    ; NOTE(review): its value is not visible in this view), and spill
    ; the a/c/d quadrant rows to cq so the registers can be reused.
    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
    REPX  {pmulhrsw x, m12}, m7, m0, m2, m4
    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
    punpckldq           m22, m5      ; e4 f4 g4 h4 e5 f5 g5 h5
    REPX  {pmulhrsw x, m12}, m6, m8, m1, m22
    ; same dword interleave for the d-quadrant rows
    punpckhdq           m13, m23, m25
    punpckldq           m23, m25
    punpckhdq           m25, m27, m29
    punpckldq           m27, m29
    REPX  {pmulhrsw x, m12}, m13, m23, m25, m27
    punpckhdq            m9, m3, m24
    punpckldq            m3, m24
    punpckhdq           m24, m26, m28
    punpckldq           m26, m28
    REPX  {pmulhrsw x, m12}, m9, m3, m24, m26
    ; qword interleave completes the d-quadrant transpose
    punpckhqdq           m5, m23, m27 ; d01 d09 d17 d25
    punpcklqdq          m23, m27      ; d00 d08 d16 d24
    punpcklqdq          m27, m13, m25 ; d02 d10 d18 d26
    punpckhqdq          m13, m25      ; d03 d11 d19 d27
    punpcklqdq          m25, m3, m26  ; d04 d12 d20 d28
    punpckhqdq           m3, m26      ; d05 d13 d21 d29
    punpcklqdq          m26, m9, m24  ; d06 d14 d22 d30
    punpckhqdq           m9, m24      ; d07 d15 d23 d31
    mova         [cq+64* 3], m23
    mova         [cq+64*13], m27
    mova         [cq+64* 7], m25
    mova         [cq+64*15], m26
    ; qword interleave completes the a-quadrant transpose
    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
    punpcklqdq           m8, m22      ; a04 a12 a20 a28
    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
    punpcklqdq           m0, m4       ; a00 a08 a16 a24
    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
    punpcklqdq           m7, m2       ; a02 a10 a18 a26
    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
    punpcklqdq           m6, m1       ; a06 a14 a22 a30
    mova         [cq+64* 1], m0
    mova         [cq+64* 9], m7
    mova         [cq+64* 5], m8
    mova         [cq+64*11], m6
    ; reload the even half and run the output butterflies against the
    ; odd half in m14-m21: paddsw -> rows 8-15, psubsw -> rows 16-23
    ; (row numbers tagged on each line)
    mova                 m2, [cq+64* 0]
    mova                m11, [cq+64* 2]
    mova                 m8, [cq+64* 4]
    mova                m29, [cq+64* 6]
    mova                m27, [cq+64* 8]
    mova                m26, [cq+64*10]
    mova                 m4, [cq+64*12]
    mova                m28, [cq+64*14]
    psubsw               m1, m2, m21  ; 23
    paddsw               m2, m21      ;  8
    psubsw              m21, m11, m20 ; 22
    paddsw              m11, m20      ;  9
    psubsw              m20, m8, m19  ; 21
    paddsw               m8, m19      ; 10
    psubsw              m19, m29, m18 ; 20
    paddsw              m29, m18      ; 11
    psubsw              m18, m27, m17 ; 19
    paddsw              m27, m17      ; 12
    psubsw              m17, m26, m16 ; 18
    paddsw              m26, m16      ; 13
    psubsw              m16, m4, m15  ; 17
    paddsw               m4, m15      ; 14
    psubsw              m15, m28, m14 ; 16
    paddsw              m28, m14      ; 15
    ; transpose the freshly produced rows: words, then dwords, then qwords
    punpcklwd           m14, m15, m16
    punpckhwd           m15, m16
    punpckhwd           m16, m17, m18
    punpcklwd           m17, m18
    punpckhwd           m18, m19, m20
    punpcklwd           m19, m20
    punpckhwd           m20, m21, m1
    punpcklwd           m21, m1
    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
    punpcklwd            m2, m11      ; i0 j0 i1 j1 i2 j2 i3 j3
    punpckhwd           m11, m8, m29  ; k4 l4 k5 l5 k6 l6 k7 l7
    punpcklwd            m8, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
    punpckhwd           m26, m4, m28  ; o4 p4 o5 p5 o6 p6 o7 p7
    punpcklwd            m4, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
    punpckhdq           m28, m2, m8   ; i2 j2 k2 l2 i3 j3 k3 l3
    punpckldq            m2, m8       ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m8, m27, m4  ; m2 n2 o2 p2 m3 n3 o3 p3
    punpckldq           m27, m4       ; m0 n0 o0 p0 m1 n1 o1 p1
    REPX  {pmulhrsw x, m12}, m28, m2, m8, m27
    punpckhdq            m4, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
    REPX  {pmulhrsw x, m12}, m4, m1, m11, m29
    punpckhdq           m26, m19, m21
    punpckldq           m19, m21
    punpckhdq           m21, m15, m16
    punpckldq           m15, m16
    REPX  {pmulhrsw x, m12}, m26, m19, m21, m15
    punpckhdq           m16, m18, m20
    punpckldq           m18, m20
    punpckhdq           m20, m14, m17
    punpckldq           m14, m17
    REPX  {pmulhrsw x, m12}, m16, m18, m20, m14
    ; qword interleave completes the b-quadrant transpose
    punpckhqdq          m17, m28, m8  ; b03 b11 b19 b27
    punpcklqdq          m28, m8       ; b02 b10 b18 b26
    punpckhqdq           m8, m2, m27  ; b01 b09 b17 b25
    punpcklqdq           m2, m27      ; b00 b08 b16 b24
    punpcklqdq          m27, m1, m29  ; b04 b12 b20 b28
    punpckhqdq           m1, m29      ; b05 b13 b21 b29
    punpcklqdq          m29, m4, m11  ; b06 b14 b22 b30
    punpckhqdq           m4, m11      ; b07 b15 b23 b31
    mova         [cq+64* 0], m2
    mova         [cq+64* 8], m28
    mova         [cq+64* 4], m27
    mova         [cq+64*10], m29
    ; qword interleave completes the c-quadrant transpose
    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
    punpcklqdq          m20, m26      ; c02 c10 c18 c26
    punpckhqdq          m26, m14, m19 ; c01 c09 c17 c25
    punpcklqdq          m14, m19      ; c00 c08 c16 c24
    punpckhqdq          m28, m15, m18 ; c05 c13 c21 c29
    punpcklqdq          m15, m18      ; c04 c12 c20 c28
    punpckhqdq          m29, m21, m16 ; c07 c15 c23 c31
    punpcklqdq          m21, m16      ; c06 c14 c22 c30
    mova         [cq+64* 2], m14
    mova         [cq+64*12], m20
    mova         [cq+64* 6], m15
    mova         [cq+64*14], m21
    ; gather 128-bit lanes so each register pairs matching a/b (or c/d)
    ; half-rows, ready to be split into single transposed rows below
    vshufi32x4          m14, m22, m8, q3232  ; a17 a25 b17 b25
    vinserti32x8        m22, ym8, 1          ; a01 a09 b01 b09
    vshufi32x4          m15, m23, m17, q3232 ; a19 a27 b19 b27
    vinserti32x8        m23, ym17, 1         ; a03 a11 b03 b11
    vshufi32x4          m16, m24, m1, q3232  ; a21 a29 b21 b29
    vinserti32x8        m24, ym1, 1          ; a05 a13 b05 b13
    vshufi32x4          m17, m25, m4, q3232  ; a23 a31 b23 b31
    vinserti32x8        m25, ym4, 1          ; a07 a15 b07 b15
    vinserti32x8        m19, m26, ym5, 1     ; c01 c09 d01 d09
    vshufi32x4          m26, m5, q3232       ; c17 c25 d17 d25
    vinserti32x8        m20, m27, ym13, 1    ; c03 c11 d03 d11
    vshufi32x4          m27, m13, q3232      ; c19 c27 d19 d27
    vinserti32x8        m21, m28, ym3, 1     ; c05 c13 d05 d13
    vshufi32x4          m28, m3, q3232       ; c21 c29 d21 d29
    vinserti32x8        m18, m29, ym9, 1     ; c07 c15 d07 d15
    vshufi32x4          m29, m9, q3232       ; c23 c31 d23 d31
    mov                  r4, rsp    ; idct64 part1 output accumulates on the stack
    ; idct64 odd quarter: four groups of four transposed rows (input row
    ; numbers tagged per line; see the in* table at .main_part1), each
    ; run through .main_part1, then the cross-group combine.
    vshufi32x4           m0, m22, m19, q2020 ;  1
    vshufi32x4           m1, m17, m29, q3131 ; 31
    vshufi32x4           m2, m14, m26, q2020 ; 17
    vshufi32x4           m3, m25, m18, q3131 ; 15
    call .main_part1
    vshufi32x4           m0, m25, m18, q2020 ;  7
    vshufi32x4           m1, m14, m26, q3131 ; 25
    vshufi32x4           m2, m17, m29, q2020 ; 23
    vshufi32x4           m3, m22, m19, q3131 ;  9
    call .main_part1
    vshufi32x4           m0, m24, m21, q2020 ;  5
    vshufi32x4           m1, m15, m27, q3131 ; 27
    vshufi32x4           m2, m16, m28, q2020 ; 21
    vshufi32x4           m3, m23, m20, q3131 ; 11
    call .main_part1
    vshufi32x4           m0, m23, m20, q2020 ;  3
    vshufi32x4           m1, m16, m28, q3131 ; 29
    vshufi32x4           m2, m15, m27, q2020 ; 19
    vshufi32x4           m3, m24, m21, q3131 ; 13
    call .main_part1
    call .main_part2
    ; dct32 even half, part 1: rows 0,4,8,...,28 (reloaded from cq and
    ; lane-gathered into single transposed rows)
    mova                 m0, [cq+64* 1] ; a0
    mova                m15, [cq+64* 0] ; b0
    mova                 m3, [cq+64* 2] ; c0
    mova                m16, [cq+64* 3] ; d0
    mova                m14, [cq+64* 5] ; a4
    mova                 m8, [cq+64* 4] ; b4
    mova                m17, [cq+64* 6] ; c4
    mova                 m1, [cq+64* 7] ; d4
    vshufi32x4           m2, m0, m15, q3232  ; a16 a24 b16 b24
    vinserti32x8         m0, ym15, 1         ; a00 a08 b00 b08
    vshufi32x4          m15, m3, m16, q3232  ; c16 c24 d16 d24
    vinserti32x8         m3, ym16, 1         ; c00 c08 d00 d08
    vshufi32x4          m16, m14, m8, q3232  ; a20 a28 b20 b28
    vinserti32x8        m14, ym8, 1          ; a04 a12 b04 b12
    vshufi32x4           m8, m17, m1, q3232  ; c20 c28 d20 d28
    vinserti32x8        m17, ym1, 1          ; c04 c12 d04 d12
    vshufi32x4           m1, m0, m3, q3131   ;  8
    vshufi32x4           m0, m3, q2020       ;  0
    vshufi32x4           m3, m2, m15, q3131  ; 24
    vshufi32x4           m2, m15, q2020      ; 16
    vshufi32x4          m15, m14, m17, q3131 ; 12
    vshufi32x4          m14, m17, q2020      ;  4
    vshufi32x4          m17, m16, m8, q3131  ; 28
    vshufi32x4          m16, m8, q2020       ; 20
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    ; dct32 odd half: rows 2,6,10,...,30, assembled the same way; the
    ; even-half results m14-m21 are parked in cq first
    mova                 m8, [cq+64* 8]
    mova                 m9, [cq+64*12]
    mova                m11, [cq+64*10]
    mova                m12, [cq+64*14]
    mova         [cq+64* 0], m14
    mova         [cq+64* 2], m15
    mova         [cq+64* 4], m16
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64*12], m20
    mova         [cq+64*14], m21
    mova                m22, [cq+64* 9]
    mova                m27, [cq+64*13]
    mova                m23, [cq+64*11]
    mova                m24, [cq+64*15]
    vshufi32x4          m26, m22, m8, q3232  ; a18 a26 b18 b26
    vinserti32x8        m22, ym8, 1          ; a02 a10 b02 b10
    vshufi32x4           m8, m9, m27, q3232  ; c18 c26 d18 d26
    vinserti32x8         m9, ym27, 1         ; c02 c10 d02 d10
    vshufi32x4          m27, m23, m11, q3232 ; a22 a30 b22 b30
    vinserti32x8        m23, ym11, 1         ; a06 a14 b06 b14
    vshufi32x4          m11, m12, m24, q3232 ; c22 c30 d22 d30
    vinserti32x8        m12, ym24, 1         ; c06 c14 d06 d14
    vshufi32x4          m28, m26, m8, q3131  ; 26
    vshufi32x4          m26, m8, q2020       ; 18
    vshufi32x4          m24, m22, m9, q3131  ; 10
    vshufi32x4          m22, m9, q2020       ;  2
    vshufi32x4          m29, m27, m11, q3131 ; 30
    vshufi32x4          m27, m11, q2020      ; 22
    vshufi32x4          m25, m23, m12, q3131 ; 14
    vshufi32x4          m23, m12, q2020      ;  6
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    jmp .end
.fast: ; bottom/right halves are zero
    ; Reduced path for low eob: only the top-left coefficients are
    ; nonzero, so a 16-point even pass plus "fast" odd halves suffice.
    ; ym23/xm23 holds the pmulhrsw coefficient scale applied on load
    ; (set before this chunk; NOTE(review): not visible in this view).
    pmulhrsw            ym9, ym23, [cq+64* 0]
    pmulhrsw            ym6, ym23, [cq+64* 8]
    mova                m14, [o(dup16_perm)]   ; duplicates each word pair (see rodata)
    pmulhrsw            ym8, ym23, [cq+64* 2]
    pmulhrsw            xm0, xm23, [cq+64*14]
    pmulhrsw            xm5, xm23, [cq+64*10]
    pmulhrsw            ym1, ym23, [cq+64* 6]
    pmulhrsw            ym7, ym23, [cq+64* 4]
    pmulhrsw            xm3, xm23, [cq+64*12]
    ; widen/duplicate the words into the layout main_fast expects
    pmovzxwd             m9, ym9
    pmovzxwd             m6, ym6
    vpermb               m8, m14, m8
    punpcklwd           xm0, xm0
    vpermb              ym5, ym14, ym5
    vpermb               m1, m14, m1
    vpermb               m7, m14, m7
    punpcklwd           xm3, xm3
    pslld                m9, 16
    pslld                m6, 16
    call m(idct_16x16_internal_8bpc).main_fast
          vpmulhrsw    ym21, ym23, [cq+64* 1]
    {evex}vpmulhrsw    xm17, xm23, [cq+64*15] ; force EVEX encoding, which
    {evex}vpmulhrsw    xm20, xm23, [cq+64* 9] ; reduces code size due to
    {evex}vpmulhrsw    ym15, ym23, [cq+64* 7] ; compressed displacements
    {evex}vpmulhrsw    ym18, ym23, [cq+64* 5]
    {evex}vpmulhrsw    xm16, xm23, [cq+64*11]
    {evex}vpmulhrsw    xm19, xm23, [cq+64*13]
    {evex}vpmulhrsw    ym23,       [cq+64* 3]
    vpermb              m21, m14, m21
    punpcklwd          xm17, xm17
    vpermb             ym20, ym14, ym20
    vpermb              m15, m14, m15
    vpermb              m18, m14, m18
    vpermb             ym16, ym14, ym16
    punpcklwd          xm19, xm19
    vpermb              m14, m14, m23
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd         m9, [o(pw_16384)]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
    ; distribute the transposed 16 rows (row numbers tagged per line)
    vshufi32x4          m16, m0, m3, q2020  ;  0
    vshufi32x4          m26, m0, m3, q3131  ;  4
    vshufi32x4           m0, m14, m2, q2020 ;  1
    vshufi32x4          m14, m2, q3131      ;  5
    vshufi32x4           m3, m19, m7, q3131 ; 15
    vshufi32x4          m19, m7, q2020      ; 11
    vshufi32x4          m27, m17, m9, q2020 ;  3
    vshufi32x4          m17, m9, q3131      ;  7
    vshufi32x4          m28, m20, m6, q2020 ;  9
    vshufi32x4          m20, m6, q3131      ; 13
    vshufi32x4          m22, m1, m18, q2020 ;  2
    vshufi32x4          m23, m1, m18, q3131 ;  6
    vshufi32x4          m24, m5, m15, q2020 ; 10
    vshufi32x4          m25, m5, m15, q3131 ; 14
    vshufi32x4          m15, m21, m4, q3131 ; 12
    vshufi32x4          m21, m21, m4, q2020 ;  8
    mov                  r4, rsp            ; idct64 part1 output goes on the stack
    ; idct64 odd quarter from the odd rows only (one nonzero input per
    ; group, m3 pre-loaded with the second): rows (1,15), (7,9), (5,11),
    ; (3,13) via the _fast part1 variants
    call .main_part1_fast
    mova                 m0, m17
    mova                 m3, m28
    call .main_part1_fast
    mova                 m0, m14
    mova                 m3, m19
    call .main_part1_fast
    mova                 m0, m27
    mova                 m3, m20
    call .main_part1_fast
    call .main_part2
    ; dct32 halves from the even rows, "fast2" variants (few inputs)
    mova                 m0, m16
    mova                 m1, m21
    mova                m14, m26
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    mova         [cq+64*14], m21
    mova         [cq+64* 0], m14
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64* 4], m16
    mova         [cq+64* 2], m15
    mova         [cq+64*12], m20
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
.end:
    ; Shared epilogue: blend the 64 output rows into dst. The idct64
    ; half rows produced by .main_part2 live on the stack; the dct32
    ; rows are in m14-m29 / spilled to cq.
    lea                  r4, [strideq*3]
    vpbroadcastd        m12, [o(pw_2048)]  ; final pmulhrsw rounding (>>4)
    movshdup            m13, [o(permD)]    ; qword permute for the packed stores
    lea                  r5, [r4+strideq]   ; stride*4
    lea                  r3, [dstq+r4*8]
    lea                  r6, [strideq+r5*8] ; stride*33
    lea                  r8, [r4+r5*8]      ; stride*35
    add                  r3, r5             ; dst+stride*28
    lea                  r7, [r6+strideq]   ; stride*34
; Writes one group of four output rows:
;   %1 = register number of the dct32 odd-half row
;   %2 = dct32 even-half row index (<8: still in m0-m7; >=8: spilled to cq)
;   %3-%6 = destination row offsets (relative to dstq and r3)
; The even/odd sum/difference is combined with the mirrored idct64 rows
; from rsp (indices %2 and 31-%2), rounded, added to the dst pixels and
; packed back to bytes. Spilled cq rows are zeroed as they are consumed.
%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
%if %2 < 8
    paddsw              m10, m%2, m%1
    psubsw              m11, m%2, m%1
%else
    mova                m11, [cq+64*(%2*2-16)]
    paddsw              m10, m11, m%1
    psubsw              m11, m%1
%endif
    mova                 m9, [rsp+64*(31-%2)]
    mova                m%1, [rsp+64*%2]
    paddsw               m8, m10, m9
    psubsw              m10, m9
    paddsw               m9, m11, m%1
    pmovzxbw             m0, [dstq+%3]
    psubsw              m11, m%1
    pmovzxbw            m%1, [r3  +%4]
    REPX  {pmulhrsw x, m12}, m8, m10, m9, m11
    paddw                m8, m0
    pmovzxbw             m0, [r3  +%5]
    paddw               m10, m%1
    pmovzxbw            m%1, [dstq+%6]
    paddw                m9, m0
    paddw               m11, m%1
%if %2 >= 8
%if %2 == 8
    pxor                 m1, m1     ; m1 stays zero for all later invocations
%endif
    mova  [cq+64*(%2*2-16)], m1
    mova  [cq+64*(%2*2-15)], m1
%endif
    packuswb             m8, m10
    packuswb             m9, m11
    vpermq               m8, m13, m8
    vpermq               m9, m13, m9
    mova          [dstq+%3], ym8
    vextracti32x8 [r3  +%4], m8, 1
    mova          [r3  +%5], ym9
    vextracti32x8 [dstq+%6], m9, 1
%if %2 == 3 || %2 == 7 || %2 == 11
    add                dstq, r5     ; advance top pointer, pull bottom pointer up
    sub                  r3, r5
%endif
%endmacro
    IDCT_32x64_END       29,  0, strideq*0, r8,   r4       , r5*8
    IDCT_32x64_END       28,  1, strideq*1, r7,   strideq*2, r6
    IDCT_32x64_END       27,  2, strideq*2, r6,   strideq*1, r7
    IDCT_32x64_END       26,  3, r4       , r5*8, strideq*0, r8
    IDCT_32x64_END       25,  4, strideq*0, r8,   r4       , r5*8
    IDCT_32x64_END       24,  5, strideq*1, r7,   strideq*2, r6
    IDCT_32x64_END       23,  6, strideq*2, r6,   strideq*1, r7
    IDCT_32x64_END       22,  7, r4       , r5*8, strideq*0, r8
    IDCT_32x64_END       21,  8, strideq*0, r8,   r4       , r5*8
    IDCT_32x64_END       20,  9, strideq*1, r7,   strideq*2, r6
    IDCT_32x64_END       19, 10, strideq*2, r6,   strideq*1, r7
    IDCT_32x64_END       18, 11, r4       , r5*8, strideq*0, r8
    IDCT_32x64_END       17, 12, strideq*0, r8,   r4       , r5*8
    IDCT_32x64_END       16, 13, strideq*1, r7,   strideq*2, r6
    IDCT_32x64_END       15, 14, strideq*2, r6,   strideq*1, r7
    IDCT_32x64_END       14, 15, r4       , r5*8, strideq*0, r8
    RET
.dconly:
    ; DC-only shortcut: scale the single DC coefficient by (181/256)^2
    ; with rounding (181/256 ~= 1/sqrt(2)), then tail-call the shared
    ; 32-wide DC writer with r3d indicating 64 output rows.
    movsx               r6d, word [cq]
    mov                [cq], eobd   ; overwrite the consumed DC coeff (eobd presumably 0 here - see function entry)
    or                  r3d, 64     ; 64 rows
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
    imul                r6d, 181
    add                 r6d, 128+256 ; +256 biases the extra >>1 below
    sar                 r6d, 8+1
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
ALIGN function_align ; bottom three-quarters are zero
cglobal_label .main_part1_fast2
    ; idct64 part1 with only the first input row (m0) nonzero:
    ; t32a/t63a come straight from m0, and with the remaining inputs
    ; zero the other step-1/2 values collapse to copies of those two
    ; (plus one rotation), so we jump straight to the shared
    ; .main_part1c tail.
    vpbroadcastd         m7, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    pmulhrsw             m7, m0     ; t63a
    pmulhrsw             m0, m8     ; t32a

    ; 12-bit rotation of (t32a, t63a) by the idct64_mul+4*8/9 pair,
    ; done with dword dot products; m10 = pd_2048 rounding bias
    ; (set up by the caller)
    punpcklwd            m4, m0, m7
    punpckhwd            m6, m0, m7
    mova                 m1, m10
    vpdpwssd             m1, m4, [o(idct64_mul+4*9)] {bcstd}
    mova                 m9, m10
    vpdpwssd             m9, m6, [o(idct64_mul+4*9)] {bcstd}
    REPX      {psrad x, 12}, m1, m9
    packssdw             m1, m9     ; t33a
    mova                 m9, m10
    vpdpwssd             m9, m6, [o(idct64_mul+4*8)] {bcstd}
    mova                 m6, m10
    vpdpwssd             m6, m4, [o(idct64_mul+4*8)] {bcstd}
    REPX      {psrad x, 12}, m9, m6
    packssdw             m6, m9     ; t62a

    mova                 m4, m0     ; t35a = t32a
    mova                 m3, m7     ; t60a = t63a
    mova                 m5, m1     ; t34  = t33a
    mova                 m2, m6     ; t61  = t62a
    jmp .main_part1c
cglobal_label .main_part1_fast
    ; idct64 part1 with only the first and fourth input rows (m0, m3)
    ; nonzero: the inner input pair contributes nothing, so the step-2
    ; butterflies collapse to plain register copies and we can join the
    ; common code at .main_part1b.
    vpbroadcastd         m1, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    vpbroadcastd         m2, [o(idct64_mul+4*6)]
    vpbroadcastd         m9, [o(idct64_mul+4*7)]
    pmulhrsw             m1, m0     ; t63a
    pmulhrsw             m0, m8     ; t32a
    pmulhrsw             m2, m3     ; t60a
    pmulhrsw             m3, m9     ; t35a
    mova                 m8, m0     ; t33 = t32a
    mova                 m7, m1     ; t63 = t63a
    mova                 m6, m3     ; t34 = t35a
    mova                 m5, m2     ; t60 = t60a
    jmp .main_part1b
cglobal_label .main_part1
    ; idct64 steps 1-5:
    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    ; In:  m0-m3 = the group's four input rows, m10 = pd_2048,
    ;      r4 = output pointer (advanced by 64*8 on return),
    ;      r5 = constant base (advanced past this group's idct64_mul
    ;      entries on return; .main_part2 rewinds it)
    vpbroadcastd         m7, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    vpbroadcastd         m6, [o(idct64_mul+4*2)]
    vpbroadcastd         m9, [o(idct64_mul+4*3)]
    pmulhrsw             m7, m0     ; t63a
    vpbroadcastd         m5, [o(idct64_mul+4*4)]
    pmulhrsw             m0, m8     ; t32a
    vpbroadcastd         m8, [o(idct64_mul+4*5)]
    pmulhrsw             m6, m1     ; t62a
    vpbroadcastd         m4, [o(idct64_mul+4*6)]
    pmulhrsw             m1, m9     ; t33a
    vpbroadcastd         m9, [o(idct64_mul+4*7)]
    pmulhrsw             m5, m2     ; t61a
    pmulhrsw             m2, m8     ; t34a
    pmulhrsw             m4, m3     ; t60a
    pmulhrsw             m3, m9     ; t35a
    psubsw               m8, m0, m1 ; t33
    paddsw               m0, m1     ; t32
    psubsw               m1, m7, m6 ; t62
    paddsw               m7, m6     ; t63
    psubsw               m6, m3, m2 ; t34
    paddsw               m3, m2     ; t35
    psubsw               m2, m4, m5 ; t61
    paddsw               m5, m4     ; t60
.main_part1b:
    vpbroadcastd        m11, [o(idct64_mul+4*8)]
    vpbroadcastd        m12, [o(idct64_mul+4*9)]
    ITX_MULSUB_2W         1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
    vpbroadcastd        m11, [o(idct64_mul+4*10)]
    ITX_MULSUB_2W         2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
    psubsw               m4, m0, m3 ; t35a
    paddsw               m0, m3     ; t32a
    psubsw               m3, m7, m5 ; t60a
    paddsw               m7, m5     ; t63a
    psubsw               m5, m1, m2 ; t34
    paddsw               m1, m2     ; t33
    psubsw               m2, m8, m6 ; t61
    paddsw               m6, m8     ; t62
.main_part1c:
    vpbroadcastd        m11, [o(idct64_mul+4*11)]
    vpbroadcastd        m12, [o(idct64_mul+4*12)]
    add                  r5, 4*13   ; step to the next group's idct64_mul constants
    ITX_MULSUB_2W         3, 4, 8, 9, 10, 11, 12 ; t35,  t60
    ITX_MULSUB_2W         2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
    mova          [r4+64*0], m0
    mova          [r4+64*7], m7
    mova          [r4+64*1], m1
    mova          [r4+64*6], m6
    mova          [r4+64*3], m3
    mova          [r4+64*4], m4
    mova          [r4+64*2], m2
    mova          [r4+64*5], m5
    add                  r4, 64*8   ; next group's output slot
    ret
cglobal_label .main_part2
    ; idct64 steps 6-9: fold the four 8-row groups written by the
    ; .main_part1 calls into the final t32-t63 half, 8 rows per
    ; iteration (4 iterations; r4 walks up, r6 walks down until they
    ; meet). The -16*13 displacements compensate for the 4*13 advance
    ; that each of the four part1 calls applied to r5.
    vpbroadcastd        m11, [o(pw_1567_3784  -16*13)]
    vpbroadcastd        m12, [o(pw_m3784_1567 -16*13)]
    lea                  r6, [r4+64*7]
    vpbroadcastd        m17, [o(pw_m1567_m3784-16*13)]
    vpbroadcastd        m18, [o(pw_2896_2896  -16*13)]
    vpbroadcastd        m19, [o(pw_m2896_2896 -16*13)]
    sub                  r5, 16*13  ; rewind r5 to the o() base
.main_part2_loop:
    mova                 m0, [r4-64*32] ; t32a
    mova                 m1, [r6-64*24] ; t39a
    mova                 m2, [r6-64*32] ; t63a
    mova                 m3, [r4-64*24] ; t56a
    mova                 m4, [r4-64*16] ; t40a
    mova                 m5, [r6-64* 8] ; t47a
    mova                 m6, [r6-64*16] ; t55a
    mova                 m7, [r4-64* 8] ; t48a
    psubsw               m8, m0, m1 ; t39
    paddsw               m0, m1     ; t32
    psubsw               m1, m2, m3 ; t56
    paddsw               m2, m3     ; t63
    psubsw               m3, m5, m4 ; t40
    paddsw               m5, m4     ; t47
    psubsw               m4, m7, m6 ; t55
    paddsw               m7, m6     ; t48
    ITX_MULSUB_2W         1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
    ITX_MULSUB_2W         4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
    psubsw               m6, m2, m7 ; t48a
    paddsw               m2, m7     ; t63a
    psubsw               m7, m0, m5 ; t47a
    paddsw               m0, m5     ; t32a
    psubsw               m5, m8, m3 ; t55
    paddsw               m8, m3     ; t56
    psubsw               m3, m1, m4 ; t40
    paddsw               m1, m4     ; t39
    ITX_MULSUB_2W         6, 7, 4, 9, 10, 18, 19 ; t47,  t48
    ITX_MULSUB_2W         5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
    mova         [r6-64* 8], m2
    mova         [r4-64*32], m0
    mova         [r4-64* 8], m8
    mova         [r6-64*32], m1
    mova         [r6-64*24], m6
    mova         [r4-64*16], m7
    mova         [r4-64*24], m5
    mova         [r6-64*16], m3
    add                  r4, 64
    sub                  r6, 64
    cmp                  r4, r6
    jb .main_part2_loop
    ret
6334
6335cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
6336    lea                  r5, [o_base]
6337    test               eobd, eobd
6338    jz .dconly
6339    PROLOGUE              0, 7, 30, 64*32, dst, stride, c, eob
6340    vpbroadcastd        m23, [o(pw_2896x8)]
6341%undef cmp
6342    cmp                eobd, 136
6343    jb .fast
6344    pmulhrsw             m0, m23, [cq+64* 1]
6345    pmulhrsw             m1, m23, [cq+64*31]
6346    pmulhrsw             m2, m23, [cq+64*17]
6347    pmulhrsw             m3, m23, [cq+64*15]
6348    vpbroadcastd        m10, [o(pd_2048)]
6349    mov                  r4, rsp
6350    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6351    pmulhrsw             m0, m23, [cq+64* 7]
6352    pmulhrsw             m1, m23, [cq+64*25]
6353    pmulhrsw             m2, m23, [cq+64*23]
6354    pmulhrsw             m3, m23, [cq+64* 9]
6355    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6356    pmulhrsw             m0, m23, [cq+64* 5]
6357    pmulhrsw             m1, m23, [cq+64*27]
6358    pmulhrsw             m2, m23, [cq+64*21]
6359    pmulhrsw             m3, m23, [cq+64*11]
6360    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6361    pmulhrsw             m0, m23, [cq+64* 3]
6362    pmulhrsw             m1, m23, [cq+64*29]
6363    pmulhrsw             m2, m23, [cq+64*19]
6364    pmulhrsw             m3, m23, [cq+64*13]
6365    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6366    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
6367    pmulhrsw             m3, m23, [cq+64*24]
6368    pmulhrsw             m1, m23, [cq+64* 8]
6369    pmulhrsw             m2, m23, [cq+64*16]
6370    pmulhrsw             m0, m23, [cq+64* 0]
6371    pmulhrsw            m14, m23, [cq+64* 4]
6372    pmulhrsw            m17, m23, [cq+64*28]
6373    pmulhrsw            m16, m23, [cq+64*20]
6374    pmulhrsw            m15, m23, [cq+64*12]
6375    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
6376    pmulhrsw            m22, m23, [cq+64* 2]
6377    pmulhrsw            m29, m23, [cq+64*30]
6378    pmulhrsw            m26, m23, [cq+64*18]
6379    pmulhrsw            m25, m23, [cq+64*14]
6380    pmulhrsw            m24, m23, [cq+64*10]
6381    pmulhrsw            m27, m23, [cq+64*22]
6382    pmulhrsw            m28, m23, [cq+64*26]
6383    pmulhrsw            m23,      [cq+64* 6]
6384    mova         [cq+64* 0], m14
6385    mova         [cq+64* 1], m15
6386    mova         [cq+64* 2], m16
6387    mova         [cq+64* 3], m17
6388    mova         [cq+64* 4], m18
6389    mova         [cq+64* 5], m19
6390    mova         [cq+64* 6], m20
6391    mova         [cq+64* 7], m21
6392    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6393    vpbroadcastd        m13, [o(pw_16384)]
6394    call .pass1_end_part1
6395    mova         [cq+64*16], m1
6396    mova         [cq+64*17], m3
6397    mova         [cq+64*18], m5
6398    mova         [cq+64*19], m7
6399    mova         [cq+64*24], m23
6400    mova         [cq+64*25], m25
6401    mova         [cq+64*26], m27
6402    mova         [cq+64*27], m29
6403    pmulhrsw            m23, m13, m0 ; a0
6404    pmulhrsw            m25, m13, m2 ; a2
6405    pmulhrsw            m27, m13, m4 ; a4
6406    pmulhrsw            m29, m13, m6 ; a6
6407    REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
6408    call .pass1_end_part2
6409    mova         [cq+64*20], m15
6410    mova         [cq+64*21], m17
6411    mova         [cq+64*22], m19
6412    mova         [cq+64*23], m21
6413    mova         [cq+64*28], m1
6414    mova         [cq+64*29], m3
6415    mova         [cq+64*30], m5
6416    mova         [cq+64*31], m7
6417    REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
6418    REPX {pmulhrsw x, m13}, m0, m2, m4, m6     ; g0 g2 g4 g6
6419    vinserti32x8        m3, m23, ym14, 1 ; a00 a01 c00 c01
6420    vshufi32x4         m23, m14, q3232   ; a02 a03 c02 c03
6421    vinserti32x8       m15, m22, ym0, 1  ; e00 e01 g00 g01
6422    vshufi32x4         m22, m0, q3232    ; e02 e03 g02 g03
6423    vinserti32x8        m1, m27, ym18, 1 ; a40 a41 c40 c41
6424    vshufi32x4         m27, m18, q3232   ; a42 a43 c42 c43
6425    vinserti32x8       m18, m26, ym4, 1  ; e40 e41 g40 g41
6426    vshufi32x4         m26, m4, q3232    ; e42 e43 g42 g43
6427    vinserti32x8       m14, m25, ym16, 1 ; a20 a21 c20 c21
6428    vshufi32x4         m25, m16, q3232   ; a22 a23 c22 c23
6429    vinserti32x8       m17, m24, ym2, 1  ; e20 e21 g20 g21
6430    vshufi32x4         m24, m2, q3232    ; e22 e23 g22 g23
6431    vinserti32x8       m19, m29, ym20, 1 ; a60 a61 c60 c61
6432    vshufi32x4         m29, m20, q3232   ; a62 a63 c62 c63
6433    vinserti32x8       m20, m28, ym6, 1  ; e60 e61 g60 g61
6434    vshufi32x4         m28, m6, q3232    ; e62 e63 g62 g63
6435    vshufi32x4          m2, m3, m15, q3131  ;  8
6436    vshufi32x4          m0, m3, m15, q2020  ;  0
6437    vshufi32x4          m6, m23, m22, q3131 ; 24
6438    vshufi32x4          m4, m23, m22, q2020 ; 16
6439    vshufi32x4          m3, m1, m18, q3131  ; 12
6440    vshufi32x4          m1, m18, q2020      ;  4
6441    vshufi32x4          m7, m27, m26, q3131 ; 28
6442    vshufi32x4          m5, m27, m26, q2020 ; 20
6443    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
6444    vshufi32x4         m16, m14, m17, q3131 ; 10
6445    vshufi32x4         m14, m17, q2020      ;  2
6446    vshufi32x4         m17, m19, m20, q3131 ; 14
6447    vshufi32x4         m15, m19, m20, q2020 ;  6
6448    vshufi32x4         m20, m25, m24, q3131 ; 26
6449    vshufi32x4         m18, m25, m24, q2020 ; 18
6450    vshufi32x4         m21, m29, m28, q3131 ; 30
6451    vshufi32x4         m19, m29, m28, q2020 ; 22
6452    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
6453    pmulhrsw           m22, m13, [cq+64*16] ; a1
6454    pmulhrsw           m23, m13, [cq+64*20] ; c1
6455    pmulhrsw           m24, m13, [cq+64*24] ; e1
6456    pmulhrsw           m25, m13, [cq+64*28] ; g1
6457    pmulhrsw           m26, m13, [cq+64*17] ; a3
6458    pmulhrsw           m27, m13, [cq+64*21] ; c3
6459    pmulhrsw           m28, m13, [cq+64*25] ; e3
6460    pmulhrsw           m29, m13, [cq+64*29] ; g3
; -- inv_txfm_add_dct_dct_64x32_8bpc, pass 1 (full-coefficient path), cont. --
; NOTE(review): this sequence begins above this excerpt; the register and
; memory state on entry (m13 scale factor, r4 scratch pointer, the a/c/e/g
; row groups in m22-m29) is established there -- confirm against the full file.
6461    mova        [cq+64* 8], m14
6462    mova        [cq+64* 9], m15
6463    mova        [cq+64*10], m16
6464    mova        [cq+64*11], m17
6465    mova        [cq+64*12], m18
6466    mova        [cq+64*13], m19
6467    mova        [cq+64*14], m20
6468    mova        [cq+64*15], m21
; scale the remaining input rows (eol comments name the 8x8 subblocks)
6469    pmulhrsw           m14, m13, [cq+64*18] ; a5
6470    pmulhrsw           m15, m13, [cq+64*22] ; c5
6471    pmulhrsw           m16, m13, [cq+64*26] ; e5
6472    pmulhrsw           m17, m13, [cq+64*30] ; g5
6473    pmulhrsw           m18, m13, [cq+64*19] ; a7
6474    pmulhrsw           m19, m13, [cq+64*23] ; c7
6475    pmulhrsw           m20, m13, [cq+64*27] ; e7
6476    pmulhrsw           m21, m13, [cq+64*31] ; g7
; regroup 128-bit row chunks: pair the a/c and e/g quarters
6477    vinserti32x8        m8, m22, ym23, 1 ; a10 a11 c10 c11
6478    vshufi32x4         m22, m23, q3232   ; a12 a13 c12 c13
6479    vinserti32x8        m9, m24, ym25, 1 ; e10 e11 g10 g11
6480    vshufi32x4         m24, m25, q3232   ; e12 e13 g12 g13
6481    vinserti32x8       m23, m26, ym27, 1 ; a30 a31 c30 c31
6482    vshufi32x4         m26, m27, q3232   ; a32 a33 c32 c33
6483    vinserti32x8       m11, m28, ym29, 1 ; e30 e31 g30 g31
6484    vshufi32x4         m28, m29, q3232   ; e32 e33 g32 g33
6485    mova        [cq+64* 0], m0
6486    mova        [cq+64* 1], m1
6487    mova        [cq+64* 2], m2
6488    mova        [cq+64* 3], m3
6489    mova        [cq+64* 4], m4
6490    mova        [cq+64* 5], m5
6491    mova        [cq+64* 6], m6
6492    mova        [cq+64* 7], m7
6493    vinserti32x8       m12, m14, ym15, 1 ; a50 a51 c50 c51
6494    vshufi32x4         m14, m15, q3232   ; a52 a53 c52 c53
6495    vinserti32x8       m13, m16, ym17, 1 ; e50 e51 g50 g51
6496    vshufi32x4         m16, m17, q3232   ; e52 e53 g52 g53
6497    vinserti32x8       m25, m18, ym19, 1 ; a70 a71 c70 c71
6498    vshufi32x4         m18, m19, q3232   ; a72 a73 c72 c73
6499    vinserti32x8       m17, m20, ym21, 1 ; e70 e71 g70 g71
6500    vshufi32x4         m20, m21, q3232   ; e72 e73 g72 g73
; select the odd input rows 1,3,...,31 (eol comments: row / destination reg)
; for the 32-point odd-half idct
6501    vshufi32x4         m27, m23, m11, q3131 ; 11 m27
6502    vshufi32x4         m23, m11, q2020      ;  3 m23
6503    vshufi32x4         m19, m26, m28, q3131 ; 27 m19
6504    vshufi32x4         m15, m26, m28, q2020 ; 19 m15
6505    vshufi32x4         m29, m25, m17, q3131 ; 15 m29
6506    vshufi32x4         m25, m17, q2020      ;  7 m25
6507    vshufi32x4         m21, m18, m20, q3131 ; 31 m21
6508    vshufi32x4         m17, m18, m20, q2020 ; 23 m17
6509    vshufi32x4         m20, m14, m16, q3131 ; 29 m20
6510    vshufi32x4         m16, m14, m16, q2020 ; 21 m16
6511    vshufi32x4         m18, m22, m24, q3131 ; 25 m18
6512    vshufi32x4         m14, m22, m24, q2020 ; 17 m14
6513    vshufi32x4         m26, m8, m9, q3131   ;  9 m26
6514    vshufi32x4         m22, m8, m9, q2020   ;  1 m22
6515    vshufi32x4         m28, m12, m13, q3131 ; 13 m28
6516    vshufi32x4         m24, m12, m13, q2020 ;  5 m24
6517    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
; scale intermediate rows from the r4 scratch area by 1/2 (pmulhrsw with
; pw_16384) before the second transpose
6518    vpbroadcastd       m13, [o(pw_16384)]
6519    pmulhrsw            m0, m13, [r4-64*21]
6520    pmulhrsw            m1, m13, [r4-64*22]
6521    pmulhrsw            m2, m13, [r4-64*23]
6522    pmulhrsw            m3, m13, [r4-64*24]
6523    pmulhrsw            m4, m13, [r4-64*25]
6524    pmulhrsw            m5, m13, [r4-64*26]
6525    pmulhrsw            m6, m13, [r4-64*27]
6526    pmulhrsw            m7, m13, [r4-64*28]
6527    mova        [cq+64*16], m14
6528    mova        [cq+64*17], m15
6529    mova        [cq+64*18], m16
6530    mova        [cq+64*19], m17
6531    mova        [cq+64*20], m18
6532    mova        [cq+64*21], m19
6533    mova        [cq+64*22], m20
6534    mova        [cq+64*23], m21
6535    pmulhrsw           m14, m13, [r4-64*12]
6536    pmulhrsw           m15, m13, [r4-64*11]
6537    pmulhrsw           m16, m13, [r4-64*10]
6538    pmulhrsw           m17, m13, [r4-64* 9]
6539    pmulhrsw           m18, m13, [r4-64* 8]
6540    pmulhrsw           m19, m13, [r4-64* 7]
6541    pmulhrsw           m20, m13, [r4-64* 6]
6542    pmulhrsw           m21, m13, [r4-64* 5]
6543    mova        [cq+64*24], m22
6544    mova        [cq+64*25], m23
6545    mova        [cq+64*26], m24
6546    mova        [cq+64*27], m25
6547    mova        [cq+64*28], m26
6548    mova        [cq+64*29], m27
6549    mova        [cq+64*30], m28
6550    mova        [cq+64*31], m29
6551    call .transpose_2x8x8_lo
6552    mova        [r4-64*12], m1
6553    mova        [r4-64*11], m3
6554    mova        [r4-64*10], m5
6555    mova        [r4-64* 9], m7
6556    mova        [r4-64* 8], m15
6557    mova        [r4-64* 7], m17
6558    mova        [r4-64* 6], m19
6559    mova        [r4-64* 5], m21
6560    vinserti32x8       m22, m0, ym14, 1     ; f00 f01 h00 h01
6561    vshufi32x4         m23, m0, m14, q3232  ; f02 f03 h02 h03
6562    vinserti32x8       m24, m2, ym16, 1     ; f20 f21 h20 h21
6563    vshufi32x4         m25, m2, m16, q3232  ; f22 f23 h22 h23
6564    vinserti32x8       m26, m4, ym18, 1     ; f40 f41 h40 h41
6565    vshufi32x4         m27, m4, m18, q3232  ; f42 f43 h42 h43
6566    vinserti32x8       m28, m6, ym20, 1     ; f60 f61 h60 h61
6567    vshufi32x4         m29, m6, m20, q3232  ; f62 f63 h62 h63
6568    pmulhrsw            m0, m13, [r4-64*20]
6569    pmulhrsw            m1, m13, [r4-64*19]
6570    pmulhrsw            m2, m13, [r4-64*18]
6571    pmulhrsw            m3, m13, [r4-64*17]
6572    pmulhrsw            m4, m13, [r4-64*16]
6573    pmulhrsw            m5, m13, [r4-64*15]
6574    pmulhrsw            m6, m13, [r4-64*14]
6575    pmulhrsw            m7, m13, [r4-64*13]
6576    pmulhrsw           m14, m13, [r4-64*29]
6577    pmulhrsw           m15, m13, [r4-64*30]
6578    pmulhrsw           m16, m13, [r4-64*31]
6579    pmulhrsw           m17, m13, [r4-64*32]
6580    pmulhrsw           m18, m13, [r4-64*33]
6581    pmulhrsw           m19, m13, [r4-64*34]
6582    pmulhrsw           m20, m13, [r4-64*35]
6583    pmulhrsw           m21, m13, [r4-64*36]
6584    call .transpose_2x8x8_lo
6585    mova       [r4-64*20], m1
6586    mova       [r4-64*19], m3
6587    mova       [r4-64*18], m5
6588    mova       [r4-64*17], m7
6589    mova       [r4-64*16], m15
6590    mova       [r4-64*15], m17
6591    mova       [r4-64*14], m19
6592    mova       [r4-64*13], m21
6593    vinserti32x8        m1, m4, ym18, 1     ; b40 b41 d40 d41
6594    vshufi32x4          m5, m4, m18, q3232  ; b42 b43 d42 d43
6595    vshufi32x4          m4, m0, m14, q3232  ; b02 b03 d02 d03
6596    vinserti32x8        m0, ym14, 1         ; b00 b01 d00 d01
6597    vinserti32x8       m14, m2, ym16, 1     ; b20 b21 d20 d21
6598    vshufi32x4         m18, m2, m16, q3232  ; b22 b23 d22 d23
6599    vinserti32x8       m15, m6, ym20, 1     ; b60 b61 d60 d61
6600    vshufi32x4         m19, m6, m20, q3232  ; b62 b63 d62 d63
; gather the even rows 0,4,8,...,28 and run the even-half idct
6601    vshufi32x4          m2, m0, m22, q3131  ;  8
6602    vshufi32x4          m0, m22, q2020      ;  0
6603    vshufi32x4          m3, m1, m26, q3131  ; 12
6604    vshufi32x4          m1, m26, q2020      ;  4
6605    vshufi32x4          m6, m4, m23, q3131  ; 24
6606    vshufi32x4          m4, m23, q2020      ; 16
6607    vshufi32x4          m7, m5, m27, q3131  ; 28
6608    vshufi32x4          m5, m27, q2020      ; 20
6609    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
6610    vshufi32x4         m16, m14, m24, q3131 ; 10
6611    vshufi32x4         m14, m24, q2020      ;  2
6612    vshufi32x4         m17, m15, m28, q3131 ; 14
6613    vshufi32x4         m15, m28, q2020      ;  6
6614    vshufi32x4         m20, m18, m25, q3131 ; 26
6615    vshufi32x4         m18, m25, q2020      ; 18
6616    vshufi32x4         m21, m19, m29, q3131 ; 30
6617    vshufi32x4         m19, m29, q2020      ; 22
6618    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
6619    mova               m22, [r4-64*20]
6620    mova               m26, [r4-64*16]
6621    mova               m23, [r4-64*19]
6622    mova               m27, [r4-64*15]
6623    mova               m24, [r4-64*18]
6624    mova               m28, [r4-64*14]
6625    mova               m25, [r4-64*17]
6626    mova               m29, [r4-64*13]
6627    mova        [r4-64*20], m14
6628    mova        [r4-64*19], m15
6629    mova        [r4-64*18], m16
6630    mova        [r4-64*17], m17
6631    mova        [r4-64*16], m18
6632    mova        [r4-64*15], m19
6633    mova        [r4-64*14], m20
6634    mova        [r4-64*13], m21
6635    mova               m19, [r4-64*12]
6636    mova               m11, [r4-64* 8]
6637    mova               m20, [r4-64*11]
6638    mova               m12, [r4-64* 7]
6639    mova               m21, [r4-64*10]
6640    mova                m8, [r4-64* 6]
6641    mova                m9, [r4-64* 9]
6642    mova               m18, [r4-64* 5]
6643    vshufi32x4         m14, m22, m26, q3232 ; b12 b13 d12 d13
6644    vinserti32x8       m22, ym26, 1         ; b10 b11 d10 d11
6645    vshufi32x4         m15, m23, m27, q3232 ; b32 b33 d32 d33
6646    vinserti32x8       m23, ym27, 1         ; b30 b31 d30 d31
6647    vshufi32x4         m16, m24, m28, q3232 ; b52 b53 d52 d53
6648    vinserti32x8       m24, ym28, 1         ; b50 b51 d50 d51
6649    vshufi32x4         m17, m25, m29, q3232 ; b72 b73 d72 d73
6650    vinserti32x8       m25, ym29, 1         ; b70 b71 d70 d71
6651    vinserti32x8       m27, m19, ym11, 1    ; f10 f11 h10 h11
6652    vshufi32x4         m19, m11, q3232      ; f12 f13 h12 h13
6653    vinserti32x8       m28, m20, ym12, 1    ; f30 f31 h30 h31
6654    vshufi32x4         m20, m12, q3232      ; f32 f33 h32 h33
6655    vinserti32x8       m29, m21, ym8, 1     ; f50 f51 h50 h51
6656    vshufi32x4         m21, m8, q3232       ; f52 f53 h52 h53
6657    vinserti32x8        m8, m9, ym18, 1     ; f70 f71 h70 h71
6658    vshufi32x4          m9, m18, q3232      ; f72 f73 h72 h73
; gather the odd rows 1,3,...,31 for the final odd-half pass
6659    vshufi32x4         m26, m22, m27, q3131 ;  9
6660    vshufi32x4         m22, m27, q2020      ;  1
6661    vshufi32x4         m27, m23, m28, q3131 ; 11
6662    vshufi32x4         m23, m28, q2020      ;  3
6663    vshufi32x4         m28, m24, m29, q3131 ; 13
6664    vshufi32x4         m24, m29, q2020      ;  5
6665    vshufi32x4         m29, m25, m8, q3131  ; 15
6666    vshufi32x4         m25, m8, q2020       ;  7
6667    vshufi32x4         m18, m14, m19, q3131 ; 25
6668    vshufi32x4         m14, m19, q2020      ; 17
6669    vshufi32x4         m19, m15, m20, q3131 ; 27
6670    vshufi32x4         m15, m20, q2020      ; 19
6671    vshufi32x4         m20, m16, m21, q3131 ; 29
6672    vshufi32x4         m16, m21, q2020      ; 21
6673    vshufi32x4         m21, m17, m9, q3131  ; 31
6674    vshufi32x4         m17, m9, q2020       ; 23
6675    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
6676    jmp .end
6677.fast: ; bottom/right halves are zero
; Pass 1 when only the top-left 16x16 coefficient quadrant can be nonzero,
; so the reduced-input (fast) idct helpers are used throughout.  dup16_perm
; (table at the top of the file) duplicates each 16-bit coefficient in
; place.  NOTE(review): ym23 is preloaded with a coefficient scale factor
; before this label -- established above this excerpt, confirm there.
6678    {evex}vpmulhrsw     ym8, ym23, [cq+64* 4]
6679    {evex}vpmulhrsw     xm1, xm23, [cq+64*12]
6680    mova                m28, [o(dup16_perm)]
6681    {evex}vpmulhrsw     ym7, ym23, [cq+64* 8]
6682          vpmulhrsw    ym22, ym23, [cq+64* 0]
6683    vpermb               m8, m28, m8
6684    vpermb              ym1, ym28, ym1
6685    vpermb               m7, m28, m7
6686    pmovzxwd             m9, ym22
6687    pslld                m9, 16
6688    call m(idct_16x16_internal_8bpc).main_fast2
6689    {evex}vpmulhrsw    ym21, ym23, [cq+64* 2]
6690    {evex}vpmulhrsw    xm15, xm23, [cq+64*14]
6691    {evex}vpmulhrsw    xm18, xm23, [cq+64*10]
6692    {evex}vpmulhrsw    ym14, ym23, [cq+64* 6]
6693    vpermb              m21, m28, m21
6694    punpcklwd          xm15, xm15
6695    vpermb             ym18, ym28, ym18
6696    vpermb              m14, m28, m14
6697    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
; odd rows (columns 1,3,5,...,15 of the coefficient block) for the
; 64-point odd half
6698          vpmulhrsw    ym22, ym23, [cq+64* 1]
6699    {evex}vpmulhrsw    xm29, xm23, [cq+64*15]
6700    {evex}vpmulhrsw    xm26, xm23, [cq+64* 9]
6701    {evex}vpmulhrsw    ym25, ym23, [cq+64* 7]
6702    {evex}vpmulhrsw    ym24, ym23, [cq+64* 5]
6703    {evex}vpmulhrsw    xm27, xm23, [cq+64*11]
6704    {evex}vpmulhrsw     xm8, xm23, [cq+64*13]
6705    {evex}vpmulhrsw    ym23,       [cq+64* 3]
6706    vpermb              m22, m28, m22
6707    punpcklwd          xm29, xm29
6708    vpermb             ym26, ym28, ym26
6709    vpermb              m25, m28, m25
6710    mova         [cq+64* 0], m14
6711    mova         [cq+64* 1], m15
6712    mova         [cq+64* 2], m16
6713    mova         [cq+64* 3], m17
6714    REPX {vpermb x, m28, x}, m24, m27, m23
6715    punpcklwd          xm28, xm8, xm8
6716    mova         [cq+64* 4], m18
6717    mova         [cq+64* 5], m19
6718    mova         [cq+64* 6], m20
6719    mova         [cq+64* 7], m21
6720    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
; pass-1 end: r4 -> stack scratch, pw_16384 scales results by 1/2
6721    mov                  r4, rsp
6722    vpbroadcastd        m13, [o(pw_16384)]
6723    mova         [r4+64*16], m4
6724    mova         [r4+64*17], m5
6725    mova         [r4+64*18], m6
6726    mova         [r4+64*19], m7
6727    mova         [r4+64*28], m26
6728    mova         [r4+64*29], m27
6729    mova         [r4+64*30], m28
6730    mova         [r4+64*31], m29
6731    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
6732    mova         [r4+64*20], m22
6733    mova         [r4+64*21], m23
6734    mova         [r4+64*22], m24
6735    mova         [r4+64*23], m25
6736    mova         [r4+64*24], m26
6737    mova         [r4+64*25], m27
6738    mova         [r4+64*26], m28
6739    mova         [r4+64*27], m29
; pass 2, first batch
6740    call .pass2_fast
6741    mova         [cq+64* 8], m14
6742    mova         [cq+64* 9], m15
6743    mova         [cq+64*10], m16
6744    mova         [cq+64*11], m17
6745    mova         [cq+64*12], m18
6746    mova         [cq+64*13], m19
6747    mova         [cq+64*14], m20
6748    mova         [cq+64*15], m21
6749    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6750    mova         [cq+64* 0], m0
6751    mova         [cq+64* 1], m1
6752    mova         [cq+64* 2], m2
6753    mova         [cq+64* 3], m3
6754    mova         [cq+64* 4], m4
6755    mova         [cq+64* 5], m5
6756    mova         [cq+64* 6], m6
6757    mova         [cq+64* 7], m7
6758    pmulhrsw             m0, m13, [r4+64*16]
6759    pmulhrsw             m1, m13, [r4+64*17]
6760    pmulhrsw             m2, m13, [r4+64*18]
6761    pmulhrsw             m3, m13, [r4+64*19]
6762    pmulhrsw             m4, m13, [r4+64*20]
6763    pmulhrsw             m5, m13, [r4+64*21]
6764    pmulhrsw             m6, m13, [r4+64*22]
6765    pmulhrsw             m7, m13, [r4+64*23]
6766    mova         [cq+64*16], m14
6767    mova         [cq+64*17], m15
6768    mova         [cq+64*18], m16
6769    mova         [cq+64*19], m17
6770    mova         [cq+64*20], m18
6771    mova         [cq+64*21], m19
6772    mova         [cq+64*22], m20
6773    mova         [cq+64*23], m21
6774    pmulhrsw            m14, m13, [r4+64*24]
6775    pmulhrsw            m15, m13, [r4+64*25]
6776    pmulhrsw            m16, m13, [r4+64*26]
6777    pmulhrsw            m17, m13, [r4+64*27]
6778    pmulhrsw            m18, m13, [r4+64*28]
6779    pmulhrsw            m19, m13, [r4+64*29]
6780    pmulhrsw            m20, m13, [r4+64*30]
6781    pmulhrsw            m21, m13, [r4+64*31]
6782    mova         [cq+64*24], m22
6783    mova         [cq+64*25], m23
6784    mova         [cq+64*26], m24
6785    mova         [cq+64*27], m25
6786    mova         [cq+64*28], m26
6787    mova         [cq+64*29], m27
6788    mova         [cq+64*30], m28
6789    mova         [cq+64*31], m29
6790    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
; pass 2, second batch (same structure); falls through into .end below
6791    call .pass2_fast
6792    mova         [r4+64*16], m14
6793    mova         [r4+64*17], m15
6794    mova         [r4+64*18], m16
6795    mova         [r4+64*19], m17
6796    mova         [r4+64*20], m18
6797    mova         [r4+64*21], m19
6798    mova         [r4+64*22], m20
6799    mova         [r4+64*23], m21
6800    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6801.end:
; Final stage shared by both paths: round (pw_2048 = >>4 via pmulhrsw),
; add to the destination pixels, and clear the consumed cq slots.
; Rows are written from both ends toward the middle: dstq walks down from
; the top, r3 walks up from dst+stride*28.
6802    vpbroadcastd        m13, [o(pw_2048)]
6803    lea                  r5, [strideq*3]
6804    pxor                m12, m12
6805    lea                  r3, [dstq+r5*8]
6806    lea                  r6, [strideq+r5] ; stride*4
6807    add                  r3, r6           ; dst+stride*28
; IDCT_64x32_END: produces one top row (stored at dstq+%4) and one
; mirrored bottom row (stored at r3+%5) per invocation.
;  %1 (src16):  register with the idct even-half row; for the second set
;               (%3 >= 8) it is first reloaded from the stack spill area
;  %2 (src32):  register with the matching odd-half row; also reused as
;               a scratch register for unpacked dst pixels
;  %3 (mem):    row index n; [cq+64*n] and [cq+64*(31-n)] hold the idct64
;               halves and are zeroed (m12) after use
;  %4/%5:       row offsets relative to dstq (top) and r3 (bottom)
6808%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
6809    mova                m11, [cq+64*(   %3)] ;  0
6810    mova                 m9, [cq+64*(31-%3)] ; 31
6811%if %3 >= 8
6812    mova                m%1, [rsp+64*(%1+16)]
6813%endif
6814    mova                m10, [dstq+%4]
6815    paddsw               m8, m11, m9
6816    psubsw              m11, m9
6817    paddsw               m9, m%1, m%2
6818    psubsw              m%1, m%2
6819    punpcklbw           m%2, m10, m12
6820    punpckhbw           m10, m12
6821    pmulhrsw             m8, m13
6822    pmulhrsw             m9, m13
6823    paddw                m8, m%2
6824    paddw                m9, m10
6825    mova                m10, [r3+%5]
6826    pmulhrsw            m11, m13
6827    pmulhrsw            m%1, m13
6828    mova    [cq+64*(   %3)], m12
6829    mova    [cq+64*(31-%3)], m12
6830    punpcklbw           m%2, m10, m12
6831    punpckhbw           m10, m12
6832    packuswb             m8, m9
6833    paddw               m11, m%2
6834    paddw               m%1, m10
6835    packuswb            m11, m%1
6836    mova          [dstq+%4], m8
6837    mova          [r3  +%5], m11
6838%if %3 == 3 || %3 == 7 || %3 == 11 ; advance to the next 4-row group
6839    add                dstq, r6
6840    sub                  r3, r6
6841%endif
6842%endmacro
6843    IDCT_64x32_END        0, 29,  0, strideq*0, r5
6844    IDCT_64x32_END        1, 28,  1, strideq*1, strideq*2
6845    IDCT_64x32_END        2, 27,  2, strideq*2, strideq*1
6846    IDCT_64x32_END        3, 26,  3, r5       , strideq*0
6847    IDCT_64x32_END        4, 25,  4, strideq*0, r5
6848    IDCT_64x32_END        5, 24,  5, strideq*1, strideq*2
6849    IDCT_64x32_END        6, 23,  6, strideq*2, strideq*1
6850    IDCT_64x32_END        7, 22,  7, r5       , strideq*0
6851    IDCT_64x32_END        0, 21,  8, strideq*0, r5
6852    IDCT_64x32_END        1, 20,  9, strideq*1, strideq*2
6853    IDCT_64x32_END        2, 19, 10, strideq*2, strideq*1
6854    IDCT_64x32_END        3, 18, 11, r5       , strideq*0
6855    IDCT_64x32_END        4, 17, 12, strideq*0, r5
6856    IDCT_64x32_END        5, 16, 13, strideq*1, strideq*2
6857    IDCT_64x32_END        6, 15, 14, strideq*2, strideq*1
6858    IDCT_64x32_END        7, 14, 15, r5       , strideq*0
6859    RET
6860ALIGN function_align
6861.dconly:
; dc-only shortcut: the output is a flat block derived from the single dc
; coefficient; delegates pixel writes to the shared 64-wide dconly2 tail.
6862    movsx               r6d, word [cq]
6863    mov                [cq], eobd        ; clear the dc coeff slot
6864    or                  r3d, 32          ; row count for dconly2
6865    imul                r6d, 181         ; dc = round(dc*181/256)
6866    add                 r6d, 128         ;   (181/256 ~= 1/sqrt(2))
6867    sar                 r6d, 8
6868    imul                r6d, 181         ; second scaling, with the
6869    add                 r6d, 128+256     ;   final >>1 (and both
6870    sar                 r6d, 8+1         ;   roundings) folded in
6871    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
6872ALIGN function_align
6873.pass1_end_part1:
; Combine the idct16/idct32 partial results with the idct64 halves kept in
; the r4 scratch area, producing the a-h output octant rows (see the eol
; comments inside the macro).  Falls through into .transpose_2x8x8_hi.
; IDCT_64x32_PASS1_END parameters:
;  %1 (src16): even-half row register (reloaded from cq when %1 != %3,
;              i.e. for the part2 invocations)
;  %2 (src32): odd-half row register
;  %3 (src64): row index selecting the idct64 halves at r4; the same two
;              r4 slots that are read are overwritten with two outputs
6874%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
6875%if %1 != %3
6876    mova                m%1, [cq+64*%1]
6877%endif
6878    mova                 m9, [r4+64*(%3-36)] ; idct64 32+n
6879    mova                m11, [r4+64*(-5-%3)] ; idct64 63-n
6880    psubsw               m8, m%1, m%2        ; idct32 31-n
6881    paddsw              m%1, m%2             ; idct32  0+n
6882%if %1 == %3
6883    psubsw              m%2, m8, m9   ; out 32+n e
6884    paddsw               m8, m9       ; out 31-n d
6885    psubsw               m9, m%1, m11 ; out 63-n h
6886    paddsw              m%1, m11      ; out  0+n a
6887%else
6888    paddsw              m%2, m8, m9   ; out 23-n c
6889    psubsw               m8, m9       ; out 40+n f
6890    paddsw               m9, m%1, m11 ; out  8+n b
6891    psubsw              m%1, m11      ; out 55-n g
6892%endif
6893    mova   [r4+64*(%3-36)], m8
6894    mova   [r4+64*(-5-%3)], m9
6895%endmacro
6896    IDCT_64x32_PASS1_END  0, 29,  0
6897    IDCT_64x32_PASS1_END  1, 28,  1
6898    IDCT_64x32_PASS1_END  2, 27,  2
6899    IDCT_64x32_PASS1_END  3, 26,  3
6900    IDCT_64x32_PASS1_END  4, 25,  4
6901    IDCT_64x32_PASS1_END  5, 24,  5
6902    IDCT_64x32_PASS1_END  6, 23,  6
6903    IDCT_64x32_PASS1_END  7, 22,  7
; Word-granularity 8x8 transpose of m22-m29 via punpck ladders (the eol
; comments track element positions), falling through to do m0-m7 as well.
6904.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
6905    punpcklwd            m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
6906    punpckhwd           m25, m24      ; e4 f4 e5 f5 e6 f6 e7 f7
6907    punpcklwd           m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
6908    punpckhwd           m23, m22      ; g4 h4 g5 h5 g6 h6 g7 h7
6909    punpcklwd           m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
6910    punpckhwd           m29, m28      ; a4 b4 a5 b5 a6 b6 a7 b7
6911    punpcklwd           m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
6912    punpckhwd           m27, m26      ; c4 d4 c5 d5 c6 d6 c7 d7
6913    punpckldq           m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
6914    punpckhdq           m29, m27      ; a6 b6 c6 d6 a7 b7 c7 d7
6915    punpckldq           m27, m8, m24  ; e0 f0 g0 h0 e1 f1 g1 h1
6916    punpckhdq            m8, m24      ; e2 f2 g2 h2 e3 f3 g3 h3
6917    punpckhdq           m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
6918    punpckldq           m22, m28      ; a0 b0 c0 d0 a1 b1 c1 d1
6919    punpckldq           m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
6920    punpckhdq           m25, m23      ; e6 f6 g6 h6 e7 f7 g7 h7
6921    punpckhqdq          m23, m22, m27 ;  1 23
6922    punpcklqdq          m22, m27      ;  0 22
6923    punpckhqdq          m27, m26, m28 ;  5 27
6924    punpcklqdq          m26, m28      ;  4 26
6925    punpcklqdq          m28, m29, m25 ;  6 28
6926    punpckhqdq          m29, m25      ;  7 29
6927    punpckhqdq          m25, m24, m8  ;  3 25
6928    punpcklqdq          m24, m8       ;  2 24
; 8x8 word transpose of m0-m7 (m8 is scratch); also used on its own.
6929.transpose_8x8:
6930    punpckhwd            m8, m4, m5
6931    punpcklwd            m4, m5
6932    punpckhwd            m5, m0, m1
6933    punpcklwd            m0, m1
6934    punpckhwd            m1, m6, m7
6935    punpcklwd            m6, m7
6936    punpckhwd            m7, m2, m3
6937    punpcklwd            m2, m3
6938    punpckhdq            m3, m0, m2
6939    punpckldq            m0, m2
6940    punpckldq            m2, m4, m6
6941    punpckhdq            m4, m6
6942    punpckhdq            m6, m5, m7
6943    punpckldq            m5, m7
6944    punpckldq            m7, m8, m1
6945    punpckhdq            m8, m1
6946    punpckhqdq           m1, m0, m2
6947    punpcklqdq           m0, m2
6948    punpcklqdq           m2, m3, m4
6949    punpckhqdq           m3, m4
6950    punpcklqdq           m4, m5, m7
6951    punpckhqdq           m5, m7
6952    punpckhqdq           m7, m6, m8
6953    punpcklqdq           m6, m8
6954    ret
6955.pass1_end_part2:
; Second set of pass-1 end merges (row indices 8-15; here %1 != %3 so the
; macro loads from cq and produces the b/c/f/g octants).  Falls through
; into .transpose_2x8x8_lo.
6956    IDCT_64x32_PASS1_END  0, 21,  8
6957    IDCT_64x32_PASS1_END  1, 20,  9
6958    IDCT_64x32_PASS1_END  2, 19, 10
6959    IDCT_64x32_PASS1_END  3, 18, 11
6960    IDCT_64x32_PASS1_END  4, 17, 12
6961    IDCT_64x32_PASS1_END  5, 16, 13
6962    IDCT_64x32_PASS1_END  6, 15, 14
6963    IDCT_64x32_PASS1_END  7, 14, 15
; Two 8x8 word transposes via punpck ladders: m0-m7 (stored in reversed
; row order) first, then m14-m21 (m8 is scratch for both).
6964.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
6965    punpcklwd            m8, m3, m2
6966    punpckhwd            m3, m2
6967    punpcklwd            m2, m1, m0
6968    punpckhwd            m1, m0
6969    punpcklwd            m0, m7, m6
6970    punpckhwd            m7, m6
6971    punpcklwd            m6, m5, m4
6972    punpckhwd            m5, m4
6973    punpckldq            m4, m7, m5
6974    punpckhdq            m7, m5
6975    punpckldq            m5, m8, m2
6976    punpckhdq            m8, m2
6977    punpckhdq            m2, m0, m6
6978    punpckldq            m0, m6
6979    punpckldq            m6, m3, m1
6980    punpckhdq            m3, m1
6981    punpckhqdq           m1, m0, m5
6982    punpcklqdq           m0, m5
6983    punpckhqdq           m5, m4, m6
6984    punpcklqdq           m4, m6
6985    punpcklqdq           m6, m7, m3
6986    punpckhqdq           m7, m3
6987    punpckhqdq           m3, m2, m8
6988    punpcklqdq           m2, m8
6989    punpckhwd            m8, m18, m19
6990    punpcklwd           m18, m19
6991    punpckhwd           m19, m14, m15
6992    punpcklwd           m14, m15
6993    punpckhwd           m15, m20, m21
6994    punpcklwd           m20, m21
6995    punpckhwd           m21, m16, m17
6996    punpcklwd           m16, m17
6997    punpckhdq           m17, m14, m16
6998    punpckldq           m14, m16
6999    punpckldq           m16, m18, m20
7000    punpckhdq           m18, m20
7001    punpckhdq           m20, m19, m21
7002    punpckldq           m19, m21
7003    punpckldq           m21, m8, m15
7004    punpckhdq            m8, m15
7005    punpckhqdq          m15, m14, m16
7006    punpcklqdq          m14, m16
7007    punpcklqdq          m16, m17, m18
7008    punpckhqdq          m17, m18
7009    punpcklqdq          m18, m19, m21
7010    punpckhqdq          m19, m21
7011    punpckhqdq          m21, m20, m8
7012    punpcklqdq          m20, m8
7013    ret
7014.pass2_fast:
; Reorder the transposed 128-bit row chunks into idct input order (the eol
; comments give the destination row index), then tail-jump into the 32x16
; odd-half fast idct; its ret returns directly to our caller.
7015    vshufi32x4          m24, m9, m15, q3131  ;  5
7016    vshufi32x4          m22, m9, m15, q2020  ;  1
7017    vshufi32x4          m15, m1, m16, q3131  ;  6
7018    vshufi32x4          m14, m1, m16, q2020  ;  2
7019    vshufi32x4           m1, m0, m3, q3131   ;  4
7020    vshufi32x4           m0, m3, q2020       ;  0
7021    vshufi32x4           m3, m8, m2, q3131   ; 12
7022    vshufi32x4           m2, m8, m2, q2020   ;  8
7023    vshufi32x4          m25, m11, m17, q3131 ;  7
7024    vshufi32x4          m23, m11, m17, q2020 ;  3
7025    vshufi32x4          m17, m5, m19, q3131  ; 14
7026    vshufi32x4          m16, m5, m19, q2020  ; 10
7027    vshufi32x4          m29, m6, m20, q3131  ; 15
7028    vshufi32x4          m27, m6, m20, q2020  ; 11
7029    vshufi32x4          m28, m4, m18, q3131  ; 13
7030    vshufi32x4          m26, m4, m18, q2020  ;  9
7031    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
7032
;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x64_8bpc(dst, stride, c, eob)
; 64x64 inverse DCT+DCT with add-to-destination, 8 bpc.  Allocates 30
; vector registers and 64*96 bytes of stack scratch.  eob < 136 means
; only the top-left coefficient quadrants can be nonzero (.fast path).
; NOTE(review): .pass2_end and the .end/.dconly tails referenced here are
; defined later in the file, beyond this excerpt.
;-----------------------------------------------------------------------
7033cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
7034    lea                  r5, [o_base]
7035    test               eobd, eobd
7036    jz .dconly
7037    PROLOGUE              0, 7, 30, 64*96, dst, stride, c, eob
7038%undef cmp
7039    cmp                eobd, 136
7040    jb .fast
; idct64 odd part: four groups of four input rows, results accumulated
; into the r4 (rsp) scratch area
7041    mova                 m0, [cq+64* 1]
7042    mova                 m1, [cq+64*31]
7043    mova                 m2, [cq+64*17]
7044    mova                 m3, [cq+64*15]
7045    vpbroadcastd        m10, [o(pd_2048)]
7046    mov                  r4, rsp
7047    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7048    mova                 m0, [cq+64* 7]
7049    mova                 m1, [cq+64*25]
7050    mova                 m2, [cq+64*23]
7051    mova                 m3, [cq+64* 9]
7052    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7053    mova                 m0, [cq+64* 5]
7054    mova                 m1, [cq+64*27]
7055    mova                 m2, [cq+64*21]
7056    mova                 m3, [cq+64*11]
7057    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7058    mova                 m0, [cq+64* 3]
7059    mova                 m1, [cq+64*29]
7060    mova                 m2, [cq+64*19]
7061    mova                 m3, [cq+64*13]
7062    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7063    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
; even half: rows 0,8,16,24 and 4,12,20,28, then the 32-point odd half
; on rows 2,6,...,30
7064    mova                 m0, [cq+64* 0]
7065    mova                 m1, [cq+64* 8]
7066    mova                 m2, [cq+64*16]
7067    mova                 m3, [cq+64*24]
7068    mova                m14, [cq+64* 4]
7069    mova                m15, [cq+64*12]
7070    mova                m16, [cq+64*20]
7071    mova                m17, [cq+64*28]
7072    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
7073    mova                m22, [cq+64* 2]
7074    mova                m29, [cq+64*30]
7075    mova                m26, [cq+64*18]
7076    mova                m25, [cq+64*14]
7077    mova                m24, [cq+64*10]
7078    mova                m27, [cq+64*22]
7079    mova                m28, [cq+64*26]
7080    mova                m23, [cq+64* 6]
7081    mova         [cq+64* 0], m14
7082    mova         [cq+64* 1], m15
7083    mova         [cq+64* 2], m16
7084    mova         [cq+64* 3], m17
7085    mova         [cq+64* 4], m18
7086    mova         [cq+64* 5], m19
7087    mova         [cq+64* 6], m20
7088    mova         [cq+64* 7], m21
7089    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
; pass-1 end: merge idct16/32/64 partials (pw_8192 = 1/4 scaling here),
; then transpose and run pass 2 on each half via .pass2_end
7090    vpbroadcastd        m13, [o(pw_8192)]
7091    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
7092    mova         [r4+64*36], m1
7093    mova         [r4+64*37], m3
7094    mova         [r4+64*38], m5
7095    mova         [r4+64*39], m7
7096    mova         [r4+64*44], m23
7097    mova         [r4+64*45], m25
7098    mova         [r4+64*46], m27
7099    mova         [r4+64*47], m29
7100    pmulhrsw            m23, m13, m0 ; a0
7101    pmulhrsw            m25, m13, m2 ; a2
7102    pmulhrsw            m27, m13, m4 ; a4
7103    pmulhrsw            m29, m13, m6 ; a6
7104    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
7105    lea                  r6, [r4-64*4]
7106    add                  r4, 64*28
7107    call .pass2_end
7108    mov                  r4, rsp
7109    mova                 m0, [r4+64*23]
7110    mova                 m1, [r4+64*22]
7111    mova                 m2, [r4+64*21]
7112    mova                 m3, [r4+64*20]
7113    mova                 m4, [r4+64*19]
7114    mova                 m5, [r4+64*18]
7115    mova                 m6, [r4+64*17]
7116    mova                 m7, [r4+64*16]
7117    mova                m22, [r4+64*15]
7118    mova                m23, [r4+64*14]
7119    mova                m24, [r4+64*13]
7120    mova                m25, [r4+64*12]
7121    mova                m26, [r4+64*11]
7122    mova                m27, [r4+64*10]
7123    mova                m28, [r4+64* 9]
7124    mova                m29, [r4+64* 8]
7125    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
7126    vpbroadcastd        m13, [o(pw_8192)]
7127    mova         [r4+64* 8], m1
7128    mova         [r4+64* 9], m3
7129    mova         [r4+64*10], m5
7130    mova         [r4+64*11], m7
7131    mova         [r4+64*16], m23
7132    mova         [r4+64*17], m25
7133    mova         [r4+64*18], m27
7134    mova         [r4+64*19], m29
7135    pmulhrsw            m23, m13, m0 ; b0
7136    pmulhrsw            m25, m13, m2 ; b2
7137    pmulhrsw            m27, m13, m4 ; b4
7138    pmulhrsw            m29, m13, m6 ; b6
7139    mova                 m0, [r4+64*31]
7140    mova                 m1, [r4+64*30]
7141    mova                 m2, [r4+64*29]
7142    mova                 m3, [r4+64*28]
7143    mova                 m4, [r4+64*27]
7144    mova                 m5, [r4+64*26]
7145    mova                 m6, [r4+64*25]
7146    mova                 m7, [r4+64*24]
7147    mova                m14, [r4+64* 7]
7148    mova                m15, [r4+64* 6]
7149    mova                m16, [r4+64* 5]
7150    mova                m17, [r4+64* 4]
7151    mova                m18, [r4+64* 3]
7152    mova                m19, [r4+64* 2]
7153    mova                m20, [r4+64* 1]
7154    mova                m21, [r4+64* 0]
7155    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
7156    mov                  r6, cq
7157    call .pass2_end
7158    jmp .end
7159.fast: ; bottom/right halves are zero
; Only the top-left 16x16 coefficients can be nonzero; same structure as
; the 64x32 fast path above, but with pw_8192 scaling and the cq buffer
; reused as scratch.  dup16_perm duplicates each 16-bit coefficient.
7160    mova                m28, [o(dup16_perm)]
7161    pmovzxwd             m9,       [cq+64* 0]
7162    vpermb               m8, m28,  [cq+64* 4]
7163    vpermb              ym1, ym28, [cq+64*12]
7164    vpermb               m7, m28,  [cq+64* 8]
7165    pslld                m9, 16
7166    call m(idct_16x16_internal_8bpc).main_fast2
7167    vpermb              m21, m28,  [cq+64* 2]
7168    vpermb             ym15, ym28, [cq+64*14]
7169    vpermb             ym18, ym28, [cq+64*10]
7170    vpermb              m14, m28,  [cq+64* 6]
7171    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
; odd rows 1,3,...,15 feed the 64-point odd half
7172    vpermb              m22, m28,  [cq+64* 1]
7173    vpermb             ym29, ym28, [cq+64*15]
7174    vpermb             ym26, ym28, [cq+64* 9]
7175    vpermb              m25, m28,  [cq+64* 7]
7176    vpermb              m24, m28,  [cq+64* 5]
7177    vpermb             ym27, ym28, [cq+64*11]
7178    vpermb              m23, m28,  [cq+64* 3]
7179    vpermb             ym28, ym28, [cq+64*13]
7180    mova         [cq+64* 0], m14
7181    mova         [cq+64* 1], m15
7182    mova         [cq+64* 2], m16
7183    mova         [cq+64* 3], m17
7184    mova         [cq+64* 4], m18
7185    mova         [cq+64* 5], m19
7186    mova         [cq+64* 6], m20
7187    mova         [cq+64* 7], m21
7188    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
; pass-1 end: pw_8192 = 1/4 scaling applied by the helpers below
7189    vpbroadcastd        m13, [o(pw_8192)]
7190    mova         [cq+64*16], m4
7191    mova         [cq+64*17], m5
7192    mova         [cq+64*18], m6
7193    mova         [cq+64*19], m7
7194    mova         [cq+64*28], m26
7195    mova         [cq+64*29], m27
7196    mova         [cq+64*30], m28
7197    mova         [cq+64*31], m29
7198    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
7199    mova         [cq+64*20], m22
7200    mova         [cq+64*21], m23
7201    mova         [cq+64*22], m24
7202    mova         [cq+64*23], m25
7203    mova         [cq+64*24], m26
7204    mova         [cq+64*25], m27
7205    mova         [cq+64*26], m28
7206    mova         [cq+64*27], m29
7207    lea                  r4, [rsp+64*64]
7208    lea                  r3, [rsp+64*32]
; pass 2, first batch (.pass2_fast here is this function's own local
; label, defined later in the file beyond this excerpt)
7209    call .pass2_fast
7210    pmulhrsw             m0, m13, [cq+64*16]
7211    pmulhrsw             m1, m13, [cq+64*17]
7212    pmulhrsw             m2, m13, [cq+64*18]
7213    pmulhrsw             m3, m13, [cq+64*19]
7214    pmulhrsw             m4, m13, [cq+64*20]
7215    pmulhrsw             m5, m13, [cq+64*21]
7216    pmulhrsw             m6, m13, [cq+64*22]
7217    pmulhrsw             m7, m13, [cq+64*23]
7218    pmulhrsw            m14, m13, [cq+64*24]
7219    pmulhrsw            m15, m13, [cq+64*25]
7220    pmulhrsw            m16, m13, [cq+64*26]
7221    pmulhrsw            m17, m13, [cq+64*27]
7222    pmulhrsw            m18, m13, [cq+64*28]
7223    pmulhrsw            m19, m13, [cq+64*29]
7224    pmulhrsw            m20, m13, [cq+64*30]
7225    pmulhrsw            m21, m13, [cq+64*31]
7226    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
7227    mov                  r4, rsp
7228    mov                  r3, cq
; pass 2, second batch; falls through into .end below
7229    call .pass2_fast
.end:
    ; Final combination: fold the idct16, idct32 and idct64 partial
    ; results into the 64 output rows, producing rows n, 31-n, 32+n and
    ; 63-n each iteration (buffers are walked from both ends inward),
    ; then round (pmulhrsw by 2048/32768 = >>4), add to the destination
    ; pixels and zero the coefficient buffer as it is consumed.
    vpbroadcastd        m17, [o(pw_2048)]
    lea                  r5, [strideq*8]
    mov                  r3, dstq
    pxor                m16, m16       ; zero: clears cq and unpacks dst bytes
    sub                  r4, 64*5 ; rsp+64*31
    mov                  r6, rsp
.end_loop:
    mova                 m2, [r6+64*32] ; idct16 0+n  lo
    mova                 m7, [r6+64*48] ; idct32 31-n lo
    mova                 m6, [cq+64* 0] ; idct16 0+n  hi
    mova                 m0, [cq+64*16] ; idct32 31-n hi
    mova                 m4, [r4+64*64] ; idct64 63-n lo
    mova                 m1, [r4+64* 0] ; idct64 63-n hi
    mova                 m5, [r6+64*64] ; idct64 32+n lo
    mova                 m8, [r6+64* 0] ; idct64 32+n hi
    sub                  r3, strideq
    paddsw               m3, m2, m7     ; idct32  0+n lo
    mova                m12, [dstq+r5*0]
    psubsw               m2, m7         ; idct32 31-n lo
    mova                m15, [r3  +r5*8]
    paddsw               m7, m6, m0     ; idct32  0+n hi
    mova                m13, [r3  +r5*4]
    psubsw               m6, m0         ; idct32 31-n hi
    mova                m14, [dstq+r5*4]
    paddsw               m0, m3, m4     ; out  0+n lo
    add                  r6, 64
    psubsw               m3, m4         ; out 63-n lo
    sub                  r4, 64
    paddsw               m4, m7, m1     ; out  0+n hi
    mova         [cq+64* 0], m16        ; clear consumed coefficients
    psubsw               m7, m1         ; out 63-n hi
    mova         [cq+64*16], m16
    paddsw               m1, m2, m5     ; out 31-n lo
    add                  cq, 64
    psubsw               m2, m5         ; out 32+n lo
    paddsw               m5, m6, m8     ; out 31-n hi
    psubsw               m6, m8         ; out 32+n hi
    ; Round the eight result halves while zero-extending the four
    ; destination rows from bytes to words (interleaved for ILP).
    pmulhrsw             m0, m17
    punpcklbw            m8, m12, m16
    pmulhrsw             m4, m17
    punpckhbw           m12, m16
    pmulhrsw             m3, m17
    punpcklbw           m11, m15, m16
    pmulhrsw             m7, m17
    punpckhbw           m15, m16
    pmulhrsw             m1, m17
    punpcklbw            m9, m13, m16
    pmulhrsw             m5, m17
    punpckhbw           m13, m16
    pmulhrsw             m2, m17
    punpcklbw           m10, m14, m16
    pmulhrsw             m6, m17
    punpckhbw           m14, m16
    ; Add to the destination pixels and repack to bytes with unsigned
    ; saturation.
    paddw                m0, m8
    paddw                m4, m12
    packuswb             m0, m4
    paddw                m3, m11
    paddw                m7, m15
    packuswb             m3, m7
    paddw                m1, m9
    paddw                m5, m13
    packuswb             m1, m5
    paddw                m2, m10
    paddw                m6, m14
    packuswb             m2, m6
    mova        [dstq+r5*0], m0
    mova        [r3  +r5*8], m3
    mova        [r3  +r5*4], m1
    mova        [dstq+r5*4], m2
    add                dstq, strideq
    cmp                  r6, r4         ; loop until the two pointers meet
    jb .end_loop
    RET
.dconly:
    ; DC-only shortcut: sign-extend the DC coefficient into r6d and
    ; overwrite it in the buffer with eobd (NOTE(review): presumably 0
    ; on this path — confirm against the function entry), then tail-call
    ; the shared 64-column dconly helper; r3d |= 64 appears to encode
    ; the output row count for that helper.
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 64
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
ALIGN function_align
.pass2_end:
    ; Full second pass for one half of the columns. Finishes the pass-1
    ; rounding (m13 = pw_8192), transposes 16x16 sub-blocks by
    ; interleaving 128-bit lanes (vinserti32x8 / vshufi32x4), then runs
    ; the idct16 even half, the idct32 odd half, and the idct64 odd
    ; half in four main_part1 quarters before tail-calling main_part2.
    REPX  {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
    ; Spill odd rows to r4 scratch; they are reloaded (with rounding)
    ; for the idct64 stage further down.
    mova         [r4+64*20], m1
    mova         [r4+64*21], m3
    mova         [r4+64*22], m5
    mova         [r4+64*23], m7
    vinserti32x8         m1, m23, ym14, 1    ; a00 a01 c00 c01
    vshufi32x4           m3, m23, m14, q3232 ; a02 a03 c02 c03
    vinserti32x8         m5, m22, ym0, 1     ; e00 e01 g00 g01
    vshufi32x4          m14, m22, m0, q3232  ; e02 e03 g02 g03
    mova         [r4+64*12], m15
    mova         [r4+64*13], m17
    mova         [r4+64*14], m19
    mova         [r4+64*15], m21
    vinserti32x8        m15, m27, ym18, 1    ; a40 a41 c40 c41
    vshufi32x4          m17, m27, m18, q3232 ; a42 a43 c42 c43
    vinserti32x8        m18, m26, ym4, 1     ; e40 e41 g40 g41
    vshufi32x4          m19, m26, m4, q3232  ; e42 e43 g42 g43
    vinserti32x8        m22, m25, ym16, 1    ; a20 a21 c20 c21
    vshufi32x4          m26, m25, m16, q3232 ; a22 a23 c22 c23
    vinserti32x8        m25, m24, ym2, 1     ; e20 e21 g20 g21
    vshufi32x4          m27, m24, m2, q3232  ; e22 e23 g22 g23
    vinserti32x8        m23, m29, ym20, 1    ; a60 a61 c60 c61
    vshufi32x4          m29, m20, q3232      ; a62 a63 c62 c63
    vshufi32x4          m13, m28, m6, q3232  ; e62 e63 g62 g63
    vinserti32x8        m28, ym6, 1          ; e60 e61 g60 g61
    ; Gather the even rows for the idct16 halves.
    vshufi32x4           m0, m1, m5, q2020   ;  0
    vshufi32x4           m1, m5, q3131       ;  8
    vshufi32x4           m2, m3, m14, q2020  ; 16
    vshufi32x4           m3, m14, q3131      ; 24
    vshufi32x4          m14, m15, m18, q2020 ;  4
    vshufi32x4          m15, m18, q3131      ; 12
    vshufi32x4          m16, m17, m19, q2020 ; 20
    vshufi32x4          m17, m19, q3131      ; 28
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    vshufi32x4          m24, m22, m25, q3131 ; 10
    vshufi32x4          m22, m25, q2020      ;  2
    vshufi32x4          m25, m23, m28, q3131 ; 14
    vshufi32x4          m23, m28, q2020      ;  6
    vshufi32x4          m28, m26, m27, q3131 ; 26
    vshufi32x4          m26, m27, q2020      ; 18
    vshufi32x4          m27, m29, m13, q2020 ; 22
    vshufi32x4          m29, m13, q3131      ; 30
    ; Park the idct16 results in r6 scratch for the .end loop.
    mova         [r6+64* 0], m0
    mova         [r6+64* 1], m1
    mova         [r6+64* 2], m2
    mova         [r6+64* 3], m3
    mova         [r6+64* 4], m4
    mova         [r6+64* 5], m5
    mova         [r6+64* 6], m6
    mova         [r6+64* 7], m7
    mova         [r6+64* 8], m14
    mova         [r6+64* 9], m15
    mova         [r6+64*10], m16
    mova         [r6+64*11], m17
    mova         [r6+64*12], m18
    mova         [r6+64*13], m19
    mova         [r6+64*14], m20
    mova         [r6+64*15], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    vpbroadcastd        m13, [o(pw_8192)]
    ; idct32 odd-half results, stored in reverse order (31-n layout).
    mova         [r6+64*16], m29
    mova         [r6+64*17], m28
    mova         [r6+64*18], m27
    mova         [r6+64*19], m26
    mova         [r6+64*20], m25
    mova         [r6+64*21], m24
    mova         [r6+64*22], m23
    mova         [r6+64*23], m22
    mova         [r6+64*24], m21
    mova         [r6+64*25], m20
    mova         [r6+64*26], m19
    mova         [r6+64*27], m18
    mova         [r6+64*28], m17
    mova         [r6+64*29], m16
    mova         [r6+64*30], m15
    mova         [r6+64*31], m14
    ; Reload the spilled odd rows (rounded) and lane-transpose them for
    ; the four idct64 quarters.
    pmulhrsw            m15, m13, [r4+64* 8] ;  1  9 17 25
    pmulhrsw            m16, m13, [r4+64*12]
    pmulhrsw            m17, m13, [r4+64*16]
    pmulhrsw            m18, m13, [r4+64*20]
    pmulhrsw            m19, m13, [r4+64*11] ;  7 15 23 31
    pmulhrsw            m20, m13, [r4+64*15]
    pmulhrsw            m21, m13, [r4+64*19]
    pmulhrsw            m22, m13, [r4+64*23]
    vinserti32x8        m14, m15, ym16, 1 ; a1  a9  c1  c9
    vshufi32x4          m15, m16, q3232   ; a17 a25 c17 c25
    vinserti32x8        m16, m17, ym18, 1 ; e1  e9  g1  g9
    vshufi32x4          m17, m18, q3232   ; e17 e25 g17 g25
    pmulhrsw            m23, m13, [r4+64*10] ;  5 13 21 29
    pmulhrsw            m24, m13, [r4+64*14]
    pmulhrsw            m25, m13, [r4+64*18]
    pmulhrsw            m26, m13, [r4+64*22]
    vinserti32x8        m18, m19, ym20, 1 ; a7  a15 c7  c15
    vshufi32x4          m19, m20, q3232   ; a23 a31 c23 c31
    vinserti32x8        m20, m21, ym22, 1 ; e7  e15 g7  g15
    vshufi32x4          m21, m22, q3232   ; e23 e31 g23 g31
    pmulhrsw            m27, m13, [r4+64* 9] ;  3 11 19 27
    pmulhrsw            m28, m13, [r4+64*13]
    pmulhrsw            m29, m13, [r4+64*17]
    pmulhrsw            m13,      [r4+64*21]
    vshufi32x4           m0, m14, m16, q2020 ;  1
    vshufi32x4           m1, m19, m21, q3131 ; 31
    vshufi32x4           m2, m15, m17, q2020 ; 17
    vshufi32x4           m3, m18, m20, q3131 ; 15
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    vshufi32x4           m0, m18, m20, q2020 ;  7
    vshufi32x4           m1, m15, m17, q3131 ; 25
    vshufi32x4           m2, m19, m21, q2020 ; 23
    vshufi32x4           m3, m14, m16, q3131 ;  9
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    vinserti32x8        m22, m23, ym24, 1 ; a5  a13 c5  c13
    vshufi32x4          m23, m24, q3232   ; a21 a29 c21 c29
    vinserti32x8        m24, m25, ym26, 1 ; e5  e13 g5  g13
    vshufi32x4          m25, m26, q3232   ; e21 e29 g21 g29
    vinserti32x8        m26, m27, ym28, 1 ; a3  a11 c3  c11
    vshufi32x4          m27, m28, q3232   ; a19 a27 c19 c27
    vinserti32x8        m28, m29, ym13, 1 ; e3  e11 g3  g11
    vshufi32x4          m29, m13, q3232   ; e19 e27 g19 g27
    vshufi32x4           m0, m22, m24, q2020 ;  5
    vshufi32x4           m1, m27, m29, q3131 ; 27
    vshufi32x4           m2, m23, m25, q2020 ; 21
    vshufi32x4           m3, m26, m28, q3131 ; 11
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    vshufi32x4           m0, m26, m28, q2020 ;  3
    vshufi32x4           m1, m23, m25, q3131 ; 29
    vshufi32x4           m2, m27, m29, q2020 ; 19
    vshufi32x4           m3, m22, m24, q3131 ; 13
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
ALIGN function_align
.pass2_fast:
    ; Reduced second pass, used when only the low-frequency input rows
    ; are non-zero: each idct64 quarter takes just two non-zero inputs
    ; (main_part1_fast), and the idct16/idct32 halves use their *fast2
    ; variants. Inputs are the transposed pass-1 coefficients in
    ; m0-m11/m14-m21; results are written to [r3+64*0..31].
    vshufi32x4          m23, m1, m16, q3131  ;  6
    vshufi32x4          m22, m1, m16, q2020  ;  2
    vshufi32x4          m14, m0, m3, q3131   ;  4
    vshufi32x4          m26, m0, m3, q2020   ;  0
    vshufi32x4          m28, m9, m15, q3131  ;  5
    vshufi32x4           m0, m9, m15, q2020  ;  1
    vshufi32x4          m16, m11, m17, q3131 ;  7
    vshufi32x4          m29, m11, m17, q2020 ;  3
    vshufi32x4          m15, m8, m2, q3131   ; 12
    vshufi32x4          m27, m8, m2, q2020   ;  8
    vshufi32x4          m25, m5, m19, q3131  ; 14
    vshufi32x4          m24, m5, m19, q2020  ; 10
    vshufi32x4           m3, m6, m20, q3131  ; 15
    vshufi32x4          m19, m6, m20, q2020  ; 11
    vshufi32x4          m17, m4, m18, q3131  ; 13
    vshufi32x4          m18, m4, m18, q2020  ;  9
    ; idct64 odd half, one quarter per call: rows (1,15), (7,9),
    ; (5,11), (3,13) in m0/m3.
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m16             ;  7
    mova                 m3, m18             ;  9
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m28             ;  5
    mova                 m3, m19             ; 11
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m29             ;  3
    mova                 m3, m17             ; 13
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
    mova                 m0, m26             ;  0
    mova                 m1, m27             ;  8
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    ; idct16 results to [r3+64*0..15].
    mova         [r3+64* 0], m0
    mova         [r3+64* 1], m1
    mova         [r3+64* 2], m2
    mova         [r3+64* 3], m3
    mova         [r3+64* 4], m4
    mova         [r3+64* 5], m5
    mova         [r3+64* 6], m6
    mova         [r3+64* 7], m7
    mova         [r3+64* 8], m14
    mova         [r3+64* 9], m15
    mova         [r3+64*10], m16
    mova         [r3+64*11], m17
    mova         [r3+64*12], m18
    mova         [r3+64*13], m19
    mova         [r3+64*14], m20
    mova         [r3+64*15], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
    ; idct32 odd-half results in reverse (31-n) order.
    mova         [r3+64*16], m29
    mova         [r3+64*17], m28
    mova         [r3+64*18], m27
    mova         [r3+64*19], m26
    mova         [r3+64*20], m25
    mova         [r3+64*21], m24
    mova         [r3+64*22], m23
    mova         [r3+64*23], m22
    mova         [r3+64*24], m21
    mova         [r3+64*25], m20
    mova         [r3+64*26], m19
    mova         [r3+64*27], m18
    mova         [r3+64*28], m17
    mova         [r3+64*29], m16
    mova         [r3+64*30], m15
    mova         [r3+64*31], m14
    ret
7506
7507%endif ; ARCH_X86_64
7508