1; Copyright © 2018-2021, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 16
32
33; Note: The order of (at least some of) those constants matters!
34
35const deint_shuf, db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
36
37%macro COEF_PAIR 2
38pw_%1_%2:  dw  %1, %2
39pw_m%2_%1: dw -%2, %1
40%endmacro
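; e.g. "COEF_PAIR 201, 4091" emits pw_201_4091: dw 201, 4091 and
; pw_m4091_201: dw -4091, 201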
41
42; ADST-only
43pw_3803_1321:   dw  3803,  1321
44pw_m1321_2482:  dw -1321,  2482
45pw_2482_3344:   dw  2482,  3344
46pw_m3344_3344:  dw -3344,  3344
47pw_m3803_3344:  dw -3803,  3344
48pw_m3803_m6688: dw -3803, -6688
49pw_2896_m2896:  dw  2896, -2896
50
51const pw_5,       times 2 dw 5
52const pw_2048,    times 2 dw 2048
53const pw_4096,    times 2 dw 4096
54const pw_8192,    times 2 dw 8192
55const pw_16384,   times 2 dw 16384
56const pw_1697x16, times 2 dw 1697*16
57const pw_1697x8,  times 2 dw 1697*8
58const pw_2896x8,  times 2 dw 2896*8
59const pd_2048,    dd 2048
60
61const pw_2896_2896,  dw  2896, 2896
62const pw_m2896_2896, dw -2896, 2896
63const pw_1567_3784,  dw  1567, 3784
64const pw_m3784_1567, dw -3784, 1567
65COEF_PAIR 3784, 1567
66COEF_PAIR  201, 4091
67COEF_PAIR  995, 3973
68COEF_PAIR 1751, 3703
69COEF_PAIR 2440, 3290
70COEF_PAIR 3035, 2751
71COEF_PAIR 3513, 2106
72COEF_PAIR 3857, 1380
73COEF_PAIR 4052,  601
74COEF_PAIR  401, 4076
75COEF_PAIR 1931, 3612
76COEF_PAIR 3166, 2598
77COEF_PAIR 3920, 1189
78COEF_PAIR  799, 4017
79COEF_PAIR 3406, 2276
80pw_m799_m4017:  dw  -799, -4017
81const pw_m1567_m3784, dw -1567, -3784
82pw_m3406_m2276: dw -3406, -2276
83pw_m401_m4076:  dw  -401, -4076
84pw_m3166_m2598: dw -3166, -2598
85pw_m1931_m3612: dw -1931, -3612
86pw_m3920_m1189: dw -3920, -1189
87COEF_PAIR 2276, 3406
88COEF_PAIR 4017,  799
89
90%macro COEF_X8 1-*
91%rep %0
92    dw %1*8, %1*8
93    %rotate 1
94%endrep
95%endmacro
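; e.g. "COEF_X8 3703" emits dw 3703*8, 3703*8; each argument yields one such
; broadcastable word pair.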
96
97pw_3703x8:  COEF_X8  3703
98pw_1751x8:  COEF_X8  1751
99pw_m1380x8: COEF_X8 -1380
100pw_3857x8:  COEF_X8  3857
101pw_3973x8:  COEF_X8  3973
102pw_995x8:   COEF_X8   995
103pw_m2106x8: COEF_X8 -2106
104pw_3513x8:  COEF_X8  3513
105pw_3290x8:  COEF_X8  3290
106pw_2440x8:  COEF_X8  2440
107pw_m601x8:  COEF_X8  -601
108pw_4052x8:  COEF_X8  4052
109
110const idct64_mul
111COEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
112COEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
113COEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
114COEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301
115
116pw_201_4091x8:   dw   201*8, 4091*8
117pw_m601_4052x8:  dw  -601*8, 4052*8
118pw_995_3973x8:   dw   995*8, 3973*8
119pw_m1380_3857x8: dw -1380*8, 3857*8
120pw_1751_3703x8:  dw  1751*8, 3703*8
121pw_m2106_3513x8: dw -2106*8, 3513*8
122pw_2440_3290x8:  dw  2440*8, 3290*8
123pw_m2751_3035x8: dw -2751*8, 3035*8
124
125%define o_idct64_offset idct64_mul - (o_base) - 8
126
127SECTION .text
128
129; Code size reduction trickery: Instead of using rip-relative loads with
130; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
131; single rip-relative lea and then address things relative to it with
132; 1-byte offsets, as long as the data is within +-128 bytes of the base pointer.
133%define o_base deint_shuf + 128
134%define o(x) (r6 - (o_base) + (x))
135%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
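; Example: after "lea r6, [o_base]", a load such as
;   vpbroadcastd m0, [o(pw_2048)]
; becomes [r6 + (pw_2048 - deint_shuf - 128)], which assembles with a 1-byte
; displacement as long as pw_2048 lies within +-128 bytes of deint_shuf+128.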
136
137; flags: 1 = swap, 2 = interleave, 4 = coef_regs
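; Roughly, with flags=0: each dword of dst/src holds a word pair (lo, hi), and
; the packed result contains (lo*coef1 + hi*coef2 + rnd) >> 12 and
; (hi*coef1 - lo*coef2 + rnd) >> 12. Flag 1 swaps the two results, flag 2
; interleaves them word-wise within each dword instead of packing them, and
; flag 4 reads the coefficients from the registers given as coef[1-2].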
138%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
139%if %7 & 4
140    pmaddwd             m%2, m%5, m%1
141    pmaddwd             m%1, m%6
142%else
143%if %7 & 1
144    vpbroadcastd        m%2, [o(pw_%5_%6)]
145    vpbroadcastd        m%3, [o(pw_m%6_%5)]
146%else
147    vpbroadcastd        m%2, [o(pw_m%6_%5)]
148    vpbroadcastd        m%3, [o(pw_%5_%6)]
149%endif
150    pmaddwd             m%2, m%1
151    pmaddwd             m%1, m%3
152%endif
153    paddd               m%2, m%4
154    paddd               m%1, m%4
155%if %7 & 2
156    pslld               m%2, 4
157    psrld               m%1, 12
158    pblendw             m%1, m%2, 0xaa
159%else
160    psrad               m%2, 12
161    psrad               m%1, 12
162    packssdw            m%1, m%2
163%endif
164%endmacro
165
166; flags: 1 = swap, 2 = interleave, 4 = coef_regs
167%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
168%if %10 & 1
169    vpbroadcastd        m%3, [o(pw_%8_%9)]
170    vpbroadcastd        m%4, [o(pw_m%9_%8)]
171    vpbroadcastd       xm%2, [o(pw_%6_%7)]
172    vpblendd            m%2, m%3, 0xf0
173    vpbroadcastd       xm%3, [o(pw_m%7_%6)]
174%else
175    vpbroadcastd        m%3, [o(pw_m%9_%8)]
176    vpbroadcastd        m%4, [o(pw_%8_%9)]
177    vpbroadcastd       xm%2, [o(pw_m%7_%6)]
178    vpblendd            m%2, m%3, 0xf0
179    vpbroadcastd       xm%3, [o(pw_%6_%7)]
180%endif
181    vpblendd            m%3, m%4, 0xf0
182    ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
183%endmacro
184
185; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
186; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
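; The coefficients are rotation constants in Q12 fixed point, e.g.
; 2896 ~= 4096/sqrt(2), 3784 ~= 4096*cos(pi/8), 1567 ~= 4096*sin(pi/8).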
187%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
188    punpckhwd           m%3, m%2, m%1
189    punpcklwd           m%2, m%1
190%if %7 < 32
191    pmaddwd             m%1, m%7, m%2
192    pmaddwd             m%4, m%7, m%3
193%else
194    vpbroadcastd        m%1, [o(pw_m%7_%6)]
195    pmaddwd             m%4, m%3, m%1
196    pmaddwd             m%1, m%2
197%endif
198    paddd               m%4, m%5
199    paddd               m%1, m%5
200    psrad               m%4, 12
201    psrad               m%1, 12
202    packssdw            m%1, m%4
203%if %7 < 32
204    pmaddwd             m%3, m%6
205    pmaddwd             m%2, m%6
206%else
207    vpbroadcastd        m%4, [o(pw_%6_%7)]
208    pmaddwd             m%3, m%4
209    pmaddwd             m%2, m%4
210%endif
211    paddd               m%3, m%5
212    paddd               m%2, m%5
213    psrad               m%3, 12
214    psrad               m%2, 12
215%if %0 == 8
216    packssdw            m%8, m%2, m%3
217%else
218    packssdw            m%2, m%3
219%endif
220%endmacro
221
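; In-place 4-point inverse DCT over src[1-4]; tmp[1-2] are clobbered.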
222%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
223    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
224    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
225    psubsw              m%3, m%1, m%2
226    paddsw              m%2, m%1
227    paddsw              m%1, m%4, m%5
228    psubsw              m%4, m%5
229%endmacro
230
231%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
232    ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
233    ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
234    ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
235    paddsw              m%9, m%2, m%6  ; t4
236    psubsw              m%2, m%6       ; t5a
237    paddsw             m%10, m%8, m%4  ; t7
238    psubsw              m%8, m%4       ; t6a
239    ITX_MULSUB_2W        %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
240    ITX_MULSUB_2W        %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
241    psubsw              m%6, m%1, m%3  ; dct4 out2
242    paddsw              m%3, m%1       ; dct4 out1
243    paddsw              m%1, m%5, m%7  ; dct4 out0
244    psubsw              m%5, m%7       ; dct4 out3
245    psubsw              m%7, m%3, m%2  ; out6
246    paddsw              m%2, m%3       ; out1
247    paddsw              m%3, m%6, m%8  ; out2
248    psubsw              m%6, m%8       ; out5
249    psubsw              m%8, m%1, m%10 ; out7
250    paddsw              m%1, m%10      ; out0
251    paddsw              m%4, m%5, m%9  ; out3
252    psubsw              m%5, m%9       ; out4
253%endmacro
254
255; in1 = %1, in3  = %2, in5  = %3, in7  = %4
256; in9 = %5, in11 = %6, in13 = %7, in15 = %8
257%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
258    ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
259    ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
260    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
261    ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
262    psubsw              m%9, m%2, m%6 ; t13
263    paddsw              m%6, m%2      ; t12
264    psubsw              m%2, m%8, m%4 ; t14
265    paddsw              m%8, m%4      ; t15
266    psubsw              m%4, m%7, m%3 ; t10
267    paddsw              m%3, m%7      ; t11
268    psubsw              m%7, m%1, m%5 ; t9
269    paddsw              m%1, m%5      ; t8
270    ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
271    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
272    psubsw              m%5, m%1, m%3 ; t11a
273    paddsw              m%1, m%3      ; t8a
274    psubsw              m%3, m%7, m%4 ; t13
275    paddsw              m%7, m%4      ; t14
276    psubsw              m%4, m%8, m%6 ; t12a
277    paddsw              m%8, m%6      ; t15a
278    psubsw              m%6, m%2, m%9 ; t10
279    paddsw              m%2, m%9      ; t9
280    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
281    ITX_MULSUB_2W        %4, %5, %9, %10, %11, 2896, 2896 ; t11,  t12
282%endmacro
283
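; Run the wrapped instruction/macro with XMM (128-bit) registers, then switch
; back to YMM.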
284%macro WRAP_XMM 1+
285    INIT_XMM cpuname
286    %1
287    INIT_YMM cpuname
288%endmacro
289
290%macro ITX4_END 4-5 2048 ; row[1-4], rnd
291%if %5
292    vpbroadcastd         m2, [o(pw_%5)]
293    pmulhrsw             m0, m2
294    pmulhrsw             m1, m2
295%endif
296    lea                  r2, [dstq+strideq*2]
297%assign %%i 1
298%rep 4
299    %if %1 & 2
300        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
301    %else
302        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
303    %endif
304    %assign %%i %%i + 1
305    %rotate 1
306%endrep
307    movd                 m2, [%%row_adr1]
308    pinsrd               m2, [%%row_adr2], 1
309    movd                 m3, [%%row_adr3]
310    pinsrd               m3, [%%row_adr4], 1
311    pmovzxbw             m2, m2
312    pmovzxbw             m3, m3
313    paddw                m0, m2
314    paddw                m1, m3
315    packuswb             m0, m1
316    movd       [%%row_adr1], m0
317    pextrd     [%%row_adr2], m0, 1
318    pextrd     [%%row_adr3], m0, 2
319    pextrd     [%%row_adr4], m0, 3
320    ret
321%endmacro
322
323%macro IWHT4_1D_PACKED 0
324    punpckhqdq           m3, m0, m1 ; in1 in3
325    punpcklqdq           m0, m1     ; in0 in2
326    psubw                m2, m0, m3
327    paddw                m0, m3
328    punpckhqdq           m2, m2     ; t2 t2
329    punpcklqdq           m0, m0     ; t0 t0
330    psubw                m1, m0, m2
331    psraw                m1, 1
332    psubw                m1, m3     ; t1 t3
333    psubw                m0, m1     ; ____ out0
334    paddw                m2, m1     ; out3 ____
335%endmacro
336
337INIT_XMM avx2
338cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c
339    mova                 m0, [cq+16*0]
340    mova                 m1, [cq+16*1]
341    pxor                 m2, m2
342    mova          [cq+16*0], m2
343    mova          [cq+16*1], m2
344    psraw                m0, 2
345    psraw                m1, 2
346    IWHT4_1D_PACKED
347    punpckhwd            m0, m1
348    punpcklwd            m3, m1, m2
349    punpckhdq            m1, m0, m3
350    punpckldq            m0, m3
351    IWHT4_1D_PACKED
352    vpblendd             m0, m2, 0x03
353    ITX4_END              3, 0, 2, 1, 0
354
355%macro INV_TXFM_FN 3 ; type1, type2, size
356cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2
357    %define %%p1 m(i%1_%3_internal_8bpc)
358    lea                  r6, [o_base]
359    ; Jump to the 1st txfm function if we're not taking the fast path, which
360    ; in turn performs an indirect jump to the 2nd txfm function.
361    lea                tx2q, [m(i%2_%3_internal_8bpc).pass2]
362%ifidn %1_%2, dct_dct
363    test               eobd, eobd
364    jnz %%p1
365%else
366    ; jump to the 1st txfm function unless it's located directly after this
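    ; ((%%end - %%p1) >> 31) & 1 is 1 exactly when %%p1 lies beyond %%end, so
    ; the jmp is only emitted when falling through would not reach it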
367    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
368ALIGN function_align
369%%end:
370%endif
371%endmacro
372
373%macro INV_TXFM_4X4_FN 2 ; type1, type2
374    INV_TXFM_FN          %1, %2, 4x4
375%ifidn %1_%2, dct_dct
376    vpbroadcastw         m0, [cq]
377    vpbroadcastd         m1, [o(pw_2896x8)]
378    pmulhrsw             m0, m1
379    mov                [cq], eobd ; 0
380    pmulhrsw             m0, m1
381    mova                 m1, m0
382    jmp m(iadst_4x4_internal_8bpc).end2
383%endif
384%endmacro
385
386%macro IDCT4_1D_PACKED 0
387    vpbroadcastd         m4, [o(pd_2048)]
388    punpckhwd            m2, m1, m0
389    punpcklwd            m1, m0
390    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
391    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
392    paddsw               m0, m1, m2 ; out0 out1
393    psubsw               m1, m2     ; out3 out2
394%endmacro
395
396%macro IADST4_1D_PACKED 0
397    punpcklwd            m2, m1, m0
398    punpckhwd            m3, m1, m0
399    vpbroadcastd         m5, [o(pw_m3344_3344)]
400    vpbroadcastd         m0, [o(pw_3803_1321)]
401    vpbroadcastd         m4, [o(pw_m1321_2482)]
402    pmaddwd              m1, m5, m2 ; 3344*in3 - 3344*in2
403    psrld                m5, 16
404    pmaddwd              m0, m2
405    pmaddwd              m2, m4
406    pmaddwd              m5, m3 ; 3344*in0
407    paddd                m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
408    vpbroadcastd         m4, [o(pw_2482_3344)]
409    vpbroadcastd         m5, [o(pw_m3803_3344)]
410    pmaddwd              m4, m3
411    pmaddwd              m5, m3
412    paddd                m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
413    vpbroadcastd         m0, [o(pw_m3803_m6688)]
414    pmaddwd              m3, m0
415    vpbroadcastd         m0, [o(pd_2048)]
416    paddd                m2, m0
417    paddd                m1, m0
418    paddd                m0, m4
419    paddd                m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
420    paddd                m2, m4
421    paddd                m2, m3
422    REPX      {psrad x, 12}, m1, m2, m0, m5
423    packssdw             m0, m5 ; out0 out1
424    packssdw             m1, m2 ; out2 out3
425%endmacro
426
427INV_TXFM_4X4_FN dct, dct
428INV_TXFM_4X4_FN dct, adst
429INV_TXFM_4X4_FN dct, flipadst
430INV_TXFM_4X4_FN dct, identity
431
432cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
433    mova                 m0, [cq+16*0]
434    mova                 m1, [cq+16*1]
435    IDCT4_1D_PACKED
436    mova                 m2, [o(deint_shuf)]
437    shufps               m3, m0, m1, q1331
438    shufps               m0, m1, q0220
439    pshufb               m0, m2
440    pshufb               m1, m3, m2
441    jmp                tx2q
442.pass2:
443    IDCT4_1D_PACKED
444    pxor                 m2, m2
445    mova          [cq+16*0], m2
446    mova          [cq+16*1], m2
447    ITX4_END              0, 1, 3, 2
448
449INV_TXFM_4X4_FN adst, dct
450INV_TXFM_4X4_FN adst, adst
451INV_TXFM_4X4_FN adst, flipadst
452INV_TXFM_4X4_FN adst, identity
453
454cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
455    mova                 m0, [cq+16*0]
456    mova                 m1, [cq+16*1]
457    call .main
458    punpckhwd            m3, m0, m1
459    punpcklwd            m0, m1
460    punpckhwd            m1, m0, m3
461    punpcklwd            m0, m3
462    jmp                tx2q
463.pass2:
464    call .main
465.end:
466    pxor                 m2, m2
467    mova          [cq+16*0], m2
468    mova          [cq+16*1], m2
469.end2:
470    ITX4_END              0, 1, 2, 3
471ALIGN function_align
472cglobal_label .main
473    IADST4_1D_PACKED
474    ret
475
476INV_TXFM_4X4_FN flipadst, dct
477INV_TXFM_4X4_FN flipadst, adst
478INV_TXFM_4X4_FN flipadst, flipadst
479INV_TXFM_4X4_FN flipadst, identity
480
481cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
482    mova                 m0, [cq+16*0]
483    mova                 m1, [cq+16*1]
484    call m(iadst_4x4_internal_8bpc).main
485    punpcklwd            m2, m1, m0
486    punpckhwd            m1, m0
487    punpcklwd            m0, m1, m2
488    punpckhwd            m1, m2
489    jmp                tx2q
490.pass2:
491    call m(iadst_4x4_internal_8bpc).main
492.end:
493    pxor                 m2, m2
494    mova          [cq+16*0], m2
495    mova          [cq+16*1], m2
496.end2:
497    ITX4_END              3, 2, 1, 0
498
499INV_TXFM_4X4_FN identity, dct
500INV_TXFM_4X4_FN identity, adst
501INV_TXFM_4X4_FN identity, flipadst
502INV_TXFM_4X4_FN identity, identity
503
504cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
505    mova                 m0, [cq+16*0]
506    mova                 m1, [cq+16*1]
507    vpbroadcastd         m3, [o(pw_1697x8)]
508    pmulhrsw             m2, m3, m0
509    pmulhrsw             m3, m1
510    paddsw               m0, m2
511    paddsw               m1, m3
512    punpckhwd            m2, m0, m1
513    punpcklwd            m0, m1
514    punpckhwd            m1, m0, m2
515    punpcklwd            m0, m2
516    jmp                tx2q
517.pass2:
518    vpbroadcastd         m3, [o(pw_1697x8)]
519    pmulhrsw             m2, m3, m0
520    pmulhrsw             m3, m1
521    paddsw               m0, m2
522    paddsw               m1, m3
523    jmp m(iadst_4x4_internal_8bpc).end
524
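; Add coefs[1-2] (packed words) to eight 4-pixel rows at dstq/r2 and write the
; clipped results back as bytes.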
525%macro WRITE_4X8 2 ; coefs[1-2]
526    movd                xm4, [dstq+strideq*0]
527    pinsrd              xm4, [dstq+strideq*1], 1
528    movd                xm5, [dstq+strideq*2]
529    pinsrd              xm5, [dstq+r3       ], 1
530    pinsrd              xm4, [r2  +strideq*0], 2
531    pinsrd              xm4, [r2  +strideq*1], 3
532    pinsrd              xm5, [r2  +strideq*2], 2
533    pinsrd              xm5, [r2  +r3       ], 3
534    pmovzxbw             m4, xm4
535    pmovzxbw             m5, xm5
536    paddw                m4, m%1
537    paddw                m5, m%2
538    packuswb             m4, m5
539    vextracti128        xm5, m4, 1
540    movd   [dstq+strideq*0], xm4
541    pextrd [dstq+strideq*1], xm4, 1
542    pextrd [dstq+strideq*2], xm4, 2
543    pextrd [dstq+r3       ], xm4, 3
544    movd   [r2  +strideq*0], xm5
545    pextrd [r2  +strideq*1], xm5, 1
546    pextrd [r2  +strideq*2], xm5, 2
547    pextrd [r2  +r3       ], xm5, 3
548%endmacro
549
550%macro INV_TXFM_4X8_FN 2 ; type1, type2
551    INV_TXFM_FN          %1, %2, 4x8
552%ifidn %1_%2, dct_dct
553    movd                xm1, [o(pw_2896x8)]
554    pmulhrsw            xm0, xm1, [cq]
555    movd                xm2, [o(pw_2048)]
556    mov                [cq], eobd
557    pmulhrsw            xm0, xm1
558    pmulhrsw            xm0, xm1
559    pmulhrsw            xm0, xm2
560    vpbroadcastw         m0, xm0
561    mova                 m1, m0
562    jmp m(iadst_4x8_internal_8bpc).end3
563%endif
564%endmacro
565
566%macro IDCT8_1D_PACKED 0
567    vpbroadcastd         m6, [o(pd_2048)]
568    punpckhwd            m5, m3, m0 ; in7 in1
569    punpckhwd            m4, m1, m2 ; in3 in5
570    punpcklwd            m3, m1     ; in6 in2
571    punpcklwd            m2, m0     ; in4 in0
572    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
573    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
574    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
575    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
576    paddsw               m4, m5     ; t4  t7  (interleaved)
577    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
578    vpbroadcastd         m1, [o(pw_m2896_2896)]
579    ITX_MUL2X_PACK        0, 1, _, 6, 1, 5, 4 ; t6 t5
580%if mmsize > 16
581    vbroadcasti128       m1, [o(deint_shuf)]
582    pshufb               m4, m1
583%else
584    pshufb               m4, [o(deint_shuf)]
585%endif
586    psubsw               m1, m2, m3 ; tmp3 tmp2
587    paddsw               m3, m2     ; tmp0 tmp1
588    shufps               m2, m4, m0, q1032 ; t7 t6
589    vpblendd             m4, m0, 0xcc      ; t4 t5
590    paddsw               m0, m3, m2 ; out0 out1
591    psubsw               m3, m2     ; out7 out6
592    psubsw               m2, m1, m4 ; out4 out5
593    paddsw               m1, m4     ; out3 out2
594%endmacro
595
596%macro IADST8_1D_PACKED 1 ; pass
597    vpbroadcastd         m6, [o(pd_2048)]
598    punpckhwd            m0, m4, m3 ; 0 7
599    punpckhwd            m1, m5, m2 ; 2 5
600    punpcklwd            m2, m5     ; 4 3
601    punpcklwd            m3, m4     ; 6 1
602%if %1 == 1
603    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
604    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
605    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
606    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
607    psubsw               m4, m0, m2 ; t5 t4
608    paddsw               m0, m2     ; t1 t0
609    psubsw               m5, m1, m3 ; t6 t7
610    paddsw               m1, m3     ; t2 t3
611    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
612    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
613%if mmsize > 16
614    vbroadcasti128       m2, [o(deint_shuf)]
615%else
616    mova                 m2, [o(deint_shuf)]
617%endif
618    pshuflw              m1, m1, q2301
619    pshufhw              m1, m1, q2301
620    psubsw               m3, m0, m1        ; t3 t2
621    paddsw               m0, m1            ; -out7  out0
622    psubsw               m1, m4, m5        ; t7 t6
623    paddsw               m4, m5            ;  out6 -out1
624    pshufb               m0, m2
625    pshufb               m4, m2
626    vpbroadcastd         m5, [o(pw_m2896_2896)]
627    pmaddwd              m2, m5, m3
628    pmaddwd              m5, m1
629    paddd                m2, m6
630    paddd                m5, m6
631    psrad                m2, 12
632    psrad                m5, 12
633    packssdw             m2, m5            ; out4 -out5
634    vpbroadcastd         m5, [o(pw_2896_2896)]
635    pmaddwd              m3, m5
636    pmaddwd              m1, m5
637    paddd                m3, m6
638    paddd                m1, m6
639    psrad                m3, 12
640    psrad                m1, 12
641    packssdw             m1, m3            ; out2 -out3
642    punpcklqdq           m3, m4, m0        ; out6 -out7
643    punpckhqdq           m0, m4            ; out0 -out1
644%else
645    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
646    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
647    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
648    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
649    psubsw               m4, m0, m2 ; t4 t5
650    paddsw               m0, m2     ; t0 t1
651    psubsw               m5, m1, m3 ; t6 t7
652    paddsw               m1, m3     ; t2 t3
653    shufps               m2, m5, m4, q1032
654    punpckhwd            m4, m2
655    punpcklwd            m5, m2
656    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
657    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
658    psubsw               m2, m0, m1        ; t2 t3
659    paddsw               m0, m1            ; out0 -out7
660    psubsw               m1, m4, m5        ; t7 t6
661    paddsw               m4, m5            ; out6 -out1
662    vpbroadcastd         m5, [o(pw_2896x8)]
663    vpblendd             m3, m0, m4, 0x33  ; out6 -out7
664    vpblendd             m0, m4, 0xcc      ; out0 -out1
665    shufps               m4, m2, m1, q1032 ; t3 t7
666    vpblendd             m1, m2, 0x33      ; t2 t6
667    psubsw               m2, m1, m4        ; t2-t3 t6-t7
668    paddsw               m1, m4            ; t2+t3 t6+t7
669    pmulhrsw             m2, m5            ; out4 -out5
670    pshufd               m1, m1, q1032
671    pmulhrsw             m1, m5            ; out2 -out3
672%endif
673%endmacro
674
675INIT_YMM avx2
676INV_TXFM_4X8_FN dct, dct
677INV_TXFM_4X8_FN dct, adst
678INV_TXFM_4X8_FN dct, flipadst
679INV_TXFM_4X8_FN dct, identity
680
681cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
682    vpermq               m0, [cq+32*0], q3120
683    vpermq               m1, [cq+32*1], q3120
684    vpbroadcastd         m2, [o(pw_2896x8)]
685    pmulhrsw             m0, m2
686    pmulhrsw             m1, m2
687    IDCT4_1D_PACKED
688    vbroadcasti128       m2, [o(deint_shuf)]
689    shufps               m3, m0, m1, q1331
690    shufps               m0, m1, q0220
691    pshufb               m0, m2
692    pshufb               m1, m3, m2
693    jmp                tx2q
694.pass2:
695    vextracti128        xm2, m0, 1
696    vextracti128        xm3, m1, 1
697    call .main
698    vpbroadcastd         m4, [o(pw_2048)]
699    vinserti128          m0, xm2, 1
700    vinserti128          m1, xm3, 1
701    pshufd               m1, m1, q1032
702    jmp m(iadst_4x8_internal_8bpc).end2
703ALIGN function_align
704cglobal_label .main
705    WRAP_XMM IDCT8_1D_PACKED
706    ret
707
708INV_TXFM_4X8_FN adst, dct
709INV_TXFM_4X8_FN adst, adst
710INV_TXFM_4X8_FN adst, flipadst
711INV_TXFM_4X8_FN adst, identity
712
713cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
714    vpermq               m0, [cq+32*0], q3120
715    vpermq               m1, [cq+32*1], q3120
716    vpbroadcastd         m2, [o(pw_2896x8)]
717    pmulhrsw             m0, m2
718    pmulhrsw             m1, m2
719    call m(iadst_8x4_internal_8bpc).main
720    punpckhwd            m3, m0, m1
721    punpcklwd            m0, m1
722    punpckhwd            m1, m0, m3
723    punpcklwd            m0, m3
724    jmp                tx2q
725.pass2:
726    vextracti128        xm2, m0, 1
727    vextracti128        xm3, m1, 1
728    pshufd              xm4, xm0, q1032
729    pshufd              xm5, xm1, q1032
730    call .main_pass2
731    vpbroadcastd         m4, [o(pw_2048)]
732    vinserti128          m0, xm2, 1
733    vinserti128          m1, xm3, 1
734    pxor                 m5, m5
735    psubw                m5, m4
736.end:
737    vpblendd             m4, m5, 0xcc
738.end2:
739    pmulhrsw             m0, m4
740    pmulhrsw             m1, m4
741    WIN64_RESTORE_XMM
742    pxor                 m2, m2
743    mova          [cq+32*0], m2
744    mova          [cq+32*1], m2
745.end3:
746    lea                  r2, [dstq+strideq*4]
747    lea                  r3, [strideq*3]
748    WRITE_4X8             0, 1
749    RET
750ALIGN function_align
751.main_pass1:
752    WRAP_XMM IADST8_1D_PACKED 1
753    ret
754ALIGN function_align
755cglobal_label .main_pass2
756    WRAP_XMM IADST8_1D_PACKED 2
757    ret
758
759INV_TXFM_4X8_FN flipadst, dct
760INV_TXFM_4X8_FN flipadst, adst
761INV_TXFM_4X8_FN flipadst, flipadst
762INV_TXFM_4X8_FN flipadst, identity
763
764cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
765    vpermq               m0, [cq+32*0], q3120
766    vpermq               m1, [cq+32*1], q3120
767    vpbroadcastd         m2, [o(pw_2896x8)]
768    pmulhrsw             m0, m2
769    pmulhrsw             m1, m2
770    call m(iadst_8x4_internal_8bpc).main
771    punpcklwd            m3, m1, m0
772    punpckhwd            m1, m0
773    punpcklwd            m0, m1, m3
774    punpckhwd            m1, m3
775    jmp                tx2q
776.pass2:
777    vextracti128        xm2, m0, 1
778    vextracti128        xm3, m1, 1
779    pshufd              xm4, xm0, q1032
780    pshufd              xm5, xm1, q1032
781    call m(iadst_4x8_internal_8bpc).main_pass2
782    vpbroadcastd         m5, [o(pw_2048)]
783    vinserti128          m3, xm1, 1
784    vinserti128          m2, xm0, 1
785    pxor                 m4, m4
786    psubw                m4, m5
787    pshufd               m0, m3, q1032
788    pshufd               m1, m2, q1032
789    jmp m(iadst_4x8_internal_8bpc).end
790
791INV_TXFM_4X8_FN identity, dct
792INV_TXFM_4X8_FN identity, adst
793INV_TXFM_4X8_FN identity, flipadst
794INV_TXFM_4X8_FN identity, identity
795
796cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
797    vpermq               m2, [cq+32*0], q3120
798    vpermq               m0, [cq+32*1], q3120
799    vpbroadcastd         m3, [o(pw_2896x8)]
800    vpbroadcastd         m4, [o(pw_1697x8)]
801    punpcklwd            m1, m2, m0
802    punpckhwd            m2, m0
803    pmulhrsw             m1, m3
804    pmulhrsw             m2, m3
805    punpcklwd            m0, m1, m2
806    punpckhwd            m1, m2
807    pmulhrsw             m2, m4, m0
808    pmulhrsw             m4, m1
809    paddsw               m0, m2
810    paddsw               m1, m4
811    jmp                tx2q
812.pass2:
813    vpbroadcastd         m4, [o(pw_4096)]
814    jmp m(iadst_4x8_internal_8bpc).end2
815
816%macro INV_TXFM_4X16_FN 2 ; type1, type2
817    INV_TXFM_FN          %1, %2, 4x16
818%ifidn %1_%2, dct_dct
819    movd                xm1, [o(pw_2896x8)]
820    pmulhrsw            xm0, xm1, [cq]
821    movd                xm2, [o(pw_16384)]
822    movd                xm3, [o(pw_2048)]
823    mov                [cq], eobd
824    pmulhrsw            xm0, xm2
825    pmulhrsw            xm0, xm1
826    pmulhrsw            xm0, xm3
827    vpbroadcastw         m0, xm0
828    mova                 m1, m0
829    mova                 m2, m0
830    mova                 m3, m0
831    jmp m(iadst_4x16_internal_8bpc).end3
832%endif
833%endmacro
834
835%macro IDCT16_1D_PACKED 0
836    vpbroadcastd        m10, [o(pd_2048)]
837.main2:
838    punpckhwd            m8, m7, m0 ; dct16 in15 in1
839    punpcklwd            m9, m4, m0 ; dct4  in2  in0
840    punpckhwd            m0, m3, m4 ; dct16 in7  in9
841    punpcklwd            m7, m1     ; dct8  in7  in1
842    punpckhwd            m1, m6     ; dct16 in3  in13
843    punpcklwd            m3, m5     ; dct8  in3  in5
844    punpckhwd            m5, m2     ; dct16 in11 in5
845    punpcklwd            m6, m2     ; dct4  in3  in1
846    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
847    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
848    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
849    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
850    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 3 ; t4a  t7a
851    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 3 ; t5a  t6a
852    ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
853    psubsw               m2, m8, m0 ; t9  t14
854    paddsw               m8, m0     ; t8  t15
855    psubsw               m0, m1, m5 ; t10 t13
856    paddsw               m1, m5     ; t11 t12
857    vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
858    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 6   ; t9a  t14a
859    vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
860    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 6   ; t10a t13a
861    psubsw               m4, m8, m1 ; t11a t12a
862    paddsw               m8, m1     ; t8a  t15a
863    psubsw               m1, m7, m3 ; t5a  t6a
864    paddsw               m7, m3     ; t4   t7
865    paddsw               m3, m2, m0 ; t9   t14
866    psubsw               m2, m0     ; t10  t13
867%if mmsize > 16
868    vbroadcasti128       m0, [o(deint_shuf)]
869%else
870    mova                 m0, [o(deint_shuf)]
871%endif
872    pshufb               m8, m0
873    pshufb               m7, m0
874    pshufb               m3, m0
875    ITX_MUL2X_PACK        9, 0, 5, 10, 2896, 2896 ; t0   t1
876    vpbroadcastd         m0, [o(pw_m2896_2896)]
877    ITX_MUL2X_PACK        4, 5, _, 10, 5, 0, 4    ; t11  t12
878    vpbroadcastd         m5, [o(pw_2896_2896)]
879    ITX_MUL2X_PACK        1, 0, _, 10, 0, 5, 4    ; t6   t5
880    vpbroadcastd         m0, [o(pw_m2896_2896)]
881    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4    ; t13a t10a
882    punpckhqdq           m0, m8, m3        ; t15a t14
883    punpcklqdq           m8, m3            ; t8a  t9
884    shufps               m5, m4, m2, q1032 ; t12  t13a
885    vpblendd             m4, m2, 0xcc      ; t11  t10a
886    shufps               m2, m7, m1, q1032 ; t7 t6
887    vpblendd             m7, m1, 0xcc      ; t4 t5
888    psubsw               m1, m9, m6 ; dct4 out3 out2
889    paddsw               m9, m6     ; dct4 out0 out1
890    psubsw               m3, m9, m2 ; dct8 out7 out6
891    paddsw               m9, m2     ; dct8 out0 out1
892    psubsw               m2, m1, m7 ; dct8 out4 out5
893    paddsw               m1, m7     ; dct8 out3 out2
894    psubsw               m7, m9, m0 ; out15 out14
895    paddsw               m0, m9     ; out0  out1
896    psubsw               m6, m1, m5 ; out12 out13
897    paddsw               m1, m5     ; out3  out2
898    psubsw               m5, m2, m4 ; out11 out10
899    paddsw               m2, m4     ; out4  out5
900    psubsw               m4, m3, m8 ; out8  out9
901    paddsw               m3, m8     ; out7  out6
902%endmacro
903
904INV_TXFM_4X16_FN dct, dct
905INV_TXFM_4X16_FN dct, adst
906INV_TXFM_4X16_FN dct, flipadst
907INV_TXFM_4X16_FN dct, identity
908
909cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
910    mova                 m0, [cq+32*0]
911    mova                 m1, [cq+32*1]
912    mova                 m2, [cq+32*2]
913    mova                 m3, [cq+32*3]
914    call m(idct_16x4_internal_8bpc).main
915    vpbroadcastd         m5, [o(pw_16384)]
916    punpckhwd            m4, m2, m3
917    punpcklwd            m2, m3
918    punpckhwd            m3, m0, m1
919    punpcklwd            m0, m1
920    REPX   {pmulhrsw x, m5}, m0, m4, m2, m3
921    punpckhdq            m1, m0, m2
922    punpckldq            m0, m2
923    punpckldq            m2, m3, m4
924    punpckhdq            m3, m4
925    jmp                tx2q
926.pass2:
927    vextracti128        xm4, m0, 1
928    vextracti128        xm5, m1, 1
929    vextracti128        xm6, m2, 1
930    vextracti128        xm7, m3, 1
931    call .main
932    vinserti128          m0, xm4, 1
933    vinserti128          m1, xm5, 1
934    vpbroadcastd         m5, [o(pw_2048)]
935    vinserti128          m2, xm6, 1
936    vinserti128          m3, xm7, 1
937    pshufd               m1, m1, q1032
938    pshufd               m3, m3, q1032
939    jmp m(iadst_4x16_internal_8bpc).end2
940ALIGN function_align
941cglobal_label .main
942    WRAP_XMM IDCT16_1D_PACKED
943    ret
944
945INV_TXFM_4X16_FN adst, dct
946INV_TXFM_4X16_FN adst, adst
947INV_TXFM_4X16_FN adst, flipadst
948INV_TXFM_4X16_FN adst, identity
949
950cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
951    mova                 m0, [cq+32*0]
952    mova                 m1, [cq+32*1]
953    mova                 m2, [cq+32*2]
954    mova                 m3, [cq+32*3]
955    call m(iadst_16x4_internal_8bpc).main
956    vpbroadcastd         m5, [o(pw_16384)]
957    punpckhwd            m4, m2, m3
958    punpcklwd            m2, m3
959    punpckhwd            m3, m0, m1
960    punpcklwd            m0, m1
961    REPX   {pmulhrsw x, m5}, m4, m2, m3, m0
962    punpckhdq            m1, m0, m2
963    punpckldq            m0, m2
964    punpckldq            m2, m3, m4
965    punpckhdq            m3, m4
966    jmp                tx2q
967.pass2:
968    call .main
969    vpbroadcastd         m5, [o(pw_2896x8)]
970    paddsw               m1, m2, m4
971    psubsw               m2, m4
972    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
973    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
974    vpbroadcastd         m5, [o(pw_2048)]
975    pshufd               m1, m1, q1032
976    vpblendd             m4, m1, m0, 0x33
977    vpblendd             m0, m2, 0x33
978    vpblendd             m2, m3, 0x33
979    vpblendd             m3, m1, 0x33
980    vpermq               m0, m0, q2031
981    vpermq               m1, m2, q1302
982    vpermq               m2, m3, q3120
983    vpermq               m3, m4, q0213
984    psubw                m6, m7, m5
985.end:
986    vpblendd             m5, m6, 0xcc
987.end2:
988    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
989    WIN64_RESTORE_XMM
990    pxor                 m4, m4
991    mova          [cq+32*0], m4
992    mova          [cq+32*1], m4
993    mova          [cq+32*2], m4
994    mova          [cq+32*3], m4
995.end3:
996    lea                  r2, [dstq+strideq*8]
997    lea                  r3, [strideq*3]
998    WRITE_4X8             0, 1
999    lea                dstq, [dstq+strideq*4]
1000    lea                  r2, [r2  +strideq*4]
1001    WRITE_4X8             2, 3
1002    RET
1003ALIGN function_align
1004.main:
1005    vpblendd             m4, m1, m0, 0xcc
1006    vpblendd             m1, m0, 0x33
1007    vpblendd             m5, m2, m3, 0xcc
1008    vpblendd             m2, m3, 0x33
1009    vperm2i128           m3, m5, m2, 0x31
1010    vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
1011    vperm2i128           m4, m1, m4, 0x31
1012    vinserti128          m1, m5, xm2, 1 ; in4  in7  in6  in5
1013    pshufd               m3, m3, q1032  ; in15 in12 in13 in14
1014    pshufd               m2, m4, q1032  ; in11 in8  in9  in10
1015cglobal_label .main2
1016    vpbroadcastd         m8, [o(pd_2048)]
1017    pxor                 m7, m7
1018    punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
1019    punpcklwd            m0, m3     ; in0  in15 in2  in13
1020    punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
1021    punpcklwd            m1, m2     ; in4  in11 in6  in9
1022    ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
1023    ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
1024    ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
1025    ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
1026    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
1027    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
1028    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
1029    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
1030    ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
1031    psubw                m6, m7, m5
1032    ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
1033    vpbroadcastd         m6, [o(pw_m3784_1567)]
1034    vpbroadcastd         m5, [o(pw_1567_3784)]
1035    psubsw               m4, m0, m1 ; t5   t4   t7   t6
1036    paddsw               m0, m1     ; t1   t0   t3   t2
1037    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
1038    paddsw               m2, m3     ; t9a  t8a  t11a t10a
1039    psubw                m3, m7, m6 ; pw_3784_m1567
1040    vpblendd             m6, m3, 0xf0
1041    ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
1042    ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
1043    vbroadcasti128       m5, [o(deint_shuf)]
1044    pshufb               m0, m5
1045    pshufb               m2, m5
1046    vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
1047    vinserti128          m0, xm2, 1        ; t1   t0   t9a  t8a
1048    vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
1049    vinserti128          m4, xm1, 1        ; t4a  t5a  t12  t13
1050    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
1051    psubsw               m1, m0, m3        ; t3a t2a t11 t10
1052    paddsw               m0, m3     ; -out15  out0   out14 -out1
1053    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
1054    psubsw               m4, m2            ; t6 t7 t14a t15a
1055    shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
1056    vpblendd             m4, m1, 0x33      ; t3a t7  t11 t15a
1057    ret
1058ALIGN function_align
1059.main_pass1_end:
1060    vpbroadcastd         m5, [o(pw_m2896_2896)]
1061    vpbroadcastd         m6, [o(pw_2896_2896)]
1062    punpcklwd            m1, m4, m2
1063    punpckhwd            m4, m2
1064    pmaddwd              m2, m5, m4
1065    pmaddwd              m4, m6
1066    pmaddwd              m5, m1
1067    pmaddwd              m1, m6
1068    REPX      {paddd x, m8}, m5, m1, m2, m4
1069    REPX      {psrad x, 12}, m5, m2, m1, m4
1070    packssdw             m2, m5     ; -out11  out8   out10 -out9
1071    packssdw             m1, m4     ; -out7   out4   out6  -out5
1072    ret
1073
1074INV_TXFM_4X16_FN flipadst, dct
1075INV_TXFM_4X16_FN flipadst, adst
1076INV_TXFM_4X16_FN flipadst, flipadst
1077INV_TXFM_4X16_FN flipadst, identity
1078
1079cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1080    mova                 m0, [cq+32*0]
1081    mova                 m1, [cq+32*1]
1082    mova                 m2, [cq+32*2]
1083    mova                 m3, [cq+32*3]
1084    call m(iadst_16x4_internal_8bpc).main
1085    vpbroadcastd         m5, [o(pw_16384)]
1086    punpcklwd            m4, m1, m0
1087    punpckhwd            m1, m0
1088    punpcklwd            m0, m3, m2
1089    punpckhwd            m3, m2
1090    REPX   {pmulhrsw x, m5}, m4, m1, m0, m3
1091    punpckldq            m2, m3, m1
1092    punpckhdq            m3, m1
1093    punpckhdq            m1, m0, m4
1094    punpckldq            m0, m4
1095    jmp                tx2q
1096.pass2:
1097    call m(iadst_4x16_internal_8bpc).main
1098    vpbroadcastd         m5, [o(pw_2896x8)]
1099    paddsw               m1, m2, m4
1100    psubsw               m2, m4
1101    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
1102    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
1103    vpbroadcastd         m6, [o(pw_2048)]
1104    pshufd               m1, m1, q1032
1105    vpblendd             m4, m0, m2, 0x33
1106    vpblendd             m0, m1, 0xcc
1107    vpblendd             m1, m3, 0xcc
1108    vpblendd             m2, m3, 0x33
1109    vpermq               m0, m0, q3120
1110    vpermq               m1, m1, q0213
1111    vpermq               m2, m2, q2031
1112    vpermq               m3, m4, q1302
1113    psubw                m5, m7, m6
1114    jmp m(iadst_4x16_internal_8bpc).end
1115
1116INV_TXFM_4X16_FN identity, dct
1117INV_TXFM_4X16_FN identity, adst
1118INV_TXFM_4X16_FN identity, flipadst
1119INV_TXFM_4X16_FN identity, identity
1120
1121cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1122    mova                 m3, [cq+32*0]
1123    mova                 m2, [cq+32*1]
1124    mova                 m4, [cq+32*2]
1125    mova                 m5, [cq+32*3]
1126    vpbroadcastd         m8, [o(pw_1697x8)]
1127    pcmpeqw              m0, m0 ; -1
1128    punpcklwd            m1, m3, m2
1129    punpckhwd            m3, m2
1130    punpcklwd            m2, m4, m5
1131    punpckhwd            m4, m5
1132    pmulhrsw             m5, m8, m1
1133    pmulhrsw             m6, m8, m2
1134    pmulhrsw             m7, m8, m3
1135    pmulhrsw             m8, m4
1136    pcmpeqw              m9, m0, m1 ; we want to do a signed avg, but pavgw is
1137    pxor                 m1, m9     ; unsigned. as long as both signs are equal
1138    pcmpeqw              m9, m0, m2 ; it still works, but if the input is -1 the
1139    pxor                 m2, m9     ; pmulhrsw result will become 0 which causes
1140    pcmpeqw              m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
1141    pxor                 m3, m9     ; we explicitly deal with that case here.
1142    pcmpeqw              m0, m4
1143    pxor                 m4, m0
1144    pavgw                m1, m5
1145    pavgw                m2, m6
1146    pavgw                m3, m7
1147    pavgw                m4, m8
1148    punpckldq            m0, m1, m2
1149    punpckhdq            m1, m2
1150    punpckldq            m2, m3, m4
1151    punpckhdq            m3, m4
1152    jmp                tx2q
1153.pass2:
1154    vpbroadcastd         m8, [o(pw_1697x16)]
1155    vpbroadcastd         m5, [o(pw_2048)]
1156    pmulhrsw             m4, m8, m0
1157    pmulhrsw             m6, m8, m1
1158    pmulhrsw             m7, m8, m2
1159    pmulhrsw             m8, m3
1160    REPX      {paddsw x, x}, m0, m1, m2, m3
1161    paddsw               m0, m4
1162    paddsw               m1, m6
1163    paddsw               m2, m7
1164    paddsw               m3, m8
1165    jmp m(iadst_4x16_internal_8bpc).end2
1166
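; Add coefs[1-2] (packed words) to four 8-pixel rows at dstq and write the
; clipped results back as bytes.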
1167%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
1168    movq               xm%3, [dstq   ]
1169    movhps             xm%3, [dstq+%5]
1170    movq               xm%4, [dstq+%6]
1171    movhps             xm%4, [dstq+%7]
1172    pmovzxbw            m%3, xm%3
1173    pmovzxbw            m%4, xm%4
1174%ifnum %1
1175    paddw               m%3, m%1
1176%else
1177    paddw               m%3, %1
1178%endif
1179%ifnum %2
1180    paddw               m%4, m%2
1181%else
1182    paddw               m%4, %2
1183%endif
1184    packuswb            m%3, m%4
1185    vextracti128       xm%4, m%3, 1
1186    movq          [dstq   ], xm%3
1187    movhps        [dstq+%6], xm%3
1188    movq          [dstq+%5], xm%4
1189    movhps        [dstq+%7], xm%4
1190%endmacro
1191
1192%macro INV_TXFM_8X4_FN 2 ; type1, type2
1193    INV_TXFM_FN          %1, %2, 8x4
1194%ifidn %1_%2, dct_dct
1195    movd                xm1, [o(pw_2896x8)]
1196    pmulhrsw            xm0, xm1, [cq]
1197    mov                [cq], eobd
1198    pmulhrsw            xm0, xm1
1199    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
1200%endif
1201%endmacro
1202
1203INV_TXFM_8X4_FN dct, dct
1204INV_TXFM_8X4_FN dct, adst
1205INV_TXFM_8X4_FN dct, flipadst
1206INV_TXFM_8X4_FN dct, identity
1207
1208cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1209    vpbroadcastd        xm3, [o(pw_2896x8)]
1210    pmulhrsw            xm0, xm3, [cq+16*0]
1211    pmulhrsw            xm1, xm3, [cq+16*1]
1212    pmulhrsw            xm2, xm3, [cq+16*2]
1213    pmulhrsw            xm3,      [cq+16*3]
1214    call m(idct_4x8_internal_8bpc).main
1215    vbroadcasti128       m4, [o(deint_shuf)]
1216    vinserti128          m3, m1, xm3, 1
1217    vinserti128          m1, m0, xm2, 1
1218    shufps               m0, m1, m3, q0220
1219    shufps               m1, m3, q1331
1220    pshufb               m0, m4
1221    pshufb               m1, m4
1222    jmp                tx2q
1223.pass2:
1224    IDCT4_1D_PACKED
1225    vpermq               m0, m0, q3120
1226    vpermq               m1, m1, q2031
1227    jmp m(iadst_8x4_internal_8bpc).end2
1228
1229INV_TXFM_8X4_FN adst, dct
1230INV_TXFM_8X4_FN adst, adst
1231INV_TXFM_8X4_FN adst, flipadst
1232INV_TXFM_8X4_FN adst, identity
1233
1234cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1235    vpbroadcastd        xm0, [o(pw_2896x8)]
1236    pshufd              xm4,      [cq+16*0], q1032
1237    pmulhrsw            xm3, xm0, [cq+16*3]
1238    pshufd              xm5,      [cq+16*1], q1032
1239    pmulhrsw            xm2, xm0, [cq+16*2]
1240    pmulhrsw            xm4, xm0
1241    pmulhrsw            xm5, xm0
1242    call m(iadst_4x8_internal_8bpc).main_pass1
1243    vinserti128        m0, xm2, 1
1244    vinserti128        m1, xm3, 1
1245    punpckhwd          m2, m0, m1
1246    punpcklwd          m0, m1
1247    pxor               m3, m3
1248    psubsw             m3, m2
1249    punpckhwd          m1, m0, m3
1250    punpcklwd          m0, m3
1251    jmp              tx2q
1252.pass2:
1253    call .main
1254.end:
1255    vpermq               m0, m0, q3120
1256    vpermq               m1, m1, q3120
1257.end2:
1258    vpbroadcastd         m2, [o(pw_2048)]
1259    pmulhrsw             m0, m2
1260    pmulhrsw             m1, m2
1261    WIN64_RESTORE_XMM
1262.end3:
1263    pxor                 m2, m2
1264    mova          [cq+32*0], m2
1265    mova          [cq+32*1], m2
1266    lea                  r3, [strideq*3]
1267    WRITE_8X4             0, 1, 4, 5
1268    RET
1269ALIGN function_align
1270cglobal_label .main
1271    IADST4_1D_PACKED
1272    ret
1273
1274INV_TXFM_8X4_FN flipadst, dct
1275INV_TXFM_8X4_FN flipadst, adst
1276INV_TXFM_8X4_FN flipadst, flipadst
1277INV_TXFM_8X4_FN flipadst, identity
1278
1279cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1280    vpbroadcastd        xm0, [o(pw_2896x8)]
1281    pshufd              xm4,      [cq+16*0], q1032
1282    pmulhrsw            xm3, xm0, [cq+16*3]
1283    pshufd              xm5,      [cq+16*1], q1032
1284    pmulhrsw            xm2, xm0, [cq+16*2]
1285    pmulhrsw            xm4, xm0
1286    pmulhrsw            xm5, xm0
1287    call m(iadst_4x8_internal_8bpc).main_pass1
1288    vinserti128          m3, xm1, 1
1289    vinserti128          m2, xm0, 1
1290    punpckhwd            m1, m3, m2
1291    punpcklwd            m3, m2
1292    pxor                 m0, m0
1293    psubsw               m0, m1
1294    punpckhwd            m1, m0, m3
1295    punpcklwd            m0, m3
1296    jmp                tx2q
1297.pass2:
1298    call m(iadst_8x4_internal_8bpc).main
1299    mova                 m2, m1
1300    vpermq               m1, m0, q2031
1301    vpermq               m0, m2, q2031
1302    jmp m(iadst_8x4_internal_8bpc).end2
1303
1304INV_TXFM_8X4_FN identity, dct
1305INV_TXFM_8X4_FN identity, adst
1306INV_TXFM_8X4_FN identity, flipadst
1307INV_TXFM_8X4_FN identity, identity
1308
1309cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1310    mova                xm2, [cq+16*0]
1311    mova                xm0, [cq+16*1]
1312    vinserti128          m2, [cq+16*2], 1
1313    vinserti128          m0, [cq+16*3], 1
1314    vpbroadcastd         m3, [o(pw_2896x8)]
1315    punpcklwd            m1, m2, m0
1316    punpckhwd            m2, m0
1317    pmulhrsw             m1, m3
1318    pmulhrsw             m2, m3
1319    punpcklwd            m0, m1, m2
1320    punpckhwd            m1, m2
1321    paddsw               m0, m0
1322    paddsw               m1, m1
1323    jmp                tx2q
1324.pass2:
1325    vpbroadcastd         m3, [o(pw_1697x8)]
1326    pmulhrsw             m2, m3, m0
1327    pmulhrsw             m3, m1
1328    paddsw               m0, m2
1329    paddsw               m1, m3
1330    jmp m(iadst_8x4_internal_8bpc).end
1331
1332%macro INV_TXFM_8X8_FN 2 ; type1, type2
1333    INV_TXFM_FN          %1, %2, 8x8
1334%ifidn %1_%2, dct_dct
1335    movd                xm1, [o(pw_2896x8)]
1336    pmulhrsw            xm0, xm1, [cq]
1337    movd                xm2, [o(pw_16384)]
1338    mov                [cq], eobd
1339    or                  r3d, 8
1340.dconly:
1341    pmulhrsw            xm0, xm2
1342.dconly2:
1343    movd                xm2, [pw_2048]
1344    pmulhrsw            xm0, xm1
1345    lea                  r2, [strideq*3]
1346    pmulhrsw            xm0, xm2
1347    vpbroadcastw         m0, xm0
1348.dconly_loop:
1349    WRITE_8X4             0, 0, 1, 2, strideq*1, strideq*2, r2
1350    lea                dstq, [dstq+strideq*4]
1351    sub                 r3d, 4
1352    jg .dconly_loop
1353    RET
1354%endif
1355%endmacro
1356
1357INV_TXFM_8X8_FN dct, dct
1358INV_TXFM_8X8_FN dct, adst
1359INV_TXFM_8X8_FN dct, flipadst
1360INV_TXFM_8X8_FN dct, identity
1361
1362cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1363    vpermq               m0, [cq+32*0], q3120 ; 0 1
1364    vpermq               m3, [cq+32*3], q3120 ; 6 7
1365    vpermq               m2, [cq+32*2], q3120 ; 4 5
1366    vpermq               m1, [cq+32*1], q3120 ; 2 3
1367    call .main
1368    shufps               m4, m0, m1, q0220
1369    shufps               m5, m0, m1, q1331
1370    shufps               m1, m2, m3, q0220
1371    shufps               m3, m2, m3, q1331
1372    vbroadcasti128       m0, [o(deint_shuf)]
1373    vpbroadcastd         m2, [o(pw_16384)]
1374    REPX   {pshufb   x, m0}, m4, m5, m1, m3
1375    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
1376    vinserti128          m0, m4, xm1, 1
1377    vperm2i128           m2, m4, m1, 0x31
1378    vinserti128          m1, m5, xm3, 1
1379    vperm2i128           m3, m5, m3, 0x31
1380    jmp                tx2q
1381.pass2:
1382    call .main
1383    vpbroadcastd         m4, [o(pw_2048)]
1384    vpermq               m0, m0, q3120
1385    vpermq               m1, m1, q2031
1386    vpermq               m2, m2, q3120
1387    vpermq               m3, m3, q2031
1388    jmp m(iadst_8x8_internal_8bpc).end2
1389ALIGN function_align
1390cglobal_label .main
1391    IDCT8_1D_PACKED
1392    ret
1393
1394INV_TXFM_8X8_FN adst, dct
1395INV_TXFM_8X8_FN adst, adst
1396INV_TXFM_8X8_FN adst, flipadst
1397INV_TXFM_8X8_FN adst, identity
1398
1399cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1400    vpermq               m4, [cq+32*0], q1302 ; 1 0
1401    vpermq               m3, [cq+32*3], q3120 ; 6 7
1402    vpermq               m5, [cq+32*1], q1302 ; 3 2
1403    vpermq               m2, [cq+32*2], q3120 ; 4 5
1404    call .main_pass1
1405    vpbroadcastd         m5, [o(pw_16384)]
1406    punpcklwd            m4, m0, m1
1407    punpckhwd            m0, m1
1408    punpcklwd            m1, m2, m3
1409    punpckhwd            m2, m3
1410    pxor                 m3, m3
1411    psubw                m3, m5 ; negate odd elements during rounding
1412    pmulhrsw             m4, m5
1413    pmulhrsw             m0, m3
1414    pmulhrsw             m1, m5
1415    pmulhrsw             m2, m3
1416    punpcklwd            m3, m4, m0
1417    punpckhwd            m4, m0
1418    punpcklwd            m0, m1, m2
1419    punpckhwd            m1, m2
1420    vperm2i128           m2, m3, m0, 0x31
1421    vinserti128          m0, m3, xm0, 1
1422    vperm2i128           m3, m4, m1, 0x31
1423    vinserti128          m1, m4, xm1, 1
1424    jmp                tx2q
1425.pass2:
1426    pshufd               m4, m0, q1032
1427    pshufd               m5, m1, q1032
1428    call .main_pass2
1429    vpbroadcastd         m5, [o(pw_2048)]
1430    vpbroadcastd        xm4, [o(pw_4096)]
1431    psubw                m4, m5 ; lower half = 2048, upper half = -2048
1432.end:
1433    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
1434.end2:
1435    pmulhrsw             m0, m4
1436    pmulhrsw             m1, m4
1437.end3:
1438    pmulhrsw             m2, m4
1439    pmulhrsw             m3, m4
1440    WIN64_RESTORE_XMM
1441.end4:
1442    pxor                 m4, m4
1443    mova          [cq+32*0], m4
1444    mova          [cq+32*1], m4
1445    mova          [cq+32*2], m4
1446    mova          [cq+32*3], m4
1447    lea                  r3, [strideq*3]
1448    WRITE_8X4             0, 1, 4, 5
1449    lea                dstq, [dstq+strideq*4]
1450    WRITE_8X4             2, 3, 4, 5
1451    RET
1452ALIGN function_align
1453.main_pass1:
1454    IADST8_1D_PACKED 1
1455    ret
1456ALIGN function_align
1457cglobal_label .main_pass2
1458    IADST8_1D_PACKED 2
1459    ret
1460
1461INV_TXFM_8X8_FN flipadst, dct
1462INV_TXFM_8X8_FN flipadst, adst
1463INV_TXFM_8X8_FN flipadst, flipadst
1464INV_TXFM_8X8_FN flipadst, identity
1465
1466cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1467    vpermq               m4, [cq+32*0], q1302 ; 1 0
1468    vpermq               m3, [cq+32*3], q3120 ; 6 7
1469    vpermq               m5, [cq+32*1], q1302 ; 3 2
1470    vpermq               m2, [cq+32*2], q3120 ; 4 5
1471    call m(iadst_8x8_internal_8bpc).main_pass1
1472    vpbroadcastd         m5, [o(pw_16384)]
1473    punpckhwd            m4, m3, m2
1474    punpcklwd            m3, m2
1475    punpckhwd            m2, m1, m0
1476    punpcklwd            m1, m0
1477    pxor                 m0, m0
1478    psubw                m0, m5
1479    pmulhrsw             m4, m0
1480    pmulhrsw             m3, m5
1481    pmulhrsw             m2, m0
1482    pmulhrsw             m1, m5
1483    punpckhwd            m0, m4, m3
1484    punpcklwd            m4, m3
1485    punpckhwd            m3, m2, m1
1486    punpcklwd            m2, m1
1487    vinserti128          m1, m0, xm3, 1
1488    vperm2i128           m3, m0, m3, 0x31
1489    vinserti128          m0, m4, xm2, 1
1490    vperm2i128           m2, m4, m2, 0x31
1491    jmp                tx2q
1492.pass2:
1493    pshufd               m4, m0, q1032
1494    pshufd               m5, m1, q1032
1495    call m(iadst_8x8_internal_8bpc).main_pass2
1496    vpbroadcastd         m4, [o(pw_2048)]
1497    vpbroadcastd        xm5, [o(pw_4096)]
1498    psubw                m4, m5 ; lower half = -2048, upper half = 2048
1499    vpermq               m5, m3, q2031
1500    vpermq               m3, m0, q2031
1501    vpermq               m0, m2, q2031
1502    vpermq               m2, m1, q2031
1503    pmulhrsw             m1, m0, m4
1504    pmulhrsw             m0, m5, m4
1505    jmp m(iadst_8x8_internal_8bpc).end3
1506
1507INV_TXFM_8X8_FN identity, dct
1508INV_TXFM_8X8_FN identity, adst
1509INV_TXFM_8X8_FN identity, flipadst
1510INV_TXFM_8X8_FN identity, identity
1511
1512cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1513    mova                xm3, [cq+16*0]
1514    mova                xm2, [cq+16*1]
1515    vinserti128          m3, [cq+16*4], 1
1516    vinserti128          m2, [cq+16*5], 1
1517    mova                xm4, [cq+16*2]
1518    mova                xm0, [cq+16*3]
1519    vinserti128          m4, [cq+16*6], 1
1520    vinserti128          m0, [cq+16*7], 1
1521    punpcklwd            m1, m3, m2
1522    punpckhwd            m3, m2
1523    punpcklwd            m2, m4, m0
1524    punpckhwd            m4, m0
1525    punpckldq            m0, m1, m2
1526    punpckhdq            m1, m2
1527    punpckldq            m2, m3, m4
1528    punpckhdq            m3, m4
1529    jmp                tx2q
1530.pass2:
1531    vpbroadcastd         m4, [o(pw_4096)]
1532    jmp m(iadst_8x8_internal_8bpc).end
1533
1534%macro INV_TXFM_8X16_FN 2 ; type1, type2
1535    INV_TXFM_FN          %1, %2, 8x16
1536%ifidn %1_%2, dct_dct
1537    movd                xm1, [o(pw_2896x8)]
1538    pmulhrsw            xm0, xm1, [cq]
1539    movd                xm2, [o(pw_16384)]
1540    mov                [cq], eobd
1541    pmulhrsw            xm0, xm1
1542    or                  r3d, 16
1543    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
1544%endif
1545%endmacro
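; dct_dct blocks reaching this path contain only a DC coefficient: the value
; is pre-scaled (including the 2896/4096 ~= 1/sqrt(2) factor for rectangular
; sizes), [cq] is overwritten with eobd (zero here) to clear the coefficient,
; r3d is set to the row count, and the shared .dconly loop broadcasts the
; result and adds it to every pixel.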
1546
1547%macro ITX_8X16_LOAD_COEFS 0
1548    vpbroadcastd         m4, [o(pw_2896x8)]
1549    pmulhrsw             m0, m4, [cq+32*0]
1550    add                  cq, 32*4
1551    pmulhrsw             m7, m4, [cq+32*3]
1552    pmulhrsw             m1, m4, [cq-32*3]
1553    pmulhrsw             m6, m4, [cq+32*2]
1554    pmulhrsw             m2, m4, [cq-32*2]
1555    pmulhrsw             m5, m4, [cq+32*1]
1556    pmulhrsw             m3, m4, [cq-32*1]
1557    pmulhrsw             m4,     [cq+32*0]
1558%endmacro
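; Loads the 16 input rows packed two per register and pre-multiplies them by
; 2896/4096 ~= 1/sqrt(2), the extra scale AV1 applies to 2:1 rectangular
; transforms; the mid-macro add keeps every load within a byte-sized
; displacement of cq.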
1559
1560INV_TXFM_8X16_FN dct, dct
1561INV_TXFM_8X16_FN dct, adst
1562INV_TXFM_8X16_FN dct, flipadst
1563INV_TXFM_8X16_FN dct, identity
1564
1565cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1566    ITX_8X16_LOAD_COEFS
1567    call m(idct_16x8_internal_8bpc).main
1568    vpbroadcastd        m10, [o(pw_16384)]
1569.pass1_end:
1570    vperm2i128           m9, m3, m7, 0x31
1571    vinserti128          m3, xm7, 1
1572    vperm2i128           m8, m2, m6, 0x31
1573    vinserti128          m2, xm6, 1
1574    vperm2i128           m6, m1, m5, 0x31
1575    vinserti128          m1, xm5, 1
1576    vperm2i128           m5, m0, m4, 0x31
1577    vinserti128          m0, xm4, 1
1578    punpckhwd            m4, m2, m3
1579    punpcklwd            m2, m3
1580    punpckhwd            m3, m0, m1
1581    punpcklwd            m0, m1
1582.pass1_end2:
1583    punpckhwd            m7, m5, m6
1584    punpcklwd            m5, m6
1585    punpcklwd            m6, m8, m9
1586    punpckhwd            m8, m9
1587    REPX  {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
1588    punpckhdq            m1, m0, m2
1589    punpckldq            m0, m2
1590    punpckldq            m2, m3, m4
1591    punpckhdq            m3, m4
1592    punpckldq            m4, m5, m6
1593    punpckhdq            m5, m6
1594    punpckldq            m6, m7, m8
1595    punpckhdq            m7, m8
1596    jmp                tx2q
1597.pass2:
1598    call .main
1599    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
1600    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
1601.end:
1602    vpbroadcastd         m8, [o(pw_2048)]
1603.end2:
1604    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
1605.end3:
1606    pxor                 m8, m8
1607    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
1608    lea                  r3, [strideq*3]
1609    WRITE_8X4             0, 1, 8, 9
1610    lea                dstq, [dstq+strideq*4]
1611    WRITE_8X4             2, 3, 0, 1
1612    lea                dstq, [dstq+strideq*4]
1613    WRITE_8X4             4, 5, 0, 1
1614    lea                dstq, [dstq+strideq*4]
1615    WRITE_8X4             6, 7, 0, 1
1616    RET
1617ALIGN function_align
1618cglobal_label .main
1619    IDCT16_1D_PACKED
1620    ret
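; The 8x16 and 16x8 kernels share their 1-D transforms: an 8x16 block needs an
; 8-point transform across 16 lanes in one pass (idct_16x8's .main) and a
; packed 16-point transform in the other (the IDCT16_1D_PACKED above), so each
; function calls into the other's .main for the pass it lacks.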
1621
1622INV_TXFM_8X16_FN adst, dct
1623INV_TXFM_8X16_FN adst, adst
1624INV_TXFM_8X16_FN adst, flipadst
1625INV_TXFM_8X16_FN adst, identity
1626
1627cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1628    ITX_8X16_LOAD_COEFS
1629    call m(iadst_16x8_internal_8bpc).main
1630    call m(iadst_16x8_internal_8bpc).main_pass1_end
1631    vpbroadcastd        m10, [o(pw_16384)]
1632    pslld                m9, m10, 17
1633    psubw               m10, m9 ; 16384, -16384
1634    jmp m(idct_8x16_internal_8bpc).pass1_end
1635ALIGN function_align
1636.pass2:
1637    call .main
1638    call .main_pass2_end
1639    vpbroadcastd         m9, [o(pw_2048)]
1640    vpbroadcastd        xm8, [o(pw_4096)]
1641    psubw                m8, m9
1642    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
1643    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
1644    jmp m(idct_8x16_internal_8bpc).end2
1645ALIGN function_align
1646cglobal_label .main
1647    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
1648.main2:
1649    vpbroadcastd        m10, [o(pd_2048)]
1650    punpckhwd            m8, m7, m0 ; in14 in1
1651    punpcklwd            m0, m7     ; in0  in15
1652    punpcklwd            m7, m6, m1 ; in12 in3
1653    punpckhwd            m1, m6     ; in2  in13
1654    punpckhwd            m6, m5, m2 ; in10 in5
1655    punpcklwd            m2, m5     ; in4  in11
1656    punpcklwd            m5, m4, m3 ; in8  in7
1657    punpckhwd            m3, m4     ; in6  in9
1658    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
1659    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
1660    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
1661    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
1662    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
1663    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
1664    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
1665    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
1666    psubsw               m4, m0, m5 ; t9a  t8a
1667    paddsw               m0, m5     ; t1a  t0a
1668    psubsw               m5, m1, m6 ; t11a t10a
1669    paddsw               m1, m6     ; t3a  t2a
1670    psubsw               m6, m2, m7 ; t13a t12a
1671    paddsw               m2, m7     ; t5a  t4a
1672    psubsw               m7, m3, m8 ; t15a t14a
1673    paddsw               m3, m8     ; t7a  t6a
1674    vpbroadcastd        m11, [o(pw_m4017_799)]
1675    vpbroadcastd        m12, [o(pw_799_4017)]
1676    pxor                 m9, m9
1677    ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
1678    psubw                m8, m9, m11 ; pw_4017_m799
1679    ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
1680    vpbroadcastd        m11, [o(pw_m2276_3406)]
1681    vpbroadcastd        m12, [o(pw_3406_2276)]
1682    ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
1683    psubw                m8, m9, m11 ; pw_2276_m3406
1684    ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
1685    psubsw               m8, m1, m3 ; t7   t6
1686    paddsw               m1, m3     ; t3   t2
1687    psubsw               m3, m0, m2 ; t5   t4
1688    paddsw               m0, m2     ; t1   t0
1689    psubsw               m2, m5, m7 ; t14a t15a
1690    paddsw               m7, m5     ; t10a t11a
1691    psubsw               m5, m4, m6 ; t12a t13a
1692    paddsw               m4, m6     ; t8a  t9a
1693    vpbroadcastd        m11, [o(pw_m3784_1567)]
1694    vpbroadcastd        m12, [o(pw_1567_3784)]
1695    ITX_MUL2X_PACK        3, 6, _, 10, 12, 11, 6 ; t5a t4a
1696    psubw                m6, m9, m11 ; pw_3784_m1567
1697    ITX_MUL2X_PACK        8, 6, _, 10, 6, 12, 6  ; t7a t6a
1698    vpbroadcastd        m11, [o(pw_m1567_3784)]
1699    vpbroadcastd        m12, [o(pw_3784_1567)]
1700    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 6 ; t15 t14
1701    psubw                m6, m9, m11 ; pw_1567_m3784
1702    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 6 ; t13 t12
1703    vbroadcasti128      m12, [o(deint_shuf)]
1704    paddsw               m6, m4, m7        ; -out1  out14
1705    psubsw               m4, m7            ;  t10    t11
1706    psubsw              m11, m3, m8        ;  t7     t6
1707    paddsw               m8, m3            ;  out12 -out3
1708    psubsw               m3, m0, m1        ;  t3a    t2a
1709    paddsw               m0, m1            ; -out15  out0
1710    paddsw               m1, m2, m5        ; -out13  out2
1711    psubsw               m5, m2            ;  t15a   t14a
1712    pshufb               m0, m12
1713    pshufb               m6, m12
1714    pshufb               m8, m12
1715    pshufb               m1, m12
1716    shufps               m7, m6, m0, q1032 ;  out14 -out15
1717    vpblendd             m0, m6, 0x33      ; -out1   out0
1718    punpcklqdq           m6, m8, m1        ;  out12 -out13
1719    punpckhqdq           m1, m8, m1        ; -out3   out2
1720    ret
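; Packed 16-point ADST: the inputs are interleaved into (even, odd) word pairs
; and each ITX_MUL2X_PACK performs one coefficient-pair rotation, accumulating
; in 32 bits and rounding with pd_2048 before the >>12. The outputs carry
; alternating signs (-out1, out0, ...); the callers fold the sign corrections
; into their rounding multiplies.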
1721ALIGN function_align
1722.main_pass1_end:
1723    vpbroadcastd         m8, [o(pw_m2896_2896)]
1724    vpbroadcastd        m12, [o(pw_2896_2896)]
1725    pmaddwd              m9, m8, m11       ; -out11
1726    pmaddwd              m2, m12, m5       ; -out5
1727    pmaddwd              m5, m8            ;  out10
1728    pmaddwd             m11, m12           ;  out4
1729    REPX     {paddd x, m10}, m9, m5, m2, m11
1730    REPX     {psrad x, 12 }, m9, m5, m2, m11
1731    packssdw             m5, m9            ;  out10 -out11
1732    packssdw             m2, m11           ; -out5   out4
1733    pmaddwd             m11, m8, m3        ;  out8
1734    vpbroadcastd         m8, [o(pw_2896_m2896)]
1735    pmaddwd              m3, m12           ; -out7
1736    pmaddwd              m8, m4            ; -out9
1737    pmaddwd              m4, m12           ;  out6
1738    REPX     {paddd x, m10}, m11, m3, m8, m4
1739    REPX     {psrad x, 12 }, m11, m3, m8, m4
1740    packssdw             m3, m4            ; -out7   out6
1741    packssdw             m4, m11, m8       ;  out8  -out9
1742    vpbroadcastd        m10, [o(pw_16384)]
1743    pxor                 m9, m9
1744    ret
1745ALIGN function_align
1746cglobal_label .main_pass2_end
1747    vpbroadcastd         m8, [o(pw_2896x8)]
1748    pshufb               m2, m11, m12
1749    pshufb               m5, m12
1750    pshufb               m3, m12
1751    pshufb               m4, m12
1752    punpcklqdq          m11, m5, m2        ;  t15a   t7
1753    punpckhqdq           m5, m2            ;  t14a   t6
1754    shufps               m2, m3, m4, q1032 ;  t2a    t10
1755    vpblendd             m3, m4, 0xcc      ;  t3a    t11
1756    psubsw               m4, m2, m3        ;  out8  -out9
1757    paddsw               m3, m2            ; -out7   out6
1758    paddsw               m2, m5, m11       ; -out5   out4
1759    psubsw               m5, m11           ;  out10 -out11
1760    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
1761    ret
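; .main_pass1_end keeps the final 1/sqrt(2) butterflies in 32-bit precision
; (pmaddwd + psrad 12), whereas .main_pass2_end uses 16-bit adds and a single
; pmulhrsw by 2896*8; since pass 2 output is clipped to pixel range anyway,
; the cheaper 16-bit path gives the same result (see the matching comment in
; the 16x16 version below).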
1762
1763INV_TXFM_8X16_FN flipadst, dct
1764INV_TXFM_8X16_FN flipadst, adst
1765INV_TXFM_8X16_FN flipadst, flipadst
1766INV_TXFM_8X16_FN flipadst, identity
1767
1768cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1769    ITX_8X16_LOAD_COEFS
1770    call m(iadst_16x8_internal_8bpc).main
1771    call m(iadst_16x8_internal_8bpc).main_pass1_end
1772    vpbroadcastd         m9, [o(pw_16384)]
1773    pslld               m10, m9, 17
1774    psubw               m10, m9 ; -16384, 16384
1775    vperm2i128           m9, m4, m0, 0x31
1776    vinserti128          m0, m4, xm0, 1
1777    vperm2i128           m8, m5, m1, 0x31
1778    vinserti128          m4, m5, xm1, 1
1779    vperm2i128           m5, m7, m3, 0x31
1780    vinserti128          m3, m7, xm3, 1
1781    vinserti128          m1, m6, xm2, 1
1782    vperm2i128           m6, m6, m2, 0x31
1783    punpcklwd            m2, m4, m0
1784    punpckhwd            m4, m0
1785    punpcklwd            m0, m3, m1
1786    punpckhwd            m3, m1
1787    jmp m(idct_8x16_internal_8bpc).pass1_end2
1788.pass2:
1789    call m(iadst_8x16_internal_8bpc).main
1790    call m(iadst_8x16_internal_8bpc).main_pass2_end
1791    vpbroadcastd         m8, [o(pw_2048)]
1792    vpbroadcastd        xm9, [o(pw_4096)]
1793    psubw                m8, m9
1794    vpermq               m9, m0, q3120
1795    vpermq               m0, m7, q2031
1796    vpermq               m7, m1, q3120
1797    vpermq               m1, m6, q2031
1798    vpermq               m6, m2, q3120
1799    vpermq               m2, m5, q2031
1800    vpermq               m5, m3, q3120
1801    vpermq               m3, m4, q2031
1802    pmulhrsw             m0, m8
1803    pmulhrsw             m1, m8
1804    pmulhrsw             m2, m8
1805    pmulhrsw             m3, m8
1806    pmulhrsw             m4, m5, m8
1807    pmulhrsw             m5, m6, m8
1808    pmulhrsw             m6, m7, m8
1809    pmulhrsw             m7, m9, m8
1810    jmp m(idct_8x16_internal_8bpc).end3
1811
1812INV_TXFM_8X16_FN identity, dct
1813INV_TXFM_8X16_FN identity, adst
1814INV_TXFM_8X16_FN identity, flipadst
1815INV_TXFM_8X16_FN identity, identity
1816
1817%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
1818    pmulhrsw            m%2, m%3, m%1
1819%if %0 == 4 ; if downshifting by 1
1820    pmulhrsw            m%2, m%4
1821%else
1822    paddsw              m%1, m%1
1823%endif
1824    paddsw              m%1, m%2
1825%endmacro
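; IDTX16 applies the 16-point identity scale of 2*sqrt(2). A scalar sketch of
; the 3-argument form (illustrative only):
;   tmp = (x * 1697*16 + (1 << 14)) >> 15   ; pmulhrsw, ~0.8284*x
;   out = 2*x + tmp                         ; ~2.8284*x = 2*sqrt(2)*x
; The optional 4th argument (pw_16384) instead halves tmp and skips the
; doubling, yielding ~sqrt(2)*x, i.e. the result downshifted by 1.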
1826
1827cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1828    mova                xm3, [cq+16*0]
1829    mova                xm2, [cq+16*2]
1830    add                  cq, 16*8
1831    vinserti128          m3, [cq+16*0], 1
1832    vinserti128          m2, [cq+16*2], 1
1833    vpbroadcastd         m9, [o(pw_2896x8)]
1834    mova                xm4, [cq-16*4]
1835    mova                xm5, [cq-16*2]
1836    vinserti128          m4, [cq+16*4], 1
1837    vinserti128          m5, [cq+16*6], 1
1838    mova                xm7, [cq-16*7]
1839    mova                xm6, [cq-16*5]
1840    vinserti128          m7, [cq+16*1], 1
1841    vinserti128          m6, [cq+16*3], 1
1842    mova                xm8, [cq-16*3]
1843    mova                xm0, [cq-16*1]
1844    vinserti128          m8, [cq+16*5], 1
1845    vinserti128          m0, [cq+16*7], 1
1846    punpcklwd            m1, m3, m2
1847    punpckhwd            m3, m2
1848    punpcklwd            m2, m4, m5
1849    punpckhwd            m4, m5
1850    punpcklwd            m5, m7, m6
1851    punpckhwd            m7, m6
1852    punpcklwd            m6, m8, m0
1853    punpckhwd            m8, m0
1854    REPX   {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
1855    punpckldq            m0, m1, m2
1856    punpckhdq            m1, m2
1857    punpckldq            m2, m3, m4
1858    punpckhdq            m3, m4
1859    punpckldq            m4, m5, m6
1860    punpckhdq            m5, m6
1861    punpckldq            m6, m7, m8
1862    punpckhdq            m7, m8
1863    jmp                tx2q
1864.pass2:
1865    vpbroadcastd         m8, [o(pw_1697x16)]
1866    REPX {vpermq   x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
1867    REPX {IDTX16   x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
1868    jmp m(idct_8x16_internal_8bpc).end
1869
1870%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
1871    pmovzxbw            m%3, [dstq+%5]
1872%ifnum %1
1873    paddw               m%3, m%1
1874%else
1875    paddw               m%3, %1
1876%endif
1877    pmovzxbw            m%4, [dstq+%6]
1878%ifnum %2
1879    paddw               m%4, m%2
1880%else
1881    paddw               m%4, %2
1882%endif
1883    packuswb            m%3, m%4
1884    vpermq              m%3, m%3, q3120
1885    mova          [dstq+%5], xm%3
1886    vextracti128  [dstq+%6], m%3, 1
1887%endmacro
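; WRITE_16X2 adds two rows of 16-bit residuals to two 16-pixel destination
; rows: pmovzxbw widens the pixels, paddw adds the (register or in-memory)
; coefficients, packuswb clamps back to 8 bits, and vpermq q3120 undoes
; packuswb's per-128-bit-lane interleaving before the two rows are stored.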
1888
1889%macro INV_TXFM_16X4_FN 2 ; type1, type2
1890    INV_TXFM_FN          %1, %2, 16x4
1891%ifidn %1_%2, dct_dct
1892    movd                xm1, [o(pw_2896x8)]
1893    pmulhrsw            xm0, xm1, [cq]
1894    movd                xm2, [o(pw_16384)]
1895    mov                [cq], eobd
1896    or                  r3d, 4
1897.dconly:
1898    pmulhrsw            xm0, xm2
1899    movd                xm2, [pw_2048] ; intentionally rip-relative
1900    pmulhrsw            xm0, xm1
1901    pmulhrsw            xm0, xm2
1902    vpbroadcastw         m0, xm0
1903    pxor                 m3, m3
1904.dconly_loop:
1905    mova                xm1, [dstq+strideq*0]
1906    vinserti128          m1, [dstq+strideq*1], 1
1907    punpckhbw            m2, m1, m3
1908    punpcklbw            m1, m3
1909    paddw                m2, m0
1910    paddw                m1, m0
1911    packuswb             m1, m2
1912    mova         [dstq+strideq*0], xm1
1913    vextracti128 [dstq+strideq*1], m1, 1
1914    lea                dstq, [dstq+strideq*2]
1915    sub                 r3d, 2
1916    jg .dconly_loop
1917    RET
1918%endif
1919%endmacro
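; .dconly adds a single DC value to the whole block: the DC is run through the
; remaining scaling steps (xm2, 2896/4096 and the final 2048/32768 rounding),
; broadcast to all lanes, and added to two rows per iteration with unsigned
; saturation. The bare [pw_2048] load is RIP-relative on purpose, presumably
; because this label is also reached via jmp from functions assembled with a
; different o_base and thus cannot rely on the caller's constant base register.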
1920
1921INV_TXFM_16X4_FN dct, dct
1922INV_TXFM_16X4_FN dct, adst
1923INV_TXFM_16X4_FN dct, flipadst
1924INV_TXFM_16X4_FN dct, identity
1925
1926cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1927    mova                xm0, [cq+16*0]
1928    mova                xm1, [cq+16*1]
1929    mova                xm2, [cq+16*2]
1930    mova                xm3, [cq+16*3]
1931    mova                xm4, [cq+16*4]
1932    mova                xm5, [cq+16*5]
1933    mova                xm6, [cq+16*6]
1934    mova                xm7, [cq+16*7]
1935    call m(idct_4x16_internal_8bpc).main
1936    vinserti128          m6, m2, xm6, 1
1937    vinserti128          m2, m0, xm4, 1
1938    vinserti128          m0, m1, xm5, 1
1939    vinserti128          m1, m3, xm7, 1
1940    punpcklwd            m3, m2, m6
1941    punpckhwd            m2, m6
1942    vpbroadcastd         m6, [o(pw_16384)]
1943    punpckhwd            m4, m0, m1
1944    punpcklwd            m0, m1
1945    mova                 m1, m6
1946    jmp m(iadst_16x4_internal_8bpc).pass1_end
1947.pass2:
1948    call .main
1949    jmp m(iadst_16x4_internal_8bpc).end
1950ALIGN function_align
1951cglobal_label .main
1952    vpbroadcastd         m6, [o(pd_2048)]
1953    IDCT4_1D              0, 1, 2, 3, 4, 5, 6
1954    ret
1955
1956INV_TXFM_16X4_FN adst, dct
1957INV_TXFM_16X4_FN adst, adst
1958INV_TXFM_16X4_FN adst, flipadst
1959INV_TXFM_16X4_FN adst, identity
1960
1961cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1962    vpermq               m0, [cq+32*0], q1230
1963    vpermq               m3, [cq+32*3], q2103
1964    vpermq               m1, [cq+32*1], q1230
1965    vpermq               m2, [cq+32*2], q2103
1966    call m(iadst_4x16_internal_8bpc).main2
1967    call m(iadst_4x16_internal_8bpc).main_pass1_end
1968    punpcklwd            m4, m3, m1
1969    punpcklwd            m5, m2, m0
1970    punpckhwd            m0, m1
1971    punpckhwd            m2, m3
1972    vpbroadcastd         m1, [o(pw_16384)]
1973    vinserti128          m3, m0, xm2, 1
1974    vperm2i128           m2, m0, m2, 0x31
1975    vinserti128          m0, m4, xm5, 1
1976    vperm2i128           m4, m4, m5, 0x31
1977    psubw                m6, m7, m1
1978.pass1_end:
1979    pmulhrsw             m3, m1
1980    pmulhrsw             m2, m6
1981    pmulhrsw             m4, m1
1982    pmulhrsw             m0, m6
1983    punpcklwd            m1, m3, m2
1984    punpckhwd            m3, m2
1985    punpcklwd            m2, m4, m0
1986    punpckhwd            m4, m0
1987    punpckldq            m0, m1, m2
1988    punpckhdq            m1, m2
1989    punpckldq            m2, m3, m4
1990    punpckhdq            m3, m4
1991    jmp                tx2q
1992.pass2:
1993    call .main
1994.end:
1995    vpbroadcastd         m4, [o(pw_2048)]
1996    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
1997    WIN64_RESTORE_XMM
1998.end2:
1999    pxor                 m4, m4
2000    mova          [cq+32*0], m4
2001    mova          [cq+32*1], m4
2002    mova          [cq+32*2], m4
2003    mova          [cq+32*3], m4
2004.end3:
2005    WRITE_16X2            0, 1, 4, 5, strideq*0, strideq*1
2006    lea                dstq, [dstq+strideq*2]
2007    WRITE_16X2            2, 3, 4, 5, strideq*0, strideq*1
2008    RET
2009ALIGN function_align
2010cglobal_label .main
2011    vpbroadcastd         m6, [o(pw_m3344_3344)]
2012    vpbroadcastd         m7, [o(pw_3803_1321)]
2013    vpbroadcastd         m8, [o(pw_m1321_2482)]
2014    vpbroadcastd         m9, [o(pw_2482_3344)]
2015    punpcklwd            m4, m2, m0 ; in2 in0 l
2016    punpckhwd            m2, m0     ; in2 in0 h
2017    psrld                m5, m6, 16
2018    pmaddwd             m10, m6, m4 ; t2:02 l
2019    pmaddwd              m6, m2     ; t2:02 h
2020    pmaddwd              m0, m7, m4 ; t0:02 l
2021    pmaddwd              m7, m2     ; t0:02 h
2022    pmaddwd              m4, m8     ; t1:02 l
2023    pmaddwd              m8, m2     ; t1:02 h
2024    punpckhwd            m2, m3, m1 ; in3 in1 h
2025    punpcklwd            m3, m1     ; in3 in1 l
2026    pmaddwd              m1, m5, m2 ; t2:3 h
2027    pmaddwd              m5, m3     ; t2:3 l
2028    paddd                m6, m1
2029    vpbroadcastd         m1, [o(pd_2048)]
2030    paddd               m10, m5
2031    pmaddwd              m5, m9, m3
2032    pmaddwd              m9, m2
2033    paddd                m0, m1
2034    paddd                m7, m1
2035    paddd                m0, m5     ; t0 + t3 + 2048 l
2036    paddd                m7, m9     ; t0 + t3 + 2048 h
2037    vpbroadcastd         m9, [o(pw_m3803_3344)]
2038    pmaddwd              m5, m9, m2
2039    pmaddwd              m9, m3
2040    paddd               m10, m1     ; t2 + 2048 l
2041    paddd                m6, m1     ; t2 + 2048 h
2042    paddd                m5, m1     ; t1:13 + 2048 h
2043    paddd                m1, m9     ; t1:13 + 2048 l
2044    vpbroadcastd         m9, [o(pw_m3803_m6688)]
2045    pmaddwd              m2, m9
2046    pmaddwd              m3, m9
2047    paddd                m5, m8     ; t1 + t3 + 2048 h
2048    paddd                m1, m4     ; t1 + t3 + 2048 l
2049    paddd                m8, m7
2050    paddd                m4, m0
2051    paddd                m2, m8     ; t0 + t1 - t3 + 2048 h
2052    paddd                m3, m4     ; t0 + t1 - t3 + 2048 l
2053    REPX      {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
2054    packssdw             m0, m7
2055    packssdw             m1, m5
2056    packssdw             m3, m2
2057    packssdw             m2, m10, m6
2058    ret
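; 4-point ADST over 16 lanes, built on the sinpi(1..4)/9 constants
; 1321/2482/3344/3803: the inputs are interleaved into (in2,in0) and (in3,in1)
; word pairs, pmaddwd accumulates the products in 32 bits, and the sums are
; rounded with pd_2048 and shifted right by 12 before packing back to 16 bits.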
2059
2060INV_TXFM_16X4_FN flipadst, dct
2061INV_TXFM_16X4_FN flipadst, adst
2062INV_TXFM_16X4_FN flipadst, flipadst
2063INV_TXFM_16X4_FN flipadst, identity
2064
2065cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
2066    vpermq               m0, [cq+32*0], q1230
2067    vpermq               m3, [cq+32*3], q2103
2068    vpermq               m1, [cq+32*1], q1230
2069    vpermq               m2, [cq+32*2], q2103
2070    call m(iadst_4x16_internal_8bpc).main2
2071    call m(iadst_4x16_internal_8bpc).main_pass1_end
2072    punpckhwd            m4, m3, m2
2073    punpckhwd            m5, m1, m0
2074    punpcklwd            m0, m2
2075    punpcklwd            m1, m3
2076    vpbroadcastd         m6, [o(pw_16384)]
2077    vinserti128          m3, m0, xm1, 1
2078    vperm2i128           m2, m0, m1, 0x31
2079    vinserti128          m0, m4, xm5, 1
2080    vperm2i128           m4, m4, m5, 0x31
2081    psubw                m1, m7, m6
2082    jmp m(iadst_16x4_internal_8bpc).pass1_end
2083ALIGN function_align
2084.pass2:
2085    call m(iadst_16x4_internal_8bpc).main
2086    vpbroadcastd         m4, [o(pw_2048)]
2087    REPX   {pmulhrsw x, m4}, m3, m2, m1, m0
2088    pxor                 m4, m4
2089    mova          [cq+32*0], m4
2090    mova          [cq+32*1], m4
2091    mova          [cq+32*2], m4
2092    mova          [cq+32*3], m4
2093    WRITE_16X2            3, 2, 4, 5, strideq*0, strideq*1
2094    lea                dstq, [dstq+strideq*2]
2095    WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
2096    RET
2097
2098INV_TXFM_16X4_FN identity, dct
2099INV_TXFM_16X4_FN identity, adst
2100INV_TXFM_16X4_FN identity, flipadst
2101INV_TXFM_16X4_FN identity, identity
2102
2103cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
2104    mova                xm2, [cq+16*0]
2105    mova                xm4, [cq+16*1]
2106    vinserti128          m2, [cq+16*4], 1
2107    vinserti128          m4, [cq+16*5], 1
2108    mova                xm0, [cq+16*2]
2109    mova                xm1, [cq+16*3]
2110    vinserti128          m0, [cq+16*6], 1
2111    vinserti128          m1, [cq+16*7], 1
2112    vpbroadcastd         m7, [o(pw_1697x16)]
2113    vpbroadcastd         m8, [o(pw_16384)]
2114    punpcklwd            m3, m2, m4
2115    punpckhwd            m2, m4
2116    punpcklwd            m4, m0, m1
2117    punpckhwd            m0, m1
2118    punpcklwd            m1, m3, m2
2119    punpckhwd            m3, m2
2120    punpcklwd            m2, m4, m0
2121    punpckhwd            m4, m0
2122    pmulhrsw             m0, m7, m1
2123    pmulhrsw             m5, m7, m2
2124    pmulhrsw             m6, m7, m3
2125    pmulhrsw             m7, m4
2126    REPX   {pmulhrsw x, m8}, m0, m5, m6, m7
2127    paddsw               m1, m0
2128    paddsw               m2, m5
2129    paddsw               m3, m6
2130    paddsw               m4, m7
2131    punpcklqdq           m0, m1, m2
2132    punpckhqdq           m1, m2
2133    punpcklqdq           m2, m3, m4
2134    punpckhqdq           m3, m4
2135    jmp                tx2q
2136.pass2:
2137    vpbroadcastd         m7, [o(pw_1697x8)]
2138    pmulhrsw             m4, m7, m0
2139    pmulhrsw             m5, m7, m1
2140    pmulhrsw             m6, m7, m2
2141    pmulhrsw             m7, m3
2142    paddsw               m0, m4
2143    paddsw               m1, m5
2144    paddsw               m2, m6
2145    paddsw               m3, m7
2146    jmp m(iadst_16x4_internal_8bpc).end
2147
2148%macro INV_TXFM_16X8_FN 2 ; type1, type2
2149    INV_TXFM_FN          %1, %2, 16x8
2150%ifidn %1_%2, dct_dct
2151    movd                xm1, [o(pw_2896x8)]
2152    pmulhrsw            xm0, xm1, [cq]
2153    movd                xm2, [o(pw_16384)]
2154    mov                [cq], eobd
2155    pmulhrsw            xm0, xm1
2156    or                  r3d, 8
2157    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
2158%endif
2159%endmacro
2160
2161%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
2162    vpbroadcastd         m8, [o(pw_2896x8)]
2163    vpermq               m0, [cq+32*0], q3120
2164    add                  cq, 32*4
2165    vpermq               m7, [cq+32*3], q%1
2166    vpermq               m1, [cq-32*3], q%1
2167    vpermq               m6, [cq+32*2], q3120
2168    vpermq               m2, [cq-32*2], q3120
2169    vpermq               m5, [cq+32*1], q%1
2170    vpermq               m3, [cq-32*1], q%1
2171    vpermq               m4, [cq+32*0], q3120
2172    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
2173%endmacro
2174
2175INV_TXFM_16X8_FN dct, dct
2176INV_TXFM_16X8_FN dct, adst
2177INV_TXFM_16X8_FN dct, flipadst
2178INV_TXFM_16X8_FN dct, identity
2179
2180cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2181    ITX_16X8_LOAD_COEFS 3120
2182    call m(idct_8x16_internal_8bpc).main
2183    vpbroadcastd        m10, [o(pw_16384)]
2184    punpckhwd            m8, m0, m2
2185    punpcklwd            m0, m2
2186    punpckhwd            m2, m1, m3
2187    punpcklwd            m1, m3
2188    punpcklwd            m9, m4, m6
2189    punpckhwd            m4, m6
2190    punpcklwd            m6, m5, m7
2191    punpckhwd            m5, m7
2192    REPX  {pmulhrsw x, m10}, m8, m1, m4, m6
2193.pass1_end:
2194    REPX  {pmulhrsw x, m10}, m0, m2, m9, m5
2195    punpckhwd            m3, m0, m8
2196    punpcklwd            m0, m8
2197    punpckhwd            m8, m2, m1
2198    punpcklwd            m2, m1
2199    punpcklwd            m7, m9, m4
2200    punpckhwd            m9, m4
2201    punpcklwd            m4, m5, m6
2202    punpckhwd            m5, m6
2203    punpckhdq            m1, m0, m2
2204    punpckldq            m0, m2
2205    punpckldq            m2, m3, m8
2206    punpckhdq            m3, m8
2207    punpckldq            m6, m7, m4
2208    punpckhdq            m7, m4
2209    punpckldq            m8, m9, m5
2210    punpckhdq            m9, m5
2211    vperm2i128           m4, m0, m6, 0x31
2212    vinserti128          m0, xm6, 1
2213    vperm2i128           m5, m1, m7, 0x31
2214    vinserti128          m1, xm7, 1
2215    vperm2i128           m6, m2, m8, 0x31
2216    vinserti128          m2, xm8, 1
2217    vperm2i128           m7, m3, m9, 0x31
2218    vinserti128          m3, xm9, 1
2219    jmp                tx2q
2220.pass2:
2221    call .main
2222    vpbroadcastd         m8, [o(pw_2048)]
2223.end:
2224    REPX   {pmulhrsw x, m8}, m0, m2, m4, m6
2225.end2:
2226    REPX   {pmulhrsw x, m8}, m1, m3, m5, m7
2227    lea                  r3, [strideq*3]
2228    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
2229    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
2230.end3:
2231    pxor                 m0, m0
2232    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
2233.end4:
2234    lea                dstq, [dstq+strideq*4]
2235    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
2236    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
2237    RET
2238ALIGN function_align
2239cglobal_label .main
2240    vpbroadcastd        m10, [o(pd_2048)]
2241.main2:
2242    IDCT8_1D              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
2243    ret
2244
2245INV_TXFM_16X8_FN adst, dct
2246INV_TXFM_16X8_FN adst, adst
2247INV_TXFM_16X8_FN adst, flipadst
2248INV_TXFM_16X8_FN adst, identity
2249
2250cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2251    ITX_16X8_LOAD_COEFS 1302
2252    call m(iadst_8x16_internal_8bpc).main2
2253    call m(iadst_8x16_internal_8bpc).main_pass1_end
2254    psubw               m11, m9, m10
2255    punpcklwd            m8, m0, m2
2256    punpckhwd            m0, m2
2257    punpckhwd            m2, m1, m3
2258    punpcklwd            m1, m3
2259    punpcklwd            m9, m4, m6
2260    punpckhwd            m4, m6
2261    punpckhwd            m6, m5, m7
2262    punpcklwd            m5, m7
2263    REPX  {pmulhrsw x, m11}, m8, m1, m4, m6
2264    jmp m(idct_16x8_internal_8bpc).pass1_end
2265ALIGN function_align
2266.pass2:
2267    call .main
2268    call .main_pass2_end
2269    pxor                 m8, m8
2270    psubw                m8, m9
2271    REPX   {pmulhrsw x, m9}, m0, m2, m4, m6
2272    jmp m(idct_16x8_internal_8bpc).end2
2273ALIGN function_align
2274cglobal_label .main
2275    vpbroadcastd        m10, [o(pd_2048)]
2276    ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
2277    ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
2278    ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
2279    ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
2280    psubsw               m8, m2, m6 ; t6
2281    paddsw               m2, m6     ; t2
2282    psubsw               m6, m0, m4 ; t4
2283    paddsw               m0, m4     ; t0
2284    psubsw               m4, m5, m1 ; t7
2285    paddsw               m5, m1     ; t3
2286    psubsw               m1, m7, m3 ; t5
2287    paddsw               m7, m3     ; t1
2288    ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
2289    ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
2290    psubsw               m9, m6, m8 ;  t7
2291    paddsw               m6, m8     ;  out6
2292    psubsw               m3, m7, m5 ;  t3
2293    paddsw               m7, m5     ; -out7
2294    psubsw               m5, m0, m2 ;  t2
2295    paddsw               m0, m2     ;  out0
2296    psubsw               m2, m1, m4 ;  t6
2297    paddsw               m1, m4     ; -out1
2298    ret
2299ALIGN function_align
2300.main_pass1_end:
2301    vpbroadcastd        m11, [o(pw_m2896_2896)]
2302    vpbroadcastd        m12, [o(pw_2896_2896)]
2303    punpckhwd            m4, m3, m5
2304    punpcklwd            m3, m5
2305    pmaddwd              m5, m11, m4
2306    pmaddwd              m4, m12
2307    pmaddwd              m8, m11, m3
2308    pmaddwd              m3, m12
2309    REPX     {paddd x, m10}, m5, m4, m8, m3
2310    REPX     {psrad x, 12 }, m5, m8, m4, m3
2311    packssdw             m3, m4     ; -out3
2312    packssdw             m4, m8, m5 ;  out4
2313    punpcklwd            m5, m9, m2
2314    punpckhwd            m9, m2
2315    pmaddwd              m2, m12, m5
2316    pmaddwd              m5, m11
2317    pmaddwd             m12, m9
2318    pmaddwd             m11, m9
2319    REPX     {paddd x, m10}, m2, m5, m12, m11
2320    REPX     {psrad x, 12 }, m2, m12, m5, m11
2321    packssdw             m2, m12    ;  out2
2322    packssdw             m5, m11    ; -out5
2323    ret
2324ALIGN function_align
2325cglobal_label .main_pass2_end
2326    vpbroadcastd         m8, [o(pw_2896x8)]
2327    psubsw               m4, m5, m3
2328    paddsw               m3, m5
2329    psubsw               m5, m2, m9
2330    paddsw               m2, m9
2331    pmulhrsw             m2, m8     ;  out2
2332    pmulhrsw             m3, m8     ; -out3
2333    pmulhrsw             m4, m8     ;  out4
2334    pmulhrsw             m5, m8     ; -out5
2335    vpbroadcastd         m9, [o(pw_2048)]
2336    ret
2337
2338INV_TXFM_16X8_FN flipadst, dct
2339INV_TXFM_16X8_FN flipadst, adst
2340INV_TXFM_16X8_FN flipadst, flipadst
2341INV_TXFM_16X8_FN flipadst, identity
2342
2343cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2344    ITX_16X8_LOAD_COEFS 1302
2345    call m(iadst_8x16_internal_8bpc).main2
2346    call m(iadst_8x16_internal_8bpc).main_pass1_end
2347    psubw                m9, m10
2348    punpcklwd            m8, m6, m4
2349    punpckhwd            m6, m4
2350    punpcklwd            m4, m7, m5
2351    punpckhwd            m7, m5
2352    punpckhwd            m5, m3, m1
2353    punpcklwd            m3, m1
2354    punpckhwd            m1, m2, m0
2355    punpcklwd            m2, m0
2356    REPX  {pmulhrsw x, m10}, m8, m4, m5, m1
2357    REPX  {pmulhrsw x, m9 }, m6, m7, m3, m2
2358    punpcklwd            m0, m7, m4
2359    punpckhwd            m7, m4
2360    punpckhwd            m4, m6, m8
2361    punpcklwd            m6, m8
2362    punpckhwd            m8, m3, m5
2363    punpcklwd            m3, m5
2364    punpcklwd            m5, m2, m1
2365    punpckhwd            m2, m1
2366    punpckhdq            m1, m0, m6
2367    punpckldq            m0, m6
2368    punpckldq            m6, m7, m4
2369    punpckhdq            m7, m4
2370    punpckhdq            m4, m3, m5
2371    punpckldq            m3, m5
2372    punpckldq            m5, m8, m2
2373    punpckhdq            m8, m2
2374    vinserti128          m2, m6, xm5, 1
2375    vperm2i128           m6, m5, 0x31
2376    vperm2i128           m5, m1, m4, 0x31
2377    vinserti128          m1, xm4, 1
2378    vperm2i128           m4, m0, m3, 0x31
2379    vinserti128          m0, xm3, 1
2380    vinserti128          m3, m7, xm8, 1
2381    vperm2i128           m7, m8, 0x31
2382    jmp                tx2q
2383.pass2:
2384    call m(iadst_16x8_internal_8bpc).main
2385    call m(iadst_16x8_internal_8bpc).main_pass2_end
2386    pxor                 m8, m8
2387    psubw                m8, m9
2388    pmulhrsw            m10, m7, m8
2389    pmulhrsw             m7, m0, m9
2390    pmulhrsw             m0, m6, m9
2391    pmulhrsw             m6, m1, m8
2392    pmulhrsw             m1, m5, m8
2393    pmulhrsw             m5, m2, m9
2394    pmulhrsw             m2, m4, m9
2395    pmulhrsw             m4, m3, m8
2396    lea                  r3, [strideq*3]
2397    WRITE_16X2           10, 0, 8, 9, strideq*0, strideq*1
2398    WRITE_16X2            1, 2, 0, 1, strideq*2, r3
2399    jmp m(idct_16x8_internal_8bpc).end3
2400
2401INV_TXFM_16X8_FN identity, dct
2402INV_TXFM_16X8_FN identity, adst
2403INV_TXFM_16X8_FN identity, flipadst
2404INV_TXFM_16X8_FN identity, identity
2405
2406cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2407    mova                xm7, [cq+16*0]
2408    mova                xm2, [cq+16*1]
2409    add                  cq, 16*8
2410    vpbroadcastd         m3, [o(pw_2896x8)]
2411    vinserti128          m7, [cq+16*0], 1
2412    vinserti128          m2, [cq+16*1], 1
2413    mova                xm6, [cq-16*6]
2414    mova                xm4, [cq-16*5]
2415    vinserti128          m6, [cq+16*2], 1
2416    vinserti128          m4, [cq+16*3], 1
2417    mova                xm8, [cq-16*4]
2418    mova                xm5, [cq-16*3]
2419    vinserti128          m8, [cq+16*4], 1
2420    vinserti128          m5, [cq+16*5], 1
2421    mova                xm0, [cq-16*2]
2422    mova                xm1, [cq-16*1]
2423    vinserti128          m0, [cq+16*6], 1
2424    vinserti128          m1, [cq+16*7], 1
2425    vpbroadcastd        m10, [o(pw_1697x16)]
2426    vpbroadcastd        m11, [o(pw_16384)]
2427    REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
2428    punpcklwd            m3, m7, m2
2429    punpckhwd            m7, m2
2430    punpcklwd            m2, m6, m4
2431    punpckhwd            m6, m4
2432    punpcklwd            m4, m8, m5
2433    punpckhwd            m8, m5
2434    punpcklwd            m5, m0, m1
2435    punpckhwd            m0, m1
2436    punpckldq            m1, m3, m2
2437    punpckhdq            m3, m2
2438    punpckldq            m2, m4, m5
2439    punpckhdq            m4, m5
2440    punpckldq            m5, m7, m6
2441    punpckhdq            m7, m6
2442    punpckldq            m6, m8, m0
2443    punpckhdq            m8, m0
2444    REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
2445    punpcklqdq           m0, m1, m2
2446    punpckhqdq           m1, m2
2447    punpcklqdq           m2, m3, m4
2448    punpckhqdq           m3, m4
2449    punpcklqdq           m4, m5, m6
2450    punpckhqdq           m5, m6
2451    punpcklqdq           m6, m7, m8
2452    punpckhqdq           m7, m8
2453    jmp                tx2q
2454.pass2:
2455    vpbroadcastd         m8, [o(pw_4096)]
2456    jmp m(idct_16x8_internal_8bpc).end
2457
2458%define o_base pw_5 + 128
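; Re-anchor the constant base for the 16x16 and larger functions; presumably
; this keeps their most frequently used constants within a signed 8-bit
; displacement of the base register.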
2459
2460%macro INV_TXFM_16X16_FN 2 ; type1, type2
2461    INV_TXFM_FN          %1, %2, 16x16
2462%ifidn %1_%2, dct_dct
2463    movd                xm1, [o(pw_2896x8)]
2464    pmulhrsw            xm0, xm1, [cq]
2465    movd                xm2, [o(pw_8192)]
2466    mov                [cq], eobd
2467    or                  r3d, 16
2468    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
2469%endif
2470%endmacro
2471
2472%macro ITX_16X16_LOAD_COEFS 0
2473    mova                 m0, [cq+32*0]
2474    mova                 m1, [cq+32*1]
2475    mova                 m2, [cq+32*2]
2476    mova                 m3, [cq+32*3]
2477    add                  cq, 32*8
2478    mova                 m4, [cq-32*4]
2479    mova                 m5, [cq-32*3]
2480    mova                 m6, [cq-32*2]
2481    mova                 m7, [cq-32*1]
2482    mova                 m8, [cq+32*0]
2483    mova                 m9, [cq+32*1]
2484    mova                m10, [cq+32*2]
2485    mova                m11, [cq+32*3]
2486    mova                m12, [cq+32*4]
2487    mova                m13, [cq+32*5]
2488    mova                m14, [cq+32*6]
2489    mova                m15, [cq+32*7]
2490    mova              [rsp], m15
2491%endmacro
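; A 16x16 pass keeps all 16 input rows live at once, which together with the
; required scratch no longer fits in the 16 ymm registers, so row 15 is parked
; at [rsp] and the transforms below juggle a few more values through the three
; 32-byte stack slots of the frame.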
2492
2493INV_TXFM_16X16_FN dct, dct
2494INV_TXFM_16X16_FN dct, adst
2495INV_TXFM_16X16_FN dct, flipadst
2496INV_TXFM_16X16_FN dct, identity
2497
2498cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2499    ITX_16X16_LOAD_COEFS
2500    call .main
2501.pass1_end:
2502    vpbroadcastd         m1, [o(pw_8192)]
2503    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
2504    vextracti128 [rsp+16*5], m8, 1
2505    mova         [rsp+16*1], xm8
2506.pass1_end2:
2507    vextracti128 [rsp+16*4], m0, 1
2508    mova         [rsp+16*0], xm0
2509    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
2510    pmulhrsw             m1, [rsp+32*1]
2511    vperm2i128           m8, m1, m9, 0x31
2512    vinserti128          m1, xm9, 1
2513    vperm2i128           m9, m2, m10, 0x31
2514    vinserti128          m2, xm10, 1
2515    vperm2i128          m10, m3, m11, 0x31
2516    vinserti128          m3, xm11, 1
2517    vperm2i128          m11, m4, m12, 0x31
2518    vinserti128          m4, xm12, 1
2519    vperm2i128          m12, m5, m13, 0x31
2520    vinserti128          m5, xm13, 1
2521    vperm2i128          m13, m6, m14, 0x31
2522    vinserti128          m6, xm14, 1
2523    vperm2i128          m14, m7, m15, 0x31
2524    vinserti128          m7, xm15, 1
2525    mova                m15, [rsp+32*2]
2526.pass1_end3:
2527    punpcklwd            m0, m9, m10
2528    punpckhwd            m9, m10
2529    punpcklwd           m10, m15, m8
2530    punpckhwd           m15, m8
2531    punpckhwd            m8, m11, m12
2532    punpcklwd           m11, m12
2533    punpckhwd           m12, m13, m14
2534    punpcklwd           m13, m14
2535    punpckhdq           m14, m11, m13
2536    punpckldq           m11, m13
2537    punpckldq           m13, m15, m9
2538    punpckhdq           m15, m9
2539    punpckldq            m9, m10, m0
2540    punpckhdq           m10, m0
2541    punpckhdq            m0, m8, m12
2542    punpckldq            m8, m12
2543    punpcklqdq          m12, m13, m8
2544    punpckhqdq          m13, m8
2545    punpcklqdq           m8, m9, m11
2546    punpckhqdq           m9, m11
2547    punpckhqdq          m11, m10, m14
2548    punpcklqdq          m10, m14
2549    punpcklqdq          m14, m15, m0
2550    punpckhqdq          m15, m0
2551    mova                 m0, [rsp]
2552    mova              [rsp], m15
2553    punpckhwd           m15, m4, m5
2554    punpcklwd            m4, m5
2555    punpckhwd            m5, m0, m1
2556    punpcklwd            m0, m1
2557    punpckhwd            m1, m6, m7
2558    punpcklwd            m6, m7
2559    punpckhwd            m7, m2, m3
2560    punpcklwd            m2, m3
2561    punpckhdq            m3, m0, m2
2562    punpckldq            m0, m2
2563    punpckldq            m2, m4, m6
2564    punpckhdq            m4, m6
2565    punpckhdq            m6, m5, m7
2566    punpckldq            m5, m7
2567    punpckldq            m7, m15, m1
2568    punpckhdq           m15, m1
2569    punpckhqdq           m1, m0, m2
2570    punpcklqdq           m0, m2
2571    punpcklqdq           m2, m3, m4
2572    punpckhqdq           m3, m4
2573    punpcklqdq           m4, m5, m7
2574    punpckhqdq           m5, m7
2575    punpckhqdq           m7, m6, m15
2576    punpcklqdq           m6, m15
2577    jmp                tx2q
2578.pass2:
2579    call .main
2580.end:
2581    vpbroadcastd         m1, [o(pw_2048)]
2582    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
2583    mova              [rsp], m6
2584.end2:
2585    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
2586    pmulhrsw             m1, [rsp+32*1]
2587    lea                  r3, [strideq*3]
2588    WRITE_16X2            0,  1,  6,  0, strideq*0, strideq*1
2589    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
2590    lea                dstq, [dstq+strideq*4]
2591    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
2592    WRITE_16X2        [rsp],  7,  0,  1, strideq*2, r3
2593.end3:
2594    pxor                 m2, m2
2595    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
2596    lea                dstq, [dstq+strideq*4]
2597    WRITE_16X2            8,  9,  0,  1, strideq*0, strideq*1
2598    WRITE_16X2           10, 11,  0,  1, strideq*2, r3
2599    REPX {mova [cq+32*x], m2},  0,  1,  2,  3,  4,  5,  6,  7
2600    lea                dstq, [dstq+strideq*4]
2601    WRITE_16X2           12, 13,  0,  1, strideq*0, strideq*1
2602    WRITE_16X2           14, 15,  0,  1, strideq*2, r3
2603    RET
2604ALIGN function_align
2605cglobal_label .main
2606    vpbroadcastd        m15, [o(pd_2048)]
2607    mova [rsp+gprsize+32*1], m1
2608    mova [rsp+gprsize+32*2], m9
2609    IDCT8_1D              0,  2,  4,  6,  8, 10, 12, 14,  1,  9, 15
2610    mova                 m1, [rsp+gprsize+32*2] ; in9
2611    mova [rsp+gprsize+32*2], m14 ; tmp7
2612    mova                 m9, [rsp+gprsize+32*1] ; in1
2613    mova [rsp+gprsize+32*1], m10 ; tmp5
2614    mova                m14, [rsp+gprsize+32*0] ; in15
2615    mova [rsp+gprsize+32*0], m6  ; tmp3
2616    IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
2617    mova                 m6, [rsp+gprsize+32*1] ; tmp5
2618    psubsw              m15, m0, m14  ; out15
2619    paddsw               m0, m14      ; out0
2620    psubsw              m14, m2, m13  ; out14
2621    paddsw               m2, m13      ; out1
2622    mova [rsp+gprsize+32*1], m2
2623    psubsw              m13, m4, m11  ; out13
2624    paddsw               m2, m4, m11  ; out2
2625    psubsw              m11, m8, m7   ; out11
2626    paddsw               m4, m8, m7   ; out4
2627    mova                 m7, [rsp+gprsize+32*2] ; tmp7
2628    psubsw              m10, m6, m5   ; out10
2629    paddsw               m5, m6       ; out5
2630    psubsw               m8, m7, m9   ; out8
2631    paddsw               m7, m9       ; out7
2632    psubsw               m9, m12, m3  ; out9
2633    paddsw               m6, m12, m3  ; out6
2634    mova                 m3, [rsp+gprsize+32*0] ; tmp3
2635    psubsw              m12, m3, m1   ; out12
2636    paddsw               m3, m1       ; out3
2637    ret
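; 16-point IDCT built from a full IDCT8 on the even rows and IDCT16_1D_ODDHALF
; on the odd rows, with the stack slots used to keep in1/in9/in15 and a few
; temporaries alive across the two halves; the adds/subs above then combine
; the halves into out0..out15.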
2638
2639INV_TXFM_16X16_FN adst, dct
2640INV_TXFM_16X16_FN adst, adst
2641INV_TXFM_16X16_FN adst, flipadst
2642
2643cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2644    ITX_16X16_LOAD_COEFS
2645    call .main
2646    call .main_pass1_end
2647    pmulhrsw             m0, m1, [cq+32*0]
2648    pmulhrsw             m2, m1, [cq+32*1]
2649    REPX   {pmulhrsw x, m1}, m4, m6, m8, m10
2650    pmulhrsw            m12, m1, [cq+32*2]
2651    pmulhrsw            m14, m1, [cq+32*3]
2652    vextracti128 [rsp+16*5], m8, 1
2653    mova         [rsp+16*1], xm8
2654    pxor                 m8, m8
2655    psubw                m1, m8, m1
2656    jmp m(idct_16x16_internal_8bpc).pass1_end2
2657ALIGN function_align
2658.pass2:
2659    call .main
2660    call .main_pass2_end
2661    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
2662    mova         [rsp+32*0], m6
2663    pxor                 m6, m6
2664    psubw                m1, m6, m1
2665    jmp m(idct_16x16_internal_8bpc).end2
2666ALIGN function_align
2667cglobal_label .main
2668    vpbroadcastd        m15, [o(pd_2048)]
2669    mova [rsp+gprsize+32*1], m0
2670    mova [rsp+gprsize+32*2], m4
2671    ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
2672    ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
2673    ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
2674    ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
2675    psubsw               m0, m2, m10  ; t10a
2676    paddsw               m2, m10      ; t2a
2677    psubsw              m10, m13, m5  ; t11a
2678    paddsw              m13, m5       ; t3a
2679    psubsw               m5, m6, m14  ; t14a
2680    paddsw               m6, m14      ; t6a
2681    psubsw              m14, m9, m1   ; t15a
2682    paddsw               m9, m1       ; t7a
2683    ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
2684    ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
2685    psubsw               m1, m10, m14 ; t14a
2686    paddsw              m10, m14      ; t10a
2687    psubsw              m14, m0, m5   ; t15a
2688    paddsw               m0, m5       ; t11a
2689    psubsw               m5, m2, m6   ; t6
2690    paddsw               m2, m6       ; t2
2691    psubsw               m6, m13, m9  ; t7
2692    paddsw              m13, m9       ; t3
2693    ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
2694    ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
2695    mova                 m9, [rsp+gprsize+32*0] ; in15
2696    mova [rsp+gprsize+32*0], m10 ; t10a
2697    mova                 m4, [rsp+gprsize+32*1] ; in0
2698    mova [rsp+gprsize+32*1], m6  ; t6a
2699    mova                 m6, [rsp+gprsize+32*2] ; in4
2700    mova [rsp+gprsize+32*2], m2  ; t2
2701    ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
2702    ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
2703    ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
2704    ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
2705    psubsw              m10, m4, m8  ; t8a
2706    paddsw               m8, m4      ; t0a
2707    psubsw               m4, m9, m7  ; t9a
2708    paddsw               m9, m7      ; t1a
2709    psubsw               m7, m6, m12 ; t12a
2710    paddsw               m6, m12     ; t4a
2711    psubsw              m12, m11, m3 ; t13a
2712    paddsw              m11, m3      ; t5a
2713    ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
2714    ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
2715    psubsw               m3, m9, m11 ; t5
2716    paddsw               m9, m11     ; t1
2717    psubsw              m11, m4, m12 ; t12a
2718    paddsw               m4, m12     ; t8a
2719    paddsw              m12, m8, m6  ; t0
2720    psubsw               m8, m6      ; t4
2721    paddsw               m6, m10, m7 ; t9a
2722    psubsw              m10, m7      ; t13a
2723    ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
2724    ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
2725    mova                 m7, [rsp+gprsize+32*0] ; t10a
2726    mova                 m2, [rsp+gprsize+32*1] ; t6a
2727    paddsw              m15, m9, m13  ; -out15
2728    psubsw               m9, m13      ;  t3a
2729    paddsw              m13, m11, m1  ; -out13
2730    psubsw              m11, m1       ;  t15a
2731    psubsw               m1, m4, m7   ;  t10
2732    paddsw               m7, m4       ; -out1
2733    psubsw               m4, m3, m2   ;  t6
2734    paddsw               m3, m2       ; -out3
2735    paddsw               m2, m10, m14 ;  out2
2736    psubsw              m10, m14      ;  t14a
2737    paddsw              m14, m6, m0   ;  out14
2738    psubsw               m6, m0       ;  t11
2739    mova                 m0, [rsp+gprsize+32*2] ; t2
2740    mova [rsp+gprsize+32*1], m7
2741    psubsw               m7, m12, m0  ;  t2a
2742    paddsw               m0, m12      ;  out0
2743    paddsw              m12, m8, m5   ;  out12
2744    psubsw               m8, m5       ;  t7
2745    ret
2746ALIGN function_align
2747.main_pass1_end:
2748    mova          [cq+32*0], m0
2749    mova          [cq+32*1], m2
2750    mova          [cq+32*2], m12
2751    mova          [cq+32*3], m14
2752    vpbroadcastd        m14, [pw_m2896_2896]
2753    vpbroadcastd        m12, [pw_2896_2896]
2754    vpbroadcastd         m2, [pd_2048]
2755    punpcklwd            m5, m11, m10
2756    punpckhwd           m11, m10
2757    pmaddwd             m10, m14, m5
2758    pmaddwd              m0, m14, m11
2759    pmaddwd              m5, m12
2760    pmaddwd             m11, m12
2761    REPX      {paddd x, m2}, m10, m0, m5, m11
2762    REPX      {psrad x, 12}, m10, m0, m5, m11
2763    packssdw            m10, m0  ;  out10
2764    packssdw             m5, m11 ; -out5
2765    punpcklwd           m11, m8, m4
2766    punpckhwd            m8, m4
2767    pmaddwd              m4, m12, m11
2768    pmaddwd              m0, m12, m8
2769    pmaddwd             m11, m14
2770    pmaddwd              m8, m14
2771    REPX      {paddd x, m2}, m4, m0, m11, m8
2772    REPX      {psrad x, 12}, m4, m0, m11, m8
2773    packssdw             m4, m0  ;  out4
2774    packssdw            m11, m8  ; -out11
2775    punpcklwd            m8, m9, m7
2776    punpckhwd            m9, m7
2777    pmaddwd              m7, m12, m8
2778    pmaddwd              m0, m12, m9
2779    pmaddwd              m8, m14
2780    pmaddwd              m9, m14
2781    REPX      {paddd x, m2}, m7, m0, m8, m9
2782    REPX      {psrad x, 12}, m7, m0, m8, m9
2783    packssdw             m7, m0  ; -out7
2784    packssdw             m8, m9  ;  out8
2785    punpckhwd            m0, m6, m1
2786    punpcklwd            m6, m1
2787    pmaddwd              m1, m14, m0
2788    pmaddwd              m9, m14, m6
2789    pmaddwd              m0, m12
2790    pmaddwd              m6, m12
2791    REPX      {paddd x, m2}, m1, m9, m0, m6
2792    REPX      {psrad x, 12}, m1, m9, m0, m6
2793    packssdw             m9, m1  ; -out9
2794    packssdw             m6, m0  ;  out6
2795    vpbroadcastd         m1, [o(pw_8192)]
2796    ret
2797ALIGN function_align
2798cglobal_label .main_pass2_end
2799    ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
2800    ; 16-bit here will produce the same result as using 32-bit intermediates.
2801    paddsw               m5, m10, m11 ; -out5
2802    psubsw              m10, m11      ;  out10
2803    psubsw              m11, m4, m8   ; -out11
2804    paddsw               m4, m8       ;  out4
2805    psubsw               m8, m7, m9   ;  out8
2806    paddsw               m7, m9       ; -out7
2807    psubsw               m9, m1, m6   ; -out9
2808    paddsw               m6, m1       ;  out6
2809    vpbroadcastd         m1, [o(pw_2896x8)]
2810    REPX   {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
2811    vpbroadcastd         m1, [o(pw_2048)]
2812    ret
2813
2814INV_TXFM_16X16_FN flipadst, dct
2815INV_TXFM_16X16_FN flipadst, adst
2816INV_TXFM_16X16_FN flipadst, flipadst
2817
2818cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2819    ITX_16X16_LOAD_COEFS
2820    call m(iadst_16x16_internal_8bpc).main
2821    call m(iadst_16x16_internal_8bpc).main_pass1_end
2822    pmulhrsw             m6, m1
2823    pmulhrsw             m2, m1, m8
2824    mova         [rsp+32*2], m6
2825    pmulhrsw             m6, m1, m4
2826    pmulhrsw             m4, m1, m10
2827    pmulhrsw             m8, m1, [cq+32*3]
2828    pmulhrsw            m10, m1, [cq+32*2]
2829    pmulhrsw            m12, m1, [cq+32*1]
2830    pmulhrsw            m14, m1, [cq+32*0]
2831    pxor                 m0, m0
2832    psubw                m0, m1
2833    REPX   {pmulhrsw x, m0}, m3, m5, m7, m11, m15
2834    pmulhrsw             m1, m0, m9
2835    pmulhrsw             m9, m0, m13
2836    pmulhrsw             m0, [rsp+32*1]
2837    mova         [rsp+16*0], xm15
2838    mova         [rsp+16*1], xm7
2839    vperm2i128          m15, m15, m7, 0x31
2840    vinserti128          m7, m2, xm14, 1
2841    vperm2i128          m14, m2, m14, 0x31
2842    vinserti128          m2, m9, xm5, 1
2843    vperm2i128           m9, m9, m5, 0x31
2844    vinserti128          m5, m4, xm12, 1
2845    vperm2i128          m12, m4, m12, 0x31
2846    vinserti128          m4, m11, xm3, 1
2847    vperm2i128          m11, m11, m3, 0x31
2848    vinserti128          m3, m10, xm6, 1
2849    vperm2i128          m10, m10, m6, 0x31
2850    vinserti128          m6, m1, xm0, 1
2851    vperm2i128          m13, m1, m0, 0x31
2852    vinserti128          m1, m8, [rsp+32*2], 1
2853    vperm2i128           m8, m8, [rsp+32*2], 0x31
2854    jmp m(idct_16x16_internal_8bpc).pass1_end3
2855.pass2:
2856    call m(iadst_16x16_internal_8bpc).main
2857    call m(iadst_16x16_internal_8bpc).main_pass2_end
2858    pmulhrsw             m0, m1
2859    pmulhrsw             m8, m1
2860    mova         [rsp+32*0], m0
2861    mova         [rsp+32*2], m8
2862    pxor                 m0, m0
2863    psubw                m0, m1
2864    pmulhrsw             m8, m0, m7
2865    pmulhrsw             m7, m0, m9
2866    pmulhrsw             m9, m1, m6
2867    pmulhrsw             m6, m1, m10
2868    pmulhrsw            m10, m0, m5
2869    pmulhrsw             m5, m0, m11
2870    pmulhrsw            m11, m1, m4
2871    pmulhrsw             m4, m1, m12
2872    pmulhrsw            m12, m0, m3
2873    pmulhrsw             m3, m0, m13
2874    pmulhrsw            m13, m1, m2
2875    pmulhrsw             m1, m14
2876    pmulhrsw            m14, m0, [rsp+32*1]
2877    pmulhrsw             m0, m15
2878    lea                  r3, [strideq*3]
2879    WRITE_16X2            0,  1,  2,  0, strideq*0, strideq*1
2880    mova                m15, [rsp+32*0]
2881    WRITE_16X2            3,  4,  0,  1, strideq*2, r3
2882    lea                dstq, [dstq+strideq*4]
2883    WRITE_16X2            5,  6,  0,  1, strideq*0, strideq*1
2884    WRITE_16X2            7, [rsp+32*2],  0,  1, strideq*2, r3
2885    jmp m(idct_16x16_internal_8bpc).end3
2886
2887%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
2888    pmulhrsw            m%2, m%3, m%1
2889    psraw               m%2, 1
2890    pavgw               m%1, m%2 ; signs are guaranteed to be equal
2891%endmacro
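; IDTX16B scales by ~(1 + 1697/4096)/2 ~= 1/sqrt(2), i.e. the 2*sqrt(2)
; identity-16 scale with the 1/4 inter-pass scale already folded in, which
; presumably lets the pass-1 code below skip the pw_8192 multiply and jump
; straight to pass1_end3.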
2892
2893INV_TXFM_16X16_FN identity, dct
2894INV_TXFM_16X16_FN identity, identity
2895
2896cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2897    vpbroadcastd         m7, [o(pw_1697x16)]
2898    mova                xm0, [cq+16* 0]
2899    vinserti128          m0, [cq+16*16], 1
2900    mova               xm15, [cq+16* 1]
2901    vinserti128         m15, [cq+16*17], 1
2902    mova                xm1, [cq+16* 2]
2903    vinserti128          m1, [cq+16*18], 1
2904    mova                xm8, [cq+16* 3]
2905    vinserti128          m8, [cq+16*19], 1
2906    mova                xm2, [cq+16* 4]
2907    vinserti128          m2, [cq+16*20], 1
2908    mova                xm9, [cq+16* 5]
2909    vinserti128          m9, [cq+16*21], 1
2910    mova                xm3, [cq+16* 6]
2911    vinserti128          m3, [cq+16*22], 1
2912    mova               xm10, [cq+16* 7]
2913    add                  cq, 16*16
2914    vinserti128         m10, [cq+16* 7], 1
2915    mova                xm4, [cq-16* 8]
2916    vinserti128          m4, [cq+16* 8], 1
2917    mova               xm11, [cq-16* 7]
2918    vinserti128         m11, [cq+16* 9], 1
2919    mova                xm5, [cq-16* 6]
2920    vinserti128          m5, [cq+16*10], 1
2921    mova               xm12, [cq-16* 5]
2922    vinserti128         m12, [cq+16*11], 1
2923    mova               xm13, [cq-16* 3]
2924    vinserti128         m13, [cq+16*13], 1
2925    mova               xm14, [cq-16* 1]
2926    vinserti128         m14, [cq+16*15], 1
2927    REPX  {IDTX16B x, 6, 7},  0, 15,  1,  8,  2,  9,  3, \
2928                             10,  4, 11,  5, 12, 13, 14
2929    mova                xm6, [cq-16* 4]
2930    vinserti128          m6, [cq+16*12], 1
2931    mova              [rsp], m0
2932    IDTX16B               6, 0, 7
2933    mova                xm0, [cq-16* 2]
2934    vinserti128          m0, [cq+16*14], 1
2935    pmulhrsw             m7, m0
2936    psraw                m7, 1
2937    pavgw                m7, m0
2938    jmp m(idct_16x16_internal_8bpc).pass1_end3
2939ALIGN function_align
2940.pass2:
2941    vpbroadcastd        m15, [o(pw_1697x16)]
2942    mova         [rsp+32*1], m0
2943    REPX  {IDTX16 x, 0, 15},  1,  2,  3,  4,  5,  6,  7, \
2944                              8,  9, 10, 11, 12, 13, 14
2945    mova                 m0, [rsp+32*1]
2946    mova         [rsp+32*1], m1
2947    IDTX16                0, 1, 15
2948    mova                 m1, [rsp+32*0]
2949    pmulhrsw            m15, m1
2950    paddsw               m1, m1
2951    paddsw              m15, m1
2952    jmp m(idct_16x16_internal_8bpc).end
2953
2954%define o_base deint_shuf + 128
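; (moving the o() base window; presumably chosen so the constants referenced
;  by the following 8x32/32x8 code stay within disp8 range of r6)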
2955
2956%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
2957%if %3
2958    vpbroadcastd        m15, [o(pw_2896x8)]
2959    pmulhrsw             m0, m15, [%1+%2*0]
2960    pmulhrsw             m1, m15, [%1+%2*1]
2961    pmulhrsw             m2, m15, [%1+%2*2]
2962    pmulhrsw             m3, m15, [%1+%2*3]
2963    pmulhrsw             m4, m15, [%1+%2*4]
2964    pmulhrsw             m5, m15, [%1+%2*5]
2965    pmulhrsw             m6, m15, [%1+%2*6]
2966    pmulhrsw             m7, m15, [%1+%2*7]
2967%else
2968    mova                 m0, [%1+%2*0]
2969    mova                 m1, [%1+%2*1]
2970    mova                 m2, [%1+%2*2]
2971    mova                 m3, [%1+%2*3]
2972    mova                 m4, [%1+%2*4]
2973    mova                 m5, [%1+%2*5]
2974    mova                 m6, [%1+%2*6]
2975    mova                 m7, [%1+%2*7]
2976%endif
2977%endmacro
2978
2979%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
2980%if %3
2981%if %3 == 1
2982    vpbroadcastd        m15, [o(pw_2896x8)]
2983%endif
2984    pmulhrsw             m8, m15, [%1+%2*0]
2985    pmulhrsw             m9, m15, [%1+%2*1]
2986    pmulhrsw            m10, m15, [%1+%2*2]
2987    pmulhrsw            m11, m15, [%1+%2*3]
2988    pmulhrsw            m12, m15, [%1+%2*4]
2989    pmulhrsw            m13, m15, [%1+%2*5]
2990    pmulhrsw            m14, m15, [%1+%2*6]
2991    pmulhrsw            m15,      [%1+%2*7]
2992%else
2993    mova                 m8, [%1+%2*0]
2994    mova                 m9, [%1+%2*1]
2995    mova                m10, [%1+%2*2]
2996    mova                m11, [%1+%2*3]
2997    mova                m12, [%1+%2*4]
2998    mova                m13, [%1+%2*5]
2999    mova                m14, [%1+%2*6]
3000    mova                m15, [%1+%2*7]
3001%endif
3002%endmacro
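; is_rect2 pre-multiplies each row by 2896/4096 (pmulhrsw with pw_2896x8),
; the scale applied to rectangular transforms. LOAD_8ROWS_H fills m8-m15;
; passing is_rect2 > 1 appears intended to skip the pw_2896x8 broadcast when
; m15 already holds the constant from a preceding LOAD_8ROWS.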
3003
3004%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
3005    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
3006    punpcklwd           m%1, m%2, m%2
3007    pmulhrsw            m%1, m%3
3008    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
3009    punpckhwd           m%2, m%2
3010    pmulhrsw            m%2, m%3
3011%endmacro
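; Duplicates the low words of each lane of the source into dst1 and the high
; words into dst2, then multiplies them by two packed coefficient pairs
; addressed relative to pw_201_4091x8 (r5); used by .main_fast below, where
; the zero bottom half reduces each first-stage rotation to plain multiplies.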
3012
3013cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
3014    lea                  r6, [o_base]
3015    test               eobd, eobd
3016    jz .dconly
3017    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
3018    %undef cmp
3019    cmp                eobd, 106
3020    jle .fast
3021    LOAD_8ROWS      cq+32*1, 32*2
3022    call m(idct_16x8_internal_8bpc).main
3023    vperm2i128          m11, m0, m4, 0x31
3024    vinserti128          m0, xm4, 1
3025    vperm2i128           m4, m1, m5, 0x31
3026    vinserti128          m1, xm5, 1
3027    vperm2i128           m5, m2, m6, 0x31
3028    vinserti128          m2, xm6, 1
3029    vperm2i128           m6, m3, m7, 0x31
3030    vinserti128          m3, xm7, 1
3031    pxor                 m7, m7
3032    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
3033    punpckhwd            m7, m0, m1
3034    punpcklwd            m0, m1
3035    punpckhwd            m1, m2, m3
3036    punpcklwd            m2, m3
3037    punpcklwd            m3, m11, m4
3038    punpckhwd           m11, m4
3039    punpckhwd            m4, m5, m6
3040    punpcklwd            m5, m6
3041    punpckhdq            m6, m0, m2
3042    punpckldq            m0, m2
3043    punpckldq            m2, m3, m5
3044    punpckhdq            m3, m5
3045    punpckhdq            m5, m11, m4
3046    punpckldq           m11, m4
3047    punpckldq            m4, m7, m1
3048    punpckhdq            m7, m1
3049    punpckhqdq          m12, m6, m0
3050    punpcklqdq           m0, m6     ; out4
3051    punpckhqdq          m13, m7, m4
3052    punpcklqdq           m4, m7     ; out5
3053    punpckhqdq          m14, m3, m2
3054    punpcklqdq           m2, m3     ; out6
3055    punpckhqdq          m15, m5, m11
3056    punpcklqdq          m11, m5     ; out7
3057    mova         [rsp+32*0], m0
3058    mova         [rsp+32*1], m4
3059    mova         [rsp+32*2], m2
3060.fast:
3061    LOAD_8ROWS      cq+32*0, 32*2
3062    call m(idct_16x8_internal_8bpc).main
3063    vperm2i128           m8, m0, m4, 0x31
3064    vinserti128          m0, xm4, 1
3065    vperm2i128           m4, m1, m5, 0x31
3066    vinserti128          m1, xm5, 1
3067    vperm2i128           m5, m2, m6, 0x31
3068    vinserti128          m2, xm6, 1
3069    vperm2i128           m6, m3, m7, 0x31
3070    vinserti128          m3, xm7, 1
3071    vpbroadcastd         m9, [o(pw_8192)]
3072    pxor                 m7, m7
3073    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
3074    punpckhwd            m7, m0, m1
3075    punpcklwd            m0, m1
3076    punpckhwd            m1, m2, m3
3077    punpcklwd            m2, m3
3078    punpckhwd            m3, m8, m4
3079    punpcklwd            m8, m4
3080    punpckhwd            m4, m5, m6
3081    punpcklwd            m5, m6
3082    punpckhdq            m6, m0, m2
3083    punpckldq            m0, m2
3084    punpckldq            m2, m8, m5
3085    punpckhdq            m8, m5
3086    punpckhdq            m5, m3, m4
3087    punpckldq            m3, m4
3088    punpckhdq            m4, m7, m1
3089    punpckldq            m7, m1
3090    punpcklqdq           m1, m7, m4
3091    punpckhqdq           m7, m4     ; out9
3092    punpckhqdq           m4, m2, m8 ; out10
3093    punpcklqdq           m2, m8
3094    punpckhqdq           m8, m3, m5
3095    punpcklqdq           m3, m5
3096    punpckhqdq           m5, m0, m6 ; out8
3097    punpcklqdq           m0, m6
3098    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
3099    cmp                eobd, 106
3100    jg .full
3101    mova         [rsp+32*0], m5
3102    mova         [rsp+32*1], m7
3103    mova         [rsp+32*2], m4
3104    pmulhrsw            m11, m9, m8
3105    pxor                 m4, m4
3106    REPX       {mova x, m4}, m5, m6, m7
3107    call .main_fast
3108    jmp .pass2
3109.dconly:
3110    movd                xm1, [o(pw_2896x8)]
3111    pmulhrsw            xm0, xm1, [cq]
3112    movd                xm2, [o(pw_8192)]
3113    mov                [cq], eobd
3114    or                  r3d, 32
3115    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
3116.full:
3117    REPX   {pmulhrsw x, m9}, m12, m13, m14, m15
3118    pmulhrsw             m6, m9, [rsp+32*2]
3119    mova         [rsp+32*2], m4
3120    pmulhrsw             m4, m9, [rsp+32*0]
3121    mova         [rsp+32*0], m5
3122    pmulhrsw             m5, m9, [rsp+32*1]
3123    mova         [rsp+32*1], m7
3124    pmulhrsw             m7, m9, m11
3125    pmulhrsw            m11, m9, m8
3126    call .main
3127.pass2:
3128    vpbroadcastd        m12, [o(pw_2048)]
3129    REPX  {pmulhrsw x, m12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3130                             m8,  m9,  m10, m11,      m13, m14, m15
3131    pmulhrsw            m12, [rsp]
3132    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
3133    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
3134    mova         [rsp+32*0], m4
3135    mova         [rsp+32*1], m6
3136    lea                  r3, [strideq*3]
3137    WRITE_8X4             0,  1,  4,  6
3138    lea                dstq, [dstq+strideq*4]
3139    WRITE_8X4             2,  3,  4,  6
3140    lea                dstq, [dstq+strideq*4]
3141    WRITE_8X4    [rsp+32*0],  5,  4,  6
3142    lea                dstq, [dstq+strideq*4]
3143    WRITE_8X4    [rsp+32*1],  7,  4,  6
3144    lea                dstq, [dstq+strideq*4]
3145    WRITE_8X4             8,  9,  4,  6
3146    lea                dstq, [dstq+strideq*4]
3147    WRITE_8X4            10, 11,  4,  6
3148    lea                dstq, [dstq+strideq*4]
3149    WRITE_8X4            12, 13,  4,  6
3150    lea                dstq, [dstq+strideq*4]
3151    WRITE_8X4            14, 15,  4,  6
3152    RET
3153ALIGN function_align
3154cglobal_label .main_fast ; bottom half is zero
3155    call m(idct_8x16_internal_8bpc).main
3156    mova                 m8, [rsp+gprsize+0*32]
3157    mova [rsp+gprsize+0*32], m0
3158    mova                 m9, [rsp+gprsize+1*32]
3159    mova [rsp+gprsize+1*32], m1
3160    mova                 m0, [rsp+gprsize+2*32]
3161    mova [rsp+gprsize+2*32], m6
3162    lea                  r5, [r6-(o_base)+pw_201_4091x8]
3163    ITX_UNPACK_MULHRSW    1,  8,  6,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
3164    ITX_UNPACK_MULHRSW   15,  9,  6,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
3165    ITX_UNPACK_MULHRSW   14,  0,  6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
3166    ITX_UNPACK_MULHRSW   13, 11,  6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
3167    jmp .main2
3168ALIGN function_align
3169cglobal_label .main
3170    call m(idct_8x16_internal_8bpc).main
3171    mova                 m8, [rsp+gprsize+0*32]
3172    mova [rsp+gprsize+0*32], m0
3173    mova                 m9, [rsp+gprsize+1*32]
3174    mova [rsp+gprsize+1*32], m1
3175    mova                 m0, [rsp+gprsize+2*32]
3176    mova [rsp+gprsize+2*32], m6
3177    punpcklwd            m1, m15, m8  ; in31 in1
3178    punpckhwd            m8, m15      ; in3  in29
3179    punpcklwd           m15, m14, m9  ; in27 in5
3180    punpckhwd            m9, m14      ; in7  in25
3181    punpcklwd           m14, m13, m0  ; in23 in9
3182    punpckhwd            m0, m13      ; in11 in21
3183    punpcklwd           m13, m12, m11 ; in19 in13
3184    punpckhwd           m11, m12      ; in15 in17
3185    ITX_MUL2X_PACK        1,  6, 12, 10,  201, 4091, 3 ; t16a, t31a
3186    ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
3187    ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
3188    ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
3189    ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
3190    ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
3191    ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
3192    ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
3193.main2:
3194    psubsw               m6, m1, m11  ; t17 t30
3195    paddsw               m1, m11      ; t16 t31
3196    psubsw              m11, m9, m14  ; t18 t29
3197    paddsw               m9, m14      ; t19 t28
3198    psubsw              m14, m15, m0  ; t21 t26
3199    paddsw              m15, m0       ; t20 t27
3200    psubsw               m0, m8, m13  ; t22 t25
3201    paddsw               m8, m13      ; t23 t24
3202    ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
3203    ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
3204    ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
3205    ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
3206    psubsw              m13, m1, m9   ; t19a t28a
3207    paddsw               m1, m9       ; t16a t31a
3208    psubsw               m9, m8, m15  ; t20a t27a
3209    paddsw               m8, m15      ; t23a t24a
3210    psubsw              m15, m6, m11  ; t18  t29
3211    paddsw               m6, m11      ; t17  t30
3212    psubsw              m11, m0, m14  ; t21  t26
3213    paddsw               m0, m14      ; t22  t25
3214    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
3215    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
3216    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
3217    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
3218    vbroadcasti128      m12, [o(deint_shuf)]
3219    psubsw              m14, m1, m8   ; t23  t24
3220    paddsw               m1, m8       ; t16  t31
3221    psubsw               m8, m6, m0   ; t22a t25a
3222    paddsw               m6, m0       ; t17a t30a
3223    psubsw               m0, m15, m11 ; t21  t26
3224    paddsw              m15, m11      ; t18  t29
3225    psubsw              m11, m13, m9  ; t20a t27a
3226    paddsw              m13, m9       ; t19a t28a
3227    REPX    {pshufb x, m12}, m1, m6, m15, m13
3228    ITX_MUL2X_PACK       14,  9, 12, 10, 2896, 2896 ; t24a t23a
3229    vpbroadcastd         m9, [o(pw_m2896_2896)]
3230    ITX_MUL2X_PACK        8, 12,  _, 10, 12,  9, 4  ; t22  t25
3231    vpbroadcastd        m12, [o(pw_2896_2896)]
3232    ITX_MUL2X_PACK        0, 12,  _, 10, 12,  9, 4  ; t21a t26a
3233    vpbroadcastd        m12, [o(pw_2896_2896)]
3234    ITX_MUL2X_PACK       11,  9,  _, 10,  9, 12, 4  ; t27  t20
3235    shufps               m9, m14, m8, q1032 ; t23a t22
3236    vpblendd            m14, m8, 0xcc       ; t24a t25
3237    shufps               m8, m11, m0, q1032 ; t20  t21a
3238    vpblendd            m11, m0, 0xcc       ; t27  t26a
3239    punpcklqdq           m0, m1, m6   ; t16  t17a
3240    punpckhqdq           m1, m6       ; t31  t30a
3241    psubsw              m10, m5, m8   ; out20 out21
3242    paddsw               m5, m8       ; out11 out10
3243    psubsw               m6, m3, m14  ; out24 out25
3244    paddsw               m3, m14      ; out7  out6
3245    psubsw               m8, m7, m0   ; out16 out17
3246    paddsw               m7, m0       ; out15 out14
3247    mova                 m0, [rsp+gprsize+0*32]
3248    punpcklqdq          m12, m13, m15 ; t19a t18
3249    punpckhqdq          m13, m15      ; t28a t29
3250    psubsw              m15, m0, m1   ; out31 out30
3251    paddsw               m0, m1       ; out0  out1
3252    mova                 m1, [rsp+gprsize+1*32]
3253    mova [rsp+gprsize+0*32], m6
3254    mova                 m6, [rsp+gprsize+2*32]
3255    psubsw              m14, m1, m13  ; out28 out29
3256    paddsw               m1, m13      ; out3  out2
3257    psubsw              m13, m2, m11  ; out27 out26
3258    paddsw               m2, m11      ; out4  out5
3259    psubsw              m11, m4, m9   ; out23 out22
3260    paddsw               m4, m9       ; out8  out9
3261    psubsw               m9, m6, m12  ; out19 out18
3262    paddsw               m6, m12      ; out12 out13
3263    ret
3264
3265%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
3266    vbroadcasti128      m%1, [cq+16*%3]
3267    vbroadcasti128      m%2, [cq+16*%4]
3268    shufpd              m%1, m%2, 0x0c
3269%endmacro
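; Loads two 8-coefficient rows and interleaves them per qword: words 0-3 of
; row %3 and of row %4 go to the low 128-bit lane, words 4-7 to the high
; lane (shufpd 0x0c on the two broadcast rows).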
3270
3271cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
3272    lea                  r6, [o_base]
3273    test               eobd, eobd
3274    jnz .normal
3275    movd                xm1, [o(pw_2896x8)]
3276    pmulhrsw            xm0, xm1, [cq]
3277    movd                xm2, [o(pw_8192)]
3278    mov                [cq], eobd
3279    or                  r3d, 8
3280.dconly:
3281    pmulhrsw            xm0, xm2
3282    movd                xm2, [pw_2048] ; intentionally rip-relative
3283    pmulhrsw            xm0, xm1
3284    pmulhrsw            xm0, xm2
3285    vpbroadcastw         m0, xm0
3286    pxor                 m3, m3
3287.dconly_loop:
3288    mova                 m1, [dstq]
3289    punpckhbw            m2, m1, m3
3290    punpcklbw            m1, m3
3291    paddw                m2, m0
3292    paddw                m1, m0
3293    packuswb             m1, m2
3294    mova             [dstq], m1
3295    add                dstq, strideq
3296    dec                 r3d
3297    jg .dconly_loop
3298    RET
3299.normal:
3300    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
3301    %undef cmp
3302    LOAD_PACKED_16X2      0,  7,  0,  2 ; in0  in2
3303    LOAD_PACKED_16X2      4,  7,  1,  3 ; in1  in3
3304    LOAD_PACKED_16X2      1,  7,  4,  6 ; in4  in6
3305    LOAD_PACKED_16X2      5,  7,  5,  7 ; in5  in7
3306    pxor                 m8, m8
3307    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
3308    add                  cq, 16*16
3309    LOAD_PACKED_16X2      2,  7, -8, -6 ; in8  in10
3310    LOAD_PACKED_16X2      6,  7, -7, -5 ; in9  in11
3311    LOAD_PACKED_16X2      3,  7, -4, -2 ; in12 in14
3312    LOAD_PACKED_16X2     11,  7, -3, -1 ; in13 in15
3313    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
3314    mova         [rsp+32*0], m4
3315    mova         [rsp+32*1], m5
3316    mova         [rsp+32*2], m6
3317    cmp                eobd, 106
3318    jg .full
3319    pxor                 m4, m4
3320    REPX       {mova x, m4}, m5, m6, m7
3321    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
3322    jmp .pass2
3323.full:
3324    LOAD_PACKED_16X2      4,  7,  0,  2 ; in16 in18
3325    LOAD_PACKED_16X2     12,  7,  3,  1 ; in19 in17
3326    LOAD_PACKED_16X2      5,  7,  4,  6 ; in20 in22
3327    LOAD_PACKED_16X2     13,  7,  7,  5 ; in23 in21
3328    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
3329    add                  cq, 16*8
3330    LOAD_PACKED_16X2      6,  7,  0,  2 ; in24 in26
3331    LOAD_PACKED_16X2     14,  7,  3,  1 ; in27 in25
3332    LOAD_PACKED_16X2      7,  8,  4,  6 ; in28 in30
3333    LOAD_PACKED_16X2     15,  8,  7,  5 ; in31 in29
3334    pxor                 m8, m8
3335    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
3336    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
3337.pass2:
3338    vpbroadcastd        m12, [o(pw_8192)]
3339    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
3340    mova         [rsp+32*1], m9
3341    mova         [rsp+32*2], m10
3342    punpckhwd            m9, m0, m2
3343    punpcklwd            m0, m2
3344    punpckhwd            m2, m1, m3
3345    punpcklwd            m1, m3
3346    punpcklwd           m10, m4, m6
3347    punpckhwd            m4, m6
3348    punpcklwd            m6, m5, m7
3349    punpckhwd            m5, m7
3350    punpckhwd            m3, m0, m9
3351    punpcklwd            m0, m9
3352    punpckhwd            m9, m2, m1
3353    punpcklwd            m2, m1
3354    punpcklwd            m7, m10, m4
3355    punpckhwd           m10, m4
3356    punpcklwd            m4, m5, m6
3357    punpckhwd            m5, m6
3358    punpckhdq            m1, m0, m2
3359    punpckldq            m0, m2
3360    punpckldq            m2, m3, m9
3361    punpckhdq            m3, m9
3362    punpckldq            m6, m7, m4
3363    punpckhdq            m7, m4
3364    punpckldq            m9, m10, m5
3365    punpckhdq           m10, m5
3366    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
3367    pmulhrsw            m12, [rsp+32*0]
3368    mova         [rsp+32*0], m8
3369    vperm2i128           m4, m0, m6, 0x31
3370    vinserti128          m0, xm6, 1
3371    vperm2i128           m5, m1, m7, 0x31
3372    vinserti128          m1, xm7, 1
3373    vperm2i128           m6, m2, m9, 0x31
3374    vinserti128          m2, xm9, 1
3375    vperm2i128           m7, m3, m10, 0x31
3376    vinserti128          m3, xm10, 1
3377    call m(idct_16x8_internal_8bpc).main
3378    vpbroadcastd         m8, [o(pw_2048)]
3379    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
3380    lea                  r2, [strideq*3]
3381    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
3382    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
3383    lea                  r3, [dstq+strideq*4]
3384    %define dstq r3
3385    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
3386    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
3387    mova                 m0, [rsp+32*0]
3388    mova                 m1, [rsp+32*1]
3389    mova                 m2, [rsp+32*2]
3390    punpckhwd            m7, m0, m2
3391    punpcklwd            m0, m2
3392    punpckhwd            m2, m1, m11
3393    punpcklwd            m1, m11
3394    punpckhwd            m4, m12, m14
3395    punpcklwd           m12, m14
3396    punpckhwd            m5, m13, m15
3397    punpcklwd           m13, m15
3398    punpckhwd            m3, m0, m7
3399    punpcklwd            m0, m7
3400    punpckhwd            m9, m2, m1
3401    punpcklwd            m2, m1
3402    punpcklwd            m7, m12, m4
3403    punpckhwd           m12, m4
3404    punpcklwd            m4, m5, m13
3405    punpckhwd            m5, m13
3406    punpckhdq            m1, m0, m2
3407    punpckldq            m0, m2
3408    punpckldq            m2, m3, m9
3409    punpckhdq            m3, m9
3410    punpckldq            m6, m7, m4
3411    punpckhdq            m7, m4
3412    punpckldq            m9, m12, m5
3413    punpckhdq           m12, m5
3414    vperm2i128           m4, m0, m6, 0x31
3415    vinserti128          m0, xm6, 1
3416    vperm2i128           m5, m1, m7, 0x31
3417    vinserti128          m1, xm7, 1
3418    vperm2i128           m6, m2, m9, 0x31
3419    vinserti128          m2, xm9, 1
3420    vperm2i128           m7, m3, m12, 0x31
3421    vinserti128          m3, xm12, 1
3422    call m(idct_16x8_internal_8bpc).main2
3423    vpbroadcastd         m8, [o(pw_2048)]
3424    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
3425    add                  r0, 16
3426    add                  r3, 16
3427    %define dstq r0
3428    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
3429    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
3430    %define dstq r3
3431    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
3432    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
3433    RET
3434
3435cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
3436    vpbroadcastd         m9, [pw_5]
3437    lea                  r4, [strideq*3]
3438    sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
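    ; (implemented by the add eobd, 0x80000000 / jnc pair at the bottom of
    ;  .loop: the add carries only when the running value already has its
    ;  sign bit set, so the loop exits after one pass when eob < 107 and
    ;  after the sign flip on the second pass otherwise)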
3439.loop:
3440    mova                xm0, [cq+16* 0]
3441    mova                xm1, [cq+16* 4]
3442    vinserti128          m0, [cq+16* 1], 1
3443    vinserti128          m1, [cq+16* 5], 1
3444    pxor                 m8, m8
3445    mova          [cq+32*0], m8
3446    mova          [cq+32*2], m8
3447    add                  cq, 16*16
3448    mova                xm2, [cq-16* 8]
3449    mova                xm3, [cq-16* 4]
3450    vinserti128          m2, [cq-16* 7], 1
3451    vinserti128          m3, [cq-16* 3], 1
3452    mova                xm4, [cq+16* 0]
3453    mova                xm5, [cq+16* 4]
3454    vinserti128          m4, [cq+16* 1], 1
3455    vinserti128          m5, [cq+16* 5], 1
3456    mova                xm6, [cq+16* 8]
3457    mova                xm7, [cq+16*12]
3458    vinserti128          m6, [cq+16* 9], 1
3459    vinserti128          m7, [cq+16*13], 1
3460    REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
3461    REPX  {paddsw    x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
3462    call .transpose8x8
3463    REPX  {psraw     x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
3464    WRITE_8X4             0,  4,  8, 10, strideq*8, strideq*4, r4*4
3465    add                dstq, strideq
3466    WRITE_8X4             1,  5,  0,  4, strideq*8, strideq*4, r4*4
3467    add                dstq, strideq
3468    WRITE_8X4             2,  6,  0,  4, strideq*8, strideq*4, r4*4
3469    add                dstq, strideq
3470    WRITE_8X4             3,  7,  0,  4, strideq*8, strideq*4, r4*4
3471    add                dstq, strideq
3472    sub                  cq, 16*16-32
3473    lea                dstq, [dstq+r4*4]
3474    add                eobd, 0x80000000
3475    jnc .loop
3476    RET
3477ALIGN function_align
3478.transpose8x8:
3479    punpckhwd            m8, m4, m5
3480    punpcklwd            m4, m5
3481    punpckhwd            m5, m0, m1
3482    punpcklwd            m0, m1
3483    punpckhwd            m1, m6, m7
3484    punpcklwd            m6, m7
3485    punpckhwd            m7, m2, m3
3486    punpcklwd            m2, m3
3487    punpckhdq            m3, m0, m2
3488    punpckldq            m0, m2
3489    punpckldq            m2, m4, m6
3490    punpckhdq            m4, m6
3491    punpckhdq            m6, m5, m7
3492    punpckldq            m5, m7
3493    punpckldq            m7, m8, m1
3494    punpckhdq            m8, m1
3495    punpckhqdq           m1, m0, m2
3496    punpcklqdq           m0, m2
3497    punpcklqdq           m2, m3, m4
3498    punpckhqdq           m3, m4
3499    punpcklqdq           m4, m5, m7
3500    punpckhqdq           m5, m7
3501    punpckhqdq           m7, m6, m8
3502    punpcklqdq           m6, m8
3503    ret
3504
3505cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
3506    add                  cq, 16*8
3507    vpbroadcastd         m9, [pw_4096]
3508    lea                  r4, [strideq*3]
3509    lea                  r5, [dstq+strideq*4]
3510    sub                eobd, 107
3511.loop:
3512    mova                xm0, [cq-16*8]
3513    mova                xm1, [cq-16*7]
3514    vinserti128          m0, [cq+16*0], 1
3515    vinserti128          m1, [cq+16*1], 1
3516    mova                xm2, [cq-16*6]
3517    mova                xm3, [cq-16*5]
3518    vinserti128          m2, [cq+16*2], 1
3519    vinserti128          m3, [cq+16*3], 1
3520    mova                xm4, [cq-16*4]
3521    mova                xm5, [cq-16*3]
3522    vinserti128          m4, [cq+16*4], 1
3523    vinserti128          m5, [cq+16*5], 1
3524    mova                xm6, [cq-16*2]
3525    mova                xm7, [cq-16*1]
3526    vinserti128          m6, [cq+16*6], 1
3527    vinserti128          m7, [cq+16*7], 1
3528    pxor                 m8, m8
3529    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
3530    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
3531    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
3532    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
3533    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
3534    %define dstq r5
3535    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
3536    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
3537    add                  cq, 16*16
3538    add                  r0, 16
3539    add                  r5, 16
3540    add                eobd, 0x80000000
3541    jnc .loop
3542    RET
3543
3544%define o_base pw_5 + 128
3545
3546%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
3547%if %3
3548    vpbroadcastd        m15, [o(pw_2896x8)]
3549    pmulhrsw             m0, m15, [%1+%2* 0]
3550    pmulhrsw             m1, m15, [%1+%2* 1]
3551    pmulhrsw             m2, m15, [%1+%2* 2]
3552    pmulhrsw             m3, m15, [%1+%2* 3]
3553    pmulhrsw             m4, m15, [%1+%2* 4]
3554    pmulhrsw             m5, m15, [%1+%2* 5]
3555    pmulhrsw             m6, m15, [%1+%2* 6]
3556    pmulhrsw             m7, m15, [%1+%2* 7]
3557    pmulhrsw             m8, m15, [%1+%2* 8]
3558    pmulhrsw             m9, m15, [%1+%2* 9]
3559    pmulhrsw            m10, m15, [%1+%2*10]
3560    pmulhrsw            m11, m15, [%1+%2*11]
3561    pmulhrsw            m12, m15, [%1+%2*12]
3562    pmulhrsw            m13, m15, [%1+%2*13]
3563    pmulhrsw            m14, m15, [%1+%2*14]
3564    pmulhrsw            m15,      [%1+%2*15]
3565%else
3566    mova                 m0, [%1+%2* 0]
3567    mova                 m1, [%1+%2* 1]
3568    mova                 m2, [%1+%2* 2]
3569    mova                 m3, [%1+%2* 3]
3570    mova                 m4, [%1+%2* 4]
3571    mova                 m5, [%1+%2* 5]
3572    mova                 m6, [%1+%2* 6]
3573    mova                 m7, [%1+%2* 7]
3574    mova                 m8, [%1+%2* 8]
3575    mova                 m9, [%1+%2* 9]
3576    mova                m10, [%1+%2*10]
3577    mova                m11, [%1+%2*11]
3578    mova                m12, [%1+%2*12]
3579    mova                m13, [%1+%2*13]
3580    mova                m14, [%1+%2*14]
3581    mova                m15, [%1+%2*15]
3582%endif
3583    mova              [rsp], m15
3584%if %4
3585    pxor                m15, m15
3586    REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
3587                                8,  9, 10, 11, 12, 13, 14, 15
3588%endif
3589%endmacro
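; Like LOAD_8ROWS but for all 16 rows: row 15 is stored to [rsp] so m15 is
; free for the pxor, and zero_coefs clears all 16 source rows after loading.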
3590
3591%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
3592    mova                m%4, [%2]
3593    paddsw              m%3, m%1, m%4
3594    psubsw              m%1, m%4
3595    pmovzxbw            m%4, [dstq+%6]
3596    pmulhrsw            m%3, m%5
3597    pmulhrsw            m%1, m%5
3598    paddw               m%3, m%4
3599    pmovzxbw            m%4, [r2+%7]
3600    paddw               m%1, m%4
3601    packuswb            m%3, m%1
3602    vpermq              m%3, m%3, q3120
3603    mova          [dstq+%6], xm%3
3604    vextracti128    [r2+%7], m%3, 1
3605%endmacro
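; Handles one mirrored row pair of the final output stage: adds and subtracts
; the stored half at [%2], rounds both with %5 (pw_2048, a >>4 with rounding),
; adds the rounded results to the pixels at dstq+%6 and r2+%7, then packs to
; bytes (vpermq q3120 undoes packuswb's lane interleave before the two
; 128-bit stores).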
3606
3607cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
3608    lea                  r6, [o_base]
3609    test               eobd, eobd
3610    jz .dconly
3611    PROLOGUE              0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
3612                                           base, tmp3
3613    %undef cmp
3614    LOAD_16ROWS          cq, 64, 1
3615    call m(idct_16x16_internal_8bpc).main
3616    lea               tmp1q, [rsp+32*7]
3617    lea               tmp2q, [tmp1q+32*8]
3618    lea               tmp3q, [tmp1q+32*16]
3619    mova                 m1, [rsp+32*1]
3620    mova         [rsp+32*0], m6
3621    mova         [rsp+32*1], m7
3622    vpbroadcastd         m7, [o(pw_16384)]
3623    call .transpose_2x8x8_round
3624    mova                m15, [rsp+32*0]
3625    mova         [tmp3q-32*4+ 0], xm0
3626    vextracti128 [tmp3q+32*0+ 0], m0, 1
3627    mova         [tmp3q-32*3+ 0], xm2
3628    vextracti128 [tmp3q+32*1+ 0], m2, 1
3629    mova         [tmp3q-32*2+ 0], xm4
3630    vextracti128 [tmp3q+32*2+ 0], m4, 1
3631    mova         [tmp3q-32*1+ 0], xm6
3632    vextracti128 [tmp3q+32*3+ 0], m6, 1
3633    mova         [tmp3q-32*4+16], xm8
3634    vextracti128 [tmp3q+32*0+16], m8, 1
3635    mova         [tmp3q-32*3+16], xm10
3636    vextracti128 [tmp3q+32*1+16], m10, 1
3637    mova         [tmp3q-32*2+16], xm12
3638    vextracti128 [tmp3q+32*2+16], m12, 1
3639    mova         [tmp3q-32*1+16], xm14
3640    vextracti128 [tmp3q+32*3+16], m14, 1
3641    cmp                eobd, 150
3642    jg .full
3643    vinserti128          m0, m1, xm9, 1
3644    vperm2i128           m4, m1, m9, 0x31
3645    vinserti128          m2, m5, xm13, 1
3646    vperm2i128           m6, m5, m13, 0x31
3647    vinserti128          m1, m3, xm11, 1
3648    vperm2i128           m5, m3, m11, 0x31
3649    vinserti128          m3, m7, xm15, 1
3650    vperm2i128           m7, m7, m15, 0x31
3651    call .main_oddhalf_fast
3652    pxor                 m8, m8
3653    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
3654    jmp .idct16
3655.dconly:
3656    movd                xm1, [o(pw_2896x8)]
3657    pmulhrsw            xm0, xm1, [cq]
3658    movd                xm2, [o(pw_16384)]
3659    mov                [cq], eobd
3660    pmulhrsw            xm0, xm1
3661    or                  r3d, 32
3662    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
3663.full:
3664    mova       [tmp1q-32*4], m1
3665    mova       [tmp1q-32*3], m3
3666    mova       [tmp1q-32*2], m5
3667    mova       [tmp1q-32*1], m7
3668    mova       [tmp1q+32*0], m9
3669    mova       [tmp1q+32*1], m11
3670    mova       [tmp1q+32*2], m13
3671    mova       [tmp1q+32*3], m15
3672    LOAD_16ROWS       cq+32, 64, 1
3673    call m(idct_16x16_internal_8bpc).main
3674    lea                  r2, [tmp3q+32*8]
3675    mova                 m1, [rsp+32*1]
3676    mova         [rsp+32*0], m6
3677    mova         [rsp+32*1], m7
3678    vpbroadcastd         m7, [o(pw_16384)]
3679    call .transpose_2x8x8_round
3680    mova                m15, [rsp+32*0]
3681    mova         [r2-32*4+ 0], xm0
3682    vextracti128 [r2+32*0+ 0], m0, 1
3683    mova         [r2-32*3+ 0], xm2
3684    vextracti128 [r2+32*1+ 0], m2, 1
3685    mova         [r2-32*2+ 0], xm4
3686    vextracti128 [r2+32*2+ 0], m4, 1
3687    mova         [r2-32*1+ 0], xm6
3688    vextracti128 [r2+32*3+ 0], m6, 1
3689    mova         [r2-32*4+16], xm8
3690    vextracti128 [r2+32*0+16], m8, 1
3691    mova         [r2-32*3+16], xm10
3692    vextracti128 [r2+32*1+16], m10, 1
3693    mova         [r2-32*2+16], xm12
3694    vextracti128 [r2+32*2+16], m12, 1
3695    mova         [r2-32*1+16], xm14
3696    vextracti128 [r2+32*3+16], m14, 1
3697    vinserti128          m8, m1, xm9, 1
3698    vperm2i128          m12, m1, m9, 0x31
3699    mova                xm0, [tmp1q-32*4]
3700    mova                xm1, [tmp1q-32*3]
3701    vinserti128          m0, [tmp1q+32*0], 1
3702    vinserti128          m1, [tmp1q+32*1], 1
3703    vinserti128         m10, m5, xm13, 1
3704    vperm2i128          m14, m5, m13, 0x31
3705    mova                xm4, [tmp1q-32*4+16]
3706    mova                xm5, [tmp1q-32*3+16]
3707    vinserti128          m4, [tmp1q+32*0+16], 1
3708    vinserti128          m5, [tmp1q+32*1+16], 1
3709    vinserti128          m9, m3, xm11, 1
3710    vperm2i128          m13, m3, m11, 0x31
3711    mova                xm2, [tmp1q-32*2]
3712    mova                xm3, [tmp1q-32*1]
3713    vinserti128          m2, [tmp1q+32*2], 1
3714    vinserti128          m3, [tmp1q+32*3], 1
3715    vinserti128         m11, m7, xm15, 1
3716    vperm2i128          m15, m7, m15, 0x31
3717    mova                xm6, [tmp1q-32*2+16]
3718    mova                xm7, [tmp1q-32*1+16]
3719    vinserti128          m6, [tmp1q+32*2+16], 1
3720    vinserti128          m7, [tmp1q+32*3+16], 1
3721    call .main_oddhalf
3722    LOAD_8ROWS_H    r2-32*4, 32
3723.idct16:
3724    LOAD_8ROWS   tmp3q-32*4, 32
3725    mova              [rsp], m15
3726    call m(idct_16x16_internal_8bpc).main
3727    imul                 r2, strideq, 19
3728    lea                  r3, [strideq*3]
3729    add                  r2, dstq
3730    call .pass2_end
3731    RET
3732ALIGN function_align
3733cglobal_label .main_oddhalf_fast ; lower half is zero
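    ; with the lower half of the odd-row inputs zero, each first-stage
    ; ITX_MULSUB_2W rotation in .main_oddhalf collapses to two plain
    ; multiplies, done here with pmulhrsw and the pw_*x8 constants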
3734    mova [rsp+gprsize+32*1], m7
3735    pxor                 m7, m7
3736    mova [rsp+gprsize+32*0], m7
3737    mova [rsp+gprsize+32*2], m7
3738    vpbroadcastd        m11, [o(pw_3703x8)]
3739    vpbroadcastd         m7, [o(pw_1751x8)]
3740    vpbroadcastd        m12, [o(pw_m1380x8)]
3741    vpbroadcastd         m8, [o(pw_3857x8)]
3742    vpbroadcastd        m13, [o(pw_3973x8)]
3743    vpbroadcastd        m15, [o(pw_995x8)]
3744    pmulhrsw            m11, m4  ; t29a
3745    pmulhrsw             m4, m7  ; t18a
3746    pmulhrsw            m12, m3  ; t19a
3747    pmulhrsw             m3, m8  ; t28a
3748    pmulhrsw            m13, m2  ; t27a
3749    pmulhrsw             m2, m15 ; t20a
3750    vpbroadcastd        m10, [o(pw_m2106x8)]
3751    vpbroadcastd         m7, [o(pw_3513x8)]
3752    vpbroadcastd         m9, [o(pw_3290x8)]
3753    vpbroadcastd         m8, [o(pw_2440x8)]
3754    vpbroadcastd        m14, [o(pw_m601x8)]
3755    vpbroadcastd        m15, [o(pw_4052x8)]
3756    pmulhrsw            m10, m5  ; t21a
3757    pmulhrsw             m5, m7  ; t26a
3758    pmulhrsw             m9, m6  ; t25a
3759    pmulhrsw             m6, m8  ; t22a
3760    pmulhrsw            m14, m1  ; t23a
3761    pmulhrsw             m1, m15 ; t24a
3762    vpbroadcastd        m15, [o(pd_2048)]
3763    jmp .main2
3764ALIGN function_align
3765cglobal_label .main_oddhalf
3766    mova [rsp+gprsize+32*0], m15
3767    mova [rsp+gprsize+32*1], m7
3768    mova [rsp+gprsize+32*2], m8
3769    vpbroadcastd        m15, [o(pd_2048)]
3770    ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
3771    ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
3772    ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
3773    ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
3774    ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
3775    ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
3776.main2:
3777    psubsw               m7, m12, m4  ; t18
3778    paddsw              m12, m4       ; t19
3779    psubsw               m4, m2, m10  ; t21
3780    paddsw               m2, m10      ; t20
3781    psubsw              m10, m14, m6  ; t22
3782    paddsw              m14, m6       ; t23
3783    psubsw               m6, m1, m9   ; t25
3784    paddsw               m1, m9       ; t24
3785    psubsw               m9, m13, m5  ; t26
3786    paddsw              m13, m5       ; t27
3787    psubsw               m5, m3, m11  ; t29
3788    paddsw               m3, m11      ; t28
3789    ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
3790    ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
3791    ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
3792    psubsw               m8, m14, m2  ; t20a
3793    paddsw              m14, m2       ; t23a
3794    psubsw               m2, m1, m13  ; t27a
3795    paddsw               m1, m13      ; t24a
3796    psubsw              m13, m6, m9   ; t21
3797    paddsw               m6, m9       ; t22
3798    psubsw               m9, m10, m4  ; t26
3799    paddsw              m10, m4       ; t25
3800    ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
3801    ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
3802    mova                 m4, [rsp+gprsize+32*0] ; in31
3803    mova [rsp+gprsize+32*0], m6  ; t22
3804    mova                 m6, [rsp+gprsize+32*1] ; in15
3805    mova [rsp+gprsize+32*1], m14 ; t23a
3806    mova                m14, [rsp+gprsize+32*2] ; in17
3807    mova [rsp+gprsize+32*2], m1  ; t24a
3808    ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
3809    ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
3810    psubsw               m1, m0, m14  ; t17
3811    paddsw               m0, m14      ; t16
3812    psubsw              m14, m4, m6   ; t30
3813    paddsw               m4, m6       ; t31
3814    ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
3815    psubsw               m6, m0, m12  ; t19a
3816    paddsw               m0, m12      ; t16a
3817    psubsw              m12, m4, m3   ; t28a
3818    paddsw               m4, m3       ; t31a
3819    psubsw               m3, m14, m5  ; t18
3820    paddsw              m14, m5       ; t17
3821    psubsw               m5, m1, m7   ; t29
3822    paddsw               m1, m7       ; t30
3823    ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
3824    ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
3825    psubsw               m7, m1, m10  ; t25a
3826    paddsw               m1, m10      ; t30a
3827    psubsw              m10, m5, m9   ; t21
3828    paddsw               m5, m9       ; t18
3829    psubsw               m9, m12, m2  ; t20a
3830    paddsw              m12, m2       ; t19a
3831    psubsw               m2, m3, m13  ; t26
3832    paddsw               m3, m13      ; t29
3833    psubsw              m13, m6, m8   ; t27a
3834    paddsw               m6, m8       ; t28a
3835    mova       [tmp1q-32*2], m5
3836    mova       [tmp1q-32*1], m12
3837    mova       [tmp2q+32*0], m6
3838    mova       [tmp2q+32*1], m3
3839    mova       [tmp2q+32*2], m1
3840    mova                 m5, [rsp+gprsize+32*0] ; t22
3841    mova                 m6, [rsp+gprsize+32*1] ; t23
3842    mova                 m3, [rsp+gprsize+32*2] ; t24a
3843    psubsw               m1, m14, m5  ; t22a
3844    paddsw              m14, m5       ; t17a
3845    psubsw               m5, m0, m6   ; t23
3846    paddsw               m0, m6       ; t16
3847    psubsw               m6, m4, m3   ; t24
3848    paddsw               m4, m3       ; t31
3849    vpbroadcastd         m8, [o(pw_m2896_2896)]
3850    vpbroadcastd         m3, [o(pw_2896_2896)]
3851    mova       [tmp1q-32*4], m0
3852    mova       [tmp1q-32*3], m14
3853    mova       [tmp2q+32*3], m4
3854    ITX_MULSUB_2W        13,  9,  0,  4, 15,  3,  8 ; t20,  t27
3855    ITX_MULSUB_2W         2, 10,  0,  4, 15,  3,  8 ; t21a, t26a
3856    ITX_MULSUB_2W         7,  1,  0,  4, 15,  3,  8 ; t22,  t25
3857    ITX_MULSUB_2W         6,  5,  0,  4, 15,  3,  8 ; t23a, t24a
3858    mova       [tmp1q+32*0], m13
3859    mova       [tmp1q+32*1], m2
3860    mova       [tmp1q+32*2], m7
3861    mova       [tmp1q+32*3], m6
3862    mova       [tmp2q-32*4], m5
3863    mova       [tmp2q-32*3], m1
3864    mova       [tmp2q-32*2], m10
3865    mova       [tmp2q-32*1], m9
3866    ret
3867ALIGN function_align
3868.transpose_2x8x8_round:
3869    punpckhwd            m6, m12, m13
3870    punpcklwd           m12, m13
3871    punpckhwd           m13, m8, m9
3872    punpcklwd            m8, m9
3873    punpckhwd            m9, m14, m15
3874    punpcklwd           m14, m15
3875    punpckhwd           m15, m10, m11
3876    punpcklwd           m10, m11
3877    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
3878    punpckhdq           m11, m8, m10
3879    punpckldq            m8, m10
3880    punpckldq           m10, m12, m14
3881    punpckhdq           m12, m14
3882    punpckhdq           m14, m13, m15
3883    punpckldq           m13, m15
3884    punpckldq           m15, m6, m9
3885    punpckhdq            m6, m9
3886    punpckhqdq           m9, m8, m10
3887    punpcklqdq           m8, m10
3888    punpcklqdq          m10, m11, m12
3889    punpckhqdq          m11, m12
3890    punpcklqdq          m12, m13, m15
3891    punpckhqdq          m13, m15
3892    punpckhqdq          m15, m14, m6
3893    punpcklqdq          m14, m6
3894    pmulhrsw             m6, m7, [rsp+gprsize+32*0]
3895    REPX   {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
3896    pmulhrsw             m7, [rsp+gprsize+32*1]
3897    mova [rsp+gprsize+32*0], m15
3898    punpckhwd           m15, m4, m5
3899    punpcklwd            m4, m5
3900    punpckhwd            m5, m0, m1
3901    punpcklwd            m0, m1
3902    punpckhwd            m1, m6, m7
3903    punpcklwd            m6, m7
3904    punpckhwd            m7, m2, m3
3905    punpcklwd            m2, m3
3906    punpckhdq            m3, m0, m2
3907    punpckldq            m0, m2
3908    punpckldq            m2, m4, m6
3909    punpckhdq            m4, m6
3910    punpckhdq            m6, m5, m7
3911    punpckldq            m5, m7
3912    punpckldq            m7, m15, m1
3913    punpckhdq           m15, m1
3914    punpckhqdq           m1, m0, m2
3915    punpcklqdq           m0, m2
3916    punpcklqdq           m2, m3, m4
3917    punpckhqdq           m3, m4
3918    punpcklqdq           m4, m5, m7
3919    punpckhqdq           m5, m7
3920    punpckhqdq           m7, m6, m15
3921    punpcklqdq           m6, m15
3922    ret
3923ALIGN function_align
3924.pass2_end:
3925    mova [rsp+gprsize+32*0], m7
3926    mova [rsp+gprsize+32*2], m15
3927    vpbroadcastd        m15, [o(pw_2048)]
3928    IDCT32_PASS2_END      0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
3929    IDCT32_PASS2_END      4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
3930    IDCT32_PASS2_END      8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
3931    IDCT32_PASS2_END     12, tmp1q-32*1, 0, 4, 15, r3*4,      strideq*0
3932    add                dstq, strideq
3933    sub                  r2, strideq
3934    mova                 m1, [rsp+gprsize+32*1]
3935    IDCT32_PASS2_END      1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
3936    IDCT32_PASS2_END      5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
3937    IDCT32_PASS2_END      9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
3938    IDCT32_PASS2_END     13, tmp1q-32*2, 0, 4, 15, r3*4,      strideq*0
3939    add                dstq, strideq
3940    sub                  r2, strideq
3941    IDCT32_PASS2_END      2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
3942    IDCT32_PASS2_END      6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
3943    IDCT32_PASS2_END     10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
3944    IDCT32_PASS2_END     14, tmp1q-32*3, 0, 4, 15, r3*4,      strideq*0
3945    add                dstq, strideq
3946    sub                  r2, strideq
3947    mova                 m7, [rsp+gprsize+32*0]
3948    mova                 m1, [rsp+gprsize+32*2]
3949    IDCT32_PASS2_END      3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
3950    IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
3951    IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
3952    IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
3953    ret
3954
3955; Perform the final sumsub step and YMM lane shuffling
3956%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
3957    mova                m%3, [tmp2q+32*( 3-%1)]
3958    psubsw              m%4, m%1, m%3
3959    paddsw              m%1, m%3
3960    mova                m%3, [tmp1q+32*(11-%2)]
3961    mova         [tmp1q+32*(11-%2)+16], xm%4
3962    vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
3963    paddsw              m%4, m%2, m%3
3964    psubsw              m%2, m%3
3965    mova         [tmp1q+32*(11-%2)], xm%2
3966    vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
3967    vperm2i128          m%2, m%1, m%4, 0x31
3968    vinserti128         m%1, xm%4, 1
3969%endmacro
3970
3971cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
3972    lea                  r6, [o_base]
3973    test               eobd, eobd
3974    jnz .normal
3975    movd                xm1, [o(pw_2896x8)]
3976    pmulhrsw            xm0, xm1, [cq]
3977    movd                xm2, [o(pw_16384)]
3978    mov                [cq], eobd
3979    pmulhrsw            xm0, xm1
3980    or                  r3d, 16
3981    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
3982.normal:
3983    PROLOGUE              0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
3984    vpbroadcastd        m15, [o(pw_2896x8)]
3985    pmulhrsw             m0, m15, [cq+32* 1]
3986    pmulhrsw             m1, m15, [cq+32* 3]
3987    pmulhrsw             m2, m15, [cq+32* 5]
3988    pmulhrsw             m3, m15, [cq+32* 7]
3989    pmulhrsw             m4, m15, [cq+32* 9]
3990    pmulhrsw             m5, m15, [cq+32*11]
3991    pmulhrsw             m6, m15, [cq+32*13]
3992    pmulhrsw             m7, m15, [cq+32*15]
3993    pmulhrsw             m8, m15, [cq+32*17]
3994    pmulhrsw             m9, m15, [cq+32*19]
3995    pmulhrsw            m10, m15, [cq+32*21]
3996    pmulhrsw            m11, m15, [cq+32*23]
3997    pmulhrsw            m12, m15, [cq+32*25]
3998    pmulhrsw            m13, m15, [cq+32*27]
3999    pmulhrsw            m14, m15, [cq+32*29]
4000    pmulhrsw            m15,      [cq+32*31]
4001    lea               tmp1q, [rsp+32*7]
4002    lea               tmp2q, [tmp1q+32*8]
4003    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
4004    LOAD_16ROWS     cq+32*0, 32*2, 1, 0
4005    pxor                m15, m15
4006    mov                 r3d, 8
4007.zero_loop:
4008    mova          [cq+32*0], m15
4009    mova          [cq+32*1], m15
4010    mova          [cq+32*2], m15
4011    mova          [cq+32*3], m15
4012    add                  cq, 32*4
4013    dec                 r3d
4014    jg .zero_loop
4015    call m(idct_16x16_internal_8bpc).main
4016    call .pass1_end
4017    lea                  r2, [strideq*3]
4018    mov                  r3, dstq
4019.pass2:
4020    vpbroadcastd         m7, [o(pw_16384)]
4021    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4022    call m(idct_16x16_internal_8bpc).main
4023    mova         [rsp+32*2], m15
4024    vpbroadcastd        m15, [o(pw_2048)]
4025    REPX  {pmulhrsw x, m15}, m2, m3, m0
4026    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
4027    pmulhrsw             m1, m15, [rsp+32*1]
4028    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
4029    lea                dstq, [dstq+strideq*4]
4030    REPX  {pmulhrsw x, m15}, m4, m5, m6, m7
4031    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
4032    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
4033    lea                dstq, [dstq+strideq*4]
4034    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11
4035    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
4036    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
4037    lea                dstq, [dstq+strideq*4]
4038    REPX  {pmulhrsw x, m15}, m11, m12, m13, m14
4039    pmulhrsw            m15, [rsp+32*2]
4040    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
4041    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
4042    test                 r3, r3
4043    jnz .right_half
4044    RET
4045.right_half:
4046    LOAD_8ROWS   tmp1q-32*4, 32
4047    LOAD_8ROWS_H tmp2q-32*4, 32
4048    lea                dstq, [r3+16]
4049    xor                 r3d, r3d
4050    mova         [rsp+32*0], m6
4051    mova         [rsp+32*1], m7
4052    jmp .pass2
4053ALIGN function_align
4054.pass1_end:
4055    mova [rsp+gprsize+32*0], m9
4056    IDCT32_PASS1_END      0,  8,  1,  9
4057    IDCT32_PASS1_END      2, 10,  1,  9
4058    IDCT32_PASS1_END      3, 11,  1,  9
4059    IDCT32_PASS1_END      4, 12,  1,  9
4060    IDCT32_PASS1_END      5, 13,  1,  9
4061    IDCT32_PASS1_END      6, 14,  1,  9
4062    IDCT32_PASS1_END      7, 15,  1,  9
4063    mova                 m1, [rsp+gprsize+32*1]
4064    mova                 m9, [rsp+gprsize+32*0]
4065    mova [rsp+gprsize+32*0], m6
4066    mova [rsp+gprsize+32*1], m7
4067    IDCT32_PASS1_END      1,  9,  6,  7
4068    ret
4069
4070cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob
4071%undef cmp
4072    lea                  r6, [o_base]
4073    vpbroadcastd         m9, [o(pw_2896x8)]
4074    vpbroadcastd        m10, [o(pw_1697x16)]
4075    vpbroadcastd        m12, [o(pw_8192)]
4076    cmp                eobd, 43   ; if (eob > 43)
4077    setg                r4b       ;   iteration_count++
4078    cmp                eobd, 150  ; if (eob > 150)
4079    setg                 al       ;   iteration_count++
4080    add                eobd, -279 ; if (eob > 278)
4081    adc                 r4b, al   ;   iteration_count++
4082    lea                  r3, [strideq*3]
4083    mov                  r6, cq
4084    paddw               m11, m12, m12 ; pw_16384
4085.loop:
4086    mova                xm0, [cq+64* 0]
4087    mova                xm1, [cq+64* 1]
4088    vinserti128          m0, [cq+64* 8], 1
4089    vinserti128          m1, [cq+64* 9], 1
4090    mova                xm2, [cq+64* 2]
4091    mova                xm3, [cq+64* 3]
4092    vinserti128          m2, [cq+64*10], 1
4093    vinserti128          m3, [cq+64*11], 1
4094    mova                xm4, [cq+64* 4]
4095    mova                xm5, [cq+64* 5]
4096    vinserti128          m4, [cq+64*12], 1
4097    vinserti128          m5, [cq+64*13], 1
4098    mova                xm6, [cq+64* 6]
4099    mova                xm7, [cq+64* 7]
4100    vinserti128          m6, [cq+64*14], 1
4101    vinserti128          m7, [cq+64*15], 1
4102    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
4103    REPX  {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
4104    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4105    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4106    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
4107    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
4108    lea                dstq, [dstq+strideq*4]
4109    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
4110    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
4111    lea                dstq, [dstq+strideq*4]
4112    add                  cq, 16
4113    dec                 r4b
4114    jge .loop
4115    sub                  cq, 32
4116    pxor                 m0, m0
4117    mov                 r0d, 8
4118    cmp                  cq, r6
4119    ja .zero_loop
4120.zero_loop_half:
4121    mova          [r6+64*0], m0
4122    mova          [r6+64*1], m0
4123    add                  r6, 64*4
4124    mova          [r6-64*2], m0
4125    mova          [r6-64*1], m0
4126    sub                 r0d, 2
4127    jg .zero_loop_half
4128    RET
4129.zero_loop:
4130    mova          [r6+32*0], m0
4131    mova          [r6+32*1], m0
4132    mova          [r6+32*2], m0
4133    mova          [r6+32*3], m0
4134    add                  r6, 32*4
4135    dec                 r0d
4136    jg .zero_loop
4137    RET
4138
4139cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob
4140%undef cmp
4141    lea                  r6, [o_base]
4142    vpbroadcastd         m9, [o(pw_2896x8)]
4143    vpbroadcastd        m10, [o(pw_1697x16)]
4144    vpbroadcastd        m11, [o(pw_2048)]
4145    cmp                eobd, 35  ; if (eob > 35)
4146    setg                r4b      ;   iteration_count++
4147    cmp                eobd, 150 ; if (eob > 150)
4148    setg                r3b      ;   iteration_count += 2
4149    lea                 r4d, [r4+r3*2]
4150    lea                  r3, [strideq*3]
4151    mov                  r5, dstq
4152    mov                  r6, cq
4153.loop:
4154    mova                xm0, [cq+32* 0]
4155    mova                xm1, [cq+32* 1]
4156    vinserti128          m0, [cq+32* 8], 1
4157    vinserti128          m1, [cq+32* 9], 1
4158    mova                xm2, [cq+32* 2]
4159    mova                xm3, [cq+32* 3]
4160    vinserti128          m2, [cq+32*10], 1
4161    vinserti128          m3, [cq+32*11], 1
4162    mova                xm4, [cq+32* 4]
4163    mova                xm5, [cq+32* 5]
4164    vinserti128          m4, [cq+32*12], 1
4165    vinserti128          m5, [cq+32*13], 1
4166    mova                xm6, [cq+32* 6]
4167    mova                xm7, [cq+32* 7]
4168    vinserti128          m6, [cq+32*14], 1
4169    vinserti128          m7, [cq+32*15], 1
4170    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
4171    REPX  {paddsw   x, x  }, m0, m1, m2, m3, m4, m5, m6, m7
4172    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4173    REPX  {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
4174    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
4175    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
4176    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
4177    lea                dstq, [dstq+strideq*4]
4178    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
4179    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
4180    lea                dstq, [dstq+strideq*4]
4181    add                  cq, 16
4182    dec                 r4b
4183    jl .ret
4184    test                r4b, 1
4185    jz .loop
4186    add                  cq, 32*15
4187    lea                dstq, [r5+16]
4188    jmp .loop
4189.ret:
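; x86inc maps r6 to rax on the 64-bit ABIs, so eax here is the saved start
; of the coefficient buffer (mov r6, cq above): cd becomes the number of
; bytes the loop advanced, and with the +384 bias it drives the zeroing
; loop below in 128-byte steps.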
4190    sub                  cd, eax
4191    pxor                 m0, m0
4192    add                  cd, 384
4193.zero_loop:
4194    mova          [r6+32*0], m0
4195    mova          [r6+32*1], m0
4196    mova          [r6+32*2], m0
4197    mova          [r6+32*3], m0
4198    add                  r6, 32*4
4199    sub                  cd, 128
4200    jge .zero_loop
4201    RET
4202
4203cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
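; eob == 0 means only the DC coefficient can be nonzero: it is loaded and
; pre-scaled below, cleared again by storing the zero eob back to [cq], and
; the shared 32-pixel-wide DC-only adder of the 32x8 entry point handles the
; broadcast-and-add over r3d = 32 rows. Any other eob takes the full
; two-pass path at .normal.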
4204    lea                  r6, [o_base]
4205    test               eobd, eobd
4206    jnz .normal
4207    movd                xm1, [o(pw_2896x8)]
4208    pmulhrsw            xm0, xm1, [cq]
4209    movd                xm2, [o(pw_8192)]
4210    mov                [cq], eobd
4211    or                  r3d, 32
4212    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
4213.normal:
4214    PROLOGUE              0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
4215                                           base, tmp3, tmp4
4216    %undef cmp
4217    lea               tmp1q, [rsp+32*7]
4218    lea               tmp2q, [tmp1q+32*8]
4219    sub                eobd, 136
4220    mov               tmp4d, eobd
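; tmp4d keeps the sign of eob-136: negative means coefficient rows 16-31
; are all zero, so the .fast branches substitute zeros for them and the
; 0x80000000/jnc trick at the bottom runs pass 1 for one 16-column strip
; instead of two.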
4221.pass1_loop:
4222    LOAD_8ROWS      cq+64*1, 64*2
4223    pxor                 m8, m8
4224    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
4225    test              tmp4d, tmp4d
4226    jl .fast
4227    LOAD_8ROWS_H   cq+64*17, 64*2
4228    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
4229    LOAD_8ROWS_H   cq+64*16, 64*2
4230    pxor                 m0, m0
4231    REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
4232                               24, 25, 26, 27, 28, 29, 30, 31
4233    mova              [rsp], m15
4234    jmp .idct16
4235.fast:
4236    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4237    pxor                 m8, m8
4238    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
4239    mova              [rsp], m8
4240.idct16:
4241    LOAD_8ROWS      cq+64*0, 64*2
4242    pxor                m15, m15
4243    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
4244    call m(idct_16x16_internal_8bpc).main
4245    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
4246    vpbroadcastd         m7, [o(pw_8192)]
4247    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4248    lea               tmp3q, [tmp1q+32*32]
4249    mova                m15, [rsp]
4250    mova       [tmp3q-32*4], m0
4251    mova       [tmp3q-32*3], m2
4252    mova       [tmp3q-32*2], m4
4253    mova       [tmp3q-32*1], m6
4254    mova       [tmp3q+32*0], m8
4255    mova       [tmp3q+32*1], m10
4256    mova       [tmp3q+32*2], m12
4257    mova       [tmp3q+32*3], m14
4258    add               tmp3q, 32*8
4259    mova       [tmp3q-32*4], m1
4260    mova       [tmp3q-32*3], m3
4261    mova       [tmp3q-32*2], m5
4262    mova       [tmp3q-32*1], m7
4263    mova       [tmp3q+32*0], m9
4264    mova       [tmp3q+32*1], m11
4265    mova       [tmp3q+32*2], m13
4266    mova       [tmp3q+32*3], m15
4267    vpbroadcastd         m9, [o(pw_8192)]
4268    pmulhrsw             m0, m9, [tmp1q-32*4]
4269    pmulhrsw             m1, m9, [tmp1q-32*3]
4270    pmulhrsw             m2, m9, [tmp1q-32*2]
4271    pmulhrsw             m3, m9, [tmp1q-32*1]
4272    pmulhrsw             m4, m9, [tmp1q+32*0]
4273    pmulhrsw             m5, m9, [tmp1q+32*1]
4274    pmulhrsw             m6, m9, [tmp1q+32*2]
4275    pmulhrsw             m7, m9, [tmp1q+32*3]
4276    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4277    mova       [tmp1q-32*4], m0
4278    pmulhrsw             m0, m9, [tmp2q-32*4]
4279    mova       [tmp2q-32*4], m1
4280    pmulhrsw             m1, m9, [tmp2q-32*3]
4281    mova       [tmp1q-32*3], m2
4282    pmulhrsw             m2, m9, [tmp2q-32*2]
4283    mova       [tmp2q-32*3], m3
4284    pmulhrsw             m3, m9, [tmp2q-32*1]
4285    mova       [tmp1q-32*2], m4
4286    pmulhrsw             m4, m9, [tmp2q+32*0]
4287    mova       [tmp2q-32*2], m5
4288    pmulhrsw             m5, m9, [tmp2q+32*1]
4289    mova       [tmp1q-32*1], m6
4290    pmulhrsw             m6, m9, [tmp2q+32*2]
4291    mova       [tmp2q-32*1], m7
4292    pmulhrsw             m7, m9, [tmp2q+32*3]
4293    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4294    mova       [tmp1q+32*0], m0
4295    mova       [tmp2q+32*0], m1
4296    mova       [tmp1q+32*1], m2
4297    mova       [tmp2q+32*1], m3
4298    mova       [tmp1q+32*2], m4
4299    mova       [tmp2q+32*2], m5
4300    mova       [tmp1q+32*3], m6
4301    mova       [tmp2q+32*3], m7
4302    add                  cq, 32
4303    add               tmp1q, 32*16
4304    add               tmp2q, 32*16
4305    add                eobd, 0x80000000
4306    jnc .pass1_loop
4307    add               tmp1q, 32*24
4308    imul                 r2, strideq, 19
4309    lea                  r3, [strideq*3]
4310    add                  r2, dstq
4311    test              tmp4d, tmp4d
4312    jge .pass2_loop
4313    add               tmp1q, 32*16
4314    add               tmp2q, 32*16
4315    add               tmp3q, 32*16
4316.pass2_loop:
4317    LOAD_8ROWS   tmp2q-32*4, 32
4318    test              tmp4d, tmp4d
4319    jl .fast2
4320    LOAD_8ROWS_H tmp3q-32*4, 32
4321    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
4322    sub               tmp3q, 32*8
4323    LOAD_8ROWS_H tmp3q-32*4, 32
4324    sub               tmp3q, 32*16
4325    jmp .pass2_loop_end
4326.fast2:
4327    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4328    sub               tmp3q, 32*24
4329    pxor                 m8, m8
4330    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
4331.pass2_loop_end:
4332    LOAD_8ROWS   tmp3q-32*4, 32
4333    mova              [rsp], m15
4334    call m(idct_16x16_internal_8bpc).main
4335    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
4336    lea               tmp3q, [tmp1q-32*32]
4337    cmp               tmp2q, tmp3q
4338    jb .ret
4339    sub               tmp2q, 32*32
4340    sub                dstq, r3
4341    lea                  r2, [r2+r3+16]
4342    add                dstq, 16
4343    jmp .pass2_loop
4344.ret:
4345    RET
4346
4347cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob
4348    %undef cmp
4349    vpbroadcastd         m9, [pw_8192]
4350    sub                eobd, 136 ; if (eob < 136)
4351    shr                eobd, 30  ;     topleft 16x16 only
4352    lea                eobd, [eobq*2-8]
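; eobd is now a negative tile counter: -8 when the whole 32x32 block may be
; populated, -2 when eob < 136 (top-left 16x16 only). The loop below runs
; until it increments back to zero, switching to the right-hand 16 output
; columns whenever the count reaches a multiple of four.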
4353    lea                  r4, [strideq*3]
4354    mov                  r5, dstq
4355    lea                  r6, [cq+32]
4356.loop:
4357    mova                xm0, [cq+64* 0]
4358    mova                xm1, [cq+64* 1]
4359    vinserti128          m0, [cq+64* 8], 1
4360    vinserti128          m1, [cq+64* 9], 1
4361    mova                xm2, [cq+64* 2]
4362    mova                xm3, [cq+64* 3]
4363    vinserti128          m2, [cq+64*10], 1
4364    vinserti128          m3, [cq+64*11], 1
4365    mova                xm4, [cq+64* 4]
4366    mova                xm5, [cq+64* 5]
4367    vinserti128          m4, [cq+64*12], 1
4368    vinserti128          m5, [cq+64*13], 1
4369    mova                xm6, [cq+64* 6]
4370    mova                xm7, [cq+64* 7]
4371    vinserti128          m6, [cq+64*14], 1
4372    vinserti128          m7, [cq+64*15], 1
4373    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4374    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
4375    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
4376    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
4377    lea                dstq, [dstq+strideq*4]
4378    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
4379    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
4380    lea                dstq, [dstq+strideq*4]
4381    add                  cq, 16
4382    inc                eobd
4383    jz .ret
4384    test               eobd, 3
4385    jnz .loop
4386    add                  cq, 64*15
4387    lea                dstq, [r5+16]
4388    jmp .loop
4389.ret:
4390    pxor                 m0, m0
4391    mov                 r0d, 16
4392    cmp                  cq, r6
4393    jne .zero_loop
4394.zero_loop_topleft:
4395    mova          [r6-32*1], m0
4396    mova          [r6+32*1], m0
4397    mova          [r6+32*3], m0
4398    mova          [r6+32*5], m0
4399    add                  r6, 64*4
4400    sub                 r0d, 4
4401    jg .zero_loop_topleft
4402    RET
4403.zero_loop:
4404    mova          [r6-32*1], m0
4405    mova          [r6+32*0], m0
4406    mova          [r6+32*1], m0
4407    mova          [r6+32*2], m0
4408    add                  r6, 32*4
4409    dec                 r0d
4410    jg .zero_loop
4411    RET
4412
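; IDCT64_PART2_END merges one idct16 output with the matching idct32
; second-half term to form idct32 out0+n / out31-n, then folds in the idct64
; tail values (m%2/m%3) to produce the final out0+n, out31-n, out32+n and
; out63-n. With 6 arguments (pass 1) the results go back into the stack ring
; around tmp1q/tmp2q; with 10 arguments (pass 2) they are rounded with m14
; and added to the destination rows selected by the four offset arguments.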
4413%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
4414%if %1 & 1
4415    mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
4416    mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
4417%else
4418    mova                m%5, [tmp1q-32*(45-%1)]
4419    mova                m%4, [tmp2q-32*(20+%1)]
4420%endif
4421    psubsw              m%6, m%5, m%4 ; idct32 out31-n
4422    paddsw              m%5, m%4      ; idct32 out 0+n
4423    psubsw              m%4, m%6, m%3 ; out32+n
4424    paddsw              m%6, m%3      ; out31-n
4425    psubsw              m%3, m%5, m%2 ; out63-n
4426    paddsw              m%5, m%2      ; out 0+n
4427%if %0 == 6 ; pass 1
4428%if %1 & 1
4429    mova [tmp2q-32*(19-%1)], m%4
4430    mova [tmp1q-32*(14+%1)], m%6
4431    mova [tmp1q+32*(18-%1)], m%3
4432    mova [tmp2q-32*(51-%1)], m%5
4433%else
4434    mova [tmp1q-32*(13-%1)], m%4
4435    mova [tmp2q-32*(20+%1)], m%6
4436    mova [tmp2q+32*(12-%1)], m%3
4437    mova [tmp1q-32*(45-%1)], m%5
4438%endif
4439%else ; pass 2
4440    REPX  {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
4441%if %1 & 1
4442    %define %%d0 r2
4443    %define %%d1 dstq
4444%else
4445    %define %%d0 dstq
4446    %define %%d1 r2
4447%endif
4448    pmovzxbw            m%2, [%%d0+%9 ]
4449    paddw               m%2, m%4
4450    pmovzxbw            m%4, [%%d1+%8 ]
4451    paddw               m%4, m%6
4452    pmovzxbw            m%6, [%%d1+%10]
4453    paddw               m%3, m%6
4454    pmovzxbw            m%6, [%%d0+%7 ]
4455    paddw               m%5, m%6
4456    packuswb            m%2, m%4
4457    packuswb            m%3, m%5
4458    vpermq              m%2, m%2, q3120
4459    vpermq              m%3, m%3, q3120
4460    mova         [%%d0+%9 ], xm%2
4461    vextracti128 [%%d1+%8 ], m%2, 1
4462    mova         [%%d1+%10], xm%3
4463    vextracti128 [%%d0+%7 ], m%3, 1
4464%endif
4465%endmacro
4466
4467cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
4468    lea                  r6, [o_base]
4469    test               eobd, eobd
4470    jnz .normal
4471    movd                xm1, [o(pw_2896x8)]
4472    pmulhrsw            xm0, xm1, [cq]
4473    movd                xm2, [o(pw_8192)]
4474    mov                [cq], eobd
4475    or                  r3d, 64
4476    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
4477.normal:
4478    PROLOGUE              0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
4479    %undef cmp
4480    lea               tmp1q, [rsp+32*23]
4481    lea               tmp2q, [tmp1q+32*24]
4482    sub                eobd, 151
4483    mov                 r7d, eobd
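; r7d keeps the sign of eob-151: when negative, the upper half of the
; coefficients is known to be zero, pass 1 runs a single iteration, and the
; .fast* branches later substitute zeros instead of reloading that half from
; the stacked pass-1 output.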
4484.pass1_loop:
4485    LOAD_16ROWS          cq, 64
4486    call m(idct_16x16_internal_8bpc).main
4487    mova                 m1, [rsp+32*1]
4488    mova         [rsp+32*0], m6
4489    mova         [rsp+32*1], m7
4490    vpbroadcastd         m7, [o(pw_8192)]
4491    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4492    mova                m15, [rsp+32*0]
4493    mova       [tmp1q-32*4], m0
4494    mova       [tmp1q-32*3], m2
4495    mova       [tmp1q-32*2], m4
4496    mova       [tmp1q-32*1], m6
4497    mova       [tmp1q+32*0], m8
4498    mova       [tmp1q+32*1], m10
4499    mova       [tmp1q+32*2], m12
4500    mova       [tmp1q+32*3], m14
4501    mova       [tmp2q-32*4], m1
4502    mova       [tmp2q-32*3], m3
4503    mova       [tmp2q-32*2], m5
4504    mova       [tmp2q-32*1], m7
4505    mova       [tmp2q+32*0], m9
4506    mova       [tmp2q+32*1], m11
4507    mova       [tmp2q+32*2], m13
4508    mova       [tmp2q+32*3], m15
4509    add                  cq, 32
4510    add               tmp1q, 32*8
4511    add               tmp2q, 32*8
4512    add                eobd, 0x80000000
4513    jnc .pass1_loop
4514    lea                  r2, [rsp+32*23]
4515    mova                xm0, [r2-32*4+ 0]
4516    mova                xm1, [r2-32*2+ 0]
4517    vinserti128          m0, [r2+32*0+ 0], 1
4518    vinserti128          m1, [r2+32*2+ 0], 1
4519    mova                xm2, [r2-32*4+16]
4520    mova                xm3, [r2-32*2+16]
4521    vinserti128          m2, [r2+32*0+16], 1
4522    vinserti128          m3, [r2+32*2+16], 1
4523    pxor                 m4, m4
4524    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
4525    test                r7d, r7d
4526    jl .fast
4527    lea                  r3, [r2+32*8]
4528    mova                xm4, [r3-32*4+ 0]
4529    mova                xm5, [r3-32*2+ 0]
4530    vinserti128          m4, [r3+32*0+ 0], 1
4531    vinserti128          m5, [r3+32*2+ 0], 1
4532    mova                xm6, [r3-32*4+16]
4533    mova                xm7, [r3-32*2+16]
4534    vinserti128          m6, [r3+32*0+16], 1
4535    vinserti128          m7, [r3+32*2+16], 1
4536.fast:
4537    mova              [rsp], m8
4538    lea               tmp1q, [rsp+32*7]
4539    call m(idct_16x16_internal_8bpc).main
4540    mova                 m1, [rsp+32*1]
4541    mova       [tmp1q-32*4], m0
4542    mova       [tmp1q-32*3], m1
4543    mova       [tmp1q-32*2], m2
4544    mova       [tmp1q-32*1], m3
4545    mova       [tmp1q+32*0], m4
4546    mova       [tmp1q+32*1], m5
4547    mova       [tmp1q+32*2], m6
4548    mova       [tmp1q+32*3], m7
4549    add               tmp1q, 32*8
4550    mova       [tmp1q-32*4], m8
4551    mova       [tmp1q-32*3], m9
4552    mova       [tmp1q-32*2], m10
4553    mova       [tmp1q-32*1], m11
4554    mova       [tmp1q+32*0], m12
4555    mova       [tmp1q+32*1], m13
4556    mova       [tmp1q+32*2], m14
4557    mova       [tmp1q+32*3], m15
4558    mova                xm0, [r2-32*3+ 0]
4559    mova                xm1, [r2-32*1+ 0]
4560    vinserti128          m0, [r2+32*1+ 0], 1
4561    vinserti128          m1, [r2+32*3+ 0], 1
4562    mova                xm2, [r2-32*3+16]
4563    mova                xm3, [r2-32*1+16]
4564    vinserti128          m2, [r2+32*1+16], 1
4565    vinserti128          m3, [r2+32*3+16], 1
4566    pxor                 m4, m4
4567    REPX       {mova x, m4}, m5, m6, m7
4568    test                r7d, r7d
4569    jl .fast2
4570    mova                xm4, [r3-32*3+ 0]
4571    mova                xm5, [r3-32*1+ 0]
4572    vinserti128          m4, [r3+32*1+ 0], 1
4573    vinserti128          m5, [r3+32*3+ 0], 1
4574    mova                xm6, [r3-32*3+16]
4575    mova                xm7, [r3-32*1+16]
4576    vinserti128          m6, [r3+32*1+16], 1
4577    vinserti128          m7, [r3+32*3+16], 1
4578.fast2:
4579    add               tmp1q, 32*8
4580    lea               tmp2q, [tmp1q+32*8]
4581    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4582    add                  r2, 32*24
4583    vpbroadcastd        m15, [o(pd_2048)]
4584    add               tmp1q, 32*16
4585    add               tmp2q, 32*32
4586    mova                xm0, [r2-32*4+ 0]
4587    mova                xm3, [r2-32*1+16]
4588    vinserti128          m0, [r2+32*0+ 0], 1
4589    vinserti128          m3, [r2+32*3+16], 1
4590    mova                xm4, [r2-32*4+16]
4591    mova                xm7, [r2-32*1+ 0]
4592    vinserti128          m4, [r2+32*0+16], 1
4593    vinserti128          m7, [r2+32*3+ 0], 1
4594    pxor                 m1, m1
4595    REPX       {mova x, m1}, m2, m5, m6
4596    test                r7d, r7d
4597    jl .fast3
4598    add                  r3, 32*24
4599    mova                xm1, [r3-32*1+16]
4600    mova                xm2, [r3-32*4+ 0]
4601    vinserti128          m1, [r3+32*3+16], 1
4602    vinserti128          m2, [r3+32*0+ 0], 1
4603    mova                xm5, [r3-32*1+ 0]
4604    mova                xm6, [r3-32*4+16]
4605    vinserti128          m5, [r3+32*3+ 0], 1
4606    vinserti128          m6, [r3+32*0+16], 1
4607.fast3:
4608    add                  r6, o_idct64_offset
4609    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4610    add                  r6, 8
4611    add               tmp1q, 32*8
4612    sub               tmp2q, 32*8
4613    mova                xm0, [r2-32*2+ 0]
4614    mova                xm3, [r2-32*3+16]
4615    vinserti128          m0, [r2+32*2+ 0], 1
4616    vinserti128          m3, [r2+32*1+16], 1
4617    mova                xm4, [r2-32*2+16]
4618    mova                xm7, [r2-32*3+ 0]
4619    vinserti128          m4, [r2+32*2+16], 1
4620    vinserti128          m7, [r2+32*1+ 0], 1
4621    pxor                 m1, m1
4622    REPX       {mova x, m1}, m2, m5, m6
4623    test                r7d, r7d
4624    jl .fast4
4625    mova                xm1, [r3-32*3+16]
4626    mova                xm2, [r3-32*2+ 0]
4627    vinserti128          m1, [r3+32*1+16], 1
4628    vinserti128          m2, [r3+32*2+ 0], 1
4629    mova                xm5, [r3-32*3+ 0]
4630    mova                xm6, [r3-32*2+16]
4631    vinserti128          m5, [r3+32*1+ 0], 1
4632    vinserti128          m6, [r3+32*2+16], 1
4633.fast4:
4634    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4635    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
4636    RET
4637ALIGN function_align
4638%define o_base idct64_mul - 8
4639cglobal_label .main_part1
4640    ; idct64 steps 1-5:
4641    ; in1/31/17/15/ 9/23/25/ 7 ->
4642    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
4643    ; in5/27/21/11/13/19/29/ 3 ->
4644    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
4645    vpbroadcastd        m11, [o(idct64_mul+4* 0)]
4646    vpbroadcastd        m13, [o(idct64_mul+4* 1)]
4647    vpbroadcastd        m10, [o(idct64_mul+4* 4)]
4648    vpbroadcastd        m12, [o(idct64_mul+4* 5)]
4649    pmulhrsw            m11, m0  ; t63a
4650    pmulhrsw             m0, m13 ; t32a
4651    pmulhrsw            m10, m1  ; t62a
4652    pmulhrsw             m1, m12 ; t33a
4653    vpbroadcastd         m9, [o(idct64_mul+4* 8)]
4654    vpbroadcastd        m13, [o(idct64_mul+4* 9)]
4655    vpbroadcastd         m8, [o(idct64_mul+4*12)]
4656    vpbroadcastd        m12, [o(idct64_mul+4*13)]
4657    pmulhrsw             m9, m2  ; t61a
4658    pmulhrsw             m2, m13 ; t34a
4659    pmulhrsw             m8, m3  ; t60a
4660    pmulhrsw             m3, m12 ; t35a
4661    psubsw              m12, m0, m1   ; t33
4662    paddsw               m0, m1       ; t32
4663    psubsw               m1, m3, m2   ; t34
4664    paddsw               m3, m2       ; t35
4665    psubsw               m2, m8, m9   ; t61
4666    paddsw               m8, m9       ; t60
4667    psubsw               m9, m11, m10 ; t62
4668    paddsw              m11, m10      ; t63
4669    ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
4670    vpbroadcastd        m14, [o(pw_401_4076)]
4671    ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
4672    psubsw              m10, m0, m3  ; t35a
4673    paddsw               m0, m3      ; t32a
4674    psubsw               m3, m11, m8 ; t60a
4675    paddsw              m11, m8      ; t63a
4676    psubsw               m8, m9, m2  ; t34
4677    paddsw               m9, m2      ; t33
4678    psubsw               m2, m12, m1 ; t61
4679    paddsw              m12, m1      ; t62
4680    mova       [tmp1q-32*4], m0
4681    mova       [tmp1q-32*3], m9
4682    mova       [tmp2q+32*2], m12
4683    mova       [tmp2q+32*3], m11
4684    vpbroadcastd        m13, [o(pw_m4017_799)]
4685    vpbroadcastd        m14, [o(pw_799_4017)]
4686    ITX_MULSUB_2W         2,  8,  0,  1, 15, 14, 13 ; t34a, t61a
4687    ITX_MULSUB_2W         3, 10,  0,  1, 15, 14, 13 ; t35,  t60
4688    mova       [tmp1q-32*2], m2
4689    mova       [tmp1q-32*1], m3
4690    mova       [tmp2q+32*0], m10
4691    mova       [tmp2q+32*1], m8
4692    vpbroadcastd         m3, [o(idct64_mul+4*16)]
4693    vpbroadcastd        m11, [o(idct64_mul+4*17)]
4694    vpbroadcastd         m2, [o(idct64_mul+4*20)]
4695    vpbroadcastd        m10, [o(idct64_mul+4*21)]
4696    vpbroadcastd         m1, [o(idct64_mul+4*24)]
4697    vpbroadcastd         m9, [o(idct64_mul+4*25)]
4698    vpbroadcastd         m0, [o(idct64_mul+4*28)]
4699    vpbroadcastd         m8, [o(idct64_mul+4*29)]
4700    pmulhrsw             m3, m4  ; t59a
4701    pmulhrsw             m4, m11 ; t36a
4702    pmulhrsw             m2, m5  ; t58a
4703    pmulhrsw             m5, m10 ; t37a
4704    pmulhrsw             m1, m6  ; t57a
4705    pmulhrsw             m6, m9  ; t38a
4706    pmulhrsw             m0, m7  ; t56a
4707    pmulhrsw             m7, m8  ; t39a
4708    psubsw               m8, m4, m5 ; t37
4709    paddsw               m4, m5     ; t36
4710    psubsw               m5, m7, m6 ; t38
4711    paddsw               m7, m6     ; t39
4712    psubsw               m6, m0, m1 ; t57
4713    paddsw               m0, m1     ; t56
4714    psubsw               m1, m3, m2 ; t58
4715    paddsw               m3, m2     ; t59
4716    ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
4717    vpbroadcastd        m10, [o(pw_3166_2598)]
4718    ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
4719    psubsw               m2, m7, m4 ; t36a
4720    paddsw               m7, m4     ; t39a
4721    psubsw               m4, m0, m3 ; t59a
4722    paddsw               m0, m3     ; t56a
4723    psubsw               m3, m6, m1 ; t37
4724    paddsw               m6, m1     ; t38
4725    psubsw               m1, m5, m8 ; t58
4726    paddsw               m5, m8     ; t57
4727    mova       [tmp1q+32*2], m6
4728    mova       [tmp1q+32*3], m7
4729    mova       [tmp2q-32*4], m0
4730    mova       [tmp2q-32*3], m5
4731    vpbroadcastd         m6, [o(pw_m799_m4017)]
4732    vpbroadcastd         m7, [o(pw_m4017_799)]
4733    ITX_MULSUB_2W         4,  2,  0,  5, 15,  7,  6 ; t36,  t59
4734    ITX_MULSUB_2W         1,  3,  0,  5, 15,  7,  6 ; t37a, t58a
4735    mova       [tmp1q+32*0], m4
4736    mova       [tmp1q+32*1], m1
4737    mova       [tmp2q-32*2], m3
4738    mova       [tmp2q-32*1], m2
4739    ret
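; The main_part2_* helpers below finish the idct64 (steps 6-9) and perform
; the final idct16/idct32/idct64 sum/sub stage. main_part2_internal
; butterflies one batch of stored rows per call, walking tmp1q forward and
; tmp2q backward until they meet; pass 1 writes the combined outputs back to
; the stack through IDCT64_PART2_END, while pass 2 rounds them and adds them
; directly to the destination, with dstq stepping forward and r2 stepping
; backward so mirrored row pairs are written together.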
4740%define o_base pw_5 + 128
4741.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
4742    sub                  r6, o_idct64_offset + 8
4743    vpbroadcastd        m11, [o(pw_1567_3784)]
4744    vpbroadcastd        m12, [o(pw_m3784_1567)]
4745    vpbroadcastd        m13, [o(pw_2896_2896)]
4746    vpbroadcastd        m14, [o(pw_m2896_2896)]
4747.main_part2_pass1_loop:
4748    call .main_part2_internal
4749    IDCT64_PART2_END      0,  7,  0,  6,  9, 10
4750    IDCT64_PART2_END      7,  8,  5,  0,  6,  7
4751    IDCT64_PART2_END      8,  2,  1,  0,  6,  7
4752    IDCT64_PART2_END     15,  3,  4,  0,  6,  7
4753    cmp               tmp1q, tmp2q
4754    jne .main_part2_pass1_loop
4755    ret
4756cglobal_label .main_part2_internal
4757    mova                 m0, [tmp1q-32*12] ; t32a
4758    mova                 m6, [tmp2q-32*13] ; t39a
4759    mova                 m1, [tmp1q-32* 4] ; t40a
4760    mova                 m5, [tmp2q+32* 3] ; t55a
4761    add               tmp1q, 32
4762    sub               tmp2q, 32
4763    mova                 m2, [tmp1q+32* 3] ; t48a
4764    mova                 m4, [tmp2q-32* 4] ; t47a
4765    mova                 m3, [tmp1q+32*11] ; t56a
4766    mova                 m7, [tmp2q+32*12] ; t63a
4767    psubsw               m8, m0, m6 ; t39
4768    paddsw               m0, m6     ; t32
4769    psubsw               m6, m4, m1 ; t40
4770    paddsw               m4, m1     ; t47
4771    psubsw               m1, m2, m5 ; t55
4772    paddsw               m2, m5     ; t48
4773    psubsw               m5, m7, m3 ; t56
4774    paddsw               m7, m3     ; t63
4775    ITX_MULSUB_2W         5,  8,  3,  9, 15, 11, 12 ; t39a, t56a
4776    vpbroadcastd         m9, [o(pw_m1567_m3784)]
4777    ITX_MULSUB_2W         1,  6,  3,  9, 15, 12,  9 ; t40a, t55a
4778    psubsw               m3, m0, m4 ; t47a
4779    paddsw               m0, m4     ; t32a
4780    psubsw               m4, m7, m2 ; t48a
4781    paddsw               m7, m2     ; t63a
4782    psubsw               m2, m5, m1 ; t40
4783    paddsw               m5, m1     ; t39
4784    psubsw               m1, m8, m6 ; t55
4785    paddsw               m8, m6     ; t56
4786    ITX_MULSUB_2W         4,  3,  6,  9, 15, 13, 14 ; t47,  t48
4787    ITX_MULSUB_2W         1,  2,  6,  9, 15, 13, 14 ; t40a, t55a
4788    ret
4789.main_part2_pass2:
4790    sub                  r6, o_idct64_offset + 8
4791    vpbroadcastd        m11, [o(pw_1567_3784)]
4792    vpbroadcastd        m12, [o(pw_m3784_1567)]
4793    vpbroadcastd        m13, [o(pw_2896_2896)]
4794    lea                  r9, [strideq*5]    ; stride*5
4795    lea                  r3, [r9+strideq*1] ; stride*6
4796    lea                  r7, [r9+strideq*2] ; stride*7
4797    lea                  r8, [r3+strideq*2] ; stride*8
4798    lea                  r2, [dstq+r7]
4799.main_part2_pass2_loop:
4800    vpbroadcastd        m14, [o(pw_m2896_2896)]
4801    call .main_part2_internal
4802    vpbroadcastd        m14, [o(pw_2048)]
4803    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
4804    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
4805    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
4806    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
4807    add                dstq, strideq
4808    sub                  r2, strideq
4809    cmp               tmp1q, tmp2q
4810    jne .main_part2_pass2_loop
4811    ret
4812
4813cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
4814    lea                  r6, [o_base]
4815    test               eobd, eobd
4816    jnz .normal
4817    movd                xm1, [o(pw_2896x8)]
4818    pmulhrsw            xm0, xm1, [cq]
4819    movd                xm2, [o(pw_8192)]
4820    mov                [cq], eobd
4821    or                  r3d, 16
4822.dconly:
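; Shared 64-pixel-wide DC-only tail (also reached from the 64x32 and 64x64
; entries with their own r3d row count): the DC value is scaled and rounded
; with the broadcast constants, replicated as words, then added to each row
; by widening the two 32-byte destination halves to words, adding, and
; repacking with unsigned saturation.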
4823    pmulhrsw            xm0, xm2
4824    movd                xm2, [o(pw_2048)]
4825    pmulhrsw            xm0, xm1
4826    pmulhrsw            xm0, xm2
4827    vpbroadcastw         m0, xm0
4828    pxor                 m1, m1
4829.dconly_loop:
4830    mova                 m2, [dstq+32*0]
4831    mova                 m3, [dstq+32*1]
4832    punpckhbw            m4, m2, m1
4833    punpcklbw            m2, m1
4834    punpckhbw            m5, m3, m1
4835    punpcklbw            m3, m1
4836    paddw                m4, m0
4837    paddw                m2, m0
4838    paddw                m5, m0
4839    paddw                m3, m0
4840    packuswb             m2, m4
4841    packuswb             m3, m5
4842    mova        [dstq+32*0], m2
4843    mova        [dstq+32*1], m3
4844    add                dstq, strideq
4845    dec                 r3d
4846    jg .dconly_loop
4847    RET
4848.normal:
4849    PROLOGUE              0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
4850    LOAD_8ROWS      cq+32*0, 32*4
4851    pxor                 m8, m8
4852    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
4853    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
4854    mova              [rsp], m8
4855    lea               tmp1q, [rsp+32*7]
4856    call m(idct_16x16_internal_8bpc).main
4857    mova                 m1, [rsp+32*1]
4858    mova       [tmp1q-32*4], m0
4859    mova       [tmp1q-32*3], m1
4860    mova       [tmp1q-32*2], m2
4861    mova       [tmp1q-32*1], m3
4862    mova       [tmp1q+32*0], m4
4863    mova       [tmp1q+32*1], m5
4864    mova       [tmp1q+32*2], m6
4865    mova       [tmp1q+32*3], m7
4866    add               tmp1q, 32*8
4867    mova       [tmp1q-32*4], m8
4868    mova       [tmp1q-32*3], m9
4869    mova       [tmp1q-32*2], m10
4870    mova       [tmp1q-32*1], m11
4871    mova       [tmp1q+32*0], m12
4872    mova       [tmp1q+32*1], m13
4873    mova       [tmp1q+32*2], m14
4874    mova       [tmp1q+32*3], m15
4875    LOAD_8ROWS      cq+32*2, 32*4
4876    pxor                 m8, m8
4877    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
4878    add               tmp1q, 32*8
4879    lea               tmp2q, [tmp1q+32*8]
4880    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4881    vpbroadcastd        m15, [o(pd_2048)]
4882    add               tmp1q, 32*16
4883    add               tmp2q, 32*32
4884    mova                 m0, [cq+32* 1]
4885    mova                 m1, [cq+32*31]
4886    mova                 m2, [cq+32*17]
4887    mova                 m3, [cq+32*15]
4888    mova                 m4, [cq+32* 9]
4889    mova                 m5, [cq+32*23]
4890    mova                 m6, [cq+32*25]
4891    mova                 m7, [cq+32* 7]
4892    pxor                 m8, m8
4893    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
4894    add                  r6, o_idct64_offset
4895    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4896    add                  r6, 8
4897    add               tmp1q, 32*8
4898    sub               tmp2q, 32*8
4899    mova                 m0, [cq+32* 5]
4900    mova                 m1, [cq+32*27]
4901    mova                 m2, [cq+32*21]
4902    mova                 m3, [cq+32*11]
4903    mova                 m4, [cq+32*13]
4904    mova                 m5, [cq+32*19]
4905    mova                 m6, [cq+32*29]
4906    mova                 m7, [cq+32* 3]
4907    pxor                 m8, m8
4908    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
4909    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4910    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
4911    sub               tmp1q, 32*36
4912    lea                  r2, [strideq*3]
4913    mov               tmp2d, 4
4914.pass2_loop:
4915    lea                  r3, [tmp1q-32*8]
4916    mova                xm0, [r3   -32*4]
4917    mova                xm1, [r3   -32*3]
4918    vinserti128          m0, [tmp1q-32*4], 1
4919    vinserti128          m1, [tmp1q-32*3], 1
4920    mova                xm2, [r3   -32*2]
4921    mova                xm3, [r3   -32*1]
4922    vinserti128          m2, [tmp1q-32*2], 1
4923    vinserti128          m3, [tmp1q-32*1], 1
4924    mova                xm4, [r3   +32*0]
4925    mova                xm5, [r3   +32*1]
4926    vinserti128          m4, [tmp1q+32*0], 1
4927    vinserti128          m5, [tmp1q+32*1], 1
4928    mova                xm6, [r3   +32*2]
4929    mova                xm7, [r3   +32*3]
4930    vinserti128          m6, [tmp1q+32*2], 1
4931    vinserti128          m7, [tmp1q+32*3], 1
4932    mova                xm8, [r3   -32*4+16]
4933    mova                xm9, [r3   -32*3+16]
4934    vinserti128          m8, [tmp1q-32*4+16], 1
4935    vinserti128          m9, [tmp1q-32*3+16], 1
4936    mova               xm10, [r3   -32*2+16]
4937    mova               xm11, [r3   -32*1+16]
4938    vinserti128         m10, [tmp1q-32*2+16], 1
4939    vinserti128         m11, [tmp1q-32*1+16], 1
4940    mova               xm12, [r3   +32*0+16]
4941    mova               xm13, [r3   +32*1+16]
4942    vinserti128         m12, [tmp1q+32*0+16], 1
4943    vinserti128         m13, [tmp1q+32*1+16], 1
4944    mova               xm14, [r3   +32*2+16]
4945    mova               xm15, [r3   +32*3+16]
4946    vinserti128         m14, [tmp1q+32*2+16], 1
4947    vinserti128         m15, [tmp1q+32*3+16], 1
4948    mova         [rsp+32*0], m6
4949    mova         [rsp+32*1], m7
4950    vpbroadcastd         m7, [o(pw_8192)]
4951    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4952    call m(idct_16x16_internal_8bpc).main
4953    mova         [rsp+32*0], m15
4954    vpbroadcastd        m15, [o(pw_2048)]
4955    REPX  {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
4956    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
4957    pmulhrsw             m1, m15, [rsp+32*1]
4958    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
4959    lea                  r3, [dstq+strideq*4]
4960    %define dstq r3
4961    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
4962    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
4963    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
4964    lea                  r3, [r3+strideq*4]
4965    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
4966    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
4967    pmulhrsw            m15, [rsp+32*0]
4968    lea                  r3, [r3+strideq*4]
4969    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
4970    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
; dstq was aliased to r3 above for this strip; undo that here so the next
; .pass2_loop iteration's WRITE_16X2 calls address the real destination again.
    %undef dstq
4971    add               tmp1q, 32*16
4972    add                  r0, 16
4973    dec               tmp2d
4974    jg .pass2_loop
4975    RET
4976
4977cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
4978    lea                  r6, [o_base]
4979    test               eobd, eobd
4980    jnz .normal
4981    movd                xm1, [o(pw_2896x8)]
4982    pmulhrsw            xm0, xm1, [cq]
4983    movd                xm2, [o(pw_16384)]
4984    mov                [cq], eobd
4985    pmulhrsw            xm0, xm1
4986    or                  r3d, 64
4987    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
4988.normal:
4989    PROLOGUE              0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
4990    lea               tmp1q, [rsp+32*7]
4991    lea                r10d, [eobq-136]
4992    sar                r10d, 31
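; Branchless flag: r10d = (eob < 136) ? -1 : 0. A nonzero value means the
; upper half of the coefficients is known to be zero, so the .fast* branches
; use zeros instead of loading it, and pass 1 runs one 16-column strip
; instead of two (via the 0x80000000 carry trick below).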
4993.pass1_loop:
4994    lea               tmp2q, [tmp1q+32*16]
4995    LOAD_8ROWS      cq+64*1, 64*2, 1
4996    pxor                 m8, m8
4997    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
4998    test               r10b, r10b
4999    jnz .fast
5000    LOAD_8ROWS_H   cq+64*17, 64*2, 2
5001    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
5002    LOAD_8ROWS_H   cq+64*16, 64*2, 1
5003    mova              [rsp], m15
5004    pxor                m15, m15
5005    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
5006                                24, 25, 26, 27, 28, 29, 30, 31
5007    jmp .idct16
5008.fast:
5009    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5010    pxor                 m8, m8
5011    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5012    mova              [rsp], m8
5013.idct16:
5014    LOAD_8ROWS      cq+64*0, 64*2, 1
5015    pxor                m15, m15
5016    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
5017    call m(idct_16x16_internal_8bpc).main
5018    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
5019    vpbroadcastd         m7, [o(pw_16384)]
5020    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
5021    lea                  r3, [tmp1q+32*48]
5022    mova                m15, [rsp]
5023    mova          [r3-32*4], m0
5024    mova          [r3-32*3], m2
5025    mova          [r3-32*2], m4
5026    mova          [r3-32*1], m6
5027    mova          [r3+32*0], m8
5028    mova          [r3+32*1], m10
5029    mova          [r3+32*2], m12
5030    mova          [r3+32*3], m14
5031    add                  r3, 32*24
5032    mova          [r3-32*4], m1
5033    mova          [r3-32*3], m3
5034    mova          [r3-32*2], m5
5035    mova          [r3-32*1], m7
5036    mova          [r3+32*0], m9
5037    mova          [r3+32*1], m11
5038    mova          [r3+32*2], m13
5039    mova          [r3+32*3], m15
5040    vpbroadcastd         m9, [o(pw_16384)]
5041    pmulhrsw             m0, m9, [tmp1q-32*4]
5042    pmulhrsw             m1, m9, [tmp1q-32*3]
5043    pmulhrsw             m2, m9, [tmp1q-32*2]
5044    pmulhrsw             m3, m9, [tmp1q-32*1]
5045    pmulhrsw             m4, m9, [tmp1q+32*0]
5046    pmulhrsw             m5, m9, [tmp1q+32*1]
5047    pmulhrsw             m6, m9, [tmp1q+32*2]
5048    pmulhrsw             m7, m9, [tmp1q+32*3]
5049    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
5050    mova       [tmp1q-32*4], m0
5051    pmulhrsw             m0, m9, [tmp2q-32*4]
5052    mova       [tmp2q-32*4], m1
5053    pmulhrsw             m1, m9, [tmp2q-32*3]
5054    mova       [tmp1q-32*3], m2
5055    pmulhrsw             m2, m9, [tmp2q-32*2]
5056    mova       [tmp2q-32*3], m3
5057    pmulhrsw             m3, m9, [tmp2q-32*1]
5058    mova       [tmp1q-32*2], m4
5059    pmulhrsw             m4, m9, [tmp2q+32*0]
5060    mova       [tmp2q-32*2], m5
5061    pmulhrsw             m5, m9, [tmp2q+32*1]
5062    mova       [tmp1q-32*1], m6
5063    pmulhrsw             m6, m9, [tmp2q+32*2]
5064    mova       [tmp2q-32*1], m7
5065    pmulhrsw             m7, m9, [tmp2q+32*3]
5066    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
5067    mova       [tmp1q+32*0], m0
5068    mova       [tmp2q+32*0], m1
5069    mova       [tmp1q+32*1], m2
5070    mova       [tmp2q+32*1], m3
5071    mova       [tmp1q+32*2], m4
5072    mova       [tmp2q+32*2], m5
5073    mova       [tmp1q+32*3], m6
5074    mova       [tmp2q+32*3], m7
5075    add                  cq, 32
5076    add               tmp1q, 32*8
5077    add                r10d, 0x80000000
5078    jnc .pass1_loop
5079    lea                  r2, [rsp+32*55]
5080    lea                  r7, [r2+32*24]
5081.pass2_loop:
5082    lea                  r3, [r2+32*8]
5083    lea                  r8, [r7+32*8]
5084    mova                 m0, [r2-32*4]
5085    mova                 m1, [r2-32*2]
5086    mova                 m2, [r2+32*0]
5087    mova                 m3, [r2+32*2]
5088    pxor                 m4, m4
5089    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
5090    test               r10b, r10b
5091    jnz .fast2
5092    mova                 m4, [r3-32*4]
5093    mova                 m5, [r3-32*2]
5094    mova                 m6, [r3+32*0]
5095    mova                 m7, [r3+32*2]
5096.fast2:
5097    mova              [rsp], m8
5098    lea               tmp1q, [rsp+32*39]
5099    call m(idct_16x16_internal_8bpc).main
5100    mova                 m1, [rsp+32*1]
5101    mova       [tmp1q-32*4], m0
5102    mova       [tmp1q-32*3], m1
5103    mova       [tmp1q-32*2], m2
5104    mova       [tmp1q-32*1], m3
5105    mova       [tmp1q+32*0], m4
5106    mova       [tmp1q+32*1], m5
5107    mova       [tmp1q+32*2], m6
5108    mova       [tmp1q+32*3], m7
5109    add               tmp1q, 32*8
5110    mova       [tmp1q-32*4], m8
5111    mova       [tmp1q-32*3], m9
5112    mova       [tmp1q-32*2], m10
5113    mova       [tmp1q-32*1], m11
5114    mova       [tmp1q+32*0], m12
5115    mova       [tmp1q+32*1], m13
5116    mova       [tmp1q+32*2], m14
5117    mova       [tmp1q+32*3], m15
5118    mova                 m0, [r2-32*3]
5119    mova                 m1, [r2-32*1]
5120    mova                 m2, [r2+32*1]
5121    mova                 m3, [r2+32*3]
5122    pxor                 m4, m4
5123    REPX       {mova x, m4}, m5, m6, m7
5124    test               r10b, r10b
5125    jnz .fast3
5126    mova                 m4, [r3-32*3]
5127    mova                 m5, [r3-32*1]
5128    mova                 m6, [r3+32*1]
5129    mova                 m7, [r3+32*3]
5130.fast3:
5131    add               tmp1q, 32*8
5132    lea               tmp2q, [tmp1q+32*8]
5133    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5134    vpbroadcastd        m15, [o(pd_2048)]
5135    add               tmp1q, 32*16
5136    add               tmp2q, 32*32
5137    mova                 m0, [r7-32*4]
5138    mova                 m3, [r7+32*3]
5139    mova                 m4, [r7+32*0]
5140    mova                 m7, [r7-32*1]
5141    pxor                 m1, m1
5142    REPX       {mova x, m1}, m2, m5, m6
5143    test               r10b, r10b
5144    jnz .fast4
5145    mova                 m1, [r8+32*3]
5146    mova                 m2, [r8-32*4]
5147    mova                 m5, [r8-32*1]
5148    mova                 m6, [r8+32*0]
5149.fast4:
5150    add                  r6, o_idct64_offset
5151    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5152    add                  r6, 8
5153    add               tmp1q, 32*8
5154    sub               tmp2q, 32*8
5155    mova                 m0, [r7-32*2]
5156    mova                 m3, [r7+32*1]
5157    mova                 m4, [r7+32*2]
5158    mova                 m7, [r7-32*3]
5159    pxor                 m1, m1
5160    REPX       {mova x, m1}, m2, m5, m6
5161    test               r10b, r10b
5162    jnz .fast5
5163    mova                 m1, [r8+32*1]
5164    mova                 m2, [r8-32*2]
5165    mova                 m5, [r8-32*3]
5166    mova                 m6, [r8+32*2]
5167.fast5:
5168    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5169    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
5170    add                r10d, 0x80000000
5171    jc .ret
5172    lea                  r2, [rsp+32*7]
5173    lea                  r7, [r2+32*16]
5174    sub                dstq, r8
5175    lea                dstq, [dstq+strideq*4+16]
5176    jmp .pass2_loop
5177.ret:
5178    RET
5179
5180cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
5181    lea                  r6, [o_base]
5182    test               eobd, eobd
5183    jnz .normal
5184    movd                xm1, [o(pw_2896x8)]
5185    pmulhrsw            xm0, xm1, [cq]
5186    movd                xm2, [o(pw_16384)]
5187    mov                [cq], eobd
5188    pmulhrsw            xm0, xm1
5189    or                  r3d, 32
5190    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
5191.normal:
5192    PROLOGUE              0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
5193                                            base, tmp3, tmp4
5194    lea               tmp1q, [rsp+32*7]
5195    lea               tmp4d, [eobq-136]
5196.pass1_loop:
5197    LOAD_8ROWS      cq+64*0, 64*4, 1
5198    pxor                 m8, m8
5199    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
5200    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5201    mova              [rsp], m8
5202    call m(idct_16x16_internal_8bpc).main
5203    mova                 m1, [rsp+32*1]
5204    mova       [tmp1q-32*4], m0
5205    mova       [tmp1q-32*3], m1
5206    mova       [tmp1q-32*2], m2
5207    mova       [tmp1q-32*1], m3
5208    mova       [tmp1q+32*0], m4
5209    mova       [tmp1q+32*1], m5
5210    mova       [tmp1q+32*2], m6
5211    mova       [tmp1q+32*3], m7
5212    add               tmp1q, 32*8
5213    mova       [tmp1q-32*4], m8
5214    mova       [tmp1q-32*3], m9
5215    mova       [tmp1q-32*2], m10
5216    mova       [tmp1q-32*1], m11
5217    mova       [tmp1q+32*0], m12
5218    mova       [tmp1q+32*1], m13
5219    mova       [tmp1q+32*2], m14
5220    mova       [tmp1q+32*3], m15
5221    LOAD_8ROWS      cq+64*2, 64*4, 1
5222    pxor                 m8, m8
5223    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
5224    add               tmp1q, 32*8
5225    lea               tmp2q, [tmp1q+32*8]
5226    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5227    vpbroadcastd        m15, [o(pd_2048)]
5228    add               tmp1q, 32*16
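; Rectangular (64x32) scaling: these odd coefficient rows, which feed the
; idct64 tail, are multiplied by pw_2896x8 (i.e. by 2896/4096, roughly
; 1/sqrt(2)) as they are loaded.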
5229    add               tmp2q, 32*32
5230    vpbroadcastd         m7, [o(pw_2896x8)]
5231    pmulhrsw             m0, m7, [cq+64* 1]
5232    pmulhrsw             m1, m7, [cq+64*31]
5233    pmulhrsw             m2, m7, [cq+64*17]
5234    pmulhrsw             m3, m7, [cq+64*15]
5235    pmulhrsw             m4, m7, [cq+64* 9]
5236    pmulhrsw             m5, m7, [cq+64*23]
5237    pmulhrsw             m6, m7, [cq+64*25]
5238    pmulhrsw             m7,     [cq+64* 7]
5239    pxor                 m8, m8
5240    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
5241    add                  r6, o_idct64_offset
5242    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5243    vpbroadcastd         m7, [o(pw_2896x8-(o_idct64_offset))]
5244    add                  r6, 8
5245    add               tmp1q, 32*8
5246    sub               tmp2q, 32*8
5247    pmulhrsw             m0, m7, [cq+64* 5]
5248    pmulhrsw             m1, m7, [cq+64*27]
5249    pmulhrsw             m2, m7, [cq+64*21]
5250    pmulhrsw             m3, m7, [cq+64*11]
5251    pmulhrsw             m4, m7, [cq+64*13]
5252    pmulhrsw             m5, m7, [cq+64*19]
5253    pmulhrsw             m6, m7, [cq+64*29]
5254    pmulhrsw             m7,     [cq+64* 3]
5255    pxor                 m8, m8
5256    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
5257    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5258    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
5259    sub               tmp1q, 32*44
5260    vpbroadcastd        m10, [o(pw_16384)]
5261    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
5262    add                  cq, 32
5263    add               tmp4d, 0x80000000
5264    jnc .pass1_loop
5265    lea               tmp1q, [rsp+32*15]
5266    imul                 r2, strideq, 19
5267    lea                  r3, [strideq*3]
5268    add                  r2, dstq
5269    mov               tmp4b, 4
5270.pass2_loop:
5271    lea               tmp2q, [tmp1q+32*64]
5272    LOAD_8ROWS   tmp1q-32*4, 32
5273    test              tmp4d, 0x40000000
5274    jnz .fast
5275    LOAD_8ROWS_H tmp2q-32*4, 32
5276    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
5277    lea               tmp3q, [tmp2q-32*8]
5278    LOAD_8ROWS_H tmp3q-32*4, 32
5279    mova              [rsp], m15
5280    jmp .idct16
5281.fast:
5282    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5283    pxor                 m8, m8
5284    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5285    mova              [rsp], m8
5286.idct16:
5287    lea               tmp3q, [tmp1q-32*8]
5288    LOAD_8ROWS   tmp3q-32*4, 32
5289    call m(idct_16x16_internal_8bpc).main
5290    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
5291    add               tmp1q, 32*16
5292    sub                dstq, r3
5293    lea                  r2, [r2+r3+16]
5294    add                dstq, 16
5295    dec               tmp4b
5296    jg .pass2_loop
5297    RET
5298ALIGN function_align
5299.transpose_round_interleave:
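; Helper shared with 64x64: for four groups of sixteen stored rows (split
; between tmp1q and tmp2q) it applies the rounding factor in m10 (pw_16384
; here, pw_8192 for 64x64), transposes the 8x8 word blocks, and writes the
; results back in place, leaving the data transposed for pass 2.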
5300    mov               tmp3d, 4
5301.loop:
5302    lea               tmp2q, [tmp1q+32*8]
5303    mova                xm0, [tmp1q-32*4]
5304    mova                xm1, [tmp1q-32*3]
5305    vinserti128          m0, [tmp2q-32*4], 1
5306    vinserti128          m1, [tmp2q-32*3], 1
5307    mova                xm2, [tmp1q-32*2]
5308    mova                xm3, [tmp1q-32*1]
5309    vinserti128          m2, [tmp2q-32*2], 1
5310    vinserti128          m3, [tmp2q-32*1], 1
5311    mova                xm4, [tmp1q+32*0]
5312    mova                xm5, [tmp1q+32*1]
5313    vinserti128          m4, [tmp2q+32*0], 1
5314    vinserti128          m5, [tmp2q+32*1], 1
5315    mova                xm6, [tmp1q+32*2]
5316    mova                xm7, [tmp1q+32*3]
5317    vinserti128          m6, [tmp2q+32*2], 1
5318    vinserti128          m7, [tmp2q+32*3], 1
5319    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
5320    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
5321    mova                xm8, [tmp1q-32*4+16]
5322    mova                xm9, [tmp1q-32*3+16]
5323    vinserti128          m8, [tmp2q-32*4+16], 1
5324    vinserti128          m9, [tmp2q-32*3+16], 1
5325    mova       [tmp1q-32*4], m0
5326    mova       [tmp2q-32*4], m1
5327    mova       [tmp1q-32*3], m2
5328    mova       [tmp2q-32*3], m3
5329    mova                xm2, [tmp1q-32*2+16]
5330    mova                xm3, [tmp1q-32*1+16]
5331    vinserti128          m2, [tmp2q-32*2+16], 1
5332    vinserti128          m3, [tmp2q-32*1+16], 1
5333    mova       [tmp1q-32*2], m4
5334    mova       [tmp2q-32*2], m5
5335    mova       [tmp1q-32*1], m6
5336    mova       [tmp2q-32*1], m7
5337    mova                xm4, [tmp1q+32*0+16]
5338    mova                xm5, [tmp1q+32*1+16]
5339    vinserti128          m4, [tmp2q+32*0+16], 1
5340    vinserti128          m5, [tmp2q+32*1+16], 1
5341    mova                xm6, [tmp1q+32*2+16]
5342    mova                xm7, [tmp1q+32*3+16]
5343    vinserti128          m6, [tmp2q+32*2+16], 1
5344    vinserti128          m7, [tmp2q+32*3+16], 1
5345    pmulhrsw             m0, m8, m10
5346    pmulhrsw             m1, m9, m10
5347    REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
5348    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
5349    mova       [tmp1q+32*0], m0
5350    mova       [tmp2q+32*0], m1
5351    mova       [tmp1q+32*1], m2
5352    mova       [tmp2q+32*1], m3
5353    mova       [tmp1q+32*2], m4
5354    mova       [tmp2q+32*2], m5
5355    mova       [tmp1q+32*3], m6
5356    mova       [tmp2q+32*3], m7
5357    add               tmp1q, 32*16
5358    dec               tmp3d
5359    jg .loop
5360    ret
5361
5362cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
5363    lea                  r6, [o_base]
5364    test               eobd, eobd
5365    jnz .normal
5366    movd                xm1, [o(pw_2896x8)]
5367    pmulhrsw            xm0, xm1, [cq]
5368    movd                xm2, [o(pw_8192)]
5369    mov                [cq], eobd
5370    or                  r3d, 64
5371    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
5372.normal:
5373    PROLOGUE              0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
5374    lea               tmp1q, [rsp+32*71]
5375    lea                r10d, [eobq-136]
5376.pass1_loop:
5377    LOAD_8ROWS      cq+64*0, 64*4
5378    pxor                 m8, m8
5379    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
5380    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5381    mova              [rsp], m8
5382    call m(idct_16x16_internal_8bpc).main
5383    mova                 m1, [rsp+32*1]
5384    mova       [tmp1q-32*4], m0
5385    mova       [tmp1q-32*3], m1
5386    mova       [tmp1q-32*2], m2
5387    mova       [tmp1q-32*1], m3
5388    mova       [tmp1q+32*0], m4
5389    mova       [tmp1q+32*1], m5
5390    mova       [tmp1q+32*2], m6
5391    mova       [tmp1q+32*3], m7
5392    add               tmp1q, 32*8
5393    mova       [tmp1q-32*4], m8
5394    mova       [tmp1q-32*3], m9
5395    mova       [tmp1q-32*2], m10
5396    mova       [tmp1q-32*1], m11
5397    mova       [tmp1q+32*0], m12
5398    mova       [tmp1q+32*1], m13
5399    mova       [tmp1q+32*2], m14
5400    mova       [tmp1q+32*3], m15
5401    LOAD_8ROWS      cq+64*2, 64*4
5402    pxor                 m8, m8
5403    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
5404    add               tmp1q, 32*8
5405    lea               tmp2q, [tmp1q+32*8]
5406    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5407    vpbroadcastd        m15, [o(pd_2048)]
5408    add               tmp1q, 32*16
5409    add               tmp2q, 32*32
5410    mova                 m0, [cq+64* 1]
5411    mova                 m1, [cq+64*31]
5412    mova                 m2, [cq+64*17]
5413    mova                 m3, [cq+64*15]
5414    mova                 m4, [cq+64* 9]
5415    mova                 m5, [cq+64*23]
5416    mova                 m6, [cq+64*25]
5417    mova                 m7, [cq+64* 7]
5418    pxor                 m8, m8
5419    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
5420    add                  r6, o_idct64_offset
5421    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5422    add                  r6, 8
5423    add               tmp1q, 32*8
5424    sub               tmp2q, 32*8
5425    mova                 m0, [cq+64* 5]
5426    mova                 m1, [cq+64*27]
5427    mova                 m2, [cq+64*21]
5428    mova                 m3, [cq+64*11]
5429    mova                 m4, [cq+64*13]
5430    mova                 m5, [cq+64*19]
5431    mova                 m6, [cq+64*29]
5432    mova                 m7, [cq+64* 3]
5433    pxor                 m8, m8
5434    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
5435    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5436    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
5437    sub               tmp1q, 32*44
5438    vpbroadcastd        m10, [o(pw_8192)]
5439    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
5440    add                  cq, 32
5441    add                r10d, 0x80000000
5442    jnc .pass1_loop
5443    lea               tmp1q, [rsp+32*7]
5444    mov                r10b, 4
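; Pass 2 processes four 16-column strips of the 64-wide block (r10b counts
; them down). Each strip runs the idct16 core, the 16x32 odd half, and the
; two idct64 main_part1 batches, then main_part2_pass2 combines everything
; and accumulates the result into the destination.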
5445.pass2_loop:
5446    lea                  r2, [tmp1q+32*64]
5447    mova                 m0, [r2-32*4]
5448    mova                 m1, [r2-32*2]
5449    mova                 m2, [r2+32*0]
5450    mova                 m3, [r2+32*2]
5451    pxor                 m4, m4
5452    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
5453    mova              [rsp], m4
5454    test               r10d, 0x40000000
5455    jnz .fast
5456    lea                  r3, [r2+32*64]
5457    mova                 m4, [r3-32*4]
5458    mova                 m5, [r3-32*2]
5459    mova                 m6, [r3+32*0]
5460    mova                 m7, [r3+32*2]
5461.fast:
5462    call m(idct_16x16_internal_8bpc).main
5463    mova                 m1, [rsp+32*1]
5464    mova       [tmp1q-32*4], m0
5465    mova       [tmp1q-32*3], m1
5466    mova       [tmp1q-32*2], m2
5467    mova       [tmp1q-32*1], m3
5468    mova       [tmp1q+32*0], m4
5469    mova       [tmp1q+32*1], m5
5470    mova       [tmp1q+32*2], m6
5471    mova       [tmp1q+32*3], m7
5472    add               tmp1q, 32*8
5473    mova       [tmp1q-32*4], m8
5474    mova       [tmp1q-32*3], m9
5475    mova       [tmp1q-32*2], m10
5476    mova       [tmp1q-32*1], m11
5477    mova       [tmp1q+32*0], m12
5478    mova       [tmp1q+32*1], m13
5479    mova       [tmp1q+32*2], m14
5480    mova       [tmp1q+32*3], m15
5481    mova                 m0, [r2-32*3]
5482    mova                 m1, [r2-32*1]
5483    mova                 m2, [r2+32*1]
5484    mova                 m3, [r2+32*3]
5485    pxor                 m4, m4
5486    REPX       {mova x, m4}, m5, m6, m7
5487    test               r10d, 0x40000000
5488    jnz .fast2
5489    mova                 m4, [r3-32*3]
5490    mova                 m5, [r3-32*1]
5491    mova                 m6, [r3+32*1]
5492    mova                 m7, [r3+32*3]
5493.fast2:
5494    add               tmp1q, 32*8
5495    lea               tmp2q, [tmp1q+32*8]
5496    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5497    vpbroadcastd        m15, [o(pd_2048)]
5498    add                  r2, 32*8
5499    add                  r3, 32*8
5500    add               tmp1q, 32*16
5501    add               tmp2q, 32*32
5502    mova                 m0, [r2-32*4] ;  1
5503    mova                 m3, [r2+32*3] ; 15
5504    mova                 m4, [r2+32*0] ;  9
5505    mova                 m7, [r2-32*1] ;  7
5506    pxor                 m1, m1
5507    REPX       {mova x, m1}, m2, m5, m6
5508    test               r10d, 0x40000000
5509    jnz .fast3
5510    mova                 m1, [r3+32*3] ; 31
5511    mova                 m2, [r3-32*4] ; 17
5512    mova                 m5, [r3-32*1] ; 23
5513    mova                 m6, [r3+32*0] ; 25
5514.fast3:
5515    add                  r6, o_idct64_offset
5516    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5517    add                  r6, 8
5518    add               tmp1q, 32*8
5519    sub               tmp2q, 32*8
5520    mova                 m0, [r2-32*2] ;  5
5521    mova                 m3, [r2+32*1] ; 11
5522    mova                 m4, [r2+32*2] ; 13
5523    mova                 m7, [r2-32*3] ;  3
5524    pxor                 m1, m1
5525    REPX       {mova x, m1}, m2, m5, m6
5526    test               r10d, 0x40000000
5527    jnz .fast4
5528    mova                 m1, [r3+32*1] ; 27
5529    mova                 m2, [r3-32*2] ; 21
5530    mova                 m5, [r3-32*3] ; 19
5531    mova                 m6, [r3+32*2] ; 29
5532.fast4:
5533    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5534    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
5535    sub               tmp1q, 32*28
5536    sub                dstq, r8
5537    lea                dstq, [dstq+strideq*4+16]
5538    dec                r10b
5539    jg .pass2_loop
5540    RET
5541
5542%endif ; ARCH_X86_64
5543