xref: /aosp_15_r20/external/libdav1d/src/x86/itx16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; Copyright © 2017-2021, The rav1e contributors
4*c0909341SAndroid Build Coastguard Worker; Copyright © 2020, Nathan Egge
5*c0909341SAndroid Build Coastguard Worker; Copyright © 2021, Matthias Dressel
6*c0909341SAndroid Build Coastguard Worker; All rights reserved.
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
9*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
12*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
13*c0909341SAndroid Build Coastguard Worker;
14*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
15*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
16*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
17*c0909341SAndroid Build Coastguard Worker;
18*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%include "config.asm"
30*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
31*c0909341SAndroid Build Coastguard Worker
32*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA
; COEF value [, emit_negative]
; Defines the label pd_<value> as four dwords holding <value>.
; When a second argument is supplied (only its presence is tested, its value
; is never read) the label pd_m<value> is defined as well, holding four
; dwords of -<value>.
33*c0909341SAndroid Build Coastguard Worker%macro COEF 1-2
34*c0909341SAndroid Build Coastguard Workerpd_%1: times 4 dd %1
35*c0909341SAndroid Build Coastguard Worker%if %0 == 2
36*c0909341SAndroid Build Coastguard Workerpd_m%1: times 4 dd -%1
37*c0909341SAndroid Build Coastguard Worker%endif
38*c0909341SAndroid Build Coastguard Worker%endmacro
39*c0909341SAndroid Build Coastguard Worker
; Dword multiplier constants, one pd_<N> label per line.
; Lines that pass a second argument additionally define pd_m<N> (= -N).
; ITX_MULSUB_2D loads these by name (pd_<coef>) and shifts its results right
; by 12 when rounding is enabled.
40*c0909341SAndroid Build Coastguard WorkerCOEF  201
41*c0909341SAndroid Build Coastguard WorkerCOEF  401
42*c0909341SAndroid Build Coastguard WorkerCOEF  601, 1
43*c0909341SAndroid Build Coastguard WorkerCOEF  799
44*c0909341SAndroid Build Coastguard WorkerCOEF  995
45*c0909341SAndroid Build Coastguard WorkerCOEF 1189, 1
46*c0909341SAndroid Build Coastguard WorkerCOEF 1380, 1
47*c0909341SAndroid Build Coastguard WorkerCOEF 1567
48*c0909341SAndroid Build Coastguard WorkerCOEF 1751
49*c0909341SAndroid Build Coastguard WorkerCOEF 1931
50*c0909341SAndroid Build Coastguard WorkerCOEF 2106, 1
51*c0909341SAndroid Build Coastguard WorkerCOEF 2276, 1
52*c0909341SAndroid Build Coastguard WorkerCOEF 2440
53*c0909341SAndroid Build Coastguard WorkerCOEF 2598, 1
54*c0909341SAndroid Build Coastguard WorkerCOEF 2751, 1
55*c0909341SAndroid Build Coastguard WorkerCOEF 2896
56*c0909341SAndroid Build Coastguard WorkerCOEF 3035
57*c0909341SAndroid Build Coastguard WorkerCOEF 3166
58*c0909341SAndroid Build Coastguard WorkerCOEF 3290
59*c0909341SAndroid Build Coastguard WorkerCOEF 3406
60*c0909341SAndroid Build Coastguard WorkerCOEF 3513
61*c0909341SAndroid Build Coastguard WorkerCOEF 3612
62*c0909341SAndroid Build Coastguard WorkerCOEF 3703
63*c0909341SAndroid Build Coastguard WorkerCOEF 3784
64*c0909341SAndroid Build Coastguard WorkerCOEF 3857
65*c0909341SAndroid Build Coastguard WorkerCOEF 3920
66*c0909341SAndroid Build Coastguard WorkerCOEF 3973
67*c0909341SAndroid Build Coastguard WorkerCOEF 4017
68*c0909341SAndroid Build Coastguard WorkerCOEF 4052
69*c0909341SAndroid Build Coastguard WorkerCOEF 4076
70*c0909341SAndroid Build Coastguard WorkerCOEF 4091
71*c0909341SAndroid Build Coastguard Worker
; Miscellaneous 16-byte vector constants.
; Naming used below: pd_<v> = four dwords of v, pw_<v> = eight words of v,
; pw_<a>_<b> = the word pair (a, b) repeated four times, an "m" in front of a
; number means the value is negative.

; Byte indices 0,1,4,5,8,9,12,13 followed by 2,3,6,7,10,11,14,15, i.e. the
; even-numbered words first and the odd-numbered words second.
; NOTE(review): looks like a byte-shuffle mask; the use site is outside this view.
72*c0909341SAndroid Build Coastguard Workerdeint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
73*c0909341SAndroid Build Coastguard Worker
; pd_1 is only assembled for 32-bit x86 builds.
74*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
75*c0909341SAndroid Build Coastguard Workerpd_1:            times 4 dd     1
76*c0909341SAndroid Build Coastguard Worker%endif
77*c0909341SAndroid Build Coastguard Workerpd_2:            times 4 dd     2
78*c0909341SAndroid Build Coastguard Workerpw_5:            times 8 dw     5
79*c0909341SAndroid Build Coastguard Workerpd_1321:         times 4 dd  1321
80*c0909341SAndroid Build Coastguard Workerpd_2482:         times 4 dd  2482
81*c0909341SAndroid Build Coastguard Workerpd_m3344:        times 4 dd -3344
; pd_2048 is the rounding term passed to the 4x4 DCT (loaded into m5 by
; idct_4x4_internal_16bpc before the shift by 12).
82*c0909341SAndroid Build Coastguard Workerpd_2048:         times 4 dd  2048
; Four words of +2048 followed by four words of -2048, and the mirrored variant.
83*c0909341SAndroid Build Coastguard Workerpw_4x2048_4xm2048: times 4 dw 2048
84*c0909341SAndroid Build Coastguard Worker                   times 4 dw -2048
85*c0909341SAndroid Build Coastguard Workerpw_4xm2048_4x2048: times 4 dw -2048
86*c0909341SAndroid Build Coastguard Worker                   times 4 dw 2048
87*c0909341SAndroid Build Coastguard Workerpw_2048:         times 8 dw  2048
88*c0909341SAndroid Build Coastguard Workerpw_m2048:        times 8 dw  -2048
89*c0909341SAndroid Build Coastguard Workerpd_3803:         times 4 dd  3803
90*c0909341SAndroid Build Coastguard Workerpw_4096:         times 8 dw  4096
91*c0909341SAndroid Build Coastguard Workerpd_5793:         times 4 dd  5793
92*c0909341SAndroid Build Coastguard Workerpd_6144:         times 4 dd  6144
93*c0909341SAndroid Build Coastguard Workerpw_8192:         times 8 dw  8192
94*c0909341SAndroid Build Coastguard Workerpd_10240:        times 4 dd 10240
95*c0909341SAndroid Build Coastguard Workerpd_11586:        times 4 dd 11586
; Pre-scaled word constants: the stored value is the product shown in the name.
96*c0909341SAndroid Build Coastguard Workerpw_1697x8:       times 8 dw  1697*8
97*c0909341SAndroid Build Coastguard Workerpw_2896x8:       times 8 dw  2896*8
98*c0909341SAndroid Build Coastguard Workerpw_1697x16:      times 8 dw  1697*16
99*c0909341SAndroid Build Coastguard Workerpw_16384:        times 8 dw 16384
; 0x03ff = 1023; used as the upper clamp bound by the dc-only path of
; INV_TXFM_4X4_FN.
100*c0909341SAndroid Build Coastguard Workerpixel_10bpc_max: times 8 dw  0x03ff
101*c0909341SAndroid Build Coastguard Worker
; Word pairs laid out for pmaddwd (see idct_4x4_internal_16bpc.pass2).
102*c0909341SAndroid Build Coastguard Workerpw_1567_3784:    times 4 dw  1567,  3784
103*c0909341SAndroid Build Coastguard Workerpw_m3784_1567:   times 4 dw -3784,  1567
104*c0909341SAndroid Build Coastguard Workerpw_2896_2896:    times 4 dw  2896,  2896
105*c0909341SAndroid Build Coastguard Workerpw_m2896_2896:   times 4 dw -2896,  2896
106*c0909341SAndroid Build Coastguard Worker
; Bounds of the signed 18-bit range: -0x20000 .. 0x1ffff, as dwords.
107*c0909341SAndroid Build Coastguard Workerclip_18b_min: times 4 dd -0x20000
108*c0909341SAndroid Build Coastguard Workerclip_18b_max: times 4 dd  0x1ffff
109*c0909341SAndroid Build Coastguard Worker
; 4 rows x 12 dword multipliers.
; NOTE(review): the consumer (presumably the 64-point idct) is outside this
; view; the row/column meaning should be confirmed there.
110*c0909341SAndroid Build Coastguard Workeridct64_mul_16bpc:
111*c0909341SAndroid Build Coastguard Workerdd 4095,  101, 2967, -2824,  3745, 1660, 3822, -1474,   401,  4076,   799,  4017
112*c0909341SAndroid Build Coastguard Workerdd -700, 4036, 2359,  3349, -2191, 3461,  897,  3996, -2598, -3166, -4017,  -799
113*c0909341SAndroid Build Coastguard Workerdd 4065,  501, 3229, -2520,  3564, 2019, 3948, -1092,  1931,  3612,  3406,  2276
114*c0909341SAndroid Build Coastguard Workerdd -301, 4085, 2675,  3102, -1842, 3659, 1285,  3889, -1189, -3920, -2276, -3406
115*c0909341SAndroid Build Coastguard Worker
116*c0909341SAndroid Build Coastguard Workercextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
117*c0909341SAndroid Build Coastguard Workercextern iadst_4x4_internal_8bpc_ssse3.main
118*c0909341SAndroid Build Coastguard Workercextern idct_4x8_internal_8bpc_ssse3.main
119*c0909341SAndroid Build Coastguard Workercextern iadst_4x8_internal_8bpc_ssse3.main
120*c0909341SAndroid Build Coastguard Workercextern idct_16x4_internal_8bpc_ssse3.main
121*c0909341SAndroid Build Coastguard Workercextern iadst_16x4_internal_8bpc_ssse3.main
122*c0909341SAndroid Build Coastguard Workercextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
123*c0909341SAndroid Build Coastguard Workercextern idct_8x4_internal_8bpc_ssse3.main
124*c0909341SAndroid Build Coastguard Workercextern iadst_8x4_internal_8bpc_ssse3.main
125*c0909341SAndroid Build Coastguard Workercextern idct_8x8_internal_8bpc_ssse3.main
126*c0909341SAndroid Build Coastguard Workercextern idct_8x8_internal_8bpc_ssse3.pass1_end3
127*c0909341SAndroid Build Coastguard Workercextern iadst_8x8_internal_8bpc_ssse3.main
128*c0909341SAndroid Build Coastguard Workercextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
129*c0909341SAndroid Build Coastguard Workercextern idct_16x8_internal_8bpc_ssse3.main
130*c0909341SAndroid Build Coastguard Workercextern iadst_16x8_internal_8bpc_ssse3.main
131*c0909341SAndroid Build Coastguard Workercextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
132*c0909341SAndroid Build Coastguard Workercextern idct_8x32_internal_8bpc_ssse3.main
133*c0909341SAndroid Build Coastguard Workercextern idct_8x32_internal_8bpc_ssse3.main_fast
134*c0909341SAndroid Build Coastguard Workercextern idct_8x32_internal_8bpc_ssse3.main_veryfast
135*c0909341SAndroid Build Coastguard Workercextern idct_16x64_internal_8bpc_ssse3.main
136*c0909341SAndroid Build Coastguard Workercextern idct_16x64_internal_8bpc_ssse3.main_fast
137*c0909341SAndroid Build Coastguard Worker
; Per-block-size lookup tables. The 4x16, 8x16 and 16x16 groups hold four
; byte entries per table, the *x32 tables hold eight word entries.
; NOTE(review): the code that indexes these tables is outside this view.
; Presumably the _2d / _h / _v variants are selected by transform-type class
; and the values are eob thresholds - confirm against the users.
138*c0909341SAndroid Build Coastguard Workertbl_4x16_2d: db 0, 13, 29, 45
139*c0909341SAndroid Build Coastguard Workertbl_4x16_h: db 0, 16, 32, 48
140*c0909341SAndroid Build Coastguard Workertbl_4x16_v: db 0, 4, 8, 12
141*c0909341SAndroid Build Coastguard Worker
142*c0909341SAndroid Build Coastguard Workertbl_8x16_2d: db 0, 14, 30, 46
143*c0909341SAndroid Build Coastguard Workertbl_8x16_v: db 0, 4, 8, 12
144*c0909341SAndroid Build Coastguard Workertbl_8x16_h: db 0, 32, 64, 96
145*c0909341SAndroid Build Coastguard Worker
146*c0909341SAndroid Build Coastguard Workertbl_16x16_2d: db 0, 10, 36, 78
147*c0909341SAndroid Build Coastguard Workertbl_16x16_v: db 0, 4, 8, 12
148*c0909341SAndroid Build Coastguard Workertbl_16x16_h: db 0, 64, 128, 192
149*c0909341SAndroid Build Coastguard Worker
150*c0909341SAndroid Build Coastguard Workertbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
151*c0909341SAndroid Build Coastguard Worker
152*c0909341SAndroid Build Coastguard Workertbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
153*c0909341SAndroid Build Coastguard Worker
; tbl_32x16_2d emits no data of its own: the label aliases the start of
; tbl_32x32_2d, so both names address the same eight words.
154*c0909341SAndroid Build Coastguard Workertbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
155*c0909341SAndroid Build Coastguard Workertbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
156*c0909341SAndroid Build Coastguard Worker
; 8 rows x 2 byte entries, each written as 2*k.
; NOTE(review): presumably offsets in units of two (bytes or rows) - the
; consumer is outside this view.
157*c0909341SAndroid Build Coastguard Workertbl_Nx32_odd_offset: db 2*16, 2*23
158*c0909341SAndroid Build Coastguard Worker                     db 2*20, 2*19
159*c0909341SAndroid Build Coastguard Worker                     db 2*18, 2*21
160*c0909341SAndroid Build Coastguard Worker                     db 2*22, 2*17
161*c0909341SAndroid Build Coastguard Worker                     db 2*30, 2*25
162*c0909341SAndroid Build Coastguard Worker                     db 2*26, 2*29
163*c0909341SAndroid Build Coastguard Worker                     db 2*28, 2*27
164*c0909341SAndroid Build Coastguard Worker                     db 2*24, 2*31
165*c0909341SAndroid Build Coastguard Worker
; 8 rows x 4 byte entries, each written as 2*k (same caveat as above).
166*c0909341SAndroid Build Coastguard Workertbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
167*c0909341SAndroid Build Coastguard Worker                 db 2* 8, 2*40, 2*23, 2*38
168*c0909341SAndroid Build Coastguard Worker                 db 2* 1, 2*36, 2*20, 2*42
169*c0909341SAndroid Build Coastguard Worker                 db 2* 9, 2*44, 2*19, 2*34
170*c0909341SAndroid Build Coastguard Worker                 db 2* 2, 2*60, 2*18, 2*50
171*c0909341SAndroid Build Coastguard Worker                 db 2*10, 2*52, 2*21, 2*58
172*c0909341SAndroid Build Coastguard Worker                 db 2* 3, 2*56, 2*22, 2*54
173*c0909341SAndroid Build Coastguard Worker                 db 2*11, 2*48, 2*17, 2*62
174*c0909341SAndroid Build Coastguard Worker
175*c0909341SAndroid Build Coastguard WorkerSECTION .text
176*c0909341SAndroid Build Coastguard Worker
; Symbol-name helpers.
;   m_suffix(x, sfx)  mangled form of <private_prefix>_<x><sfx>
;   m(x)              the same, using whatever SUFFIX is currently defined
177*c0909341SAndroid Build Coastguard Worker%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
178*c0909341SAndroid Build Coastguard Worker%define m(x) m_suffix(x, SUFFIX)
179*c0909341SAndroid Build Coastguard Worker
180*c0909341SAndroid Build Coastguard Worker; This refers to the first function in itx_sse i.e. the start of the text section
181*c0909341SAndroid Build Coastguard Worker; which is needed as a base pointer for constants.
182*c0909341SAndroid Build Coastguard Worker%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
183*c0909341SAndroid Build Coastguard Worker
; o(x): operand expression used to address the constant x.
;   x86-64: the bare symbol.
;   x86-32: x expressed relative to the start of the current section and
;           added to r6; INV_TXFM_FN loads r6 with that section start before
;           any o() operand is used.
184*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
185*c0909341SAndroid Build Coastguard Worker%define o(x) x
186*c0909341SAndroid Build Coastguard Worker%else
187*c0909341SAndroid Build Coastguard Worker%define o(x) r6-$$+x ; PIC
188*c0909341SAndroid Build Coastguard Worker%endif
189*c0909341SAndroid Build Coastguard Worker
; IWHT4_1D
; One 4-point 1-D pass of the inverse WHT used by
; inv_txfm_add_wht_wht_4x4_16bpc, applied to every dword lane independently.
;   in:      m0..m3 = in0..in3
;   out:     m0 = out0, m5 = out1, m2 = out2, m4 = out3
;   clobber: m4, m5 (they carry results); m1 and m3 are only read and still
;            hold in1 / in3 afterwards
; Takes no arguments; the register assignment is fixed.
190*c0909341SAndroid Build Coastguard Worker%macro IWHT4_1D 0
191*c0909341SAndroid Build Coastguard Worker    ; m0 = in0,  m1 = in1,  m2 = in2,  m3 = in3
192*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1      ; in0 += in1
193*c0909341SAndroid Build Coastguard Worker    psubd                m4, m2, m3  ; tmp0 = in2 - in3
194*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m4  ; tmp1 = (in0 - tmp0) >> 1
195*c0909341SAndroid Build Coastguard Worker    psrad                m5, 1
196*c0909341SAndroid Build Coastguard Worker    psubd                m2, m5, m1  ; in2 = tmp1 - in1
197*c0909341SAndroid Build Coastguard Worker    psubd                m5, m3      ; in1 = tmp1 - in3
198*c0909341SAndroid Build Coastguard Worker    psubd                m0, m5      ; in0 -= in1
199*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2      ; in3 = tmp0 + in2
200*c0909341SAndroid Build Coastguard Worker    ; m0 = out0,  m1 = in1,  m2 = out2,  m3 = in3
201*c0909341SAndroid Build Coastguard Worker    ; m4 = out3,  m5 = out1
202*c0909341SAndroid Build Coastguard Worker%endmacro
203*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_wht_wht_4x4_16bpc(dst, stride, c, eob, bdmax)          [sse2]
; 4x4 inverse WHT of the dword coefficients at c, added to the 4x4 block of
; 16-bit pixels at dst.
;   1. load four 16-byte coefficient rows and arithmetic-shift them right by 2
;   2. IWHT4_1D, 4x4 dword transpose, IWHT4_1D
;   3. pack the results to words with signed saturation and zero the 64
;      coefficient bytes
;   4. saturating-add to the destination rows, clamp to [0, bdmax], store
; cglobal "3, 3, 6": three arguments loaded into registers, three GPRs and six
; XMM registers used. eob is never referenced; bdmax is read via bdmaxm.
204*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse2
205*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
206*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
207*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
208*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+16*2]
209*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+16*3]
210*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0, m1, m2, m3
211*c0909341SAndroid Build Coastguard Worker    IWHT4_1D
    ; first pass left: m0 = out0, m5 = out1, m2 = out2, m4 = out3
    ; 4x4 dword transpose back into m0..m3 for the second pass
212*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m0, m5
213*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m5
214*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m2, m4
215*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m4
216*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m5
217*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m5
218*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m3, m2
219*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m2
220*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
221*c0909341SAndroid Build Coastguard Worker    IWHT4_1D
222*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4 ; low: out0 (m0),  high: out3 (m4)
223*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m5 ; low: out2,  high: out1
    ; m4 = 0: used to clear the coefficients and as the lower clamp bound
224*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
225*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m4
226*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m4
227*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*2], m4
228*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*3], m4
    ; r2 (the register that held c) is reused as dst + 2*stride.
    ; Destination rows are gathered in the same low/high order as the packed
    ; results: m1 = row0 | row3, m3 = row2 | row1.
229*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+strideq*2]
230*c0909341SAndroid Build Coastguard Worker    movq                 m1, [dstq+strideq*0]
231*c0909341SAndroid Build Coastguard Worker    movhps               m1, [r2  +strideq*1]
232*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r2  +strideq*0]
233*c0909341SAndroid Build Coastguard Worker    movhps               m3, [dstq+strideq*1]
    ; m5 = low word of bdmax replicated into all eight words (upper clamp bound)
234*c0909341SAndroid Build Coastguard Worker    movd                 m5, bdmaxm
235*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q0000  ; broadcast
236*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m5         ; broadcast
237*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1
238*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3
239*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m4
240*c0909341SAndroid Build Coastguard Worker    pmaxsw               m2, m4
241*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5
242*c0909341SAndroid Build Coastguard Worker    pminsw               m2, m5
243*c0909341SAndroid Build Coastguard Worker    movhps [r2  +strideq*1], m0 ; write out3 (row 3)
244*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m2 ; write out1
245*c0909341SAndroid Build Coastguard Worker    movq   [r2  +strideq*0], m2 ; write out2
246*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0 ; write out0 (row 0)
247*c0909341SAndroid Build Coastguard Worker    RET
248*c0909341SAndroid Build Coastguard Worker
249*c0909341SAndroid Build Coastguard Worker; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
250*c0909341SAndroid Build Coastguard Worker; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
251*c0909341SAndroid Build Coastguard Worker; flags: 2 = inv_dst1, 4 = inv_dst2
252*c0909341SAndroid Build Coastguard Worker; skip round/shift if rnd is not a number
; Usage notes:
;  - dst/src, tmp and rnd arguments are XMM register indices; the results
;    overwrite the two dst/src registers. All three tmp registers may be
;    clobbered.
;  - a coefficient smaller than 32 is taken as the index of a register that
;    already holds the multiplier; any other value N is loaded from the
;    constant pd_N.
;  - with an inversion flag set the result is rnd - (sum) instead of
;    (sum) + rnd, and rnd is then used as a register unconditionally.
;  - when both coefficient arguments are textually identical only one pair
;    of multiplies is emitted.
;  - the multiplies use pmulld, so callers must assemble for SSE4.1 or newer.
; NOTE(review): in the identical-coefficient case tmp[2] is never written, so
; combining that case with flag 4 would read an uninitialised register; no
; such use is visible here - confirm none exists elsewhere.
253*c0909341SAndroid Build Coastguard Worker%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
254*c0909341SAndroid Build Coastguard Worker; %1 dst/src[1]
255*c0909341SAndroid Build Coastguard Worker; %2 dst/src[2]
256*c0909341SAndroid Build Coastguard Worker; %3 tmp[1]
257*c0909341SAndroid Build Coastguard Worker; %4 tmp[2]
258*c0909341SAndroid Build Coastguard Worker; %5 tmp[3]
259*c0909341SAndroid Build Coastguard Worker; %6 rnd
260*c0909341SAndroid Build Coastguard Worker; %7 coef[1]
261*c0909341SAndroid Build Coastguard Worker; %8 coef[2]
262*c0909341SAndroid Build Coastguard Worker; %9 flags
    ; tmp[2] = src1 * coef[2], tmp[1] = src2 * coef[2]
263*c0909341SAndroid Build Coastguard Worker%ifnidn %7,%8   ; optimize when coef1 == coef2
264*c0909341SAndroid Build Coastguard Worker%if %8 < 32
265*c0909341SAndroid Build Coastguard Worker    pmulld              m%4, m%1, m%8
266*c0909341SAndroid Build Coastguard Worker    pmulld              m%3, m%2, m%8
267*c0909341SAndroid Build Coastguard Worker%else
268*c0909341SAndroid Build Coastguard Worker    mova                m%3, [o(pd_%8)]
269*c0909341SAndroid Build Coastguard Worker    pmulld              m%4, m%1, m%3
270*c0909341SAndroid Build Coastguard Worker    pmulld              m%3, m%2
271*c0909341SAndroid Build Coastguard Worker%endif
272*c0909341SAndroid Build Coastguard Worker%endif
    ; src1 *= coef[1], src2 *= coef[1] (in place)
273*c0909341SAndroid Build Coastguard Worker%if %7 < 32
274*c0909341SAndroid Build Coastguard Worker    pmulld              m%1, m%7
275*c0909341SAndroid Build Coastguard Worker    pmulld              m%2, m%7
276*c0909341SAndroid Build Coastguard Worker%else
277*c0909341SAndroid Build Coastguard Worker    mova                m%5, [o(pd_%7)]
278*c0909341SAndroid Build Coastguard Worker    pmulld              m%1, m%5
279*c0909341SAndroid Build Coastguard Worker    pmulld              m%2, m%5
280*c0909341SAndroid Build Coastguard Worker%endif
    ; dst2: sum of the two cross products, plus rnd (or rnd minus the sum
    ; when inverted)
281*c0909341SAndroid Build Coastguard Worker%if %9 & 4  ; invert dst2
282*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%2
283*c0909341SAndroid Build Coastguard Worker    psubd               m%2, m%6, m%4
284*c0909341SAndroid Build Coastguard Worker%else
285*c0909341SAndroid Build Coastguard Worker%ifnum %6
286*c0909341SAndroid Build Coastguard Worker%ifnidn %7,%8
287*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%6
288*c0909341SAndroid Build Coastguard Worker%else
    ; identical coefficients: rnd is added once to src1*coef and is thereby
    ; carried into both results below
289*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%6
290*c0909341SAndroid Build Coastguard Worker%endif
291*c0909341SAndroid Build Coastguard Worker%endif
292*c0909341SAndroid Build Coastguard Worker%ifnidn %7,%8
293*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%4
294*c0909341SAndroid Build Coastguard Worker%else
    ; keep src2*coef in tmp[1] for the dst1 subtraction
295*c0909341SAndroid Build Coastguard Worker    mova                m%3, m%2
296*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%1
297*c0909341SAndroid Build Coastguard Worker%endif
298*c0909341SAndroid Build Coastguard Worker%endif
    ; dst1: difference of the two direct products, plus rnd (or rnd minus
    ; the difference when inverted)
299*c0909341SAndroid Build Coastguard Worker%if %9 & 2  ; invert dst1
300*c0909341SAndroid Build Coastguard Worker    psubd               m%3, m%1
301*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3, m%6
302*c0909341SAndroid Build Coastguard Worker%else
303*c0909341SAndroid Build Coastguard Worker%ifnum %6
304*c0909341SAndroid Build Coastguard Worker%ifnidn %7,%8
305*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%6
306*c0909341SAndroid Build Coastguard Worker%endif
307*c0909341SAndroid Build Coastguard Worker%endif
308*c0909341SAndroid Build Coastguard Worker    psubd               m%1, m%3
309*c0909341SAndroid Build Coastguard Worker%endif
    ; final arithmetic shift, emitted only when rnd was given as a number
310*c0909341SAndroid Build Coastguard Worker%ifnum %6
311*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 12
312*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 12
313*c0909341SAndroid Build Coastguard Worker%endif
314*c0909341SAndroid Build Coastguard Worker%endmacro
315*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_FN type1, type2, eob_offset, size [, xmm count / stack ...]
; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_16bpc
; (arguments dst, stride, c, eob; tx2 is a scratch pointer) and dispatches to
; the first-pass routine i<type1>_<size>_internal_16bpc with
;   tx2q = address of i<type2>_<size>_internal_16bpc.pass2
;   eob_offset numeric and non-zero : added to eobd
;   eob_offset not numeric          : treated as a symbol, address put in r5
;   r6 (x86-32 only)                : section base used by the o() macro
; For dct_dct with eob == 0 control continues at the end of the macro, where
; the invoking macro is expected to place its dc-only code.
; The remaining arguments (default 8) are forwarded to cglobal.
316*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
317*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
318*c0909341SAndroid Build Coastguard Worker    %define %%p1 m(i%1_%4_internal_16bpc)
319*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
320*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
321*c0909341SAndroid Build Coastguard Worker%endif
    ; An epilogue is required: the first pass has to be called so that
    ; control comes back here for RET.
322*c0909341SAndroid Build Coastguard Worker%if has_epilogue
323*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
324*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
325*c0909341SAndroid Build Coastguard Worker    jz %%end
326*c0909341SAndroid Build Coastguard Worker%endif
327*c0909341SAndroid Build Coastguard Worker    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
328*c0909341SAndroid Build Coastguard Worker%ifnum %3
329*c0909341SAndroid Build Coastguard Worker%if %3
330*c0909341SAndroid Build Coastguard Worker    add                eobd, %3
331*c0909341SAndroid Build Coastguard Worker%endif
332*c0909341SAndroid Build Coastguard Worker%else
333*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(%3)]
334*c0909341SAndroid Build Coastguard Worker%endif
335*c0909341SAndroid Build Coastguard Worker    call %%p1
336*c0909341SAndroid Build Coastguard Worker    RET
337*c0909341SAndroid Build Coastguard Worker%%end:
338*c0909341SAndroid Build Coastguard Worker%else
339*c0909341SAndroid Build Coastguard Worker    ; Jump to the 1st txfm function if we're not taking the fast path, which
340*c0909341SAndroid Build Coastguard Worker    ; in turn performs an indirect jump to the 2nd txfm function.
341*c0909341SAndroid Build Coastguard Worker    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
342*c0909341SAndroid Build Coastguard Worker%ifnum %3
343*c0909341SAndroid Build Coastguard Worker%if %3
344*c0909341SAndroid Build Coastguard Worker    add                eobd, %3
345*c0909341SAndroid Build Coastguard Worker%endif
346*c0909341SAndroid Build Coastguard Worker%else
347*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(%3)]
348*c0909341SAndroid Build Coastguard Worker%endif
    ; NOTE(review): in this branch a numeric eob_offset has already been added
    ; when eob is tested, whereas the epilogue branch above tests eob first.
    ; The two agree only for eob_offset == 0 or a symbol - confirm that no
    ; dct_dct instantiation reaches this branch with a non-zero numeric offset.
349*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
350*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
351*c0909341SAndroid Build Coastguard Worker    jnz %%p1
352*c0909341SAndroid Build Coastguard Worker%else
353*c0909341SAndroid Build Coastguard Worker    ; jump to the 1st txfm function unless it's located directly after this
    ; (the repeat count is bit 31 of the label difference: 1 when the first
    ; pass lies further ahead, 0 when it starts exactly at the end label)
354*c0909341SAndroid Build Coastguard Worker    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
355*c0909341SAndroid Build Coastguard WorkerALIGN function_align
356*c0909341SAndroid Build Coastguard Worker%%end:
357*c0909341SAndroid Build Coastguard Worker%endif
358*c0909341SAndroid Build Coastguard Worker%endif
359*c0909341SAndroid Build Coastguard Worker%endmacro
360*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X4_FN type1, type2
; Emits the 4x4 entry point through INV_TXFM_FN (eob_offset 0, default
; register count). For dct_dct it also appends the dc-only path that
; INV_TXFM_FN falls into when eob == 0:
;   r5d = ((c[0] * 181 + 128) >> 8) * 2896 + 34816
;   every pixel += bits 16..31 of r5d, then clamped to [0, pixel_10bpc_max]
; Two rows of four 16-bit pixels are processed per loop iteration; r3d holds
; the number of rows still to do.
; NOTE(review): .dconly and .dconly2 are separate labels, presumably so other
; block sizes can jump in with their own r5d / r3d - confirm at the users.
361*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X4_FN 2 ; type1, type2
362*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 0, 4x4
363*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
364*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
    ; eobd is zero on this path, so this clears the dc coefficient
365*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
366*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 4
367*c0909341SAndroid Build Coastguard Worker.dconly:
368*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
369*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
370*c0909341SAndroid Build Coastguard Worker.dconly2:
371*c0909341SAndroid Build Coastguard Worker    imul                r5d, 2896
372*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(pixel_10bpc_max)]
373*c0909341SAndroid Build Coastguard Worker    add                 r5d, 34816
374*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5d
    ; replicate word 1 (the upper half of r5d) into all eight words of m0
375*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1111
376*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
377*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
378*c0909341SAndroid Build Coastguard Worker.dconly_loop:
379*c0909341SAndroid Build Coastguard Worker    movq                 m1, [dstq+strideq*0]
380*c0909341SAndroid Build Coastguard Worker    movhps               m1, [dstq+strideq*1]
381*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
    ; clamp: upper bound first, then lower bound 0
382*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m2
383*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m3
384*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m1
385*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m1
386*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
387*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 2
388*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
389*c0909341SAndroid Build Coastguard Worker    RET
390*c0909341SAndroid Build Coastguard Worker%endif
391*c0909341SAndroid Build Coastguard Worker%endmacro
392*c0909341SAndroid Build Coastguard Worker
; IDCT4_1D src1, src2, src3, src4, tmp1, tmp2, tmp3, rnd
; 4-point 1-D inverse DCT on dword lanes, built from two ITX_MULSUB_2D
; rotations (src1/src3 with 2896/2896, src2/src4 with 1567/3784) followed by
; an add/sub butterfly. All arguments are XMM register indices; rnd is the
; register holding the rounding term and is passed straight through to
; ITX_MULSUB_2D. Results are left in src1, src2, src3 and tmp1 as listed at
; the end of the macro; src4, tmp2 and tmp3 are scratch.
393*c0909341SAndroid Build Coastguard Worker%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
394*c0909341SAndroid Build Coastguard Worker    ; butterfly rotation
395*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1  %3 out0
396*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2  %4 out3
397*c0909341SAndroid Build Coastguard Worker    ; Hadamard rotation
398*c0909341SAndroid Build Coastguard Worker    psubd               m%5, m%1, m%2
399*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%1
400*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3, m%4
401*c0909341SAndroid Build Coastguard Worker    psubd               m%3, m%4
402*c0909341SAndroid Build Coastguard Worker    ; %1 (src1) = out0
403*c0909341SAndroid Build Coastguard Worker    ; %2 (src2) = out1
404*c0909341SAndroid Build Coastguard Worker    ; %3 (src3) = out3
405*c0909341SAndroid Build Coastguard Worker    ; %5 (tmp1) = out2
406*c0909341SAndroid Build Coastguard Worker%endmacro
407*c0909341SAndroid Build Coastguard Worker
; Everything below is assembled with XMM registers for the sse4 target.
; The four lines instantiate the public 4x4 entry points whose first pass is
; the dct; the second argument names the second-pass transform. The first-pass
; routine they dispatch to (idct_4x4_internal_16bpc) follows immediately.
408*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse4
409*c0909341SAndroid Build Coastguard Worker
410*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, dct
411*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, identity
412*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, adst
413*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, flipadst
414*c0909341SAndroid Build Coastguard Worker
415*c0909341SAndroid Build Coastguard Workercglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
416*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
417*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
418*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+16*2]
419*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+16*3]
420*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
421*c0909341SAndroid Build Coastguard Worker    call .pass1_main
422*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1     ; out0 out1
423*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m2     ; out2 out3
424*c0909341SAndroid Build Coastguard Worker    ; transpose
425*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m4
426*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
427*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
428*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
429*c0909341SAndroid Build Coastguard Worker    ; m0 = out0 out1
430*c0909341SAndroid Build Coastguard Worker    ; m1 = out2 out3
431*c0909341SAndroid Build Coastguard Worker    ; m5 = pd_2048
432*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
433*c0909341SAndroid Build Coastguard Worker.pass1_main:
434*c0909341SAndroid Build Coastguard Worker    IDCT4_1D              0, 1, 2, 3, 4, 6, 7, 5
435*c0909341SAndroid Build Coastguard Worker    ret
436*c0909341SAndroid Build Coastguard Worker.pass2:
437*c0909341SAndroid Build Coastguard Worker    ; m0 = in0 in1
438*c0909341SAndroid Build Coastguard Worker    ; m1 = in2 in3
439*c0909341SAndroid Build Coastguard Worker    ; m5 = pd_2048
440*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m0
441*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0
442*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m2, [o(pw_m3784_1567)]
443*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [o(pw_1567_3784)]
444*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1, [o(pw_m2896_2896)]
445*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, [o(pw_2896_2896)]
446*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m4, m2, m0, m1
447*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5     ; pw_2048
448*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m4, m2, m0, m1
449*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m4     ; t3 t2
450*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0     ; t0 t1
451*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1, m2 ; out0 out1
452*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2     ; out3 out2
453*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
454*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
455*c0909341SAndroid Build Coastguard Worker    movq                 m2, [dstq+strideq*0]
456*c0909341SAndroid Build Coastguard Worker    movhps               m2, [dstq+strideq*1]
457*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*2]
458*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r5  +strideq*1]
459*c0909341SAndroid Build Coastguard Worker    movhps               m3, [r5  +strideq*0]
460*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pixel_10bpc_max)]
461*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
462*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m4
463*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m4
464*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*2], m4
465*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*3], m4
466*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
467*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
468*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m4
469*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m4
470*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5
471*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m5
472*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
473*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
474*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*0], m1
475*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*1], m1
476*c0909341SAndroid Build Coastguard Worker    RET
477*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x4 entry points that use the ADST below as their first
; pass. INV_TXFM_4X4_FN is defined earlier in the file (not shown here).
478*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, dct
479*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, adst
480*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, flipadst
481*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, identity
482*c0909341SAndroid Build Coastguard Worker
; iadst_4x4_internal_16bpc
;   entry (pass 1): 1-D ADST on the 32-bit coefficients via .main, pack to
;       words with signed saturation, transpose, jump to tx2q.
;   .pass2: second 1-D ADST, delegated to the 8 bpc SSSE3 implementation
;       (it works on the packed words in m0/m1), followed by the store tail.
;   .end:   round, add to the 4x4 destination block, clamp, clear the
;       coefficient buffer, return.
;   .main:  loads T[0], T[2], T[3] from cq into m5, m1, m3 and points r3 at
;       T[1], then falls through to .main2.
;   .main2: expects m5 = T[0], m1 = T[2], m3 = T[3], r3 -> T[1].
;       Returns out0 = m0, out1 = m2, out2 = m1, out3 = m4 (rounded, >> 12)
;       and m5 = pd_2048. Overwrites m3, m6 and m7.
483*c0909341SAndroid Build Coastguard Workercglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
484*c0909341SAndroid Build Coastguard Worker    call .main
485*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2            ; out0 out1
486*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m4            ; out2 out3
487*c0909341SAndroid Build Coastguard Worker    ; transpose
488*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
489*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
490*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
491*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
492*c0909341SAndroid Build Coastguard Worker    ; m0 = out0 out1
493*c0909341SAndroid Build Coastguard Worker    ; m1 = out2 out3
494*c0909341SAndroid Build Coastguard Worker    ; m5 = pd_2048
    ; tail-jump into the second pass selected by the caller
495*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
496*c0909341SAndroid Build Coastguard Worker.pass2:
497*c0909341SAndroid Build Coastguard Worker    ; m0 = in0 in1
498*c0909341SAndroid Build Coastguard Worker    ; m1 = in2 in3
    ; On x86-32 r5 is pointed at itx8_start before calling into the 8 bpc
    ; code; presumably that is the base the 8 bpc code uses to address its
    ; constants - confirm against the 8 bpc source.
499*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
500*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
501*c0909341SAndroid Build Coastguard Worker%endif
502*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
503*c0909341SAndroid Build Coastguard Worker.end:
    ; m4 = pw_2048: pmulhrsw by 2048 is a rounding shift, (x + 8) >> 4
504*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_2048)]
    ; load the destination: m2 = row0 | row1, m3 = row2 | row3
505*c0909341SAndroid Build Coastguard Worker    movq                 m2, [dstq+strideq*0]
506*c0909341SAndroid Build Coastguard Worker    movhps               m2, [dstq+strideq*1]
507*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*2]
508*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r5  +strideq*0]
509*c0909341SAndroid Build Coastguard Worker    movhps               m3, [r5  +strideq*1]
    ; upper clamp bound; the value is defined elsewhere (presumably the
    ; 10-bit maximum, going by the name)
510*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pixel_10bpc_max)]
511*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
512*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
    ; m4 = 0: used both to clear the coefficient buffer and as lower bound
513*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
514*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m4
515*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m4
516*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*2], m4
517*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*3], m4
    ; dst += residual, then clamp to [0, max]
518*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
519*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
520*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m4
521*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m4
522*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5
523*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m5
524*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
525*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
526*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*0], m1
527*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*1], m1
528*c0909341SAndroid Build Coastguard Worker    RET
529*c0909341SAndroid Build Coastguard WorkerALIGN function_align
530*c0909341SAndroid Build Coastguard Worker.main:
    ; m1 = T[2], m3 = T[3], m5 = T[0]; T[1] stays in memory and is read
    ; through r3 near the end of .main2
531*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*2]
532*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+16*3]
533*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+16*0]
534*c0909341SAndroid Build Coastguard Worker    lea                  r3, [cq+16*1]
535*c0909341SAndroid Build Coastguard Worker.main2:
    ; Separate entry for callers that have already prepared m1/m3/m5 and r3
    ; (iadst_4x8_internal_16bpc pre-scales its inputs and enters here).
536*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_1321)]  ; SINPI_1_9
537*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(pd_2482)]  ; SINPI_2_9
538*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pd_3803)]  ; SINPI_4_9
539*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m0, m1        ; s[4] = SINPI_1_9 * T[2]
540*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m3, m6        ; s[6] = SINPI_4_9 * T[3]
541*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1            ; s[3] = SINPI_4_9 * T[2]
542*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m5            ; s[0] = SINPI_1_9 * T[0]
543*c0909341SAndroid Build Coastguard Worker    psubd                m1, m3            ; T[2] - T[3]
544*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m2            ; s[5] = SINPI_2_9 * T[3]
545*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m5            ; s[1] = SINPI_2_9 * T[0]
546*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6            ; s[0] += s[3]
547*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3            ; s[0] += s[5]
548*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_m3344)] ; -SINPI_3_9
549*c0909341SAndroid Build Coastguard Worker    psubd                m2, m4            ; s[1] -= s[4]
550*c0909341SAndroid Build Coastguard Worker    psubd                m2, m7            ; s[1] -= s[6]
551*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5            ; -b7 = (T[2] -T[3]) - T[0]
552*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m3            ; s[2]  = -SINPI_3_9 * -b7
553*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [r3]          ; -s[3] = -SINPI_3_9 * T[1]
    ; m5 (T[0]) is dead from here on and becomes the rounding constant
554*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
555*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m1        ; {s[0], s[2]} + 2048
    ; m3 holds the negated s[3], hence psubd to add it and paddd to subtract
    ; it. m4 is formed from the already-rounded m0, so x[3] needs no separate
    ; "+ 2048".
556*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0, m2        ; x[3]  = s[0] + s[1]
557*c0909341SAndroid Build Coastguard Worker    psubd                m2, m3            ; x[1]  = s[1] + s[3]
558*c0909341SAndroid Build Coastguard Worker    psubd                m0, m3            ; x[0]  = s[0] + s[3]
559*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3            ; x[3] -= s[3]
560*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5            ; x[1] + 2048
561*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m2, m1, m4
562*c0909341SAndroid Build Coastguard Worker    ret
563*c0909341SAndroid Build Coastguard Worker
564*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x4 entry points that use the flipped ADST below as their
; first pass. INV_TXFM_4X4_FN is defined earlier in the file (not shown here).
565*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, dct
566*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, adst
567*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, flipadst
568*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, identity
569*c0909341SAndroid Build Coastguard Worker
; iflipadst_4x4_internal_16bpc
;   Same arithmetic as iadst_4x4_internal_16bpc; only the data movement
;   differs so that the output order is reversed.
;   entry (pass 1): the transpose takes its operands in swapped order, which
;       leaves every transposed vector as (out3, out2, out1, out0).
;   .pass2: the destination rows are loaded and stored in reverse order, so
;       the transform output lands bottom-to-top.
570*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
571*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_16bpc).main
572*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2            ; out0 out1
573*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m4            ; out2 out3
574*c0909341SAndroid Build Coastguard Worker    ; transpose
    ; (operands swapped relative to the plain ADST transpose: this reverses
    ; the element order inside each transposed vector)
575*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
576*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
577*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
578*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
579*c0909341SAndroid Build Coastguard Worker    ; m0 = out0 out1
580*c0909341SAndroid Build Coastguard Worker    ; m1 = out2 out3
581*c0909341SAndroid Build Coastguard Worker    ; m5 = pd_2048
    ; tail-jump into the second pass selected by the caller
582*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
583*c0909341SAndroid Build Coastguard Worker.pass2:
584*c0909341SAndroid Build Coastguard Worker    ; m0 = in0 in1
585*c0909341SAndroid Build Coastguard Worker    ; m1 = in2 in3
    ; On x86-32 r5 is pointed at itx8_start before calling into the 8 bpc
    ; code; presumably that is the base the 8 bpc code uses to address its
    ; constants - confirm against the 8 bpc source.
586*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
587*c0909341SAndroid Build Coastguard Worker    lea                 r5, [o(itx8_start)]
588*c0909341SAndroid Build Coastguard Worker%endif
589*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
    ; m4 = pw_2048: pmulhrsw by 2048 is a rounding shift, (x + 8) >> 4
590*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_2048)]
    ; Load the destination in reverse: m3 = row1 | row0, m2 = row3 | row2,
    ; so that m0 pairs with rows 3/2 and m1 with rows 1/0.
591*c0909341SAndroid Build Coastguard Worker    movq                 m3, [dstq+strideq*1]
592*c0909341SAndroid Build Coastguard Worker    movhps               m3, [dstq+strideq*0]
593*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*2]
594*c0909341SAndroid Build Coastguard Worker    movq                 m2, [r5  +strideq*1]
595*c0909341SAndroid Build Coastguard Worker    movhps               m2, [r5  +strideq*0]
    ; upper clamp bound; the value is defined elsewhere (presumably the
    ; 10-bit maximum, going by the name)
596*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pixel_10bpc_max)]
597*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
598*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
    ; m4 = 0: used both to clear the coefficient buffer and as lower bound
599*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
600*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m4
601*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m4
602*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*2], m4
603*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*3], m4
    ; dst += residual, then clamp to [0, max]
604*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
605*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
606*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m4
607*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m4
608*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m5
609*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m5
    ; store with the same reversed row mapping used for the loads
610*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*0], m1
611*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*1], m1
612*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*0], m0
613*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*1], m0
614*c0909341SAndroid Build Coastguard Worker    RET
615*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x4 entry points that use the identity transform below as
; their first pass. INV_TXFM_4X4_FN is defined earlier in the file (not
; shown here).
616*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, dct
617*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, adst
618*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, flipadst
619*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, identity
620*c0909341SAndroid Build Coastguard Worker
; iidentity_4x4_internal_16bpc
;   entry (pass 1): scales every 32-bit coefficient by 5793/4096 (~sqrt(2))
;       with rounding, packs to words with signed saturation, transposes and
;       jumps to tx2q. Hand-off: m0 = out0 out1, m1 = out2 out3, m5 = pd_2048.
;   .pass2: applies the same ~sqrt(2) gain on packed words, rounds, adds to
;       the 4x4 destination block, clamps to [0, pixel_10bpc_max], clears
;       the 64-byte coefficient buffer and returns.
621*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; m0..m3 = coefficient rows * 5793 (multiplied straight from memory)
622*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_5793)]
623*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m3, [cq+16*0]
624*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m3, [cq+16*1]
625*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m3, [cq+16*2]
626*c0909341SAndroid Build Coastguard Worker    pmulld               m3,     [cq+16*3]
    ; (x + 2048) >> 12 completes the fixed-point multiply with rounding
627*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
628*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m1, m2, m3
629*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m1, m2, m3
630*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
631*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
632*c0909341SAndroid Build Coastguard Worker    ; transpose
633*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m2
634*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
635*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
636*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
637*c0909341SAndroid Build Coastguard Worker    ; m0 = out0 out1
638*c0909341SAndroid Build Coastguard Worker    ; m1 = out2 out3
639*c0909341SAndroid Build Coastguard Worker    ; m5 = pd_2048
    ; tail-jump into the second pass selected by the caller
640*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
641*c0909341SAndroid Build Coastguard Worker.pass2:
642*c0909341SAndroid Build Coastguard Worker    ; m0 = in0 in1
643*c0909341SAndroid Build Coastguard Worker    ; m1 = in2 in3
644*c0909341SAndroid Build Coastguard Worker    ; m5 = pd_2048
    ; x += pmulhrsw(x, pw_1697x8). Assuming the constant is 1697*8 as its
    ; name says, this multiplies x by about 1 + 0.4143 = sqrt(2).
645*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_1697x8)]
    ; destination loads are interleaved with the arithmetic:
    ; m2 = row0 | row1 here, m3 = row2 | row3 further down
646*c0909341SAndroid Build Coastguard Worker    movq                 m2, [dstq+strideq*0]
647*c0909341SAndroid Build Coastguard Worker    movhps               m2, [dstq+strideq*1]
648*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*2]
649*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4, m0
650*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m1
651*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3
652*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4
653*c0909341SAndroid Build Coastguard Worker    movq                 m3, [r5  +strideq*0]
654*c0909341SAndroid Build Coastguard Worker    movhps               m3, [r5  +strideq*1]
    ; upper clamp bound; the value is defined elsewhere (presumably the
    ; 10-bit maximum, going by the name)
655*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pixel_10bpc_max)]
    ; narrow the dword constant to words; pmulhrsw by 2048 is then a
    ; rounding shift, (x + 8) >> 4
656*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5 ; pw_2048
657*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
658*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
    ; m5 = 0: used both to clear the coefficient buffer and as lower bound
659*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
660*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m5
661*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m5
662*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*2], m5
663*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*3], m5
    ; dst += residual, then clamp to [0, max]
664*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
665*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
666*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m5
667*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m5
668*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m4
669*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m4
670*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
671*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
672*c0909341SAndroid Build Coastguard Worker    movq   [r5  +strideq*0], m1
673*c0909341SAndroid Build Coastguard Worker    movhps [r5  +strideq*1], m1
674*c0909341SAndroid Build Coastguard Worker    RET
675*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X8_FN type1, type2 [, eob_offset = 0]
;   Forwards its arguments to INV_TXFM_FN (defined elsewhere) with the size
;   fixed to 4x8. For the dct_dct pair only, it additionally emits a DC-only
;   shortcut that scales the first coefficient and jumps to the .dconly tail
;   of the 4x4 dct_dct function.
;   NOTE(review): the condition under which this shortcut is reached lives
;   in INV_TXFM_FN, which is outside this section - presumably eob == 0,
;   given the "; 0" remark on the store below.
676*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
677*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 4x8
678*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
    ; r5d = dc * 181; together with the "+128, >>8" below this is a rounded
    ; multiply by 181/256 (~1/sqrt(2))
679*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
    ; overwrite the coefficient with eobd, which is expected to be 0 here
680*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
    ; presumably the row count consumed by .dconly (8 rows for 4x8) -
    ; confirm against the .dconly code
681*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8
682*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
683*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
    ; second multiply by 181; no shift follows here, so any final rounding
    ; is left to the .dconly code
684*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
685*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
686*c0909341SAndroid Build Coastguard Worker%endif
687*c0909341SAndroid Build Coastguard Worker%endmacro
688*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x8 entry points that use the DCT below as their first
; pass. The optional third argument is the macro's eob_offset parameter.
689*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, dct
690*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, identity, 9
691*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, adst
692*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, flipadst
693*c0909341SAndroid Build Coastguard Worker
; idct_4x8_internal_16bpc
;   entry (pass 1): the coefficients are laid out as four 32-byte lines. Each
;       loop iteration handles one 16-byte half of every line: scale by
;       2896/4096 (~1/sqrt(2)) with rounding, run the 4-point 1-D DCT shared
;       with the 4x4 function, pack to words. The upper halves are processed
;       first and only when eob >= 13; their packed results are parked at
;       [cq+32*0+16] and [cq+32*1+16]. Both halves are then transposed into
;       m0-m3 and control jumps to tx2q.
;   .pass2: 8-point 1-D DCT from the 8 bpc SSSE3 implementation.
;   .end:   shared store tail (iadst_4x8_internal_16bpc jumps here as well).
;       Expects m0-m3 = the eight output rows in order, two per register,
;       and m4 = word multiplier for the final pmulhrsw.
694*c0909341SAndroid Build Coastguard Workercglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; NOTE(review): presumably removes a `cmp` helper macro so that the
    ; plain instruction is emitted; the x86-32 path below depends on the
    ; carry flag produced by a real cmp.
695*c0909341SAndroid Build Coastguard Worker%undef cmp
696*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
    ; r5d = (eob >= 13) ? 1 : 0, computed with setge on x86-64 and with
    ; "1 - borrow" on x86-32
697*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
698*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
699*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 13
700*c0909341SAndroid Build Coastguard Worker    setge               r5b
701*c0909341SAndroid Build Coastguard Worker%else
702*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 1
703*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 13
704*c0909341SAndroid Build Coastguard Worker    sbb                 r5d, 0
705*c0909341SAndroid Build Coastguard Worker%endif
    ; r5 = byte offset of the half to process first: 16 (upper) or 0 (lower)
706*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
707*c0909341SAndroid Build Coastguard Worker.loop_pass1:
    ; pre-scale: x = (x * 2896 + 2048) >> 12
708*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2896)]
709*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m3, [cq+32*0+r5]
710*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m3, [cq+32*1+r5]
711*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m3, [cq+32*2+r5]
712*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [cq+32*3+r5]
713*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m1, m2, m3
714*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m1, m2, m3
715*c0909341SAndroid Build Coastguard Worker    call m(idct_4x4_internal_16bpc).pass1_main
716*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1     ; out0 out1
717*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m2     ; out2 out3
    ; r5 == 0 means the lower half was just processed: done
718*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
719*c0909341SAndroid Build Coastguard Worker    jz .end_pass1
    ; park the upper-half results in the coefficient buffer and go round
    ; again for the lower half
720*c0909341SAndroid Build Coastguard Worker    mova       [cq+32*0+16], m0
721*c0909341SAndroid Build Coastguard Worker    mova       [cq+32*1+16], m4
722*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
723*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
724*c0909341SAndroid Build Coastguard Worker.end_pass1:
    ; transpose the lower half into m0/m1
725*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m4
726*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
727*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
728*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
    ; Reload the parked upper half and transpose it into m2/m3.
    ; NOTE(review): when eob < 13 nothing was parked and these loads return
    ; the untouched input coefficients - presumably all zero in that case.
729*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*0+16]
730*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*1+16]
731*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m6
732*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6
733*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m4
734*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4
735*c0909341SAndroid Build Coastguard Worker    ; m0-3 = packed & transposed output
736*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
737*c0909341SAndroid Build Coastguard Worker.pass2:
    ; On x86-32 r5 is pointed at itx8_start before calling into the 8 bpc
    ; code; presumably that is the base the 8 bpc code uses to address its
    ; constants - confirm against the 8 bpc source.
738*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
739*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
740*c0909341SAndroid Build Coastguard Worker%endif
741*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
742*c0909341SAndroid Build Coastguard Worker    ; m0-3 is now out0/1,3/2,4/5,7/6
743*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_2048)]
    ; swap the 64-bit halves of m1 and m3 to obtain 2/3 and 6/7
744*c0909341SAndroid Build Coastguard Worker    shufps               m1, m1, q1032
745*c0909341SAndroid Build Coastguard Worker    shufps               m3, m3, q1032
746*c0909341SAndroid Build Coastguard Worker.end:
    ; final rounding: with m4 = 2048 this is (x + 8) >> 4
747*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    ; m4 = 0: clears all 128 bytes of coefficients and is the lower clamp bound
748*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
749*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
750*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
    ; r2 is the register behind cq (third named argument); the buffer has
    ; just been cleared, so it is reused to hold stride*3
751*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
    ; rows 0-3: m5 = row0 | row1, m6 = row2 | row3
752*c0909341SAndroid Build Coastguard Worker    movq                 m5, [dstq+strideq*0]
753*c0909341SAndroid Build Coastguard Worker    movq                 m6, [dstq+strideq*2]
754*c0909341SAndroid Build Coastguard Worker    movhps               m5, [dstq+strideq*1]
755*c0909341SAndroid Build Coastguard Worker    movhps               m6, [dstq+r2]
    ; r4 (the register behind tx2) is free at this point; it now addresses
    ; rows 4-7
756*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*4]
757*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
758*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
759*c0909341SAndroid Build Coastguard Worker    movq                 m5, [r4+strideq*0]
760*c0909341SAndroid Build Coastguard Worker    movq                 m6, [r4+strideq*2]
761*c0909341SAndroid Build Coastguard Worker    movhps               m5, [r4+strideq*1]
762*c0909341SAndroid Build Coastguard Worker    movhps               m6, [r4+r2]
763*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
764*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
    ; Clamp to [0, pixel_10bpc_max]. The min is applied before the max here,
    ; which gives the same result as max-then-min because the lower bound is 0.
765*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m7}, m0, m1, m2, m3
766*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
767*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
768*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
769*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m1
770*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r2       ], m1
771*c0909341SAndroid Build Coastguard Worker    movq   [r4  +strideq*0], m2
772*c0909341SAndroid Build Coastguard Worker    movhps [r4  +strideq*1], m2
773*c0909341SAndroid Build Coastguard Worker    movq   [r4  +strideq*2], m3
774*c0909341SAndroid Build Coastguard Worker    movhps [r4  +r2       ], m3
775*c0909341SAndroid Build Coastguard Worker    RET
776*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x8 entry points that use the ADST below as their first
; pass. The optional third argument is the macro's eob_offset parameter.
777*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, dct
778*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, adst
779*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, flipadst
780*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, identity, 9
781*c0909341SAndroid Build Coastguard Worker
; iadst_4x8_internal_16bpc
;   entry (pass 1): .pass1_main computes the 4-point ADST for both 16-byte
;       halves of the four 32-byte coefficient lines; the entry code then
;       transposes both halves into m0-m3 and jumps to tx2q.
;   .pass1_main: also called by iflipadst_4x8_internal_16bpc. Returns the
;       packed lower-half result in m0/m1; the packed upper-half result (only
;       computed when eob >= 13) is left at [cq+32*2+16] and [cq+32*3+16].
;   .pass2: 8-point ADST from the 8 bpc SSSE3 implementation, then the store
;       tail shared with idct_4x8_internal_16bpc.
782*c0909341SAndroid Build Coastguard Workercglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
783*c0909341SAndroid Build Coastguard Worker    call .pass1_main
    ; transpose the lower half into m0/m1
784*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
785*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
786*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
787*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
    ; Reload the parked upper half and transpose it into m2/m3.
    ; NOTE(review): when eob < 13 nothing was parked and these loads return
    ; the untouched input coefficients - presumably all zero in that case.
788*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*2+16]
789*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*3+16]
790*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m6
791*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6
792*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m4
793*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4
794*c0909341SAndroid Build Coastguard Worker    ; m0-3 = packed & transposed output
795*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
796*c0909341SAndroid Build Coastguard Worker.pass1_main:
    ; NOTE(review): presumably removes a `cmp` helper macro so that the
    ; plain instruction is emitted; the x86-32 path below depends on the
    ; carry flag produced by a real cmp.
797*c0909341SAndroid Build Coastguard Worker%undef cmp
    ; r5d = (eob >= 13) ? 1 : 0, computed with setge on x86-64 and with
    ; "1 - borrow" on x86-32
798*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
799*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
800*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 13
801*c0909341SAndroid Build Coastguard Worker    setge               r5b
802*c0909341SAndroid Build Coastguard Worker%else
803*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 1
804*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 13
805*c0909341SAndroid Build Coastguard Worker    sbb                 r5d, 0
806*c0909341SAndroid Build Coastguard Worker%endif
    ; r5 = byte offset of the half to process first: 16 (upper) or 0 (lower)
807*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
    ; r3 -> scratch slot for the scaled T[1]; the 4x4 ADST core reads T[1]
    ; from [r3]. r3 is the register behind eob, which has already been
    ; consumed by the cmp above.
808*c0909341SAndroid Build Coastguard Worker    lea                  r3, [cq+32*1+16]
809*c0909341SAndroid Build Coastguard Worker.loop_pass1:
    ; pre-scale: x = (x * 2896 + 2048) >> 12, with the inputs placed where
    ; .main2 expects them: m5 = T[0], m2 = T[1], m1 = T[2], m3 = T[3]
810*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
811*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2896)]
812*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m3, [cq+32*0+r5]
813*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m3, [cq+32*1+r5]
814*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m3, [cq+32*2+r5]
815*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [cq+32*3+r5]
816*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m0}, m5, m2, m1, m3
817*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m5, m2, m1, m3
    ; spill T[1]: .main2 overwrites m2 before it needs T[1]
818*c0909341SAndroid Build Coastguard Worker    mova               [r3], m2
819*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_16bpc).main2
820*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2            ; out0 out1
821*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m4            ; out2 out3
    ; r5 == 0 means the lower half was just processed: done
822*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
823*c0909341SAndroid Build Coastguard Worker    jz .end_pass1
    ; park the upper-half results in the coefficient buffer and go round
    ; again for the lower half
824*c0909341SAndroid Build Coastguard Worker    mova       [cq+32*2+16], m0
825*c0909341SAndroid Build Coastguard Worker    mova       [cq+32*3+16], m1
826*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
827*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
828*c0909341SAndroid Build Coastguard Worker.end_pass1:
829*c0909341SAndroid Build Coastguard Worker    ret
830*c0909341SAndroid Build Coastguard Worker.pass2:
    ; swap the 64-bit halves of m0 and m1 - presumably the input layout the
    ; 8 bpc routine expects; confirm against the 8 bpc source
831*c0909341SAndroid Build Coastguard Worker    shufps               m0, m0, q1032
832*c0909341SAndroid Build Coastguard Worker    shufps               m1, m1, q1032
    ; On x86-32 r5 is pointed at itx8_start before calling into the 8 bpc
    ; code; presumably that is the base the 8 bpc code uses to address its
    ; constants - confirm against the 8 bpc source.
833*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
834*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
835*c0909341SAndroid Build Coastguard Worker%endif
836*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
    ; m4 is the pmulhrsw multiplier applied to m0-m3 at the jump target.
    ; Going by its name the constant is four +2048 words followed by four
    ; -2048 words, i.e. the rounding shift also negates the upper row of
    ; each register - TODO confirm against the constant's definition.
837*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_4x2048_4xm2048)]
838*c0909341SAndroid Build Coastguard Worker    jmp m(idct_4x8_internal_16bpc).end
839*c0909341SAndroid Build Coastguard Worker
; ----------------------------------------------------------------------------
; 4x8 inverse transform, first pass = flipadst (16 bpc).
;
; The four INV_TXFM_4X8_FN lines instantiate one wrapper per
; (flipadst, <second-pass type>) pair. The macro itself is defined outside
; this part of the file, so its expansion (and the meaning of the optional
; third argument) is not visible here.
; ----------------------------------------------------------------------------
840*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, dct
841*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, adst
842*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, flipadst
843*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, identity, 9
844*c0909341SAndroid Build Coastguard Worker
; iflipadst_4x8_internal_16bpc
;   Pass 1: run the shared iadst 4x8 pass-1 kernel, then interleave the packed
;           16-bit results into m0-m3 and tail-jump through tx2q.
;   .pass2: run the 8 bpc SSSE3 iadst 4x8 kernel, reverse the register order
;           (with the 64-bit halves swapped), then join the idct 4x8 tail.
;   NOTE(review): tx2q presumably holds the address of the selected .pass2
;   entry, set up by the wrapper macro - confirm against INV_TXFM_FN.
845*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
846*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x8_internal_16bpc).pass1_main
; Interleave the words of the two packed result registers returned in m0/m1.
; NOTE(review): the m1-before-m0 operand order (and low-before-high unpack)
; looks like what realizes the "flip" - confirm against the iadst variant.
847*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
848*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
849*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
850*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
; Fetch the two packed registers that .pass1_main parked in the coefficient
; buffer during its first loop iteration (the one with r5d != 0), and
; interleave them the same way into m2/m3.
851*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*2+16]
852*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*3+16]
853*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m2, m6
854*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m6
855*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m4
856*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4
857*c0909341SAndroid Build Coastguard Worker    ; m0-3 = packed & transposed output
858*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
859*c0909341SAndroid Build Coastguard Worker.pass2:
; Swap the low and high 64-bit halves of m0 and m1 before calling the kernel.
860*c0909341SAndroid Build Coastguard Worker    shufps               m0, m0, q1032
861*c0909341SAndroid Build Coastguard Worker    shufps               m1, m1, q1032
; NOTE(review): on x86-32, r5 is presumably the base the 8 bpc code uses for
; its o() constant addressing - confirm in the 8 bpc source.
862*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
863*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
864*c0909341SAndroid Build Coastguard Worker%endif
865*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
; Reverse the result order: new m0..m3 = old m3..m0, each with its 64-bit
; halves swapped. m4/m5 are used as temporaries for the old m0/m1.
866*c0909341SAndroid Build Coastguard Worker    mova                 m4, m0
867*c0909341SAndroid Build Coastguard Worker    mova                 m5, m1
868*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q1032
869*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1032
870*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m5, q1032
871*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m4, q1032
; m4 = multiplier handed to the shared tail (by name: 4 words of -2048
; followed by 4 words of +2048). How .end consumes it is outside this chunk.
872*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_4xm2048_4x2048)]
873*c0909341SAndroid Build Coastguard Worker    jmp m(idct_4x8_internal_16bpc).end
874*c0909341SAndroid Build Coastguard Worker
; ----------------------------------------------------------------------------
; 4x8 inverse transform, first pass = identity (16 bpc).
;
; The INV_TXFM_4X8_FN lines instantiate one wrapper per
; (identity, <second-pass type>) pair; the macro is defined outside this part
; of the file.
; ----------------------------------------------------------------------------
875*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, dct
876*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, adst
877*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, flipadst
878*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, identity, 3
879*c0909341SAndroid Build Coastguard Worker
; iidentity_4x8_internal_16bpc
;   Pass 1: for each group of four 16-byte coefficient vectors compute
;             t = (c * m4 + m5) >> 12
;             r = (t * m6 + m5) >> 12
;           with m4/m5/m6 = pd_2896/pd_2048/pd_5793, pack to 16 bit, and
;           interleave into m0-m3. Runs for byte offset 16 first (only when
;           eob >= 16) and then always for byte offset 0.
;   .pass2: load pw_4096 into m4 and join the shared idct 4x8 tail.
880*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; NOTE(review): presumably 'cmp' is wrapped by a macro in the include file and
; the plain instruction is wanted below - confirm.
881*c0909341SAndroid Build Coastguard Worker%undef cmp
882*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
883*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_2896)]
884*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pd_5793)]
885*c0909341SAndroid Build Coastguard Worker    ; clear m7 in case we skip the bottom square
886*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
; r5d = (eob >= 16) ? 1 : 0. The x86-64 path uses setge; the x86-32 path
; starts from 1 and subtracts the borrow produced by the compare.
887*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
888*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
889*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 16
890*c0909341SAndroid Build Coastguard Worker    setge               r5b
891*c0909341SAndroid Build Coastguard Worker%else
892*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 1
893*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 16
894*c0909341SAndroid Build Coastguard Worker    sbb                 r5d, 0
895*c0909341SAndroid Build Coastguard Worker%endif
; Turn the flag into a byte offset into the coefficient buffer: 16 or 0.
896*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
897*c0909341SAndroid Build Coastguard Worker.loop_pass1:
; First scaling step: multiply by m4, add the rounding term m5, shift by 12.
898*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m4, [cq+32*0+r5]
899*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m4, [cq+32*1+r5]
900*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m4, [cq+32*2+r5]
901*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m4, [cq+32*3+r5]
902*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m1, m2, m3
903*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m1, m2, m3
; Second scaling step: multiply by m6, same rounding and shift.
904*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m6}, m0, m1, m2, m3
905*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m1, m2, m3
906*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m1, m2, m3
; Narrow the 32-bit results to signed-saturated 16-bit words.
907*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
908*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
909*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
910*c0909341SAndroid Build Coastguard Worker    jz .end_pass1
; This was the offset-16 iteration: park one packed result in the coefficient
; buffer and the other in m7, then redo the loop for offset 0.
911*c0909341SAndroid Build Coastguard Worker    mova       [cq+32*0+16], m0
912*c0909341SAndroid Build Coastguard Worker    mova                 m7, m2
913*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
914*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
915*c0909341SAndroid Build Coastguard Worker.end_pass1:
; Interleave the offset-0 results into m0/m1.
916*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0, m2
917*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
918*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m4
919*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
; Interleave the offset-16 results (buffer + m7) into m2/m3. When that
; iteration was skipped, m7 is still the zero written at the top; the value
; at [cq+32*0+16] is then whatever the coefficient buffer already held.
920*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*0+16]
921*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m7
922*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m7
923*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m4
924*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4
925*c0909341SAndroid Build Coastguard Worker    ; m0-3 = packed & transposed output
926*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
927*c0909341SAndroid Build Coastguard Worker.pass2:
; The second pass only supplies the multiplier in m4; everything else is done
; by the shared tail (defined outside this chunk).
928*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_4096)]
929*c0909341SAndroid Build Coastguard Worker    jmp m(idct_4x8_internal_16bpc).end
930*c0909341SAndroid Build Coastguard Worker
; ----------------------------------------------------------------------------
; INV_TXFM_4X16_FN type1, type2 [, eob_tbl_suffix = 2d]
;   Instantiates the 4x16 wrapper for the (type1, type2) transform pair by
;   forwarding to INV_TXFM_FN with the eob table name tbl_4x16_<suffix> and
;   the size token 4x16.
;   For the dct/dct pair only, extra code is emitted after the wrapper:
;     r5d  = ([cq] * 181 + 384) >> 9      (arithmetic shift)
;     [cq] = eobd                          (0 on this path, per the comment)
;     r3d  = 16
;   followed by a jump into the 4x4 dct/dct .dconly2 tail.
;   NOTE(review): r3d = 16 is presumably the row count expected by .dconly2,
;   and this code is presumably reached only when eob == 0 - both depend on
;   code outside this chunk; confirm.
; ----------------------------------------------------------------------------
931*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
932*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, tbl_4x16_%3, 4x16
933*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
934*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
935*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
936*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16
937*c0909341SAndroid Build Coastguard Worker    add                 r5d, 384
938*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 9
939*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
940*c0909341SAndroid Build Coastguard Worker%endif
941*c0909341SAndroid Build Coastguard Worker%endmacro
942*c0909341SAndroid Build Coastguard Worker
; ----------------------------------------------------------------------------
; 4x16 inverse transform, first pass = dct (16 bpc).
; One wrapper is instantiated per (dct, <second-pass type>) pair.
; ----------------------------------------------------------------------------
943*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, dct
944*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, identity, v
945*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, adst
946*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, flipadst
947*c0909341SAndroid Build Coastguard Worker
; idct_4x16_internal_16bpc
;   Pass 1: pick the last 16-byte column group that has to be processed
;           (based on eob), then walk the groups from there down to offset 0.
;           Each group goes through the 4x4 idct pass-1 kernel, is rounded
;           with (x + 1) >> 1, packed to 16 bit and interleaved. Groups at a
;           non-zero offset are parked back in the coefficient buffer; the
;           group at offset 0 stays in m0/m1.
;   .end_pass1: reload the parked groups into m2-m7 and tail-jump via tx2q.
;           This label is also the pass-1 exit of the other 4x16 first-pass
;           variants in this file.
;   .pass2: run the 8 bpc SSSE3 idct 16x4 kernel, scale with pw_2048, and
;           fall into .end.
;   .end:   shared output tail. Expects rows 0-7 in m0-m3 and rows 8-15 in
;           [cq+16*0..3]; zeroes the coefficient buffer, adds the residual
;           to dst, clamps to [0, pixel_10bpc_max] and stores.
948*c0909341SAndroid Build Coastguard Workercglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
949*c0909341SAndroid Build Coastguard Worker%undef cmp
; On x86-32, r6 carries the PIC pointer (see "restore pic-ptr" below), so it
; is saved in the r5m slot while r6 is used as the loop counter.
950*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
951*c0909341SAndroid Build Coastguard Worker    mov                 r5m, r6d
952*c0909341SAndroid Build Coastguard Worker%endif
; Scan bytes [r5+3], [r5+2], ... and stop at the first index r6 whose entry
; is <= the low byte of eob (signed compare).
; NOTE(review): r5 presumably points at the tbl_4x16_* eob threshold table
; selected by the wrapper macro - confirm against INV_TXFM_FN.
953*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 4
954*c0909341SAndroid Build Coastguard Worker.zero_loop:
955*c0909341SAndroid Build Coastguard Worker    dec                 r6d
956*c0909341SAndroid Build Coastguard Worker    cmp                eobb, byte [r5+r6]
957*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
; r5d = index * 16 = byte offset of the first column group to process.
958*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r6d
959*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
960*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
961*c0909341SAndroid Build Coastguard Worker    ; restore pic-ptr
962*c0909341SAndroid Build Coastguard Worker    mov                  r6, r5m
963*c0909341SAndroid Build Coastguard Worker%endif
964*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
965*c0909341SAndroid Build Coastguard Worker.loop_pass1:
966*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*0+r5]
967*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*1+r5]
968*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*2+r5]
969*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*3+r5]
970*c0909341SAndroid Build Coastguard Worker    call m(idct_4x4_internal_16bpc).pass1_main
; m3 = all ones (-1 per dword), so psubd adds 1: result = (x + 1) >> 1.
971*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m3, m3
972*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m3}, m0, m1, m4, m2
973*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0, m1, m4, m2
974*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1     ; out0 out1
975*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m2     ; out2 out3
; Word-interleave the two packed registers (4x4 transpose of 16-bit values).
976*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m4
977*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
978*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
979*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
980*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
981*c0909341SAndroid Build Coastguard Worker    jz .end_pass1
; Not the last group yet: park the result and step down to the next group.
982*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*0+r5], m0
983*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*1+r5], m1
984*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
985*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
986*c0909341SAndroid Build Coastguard Worker.end_pass1:
; m0/m1 already hold the offset-0 group; reload the groups parked at offsets
; 16, 32 and 48. Groups that pass 1 skipped are read as stored in the buffer.
987*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*0+16]
988*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*1+16]
989*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*0+32]
990*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*1+32]
991*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*0+48]
992*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*1+48]
993*c0909341SAndroid Build Coastguard Worker    ; m0-7 = packed & transposed output
994*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
995*c0909341SAndroid Build Coastguard Worker.pass2:
; NOTE(review): on x86-32, r5 is presumably the base the 8 bpc code uses for
; its o() constant addressing - confirm in the 8 bpc source.
996*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
997*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
998*c0909341SAndroid Build Coastguard Worker%endif
999*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
1000*c0909341SAndroid Build Coastguard Worker    ; m0-6 is out0-13 [with odd registers having inversed output]
1001*c0909341SAndroid Build Coastguard Worker    ; [coeffq+16*7] has out15/14
; Scale every output with pmulhrsw by pw_2048; the eighth register comes from
; the coefficient buffer slot written by the kernel.
1002*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_2048)]
1003*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1004*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, [cq+16*7]
; Swap the 64-bit halves of the odd registers to undo their inverted order.
1005*c0909341SAndroid Build Coastguard Worker    REPX {shufps x, x, q1032}, m1, m3, m5, m7
; Rows 8-15 are parked in the buffer; .end picks them up on its second round.
1006*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m4
1007*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m5
1008*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*2], m6
1009*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*3], m7
1010*c0909341SAndroid Build Coastguard Worker.end:
; m4 = 0: used both to clear the coefficient buffer and as the lower clamp.
1011*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1012*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1013*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
; Two rounds of 8 rows each; r3 = 3 * stride for addressing the fourth row.
1014*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 2
1015*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1016*c0909341SAndroid Build Coastguard Worker.loop:
; Each register carries two 4-pixel rows: row n in the low 64 bits and
; row n+1 in the high 64 bits.
1017*c0909341SAndroid Build Coastguard Worker    movq                 m5, [dstq+strideq*0]
1018*c0909341SAndroid Build Coastguard Worker    movq                 m6, [dstq+strideq*2]
1019*c0909341SAndroid Build Coastguard Worker    movhps               m5, [dstq+strideq*1]
1020*c0909341SAndroid Build Coastguard Worker    movhps               m6, [dstq+r3]
; r4 = address of the fifth row of this round.
1021*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*4]
1022*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
1023*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
1024*c0909341SAndroid Build Coastguard Worker    movq                 m5, [r4+strideq*0]
1025*c0909341SAndroid Build Coastguard Worker    movq                 m6, [r4+strideq*2]
1026*c0909341SAndroid Build Coastguard Worker    movhps               m5, [r4+strideq*1]
1027*c0909341SAndroid Build Coastguard Worker    movhps               m6, [r4+r3]
1028*c0909341SAndroid Build Coastguard Worker    paddw                m2, m5
1029*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
; Clamp the sums to the range [0, m7].
1030*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m7}, m0, m1, m2, m3
1031*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
1032*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
1033*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
1034*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*2], m1
1035*c0909341SAndroid Build Coastguard Worker    movhps [dstq+r3       ], m1
1036*c0909341SAndroid Build Coastguard Worker    movq   [r4  +strideq*0], m2
1037*c0909341SAndroid Build Coastguard Worker    movhps [r4  +strideq*1], m2
1038*c0909341SAndroid Build Coastguard Worker    movq   [r4  +strideq*2], m3
1039*c0909341SAndroid Build Coastguard Worker    movhps [r4  +r3       ], m3
1040*c0909341SAndroid Build Coastguard Worker    dec                 r5d
1041*c0909341SAndroid Build Coastguard Worker    jz .end2
; Second round: advance dst by 8 rows, fetch rows 8-15 from the buffer and
; clear the slots they occupied.
1042*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
1043*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*16]
1044*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*16]
1045*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*16]
1046*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*16]
1047*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
1048*c0909341SAndroid Build Coastguard Worker    jmp .loop
1049*c0909341SAndroid Build Coastguard Worker.end2:
1050*c0909341SAndroid Build Coastguard Worker    RET
1051*c0909341SAndroid Build Coastguard Worker
; ----------------------------------------------------------------------------
; 4x16 inverse transform, first pass = adst (16 bpc).
; One wrapper is instantiated per (adst, <second-pass type>) pair.
; ----------------------------------------------------------------------------
1052*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, dct
1053*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, adst
1054*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, flipadst
1055*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, identity, v
1056*c0909341SAndroid Build Coastguard Worker
; iadst_4x16_internal_16bpc
;   Pass 1: same group-selection and loop structure as the dct variant, but
;           each 16-byte column group goes through the 4x4 iadst kernel
;           (.main2). The exit is shared: the last group jumps to
;           idct_4x16_internal_16bpc.end_pass1.
;   .pass2: run the 8 bpc SSSE3 iadst 16x4 kernel, fix up signs and scale,
;           split the results into rows 0-7 (m0-m3) and rows 8-15
;           ([cq+16*0..3]) and join the shared idct 4x16 .end tail.
1057*c0909341SAndroid Build Coastguard Workercglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1058*c0909341SAndroid Build Coastguard Worker%undef cmp
; On x86-32, r6 carries the PIC pointer; save it while r6 is the loop counter.
1059*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1060*c0909341SAndroid Build Coastguard Worker    mov                 r5m, r6d
1061*c0909341SAndroid Build Coastguard Worker%endif
; Scan bytes at r5+3, r5+2, ... and stop at the first index r6 whose entry is
; <= the low byte of eob (signed compare).
; NOTE(review): r5 presumably points at the tbl_4x16_* eob threshold table
; selected by the wrapper macro - confirm against INV_TXFM_FN.
1062*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 4
1063*c0909341SAndroid Build Coastguard Worker.zero_loop:
1064*c0909341SAndroid Build Coastguard Worker    dec                 r6d
1065*c0909341SAndroid Build Coastguard Worker    cmp                eobb, byte [r6+r5]
1066*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
; r5d = index * 16 = byte offset of the first column group to process.
1067*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r6d
1068*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
1069*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1070*c0909341SAndroid Build Coastguard Worker    ; restore pic-ptr
1071*c0909341SAndroid Build Coastguard Worker    mov                  r6, r5m
1072*c0909341SAndroid Build Coastguard Worker%endif
1073*c0909341SAndroid Build Coastguard Worker.loop_pass1:
; Three of the four input vectors are loaded into registers; the remaining
; one (at 64*1) is passed by address in r3.
; NOTE(review): .main2 apparently reads that input through [r3] - confirm
; against the 4x4 iadst kernel.
1074*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*0+r5]
1075*c0909341SAndroid Build Coastguard Worker    lea                  r3, [cq+64*1+r5]
1076*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*2+r5]
1077*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*3+r5]
1078*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_16bpc).main2
; m3 = all ones (-1 per dword), so psubd adds 1: result = (x + 1) >> 1.
1079*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m3, m3
1080*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m3}, m0, m2, m1, m4
1081*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0, m2, m1, m4
1082*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2            ; out0 out1
1083*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m4            ; out2 out3
; Word-interleave the two packed registers (4x4 transpose of 16-bit values).
1084*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
1085*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1086*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
1087*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
; The offset-0 group is the last one: leave it in m0/m1 and use the shared
; pass-1 exit of the dct variant.
1088*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1089*c0909341SAndroid Build Coastguard Worker    jz m(idct_4x16_internal_16bpc).end_pass1
; Otherwise park the result in the buffer and step down to the next group.
1090*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*0+r5], m0
1091*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*1+r5], m1
1092*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
1093*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
1094*c0909341SAndroid Build Coastguard Worker.pass2:
; NOTE(review): on x86-32, r5 is presumably the base the 8 bpc code uses for
; its o() constant addressing - confirm in the 8 bpc source.
1095*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1096*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
1097*c0909341SAndroid Build Coastguard Worker%endif
1098*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
1099*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
1100*c0909341SAndroid Build Coastguard Worker    ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
1101*c0909341SAndroid Build Coastguard Worker    ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
; Scale with pmulhrsw and flip the negated halves back in the same step:
; registers whose high half is negated use m1 (+2048 low / -2048 high), the
; ones whose low half is negated use m6 (the same constant, halves swapped).
1102*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(pw_4x2048_4xm2048)]
1103*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m7, m2, m0
1104*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m1, q1032  ; 4x-2048,4x2048
1105*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, [cq+16*7]
1106*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m6}, m5, m4, m3
1107*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, [cq+16*6]
1108*c0909341SAndroid Build Coastguard Worker    ; m7/5/2/4 = out4/11,5/10,6/9,7/8
1109*c0909341SAndroid Build Coastguard Worker    ; m0/3/6/1 = out0/15,3/12,1/14,2/13
1110*c0909341SAndroid Build Coastguard Worker    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
; The high halves hold out8..15: store them to the buffer in ascending order
; (m4 = out8, m2 = out9, ..., m0 = out15), 8 bytes per row.
1111*c0909341SAndroid Build Coastguard Worker    movhps         [cq+0*8], m4
1112*c0909341SAndroid Build Coastguard Worker    movhps         [cq+1*8], m2
1113*c0909341SAndroid Build Coastguard Worker    movhps         [cq+2*8], m5
1114*c0909341SAndroid Build Coastguard Worker    movhps         [cq+3*8], m7
1115*c0909341SAndroid Build Coastguard Worker    movhps         [cq+4*8], m3
1116*c0909341SAndroid Build Coastguard Worker    movhps         [cq+5*8], m1
1117*c0909341SAndroid Build Coastguard Worker    movhps         [cq+6*8], m6
1118*c0909341SAndroid Build Coastguard Worker    movhps         [cq+7*8], m0
; The low halves hold out0..7: pair them up so that
; m0 = out0/1, m1 = out2/3, m2 = out4/5, m3 = out6/7.
1119*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m6
1120*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m3
1121*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m2, m4
1122*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m7, m5
1123*c0909341SAndroid Build Coastguard Worker    jmp m(idct_4x16_internal_16bpc).end
1124*c0909341SAndroid Build Coastguard Worker
; ----------------------------------------------------------------------------
; 4x16 inverse transform, first pass = flipadst (16 bpc).
; One wrapper is instantiated per (flipadst, <second-pass type>) pair.
; ----------------------------------------------------------------------------
1125*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, dct
1126*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, adst
1127*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, flipadst
1128*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, identity, v
1129*c0909341SAndroid Build Coastguard Worker
; iflipadst_4x16_internal_16bpc
;   Pass 1: identical to the adst variant up to the packing step; the packed
;           registers are then treated as out3/out2 and out1/out0 and are
;           interleaved with swapped operands, which reverses the output
;           order. The exit is shared with idct_4x16_internal_16bpc.
;   .pass2: same 8 bpc iadst 16x4 kernel and sign/scale fix-up as the adst
;           variant, but here the low halves hold rows 8-15 and the high
;           halves rows 0-7, so the split into buffer and registers is
;           mirrored.
1130*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1131*c0909341SAndroid Build Coastguard Worker%undef cmp
; On x86-32, r6 carries the PIC pointer; save it while r6 is the loop counter.
1132*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1133*c0909341SAndroid Build Coastguard Worker    mov                 r5m, r6d
1134*c0909341SAndroid Build Coastguard Worker%endif
; Scan bytes [r5+3], [r5+2], ... and stop at the first index r6 whose entry
; is <= the low byte of eob (signed compare).
; NOTE(review): r5 presumably points at the tbl_4x16_* eob threshold table
; selected by the wrapper macro - confirm against INV_TXFM_FN.
1135*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 4
1136*c0909341SAndroid Build Coastguard Worker.zero_loop:
1137*c0909341SAndroid Build Coastguard Worker    dec                 r6d
1138*c0909341SAndroid Build Coastguard Worker    cmp                eobb, byte [r5+r6]
1139*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
; r5d = index * 16 = byte offset of the first column group to process.
1140*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r6d
1141*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
1142*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1143*c0909341SAndroid Build Coastguard Worker    ; restore pic-ptr
1144*c0909341SAndroid Build Coastguard Worker    mov                  r6, r5m
1145*c0909341SAndroid Build Coastguard Worker%endif
1146*c0909341SAndroid Build Coastguard Worker.loop_pass1:
; Three input vectors go in registers, the one at 64*1 is passed by address.
; NOTE(review): .main2 apparently reads that input through [r3] - confirm
; against the 4x4 iadst kernel.
1147*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*0+r5]
1148*c0909341SAndroid Build Coastguard Worker    lea                  r3, [cq+64*1+r5]
1149*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*2+r5]
1150*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*3+r5]
1151*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_16bpc).main2
; m3 = all ones (-1 per dword), so psubd adds 1: result = (x + 1) >> 1.
1152*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m3, m3
1153*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m3}, m0, m2, m1, m4
1154*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0, m2, m1, m4
1155*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2            ; out3 out2
1156*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m4            ; out1 out0
; Word-interleave with m1 as the first operand and low-before-high unpacks,
; i.e. the mirror image of the adst variant's transpose.
1157*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
1158*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
1159*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
1160*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
; The offset-0 group is the last one: leave it in m0/m1 and use the shared
; pass-1 exit of the dct variant.
1161*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1162*c0909341SAndroid Build Coastguard Worker    jz m(idct_4x16_internal_16bpc).end_pass1
; Otherwise park the result in the buffer and step down to the next group.
1163*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*0+r5], m0
1164*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*1+r5], m1
1165*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
1166*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
1167*c0909341SAndroid Build Coastguard Worker.pass2:
; NOTE(review): on x86-32, r5 is presumably the base the 8 bpc code uses for
; its o() constant addressing - confirm in the 8 bpc source.
1168*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1169*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
1170*c0909341SAndroid Build Coastguard Worker%endif
1171*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
1172*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
1173*c0909341SAndroid Build Coastguard Worker    ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
1174*c0909341SAndroid Build Coastguard Worker    ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
; Scale with pmulhrsw and flip the negated halves back in the same step:
; m1 is +2048 low / -2048 high, m6 is the same constant with halves swapped.
1175*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(pw_4x2048_4xm2048)]
1176*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m7, m2, m0
1177*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m1, q1032  ; 4x-2048,4x2048
1178*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, [cq+16*7]
1179*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m6}, m5, m4, m3
1180*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, [cq+16*6]
1181*c0909341SAndroid Build Coastguard Worker    ; m7/5/2/4 = out11/4,10/5,9/6,8/7
1182*c0909341SAndroid Build Coastguard Worker    ; m0/3/6/1 = out15/0,12/3,14/1,13/2
1183*c0909341SAndroid Build Coastguard Worker    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
; The low halves hold out8..15: store them to the buffer in ascending order
; (m4 = out8, m2 = out9, ..., m0 = out15), 8 bytes per row.
1184*c0909341SAndroid Build Coastguard Worker    movq           [cq+0*8], m4
1185*c0909341SAndroid Build Coastguard Worker    movq           [cq+1*8], m2
1186*c0909341SAndroid Build Coastguard Worker    movq           [cq+2*8], m5
1187*c0909341SAndroid Build Coastguard Worker    movq           [cq+3*8], m7
1188*c0909341SAndroid Build Coastguard Worker    movq           [cq+4*8], m3
1189*c0909341SAndroid Build Coastguard Worker    movq           [cq+5*8], m1
1190*c0909341SAndroid Build Coastguard Worker    movq           [cq+6*8], m6
1191*c0909341SAndroid Build Coastguard Worker    movq           [cq+7*8], m0
; The high halves hold out0..7: pair them up so that
; m0 = out0/1, m1 = out2/3, m2 = out4/5, m3 = out6/7.
1192*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m6
1193*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m3
1194*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m2, m4
1195*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m7, m5
1196*c0909341SAndroid Build Coastguard Worker    jmp m(idct_4x16_internal_16bpc).end
1197*c0909341SAndroid Build Coastguard Worker
; ----------------------------------------------------------------------------
; 4x16 inverse transform, first pass = identity (16 bpc).
; One wrapper is instantiated per (identity, <second-pass type>) pair; the
; identity/identity pair uses the macro's default eob table suffix.
; ----------------------------------------------------------------------------
1198*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, dct, h
1199*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, adst, h
1200*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, flipadst, h
1201*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, identity
1202*c0909341SAndroid Build Coastguard Worker
; iidentity_4x16_internal_16bpc
;   Pass 1: same group-selection and loop structure as the other 4x16
;           variants; each coefficient is scaled as (c * m4 + m5) >> 13 with
;           m4 = pd_5793 and m5 = pd_6144, packed to 16 bit and interleaved.
;           The exit is shared with idct_4x16_internal_16bpc.
;   .pass2: for each of the eight input registers x (m0-m7) compute
;             y = pmulhrsw(2*x + pmulhrsw(x, pw_1697x16), pw_2048)
;           using saturating word adds. Results for the inputs m4-m7 go to
;           [cq+16*0..3]; results for the inputs m0-m3 end up in m0-m3.
;           Then join the shared idct 4x16 .end tail.
1203*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1204*c0909341SAndroid Build Coastguard Worker%undef cmp
; On x86-32, r6 carries the PIC pointer; save it while r6 is the loop counter.
1205*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1206*c0909341SAndroid Build Coastguard Worker    mov                 r5m, r6d
1207*c0909341SAndroid Build Coastguard Worker%endif
; Scan bytes [r5+3], [r5+2], ... and stop at the first index r6 whose entry
; is <= the low byte of eob (signed compare).
; NOTE(review): r5 presumably points at the tbl_4x16_* eob threshold table
; selected by the wrapper macro - confirm against INV_TXFM_FN.
1208*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 4
1209*c0909341SAndroid Build Coastguard Worker.zero_loop:
1210*c0909341SAndroid Build Coastguard Worker    dec                 r6d
1211*c0909341SAndroid Build Coastguard Worker    cmp                eobb, byte [r5+r6]
1212*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
; r5d = index * 16 = byte offset of the first column group to process.
1213*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r6d
1214*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
1215*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1216*c0909341SAndroid Build Coastguard Worker    ; restore pic-ptr
1217*c0909341SAndroid Build Coastguard Worker    mov                  r6, r5m
1218*c0909341SAndroid Build Coastguard Worker%endif
; m5 = additive term, m4 = multiplier for the pass-1 scaling below.
1219*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_6144)]
1220*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_5793)]
1221*c0909341SAndroid Build Coastguard Worker.loop_pass1:
1222*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m4, [cq+64*0+r5]
1223*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m4, [cq+64*1+r5]
1224*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m4, [cq+64*2+r5]
1225*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m4, [cq+64*3+r5]
1226*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m1, m2, m3
1227*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m0, m1, m2, m3
; Narrow to signed-saturated 16-bit words and word-interleave (transpose).
1228*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
1229*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
1230*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m2
1231*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
1232*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
1233*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
; The offset-0 group is the last one: leave it in m0/m1 and use the shared
; pass-1 exit of the dct variant.
1234*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1235*c0909341SAndroid Build Coastguard Worker    jz m(idct_4x16_internal_16bpc).end_pass1
; Otherwise park the result in the buffer and step down to the next group.
1236*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*0+r5], m0
1237*c0909341SAndroid Build Coastguard Worker    mova       [cq+64*1+r5], m1
1238*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
1239*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
1240*c0909341SAndroid Build Coastguard Worker.pass2:
; All eight registers hold inputs and two more are needed for constants, so
; spill m0, m1, m2 and m7 to buffer slots 4-7 first.
1241*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*4], m0
1242*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*5], m1
1243*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*6], m2
1244*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*7], m7
1245*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pw_1697x16)]
1246*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_2048)]
; Inputs m4, m5 -> buffer slots 0, 1.
1247*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m0, m4
1248*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m0, m5
1249*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m4, m5
1250*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m1
1251*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m2
1252*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m4, m5
1253*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m4
1254*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m5
; Inputs m6 and the spilled m7 (reloaded into m4) -> buffer slots 2, 3.
1255*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+16*7]
1256*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m0, m6
1257*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m0, m4
1258*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m6, m4
1259*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m1
1260*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m2
1261*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m6, m4
1262*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*2], m6
1263*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*3], m4
; Reload the spilled inputs: m4 = old m0, m1 = old m1, m2 = old m2.
1264*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+16*4]
1265*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*5]
1266*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+16*6]
; Inputs m2, m3 are processed in place.
1267*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m0, m2
1268*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m0, m3
1269*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m2, m3
1270*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m5
1271*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m6
; Inputs m1 and old m0 (in m4). The constant in m0 is consumed last, so the
; result for old m0 is built directly in m0.
1272*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m0, m1
1273*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
1274*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m1, m4
1275*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m6
1276*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
1277*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m2, m3, m1, m0
1278*c0909341SAndroid Build Coastguard Worker    jmp m(idct_4x16_internal_16bpc).end
1279*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X4_FN type1, type2
; Emits the 8x4 entry point for one transform-type pair by forwarding to
; INV_TXFM_FN. The trailing arguments differ per architecture (15 on x86-64;
; 8 and 0-4*16 on x86-32) -- presumably the vector-register count and the
; stack size; confirm against the INV_TXFM_FN definition.
; For the dct_dct pair only, the macro also emits a DC-only path: one offset
; is derived from the first coefficient, added to four rows of dst, and the
; result is clamped to [0, pixel_10bpc_max].
1280*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X4_FN 2 ; type1, type2
1281*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1282*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 0, 8x4, 15
1283*c0909341SAndroid Build Coastguard Worker%else
1284*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 0, 8x4, 8, 0-4*16
1285*c0909341SAndroid Build Coastguard Worker%endif
1286*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
; DC-only path. NOTE(review): how control reaches this point is decided
; inside INV_TXFM_FN, which is not visible here.
; r5d = (c[0] * 181 + 128) >> 8, then the same scale-and-round once more
; (181/256 ~= 1/sqrt(2) per step).
1287*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
; The store below writes eobd over the coefficient that was just read; the
; existing "0" note indicates eobd is zero here, so this clears c[0].
1288*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
1289*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
1290*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
1291*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
1292*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
1293*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
; Scale by 2896 and add 34816 (= 32768 + 2048); only the upper 16 bits of
; r5d are used afterwards (see the word broadcast below).
1294*c0909341SAndroid Build Coastguard Worker    imul                r5d, 2896
1295*c0909341SAndroid Build Coastguard Worker    add                 r5d, 34816
; Broadcast word 1 of m0 (the upper half of r5d) to every word lane of m0.
1296*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5d
1297*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1111
1298*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
; m6 = upper clamp bound, m5 = 0 (lower clamp bound), r2 = 3 * stride.
1299*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pixel_10bpc_max)]
1300*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
1301*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
; Load four rows of dst, add the broadcast offset, clamp, and store back.
1302*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+strideq*0]
1303*c0909341SAndroid Build Coastguard Worker    mova                 m2, [dstq+strideq*1]
1304*c0909341SAndroid Build Coastguard Worker    mova                 m3, [dstq+strideq*2]
1305*c0909341SAndroid Build Coastguard Worker    mova                 m4, [dstq+r2]
1306*c0909341SAndroid Build Coastguard Worker    REPX      {paddw x, m0}, m1, m2, m3, m4
1307*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
1308*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m6}, m1, m2, m3, m4
1309*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m1
1310*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m2
1311*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m3
1312*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r2       ], m4
1313*c0909341SAndroid Build Coastguard Worker    RET
1314*c0909341SAndroid Build Coastguard Worker%endif
1315*c0909341SAndroid Build Coastguard Worker%endmacro
1316*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x4 entry points whose first transform type is dct.
; Only the dct/dct instance also receives the DC-only path from the macro.
; NOTE(review): these presumably dispatch into idct_8x4_internal_16bpc, which
; follows immediately -- confirm in INV_TXFM_FN.
1317*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, dct
1318*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, identity
1319*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, adst
1320*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, flipadst
1321*c0909341SAndroid Build Coastguard Worker
; idct_8x4_internal_16bpc: first-pass driver for the 8x4 inverse transforms.
; r5 holds the address of the first-pass kernel to run (.main of this
; function here). Other kernels load their own address into r5 and jump to
; .pass1_entry, as iadst_8x4_internal_16bpc does.
; Flow: load eight coefficient vectors -> .rect2_mul -> kernel in r5 ->
; .transpose4x8packed -> jump to the address held in tx2q.
1322*c0909341SAndroid Build Coastguard Workercglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1323*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(.main)]
1324*c0909341SAndroid Build Coastguard Worker.pass1_entry:
1325*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: r3 points at stack scratch space that the subroutines use for
; register spills ([r3+0*16] .. [r3+3*16]).
1326*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
1327*c0909341SAndroid Build Coastguard Worker%else
; x86-64: keep the shared constants resident in m11-m14 for the subroutines:
; m11 = pd_2048 (rounding term), m12/m13 = clip_18b_min/max, m14 = pd_2896.
1328*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
1329*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
1330*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
1331*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
1332*c0909341SAndroid Build Coastguard Worker%endif
; Load the eight 16-byte coefficient vectors c[0..7] into m0..m7.
1333*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*16]
1334*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*16]
1335*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*16]
1336*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*16]
1337*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+4*16]
1338*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+5*16]
1339*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+6*16]
1340*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+7*16]
; Pre-scale all inputs, then run the selected kernel through r5.
1341*c0909341SAndroid Build Coastguard Worker    call .rect2_mul
1342*c0909341SAndroid Build Coastguard Worker    call                 r5
1343*c0909341SAndroid Build Coastguard Worker    call .transpose4x8packed
1344*c0909341SAndroid Build Coastguard Worker    ; m0-3 = packed & transposed output
; Continue at the address passed in tx2 (presumably the second-pass routine
; of the selected transform type -- confirm in INV_TXFM_FN).
1345*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; .transpose4x8packed: transposes the packed 16-bit data held in m0, m2, m4
; and m6 using three rounds of word interleaves.
; In:       m0, m2, m4, m6 (the incoming m1 and m3 are overwritten unread)
; Out:      m0..m3
; Clobbers: m4, m6
1346*c0909341SAndroid Build Coastguard Worker.transpose4x8packed:
1347*c0909341SAndroid Build Coastguard Worker    ; transpose
; Round 1: interleave m2 with m6 and m0 with m4.
1348*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m6
1349*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m6
1350*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0, m4
1351*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
1352*c0909341SAndroid Build Coastguard Worker
; Round 2: interleave the round-1 results pairwise.
1353*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
1354*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1355*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6, m2
1356*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2
1357*c0909341SAndroid Build Coastguard Worker
; Round 3: final interleave, leaving the result in m0..m3.
1358*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4
1359*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4
1360*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m6
1361*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m6
1362*c0909341SAndroid Build Coastguard Worker    ret
; .main: first-pass idct kernel (the routine the entry code loads into r5).
; Runs .main_pass1 and .round, which leave out0..out7 as 32-bit lanes in
; m0..m7, then packs adjacent register pairs to 16-bit words with signed
; saturation. Result: m0 = out0|out1, m2 = out2|out3, m4 = out4|out5,
; m6 = out6|out7.
1363*c0909341SAndroid Build Coastguard Worker.main:
1364*c0909341SAndroid Build Coastguard Worker    call .main_pass1
1365*c0909341SAndroid Build Coastguard Worker    call .round
1366*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
1367*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
1368*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
1369*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
1370*c0909341SAndroid Build Coastguard Worker    ret
; .rect2_mul: scales every 32-bit lane of m0..m7 in place:
;     x = (x * pd_2896 + pd_2048) >> 12   (arithmetic shift)
; With the values the constant names suggest this is a rounded multiply by
; 2896/4096 ~= 1/sqrt(2).
; x86-64 expects m14 = pd_2896 and m11 = pd_2048 (set at .pass1_entry).
; x86-32 expects r3 to point at one free 16-byte scratch slot.
1371*c0909341SAndroid Build Coastguard Worker.rect2_mul:
1372*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1373*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
1374*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
1375*c0909341SAndroid Build Coastguard Worker%else
; Only m0..m7 exist here, so m7 is spilled to [r3] while it temporarily holds
; each constant; the memory-operand forms then fold the spilled value back in.
1376*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
1377*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2896)]
1378*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
1379*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [r3]
1380*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
1381*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2048)]
1382*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
1383*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3]
1384*c0909341SAndroid Build Coastguard Worker%endif
1385*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
1386*c0909341SAndroid Build Coastguard Worker    ret
; First-pass 8-point inverse DCT on 32-bit lanes, with separate x86-64 and
; x86-32 implementations of the same four labels:
;   .main_pass1_fast  - reads only m0..m3 and zeroes m4; it looks like the
;                       in4..in7 == 0 specialisation of .main_pass1
;                       (NOTE(review): confirm against ITX_MULSUB_2D).
;   .main_pass1       - full version, inputs in m0..m7.
;   .main_pass1_fast2 - shared tail of both; returns to the caller.
;   .round            - final butterfly producing out0..out7 in m0..m7.
; After .main_pass1 returns, m0/m6/m5/m3 hold dct4 out0/out1/out2/out3. The
; t4 and t7 terms are in m8/m9 on x86-64, and in [r3+3*16]/[r3+1*16] on
; x86-32.
; The single `ret` after %endif terminates .round for both variants.
1387*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; ---- x86-64: constants live in m11 (pd_2048), m12/m13 (clip), m14 (pd_2896).
1388*c0909341SAndroid Build Coastguard Worker.main_pass1_fast:
; Plain multiplies replace the two-input rotations; each product pair lands
; in the registers the full path would produce.
1389*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m3, [o(pd_m2276)]
1390*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3406)]
1391*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m1, [o(pd_4017)]
1392*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [o(pd_799)]
1393*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m2, [o(pd_3784)]
1394*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_1567)]
1395*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m14
; m4 = 0 stands in for the in4 * 2896 term of the full path.
1396*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1397*c0909341SAndroid Build Coastguard Worker    jmp .main_pass1_fast2
1398*c0909341SAndroid Build Coastguard Worker.main_pass1:
; The rounding argument is `_` here; the add/shift is performed at
; .main_pass1_fast2 instead, so both paths share it.
1399*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
1400*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 7, 8, 9, 10, _,  799, 4017 ; t4a t7a
1401*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 6, 8, 9, 10, _, 1567, 3784 ; t2  t3
1402*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m0, m4
1403*c0909341SAndroid Build Coastguard Worker.main_pass1_fast2:
; Round and shift the six rotation outputs.
1404*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m1, m2, m3, m5, m6, m7
1405*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m1, m2, m3, m5, m6, m7
1406*c0909341SAndroid Build Coastguard Worker    paddd                m8, m1, m5 ; t4
1407*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5     ; t5a
1408*c0909341SAndroid Build Coastguard Worker    paddd                m9, m7, m3 ; t7
1409*c0909341SAndroid Build Coastguard Worker    psubd                m7, m3     ; t6a
; Clamp the intermediates to [m12, m13] before the next multiply.
1410*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m1, m8, m7, m9
1411*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m1, m8, m7, m9
1412*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m7, m1
; Add the rounding term once to each minuend (m0, m7), so that both the sum
; and the difference computed below carry it.
1413*c0909341SAndroid Build Coastguard Worker    paddd                m0, m11
1414*c0909341SAndroid Build Coastguard Worker    paddd                m7, m11
1415*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m4
1416*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
1417*c0909341SAndroid Build Coastguard Worker    psubd                m4, m7, m1
1418*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1
1419*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m5, m0, m4, m7
1420*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m6 ; dct4 out3
1421*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; dct4 out0
1422*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5, m2 ; dct4 out1
1423*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2     ; dct4 out2
1424*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m0, m6, m5, m3
1425*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m0, m6, m5, m3
1426*c0909341SAndroid Build Coastguard Worker    ret
1427*c0909341SAndroid Build Coastguard Worker.round:
; Combine the dct4 outputs (m0, m6, m5, m3) with m9 (t7), m7, m4 and m8 (t4)
; into out0..out7 in m0..m7.
1428*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6, m7 ; out1
1429*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7     ; out6
1430*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m9 ; out7
1431*c0909341SAndroid Build Coastguard Worker    paddd                m0, m9     ; out0
1432*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5, m4 ; out2
1433*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4     ; out5
1434*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m8 ; out4
1435*c0909341SAndroid Build Coastguard Worker    paddd                m3, m8     ; out3
1436*c0909341SAndroid Build Coastguard Worker%else
; ---- x86-32: only m0..m7 exist, so constants are reloaded on demand and
; intermediates are spilled to the four 16-byte slots at r3.
1437*c0909341SAndroid Build Coastguard Worker.main_pass1_fast:
1438*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m3, [o(pd_m2276)]
1439*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3406)]
1440*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m1, [o(pd_4017)]
1441*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [o(pd_799)]
1442*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m2, [o(pd_3784)]
1443*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_1567)]
; m4 temporarily holds the rounding constant; the m2 product is parked in
; slot 0 so that m2 can be reused for t4.
1444*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_2048)]
1445*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m2
1446*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m4}, m5, m3, m7, m1
1447*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m5, m3, m7, m1
1448*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, m5 ; t4
1449*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5     ; t5a
; m5 = m0 * pd_2896; m0 then takes over the rounding constant from m4.
1450*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m0, [o(pd_2896)]
1451*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
1452*c0909341SAndroid Build Coastguard Worker    paddd                m4, m7, m3 ; t7
1453*c0909341SAndroid Build Coastguard Worker    psubd                m7, m3     ; t6a
1454*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
1455*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m3 }, m1, m2, m7, m4
1456*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
1457*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m3 }, m1, m2, m7, m4
; Park t4 in slot 3 and t7 in slot 1 for .round, then set up the register
; state .main_pass1_fast2 expects: m4 = 0, m2 = saved product, m3 = pd_2896.
1458*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m2
1459*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m4
1460*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1461*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+0*16]
1462*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2896)]
1463*c0909341SAndroid Build Coastguard Worker    jmp .main_pass1_fast2
1464*c0909341SAndroid Build Coastguard Worker.main_pass1:
; Spill the even inputs (m0, m2, m4, m6) so their registers can serve as
; temporaries for the odd-input rotations; m0 becomes the rounding constant.
1465*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
1466*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m2
1467*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m4
1468*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m6
1469*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
1470*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
1471*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 7, 2, 4, 6, 0,  799, 4017 ; t4a t7a
1472*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, m5 ; t4
1473*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5     ; t5a
1474*c0909341SAndroid Build Coastguard Worker    paddd                m4, m7, m3 ; t7
1475*c0909341SAndroid Build Coastguard Worker    psubd                m7, m3     ; t6a
1476*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(clip_18b_min)]
1477*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m6 }, m1, m2, m7, m4
1478*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(clip_18b_max)]
1479*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m6 }, m1, m2, m7, m4
; Swap through the spill slots: reload the saved m6 and m2 inputs while
; parking t4 in slot 3 and t7 in slot 1.
1480*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+3*16]
1481*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m2
1482*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+1*16]
1483*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m4
1484*c0909341SAndroid Build Coastguard Worker
; Rounding argument is `_`: the add/shift happens at .main_pass1_fast2.
1485*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 6, 4, 3, 5, _, 1567, 3784 ; t2  t3
; Reload the saved m0 and m4 inputs into m5 and m4 and scale both by pd_2896.
1486*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2896)]
1487*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+0*16]
1488*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+2*16]
1489*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m3 }, m5, m4
1490*c0909341SAndroid Build Coastguard Worker.main_pass1_fast2:
; Entry state: m0 = pd_2048, m3 = pd_2896, m5/m4 = scaled first/fifth inputs
; (m4 = 0 on the fast path), m2/m6 = unrounded t2/t3, m1/m7 = t5a/t6a.
1491*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m0 }, m2, m6
1492*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m2, m6
1493*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m3 }, m7, m1
; Fold the rounding term into m7, then reuse m0 itself as (m5 + rounding).
1494*c0909341SAndroid Build Coastguard Worker    paddd                m7, m0
1495*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
1496*c0909341SAndroid Build Coastguard Worker
1497*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m4
1498*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
1499*c0909341SAndroid Build Coastguard Worker    psubd                m4, m7, m1
1500*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1
1501*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m5, m0, m4, m7
1502*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m6 ; dct4 out3
1503*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; dct4 out0
1504*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5, m2 ; dct4 out1
1505*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2     ; dct4 out2
1506*c0909341SAndroid Build Coastguard Worker
1507*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(clip_18b_min)]
1508*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m1 }, m0, m6, m5, m3
1509*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(clip_18b_max)]
1510*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m1 }, m0, m6, m5, m3
1511*c0909341SAndroid Build Coastguard Worker    ret
1512*c0909341SAndroid Build Coastguard Worker.round:
; Same butterfly as the x86-64 variant, with t7 and t4 fetched from slots 1
; and 3. out6 is parked in slot 0 while m6 serves as the temporary, and is
; restored at the end.
1513*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6, m7 ; out1
1514*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7     ; out6
1515*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m6
1516*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+1*16]
1517*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m6 ; out7
1518*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; out0
1519*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5, m4 ; out2
1520*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4     ; out5
1521*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+3*16]
1522*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m6 ; out4
1523*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6     ; out3
1524*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+0*16]
1525*c0909341SAndroid Build Coastguard Worker%endif
; Shared return for .round on both architectures.
1526*c0909341SAndroid Build Coastguard Worker    ret
1527*c0909341SAndroid Build Coastguard Worker
; .pass2: second pass. Reuses the 8 bpc SSSE3 8x4 idct kernel on the packed
; data, then falls through to .end.
; .end: rounds, adds the result to dst with clamping, clears the coefficient
; buffer and returns. Also used as a jump target from outside this function
; (the routine just above jumps to idct_4x16_internal_16bpc's .end in the
; same style).
1528*c0909341SAndroid Build Coastguard Worker.pass2:
1529*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; NOTE(review): r5 = itx8_start is presumably the base address that the 8 bpc
; code's o() addressing expects on x86-32 -- confirm in the 8 bpc source.
1530*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
1531*c0909341SAndroid Build Coastguard Worker%endif
1532*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
1533*c0909341SAndroid Build Coastguard Worker.end:
; r3 = 3 * stride, needed by the write helper to address the fourth row.
1534*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1535*c0909341SAndroid Build Coastguard Worker    call .round2_and_write_8x4
; m6 was zeroed inside .round2_and_write_8x4; use it to clear the eight
; 16-byte coefficient vectors at cq.
1536*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
1537*c0909341SAndroid Build Coastguard Worker    RET
; Output helpers with three fall-through entry points. All of them add m0..m3
; to four rows of dst, clamp to [m6, m5] and store. r3 must hold 3 * stride.
;   .round2_and_write_8x4 - sets m6 = 0, m5 = pixel_10bpc_max, m4 = pw_2048,
;                           then rounds and writes.
;   .round1_and_write_8x4 - caller supplies m4 (pmulhrsw multiplier), m5
;                           (upper bound) and m6 (lower bound).
;   .write_8x4            - caller supplies m5/m6; no rounding step.
; Clobbers m0..m3 (they hold the stored pixel rows on return).
1538*c0909341SAndroid Build Coastguard Worker.round2_and_write_8x4:
1539*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
1540*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pixel_10bpc_max)]
1541*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_2048)]
1542*c0909341SAndroid Build Coastguard Worker.round1_and_write_8x4:
; pmulhrsw computes (x * m4 + 0x4000) >> 15 per word; with a multiplier of
; 2048 (as the pw_2048 name suggests) that equals (x + 8) >> 4.
1543*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
1544*c0909341SAndroid Build Coastguard Worker.write_8x4:
1545*c0909341SAndroid Build Coastguard Worker    paddw                m0, [dstq+strideq*0]
1546*c0909341SAndroid Build Coastguard Worker    paddw                m1, [dstq+strideq*1]
1547*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*2]
1548*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+r3]
; Clamp against the upper bound first, then the lower bound.
1549*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m5}, m0, m1, m2, m3
1550*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
1551*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
1552*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
1553*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
1554*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r3       ], m3
1555*c0909341SAndroid Build Coastguard Worker    ret
1556*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x4 entry points whose first transform type is adst.
; None of these matches dct_dct, so the macro emits no DC-only path for them.
; NOTE(review): these presumably dispatch into iadst_8x4_internal_16bpc, which
; follows immediately -- confirm in INV_TXFM_FN.
1557*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, dct
1558*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, adst
1559*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, flipadst
1560*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, identity
1561*c0909341SAndroid Build Coastguard Worker
1562*c0909341SAndroid Build Coastguard Workercglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1563*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(.main)]
1564*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).pass1_entry
1565*c0909341SAndroid Build Coastguard Worker.main:
1566*c0909341SAndroid Build Coastguard Worker    call .main_pass1
1567*c0909341SAndroid Build Coastguard Worker    call .round
1568*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
1569*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
1570*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
1571*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
1572*c0909341SAndroid Build Coastguard Worker    ret
1573*c0909341SAndroid Build Coastguard Worker.main_pass1:
1574*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1575*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 0, 8, 9, 10, 11,  401, 4076 ; t1a, t0a
1576*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
1577*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
1578*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
1579*c0909341SAndroid Build Coastguard Worker    psubd                m8, m2, m6 ; t6
1580*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t2
1581*c0909341SAndroid Build Coastguard Worker    psubd                m6, m0, m4 ; t4
1582*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t0
1583*c0909341SAndroid Build Coastguard Worker    psubd                m4, m5, m1 ; t7
1584*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1     ; t3
1585*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m3 ; t5
1586*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t1
1587*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
1588*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
1589*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
1590*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 8, 3, 9, 10, 11, 3784, 10   ; t6a, t7a
1591*c0909341SAndroid Build Coastguard Worker    psubd                m9, m6, m8 ;  t7
1592*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8     ;  out6
1593*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pd_2896)]
1594*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m5 ;  t3
1595*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5     ; -out7
1596*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m2 ;  t2
1597*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ;  out0
1598*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m4 ;  t6
1599*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4     ; -out1
1600*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m5, m3, m2, m9
1601*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m5, m3, m2, m9
1602*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m5, m3, m2, m9
1603*c0909341SAndroid Build Coastguard Worker    psubd               m4, m5, m3 ; (t2 - t3) * 2896
1604*c0909341SAndroid Build Coastguard Worker    paddd               m3, m5     ; (t2 + t3) * 2896
1605*c0909341SAndroid Build Coastguard Worker    psubd               m5, m2, m9 ; (t6 - t7) * 2896
1606*c0909341SAndroid Build Coastguard Worker    paddd               m2, m9     ; (t6 + t7) * 2896
1607*c0909341SAndroid Build Coastguard Worker    ret
1608*c0909341SAndroid Build Coastguard Worker.round:
1609*c0909341SAndroid Build Coastguard Worker
1610*c0909341SAndroid Build Coastguard Worker    ; m0=out0,m1=-out1,m6=out6,m7=-out7
1611*c0909341SAndroid Build Coastguard Worker
1612*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m8
1613*c0909341SAndroid Build Coastguard Worker    REPX     {pxor  x, m8 }, m1, m7, m3, m5
1614*c0909341SAndroid Build Coastguard Worker    REPX     {psubd x, m8 }, m1, m7
1615*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m2, m3, m4, m5
1616*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m2, m3, m4, m5
1617*c0909341SAndroid Build Coastguard Worker%else
1618*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m2
1619*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m3
1620*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m4
1621*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m5
1622*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
1623*c0909341SAndroid Build Coastguard Worker
1624*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 0, 2, 3, 4, 5,  401, 4076 ; t1a, t0a
1625*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
1626*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+0*16]
1627*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+1*16]
1628*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+2*16]
1629*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
1630*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
1631*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m6
1632*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+3*16]
1633*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
1634*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
1635*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
1636*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
1637*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+2*16]
1638*c0909341SAndroid Build Coastguard Worker    psubd                m7, m2, m6 ; t6
1639*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t2
1640*c0909341SAndroid Build Coastguard Worker    psubd                m6, m0, m4 ; t4
1641*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t0
1642*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m7
1643*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+1*16]
1644*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
1645*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m5 ; t7
1646*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1     ; t3
1647*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m3 ; t5
1648*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t1
1649*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
1650*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
1651*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m7
1652*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(clip_18b_max)]
1653*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+0*16]
1654*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
1655*c0909341SAndroid Build Coastguard Worker    pminsd               m7, [r3+1*16]
1656*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
1657*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m2
1658*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m5
1659*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
1660*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
1661*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
1662*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 3, 2, 5, 7, 0, 3784, 7    ; t6a, t7a
1663*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+2*16]
1664*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
1665*c0909341SAndroid Build Coastguard Worker    psubd                m2, m6, m3 ;  t7
1666*c0909341SAndroid Build Coastguard Worker    paddd                m6, m3     ;  out6
1667*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m6
1668*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
1669*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+1*16]
1670*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m5 ;  t3
1671*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5     ; -out7
1672*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m6 ;  t2
1673*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ;  out0
1674*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m4 ;  t6
1675*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4     ; -out1
1676*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(clip_18b_min)]
1677*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m4 }, m5, m3, m6, m2
1678*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(clip_18b_max)]
1679*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m4 }, m5, m3, m6, m2
1680*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_2896)]
1681*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m4 }, m5, m3, m6, m2
1682*c0909341SAndroid Build Coastguard Worker    psubd               m4, m5, m3 ; (t2 - t3) * 2896
1683*c0909341SAndroid Build Coastguard Worker    paddd               m3, m5     ; (t2 + t3) * 2896
1684*c0909341SAndroid Build Coastguard Worker    psubd               m5, m6, m2 ; (t6 - t7) * 2896
1685*c0909341SAndroid Build Coastguard Worker    paddd               m2, m6     ; (t6 + t7) * 2896
1686*c0909341SAndroid Build Coastguard Worker    ret
1687*c0909341SAndroid Build Coastguard Worker.round:
1688*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m0
1689*c0909341SAndroid Build Coastguard Worker
1690*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m0
1691*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pd_2048)]
1692*c0909341SAndroid Build Coastguard Worker    REPX     {pxor  x, m0 }, m1, m7, m3, m5
1693*c0909341SAndroid Build Coastguard Worker    REPX     {psubd x, m0 }, m1, m7
1694*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m6 }, m2, m3, m4, m5
1695*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m2, m3, m4, m5
1696*c0909341SAndroid Build Coastguard Worker
1697*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+3*16]
1698*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+2*16]
1699*c0909341SAndroid Build Coastguard Worker%endif
1700*c0909341SAndroid Build Coastguard Worker    ret
1701*c0909341SAndroid Build Coastguard Worker
1702*c0909341SAndroid Build Coastguard Worker.pass2:
1703*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1704*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
1705*c0909341SAndroid Build Coastguard Worker%endif
1706*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
1707*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).end
1708*c0909341SAndroid Build Coastguard Worker
; Flipped-ADST 8x4 inverse transform (16 bpc).
; The four expansions below pair flipadst as the first type with dct, adst,
; flipadst and identity as the second type. INV_TXFM_8X4_FN is defined outside
; this excerpt -- presumably it emits the public entry points that dispatch
; into iflipadst_8x4_internal_16bpc (TODO confirm).
1709*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, dct
1710*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, adst
1711*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, flipadst
1712*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, identity
1713*c0909341SAndroid Build Coastguard Worker
; Entry: put the address of .main in r5 and continue in the idct 8x4 pass-1
; entry point (presumably that driver calls back through r5 -- confirm).
1714*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1715*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(.main)]
1716*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).pass1_entry
; .main: first-pass kernel. Runs the ADST 8x4 pass-1 core and its rounding
; step, then packs the eight dword registers down to four word registers.
1717*c0909341SAndroid Build Coastguard Worker.main:
1718*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_16bpc).main_pass1
1719*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_16bpc).round
    ; Pack dword pairs to words with signed saturation, taking the registers
    ; in descending order (m7:m6, m5:m4, m3:m2, m1:m0) ...
1720*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m6
1721*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m4
1722*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m2
1723*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0
    ; ... and hand them back in m0/m2/m4/m6, i.e. in reversed order compared
    ; with their source registers.
1724*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
1725*c0909341SAndroid Build Coastguard Worker    mova                 m2, m5
1726*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
1727*c0909341SAndroid Build Coastguard Worker    mova                 m6, m1
1728*c0909341SAndroid Build Coastguard Worker    ret
; .pass2: second pass. Reuses the 8 bpc SSSE3 ADST 8x4 kernel and obtains the
; flip by making the shared store code walk the rows from the bottom up.
1729*c0909341SAndroid Build Coastguard Worker.pass2:
1730*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 only: r5 is reloaded with the address of itx8_start before
    ; calling into the 8 bpc code (presumably its constant base -- confirm).
1731*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
1732*c0909341SAndroid Build Coastguard Worker%endif
1733*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
1734*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1735*c0909341SAndroid Build Coastguard Worker    add                dstq, r3                   ; dst += 3*stride
1736*c0909341SAndroid Build Coastguard Worker    neg             strideq                       ; stride = -stride
1737*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).end
1738*c0909341SAndroid Build Coastguard Worker
; Identity 8x4 inverse transform (16 bpc).
; The four expansions below pair identity as the first type with dct, adst,
; flipadst and identity as the second type. INV_TXFM_8X4_FN is defined outside
; this excerpt -- presumably it emits the public entry points (TODO confirm).
1739*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, dct
1740*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, adst
1741*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, flipadst
1742*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, identity
1743*c0909341SAndroid Build Coastguard Worker
; Entry: put the address of .main in r5 and continue in the idct 8x4 pass-1
; entry point (presumably that driver calls back through r5 -- confirm).
1744*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1745*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(.main)]
1746*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).pass1_entry
; .main: first-pass kernel. Doubles every dword in m0-m7 (x + x), then packs
; register pairs to words with signed saturation; results are in m0/m2/m4/m6.
1747*c0909341SAndroid Build Coastguard Worker.main:
1748*c0909341SAndroid Build Coastguard Worker    REPX       {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
1749*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
1750*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
1751*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
1752*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
1753*c0909341SAndroid Build Coastguard Worker    ret
; .pass2: second pass on the word data in m0-m3. Each register becomes
; x + pmulhrsw(x, pw_1697x8), added with signed saturation.
; NOTE(review): if pw_1697x8 holds words of 1697*8 as its name suggests, this
; is roughly x * 1.414 -- the constant is defined outside this excerpt; confirm.
1754*c0909341SAndroid Build Coastguard Worker.pass2:
1755*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_1697x8)]
    ; m4/m5/m6 receive the scaled copies of m0/m1/m2; the scaled copy of m3 is
    ; computed last, in place in m7, because it overwrites the constant.
1756*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7, m0
1757*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7, m1
1758*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, m2
1759*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3
1760*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
1761*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
1762*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6
1763*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m7
1764*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).end
1765*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X8_FN type1, type2 [, eob_offset = 0]
; Forwards to INV_TXFM_FN for the 8x8 size, passing 15 and 0-3*16 on x86-64 or
; 8 and 0-5*16 on x86-32. INV_TXFM_FN is defined outside this excerpt;
; presumably those two values are the SIMD register count and the stack
; reservation (TODO confirm).
; For the dct_dct pair only, the macro additionally emits a path that works
; from the single value at [cq]: it scales that value, broadcasts it, and adds
; it to 8 rows of 8 16-bit pixels with clamping to [0, pixel_10bpc_max].
1766*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
1767*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1768*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 8x8, 15, 0-3*16
1769*c0909341SAndroid Build Coastguard Worker%else
1770*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 8x8, 8, 0-5*16
1771*c0909341SAndroid Build Coastguard Worker%endif
1772*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
    ; r5d = coefficient at [cq] * 181
1773*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
    ; Overwrite that coefficient with eobd; the existing "; 0" remark says
    ; eobd is expected to be zero whenever this path runs.
1774*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
    ; r3d = number of 4-row iterations performed by .loop (2 -> 8 rows)
1775*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 2
    ; .end / .end2 are labelled so they can be entered with r5d and r3d already
    ; set up (presumably by the DC paths of other block sizes -- confirm).
1776*c0909341SAndroid Build Coastguard Worker.end:
    ; r5d = (r5d + 384) >> 9, arithmetic shift
1777*c0909341SAndroid Build Coastguard Worker    add                 r5d, 384
1778*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 9
1779*c0909341SAndroid Build Coastguard Worker.end2:
    ; r5d = r5d * 2896 + 34816
1780*c0909341SAndroid Build Coastguard Worker    imul                r5d, 2896
1781*c0909341SAndroid Build Coastguard Worker    add                 r5d, 34816
    ; Move r5d into m0 and replicate one of its 16-bit words across the whole
    ; register. NOTE(review): q1111 presumably selects word 1, i.e. bits 16..31
    ; of r5d, which would make the broadcast value r5d >> 16 -- confirm against
    ; the q#### shuffle constants.
1782*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5d
1783*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1111
1784*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
    ; m6 = upper clamp, m5 = 0 (lower clamp), r2 = 3*stride
1785*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pixel_10bpc_max)]
1786*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
1787*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
    ; Each iteration updates four rows: load, add the broadcast value, clamp to
    ; [0, pixel_10bpc_max], store back. mova is used, so each row address is
    ; accessed as an aligned 16-byte vector.
1788*c0909341SAndroid Build Coastguard Worker.loop:
1789*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+strideq*0]
1790*c0909341SAndroid Build Coastguard Worker    mova                 m2, [dstq+strideq*1]
1791*c0909341SAndroid Build Coastguard Worker    mova                 m3, [dstq+strideq*2]
1792*c0909341SAndroid Build Coastguard Worker    mova                 m4, [dstq+r2]
1793*c0909341SAndroid Build Coastguard Worker    REPX      {paddw x, m0}, m1, m2, m3, m4
1794*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
1795*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m6}, m1, m2, m3, m4
1796*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m1
1797*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m2
1798*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m3
1799*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r2       ], m4
    ; Advance four rows and repeat while the decremented counter is > 0.
1800*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1801*c0909341SAndroid Build Coastguard Worker    dec                 r3d
1802*c0909341SAndroid Build Coastguard Worker    jg .loop
1803*c0909341SAndroid Build Coastguard Worker    RET
1804*c0909341SAndroid Build Coastguard Worker%endif
1805*c0909341SAndroid Build Coastguard Worker%endmacro
1806*c0909341SAndroid Build Coastguard Worker
; DCT 8x8 inverse transform (16 bpc): entry point and shared first-pass driver.
; The four expansions below pair dct as the first type with dct, identity
; (eob_offset argument 6), adst and flipadst as the second type.
1807*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, dct
1808*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, identity, 6
1809*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, adst
1810*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, flipadst
1811*c0909341SAndroid Build Coastguard Worker
; Register use visible in this block:
;   t0  = address of the first-pass kernel, invoked once per 4-wide half
;   r5  = byte offset (16, then 0) into each 32-byte coefficient line
;   r3  = on x86-32, scratch pointer into the stack frame (rsp+gprsize)
;   tx2 = address jumped to when the first pass is complete
1812*c0909341SAndroid Build Coastguard Workercglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1813*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: t0 aliases r1, so r1 is parked on the stack first; it is
    ; restored at .end_pass1.
1814*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 1
1815*c0909341SAndroid Build Coastguard Worker    mov [rsp+4*16+1*gprsize], r1
1816*c0909341SAndroid Build Coastguard Worker%else
1817*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 6
1818*c0909341SAndroid Build Coastguard Worker%endif
1819*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.pass1_main)]
1820*c0909341SAndroid Build Coastguard Worker
; .pass1_full: shared first-pass driver. iadst_8x8 and iflipadst_8x8 jump here
; with t0 pointing at their own kernel.
1821*c0909341SAndroid Build Coastguard Worker.pass1_full:
1822*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64 keeps these constants resident in m11-m14 for the kernels.
1823*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
1824*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
1825*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
1826*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
1827*c0909341SAndroid Build Coastguard Worker%endif
    ; Remove any preprocessor definition of "cmp" so the plain instruction is
    ; assembled below.
1828*c0909341SAndroid Build Coastguard Worker%undef cmp
    ; r5d = (eob >= 10) ? 1 : 0, computed differently per architecture.
    ; NOTE(review): setge is a signed test while the sbb form uses the unsigned
    ; borrow from cmp; the two agree for non-negative eob.
1829*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1830*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
1831*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 10
1832*c0909341SAndroid Build Coastguard Worker    setge               r5b
1833*c0909341SAndroid Build Coastguard Worker%else
1834*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 1
1835*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 10
1836*c0909341SAndroid Build Coastguard Worker    sbb                 r5d, 0
1837*c0909341SAndroid Build Coastguard Worker%endif
    ; Scale to a byte offset: 16 when eob >= 10, otherwise 0.
1838*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
1839*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; From here on r3 no longer holds eob on x86-32.
1840*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
1841*c0909341SAndroid Build Coastguard Worker%endif
; Runs the kernel on the 16-byte slice at offset r5 of eight 32-byte lines.
; With r5 == 16 the loop executes twice (offset 16, then 0); with r5 == 0 once.
1842*c0909341SAndroid Build Coastguard Worker.loop_pass1:
1843*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*32+r5]
1844*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*32+r5]
1845*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*32+r5]
1846*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*32+r5]
1847*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+4*32+r5]
1848*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+5*32+r5]
1849*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+6*32+r5]
1850*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+7*32+r5]
1851*c0909341SAndroid Build Coastguard Worker    call                 t0
1852*c0909341SAndroid Build Coastguard Worker
    ; Offset 0 was just processed: its results stay in m0-m3.
1853*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
1854*c0909341SAndroid Build Coastguard Worker    jz .end_pass1
1855*c0909341SAndroid Build Coastguard Worker
    ; Offset 16 was just processed: park its four result registers in the
    ; coefficient buffer so m0-m7 are free for the second iteration.
1856*c0909341SAndroid Build Coastguard Worker    mova       [cq+0*32+16], m0
1857*c0909341SAndroid Build Coastguard Worker    mova       [cq+1*32+16], m1
1858*c0909341SAndroid Build Coastguard Worker    mova       [cq+2*32+16], m2
1859*c0909341SAndroid Build Coastguard Worker    mova       [cq+3*32+16], m3
1860*c0909341SAndroid Build Coastguard Worker
1861*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
1862*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
; m4-m7 are loaded from the slots written above. When the offset-16 iteration
; was skipped (eob < 10) these loads return whatever the coefficient buffer
; holds there -- presumably zeros; confirm with the caller's contract.
1863*c0909341SAndroid Build Coastguard Worker.end_pass1:
1864*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+0*32+16]
1865*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+1*32+16]
1866*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+2*32+16]
1867*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+3*32+16]
1868*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; Restore the r1 value saved at function entry.
1869*c0909341SAndroid Build Coastguard Worker    mov                  r1, [rsp+4*16+1*gprsize]
1870*c0909341SAndroid Build Coastguard Worker%endif
1871*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; .pass1_main: DCT first-pass kernel, called through t0.
1872*c0909341SAndroid Build Coastguard Worker.pass1_main:
1873*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
    ; m1 = all ones (-1); subtracting it adds 1 to m0, m6, m5 and m3
    ; (presumably the rounding bias for the >> 1 below -- confirm).
1874*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m1, m1
1875*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m1}, m0, m6, m5, m3
1876*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
1877*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
; .pack_and_transpose: also entered from iadst_8x8's first-pass kernel.
; Packs dword register pairs to words with signed saturation (results in
; m0/m2/m4/m6) and tail-jumps to the packed 4x8 transpose of idct_8x4.
1878*c0909341SAndroid Build Coastguard Worker.pack_and_transpose:
1879*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
1880*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
1881*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
1882*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
1883*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
1884*c0909341SAndroid Build Coastguard Worker
; .pass2: second pass of the 8x8 DCT. Runs the 8 bpc SSSE3 8x8 DCT kernel,
; rounds and writes the block to dst, then clears the coefficient buffer.
1885*c0909341SAndroid Build Coastguard Worker.pass2:
1886*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 only: r5 is reloaded with the address of itx8_start before
    ; calling into the 8 bpc code (presumably its constant base -- confirm).
1887*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
1888*c0909341SAndroid Build Coastguard Worker%endif
1889*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    ; r3 = 3*stride, as required by .write_8x8
1890*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1891*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: clamp bounds for .write_8x8 (m10 = pixel max, m9 = 0)
1892*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
1893*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
1894*c0909341SAndroid Build Coastguard Worker%endif
1895*c0909341SAndroid Build Coastguard Worker    call .round3_and_write_8x8
; .zero: also jumped to from iadst_8x8's second pass. Stores sixteen 16-byte
; zero vectors, clearing 256 bytes starting at cq.
1896*c0909341SAndroid Build Coastguard Worker.zero:
1897*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; m9 is still zero from the setup above
1898*c0909341SAndroid Build Coastguard Worker%define mzero m9
1899*c0909341SAndroid Build Coastguard Worker%else
    ; no spare zero register on x86-32, so m7 is cleared here
1900*c0909341SAndroid Build Coastguard Worker%define mzero m7
1901*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1902*c0909341SAndroid Build Coastguard Worker%endif
1903*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1904*c0909341SAndroid Build Coastguard Worker%undef mzero
1905*c0909341SAndroid Build Coastguard Worker    RET
1906*c0909341SAndroid Build Coastguard Worker
1907*c0909341SAndroid Build Coastguard Worker    ; round (rounded right-shift by 5) before writing
    ; NOTE(review): pmulhrsw computes (a*b + 0x4000) >> 15 per word, so with a
    ; multiplier of 2048 the result is (a + 8) >> 4, a rounded shift by 4 rather
    ; than 5. pw_2048 is defined outside this excerpt; confirm and fix the line
    ; above if it does hold words of 2048.
1908*c0909341SAndroid Build Coastguard Worker    ; data in m0-7
1909*c0909341SAndroid Build Coastguard Worker    ; on x86-64, pw_2048 is in m8
1910*c0909341SAndroid Build Coastguard Worker    ; .round1 is for m0-7
1911*c0909341SAndroid Build Coastguard Worker    ; .round2 is for m0-6 & [rsp+gprsize*2]
1912*c0909341SAndroid Build Coastguard Worker    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
1913*c0909341SAndroid Build Coastguard Worker    ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7
    ; All entry points end up in .write_8x8: by fall-through on x86-32 and for
    ; the x86-64 .round1/.round2 path, by an explicit jmp for x86-64 .round3.
1914*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1915*c0909341SAndroid Build Coastguard Worker.round1_and_write_8x8:
    ; x86-32 .round1: spill m7 so that the remaining code matches .round2
1916*c0909341SAndroid Build Coastguard Worker    mova    [rsp+gprsize*2], m7
1917*c0909341SAndroid Build Coastguard Worker.round2_and_write_8x8:
1918*c0909341SAndroid Build Coastguard Worker%endif
1919*c0909341SAndroid Build Coastguard Worker.round3_and_write_8x8:
    ; m7 is free here (its data lives on the stack), so it carries the constant
1920*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_2048)]
1921*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1922*c0909341SAndroid Build Coastguard Worker.round4_and_write_8x8:
1923*c0909341SAndroid Build Coastguard Worker%endif
1924*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    ; eighth vector: multiply the constant by the spilled data, result in m7
1925*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, [rsp+gprsize*2]
1926*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1927*c0909341SAndroid Build Coastguard Worker    jmp .write_8x8
1928*c0909341SAndroid Build Coastguard Worker.round2_and_write_8x8:
    ; x86-64 .round2: fetch the eighth vector from the stack, then as .round1
1929*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize*2]
1930*c0909341SAndroid Build Coastguard Worker.round1_and_write_8x8:
1931*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
1932*c0909341SAndroid Build Coastguard Worker%endif
1933*c0909341SAndroid Build Coastguard Worker
1934*c0909341SAndroid Build Coastguard Worker    ; m0-7 have to-be-written data [pre-rounded]
1935*c0909341SAndroid Build Coastguard Worker    ; on x86-64, m9-10 contain a zero/pixel_max
1936*c0909341SAndroid Build Coastguard Worker    ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
1937*c0909341SAndroid Build Coastguard Worker    ; r0,1,3 contain dstq/strideq/stride3q
1938*c0909341SAndroid Build Coastguard Worker    ; r5 is a scratch register
    ; Adds m0-m7 to eight destination rows, clamps every word to
    ; [0, pixel_10bpc_max] and stores the rows back.
1939*c0909341SAndroid Build Coastguard Worker.write_8x8:
    ; r5 = address of row 4, so rows 4-7 reuse the same stride multiples
1940*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*4]
1941*c0909341SAndroid Build Coastguard Worker    paddw                m0, [dstq+strideq*0]
1942*c0909341SAndroid Build Coastguard Worker    paddw                m1, [dstq+strideq*1]
1943*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*2]
1944*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+r3]
1945*c0909341SAndroid Build Coastguard Worker    paddw                m4, [r5  +strideq*0]
1946*c0909341SAndroid Build Coastguard Worker    paddw                m5, [r5  +strideq*1]
1947*c0909341SAndroid Build Coastguard Worker    paddw                m6, [r5  +strideq*2]
1948*c0909341SAndroid Build Coastguard Worker    paddw                m7, [r5  +r3]
1949*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1950*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
1951*c0909341SAndroid Build Coastguard Worker    REPX    {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
1952*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: m7 alternates between holding a clamp constant and the eighth
    ; row, which is spilled to the stack slot while the constant is in use.
1953*c0909341SAndroid Build Coastguard Worker    mova    [rsp+gprsize*2], m7
1954*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1955*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    ; m7 = max(0, row 7)
1956*c0909341SAndroid Build Coastguard Worker    pmaxsw               m7, [rsp+gprsize*2]
1957*c0909341SAndroid Build Coastguard Worker    mova    [rsp+gprsize*2], m7
1958*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
1959*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    ; m7 = min(pixel max, row 7)
1960*c0909341SAndroid Build Coastguard Worker    pminsw               m7, [rsp+gprsize*2]
1961*c0909341SAndroid Build Coastguard Worker%endif
1962*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
1963*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
1964*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
1965*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r3       ], m3
1966*c0909341SAndroid Build Coastguard Worker    mova   [r5  +strideq*0], m4
1967*c0909341SAndroid Build Coastguard Worker    mova   [r5  +strideq*1], m5
1968*c0909341SAndroid Build Coastguard Worker    mova   [r5  +strideq*2], m6
1969*c0909341SAndroid Build Coastguard Worker    mova   [r5  +r3       ], m7
1970*c0909341SAndroid Build Coastguard Worker    ret
1971*c0909341SAndroid Build Coastguard Worker
; ADST 8x8 inverse transform (16 bpc): entry point, first-pass kernel and its
; rounding helper. The four expansions below pair adst as the first type with
; dct, adst, flipadst and identity (eob_offset argument 6) as the second type.
1972*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, dct
1973*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, adst
1974*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, flipadst
1975*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, identity, 6
1976*c0909341SAndroid Build Coastguard Worker
1977*c0909341SAndroid Build Coastguard Workercglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1978*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: save r1 in the same stack slot that idct_8x8's .end_pass1
    ; restores it from.
1979*c0909341SAndroid Build Coastguard Worker    mov [rsp+4*16+1*gprsize], r1
1980*c0909341SAndroid Build Coastguard Worker%endif
    ; t0 = this function's first-pass kernel; the loop itself is shared with
    ; idct_8x8.
1981*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.pass1_main)]
1982*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_16bpc).pass1_full
; .pass1_main: ADST first-pass kernel, called through t0.
1983*c0909341SAndroid Build Coastguard Worker.pass1_main:
1984*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_16bpc).main_pass1
1985*c0909341SAndroid Build Coastguard Worker    call .round
1986*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_16bpc).pack_and_transpose
; .round: post-processing of main_pass1's outputs (also called by
; iflipadst_8x8). Per 32-bit lane, both architecture paths compute:
;   m0, m6 : (x + 1) >> 1
;   m1, m7 : (~x >> 1) + 1, which equals (1 - x) >> 1
;   m2, m4 : (x + pd_6144) >> 13
;   m3, m5 : (~x + pd_6144) >> 13
; All right shifts of data are arithmetic (psrad).
1987*c0909341SAndroid Build Coastguard Worker.round:
1988*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1989*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m8         ; -1
    ; subtracting -1 adds 1; xor with -1 is a bitwise NOT
1990*c0909341SAndroid Build Coastguard Worker    REPX     {psubd x, m8 }, m0, m6
1991*c0909341SAndroid Build Coastguard Worker    REPX     {pxor  x, m8 }, m1, m7, m3, m5
1992*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 1  }, m0, m1, m6, m7
1993*c0909341SAndroid Build Coastguard Worker    REPX     {psubd x, m8 }, m1, m7
    ; m8 is reused for the bias constant once -1 is no longer needed
1994*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pd_6144)]
1995*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m8 }, m2, m3, m4, m5
1996*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 13 }, m2, m3, m4, m5
1997*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32 has only m0-m7: m0 is spilled to [r3+2*16] so it can hold -1, and
    ; m6 is overwritten with the bias constant. Its data is re-read from
    ; [r3+3*16] further down (presumably stored there by main_pass1 on this
    ; architecture -- confirm).
1998*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m0
1999*c0909341SAndroid Build Coastguard Worker
2000*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m0         ; -1
2001*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pd_6144)]
2002*c0909341SAndroid Build Coastguard Worker    REPX     {pxor  x, m0 }, m1, m7, m3, m5
2003*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 1  }, m1, m7
2004*c0909341SAndroid Build Coastguard Worker    REPX     {psubd x, m0 }, m1, m7
2005*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m6 }, m2, m3, m4, m5
2006*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 13 }, m2, m3, m4, m5
2007*c0909341SAndroid Build Coastguard Worker
    ; Restore m0. Shifting the bias constant right by 12 leaves the value the
    ; existing comment labels +1, which is then added to m0 and to the value
    ; reloaded from [r3+3*16] before both are halved.
2008*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+2*16]
2009*c0909341SAndroid Build Coastguard Worker    psrld                m6, 12         ; +1
2010*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6
2011*c0909341SAndroid Build Coastguard Worker    paddd                m6, [r3+3*16]
2012*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 1  }, m0, m6
2013*c0909341SAndroid Build Coastguard Worker%endif
2014*c0909341SAndroid Build Coastguard Worker    ret
2015*c0909341SAndroid Build Coastguard Worker
; .pass2: second pass of the 8x8 ADST (also the tail of iflipadst_8x8's second
; pass). Runs the 8 bpc SSSE3 8x8 ADST kernel and its pass-2 epilogue, rounds
; with alternating sign, writes the block, then clears the coefficients via
; idct_8x8's .zero.
2016*c0909341SAndroid Build Coastguard Worker.pass2:
2017*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 only: r5 is reloaded with the address of itx8_start before
    ; calling into the 8 bpc code (presumably its constant base -- confirm).
2018*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
2019*c0909341SAndroid Build Coastguard Worker%endif
2020*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
2021*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
    ; r3 = 3*stride, as required by idct_8x8's .write_8x8
2022*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2023*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: clamp bounds for .write_8x8 (m10 = pixel max, m9 = 0)
2024*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
2025*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
2026*c0909341SAndroid Build Coastguard Worker%endif
    ; This resolves to the .round3_and_write_8x8 defined below in this
    ; function, not to the idct_8x8 label of the same name.
2027*c0909341SAndroid Build Coastguard Worker    call .round3_and_write_8x8
2028*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_16bpc).zero
2029*c0909341SAndroid Build Coastguard Worker
2030*c0909341SAndroid Build Coastguard Worker    ; round (rounded right-shift by 5) before writing; odd registers are negated
    ; NOTE(review): pmulhrsw computes (a*b + 0x4000) >> 15 per word, so with
    ; multipliers of +/-2048 the magnitude is a rounded shift by 4 rather than
    ; 5. pw_2048 / pw_m2048 are defined outside this excerpt; confirm and fix
    ; the line above if they do hold +/-2048.
2031*c0909341SAndroid Build Coastguard Worker    ; data in m0-7
2032*c0909341SAndroid Build Coastguard Worker    ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
2033*c0909341SAndroid Build Coastguard Worker    ; .round1 is for m0-7
2034*c0909341SAndroid Build Coastguard Worker    ; .round2 is for m0-6 & [rsp+gprsize*2]
2035*c0909341SAndroid Build Coastguard Worker    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
    ; Every path finishes by jumping to idct_8x8's .write_8x8.
2036*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2037*c0909341SAndroid Build Coastguard Worker.round2_and_write_8x8:
    ; x86-64 .round2: fetch the eighth vector from the stack, then as .round1
2038*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize*2]
2039*c0909341SAndroid Build Coastguard Worker.round1_and_write_8x8:
    ; even registers use m8, odd registers use m11 (see the list above)
2040*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m8 }, m0, m2, m4, m6
2041*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m1, m3, m5, m7
2042*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_16bpc).write_8x8
2043*c0909341SAndroid Build Coastguard Worker%else
2044*c0909341SAndroid Build Coastguard Worker.round1_and_write_8x8:
    ; x86-32 .round1: spill m7 so that the remaining code matches .round2
2045*c0909341SAndroid Build Coastguard Worker    mova    [rsp+gprsize*2], m7
2046*c0909341SAndroid Build Coastguard Worker.round2_and_write_8x8:
2047*c0909341SAndroid Build Coastguard Worker%endif
; .round3: the constants are loaded from memory into m7, whose own data is on
; the stack, so neither m8 nor m11 is needed.
2048*c0909341SAndroid Build Coastguard Worker.round3_and_write_8x8:
2049*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_2048)]
2050*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m0, m2, m4, m6
2051*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_m2048)]
2052*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m1, m3, m5
    ; eighth (odd) vector: multiply the constant by the spilled data
2053*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, [rsp+gprsize*2]
2054*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_16bpc).write_8x8
2055*c0909341SAndroid Build Coastguard Worker
; Flipped-ADST 8x8 inverse transform (16 bpc).
; The four expansions below pair flipadst as the first type with dct, adst,
; flipadst and identity (eob_offset argument 6) as the second type.
2056*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, dct
2057*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, adst
2058*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, flipadst
2059*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, identity, 6
2060*c0909341SAndroid Build Coastguard Worker
2061*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2062*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: save r1 in the same stack slot that idct_8x8's .end_pass1
    ; restores it from.
2063*c0909341SAndroid Build Coastguard Worker    mov [rsp+4*16+1*gprsize], r1
2064*c0909341SAndroid Build Coastguard Worker%endif
    ; t0 = this function's first-pass kernel; the loop itself is shared with
    ; idct_8x8.
2065*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.pass1_main)]
2066*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_16bpc).pass1_full
; .pass1_main: first-pass kernel, called through t0. Same core and rounding as
; iadst_8x8; only the packing order differs.
2067*c0909341SAndroid Build Coastguard Worker.pass1_main:
2068*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_16bpc).main_pass1
2069*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_16bpc).round
2070*c0909341SAndroid Build Coastguard Worker    ; invert registers
    ; Pack dword pairs to words with signed saturation in descending register
    ; order (m7:m6 ... m1:m0), then move the results into m0/m2/m4/m6.
2071*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m6
2072*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m4
2073*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m2
2074*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0
2075*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
2076*c0909341SAndroid Build Coastguard Worker    mova                 m2, m5
2077*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
2078*c0909341SAndroid Build Coastguard Worker    mova                 m6, m1
2079*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
2080*c0909341SAndroid Build Coastguard Worker
; .pass2: point dst at row 7 (dst + 8*stride - stride) and negate the stride,
; then reuse iadst_8x8's second pass, which therefore addresses the rows from
; the bottom up.
2081*c0909341SAndroid Build Coastguard Worker.pass2:
2082*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
2083*c0909341SAndroid Build Coastguard Worker    sub                dstq, strideq
2084*c0909341SAndroid Build Coastguard Worker    neg             strideq
2085*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_16bpc).pass2
2086*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first argument (type1) is identity,
; one per second transform type (macro defined outside this chunk).
2087*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, dct
2088*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, adst
2089*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, flipadst
2090*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, identity
2091*c0909341SAndroid Build Coastguard Worker
; iidentity_8x8_internal_16bpc: 8x8 identity transform.
;   pass 1 - loads eight 32-byte groups from cq and saturate-packs each group's
;            eight dwords into one register of eight words, then hands off to the
;            8bpc SSSE3 idct_8x8 .pass1_end3 (defined outside this chunk).
;   .pass2 - sets up the constants/registers expected by the idct_8x8 round+write
;            helpers (outside this chunk), writes the block, then jumps to
;            idct_8x8's .zero.
2092*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; low 16 bytes (four dwords) of each 32-byte group
2093*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*32]
2094*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*32]
2095*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*32]
2096*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*32]
2097*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+4*32]
2098*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+5*32]
2099*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+6*32]
2100*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+7*32]
    ; pack with the high 16 bytes: each register now holds eight signed-saturated words
2101*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+0*32+16]
2102*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+1*32+16]
2103*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+2*32+16]
2104*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+3*32+16]
2105*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [cq+4*32+16]
2106*c0909341SAndroid Build Coastguard Worker    packssdw             m5, [cq+5*32+16]
2107*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [cq+6*32+16]
2108*c0909341SAndroid Build Coastguard Worker    packssdw             m7, [cq+7*32+16]
    ; spill m6 to the stack slot that .pass1_end3 presumably expects - confirm in the 8bpc file
2109*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*1], m6
2110*c0909341SAndroid Build Coastguard Worker    jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
2111*c0909341SAndroid Build Coastguard Worker
2112*c0909341SAndroid Build Coastguard Worker.pass2:
2113*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: reload r5 with the address of itx8_start (presumably the base used by o() - confirm)
2114*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
2115*c0909341SAndroid Build Coastguard Worker%endif
2116*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2117*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: m10 = pixel maximum, m9 = zero, m8 = pw_4096 multiplier;
    ; all three are presumably consumed by round1_and_write_8x8 - confirm
2118*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
2119*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
2120*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_4096)]
2121*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
2122*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: only m0-m7 exist, so m7 is spilled and reused for the pw_4096 multiplier
2123*c0909341SAndroid Build Coastguard Worker    mova      [rsp+gprsize], m7
2124*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_4096)]
2125*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
2126*c0909341SAndroid Build Coastguard Worker%endif
2127*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_16bpc).zero
2128*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X16_FN type1, type2 [, eob_tbl_suffix = 2d]
; Forwards to INV_TXFM_FN (defined outside this chunk) for the 8x16 size, passing
; tbl_8x16_<suffix> and an architecture-specific register count / stack size.
; For the dct_dct pairing it additionally emits a DC-only path that pre-scales the
; DC coefficient and jumps into the 8x8 dct_dct ".end" code.
; NOTE(review): the condition under which the DC-only path is reached is decided
; inside INV_TXFM_FN - confirm there.
2129*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
2130*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2131*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
2132*c0909341SAndroid Build Coastguard Worker%else
2133*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
2134*c0909341SAndroid Build Coastguard Worker%endif
2135*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
    ; r5d = ((dc * 181 + 128) >> 8) * 181; 181/256 is approximately 1/sqrt(2)
2136*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
2137*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
2138*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
2139*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
2140*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
    ; r3d = 4 is an argument to the shared .end code (presumably a loop count - confirm)
2141*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 4
2142*c0909341SAndroid Build Coastguard Worker%if stack_size_padded > 0
2143*c0909341SAndroid Build Coastguard Worker    ; adjust to caller's stack allocation
2144*c0909341SAndroid Build Coastguard Worker    add                 rsp, (12+ARCH_X86_64)*16
2145*c0909341SAndroid Build Coastguard Worker%endif
2146*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
2147*c0909341SAndroid Build Coastguard Worker%endif
2148*c0909341SAndroid Build Coastguard Worker%endmacro
2149*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points whose first argument (type1) is dct.
; The identity pairing selects the "v" eob table suffix instead of the default "2d".
2150*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, dct
2151*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, identity, v
2152*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, adst
2153*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, flipadst
2154*c0909341SAndroid Build Coastguard Worker
; x86-64 only: (re)declare the temporary register used as t0 below.
; NOTE(review): presumably maps t0 to r7, which the 8x16 functions PUSH/POP on WIN64 - confirm in x86inc.
2155*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2156*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
2157*c0909341SAndroid Build Coastguard Worker%endif
2158*c0909341SAndroid Build Coastguard Worker
; idct_8x16_internal_16bpc: 8x16 DCT, and host of the pass-1 driver that the other
; 8x16 transforms in this file share.
;   entry       - saves registers (r7 on WIN64; r1 and r6 on x86-32) and points t0
;                 at the 8x8 idct pass-1 kernel.
;   .pass1_full - shared pass-1 driver: the adst/flipadst/identity 8x16 functions
;                 jump here with their own kernel address in t0. It picks a
;                 starting offset from eob, then for each 16-byte slice loads eight
;                 vectors, applies rect2_mul, calls the kernel through t0 and stores
;                 m0-m3 back into cq. Finally it jumps to tx2q (the pass-2 entry).
;   .pass2      - runs the 8bpc SSSE3 idct kernels on the pass-1 output, writes
;                 the two 8x8 halves of the result and zeroes the coefficient buffer.
2159*c0909341SAndroid Build Coastguard Workercglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2160*c0909341SAndroid Build Coastguard Worker%if WIN64
2161*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
2162*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
    ; x86-32: stash r1 and r6 above the 16*16-byte scratch area; both are reloaded below
2163*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
2164*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
2165*c0909341SAndroid Build Coastguard Worker%endif
2166*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
2167*c0909341SAndroid Build Coastguard Worker.pass1_full:
2168*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: constants kept in m11-m14, presumably for the pass-1 kernels
    ; (their consumers are outside this chunk)
2169*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
2170*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
2171*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
2172*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
2173*c0909341SAndroid Build Coastguard Worker%endif
    ; NOTE(review): presumably removes a preprocessor-level wrapper around cmp so
    ; the byte compare below is emitted as written - confirm in x86inc.
2174*c0909341SAndroid Build Coastguard Worker%undef cmp
    ; Walk r6 down from 3 until eobb >= byte [r5+r6] (signed compare).
    ; r5 is presumably the eob threshold table set up by INV_TXFM_FN - confirm.
2175*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 4
2176*c0909341SAndroid Build Coastguard Worker.zero_loop:
2177*c0909341SAndroid Build Coastguard Worker    dec                 r6d
2178*c0909341SAndroid Build Coastguard Worker    cmp                eobb, byte [r5+r6]
2179*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
    ; r5d = r6 * 16: byte offset of the first (highest) 16-byte slice to process
2180*c0909341SAndroid Build Coastguard Worker    mov                 r5d, r6d
2181*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
2182*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2183*c0909341SAndroid Build Coastguard Worker    ; restore pic-ptr
2184*c0909341SAndroid Build Coastguard Worker    mov                  r6, [rsp+16*16+2*gprsize]
2185*c0909341SAndroid Build Coastguard Worker    ; setup stack pointer
2186*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
2187*c0909341SAndroid Build Coastguard Worker%endif
2188*c0909341SAndroid Build Coastguard Worker.loop_pass1:
    ; load one 16-byte slice from each of the eight 64-byte groups of cq
2189*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*64+r5]
2190*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*64+r5]
2191*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*64+r5]
2192*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*64+r5]
2193*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+4*64+r5]
2194*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+5*64+r5]
2195*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+6*64+r5]
2196*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+7*64+r5]
2197*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
    ; transform-specific pass-1 kernel selected by the caller
2198*c0909341SAndroid Build Coastguard Worker    call                 t0
2199*c0909341SAndroid Build Coastguard Worker
    ; the kernel's output is stored back in place from m0-m3
2200*c0909341SAndroid Build Coastguard Worker    mova       [cq+0*64+r5], m0
2201*c0909341SAndroid Build Coastguard Worker    mova       [cq+1*64+r5], m1
2202*c0909341SAndroid Build Coastguard Worker    mova       [cq+2*64+r5], m2
2203*c0909341SAndroid Build Coastguard Worker    mova       [cq+3*64+r5], m3
    ; step down one slice; the loop ends after offset 0 has been processed
2204*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
2205*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
2206*c0909341SAndroid Build Coastguard Worker%if WIN64
2207*c0909341SAndroid Build Coastguard Worker    POP                  r7
2208*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
    ; reload r1 from the slot written at function entry
2209*c0909341SAndroid Build Coastguard Worker    mov                  r1, [rsp+16*16+1*gprsize]
2210*c0909341SAndroid Build Coastguard Worker%endif
2211*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
2212*c0909341SAndroid Build Coastguard Worker
2213*c0909341SAndroid Build Coastguard Worker.pass2:
2214*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2215*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
2216*c0909341SAndroid Build Coastguard Worker%endif
2217*c0909341SAndroid Build Coastguard Worker
2218*c0909341SAndroid Build Coastguard Worker    ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
2219*c0909341SAndroid Build Coastguard Worker    ; some are still pre-loaded from the final loop iteration in pass=1
2220*c0909341SAndroid Build Coastguard Worker
    ; first kernel call: inputs cq[0], cq[8] (already in m0/m2), 1, 9, 2, 10, 3, 11
2221*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2222*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 1*16]
2223*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 9*16]
2224*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 2*16]
2225*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+10*16]
2226*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+ 3*16]
2227*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+11*16]
2228*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    ; park the intermediate results in stack slots 3-9 for the second kernel
2229*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+3*16], m0
2230*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+4*16], m1
2231*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+5*16], m2
2232*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+6*16], m3
2233*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+7*16], m4
2234*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+8*16], m5
2235*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+9*16], m6
2236*c0909341SAndroid Build Coastguard Worker    ; m7 is already stored in [rsp+gprsize+0*16]
    ; second kernel call: remaining inputs cq[4], 12, 5, 13, 6, 14, 7, 15
2237*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 4*16]
2238*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+12*16]
2239*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 5*16]
2240*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+13*16]
2241*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 6*16]
2242*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+14*16]
2243*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+ 7*16]
2244*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*16]
2245*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
2246*c0909341SAndroid Build Coastguard Worker
2247*c0909341SAndroid Build Coastguard Worker    ; out0-7 is in rsp+gprsize+3-10*mmsize
2248*c0909341SAndroid Build Coastguard Worker    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
2249*c0909341SAndroid Build Coastguard Worker
2250*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: constants for the round/write helper, and dst is preserved in r6
2251*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
2252*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
2253*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
2254*c0909341SAndroid Build Coastguard Worker    mov                  r6, dstq
2255*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: dst is preserved in a stack slot instead
2256*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], dstq
2257*c0909341SAndroid Build Coastguard Worker%endif
    ; the lower 8 rows (out8-15, currently in registers) are written first
2258*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2259*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
2260*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
2261*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2262*c0909341SAndroid Build Coastguard Worker%define mzero m9
2263*c0909341SAndroid Build Coastguard Worker%else
2264*c0909341SAndroid Build Coastguard Worker%define mzero m7
2265*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2266*c0909341SAndroid Build Coastguard Worker%endif
    ; clear all 32*16 = 512 bytes of the coefficient buffer
2267*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
2268*c0909341SAndroid Build Coastguard Worker                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
2269*c0909341SAndroid Build Coastguard Worker%undef mzero
    ; reload out0-7 from the stack and write the upper 8 rows at the original dst
2270*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+ 3*16]
2271*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+ 4*16]
2272*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+ 5*16]
2273*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+ 6*16]
2274*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+ 7*16]
2275*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+ 8*16]
2276*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+ 9*16]
2277*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+10*16]
2278*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2279*c0909341SAndroid Build Coastguard Worker    mov                dstq, r6
2280*c0909341SAndroid Build Coastguard Worker%else
2281*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+16*16+gprsize*1]
2282*c0909341SAndroid Build Coastguard Worker%endif
2283*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
2284*c0909341SAndroid Build Coastguard Worker    RET
2285*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points whose first argument (type1) is adst.
; The identity pairing selects the "v" eob table suffix instead of the default "2d".
2286*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, dct
2287*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, adst
2288*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, flipadst
2289*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, identity, v
2290*c0909341SAndroid Build Coastguard Worker
; iadst_8x16_internal_16bpc: 8x16 ADST.
;   entry  - same register saves as idct_8x16, then points t0 at the 8x8 iadst
;            pass-1 kernel and reuses idct_8x16's shared .pass1_full driver.
;   .pass2 - arranges the 16 input vectors across m0-m7 and stack slots 3-10,
;            runs the 8bpc SSSE3 iadst_16x8 kernel, then writes the two 8x8 halves
;            and zeroes the coefficient buffer. Also the target of
;            iflipadst_8x16's pass 2, which arrives with dst/stride adjusted.
2291*c0909341SAndroid Build Coastguard Workercglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2292*c0909341SAndroid Build Coastguard Worker%if WIN64
2293*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
2294*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
2295*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
2296*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
2297*c0909341SAndroid Build Coastguard Worker%endif
2298*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
2299*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_16bpc).pass1_full
2300*c0909341SAndroid Build Coastguard Worker
2301*c0909341SAndroid Build Coastguard Worker.pass2:
2302*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2303*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
2304*c0909341SAndroid Build Coastguard Worker%endif
    ; m0-m3 arrive pre-loaded from the last pass-1 iteration. The moves below
    ; place the inputs where the 8bpc iadst_16x8 .main presumably expects them;
    ; that register/stack layout is defined outside this chunk - confirm there.
2305*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 9*16]
2306*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+13*16]
2307*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+7*16], m0
2308*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+8*16], m1
2309*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+5*16], m4
2310*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+6*16], m5
2311*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
2312*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
2313*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 1*16]
2314*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 5*16]
2315*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 2*16]
2316*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 6*16]
2317*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+11*16]
2318*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*16]
2319*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 3*16], m4
2320*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 4*16], m5
2321*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 9*16], m6
2322*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+10*16], m7
2323*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+10*16]
2324*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+14*16]
2325*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+ 3*16]
2326*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 7*16]
2327*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
2328*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
2329*c0909341SAndroid Build Coastguard Worker
2330*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: m11/m8 hold the pw_m2048/pw_2048 multipliers for the iadst
    ; round/write helpers, m10 the pixel maximum, m9 zero; dst is preserved in r6
2331*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pw_m2048)]
2332*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
2333*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
2334*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
2335*c0909341SAndroid Build Coastguard Worker    mov                  r6, dstq
2336*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: dst is preserved in a stack slot instead
2337*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], dstq
2338*c0909341SAndroid Build Coastguard Worker%endif
    ; write the 8 rows starting at dst + 8*stride first
2339*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2340*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
2341*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
2342*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2343*c0909341SAndroid Build Coastguard Worker%define mzero m9
2344*c0909341SAndroid Build Coastguard Worker%else
2345*c0909341SAndroid Build Coastguard Worker%define mzero m7
2346*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2347*c0909341SAndroid Build Coastguard Worker%endif
    ; clear all 32*16 = 512 bytes of the coefficient buffer
2348*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
2349*c0909341SAndroid Build Coastguard Worker                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
2350*c0909341SAndroid Build Coastguard Worker%undef mzero
    ; reload the other eight vectors from stack slots 3-10 and write them at the saved dst
2351*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+ 3*16]
2352*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+ 4*16]
2353*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+ 5*16]
2354*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+ 6*16]
2355*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+ 7*16]
2356*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+ 8*16]
2357*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+ 9*16]
2358*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+10*16]
2359*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2360*c0909341SAndroid Build Coastguard Worker    mov                dstq, r6
2361*c0909341SAndroid Build Coastguard Worker%else
2362*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+16*16+gprsize*1]
2363*c0909341SAndroid Build Coastguard Worker%endif
2364*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
2365*c0909341SAndroid Build Coastguard Worker    RET
2366*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points whose first argument (type1) is flipadst.
; The identity pairing selects the "v" eob table suffix instead of the default "2d".
2367*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, dct
2368*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, adst
2369*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, flipadst
2370*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, identity, v
2371*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x16_internal_16bpc: 8x16 flipadst.
;   entry  - same register saves as idct_8x16, then points t0 at the 8x8 flipadst
;            pass-1 kernel and reuses idct_8x16's shared .pass1_full driver.
;   .pass2 - moves dst to row 15, negates the stride and reuses the iadst 8x16
;            pass 2, so the rows are written bottom-up.
2372*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2373*c0909341SAndroid Build Coastguard Worker%if WIN64
2374*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
2375*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
2376*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
2377*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
2378*c0909341SAndroid Build Coastguard Worker%endif
2379*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
2380*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_16bpc).pass1_full
2381*c0909341SAndroid Build Coastguard Worker
2382*c0909341SAndroid Build Coastguard Worker.pass2:
    ; r3 = stride*3, then r3*5 = stride*15, so dstq ends up at row 15
    ; (the jump target recomputes r3 itself)
2383*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2384*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3*5]
2385*c0909341SAndroid Build Coastguard Worker    add                dstq, r3
2386*c0909341SAndroid Build Coastguard Worker    neg             strideq
2387*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x16_internal_16bpc).pass2
2388*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points whose first argument (type1) is identity.
; The non-identity pairings select the "h" eob table suffix; identity/identity
; keeps the default "2d".
2389*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, dct, h
2390*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, adst, h
2391*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, flipadst, h
2392*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, identity
2393*c0909341SAndroid Build Coastguard Worker
; iidentity_8x16_internal_16bpc: 8x16 identity transform.
;   entry  - same register saves as idct_8x16, then points t0 at idct_8x8's
;            pack_and_transpose helper and reuses idct_8x16's .pass1_full driver.
;   .pass2 - four iterations; each scales four vectors with .main, writes an 8x4
;            strip via the idct_8x4 helpers (outside this chunk), zeroes eight
;            16-byte coefficient slots and advances cq by 16 bytes and dst by 4 rows.
;   .main  - in-place identity scaling of m0-m3 (see the formula comment there).
2394*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2395*c0909341SAndroid Build Coastguard Worker%if WIN64
2396*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
2397*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
2398*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
2399*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
2400*c0909341SAndroid Build Coastguard Worker%endif
2401*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
2402*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_16bpc).pass1_full
2403*c0909341SAndroid Build Coastguard Worker
2404*c0909341SAndroid Build Coastguard Worker.pass2:
2405*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: m4 = pw_2048, m5 = pixel maximum, m6 = zero (presumably for the
    ; 8x4 write helper - confirm), m7 = pw_1697x16 (the multiplier used by .main)
2406*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_2048)]
2407*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pixel_10bpc_max)]
2408*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
2409*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_1697x16)]
2410*c0909341SAndroid Build Coastguard Worker%endif
    ; r5d = number of 8x4 strips left to write
2411*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 4
2412*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2413*c0909341SAndroid Build Coastguard Worker.pass2_loop:
2414*c0909341SAndroid Build Coastguard Worker    call .main
2415*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2416*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
2417*c0909341SAndroid Build Coastguard Worker%else
2418*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
2419*c0909341SAndroid Build Coastguard Worker%endif
    ; zero the strip's coefficient slots; relies on m6 being zero here.
    ; NOTE(review): on x86-32 .main overwrites m6, so round2_and_write_8x4
    ; presumably leaves it zeroed - confirm.
2420*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
2421*c0909341SAndroid Build Coastguard Worker    dec                 r5d
2422*c0909341SAndroid Build Coastguard Worker    jle .end
    ; advance to the next strip and load its four input vectors
2423*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
2424*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2425*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*16]
2426*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 4*16]
2427*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 8*16]
2428*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+12*16]
2429*c0909341SAndroid Build Coastguard Worker    jmp .pass2_loop
2430*c0909341SAndroid Build Coastguard Worker.end:
2431*c0909341SAndroid Build Coastguard Worker    RET
2432*c0909341SAndroid Build Coastguard Worker.main:
2433*c0909341SAndroid Build Coastguard Worker    ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
2434*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: reload the multiplier on every call; m4-m7 receive the products,
    ; with m7 serving as both the constant and the last product
2435*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_1697x16)]
2436*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7, m0
2437*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7, m1
2438*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, m2
2439*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3
2440*c0909341SAndroid Build Coastguard Worker%else
    ; x86-64: multiplier stays in m7 (loaded in .pass2); products go to m8-m11
2441*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m7, m0
2442*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m7, m1
2443*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m7, m2
2444*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m7, m3
2445*c0909341SAndroid Build Coastguard Worker%endif
2446*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m0, m1, m2, m3
2447*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2448*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m8
2449*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m9
2450*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m10
2451*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m11
2452*c0909341SAndroid Build Coastguard Worker%else
2453*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
2454*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
2455*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6
2456*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m7
2457*c0909341SAndroid Build Coastguard Worker%endif
2458*c0909341SAndroid Build Coastguard Worker    ret
2459*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X4_FN type1, type2
; Forwards to INV_TXFM_FN (defined outside this chunk) for the 16x4 size with an
; architecture-specific register count / stack size and 0 as the third argument.
; For the dct_dct pairing it additionally emits a DC-only path:
;   dc   = (([cq] * 181 + 384) >> 9) * 2896 + 34816, of which bits 16-31 are used
;   rows = r3d (4 here); each row adds dc to 16 words at dstq and clamps the
;          result to [0, pixel_10bpc_max].
; NOTE(review): .dconly and .dconly2 look like shared entry points for other
; block sizes (arriving with r5d/r3d already set) - confirm against their users.
2460*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X4_FN 2 ; type1, type2
2461*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2462*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 0, 16x4, 16, 0-8*16
2463*c0909341SAndroid Build Coastguard Worker%else
2464*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 0, 16x4, 8, 0-12*16
2465*c0909341SAndroid Build Coastguard Worker%endif
2466*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
2467*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
2468*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
2469*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 4
2470*c0909341SAndroid Build Coastguard Worker.dconly:
2471*c0909341SAndroid Build Coastguard Worker    add                 r5d, 384
2472*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 9
2473*c0909341SAndroid Build Coastguard Worker.dconly2:
2474*c0909341SAndroid Build Coastguard Worker    imul                r5d, 2896
2475*c0909341SAndroid Build Coastguard Worker    add                 r5d, 34816
    ; broadcast word 1 of r5d (bits 16-31, i.e. the value >> 16) to all eight words of m0
2476*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5d
2477*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1111
2478*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
    ; m3 = upper clamp, m4 = zero (lower clamp)
2479*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pixel_10bpc_max)]
2480*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
2481*c0909341SAndroid Build Coastguard Worker.loop:
    ; one row per iteration: 32 bytes = 16 words at dstq
2482*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+ 0]
2483*c0909341SAndroid Build Coastguard Worker    mova                 m2, [dstq+16]
2484*c0909341SAndroid Build Coastguard Worker    REPX     {paddw  x, m0}, m1, m2
2485*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m3}, m1, m2
2486*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m4}, m1, m2
2487*c0909341SAndroid Build Coastguard Worker    mova          [dstq+ 0], m1
2488*c0909341SAndroid Build Coastguard Worker    mova          [dstq+16], m2
2489*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
2490*c0909341SAndroid Build Coastguard Worker    dec                 r3d
2491*c0909341SAndroid Build Coastguard Worker    jg .loop
2492*c0909341SAndroid Build Coastguard Worker    RET
2493*c0909341SAndroid Build Coastguard Worker%endif
2494*c0909341SAndroid Build Coastguard Worker%endmacro
2495*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X4_FN for every pair whose first type is dct. Only the
; dct_dct instance matches the macro's `%ifidn %1_%2, dct_dct` branch and so
; also gets the DC-only code (.dconly/.dconly2/.loop) emitted by that branch.
2496*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, dct
2497*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, identity
2498*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, adst
2499*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, flipadst
2500*c0909341SAndroid Build Coastguard Worker
; idct_16x4_internal_16bpc
; First-pass body of the 16x4 inverse DCT plus the helpers shared with other
; 16x4 transforms. Control leaves the first pass through `jmp tx2q`; .pass2 is
; the matching second-pass entry (presumably reached through such a jump set up
; by the INV_TXFM_FN wrappers - confirm against that macro).
;   In:      cq   = coefficient buffer, read here as sixteen 16-byte rows
;            tx2q = address jumped to once the first pass is finished
;   Scratch: r3   = rsp+gprsize, base of the 16-byte slots [r3+n*16]
;   x86-64 keeps pd_2048 / clip_18b_min / clip_18b_max / pd_2896 in m11-m14;
;   the x86-32 paths only touch m0-m7 and reload those constants from memory.
2501*c0909341SAndroid Build Coastguard Workercglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2502*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; constants consumed by .main_oddhalf (m11-m14) and .round (m12, m13)
2503*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
2504*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
2505*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
2506*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
2507*c0909341SAndroid Build Coastguard Worker%endif
2508*c0909341SAndroid Build Coastguard Worker    ; setup stack pointer
    ; (r3 stays the scratch base for every [r3+n*16] access in the helpers)
2509*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
2510*c0909341SAndroid Build Coastguard Worker
    ; odd-numbered rows of cq -> .main_oddhalf, which parks its eight results
    ; in the r3 scratch slots
2511*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 1*16]
2512*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 3*16]
2513*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 5*16]
2514*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 7*16]
2515*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 9*16]
2516*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+11*16]
2517*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+13*16]
2518*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*16]
2519*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf
    ; even-numbered rows of cq -> first-pass helpers of the 8x4 transform
2520*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*16]
2521*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 2*16]
2522*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*16]
2523*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 6*16]
2524*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*16]
2525*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+10*16]
2526*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*16]
2527*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+14*16]
2528*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
2529*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
2530*c0909341SAndroid Build Coastguard Worker    ; t0-7 is in m0-7
2531*c0909341SAndroid Build Coastguard Worker
    ; combine t0-7 with the stored odd half into out0-15 (see .round for where
    ; each output ends up on the two architectures)
2532*c0909341SAndroid Build Coastguard Worker    call .round
2533*c0909341SAndroid Build Coastguard Worker
2534*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; .pack_transpose is also a jump target for iadst_16x4_internal_16bpc,
    ; which comes here right after calling its own .main
2535*c0909341SAndroid Build Coastguard Worker.pack_transpose:
2536*c0909341SAndroid Build Coastguard Worker    ; transpose in two parts
    ; narrow the 16 dword vectors to 8 word vectors (signed saturation);
    ; results land in the even registers m0, m2, ..., m14
2537*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
2538*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
2539*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
2540*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
2541*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
2542*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
2543*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
2544*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
2545*c0909341SAndroid Build Coastguard Worker.transpose:
    ; low part via the shared 8x4 helper (presumably m0/m2/m4/m6 -> m0-m3, as
    ; the x86-32 branch below suggests), high part via the local _hi variant
2546*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
2547*c0909341SAndroid Build Coastguard Worker    call .transpose4x8packed_hi
2548*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: .round left out8-15 packed in m0/m2/m4/m6 and out0-7 packed in
    ; [r3+8*16..11*16]; transpose the first group and park it in
    ; [r3+0*16..3*16], then reload and transpose the second group into m0-m3
2549*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
2550*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
2551*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
2552*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m2
2553*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m3
2554*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
2555*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 9*16]
2556*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+10*16]
2557*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+11*16]
2558*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
2559*c0909341SAndroid Build Coastguard Worker%endif
    ; hand over to the second-pass routine selected by the caller
2560*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
2561*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; .transpose4x8packed_hi (x86-64 only)
    ;   In:  m8, m10, m12, m14 = packed word vectors
    ;   Out: m8-m11 = result of three rounds of word interleaves
    ;   Overwrites m12 and m14 as temporaries.
2562*c0909341SAndroid Build Coastguard Worker.transpose4x8packed_hi:
2563*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m10, m14
2564*c0909341SAndroid Build Coastguard Worker    punpckhwd           m10, m14
2565*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m8, m12
2566*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m12
2567*c0909341SAndroid Build Coastguard Worker
2568*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m8, m9
2569*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9
2570*c0909341SAndroid Build Coastguard Worker    punpckhwd           m12, m14, m10
2571*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m10
2572*c0909341SAndroid Build Coastguard Worker
2573*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11, m12
2574*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m12
2575*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m8, m14
2576*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m14
2577*c0909341SAndroid Build Coastguard Worker    ret
2578*c0909341SAndroid Build Coastguard Worker%endif
    ; .main_oddhalf_fast
    ;   Shortcut entry that reads only m0-m3: each input is multiplied directly
    ;   by the two constants of its pair instead of going through
    ;   ITX_MULSUB_2D, then execution joins .main_oddhalf at
    ;   .main_oddhalf_fast2.
    ;   x86-64: the eight unrounded products are left in m0-m7.
    ;   x86-32: m1/m6 are rounded here already, the rounded m1 is parked in
    ;           [r3+1*16], m4 holds pd_2048, and the pd_m2598 product goes to
    ;           m1 - the register layout the x86-32 .main_oddhalf_fast2 expects.
2579*c0909341SAndroid Build Coastguard Worker.main_oddhalf_fast: ; lower half zero
2580*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m0, [o(pd_4076)]
2581*c0909341SAndroid Build Coastguard Worker    pmulld               m0, [o(pd_401)]
2582*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1, [o(pd_m1189)]
2583*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [o(pd_3920)]
2584*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2585*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_2048)]
2586*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m4}, m1, m6
2587*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m1, m6
2588*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
2589*c0909341SAndroid Build Coastguard Worker%endif
2590*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m2, [o(pd_3612)]
2591*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_1931)]
2592*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2593*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m3, [o(pd_m2598)]
2594*c0909341SAndroid Build Coastguard Worker%else
2595*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3, [o(pd_m2598)]
2596*c0909341SAndroid Build Coastguard Worker%endif
2597*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3166)]
2598*c0909341SAndroid Build Coastguard Worker    jmp .main_oddhalf_fast2
    ; .main_oddhalf
    ;   In:  m0-m7 = the eight vectors loaded by the caller, r3 = scratch base;
    ;        on x86-64 additionally m11 = pd_2048, m12/m13 = clip_18b_min/max,
    ;        m14 = pd_2896
    ;   Out (values named by the inline t-labels):
    ;        x86-64: [r3+0*16..7*16]  = t8a, t9, t10a, t11, t12, t13a, t14, t15a
    ;        x86-32: [r3+11*16..4*16] = the same eight values, descending slots
    ;   x86-64 uses m8-m10 and m15 as temporaries; x86-32 also uses
    ;   [r3+0*16..3*16] as spill space.
2599*c0909341SAndroid Build Coastguard Worker.main_oddhalf:
2600*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2601*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  401, 4076 ; t8a,  t15a
2602*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
2603*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
2604*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3166, 2598 ; t9a,  t14a
2605*c0909341SAndroid Build Coastguard Worker.main_oddhalf_fast2:
    ; round the products: (x + 2048) >> 12
2606*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
2607*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
2608*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m4 ; t9
2609*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t8
2610*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m2 ; t10
2611*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t11
2612*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m5 ; t13
2613*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1     ; t12
2614*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m3 ; t14
2615*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t15
    ; clamp all eight butterfly results to [clip_18b_min, clip_18b_max]
2616*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
2617*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
2618*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_3784)]
2619*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_1567)]
2620*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 8, 3, 9, _, 11, 10, 15
2621*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 4, 3, 9, _, 11, 10, 15, 4
2622*c0909341SAndroid Build Coastguard Worker    psubd                m3, m1, m4 ; t10
2623*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4     ; t9
2624*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0, m2 ; t11a
2625*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ; t8a
2626*c0909341SAndroid Build Coastguard Worker    psubd                m2, m8, m6 ; t13
2627*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8     ; t14
2628*c0909341SAndroid Build Coastguard Worker    psubd                m8, m7, m5 ; t12a
2629*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5     ; t15a
2630*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
2631*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
    ; scale by pd_2896, add the pd_2048 bias once per sum/difference pair,
    ; then shift right by 12 after the butterflies
2632*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m2, m8, m3, m4
2633*c0909341SAndroid Build Coastguard Worker    paddd                m2, m11
2634*c0909341SAndroid Build Coastguard Worker    paddd                m8, m11
2635*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2, m3 ; t13a
2636*c0909341SAndroid Build Coastguard Worker    psubd                m2, m3     ; t10a
2637*c0909341SAndroid Build Coastguard Worker    psubd                m3, m8, m4 ; t11
2638*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8     ; t12
2639*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m5, m2, m3, m4
    ; park the odd half for .round: slot n holds m<n>
2640*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
2641*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
2642*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m2
2643*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m3
2644*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m4
2645*c0909341SAndroid Build Coastguard Worker    mova          [r3+5*16], m5
2646*c0909341SAndroid Build Coastguard Worker    mova          [r3+6*16], m6
2647*c0909341SAndroid Build Coastguard Worker    mova          [r3+7*16], m7
2648*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: spill m2-m5 so they can serve as ITX_MULSUB_2D temporaries and
    ; as the pd_2048 holder (m4)
2649*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m2
2650*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m3
2651*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m4
2652*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m5
2653*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_2048)]
2654*c0909341SAndroid Build Coastguard Worker
2655*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         0, 7, 2, 3, 5, _,  401, 4076 ; t8a,  t15a
2656*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
2657*c0909341SAndroid Build Coastguard Worker
    ; swap: bring the spilled inputs back and spill the first two result pairs
2658*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+0*16]
2659*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+1*16]
2660*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
2661*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
2662*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+2*16]
2663*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+3*16]
2664*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m6
2665*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
2666*c0909341SAndroid Build Coastguard Worker
2667*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
2668*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 3, 0, 6, 7, _, 3166, 2598 ; t9a,  t14a
2669*c0909341SAndroid Build Coastguard Worker
2670*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
2671*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+2*16]
2672*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
2673*c0909341SAndroid Build Coastguard Worker.main_oddhalf_fast2:
    ; m6 and the value in [r3+1*16] were rounded earlier, so only six
    ; registers get (x + 2048) >> 12 here (m4 = pd_2048)
2674*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m4}, m0, m7, m2, m5, m1, m3
2675*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m7, m2, m5, m1, m3
2676*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0, m1 ; t9
2677*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; t8
2678*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+1*16]
2679*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m4
2680*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m2 ; t10
2681*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t11
2682*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m5 ; t13
2683*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1     ; t12
2684*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m3 ; t14
2685*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t15
    ; clamp seven values in registers; the eighth (t9) sits in [r3+0*16] and is
    ; clamped through m3 once the constant is no longer needed
2686*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
2687*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
2688*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+0*16]
2689*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m3
2690*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
2691*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
2692*c0909341SAndroid Build Coastguard Worker    pminsd               m3, [r3+0*16]
    ; m3 = clamped t9; spill t8/t11/t12/t15 to free registers for the macros
2693*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
2694*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m2
2695*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m5
2696*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
2697*c0909341SAndroid Build Coastguard Worker    mova                m7, [o(pd_2048)]
2698*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 3, 0, 2, 5, 7, 1567, 3784
2699*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 4, 0, 2, _, 7,    5, 3784, 4
2700*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
2701*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+1*16]
2702*c0909341SAndroid Build Coastguard Worker    psubd                m5, m1, m4 ; t10
2703*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m5
2704*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4     ; t9
2705*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0, m2 ; t11a
2706*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ; t8a
2707*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+2*16]
2708*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
2709*c0909341SAndroid Build Coastguard Worker    psubd                m2, m3, m6 ; t13
2710*c0909341SAndroid Build Coastguard Worker    paddd                m6, m3     ; t14
2711*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7, m5 ; t15a
2712*c0909341SAndroid Build Coastguard Worker    psubd                m7, m5     ; t12a
2713*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m3
2714*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+1*16]
    ; same clamp pattern as above; this time t15a is the value kept in
    ; [r3+0*16] and clamped through m5
2715*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(clip_18b_min)]
2716*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
2717*c0909341SAndroid Build Coastguard Worker    pmaxsd               m5, [r3+0*16]
2718*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m5
2719*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(clip_18b_max)]
2720*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
2721*c0909341SAndroid Build Coastguard Worker    pminsd               m5, [r3+0*16]
2722*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m5
2723*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2896)]
2724*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m5}, m2, m7, m3, m4
2725*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
2726*c0909341SAndroid Build Coastguard Worker    REPX     {paddd  x, m5}, m2, m7
2727*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2, m3 ; t13a
2728*c0909341SAndroid Build Coastguard Worker    psubd                m2, m3     ; t10a
2729*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m4 ; t11
2730*c0909341SAndroid Build Coastguard Worker    paddd                m4, m7     ; t12
2731*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m5, m2, m3, m4
    ; reload the clamped t15a, then store m0-m7 to slots 11 down to 4
2732*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+0*16]
2733*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m0
2734*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m1
2735*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m2
2736*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m3
2737*c0909341SAndroid Build Coastguard Worker    mova          [r3+7*16], m4
2738*c0909341SAndroid Build Coastguard Worker    mova          [r3+6*16], m5
2739*c0909341SAndroid Build Coastguard Worker    mova          [r3+5*16], m6
2740*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m7
2741*c0909341SAndroid Build Coastguard Worker%endif
2742*c0909341SAndroid Build Coastguard Worker    ret
    ; .round
    ;   In:  m0-m7 = t0-7; odd half in the slots written by .main_oddhalf;
    ;        x86-64 also needs m12/m13 = clip_18b_min/max
    ;   Each of m0-m7 is clamped and incremented by 1 (psubd with an all-ones
    ;   register), then added to / subtracted from its odd-half partner and
    ;   shifted right by 1.
    ;   Out, x86-64: m0-m15 = out0-15 as dwords; m8-m15 are overwritten, so the
    ;                constants in m11-m14 are gone afterwards.
    ;   Out, x86-32: packed words - m0 = out8-9, m2 = out10-11, m4 = out12-13,
    ;                m6 = out14-15, [r3+8*16] = out0-1, [r3+9*16] = out2-3,
    ;                [r3+10*16] = out4-5, [r3+11*16] = out6-7.
2743*c0909341SAndroid Build Coastguard Worker.round:
2744*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2745*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
2746*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    ; m8 = all ones (-1 per dword), so the psubd below adds 1
2747*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m8
2748*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
2749*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+1*16]
2750*c0909341SAndroid Build Coastguard Worker    mova                 m9, [r3+2*16]
2751*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+3*16]
2752*c0909341SAndroid Build Coastguard Worker    mova                m11, [r3+4*16]
2753*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+5*16]
2754*c0909341SAndroid Build Coastguard Worker    mova                m13, [r3+6*16]
2755*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+7*16]
2756*c0909341SAndroid Build Coastguard Worker    psubd               m15, m0, m14       ; out15
2757*c0909341SAndroid Build Coastguard Worker    paddd                m0, m14           ; out0
2758*c0909341SAndroid Build Coastguard Worker    psubd               m14, m1, m13       ; out14
2759*c0909341SAndroid Build Coastguard Worker    paddd                m1, m13           ; out1
2760*c0909341SAndroid Build Coastguard Worker    psubd               m13, m2, m12       ; out13
2761*c0909341SAndroid Build Coastguard Worker    paddd                m2, m12           ; out2
2762*c0909341SAndroid Build Coastguard Worker    psubd               m12, m3, m11       ; out12
2763*c0909341SAndroid Build Coastguard Worker    paddd                m3, m11           ; out3
2764*c0909341SAndroid Build Coastguard Worker    psubd               m11, m4, m10       ; out11
2765*c0909341SAndroid Build Coastguard Worker    paddd                m4, m10           ; out4
2766*c0909341SAndroid Build Coastguard Worker    psubd               m10, m5, m9        ; out10
2767*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9            ; out5
2768*c0909341SAndroid Build Coastguard Worker    psubd                m9, m6, m8        ; out9
2769*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8            ; out6
2770*c0909341SAndroid Build Coastguard Worker    psubd                m8, m7, [r3+0*16] ; out8
2771*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3+0*16]     ; out7
2772*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
2773*c0909341SAndroid Build Coastguard Worker                             m8,  m9,  m10, m11, m12, m13, m14, m15
2774*c0909341SAndroid Build Coastguard Worker    ; and out0-15 is now in m0-15
2775*c0909341SAndroid Build Coastguard Worker%else
    ; clamp t0-7 with one register at a time rotated through [r3+0*16] to make
    ; room for the clip constants
2776*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m0
2777*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(clip_18b_min)]
2778*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
2779*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, [r3+ 0*16]
2780*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m7
2781*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(clip_18b_max)]
2782*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
2783*c0909341SAndroid Build Coastguard Worker    pminsd               m7, [r3+ 0*16]
2784*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m0
    ; m0 = all ones (-1 per dword): psubd adds 1 to m1-m7 here and to the
    ; spilled t0 a few lines below
2785*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m0
2786*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
2787*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 1*16], m1
2788*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 2*16], m2
2789*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+ 0*16]
2790*c0909341SAndroid Build Coastguard Worker    psubd                m1, m0
2791*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m1
    ; pair m7/m6 with the odd-half values in slots 11/10
2792*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+11*16]
2793*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+10*16]
2794*c0909341SAndroid Build Coastguard Worker    psubd                m0, m7, m1
2795*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1
2796*c0909341SAndroid Build Coastguard Worker    psubd                m1, m6, m2
2797*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2
2798*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0, m1, m6, m7
2799*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1     ; out8-9
2800*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7     ; out6-7
2801*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m6
    ; pair m5/m4 with slots 9/8
2802*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+9*16]
2803*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+8*16]
2804*c0909341SAndroid Build Coastguard Worker    psubd                m2, m5, m1
2805*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
2806*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4, m7
2807*c0909341SAndroid Build Coastguard Worker    paddd                m4, m7
2808*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m2, m1, m4, m5
2809*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1     ; out10-11
2810*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5     ; out4-5
    ; pair m3 and the value spilled to [r3+2*16] with slots 7/6
2811*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+2*16]
2812*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
2813*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+7*16]
2814*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+6*16]
2815*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m6
2816*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6
2817*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m7
2818*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7
2819*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m4, m6, m1, m3
2820*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6     ; out12-13
2821*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3     ; out2-3
    ; pair the values spilled to [r3+1*16] and [r3+0*16] with slots 5/4
2822*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+1*16]
2823*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m1
2824*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+0*16]
2825*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+5*16]
2826*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+4*16]
2827*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m5
2828*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
2829*c0909341SAndroid Build Coastguard Worker    psubd                m5, m1, m7
2830*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7
2831*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m6, m5, m1, m3
2832*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m5     ; out14-15
2833*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3     ; out0-1
2834*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m1
2835*c0909341SAndroid Build Coastguard Worker%endif
2836*c0909341SAndroid Build Coastguard Worker    ret
2837*c0909341SAndroid Build Coastguard Worker
    ; .pass2
    ;   Second pass, run as two 8-column halves. r4 = address of the 8bpc
    ;   SSSE3 idct_8x4 .main routine; .pass2_loop is a separate label,
    ;   presumably so other 16x4 transforms can enter with their own r4 -
    ;   confirm against the callers.
    ;   In: m0-m3 = first half; second half in m8-m11 (x86-64) or
    ;       [rsp+gprsize+0*16..3*16] (x86-32), as left by the first pass.
2838*c0909341SAndroid Build Coastguard Worker.pass2:
2839*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
2840*c0909341SAndroid Build Coastguard Worker.pass2_loop:
    ; r3 is repurposed here: it now holds stride*3 instead of the scratch base
2841*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2842*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; NOTE(review): r5 is presumably the base the 8bpc code's o() offsets are
    ; taken from - confirm against the 8bpc source
2843*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
2844*c0909341SAndroid Build Coastguard Worker%endif
2845*c0909341SAndroid Build Coastguard Worker    call                 r4
2846*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
    ; overwrite all sixteen 16-byte rows of cq with m6 (assumes m6 is zero on
    ; return from round2_and_write_8x4 - TODO confirm)
2847*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    ; fetch the second half of the transposed data into m0-m3
2848*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2849*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
2850*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
2851*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
2852*c0909341SAndroid Build Coastguard Worker    mova                 m3, m11
2853*c0909341SAndroid Build Coastguard Worker%else
2854*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+0*16]
2855*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+1*16]
2856*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+2*16]
2857*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+3*16]
2858*c0909341SAndroid Build Coastguard Worker%endif
    ; advance dst by 16 bytes for the second half
2859*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
2860*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2861*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
2862*c0909341SAndroid Build Coastguard Worker%endif
2863*c0909341SAndroid Build Coastguard Worker    call                 r4
2864*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
2865*c0909341SAndroid Build Coastguard Worker    RET
2866*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X4_FN for every pair whose first type is adst. None of
; these match the macro's `%ifidn %1_%2, dct_dct` branch, so no DC-only code is
; emitted for them.
2867*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, dct
2868*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, adst
2869*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, flipadst
2870*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, identity
2871*c0909341SAndroid Build Coastguard Worker
2872*c0909341SAndroid Build Coastguard Workercglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; Entry of the 16x4 inverse-ADST pass: runs .main (which falls through into
; .round) and then forwards the result.
;   x86-64: tail-jumps to idct_16x4_internal_16bpc.pack_transpose with the
;           rounded outputs in m0-m15.
;   x86-32: .round returns packed out8-15 in m0/m2/m4/m6 and leaves packed
;           out0-7 in [r3+8*16..11*16]; each group of four is passed through
;           transpose4x8packed before jumping to tx2q.
; NOTE(review): both helpers live in other functions outside this chunk; their
; exact register contracts should be confirmed there.
2873*c0909341SAndroid Build Coastguard Worker    ; setup stack pointer
2874*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize] ; r3 = base of the scratch slots addressed as [r3+N*16] by .main
2875*c0909341SAndroid Build Coastguard Worker    call .main
2876*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2877*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x4_internal_16bpc).pack_transpose
2878*c0909341SAndroid Build Coastguard Worker%else
    ; first group: the packed vectors returned in registers by .round (out8-15)
2879*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
    ; park the first group's result in scratch slots 0-3
2880*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0*16], m0
2881*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+1*16], m1
2882*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+2*16], m2
2883*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+3*16], m3
    ; second group: packed out0-7 that .round stored in slots 8-11
2884*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+ 8*16]
2885*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+ 9*16]
2886*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+10*16]
2887*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+11*16]
2888*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
2889*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
2890*c0909341SAndroid Build Coastguard Worker%endif
2891*c0909341SAndroid Build Coastguard Worker
2892*c0909341SAndroid Build Coastguard Worker.main:
; Driver for the 16-point inverse ADST. The work is split in two halves:
;   .main_part1 gets cq rows  2, 13,  6,  9, 10,  5, 14,  1 in m0-m7
;   .main_part2 gets cq rows  0, 15,  4, 11,  8,  7, 12,  3 in m0-m7
; and execution then falls through into .round (there is no ret here).
; Expects r3 to point at the scratch area (set up by the caller).
2893*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; constants kept in high registers for both halves:
    ; m11 = rounding term passed to ITX_MULSUB_2D, m12/m13 = clip bounds,
    ; m14 = 2896 multiplier (also turned into pd_1 by .round)
2894*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
2895*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
2896*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
2897*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
2898*c0909341SAndroid Build Coastguard Worker%endif
2899*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 2*16]
2900*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+13*16]
2901*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 6*16]
2902*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 9*16]
2903*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+10*16]
2904*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 5*16]
2905*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+14*16]
2906*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 1*16]
2907*c0909341SAndroid Build Coastguard Worker    call .main_part1
2908*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*16]
2909*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+15*16]
2910*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*16]
2911*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+11*16]
2912*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*16]
2913*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 7*16]
2914*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*16]
2915*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 3*16]
2916*c0909341SAndroid Build Coastguard Worker    call .main_part2
2917*c0909341SAndroid Build Coastguard Worker.round:
; Final rounding, shift and sign fix-up of the 16 values left by .main_part2.
;   - out0-3 and out12-15 are rounded with +1 and shifted right by 1.
;   - out4-11 are still "unshifted" (already multiplied by 2896); they get
;     +6144 and an arithmetic shift right by 13. 6144 = 2048 + 4096, i.e.
;     presumably the 12-bit rounding and the >>1 rounding folded together.
;   - Odd outputs arrive negated. They are un-negated either by subtracting
;     from a constant or via bitwise NOT (~x = -x-1), which is why some
;     lanes use 6143 or a trailing "+1" instead of the plain rounding term.
; x86-64 result: m0-m15 = out0-out15 (dwords).
; x86-32 result: words packed with signed saturation as pairs
;     m0 = out8|out9,  m2 = out10|out11, m4 = out12|out13, m6 = out14|out15,
;     [r3+8*16] = out0|out1, [r3+9*16] = out2|out3,
;     [r3+10*16] = out4|out5, [r3+11*16] = out6|out7.
2918*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2919*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_6144)]
2920*c0909341SAndroid Build Coastguard Worker    psrld               m14, 11       ; pd_1
2921*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m8       ; -1
2922*c0909341SAndroid Build Coastguard Worker    psubd               m13, m15, m14 ; pd_6143
2923*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m14}, m0, m2
2924*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m15}, m4, m6
    ; m1/m3/m5/m7 hold negated outputs: NOT them, then round/shift
2925*c0909341SAndroid Build Coastguard Worker    REPX     {pxor  x, m8 }, m1, m3, m5, m7
2926*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 1  }, m1, m3
2927*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m15}, m5, m7
2928*c0909341SAndroid Build Coastguard Worker    REPX     {psubd x, m8 }, m1, m3
    ; out8-11 arrive in m9-m12 (odd ones negated) and are shifted down one
    ; register into m8-m11; this also frees m12/m13 for out12/out13
2929*c0909341SAndroid Build Coastguard Worker    paddd                m8, m15, m9
2930*c0909341SAndroid Build Coastguard Worker    psubd                m9, m13, m10
2931*c0909341SAndroid Build Coastguard Worker    paddd               m10, m15, m11
2932*c0909341SAndroid Build Coastguard Worker    psubd               m11, m13, m12
    ; scratch slots: [r3+3*16] = out12, [r3+2*16] = -out13,
    ;                [r3+1*16] = out14, [r3+0*16] = -out15
2933*c0909341SAndroid Build Coastguard Worker    paddd               m12, m14, [r3+3*16]
2934*c0909341SAndroid Build Coastguard Worker    psubd               m13, m14, [r3+2*16]
2935*c0909341SAndroid Build Coastguard Worker    psubd               m15, m14, [r3+0*16]
2936*c0909341SAndroid Build Coastguard Worker    paddd               m14, [r3+1*16]
2937*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 1 }, m0,  m2,  m12, m13, m14, m15
2938*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
2939*c0909341SAndroid Build Coastguard Worker%else
    ; only 8 vector registers: spill -out9/-out11 (m1/m3) to free m1 for the
    ; all-ones mask and m3 for the 6144 rounding constant
2940*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m1
2941*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m3
2942*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_6144)]
2943*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m1, m1
    ; out4-7 (m4-m7, odd ones negated)
2944*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
2945*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m4, m5, m6, m7
2946*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m4, m5, m6, m7
2947*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
2948*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
2949*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
2950*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m6
    ; out0-3 from scratch slots 4-7 (odd ones negated); psubd by -1 adds 1
2951*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+4*16]
2952*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+5*16]
2953*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+6*16]
2954*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+7*16]
2955*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
2956*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m1}, m4, m6
2957*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 1 }, m4, m5, m6, m7
2958*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m1}, m5, m7
2959*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
2960*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
    ; reload the spilled -out9/-out11 before slots 8/9 are reused for out0-3
2961*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+8*16]
2962*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+9*16]
2963*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m4
2964*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m6
    ; out8-11 (m0, m5, m2, m7)
2965*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
2966*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m0, m5, m2, m7
2967*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m0, m5, m2, m7
2968*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m5
2969*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m7
    ; out12-15 from scratch slots 0-3 (odd ones negated)
2970*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+0*16]
2971*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+1*16]
2972*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+2*16]
2973*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
2974*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m1}, m4, m6
2975*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
2976*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 1 }, m4, m5, m6, m7
2977*c0909341SAndroid Build Coastguard Worker    REPX      {psubd x, m1}, m5, m7
2978*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
2979*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
2980*c0909341SAndroid Build Coastguard Worker%endif
2981*c0909341SAndroid Build Coastguard Worker    ret
2982*c0909341SAndroid Build Coastguard Worker
2983*c0909341SAndroid Build Coastguard Worker.main_part2:
; Second half of the 16-point inverse ADST butterfly network.
; In:  m0-m7 = cq rows 0, 15, 4, 11, 8, 7, 12, 3 (loaded by .main);
;      scratch slots [r3+0*16..7*16] hold the intermediates left by
;      .main_part1 (t2, t3, t6a, t7a, t10a, t11a, t14, t15; the slot
;      assignment differs between the x86-64 and x86-32 paths).
; Out: "unshifted" values are multiplied by 2896 but not yet rounded or
;      shifted; odd-numbered outputs are produced negated. .round finishes
;      both.
;   x86-64: m0 = out0, m1 = -out1, m2 = out2, m3 = -out3,
;           m4-m7 = out4, -out5, out6, -out7 (unshifted),
;           m9-m12 = out8, -out9, out10, -out11 (unshifted),
;           [r3+0*16] = -out15, [r3+1*16] = out14,
;           [r3+2*16] = -out13, [r3+3*16] = out12.
;           The constants in m11 (pd_2048) and m12 (clip_18b_min) are
;           overwritten on the way.
;   x86-32: see the summary comment at the end of the %else branch.
; NOTE(review): ITX_MULSUB_2D is defined outside this chunk. Where a
; coefficient argument is a small number (10/15 here, 4 and 6 in the x86-32
; path) it presumably names a register that already holds the constant -
; confirm against the macro definition.
2984*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2985*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  201, 4091
2986*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 1751, 3703
2987*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3035, 2751
2988*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 3857, 1380
2989*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m4 ; t8a
2990*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t0a
2991*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m5 ; t9a
2992*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t1a
2993*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m6 ; t12a
2994*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t4a
2995*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m7 ; t13a
2996*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t5a
    ; clamp every intermediate to [clip_18b_min, clip_18b_max] (m12/m13)
2997*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
2998*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
2999*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_4017)]
3000*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_799)]
    ; same coefficient pair for both rotations, order swapped in the second
3001*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         8, 4, 3, 9, _, 11, 10, 15
3002*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 5, 3, 9, _, 11, 15, 10
3003*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m2 ; t4
3004*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ; t0
3005*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m7 ; t5
3006*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7     ; t1
3007*c0909341SAndroid Build Coastguard Worker    psubd                m7, m4, m6 ; t12a
3008*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6     ; t8a
3009*c0909341SAndroid Build Coastguard Worker    psubd                m6, m8, m5 ; t13a
3010*c0909341SAndroid Build Coastguard Worker    paddd                m5, m8     ; t9a
3011*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
3012*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
3013*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_3784)]
3014*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_1567)]
3015*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 2, 8, 9, _, 11, 10, 15
3016*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 8, 9, _, 11, 10, 15
    ; merge with the .main_part1 intermediates; each scratch slot is reused
    ; for an output as soon as its input has been consumed
3017*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+0*16]      ;  t2
3018*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+1*16]      ;  t3
3019*c0909341SAndroid Build Coastguard Worker    psubd                m9, m0, m10        ;  t2a
3020*c0909341SAndroid Build Coastguard Worker    paddd                m0, m10            ;  out0
3021*c0909341SAndroid Build Coastguard Worker    psubd               m10, m1, m8         ;  t3a
3022*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8             ; -out15
3023*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m1
3024*c0909341SAndroid Build Coastguard Worker    mova                m15, [r3+3*16]      ;  t7a
3025*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+2*16]      ;  t6a
3026*c0909341SAndroid Build Coastguard Worker    psubd                m8, m3, m15        ;  t7
3027*c0909341SAndroid Build Coastguard Worker    paddd               m15, m3             ;  out12
3028*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2, m1         ; -out3
3029*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1             ;  t6
3030*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m15
    ; t6 is parked in slot 1 until it is clipped below
3031*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m2
3032*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+7*16]      ;  t15
3033*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+6*16]      ;  t14
3034*c0909341SAndroid Build Coastguard Worker    paddd               m15, m7, m1         ; -out13
3035*c0909341SAndroid Build Coastguard Worker    psubd                m7, m1             ;  t15a
3036*c0909341SAndroid Build Coastguard Worker    psubd               m11, m6, m2         ;  t14a
3037*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6             ;  out2
3038*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m15
3039*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+4*16]      ;  t10a
3040*c0909341SAndroid Build Coastguard Worker    mova                m15, [r3+5*16]      ;  t11a
3041*c0909341SAndroid Build Coastguard Worker    psubd                m6, m4, m1         ;  t10
3042*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4             ; -out1
3043*c0909341SAndroid Build Coastguard Worker    psubd                m4, m5, m15        ;  t11
3044*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15            ;  out14
3045*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
    ; lower-clip the spilled t6 by max-ing it into the min constant itself:
    ; m12 now holds t6 and the clip_18b_min constant is gone
3046*c0909341SAndroid Build Coastguard Worker    pmaxsd              m12, [r3+1*16]      ;  t6
3047*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m5
3048*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
    ; scale by 2896 (m14); rounding and shift are deferred to .round
3049*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
3050*c0909341SAndroid Build Coastguard Worker    paddd                m5, m11, m7        ; -out5  (unshifted)
3051*c0909341SAndroid Build Coastguard Worker    psubd               m11, m7             ;  out10 (unshifted)
3052*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9, m10        ; -out7  (unshifted)
3053*c0909341SAndroid Build Coastguard Worker    psubd                m9, m10            ;  out8  (unshifted)
3054*c0909341SAndroid Build Coastguard Worker    psubd               m10, m6, m4         ; -out9  (unshifted)
3055*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4             ;  out6  (unshifted)
3056*c0909341SAndroid Build Coastguard Worker    paddd                m4, m12, m8        ;  out4  (unshifted)
3057*c0909341SAndroid Build Coastguard Worker    psubd               m12, m8             ; -out11 (unshifted)
3058*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: only m0-m7 exist, so the same network is computed with
    ; scratch slots 8-11 used as spill space for inputs and temporaries
3059*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m0
3060*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m1
3061*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m2
3062*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m3
3063*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2048)]
3064*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 4, 0, 1, 2, 3, 3035, 2751
3065*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 0, 1, 2, 3, 3857, 1380
    ; swap: bring the spilled inputs back, spill the four rotation results
3066*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+8*16]
3067*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+9*16]
3068*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m4
3069*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+10*16]
3070*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m5
3071*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m6
3072*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+11*16]
3073*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m7
3074*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 0, 2, 6, 7, 3,  201, 4091
3075*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 4, 2, 6, 7, 3, 1751, 3703
3076*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+8*16]
3077*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+9*16]
3078*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m2 ; t8a
3079*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ; t0a
3080*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m3
3081*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m6 ; t9a
3082*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6     ; t1a
3083*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+10*16]
3084*c0909341SAndroid Build Coastguard Worker    psubd                m6, m4, m3 ; t12a
3085*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3     ; t4a
3086*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+11*16]
3087*c0909341SAndroid Build Coastguard Worker    psubd                m7, m5, m3 ; t13a
3088*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3     ; t5a
    ; clamp seven values in registers; the eighth (t8a, spilled to slot 8)
    ; is clamped by min/max-ing it into the constant register and storing back
3089*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
3090*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
3091*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+8*16]
3092*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m3
3093*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
3094*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
3095*c0909341SAndroid Build Coastguard Worker    pminsd               m3, [r3+8*16]
3096*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m3
3097*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m4 ; t4
3098*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t0
3099*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m5 ; t5
3100*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t1
3101*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
    ; spill t1/t5/t4, reload clamped t8a into m3, then park t0 in slot 8
3102*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m1
3103*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
3104*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m3
3105*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+8*16]
3106*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m0
3107*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 2, 0, 1, 4, 5,  799, 4017
3108*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 0, 1, 4, 5, 4017,    4
3109*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m7 ; t12a
3110*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7     ; t8a
3111*c0909341SAndroid Build Coastguard Worker    psubd                m7, m3, m6 ; t13a
3112*c0909341SAndroid Build Coastguard Worker    paddd                m6, m3     ; t9a
3113*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+8*16]
3114*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+9*16]
3115*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+10*16]
    ; second clamp; t4 (slot 11) is clamped through the constant register
    ; again and ends up in m3
3116*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
3117*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
3118*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+11*16]
3119*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m3
3120*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
3121*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
3122*c0909341SAndroid Build Coastguard Worker    pminsd               m3, [r3+8*16]
    ; spill t0, t1, t8a, t9a to slots 8-11 to free registers for the rotations
3123*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m0
3124*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m1
3125*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m2
3126*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m6
3127*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
3128*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 4, 1, 2, 6, 0, 1567, 3784
3129*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 7, 1, 2, 6, 0,    6, 3784
3130*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+7*16]      ;  t7a
3131*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+6*16]      ;  t6a
3132*c0909341SAndroid Build Coastguard Worker    psubd                m1, m3, m0         ;  t7
3133*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3             ;  out12
3134*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4, m2         ; -out3
3135*c0909341SAndroid Build Coastguard Worker    psubd                m4, m2             ;  t6
3136*c0909341SAndroid Build Coastguard Worker    mova          [r3+7*16], m3
3137*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+3*16]      ;  t15
3138*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+2*16]      ;  t14
3139*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5, m3         ; -out13
3140*c0909341SAndroid Build Coastguard Worker    psubd                m5, m3             ;  t15a
3141*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m2         ;  t14a
3142*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7             ;  out2
3143*c0909341SAndroid Build Coastguard Worker    mova          [r3+6*16], m2
3144*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+0*16]      ;  t10a
3145*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+1*16]      ;  t11a
3146*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
3147*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m6
3148*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+11*16]
3149*c0909341SAndroid Build Coastguard Worker    psubd                m0, m6, m2         ;  t11
3150*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2             ;  out14
3151*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m6
3152*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+10*16]
3153*c0909341SAndroid Build Coastguard Worker    psubd                m6, m2, m7         ;  t10
3154*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7             ; -out1
3155*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+5*16]      ;  t3
3156*c0909341SAndroid Build Coastguard Worker    mova          [r3+5*16], m2
    ; t7 is parked in slot 10 until the final clamp/scale below
3157*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m1
3158*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+9*16]
3159*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m7         ;  t3a
3160*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7             ; -out15
3161*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m1
3162*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+4*16]      ;  t2
3163*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+8*16]
3164*c0909341SAndroid Build Coastguard Worker    psubd                m7, m1             ;  t2a
3165*c0909341SAndroid Build Coastguard Worker    paddd                m1, [r3+8*16]      ;  out0
3166*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m1
    ; final clamp and 2896 scaling; the spilled t7 (slot 10) is again
    ; processed through the constant register
3167*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(clip_18b_min)]
3168*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
3169*c0909341SAndroid Build Coastguard Worker    pmaxsd               m1, [r3+10*16]
3170*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m1
3171*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(clip_18b_max)]
3172*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
3173*c0909341SAndroid Build Coastguard Worker    pminsd               m1, [r3+10*16]
3174*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m1
3175*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(pd_2896)]
3176*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
3177*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [r3+10*16]
    ; slot 11 is a temporary for t14a, then t15a, while their registers are reused
3178*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m3
3179*c0909341SAndroid Build Coastguard Worker    psubd                m3, m4, m1         ; -out11 (unshifted)
3180*c0909341SAndroid Build Coastguard Worker    paddd                m4, m1             ;  out4  (unshifted)
3181*c0909341SAndroid Build Coastguard Worker    psubd                m1, m6, m0         ; -out9  (unshifted)
3182*c0909341SAndroid Build Coastguard Worker    paddd                m6, m0             ;  out6  (unshifted)
3183*c0909341SAndroid Build Coastguard Worker    psubd                m0, m7, m2         ;  out8  (unshifted)
3184*c0909341SAndroid Build Coastguard Worker    paddd                m7, m2             ; -out7  (unshifted)
3185*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+11*16]
3186*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m5
3187*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2             ; -out5  (unshifted)
3188*c0909341SAndroid Build Coastguard Worker    psubd                m2, [r3+11*16]     ;  out10 (unshifted)
3189*c0909341SAndroid Build Coastguard Worker    ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
3190*c0909341SAndroid Build Coastguard Worker    ; [r3+4*16..7*16] contain out0-3, [r3+0*16..3*16] contain out12-15
    ; (odd-numbered outputs are stored negated in both registers and slots)
3191*c0909341SAndroid Build Coastguard Worker%endif
3192*c0909341SAndroid Build Coastguard Worker    ret
3193*c0909341SAndroid Build Coastguard Worker.main_part1:
3194*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3195*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  995, 3973
3196*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 2440, 3290
3197*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3513, 2106
3198*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 4052,  601
3199*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m4 ; t10a
3200*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t2a
3201*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m5 ; t11a
3202*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t3a
3203*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m6 ; t14a
3204*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t6a
3205*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m7 ; t15a
3206*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t7a
3207*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
3208*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
3209*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_2276)]
3210*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_3406)]
3211*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         8, 4, 3, 9, _, 11, 10, 15
3212*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 5, 3, 9, _, 11, 15, 10
3213*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m2 ; t6
3214*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ; t2
3215*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m7 ; t7
3216*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7     ; t3
3217*c0909341SAndroid Build Coastguard Worker    psubd                m7, m4, m6 ; t14a
3218*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6     ; t10a
3219*c0909341SAndroid Build Coastguard Worker    psubd                m6, m8, m5 ; t15a
3220*c0909341SAndroid Build Coastguard Worker    paddd                m5, m8     ; t11a
3221*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
3222*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
3223*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_1567)]
3224*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_3784)]
3225*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 3, 8, 9, _, 11, 10, 15
3226*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 7, 8, 9, _, 11, 10, 15
3227*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
3228*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
3229*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m4
3230*c0909341SAndroid Build Coastguard Worker    mova          [r3+5*16], m5
3231*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m2
3232*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m3
3233*c0909341SAndroid Build Coastguard Worker    mova          [r3+6*16], m6
3234*c0909341SAndroid Build Coastguard Worker    mova          [r3+7*16], m7
3235*c0909341SAndroid Build Coastguard Worker%else
3236*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m0
3237*c0909341SAndroid Build Coastguard Worker    mova          [r3+5*16], m1
3238*c0909341SAndroid Build Coastguard Worker    mova          [r3+6*16], m2
3239*c0909341SAndroid Build Coastguard Worker    mova          [r3+7*16], m3
3240*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2048)]
3241*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 4, 0, 1, 2, 3, 3513, 2106
3242*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 0, 1, 2, 3, 4052,  601
3243*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m4
3244*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m5
3245*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m6
3246*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
3247*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+4*16]
3248*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+5*16]
3249*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+6*16]
3250*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+7*16]
3251*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 0, 4, 5, 6, 3,  995, 3973
3252*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 2, 4, 5, 6, 3, 2440, 3290
3253*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+0*16]
3254*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+1*16]
3255*c0909341SAndroid Build Coastguard Worker    psubd                m6, m0, m4 ; t10a
3256*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t2a
3257*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m6
3258*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+2*16]
3259*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+3*16]
3260*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m5 ; t11a
3261*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t3a
3262*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m6 ; t14a
3263*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t6a
3264*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7, m3 ; t15a
3265*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t7a
3266*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
3267*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
3268*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+4*16]
3269*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m3
3270*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
3271*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
3272*c0909341SAndroid Build Coastguard Worker    pminsd               m3, [r3+4*16]
3273*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m3
3274*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m2 ; t6
3275*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ; t2
3276*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m7 ; t7
3277*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7     ; t3
3278*c0909341SAndroid Build Coastguard Worker    mova          [r3+5*16], m1
3279*c0909341SAndroid Build Coastguard Worker    mova          [r3+6*16], m3
3280*c0909341SAndroid Build Coastguard Worker    mova          [r3+7*16], m2
3281*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+4*16]
3282*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m0
3283*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2048)]
3284*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 4, 0, 7, 2, 3, 3406, 2276
3285*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 5, 0, 7, 2, 3, 2276,    2
3286*c0909341SAndroid Build Coastguard Worker    psubd                m7, m4, m6 ; t14a
3287*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6     ; t10a
3288*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m5 ; t15a
3289*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1     ; t11a
3290*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+5*16]
3291*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+6*16]
3292*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+7*16]
3293*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(clip_18b_min)]
3294*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
3295*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, [r3+4*16]
3296*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m0
3297*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(clip_18b_max)]
3298*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
3299*c0909341SAndroid Build Coastguard Worker    pminsd               m0, [r3+4*16]
3300*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m0
3301*c0909341SAndroid Build Coastguard Worker    mova          [r3+5*16], m1
3302*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m4
3303*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m5
3304*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
3305*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 3, 1, 4, 5, 0, 3784, 1567
3306*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 7, 1, 4, 5, 0,    5, 1567
3307*c0909341SAndroid Build Coastguard Worker    mova          [r3+6*16], m2
3308*c0909341SAndroid Build Coastguard Worker    mova          [r3+7*16], m3
3309*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m6
3310*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
3311*c0909341SAndroid Build Coastguard Worker%endif
3312*c0909341SAndroid Build Coastguard Worker    ret
3313*c0909341SAndroid Build Coastguard Worker
3314*c0909341SAndroid Build Coastguard Worker.pass2:
3315*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
3316*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x4_internal_16bpc).pass2_loop
3317*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x4 entry points whose first transform type is flipadst,
; one per second transform type (dct, adst, flipadst, identity).
; NOTE(review): INV_TXFM_16X4_FN is defined outside this excerpt; presumably
; each expansion dispatches to iflipadst_16x4_internal_16bpc -- confirm there.
3318*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, dct
3319*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, adst
3320*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, flipadst
3321*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, identity
3322*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x4_internal_16bpc
; First pass: runs the .main kernel of iadst_16x4_internal_16bpc, then passes
; its results on in reversed register order before they are transposed.
; Second pass (.pass2): moves dst to its fourth row and negates the stride, so
; the pass-2 loop shared with idct_16x4 walks the destination bottom-up.
3323*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; r3 = base of the stack scratch area (the rsp+gprsize slots used below).
3324*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
3325*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_16bpc).main
3326*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; Pack each dword register pair to words with the odd-numbered register as the
; destination (low half from the odd register, high half from the even one).
3327*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0
3328*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m2
3329*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m4
3330*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m6
3331*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m8
3332*c0909341SAndroid Build Coastguard Worker    packssdw            m11, m10
3333*c0909341SAndroid Build Coastguard Worker    packssdw            m13, m12
3334*c0909341SAndroid Build Coastguard Worker    packssdw            m15, m14
; Copy m15,m13,...,m1 into m0,m2,...,m14: the eight packed registers end up in
; reverse order in the even-numbered registers. Every source is read before it
; is overwritten (m7 is read into m8 before m10..m14 are written, and so on).
3335*c0909341SAndroid Build Coastguard Worker    mova                 m0, m15
3336*c0909341SAndroid Build Coastguard Worker    mova                 m2, m13
3337*c0909341SAndroid Build Coastguard Worker    mova                 m4, m11
3338*c0909341SAndroid Build Coastguard Worker    mova                 m6, m9
3339*c0909341SAndroid Build Coastguard Worker    mova                 m8, m7
3340*c0909341SAndroid Build Coastguard Worker    mova                m10, m5
3341*c0909341SAndroid Build Coastguard Worker    mova                m12, m3
3342*c0909341SAndroid Build Coastguard Worker    mova                m14, m1
3343*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x4_internal_16bpc).transpose
3344*c0909341SAndroid Build Coastguard Worker%else
; x86-32: park m0/m2/m4/m6 in scratch slots 4-7 so the registers are free.
3345*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+4*16], m0
3346*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+5*16], m2
3347*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+6*16], m4
3348*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+7*16], m6
; Read slots 8-11 back in reverse order (slot 8 -> m6 ... slot 11 -> m0);
; pshufd with q1032 also swaps the two 64-bit halves of each vector.
; NOTE(review): slots 8-11 are presumably filled by .main -- confirm there.
3349*c0909341SAndroid Build Coastguard Worker    pshufd               m6, [rsp+gprsize+ 8*16], q1032
3350*c0909341SAndroid Build Coastguard Worker    pshufd               m4, [rsp+gprsize+ 9*16], q1032
3351*c0909341SAndroid Build Coastguard Worker    pshufd               m2, [rsp+gprsize+10*16], q1032
3352*c0909341SAndroid Build Coastguard Worker    pshufd               m0, [rsp+gprsize+11*16], q1032
3353*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
; Keep the first transposed batch in scratch slots 0-3.
3354*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0*16], m0
3355*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+1*16], m1
3356*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+2*16], m2
3357*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+3*16], m3
; Same reversal and half-swap for the vectors parked in slots 4-7; this second
; batch stays in m0-m3 when control moves on to the second pass.
3358*c0909341SAndroid Build Coastguard Worker    pshufd               m6, [rsp+gprsize+ 4*16], q1032
3359*c0909341SAndroid Build Coastguard Worker    pshufd               m4, [rsp+gprsize+ 5*16], q1032
3360*c0909341SAndroid Build Coastguard Worker    pshufd               m2, [rsp+gprsize+ 6*16], q1032
3361*c0909341SAndroid Build Coastguard Worker    pshufd               m0, [rsp+gprsize+ 7*16], q1032
3362*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
3363*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
3364*c0909341SAndroid Build Coastguard Worker%endif
3365*c0909341SAndroid Build Coastguard Worker
3366*c0909341SAndroid Build Coastguard Worker.pass2:
; r3 = 3*stride; dst += 3*stride (fourth row); stride = -stride.
3367*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3368*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+r3]
3369*c0909341SAndroid Build Coastguard Worker    neg             strideq
; r4 = address of the 8bpc SSSE3 iadst 8x4 kernel, handed to the pass-2 loop
; of idct_16x4 (that loop is outside this excerpt).
3370*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
3371*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x4_internal_16bpc).pass2_loop
3372*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x4 entry points whose first transform type is identity,
; one per second transform type (dct, adst, flipadst, identity).
; NOTE(review): INV_TXFM_16X4_FN is defined outside this excerpt; presumably
; each expansion dispatches to iidentity_16x4_internal_16bpc -- confirm there.
3373*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, dct
3374*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, adst
3375*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, flipadst
3376*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, identity
3377*c0909341SAndroid Build Coastguard Worker
; iidentity_16x4_internal_16bpc -- first pass.
; Every 16-byte coefficient vector at cq is turned, per dword, into
;     (x * pd_11586 + pd_6144) >> 13        (arithmetic shift)
; x86-64 handles all 16 vectors at once and continues at the pack_transpose
; label of idct_16x4; x86-32 runs two batches of 8 vectors and packs and
; transposes each batch itself before jumping to the second pass via tx2q.
; NOTE(review): going by their names, pd_11586 / pd_6144 hold 11586 and 6144
; in every dword (11586 / 2^13 ~= 1.4143); the constants are defined outside
; this excerpt -- confirm.
3378*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3379*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3380*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_11586)]
3381*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m15, [cq+ 0*16]
3382*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m15, [cq+ 1*16]
3383*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m15, [cq+ 2*16]
3384*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m15, [cq+ 3*16]
3385*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m15, [cq+ 4*16]
3386*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m15, [cq+ 5*16]
3387*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m15, [cq+ 6*16]
3388*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m15, [cq+ 7*16]
3389*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m15, [cq+ 8*16]
3390*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m15, [cq+ 9*16]
3391*c0909341SAndroid Build Coastguard Worker    pmulld              m10, m15, [cq+10*16]
3392*c0909341SAndroid Build Coastguard Worker    pmulld              m11, m15, [cq+11*16]
3393*c0909341SAndroid Build Coastguard Worker    pmulld              m12, m15, [cq+12*16]
3394*c0909341SAndroid Build Coastguard Worker    pmulld              m13, m15, [cq+13*16]
3395*c0909341SAndroid Build Coastguard Worker    pmulld              m14, m15, [cq+14*16]
; m15 is both the multiplier and the 16th result. Its product is parked in
; [cq+0*16] (that vector was already consumed into m0) while m15 carries the
; rounding constant, and is added back in afterwards.
3396*c0909341SAndroid Build Coastguard Worker    pmulld              m15, [cq+15*16]
3397*c0909341SAndroid Build Coastguard Worker    mova         [cq+ 0*16], m15
3398*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_6144)]
3399*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3400*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14
3401*c0909341SAndroid Build Coastguard Worker    paddd               m15, [cq+ 0*16]
3402*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
3403*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14, m15
3404*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x4_internal_16bpc).pack_transpose
3405*c0909341SAndroid Build Coastguard Worker%else
; x86-32: two batches of 8 vectors, counted by r5d. The batch at cq+8*16 is
; processed first; cq is stepped back by 8*16 for the second batch.
3406*c0909341SAndroid Build Coastguard Worker    add                  cq, 8*16
3407*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 2
3408*c0909341SAndroid Build Coastguard Worker.loop_pass1:
3409*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_11586)]
3410*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m7, [cq+0*16]
3411*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m7, [cq+1*16]
3412*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m7, [cq+2*16]
3413*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m7, [cq+3*16]
3414*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m7, [cq+4*16]
3415*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m7, [cq+5*16]
3416*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m7, [cq+6*16]
; Same spill trick as the x86-64 branch, with m7 and [cq+7*16] as the slot
; (the product overwrites the coefficient it was computed from).
3417*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [cq+7*16]
3418*c0909341SAndroid Build Coastguard Worker    mova          [cq+7*16], m7
3419*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_6144)]
3420*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
3421*c0909341SAndroid Build Coastguard Worker    paddd                m7, [cq+7*16]
3422*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
; Pack dword pairs to saturated words, then transpose the packed block.
3423*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3424*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3425*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
3426*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
3427*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
; After the second batch the result stays in m0-m3; after the first batch it
; is saved to stack scratch slots 0-3 before looping.
3428*c0909341SAndroid Build Coastguard Worker    dec                 r5d
3429*c0909341SAndroid Build Coastguard Worker    jz .end_pass1
3430*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0*16], m0
3431*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+1*16], m1
3432*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+2*16], m2
3433*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+3*16], m3
3434*c0909341SAndroid Build Coastguard Worker    sub                  cq, 8*16
3435*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
3436*c0909341SAndroid Build Coastguard Worker.end_pass1:
3437*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
3438*c0909341SAndroid Build Coastguard Worker%endif
3439*c0909341SAndroid Build Coastguard Worker
; Second pass of iidentity_16x4: hands the local .main kernel (address in r4)
; to the pass-2 loop shared with idct_16x4 (outside this excerpt).
; On x86-64 m12 is preloaded with pw_1697x8 for .main; on x86-32 .main loads
; the constant itself on every call.
3440*c0909341SAndroid Build Coastguard Worker.pass2:
3441*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3442*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(pw_1697x8)]
3443*c0909341SAndroid Build Coastguard Worker%endif
3444*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(.main)]
3445*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x4_internal_16bpc).pass2_loop
; .main: m0-m3 += pmulhrsw(m0-m3, pw_1697x8), using saturating word adds.
; In/out: m0-m3 (packed words). Clobbers m4-m7.
; NOTE(review): going by its name, pw_1697x8 holds 1697*8 in every word, which
; would make the result roughly x * 1.414 -- confirm against the constant table.
3446*c0909341SAndroid Build Coastguard Worker.main:
3447*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3448*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m0, m12
3449*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m1, m12
3450*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m2, m12
3451*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3, m12
3452*c0909341SAndroid Build Coastguard Worker%else
; x86-32: m7 holds the constant and is overwritten last. The operands of the
; final multiply are swapped, which gives the same product.
3453*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_1697x8)]
3454*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m0, m7
3455*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m1, m7
3456*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m2, m7
3457*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3
3458*c0909341SAndroid Build Coastguard Worker%endif
3459*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
3460*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
3461*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6
3462*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m7
3463*c0909341SAndroid Build Coastguard Worker    ret
3464*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X8_FN type1, type2[, eob_offset = 0]
; Expands INV_TXFM_FN (defined outside this excerpt) for the 16x8 size, with
; arch-specific values for its last two arguments.
; NOTE(review): those two arguments are presumably the XMM register count and
; the stack size -- confirm against INV_TXFM_FN.
; For the dct_dct combination only, it also emits a DC-only path that scales
; the first coefficient and continues at the .dconly label of the 16x4
; dct_dct function.
3465*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
3466*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3467*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 16x8, 16, 0-8*16
3468*c0909341SAndroid Build Coastguard Worker%else
3469*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 16x8, 8, 0-13*16
3470*c0909341SAndroid Build Coastguard Worker%endif
3471*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
; r5d = ((dc * 181 + 128) >> 8) * 181; the coefficient slot is overwritten
; with eobd (zero on this path, per the existing comment).
; NOTE(review): r3d = 8 is presumably the row count expected by .dconly.
3472*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
3473*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
3474*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8
3475*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
3476*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
3477*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
3478*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; NOTE(review): presumably adjusts rsp to the stack layout that the 16x4
; .dconly tail expects (this function reserves 13*16 bytes) -- confirm.
3479*c0909341SAndroid Build Coastguard Worker    add                 rsp, 1*16
3480*c0909341SAndroid Build Coastguard Worker%endif
3481*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
3482*c0909341SAndroid Build Coastguard Worker%endif
3483*c0909341SAndroid Build Coastguard Worker%endmacro
3484*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x8 entry points whose first transform type is dct.
; dct_dct additionally gets the DC-only path emitted by INV_TXFM_16X8_FN;
; dct/identity passes an eob_offset of 6, the others use the default of 0.
3485*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, dct
3486*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, identity, 6
3487*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, adst
3488*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, flipadst
3489*c0909341SAndroid Build Coastguard Worker
; idct_16x8_internal_16bpc -- first-pass driver.
; Calls the 1-D kernel whose address is in t0 on one or two 16-byte column
; halves of the 32-byte coefficient rows at cq, transposes the packed results
; and writes them back into cq. The result of the last call is left in m0-m3
; for the second pass, which is reached through tx2q.
; .loop_main is also an entry point: iadst_16x8_internal_16bpc jumps to it
; with its own kernel address in t0.
3490*c0909341SAndroid Build Coastguard Workercglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3491*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3492*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 6, 4, 6
3493*c0909341SAndroid Build Coastguard Worker%else
; On x86-32 t0 is r1, so r1 is saved in stack scratch slot 12 first; .pass2
; reloads strideq from that same slot.
3494*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize+12*16], r1
3495*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 1, 4, 3
3496*c0909341SAndroid Build Coastguard Worker%endif
3497*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
3498*c0909341SAndroid Build Coastguard Worker.loop_main:
3499*c0909341SAndroid Build Coastguard Worker%undef cmp
; r5d = 16 if eob >= 10, otherwise 0: the byte offset of the column half to
; start with. Both arch variants compute the same 0/1 flag before the shift.
3500*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3501*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
3502*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 10
3503*c0909341SAndroid Build Coastguard Worker    setge               r5b
3504*c0909341SAndroid Build Coastguard Worker%else
3505*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 1
3506*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 10
3507*c0909341SAndroid Build Coastguard Worker    sbb                 r5d, 0
3508*c0909341SAndroid Build Coastguard Worker%endif
3509*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
3510*c0909341SAndroid Build Coastguard Worker
; r3 = base of the stack scratch area.
3511*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
3512*c0909341SAndroid Build Coastguard Worker.loop_pass1:
3513*c0909341SAndroid Build Coastguard Worker    call                 t0
3514*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64: transpose the upper registers and store m8-m11 into rows 4-7 of the
; current column half.
3515*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
3516*c0909341SAndroid Build Coastguard Worker    mova       [cq+4*32+r5], m8
3517*c0909341SAndroid Build Coastguard Worker    mova       [cq+5*32+r5], m9
3518*c0909341SAndroid Build Coastguard Worker    mova       [cq+6*32+r5], m10
3519*c0909341SAndroid Build Coastguard Worker    mova       [cq+7*32+r5], m11
3520*c0909341SAndroid Build Coastguard Worker%else
; x86-32: transpose what is in registers, store it into rows 4-7, then fetch
; the remaining four vectors from stack scratch slots 8-11.
; NOTE(review): slots 8-11 are presumably written by the kernel -- confirm.
3521*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
3522*c0909341SAndroid Build Coastguard Worker    mova       [cq+4*32+r5], m0
3523*c0909341SAndroid Build Coastguard Worker    mova       [cq+5*32+r5], m1
3524*c0909341SAndroid Build Coastguard Worker    mova       [cq+6*32+r5], m2
3525*c0909341SAndroid Build Coastguard Worker    mova       [cq+7*32+r5], m3
3526*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+ 8*16]
3527*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+ 9*16]
3528*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+10*16]
3529*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+11*16]
3530*c0909341SAndroid Build Coastguard Worker%endif
3531*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
; Clear coefficient rows 8-15 of the current column half.
3532*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3533*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
; r5 == 0 means this was the last (or only) half: leave its result in m0-m3.
; Otherwise store m0-m3 into rows 0-3 of this half and redo with offset 0.
3534*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
3535*c0909341SAndroid Build Coastguard Worker    jz .end
3536*c0909341SAndroid Build Coastguard Worker    mova       [cq+0*32+r5], m0
3537*c0909341SAndroid Build Coastguard Worker    mova       [cq+1*32+r5], m1
3538*c0909341SAndroid Build Coastguard Worker    mova       [cq+2*32+r5], m2
3539*c0909341SAndroid Build Coastguard Worker    mova       [cq+3*32+r5], m3
3540*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
3541*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
3542*c0909341SAndroid Build Coastguard Worker.end:
3543*c0909341SAndroid Build Coastguard Worker
3544*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; .main -- first-pass kernel of idct_16x8, called through t0.
; Reads the 16 coefficient rows at [cq+N*32+r5] (r5 selects the column half):
; odd rows go through rect2_mul and idct_16x4's main_oddhalf, even rows through
; rect2_mul and idct_8x4's main_pass1, followed by both .round helpers.
; All helpers are defined outside this excerpt.
; On x86-64 the 16 dword result registers are packed into eight word registers
; (m0, m2, ..., m14) before returning.
3545*c0909341SAndroid Build Coastguard Worker.main:
3546*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; NOTE(review): presumably the constant registers the helpers below expect.
3547*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
3548*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
3549*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
3550*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
3551*c0909341SAndroid Build Coastguard Worker%endif
; Odd rows 1, 3, ..., 15.
3552*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 1*32+r5]
3553*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 3*32+r5]
3554*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 5*32+r5]
3555*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 7*32+r5]
3556*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 9*32+r5]
3557*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+11*32+r5]
3558*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+13*32+r5]
3559*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*32+r5]
3560*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
3561*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
3562*c0909341SAndroid Build Coastguard Worker
; Even rows 0, 2, ..., 14.
3563*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*32+r5]
3564*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 2*32+r5]
3565*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*32+r5]
3566*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 6*32+r5]
3567*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*32+r5]
3568*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+10*32+r5]
3569*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*32+r5]
3570*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+14*32+r5]
3571*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
3572*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
3573*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
3574*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).round
3575*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; Pack adjacent dword register pairs into saturated words.
3576*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3577*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3578*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
3579*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
3580*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
3581*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
3582*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
3583*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
3584*c0909341SAndroid Build Coastguard Worker%endif
3585*c0909341SAndroid Build Coastguard Worker    ret
3586*c0909341SAndroid Build Coastguard Worker
; .pass2 -- second pass of idct_16x8.
; Runs r4d = 2 iterations. Each one feeds eight packed vectors (m0-m3 from
; byte offset 0, m4-m7 from byte offset 16 of four 32-byte rows at cq) to the
; 8bpc SSSE3 8x8 idct kernel, then calls round2_and_write_8x8, clears the
; consumed coefficients and advances dst by 16 bytes and cq by four rows.
; .pass2_main is a secondary entry label placed after the r4d setup
; (NOTE(review): presumably for callers that set r4d themselves -- no such
; caller is visible in this excerpt).
3587*c0909341SAndroid Build Coastguard Worker.pass2:
3588*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; Reload strideq from stack scratch slot 12.
3589*c0909341SAndroid Build Coastguard Worker    mov             strideq, [rsp+gprsize+12*16]
3590*c0909341SAndroid Build Coastguard Worker%endif
3591*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 2
3592*c0909341SAndroid Build Coastguard Worker.pass2_main:
3593*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; m9 is zeroed once here and reused as mzero in the loop below.
; NOTE(review): m8 and m10 are presumably consumed by round2_and_write_8x8.
3594*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
3595*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
3596*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
3597*c0909341SAndroid Build Coastguard Worker%endif
3598*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
; The first iteration skips the m0-m3 loads: those registers are expected to
; be live on entry, so only m4-m7 are fetched from memory.
3599*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass2_entry
3600*c0909341SAndroid Build Coastguard Worker.loop_pass2:
3601*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*32+ 0]
3602*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*32+ 0]
3603*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*32+ 0]
3604*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*32+ 0]
3605*c0909341SAndroid Build Coastguard Worker.loop_pass2_entry:
3606*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+0*32+16]
3607*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+1*32+16]
3608*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+2*32+16]
3609*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+3*32+16]
3610*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; NOTE(review): r5 presumably has to point at itx8_start for the 8bpc kernel's
; constant addressing -- confirm against the 8bpc source.
3611*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
3612*c0909341SAndroid Build Coastguard Worker%endif
3613*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
3614*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
; Zero register for clearing coefficients: m9 on x86-64 (zeroed before the
; loop), a freshly cleared m7 on x86-32.
3615*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3616*c0909341SAndroid Build Coastguard Worker%define mzero m9
3617*c0909341SAndroid Build Coastguard Worker%else
3618*c0909341SAndroid Build Coastguard Worker%define mzero m7
3619*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3620*c0909341SAndroid Build Coastguard Worker%endif
; Clear the 8*16 bytes at cq, i.e. the four 32-byte rows just consumed.
3621*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
3622*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
3623*c0909341SAndroid Build Coastguard Worker    add                  cq, 4*32
3624*c0909341SAndroid Build Coastguard Worker    dec                 r4d
3625*c0909341SAndroid Build Coastguard Worker    jg .loop_pass2
3626*c0909341SAndroid Build Coastguard Worker    RET
3627*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x8 entry points whose first transform type is adst.
; adst/identity passes an eob_offset of 6, the others use the default of 0.
3628*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, dct
3629*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, adst
3630*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, flipadst
3631*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, identity, 6
3632*c0909341SAndroid Build Coastguard Worker
; iadst_16x8_internal_16bpc
;
; Entry code plus two local routines:
;   .main  - transforms one batch of coefficients read from cq (pass 1)
;   .pass2 - second pass; ends up in round2_and_write_8x8 and clears cq
;
; The entry code itself only records the address of .main in t0 and jumps
; into idct_16x8_internal_16bpc.loop_main, which is defined outside this
; excerpt. NOTE(review): presumably that shared loop invokes t0 once per
; batch and then dispatches to tx2 - confirm against idct_16x8.
3633*c0909341SAndroid Build Coastguard Workercglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3634*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 only: park r1 in a stack slot; .pass2 reloads strideq from it
3635*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize+12*16], r1
3636*c0909341SAndroid Build Coastguard Worker%endif
3637*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
3638*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_16bpc).loop_main
3639*c0909341SAndroid Build Coastguard Worker
; .main
; Reads 16 coefficient vectors from cq (32-byte pitch, byte offset r5 supplied
; by the caller) in two groups of eight. Each group goes through
; idct_8x4 .rect2_mul; the first group then feeds iadst_16x4 .main_part1 and
; the second feeds .main_part2, followed by iadst_16x4 .round.
; On x86-64 the sixteen dword result registers m0-m15 are finally narrowed
; pairwise to words with signed saturation, leaving packed results in
; m0, m2, ..., m14. Where the x86-32 build leaves its results is decided by
; the called helpers and is not visible here.
3640*c0909341SAndroid Build Coastguard Worker.main:
3641*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; constants kept in high registers for the helpers called below
3642*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
3643*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
3644*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
3645*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
3646*c0909341SAndroid Build Coastguard Worker%endif
    ; first group: input rows 2, 13, 6, 9, 10, 5, 14, 1
3647*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 2*32+r5]
3648*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+13*32+r5]
3649*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 6*32+r5]
3650*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 9*32+r5]
3651*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+10*32+r5]
3652*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 5*32+r5]
3653*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+14*32+r5]
3654*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 1*32+r5]
3655*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
3656*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_16bpc).main_part1
    ; second group: input rows 0, 15, 4, 11, 8, 7, 12, 3
3657*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*32+r5]
3658*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+15*32+r5]
3659*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*32+r5]
3660*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+11*32+r5]
3661*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*32+r5]
3662*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 7*32+r5]
3663*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*32+r5]
3664*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 3*32+r5]
3665*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; r3 is moved up by 8*16 bytes for the duration of this rect2_mul call and
    ; restored right after. NOTE(review): presumably this keeps the helper's
    ; scratch use at [r3] away from data main_part1 left there - verify.
3666*c0909341SAndroid Build Coastguard Worker    add                  r3, 8*16
3667*c0909341SAndroid Build Coastguard Worker%endif
3668*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
3669*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3670*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16
3671*c0909341SAndroid Build Coastguard Worker%endif
3672*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_16bpc).main_part2
3673*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_16bpc).round
3674*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; dword -> word with signed saturation, two source registers per result
3675*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3676*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3677*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
3678*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
3679*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
3680*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
3681*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
3682*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
3683*c0909341SAndroid Build Coastguard Worker%endif
3684*c0909341SAndroid Build Coastguard Worker    ret
3685*c0909341SAndroid Build Coastguard Worker
; .pass2
; Runs two iterations (r4d counts 2 -> 0). Each iteration takes eight 16-byte
; vectors (m0-m3 from cq+{0..3}*32, m4-m7 from cq+{0..3}*32+16), runs the
; 8bpc SSSE3 iadst 8x8 routines on them, hands the result to
; iadst_8x8_internal_16bpc.round2_and_write_8x8, zeroes the 128 bytes of cq
; just consumed, and then advances dstq by 16 bytes and cq by 4*32 bytes.
; The first iteration enters at .loop_pass2_entry, i.e. without loading
; m0-m3. NOTE(review): that only works if m0-m3 are already live on entry,
; presumably left there by pass 1 - confirm against the caller.
3686*c0909341SAndroid Build Coastguard Worker.pass2:
3687*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; reload the value the entry code saved from r1
3688*c0909341SAndroid Build Coastguard Worker    mov             strideq, [rsp+gprsize+12*16]
3689*c0909341SAndroid Build Coastguard Worker%endif
3690*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 2
3691*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; constants for the write-out helper; m9 also serves as mzero below
3692*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
3693*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
3694*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
3695*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pw_m2048)]
3696*c0909341SAndroid Build Coastguard Worker%endif
    ; r3 = 3*stride (r3 no longer addresses the pass-1 scratch area)
3697*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3698*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass2_entry
3699*c0909341SAndroid Build Coastguard Worker.loop_pass2:
3700*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*32+ 0]
3701*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*32+ 0]
3702*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*32+ 0]
3703*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*32+ 0]
3704*c0909341SAndroid Build Coastguard Worker.loop_pass2_entry:
3705*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+0*32+16]
3706*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+1*32+16]
3707*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+2*32+16]
3708*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+3*32+16]
3709*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; NOTE(review): presumably the base address the 8bpc SSSE3 routines expect
    ; in r5 for their own constant addressing - confirm in the 8bpc source
3710*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
3711*c0909341SAndroid Build Coastguard Worker%endif
3712*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
3713*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
3714*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
    ; pick a zero register: m9 was cleared once above on x86-64; x86-32 has to
    ; clear m7 on every iteration because m7 is reloaded at the loop top
3715*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3716*c0909341SAndroid Build Coastguard Worker%define mzero m9
3717*c0909341SAndroid Build Coastguard Worker%else
3718*c0909341SAndroid Build Coastguard Worker%define mzero m7
3719*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3720*c0909341SAndroid Build Coastguard Worker%endif
    ; clear cq+0 .. cq+127, the same bytes this iteration's loads cover
3721*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
3722*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
3723*c0909341SAndroid Build Coastguard Worker    add                  cq, 4*32
3724*c0909341SAndroid Build Coastguard Worker    dec                 r4d
3725*c0909341SAndroid Build Coastguard Worker    jg .loop_pass2
3726*c0909341SAndroid Build Coastguard Worker    RET
3727*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X8_FN with flipadst as the first type, once for each
; second type (dct, adst, flipadst, identity). The macro is defined outside
; this excerpt. NOTE(review): the optional third argument (6 on the identity
; line) is interpreted by that macro - check its definition for the meaning.
3728*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, dct
3729*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, adst
3730*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, flipadst
3731*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, identity, 6
3732*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x8_internal_16bpc
;
; Flipped variant built on top of iadst_16x8_internal_16bpc:
;   .main  - calls the iadst .main, then reverses the order of its eight
;            packed result vectors and swaps the 64-bit halves of each
;   .pass2 - re-points dst at the last of 8 rows, negates the stride, and
;            reuses the iadst .pass2 so rows are written bottom-up
; As in the iadst entry, pass 1 is driven by idct_16x8_internal_16bpc
; .loop_main (not shown here) through the routine address in t0.
3733*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3734*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 only: park r1 in the stack slot that .pass2 reads strideq from
3735*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize+12*16], r1
3736*c0909341SAndroid Build Coastguard Worker%endif
3737*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
3738*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_16bpc).loop_main
3739*c0909341SAndroid Build Coastguard Worker.main:
3740*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_16bpc).main
    ; Both branches perform the same exchange on eight packed vectors:
    ; first <-> last, second <-> second-to-last, and so on, with every vector
    ; passed through pshufd q1032 on the way. NOTE(review): q1032 is
    ; presumably the x86inc immediate that swaps the two 64-bit halves.
    ; The odd registers m1/m3/m5/m7 are only used as temporaries.
3741*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: the eight vectors are m0, m2, m4, m6, m8, m10, m12, m14
3742*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q1032
3743*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m2, q1032
3744*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m4, q1032
3745*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m6, q1032
3746*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m14, q1032
3747*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m12, q1032
3748*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m10, q1032
3749*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m8, q1032
3750*c0909341SAndroid Build Coastguard Worker    mova                m14, m1
3751*c0909341SAndroid Build Coastguard Worker    mova                m12, m3
3752*c0909341SAndroid Build Coastguard Worker    mova                m10, m5
3753*c0909341SAndroid Build Coastguard Worker    mova                 m8, m7
3754*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: four vectors are in m0/m2/m4/m6, the other four in the stack
    ; slots [r3+8*16] .. [r3+11*16]; registers and slots trade contents
3755*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q1032
3756*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m2, q1032
3757*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m4, q1032
3758*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m6, q1032
3759*c0909341SAndroid Build Coastguard Worker    pshufd               m0, [r3+11*16], q1032
3760*c0909341SAndroid Build Coastguard Worker    pshufd               m2, [r3+10*16], q1032
3761*c0909341SAndroid Build Coastguard Worker    pshufd               m4, [r3+9*16], q1032
3762*c0909341SAndroid Build Coastguard Worker    pshufd               m6, [r3+8*16], q1032
3763*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m7
3764*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m5
3765*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m3
3766*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m1
3767*c0909341SAndroid Build Coastguard Worker%endif
3768*c0909341SAndroid Build Coastguard Worker    ret
3769*c0909341SAndroid Build Coastguard Worker
3770*c0909341SAndroid Build Coastguard Worker.pass2:
3771*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3772*c0909341SAndroid Build Coastguard Worker    mov             strideq, [rsp+gprsize+12*16]
3773*c0909341SAndroid Build Coastguard Worker%endif
    ; dst += 8*stride; stride = -stride; dst += stride
    ; net effect: dst points 7 rows below its original position and the
    ; stride is negated, so subsequent row steps move upward
3774*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
3775*c0909341SAndroid Build Coastguard Worker    neg             strideq
3776*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3777*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; iadst .pass2 reloads strideq from this slot, so store the negated value
3778*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize+12*16], strideq
3779*c0909341SAndroid Build Coastguard Worker%endif
3780*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x8_internal_16bpc).pass2
3781*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X8_FN with identity as the first type, once for each
; second type (dct, adst, flipadst, identity). The macro is defined outside
; this excerpt. NOTE(review): the meaning of the third argument (-54 on three
; of the lines, omitted on the last) is set by that macro - check there.
3782*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, dct, -54
3783*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, adst, -54
3784*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, flipadst, -54
3785*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, identity
3786*c0909341SAndroid Build Coastguard Worker
; iidentity_16x8_internal_16bpc
;
; Entry code plus two local routines:
;   .main  - scales 16 coefficient vectors from cq (32-byte pitch, byte
;            offset r5 supplied by the caller) and packs them to words
;   .pass2 - two-iteration write-out loop that also clears cq
; As with the other 16x8 transforms in this file, the entry code stores the
; address of .main in t0 and jumps to idct_16x8_internal_16bpc.loop_main,
; which is defined outside this excerpt.
3787*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3788*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 only: park r1 in the stack slot that .pass2 reads strideq from
3789*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize+12*16], r1
3790*c0909341SAndroid Build Coastguard Worker%endif
3791*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
3792*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_16bpc).loop_main
3793*c0909341SAndroid Build Coastguard Worker.main:
3794*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: all 16 rows are processed at once. Per row the sequence is
    ;   x = (x * pd_2896  + pd_2048) >> 12
    ;   x = (x * pd_11586 + pd_6144) >> 13      (arithmetic shifts)
    ; m15 has to serve both as the constant register and as row 15, so row 15
    ; is parked in memory at [r3] whenever m15 holds a constant.
3795*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_2896)]
3796*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m15, [cq+ 0*32+r5]
3797*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m15, [cq+ 1*32+r5]
3798*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m15, [cq+ 2*32+r5]
3799*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m15, [cq+ 3*32+r5]
3800*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m15, [cq+ 4*32+r5]
3801*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m15, [cq+ 5*32+r5]
3802*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m15, [cq+ 6*32+r5]
3803*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m15, [cq+ 7*32+r5]
3804*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m15, [cq+ 8*32+r5]
3805*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m15, [cq+ 9*32+r5]
3806*c0909341SAndroid Build Coastguard Worker    pmulld              m10, m15, [cq+10*32+r5]
3807*c0909341SAndroid Build Coastguard Worker    pmulld              m11, m15, [cq+11*32+r5]
3808*c0909341SAndroid Build Coastguard Worker    pmulld              m12, m15, [cq+12*32+r5]
3809*c0909341SAndroid Build Coastguard Worker    pmulld              m13, m15, [cq+13*32+r5]
3810*c0909341SAndroid Build Coastguard Worker    pmulld              m14, m15, [cq+14*32+r5]
    ; row 15 last: this overwrites the pd_2896 constant in m15
3811*c0909341SAndroid Build Coastguard Worker    pmulld              m15, [cq+15*32+r5]
3812*c0909341SAndroid Build Coastguard Worker    mova               [r3], m15
3813*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_2048)]
3814*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3815*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14
    ; m15 = pd_2048 + spilled row 15, i.e. row 15 is back in m15
3816*c0909341SAndroid Build Coastguard Worker    paddd               m15, [r3]
3817*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
3818*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14, m15
    ; second scaling step, same spill/reload pattern for row 15
3819*c0909341SAndroid Build Coastguard Worker    mova               [r3], m15
3820*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_11586)]
3821*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3822*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14
3823*c0909341SAndroid Build Coastguard Worker    pmulld              m15, [r3]
3824*c0909341SAndroid Build Coastguard Worker    mova               [r3], m15
3825*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_6144)]
3826*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3827*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14
3828*c0909341SAndroid Build Coastguard Worker    paddd               m15, [r3]
3829*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
3830*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14, m15
    ; dword -> word with signed saturation; results in m0, m2, ..., m14
3831*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3832*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3833*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
3834*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
3835*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
3836*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
3837*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
3838*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
3839*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: only m0-m7 exist, so the rows are handled in two groups of
    ; eight. The first scaling step is delegated to idct_8x4 .rect2_mul
    ; (NOTE(review): presumably the same pd_2896 multiply / round / shift the
    ; x86-64 branch does inline - confirm). m7 doubles as constant register,
    ; so row 7 of each group is parked at [r3] while m7 holds a constant.
    ; Rows 0-7 end up packed in stack slots [r3+8*16] .. [r3+11*16];
    ; rows 8-15 are returned packed in m0, m2, m4, m6.
3840*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*32+r5]
3841*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 1*32+r5]
3842*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 2*32+r5]
3843*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 3*32+r5]
3844*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 4*32+r5]
3845*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 5*32+r5]
3846*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+ 6*32+r5]
3847*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 7*32+r5]
3848*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
3849*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
3850*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_11586)]
3851*c0909341SAndroid Build Coastguard Worker    REPX      {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
3852*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [r3]
3853*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
3854*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_6144)]
3855*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
3856*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3]
3857*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
3858*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3859*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3860*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
3861*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
    ; save the packed rows 0-7 before the registers are reused
3862*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m0
3863*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 9*16], m2
3864*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
3865*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m6
    ; second group: rows 8-15, identical processing
3866*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 8*32+r5]
3867*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 9*32+r5]
3868*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+10*32+r5]
3869*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+11*32+r5]
3870*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+12*32+r5]
3871*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+13*32+r5]
3872*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+14*32+r5]
3873*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*32+r5]
3874*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
3875*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
3876*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_11586)]
3877*c0909341SAndroid Build Coastguard Worker    REPX      {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
3878*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [r3]
3879*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
3880*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_6144)]
3881*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
3882*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3]
3883*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
3884*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
3885*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
3886*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
3887*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
3888*c0909341SAndroid Build Coastguard Worker%endif
3889*c0909341SAndroid Build Coastguard Worker    ret
; .pass2
; Two iterations (r4d counts 2 -> 0). Each iteration hands eight 16-byte
; vectors (m0-m3 from cq+{0..3}*32, m4-m7 from cq+{0..3}*32+16) to an
; idct_8x8_internal_16bpc write-out helper, clears the 128 bytes of cq just
; consumed, and advances dstq by 16 bytes and cq by 4*32 bytes.
; The first iteration enters at .loop_pass2_entry without loading m0-m3.
; NOTE(review): m0-m3 are presumably still live from pass 1 - confirm.
3890*c0909341SAndroid Build Coastguard Worker.pass2:
3891*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; reload the value the entry code saved from r1
3892*c0909341SAndroid Build Coastguard Worker    mov             strideq, [rsp+gprsize+12*16]
3893*c0909341SAndroid Build Coastguard Worker%endif
3894*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 2
3895*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; constants for the write-out helper; m9 also serves as mzero below
3896*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_4096)]
3897*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
3898*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
3899*c0909341SAndroid Build Coastguard Worker%endif
    ; r3 = 3*stride (r3 no longer addresses the pass-1 scratch area)
3900*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3901*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass2_entry
3902*c0909341SAndroid Build Coastguard Worker.loop_pass2:
3903*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*32+ 0]
3904*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*32+ 0]
3905*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*32+ 0]
3906*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*32+ 0]
3907*c0909341SAndroid Build Coastguard Worker.loop_pass2_entry:
3908*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+0*32+16]
3909*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+1*32+16]
3910*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+2*32+16]
3911*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+3*32+16]
3912*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3913*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
3914*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32 has no spare register for the pw_4096 constant: the eighth data
    ; vector is passed in memory at [rsp+gprsize] and m7 carries the constant
    ; into the round4_and_write_8x8 entry point instead
3915*c0909341SAndroid Build Coastguard Worker    mova      [rsp+gprsize], m7
3916*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_4096)]
3917*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
3918*c0909341SAndroid Build Coastguard Worker%endif
    ; pick a zero register: m9 was cleared once above on x86-64; x86-32 clears
    ; m7 on every iteration because m7 is overwritten earlier in the loop
3919*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3920*c0909341SAndroid Build Coastguard Worker%define mzero m9
3921*c0909341SAndroid Build Coastguard Worker%else
3922*c0909341SAndroid Build Coastguard Worker%define mzero m7
3923*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3924*c0909341SAndroid Build Coastguard Worker%endif
    ; clear cq+0 .. cq+127, the same bytes this iteration's loads cover
3925*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
3926*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
3927*c0909341SAndroid Build Coastguard Worker    add                  cq, 4*32
3928*c0909341SAndroid Build Coastguard Worker    dec                 r4d
3929*c0909341SAndroid Build Coastguard Worker    jg .loop_pass2
3930*c0909341SAndroid Build Coastguard Worker    RET
3931*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X16_FN type1, type2 [, eob_tbl_suffix = 2d]
;
; Wrapper around INV_TXFM_FN (defined outside this excerpt) for the 16x16
; block size. It forwards both type names, the table name tbl_16x16_<suffix>,
; the size token 16x16, and two architecture-dependent values:
;   x86-64: 16 and 0-(16+WIN64)*16
;   x86-32:  8 and 0-17*16
; NOTE(review): these look like the vector-register count and a stack-size
; request for INV_TXFM_FN - confirm against that macro's parameter list.
;
; For the dct_dct combination only, the macro additionally emits a short
; sequence that computes r5d = ([cq]*181 + 640) >> 10, overwrites [cq] with
; eobd, sets r3d = 16, moves rsp up by (5+ARCH_X86_64*3+WIN64)*16 bytes and
; jumps to the .dconly2 label of the 16x4 dct_dct function.
; NOTE(review): presumably the DC-only shortcut reached when eob is 0 (the
; inline "; 0" comment suggests eobd is zero here) - the branch that leads
; into this code lives in INV_TXFM_FN; verify there.
3932*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
3933*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3934*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
3935*c0909341SAndroid Build Coastguard Worker%else
3936*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
3937*c0909341SAndroid Build Coastguard Worker%endif
3938*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
3939*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
3940*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
3941*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16
3942*c0909341SAndroid Build Coastguard Worker    add                 r5d, 640
3943*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 10
    ; NOTE(review): the rsp adjustment presumably reconciles this function's
    ; larger stack allocation with the frame the 16x4 code expects - confirm
3944*c0909341SAndroid Build Coastguard Worker    add                 rsp, (5+ARCH_X86_64*3+WIN64)*16
3945*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
3946*c0909341SAndroid Build Coastguard Worker%endif
3947*c0909341SAndroid Build Coastguard Worker%endmacro
3948*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X16_FN with dct as the first type, once for each
; second type. The eob table suffix defaults to 2d; the dct/identity pairing
; passes v, which makes the macro reference tbl_16x16_v instead.
; Only the dct, dct line triggers the macro's extra dct_dct code path.
3949*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, dct
3950*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, identity, v
3951*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, adst
3952*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, flipadst
3953*c0909341SAndroid Build Coastguard Worker
3954*c0909341SAndroid Build Coastguard Workercglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3955*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3956*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP       6, 7
3957*c0909341SAndroid Build Coastguard Worker%if WIN64
3958*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize], r7
3959*c0909341SAndroid Build Coastguard Worker%endif
3960*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
3961*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP       1, 6
3962*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
3963*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
3964*c0909341SAndroid Build Coastguard Worker%endif
3965*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
3966*c0909341SAndroid Build Coastguard Worker.pass1_full:
3967*c0909341SAndroid Build Coastguard Worker%undef cmp
3968*c0909341SAndroid Build Coastguard Worker    mov                 t1d, 4
3969*c0909341SAndroid Build Coastguard Worker.zero_loop:
3970*c0909341SAndroid Build Coastguard Worker    dec                 t1d
3971*c0909341SAndroid Build Coastguard Worker    cmp                eobb, byte [r5+t1]
3972*c0909341SAndroid Build Coastguard Worker    jb .zero_loop
3973*c0909341SAndroid Build Coastguard Worker    mov                 r5d, t1d
3974*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 4
3975*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3976*c0909341SAndroid Build Coastguard Worker    ; restore pic-ptr
3977*c0909341SAndroid Build Coastguard Worker    mov                  r6, [rsp+16*16+2*gprsize]
3978*c0909341SAndroid Build Coastguard Worker%endif
3979*c0909341SAndroid Build Coastguard Worker    ; setup stack pointer
3980*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
3981*c0909341SAndroid Build Coastguard Worker.loop_pass1:
3982*c0909341SAndroid Build Coastguard Worker    call                 t0
3983*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3984*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
3985*c0909341SAndroid Build Coastguard Worker    mova       [cq+4*64+r5], m8
3986*c0909341SAndroid Build Coastguard Worker    mova       [cq+5*64+r5], m9
3987*c0909341SAndroid Build Coastguard Worker    mova       [cq+6*64+r5], m10
3988*c0909341SAndroid Build Coastguard Worker    mova       [cq+7*64+r5], m11
3989*c0909341SAndroid Build Coastguard Worker%else
3990*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
3991*c0909341SAndroid Build Coastguard Worker    mova       [cq+4*64+r5], m0
3992*c0909341SAndroid Build Coastguard Worker    mova       [cq+5*64+r5], m1
3993*c0909341SAndroid Build Coastguard Worker    mova       [cq+6*64+r5], m2
3994*c0909341SAndroid Build Coastguard Worker    mova       [cq+7*64+r5], m3
3995*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+ 8*16]
3996*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+ 9*16]
3997*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+10*16]
3998*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+11*16]
3999*c0909341SAndroid Build Coastguard Worker%endif
4000*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
4001*c0909341SAndroid Build Coastguard Worker    mova       [cq+0*64+r5], m0
4002*c0909341SAndroid Build Coastguard Worker    mova       [cq+1*64+r5], m1
4003*c0909341SAndroid Build Coastguard Worker    mova       [cq+2*64+r5], m2
4004*c0909341SAndroid Build Coastguard Worker    mova       [cq+3*64+r5], m3
4005*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
4006*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
4007*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 16
4008*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
4009*c0909341SAndroid Build Coastguard Worker
4010*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4011*c0909341SAndroid Build Coastguard Worker    ; restore r1 (saved at entry; it served as t0 during pass 1)
4012*c0909341SAndroid Build Coastguard Worker    mov                  r1, [rsp+16*16+1*gprsize]
4013*c0909341SAndroid Build Coastguard Worker%endif
4014*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
4015*c0909341SAndroid Build Coastguard Worker.main:
4016*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4017*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
4018*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
4019*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
4020*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
4021*c0909341SAndroid Build Coastguard Worker%endif
4022*c0909341SAndroid Build Coastguard Worker
4023*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 1*64+r5]
4024*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 3*64+r5]
4025*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 5*64+r5]
4026*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 7*64+r5]
4027*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 9*64+r5]
4028*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+11*64+r5]
4029*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+13*64+r5]
4030*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*64+r5]
4031*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
4032*c0909341SAndroid Build Coastguard Worker
4033*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*64+r5]
4034*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 2*64+r5]
4035*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*64+r5]
4036*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 6*64+r5]
4037*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*64+r5]
4038*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+10*64+r5]
4039*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*64+r5]
4040*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+14*64+r5]
4041*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
4042*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
4043*c0909341SAndroid Build Coastguard Worker    call .round
4044*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4045*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
4046*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
4047*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4048*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
4049*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
4050*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
4051*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
4052*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
4053*c0909341SAndroid Build Coastguard Worker%endif
4054*c0909341SAndroid Build Coastguard Worker    ret
; .round: clamp and round m0-7, then do the last +/- butterfly against eight
; values held in the scratch area at r3, and shift every result right by 2.
;   x86-64: operands in [r3+0..7*16]; out0-15 are returned in m0-15 (dwords).
;   x86-32: operands in [r3+4..11*16]; results are packed to words in pairs:
;           m0 = out8-9, m2 = out10-11, m4 = out12-13, m6 = out14-15,
;           [r3+8*16] = out0-1, [r3+9*16] = out2-3, [r3+10*16] = out4-5,
;           [r3+11*16] = out6-7.
4055*c0909341SAndroid Build Coastguard Worker.round:
4056*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; clamp m0-7 to [m12, m13]; both bounds are set up outside this routine
; (presumably clip_18b_min/clip_18b_max, as loaded in the x86-32 path below)
4057*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4058*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
4059*c0909341SAndroid Build Coastguard Worker    psrld                m8, m11, 10        ; 2 (assumes m11 = pd_2048, set by the caller)
; the rounding bias is added to m0-7 only; it reaches all 16 outputs because
; every output below is m(i) plus or minus a scratch value
4060*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
; scratch slots 1-7 go to registers; slot 0 is used straight from memory
4061*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+1*16]
4062*c0909341SAndroid Build Coastguard Worker    mova                 m9, [r3+2*16]
4063*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+3*16]
4064*c0909341SAndroid Build Coastguard Worker    mova                m11, [r3+4*16]
4065*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+5*16]
4066*c0909341SAndroid Build Coastguard Worker    mova                m13, [r3+6*16]
4067*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+7*16]
; each difference is written to a high register before the sum overwrites m(i)
4068*c0909341SAndroid Build Coastguard Worker    psubd               m15, m0, m14       ; out15
4069*c0909341SAndroid Build Coastguard Worker    paddd                m0, m14           ; out0
4070*c0909341SAndroid Build Coastguard Worker    psubd               m14, m1, m13       ; out14
4071*c0909341SAndroid Build Coastguard Worker    paddd                m1, m13           ; out1
4072*c0909341SAndroid Build Coastguard Worker    psubd               m13, m2, m12       ; out13
4073*c0909341SAndroid Build Coastguard Worker    paddd                m2, m12           ; out2
4074*c0909341SAndroid Build Coastguard Worker    psubd               m12, m3, m11       ; out12
4075*c0909341SAndroid Build Coastguard Worker    paddd                m3, m11           ; out3
4076*c0909341SAndroid Build Coastguard Worker    psubd               m11, m4, m10       ; out11
4077*c0909341SAndroid Build Coastguard Worker    paddd                m4, m10           ; out4
4078*c0909341SAndroid Build Coastguard Worker    psubd               m10, m5, m9        ; out10
4079*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9            ; out5
4080*c0909341SAndroid Build Coastguard Worker    psubd                m9, m6, m8        ; out9
4081*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8            ; out6
4082*c0909341SAndroid Build Coastguard Worker    psubd                m8, m7, [r3+0*16] ; out8
4083*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3+0*16]     ; out7
4084*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
4085*c0909341SAndroid Build Coastguard Worker                             m8,  m9,  m10, m11, m12, m13, m14, m15
4086*c0909341SAndroid Build Coastguard Worker    ; and out0-15 is now in m0-15
4087*c0909341SAndroid Build Coastguard Worker%else
; only m0-7 are used here, so [r3+0*16] serves as a one-register spill slot
; while each constant (min, max, rounding bias) is applied to all eight inputs
4088*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m0
4089*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(clip_18b_min)]
4090*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
4091*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, [r3+ 0*16]
4092*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m7
4093*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(clip_18b_max)]
4094*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
4095*c0909341SAndroid Build Coastguard Worker    pminsd               m7, [r3+ 0*16]
4096*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m0
4097*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2)]
4098*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
4099*c0909341SAndroid Build Coastguard Worker    paddd                m0, [r3+ 0*16]
; park the biased m0-2 in slots 0-2 to free registers for the butterflies
4100*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m0
4101*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 1*16], m1
4102*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 2*16], m2
; m7 +/- slot 11 and m6 +/- slot 10
4103*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+11*16]
4104*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+10*16]
4105*c0909341SAndroid Build Coastguard Worker    psubd                m0, m7, m1
4106*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1
4107*c0909341SAndroid Build Coastguard Worker    psubd                m1, m6, m2
4108*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2
4109*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0, m1, m6, m7
4110*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1     ; out8-9
4111*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7     ; out6-7
4112*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m6
; m5 +/- slot 9 and m4 +/- slot 8
4113*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+9*16]
4114*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+8*16]
4115*c0909341SAndroid Build Coastguard Worker    psubd                m2, m5, m1
4116*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
4117*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4, m7
4118*c0909341SAndroid Build Coastguard Worker    paddd                m4, m7
4119*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m2, m1, m4, m5
4120*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1     ; out10-11
4121*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5     ; out4-5
; m3 +/- slot 7 and the parked m2 (slot 2) +/- slot 6
4122*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+2*16]
4123*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
4124*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+7*16]
4125*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+6*16]
4126*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m6
4127*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6
4128*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m7
4129*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7
4130*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m4, m6, m1, m3
4131*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6     ; out12-13
4132*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3     ; out2-3
; the parked m1 (slot 1) +/- slot 5 and the parked m0 (slot 0) +/- slot 4
4133*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+1*16]
4134*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m1
4135*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+0*16]
4136*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+5*16]
4137*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+4*16]
4138*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m5
4139*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
4140*c0909341SAndroid Build Coastguard Worker    psubd                m5, m1, m7
4141*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7
4142*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m6, m5, m1, m3
4143*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m5     ; out14-15
4144*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3     ; out0-1
4145*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m1
4146*c0909341SAndroid Build Coastguard Worker%endif
4147*c0909341SAndroid Build Coastguard Worker    ret
4148*c0909341SAndroid Build Coastguard Worker
; .pass2: second pass of the 16x16 inverse DCT. It runs two iterations; each
; one reads 256 bytes at cq, transforms them with the 8bpc SSSE3 idct helpers,
; writes two 8x8 tiles (rows 0-7 and rows 8-15) through the idct_8x8 16bpc
; round/write helpers, then zeroes the 256 bytes it read.
; The destination base lives in r7 on x86-64 and in the stack slot
; [rsp+2*gprsize+16*16] on x86-32.
4149*c0909341SAndroid Build Coastguard Worker.pass2:
4150*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; m8-m10 are set here and not written again in this routine; presumably they
; are inputs to the round/write helpers (defined elsewhere). m9 = 0 is also
; used as mzero below.
4151*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
4152*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
4153*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
4154*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
4155*c0909341SAndroid Build Coastguard Worker%else
4156*c0909341SAndroid Build Coastguard Worker    mov [rsp+2*gprsize+16*16], dstq
4157*c0909341SAndroid Build Coastguard Worker%endif
4158*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
4159*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 2             ; loop counter: two iterations
4160*c0909341SAndroid Build Coastguard Worker.loop_pass2:
4161*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; r5 is re-pointed at itx8_start on every iteration (presumably the base the
; 8bpc routines expect for o() addressing; TODO confirm against their definition)
4162*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
4163*c0909341SAndroid Build Coastguard Worker%endif
; first eight inputs: 64-byte rows 0 and 2 of this 256-byte group, interleaved
4164*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*64+ 0]
4165*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+2*64+ 0]
4166*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+0*64+16]
4167*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+2*64+16]
4168*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+0*64+32]
4169*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+2*64+32]
4170*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+0*64+48]
4171*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+2*64+48]
4172*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
; keep the intermediate m0-6 in stack slots 3-9 for the second helper call
4173*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+3*16], m0
4174*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+4*16], m1
4175*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+5*16], m2
4176*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+6*16], m3
4177*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+7*16], m4
4178*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+8*16], m5
4179*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+9*16], m6
4180*c0909341SAndroid Build Coastguard Worker    ; m7 is already stored in [rsp+gprsize+0*16]
; remaining eight inputs: 64-byte rows 1 and 3, interleaved the same way
4181*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+1*64+ 0]
4182*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+3*64+ 0]
4183*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+1*64+16]
4184*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*64+16]
4185*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+1*64+32]
4186*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+3*64+32]
4187*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+1*64+48]
4188*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+3*64+48]
4189*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
4190*c0909341SAndroid Build Coastguard Worker
4191*c0909341SAndroid Build Coastguard Worker    ; out0-7 is in rsp+gprsize+3-10*mmsize
4192*c0909341SAndroid Build Coastguard Worker    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
4193*c0909341SAndroid Build Coastguard Worker
; out8-15 are still in registers, so the tile starting 8 rows down goes first
4194*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4195*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r7+strideq*8]
4196*c0909341SAndroid Build Coastguard Worker%else
4197*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+2*gprsize+16*16]
4198*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
4199*c0909341SAndroid Build Coastguard Worker%endif
4200*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
; back to the base row, reload out0-7 from the stack and write the upper tile
4201*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4202*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
4203*c0909341SAndroid Build Coastguard Worker%else
4204*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+2*gprsize+16*16]
4205*c0909341SAndroid Build Coastguard Worker%endif
4206*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+ 3*16]
4207*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+ 4*16]
4208*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+ 5*16]
4209*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+ 6*16]
4210*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+ 7*16]
4211*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+ 8*16]
4212*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+ 9*16]
4213*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+10*16]
4214*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
; advance the saved destination base by 16 bytes for the next iteration and
; pick a zero register (m9 is still zero on x86-64; m7 is cleared on x86-32)
4215*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4216*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
4217*c0909341SAndroid Build Coastguard Worker%define mzero m9
4218*c0909341SAndroid Build Coastguard Worker%else
4219*c0909341SAndroid Build Coastguard Worker    add dword [rsp+2*gprsize+16*16], 16
4220*c0909341SAndroid Build Coastguard Worker%define mzero m7
4221*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
4222*c0909341SAndroid Build Coastguard Worker%endif
; zero the 256 bytes just consumed: the first 128 relative to the old cq, the
; last 128 with negative offsets after cq has been advanced
4223*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
4224*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*4
4225*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
4226*c0909341SAndroid Build Coastguard Worker%undef mzero
4227*c0909341SAndroid Build Coastguard Worker    dec                 r4d
4228*c0909341SAndroid Build Coastguard Worker    jg .loop_pass2
; WIN64 only: reload r7 from its stack slot (the matching store is not in
; this routine) because r7 was used as the destination base above
4229*c0909341SAndroid Build Coastguard Worker%if WIN64
4230*c0909341SAndroid Build Coastguard Worker    mov                  r7, [rsp+16*16+gprsize]
4231*c0909341SAndroid Build Coastguard Worker%endif
4232*c0909341SAndroid Build Coastguard Worker    RET
4233*c0909341SAndroid Build Coastguard Worker
; The three invocations below pair adst with dct, adst and flipadst.
; INV_TXFM_16X16_FN is defined outside this view; presumably it emits the
; public 16x16 entry points that dispatch to the *_internal routines.
4234*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, dct
4235*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, adst
4236*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, flipadst
4237*c0909341SAndroid Build Coastguard Worker
; iadst_16x16_internal_16bpc entry stub: saves registers to the stack, puts
; the address of this function's .main in t0 and jumps into the pass-1 driver
; of idct_16x16_internal_16bpc. That driver is outside this view; presumably
; it calls through t0.
4238*c0909341SAndroid Build Coastguard Workercglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
4239*c0909341SAndroid Build Coastguard Worker%if WIN64
; r7 is used as the destination base in .pass2 and reloaded from this slot there
4240*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize], r7
4241*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
4242*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
4243*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
4244*c0909341SAndroid Build Coastguard Worker%endif
4245*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
4246*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_16bpc).pass1_full
4247*c0909341SAndroid Build Coastguard Worker
; .main: pass-1 kernel of the 16-point inverse ADST. It loads sixteen
; coefficient vectors from cq + row*64 + r5 (r5 is an extra byte offset
; supplied by the caller), runs the two halves of the iadst_16x4 kernel on
; them, then rounds via .round.
; On x86-64 the sixteen dword results in m0-15 are packed to words in pairs,
; leaving eight vectors in m0, m2, ..., m14. On x86-32 .round has already
; packed them, so nothing more is done here.
4248*c0909341SAndroid Build Coastguard Worker.main:
4249*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; constants kept in high registers; .round later reads m14 (pd_2896 >> 10)
4250*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
4251*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
4252*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
4253*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
4254*c0909341SAndroid Build Coastguard Worker%endif
; first half: rows 2, 13, 6, 9, 10, 5, 14, 1
4255*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 2*64+r5]
4256*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+13*64+r5]
4257*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 6*64+r5]
4258*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 9*64+r5]
4259*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+10*64+r5]
4260*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 5*64+r5]
4261*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+14*64+r5]
4262*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 1*64+r5]
4263*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_16bpc).main_part1
; second half: rows 0, 15, 4, 11, 8, 7, 12, 3
4264*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*64+r5]
4265*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+15*64+r5]
4266*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*64+r5]
4267*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+11*64+r5]
4268*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*64+r5]
4269*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+ 7*64+r5]
4270*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*64+r5]
4271*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+ 3*64+r5]
4272*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_16bpc).main_part2
4273*c0909341SAndroid Build Coastguard Worker    call .round
4274*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; saturating dword->word pack of adjacent registers; the odd-numbered
; registers are free afterwards
4275*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
4276*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
4277*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4278*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
4279*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
4280*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
4281*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
4282*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
4283*c0909341SAndroid Build Coastguard Worker%endif
4284*c0909341SAndroid Build Coastguard Worker    ret
; .round: sign-flip, bias and shift the sixteen intermediate ADST values.
; Two shift amounts are used: >>2 with a bias of 2, and >>14 with a bias of
; 10240 (= 2048 + (2 << 12); presumably these values still carry an
; unrounded 12-bit multiply - TODO confirm against main_part2).
; Some values are negated: either by pxor with all-ones (~x = -x-1) followed
; by a bias one larger, or by subtracting from the bias directly.
;   x86-64: inputs m0-m12 and [r3+0..3*16]; m14 must hold pd_2896 on entry.
;           Dword results are returned in m0-15.
;   x86-32: inputs m0-m7 and [r3+0..7*16]. Results are packed to words:
;           m0, m2, m4, m6 in registers and four vectors in [r3+8..11*16].
4285*c0909341SAndroid Build Coastguard Worker.round:
4286*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4287*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m8         ; -1
4288*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_10240)]
4289*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10         ; +2
4290*c0909341SAndroid Build Coastguard Worker    psubd               m13, m14, m8    ; +3
; ~x on m1/m3/m5/m7, then add the bias: m1/m3 become 2 - x, m5/m7 become 10239 - x
4291*c0909341SAndroid Build Coastguard Worker    REPX     {pxor  x, m8 }, m1, m3, m5, m7
4292*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m14}, m0, m2
4293*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m1, m3
4294*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m15}, m4, m5, m6, m7
4295*c0909341SAndroid Build Coastguard Worker    paddd               m13, m15, m8    ; +10239
; the inputs in m9-m12 move down one register each (m9->m8 ... m12->m11) with
; alternating sign, which frees m12-15 for the four values read from scratch
4296*c0909341SAndroid Build Coastguard Worker    paddd                m8, m15, m9
4297*c0909341SAndroid Build Coastguard Worker    psubd                m9, m13, m10
4298*c0909341SAndroid Build Coastguard Worker    paddd               m10, m15, m11
4299*c0909341SAndroid Build Coastguard Worker    psubd               m11, m13, m12
4300*c0909341SAndroid Build Coastguard Worker    paddd               m12, m14, [r3+3*16]
4301*c0909341SAndroid Build Coastguard Worker    psubd               m13, m14, [r3+2*16]
4302*c0909341SAndroid Build Coastguard Worker    psubd               m15, m14, [r3+0*16]
4303*c0909341SAndroid Build Coastguard Worker    paddd               m14, [r3+1*16]
4304*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 2 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
4305*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 14}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
4306*c0909341SAndroid Build Coastguard Worker%else
; park m1/m3 in slots 8/9 so those registers can hold the constants
4307*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m1
4308*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m3
4309*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_10240)]
4310*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m1, m1         ; -1, kept in m1 for the whole routine
; group 1: m4-7, >>14, packed into slots 10/11
4311*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
4312*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m4, m5, m6, m7
4313*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 14}, m4, m5, m6, m7
4314*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4315*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
4316*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
4317*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m6
; group 2: slots 4-7, >>2 (bias 2, or 3 for the values inverted with pxor)
4318*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+4*16]
4319*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+5*16]
4320*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+6*16]
4321*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+7*16]
4322*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2)]
4323*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
4324*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m4, m6
4325*c0909341SAndroid Build Coastguard Worker    psubd                m3, m1         ; 2 - (-1) = 3
4326*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m5, m7
4327*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 2 }, m4, m5, m6, m7
4328*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4329*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
; fetch the parked m1/m3 into m5/m7, then reuse slots 8/9 for group 2's result
4330*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+8*16]
4331*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+9*16]
4332*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m4
4333*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m6
; group 3: m0, the parked m1, m2 and the parked m3, >>14, result in m0/m2
4334*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_10240)]
4335*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
4336*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m0, m5, m2, m7
4337*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 14}, m0, m5, m2, m7
4338*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m5
4339*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m7
; group 4: slots 0-3, >>2, result in m4/m6
4340*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+0*16]
4341*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+1*16]
4342*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+2*16]
4343*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
4344*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2)]
4345*c0909341SAndroid Build Coastguard Worker    REPX      {pxor  x, m1}, m5, m7
4346*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m4, m6
4347*c0909341SAndroid Build Coastguard Worker    psubd                m3, m1         ; 3
4348*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m5, m7
4349*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 2 }, m4, m5, m6, m7
4350*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4351*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
4352*c0909341SAndroid Build Coastguard Worker%endif
4353*c0909341SAndroid Build Coastguard Worker    ret
; .pass2: second pass of the 16x16 inverse ADST. It has the same shape as the
; idct pass 2: two iterations, each reading 256 bytes at cq, running the 8bpc
; SSSE3 iadst_16x8 kernel, writing two 8x8 tiles through the iadst_8x8 16bpc
; round/write helpers, and zeroing the bytes it read.
; iflipadst_16x16_internal_16bpc.pass2 also jumps here, after pointing dstq
; at row 15 and negating strideq.
4354*c0909341SAndroid Build Coastguard Worker.pass2:
4355*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; m8, m10 and m11 are set here and not written again in this routine;
; presumably they are inputs to the round/write helpers (defined elsewhere).
; m9 = 0 is also used as mzero below.
4356*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
4357*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pw_m2048)]
4358*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
4359*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
4360*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
4361*c0909341SAndroid Build Coastguard Worker%else
4362*c0909341SAndroid Build Coastguard Worker    mov [rsp+2*gprsize+16*16], dstq
4363*c0909341SAndroid Build Coastguard Worker%endif
4364*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
4365*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 2             ; loop counter: two iterations
4366*c0909341SAndroid Build Coastguard Worker.loop_pass2:
4367*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; r5 is re-pointed at itx8_start on every iteration (presumably the base the
; 8bpc routines expect for o() addressing; TODO confirm against their definition)
4368*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
4369*c0909341SAndroid Build Coastguard Worker%endif
; the first eight inputs are handed to the 8bpc kernel through stack slots 3-10
4370*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*64+32]
4371*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*64+32]
4372*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*64+16]
4373*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*64+16]
4374*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+0*64+ 0]
4375*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+1*64+ 0]
4376*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+2*64+48]
4377*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+3*64+48]
4378*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+3*16], m0
4379*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+4*16], m1
4380*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+5*16], m2
4381*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+6*16], m3
4382*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+7*16], m4
4383*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+8*16], m5
4384*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+9*16], m6
4385*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+10*16], m7
; the remaining eight inputs are passed in m0-7
4386*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+2*64+ 0]
4387*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+3*64+ 0]
4388*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+0*64+16]
4389*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+1*64+16]
4390*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+2*64+32]
4391*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+3*64+32]
4392*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+0*64+48]
4393*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+1*64+48]
4394*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
4395*c0909341SAndroid Build Coastguard Worker    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
4396*c0909341SAndroid Build Coastguard Worker
4397*c0909341SAndroid Build Coastguard Worker    ; out0-7 is in rsp+gprsize+3-10*mmsize
4398*c0909341SAndroid Build Coastguard Worker    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
4399*c0909341SAndroid Build Coastguard Worker
; out8-15 are still in registers, so the tile starting 8 rows down goes first
4400*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4401*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r7+strideq*8]
4402*c0909341SAndroid Build Coastguard Worker%else
4403*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+2*gprsize+16*16]
4404*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
4405*c0909341SAndroid Build Coastguard Worker%endif
4406*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
; back to the base row, reload out0-7 from the stack and write the other tile
4407*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4408*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
4409*c0909341SAndroid Build Coastguard Worker%else
4410*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+2*gprsize+16*16]
4411*c0909341SAndroid Build Coastguard Worker%endif
4412*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+ 3*16]
4413*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+ 4*16]
4414*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+ 5*16]
4415*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+ 6*16]
4416*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+ 7*16]
4417*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+ 8*16]
4418*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+ 9*16]
4419*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+10*16]
4420*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
; advance the saved destination base by 16 bytes for the next iteration and
; pick a zero register (m9 is still zero on x86-64; m7 is cleared on x86-32)
4421*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4422*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
4423*c0909341SAndroid Build Coastguard Worker%define mzero m9
4424*c0909341SAndroid Build Coastguard Worker%else
4425*c0909341SAndroid Build Coastguard Worker    add dword [rsp+2*gprsize+16*16], 16
4426*c0909341SAndroid Build Coastguard Worker%define mzero m7
4427*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
4428*c0909341SAndroid Build Coastguard Worker%endif
; zero the 256 bytes just consumed: the first 128 relative to the old cq, the
; last 128 with negative offsets after cq has been advanced
4429*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
4430*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*4
4431*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
4432*c0909341SAndroid Build Coastguard Worker%undef mzero
4433*c0909341SAndroid Build Coastguard Worker    dec                 r4d
4434*c0909341SAndroid Build Coastguard Worker    jg .loop_pass2
; WIN64 only: reload r7, which the function entry stored in this stack slot
4435*c0909341SAndroid Build Coastguard Worker%if WIN64
4436*c0909341SAndroid Build Coastguard Worker    mov                  r7, [rsp+16*16+gprsize]
4437*c0909341SAndroid Build Coastguard Worker%endif
4438*c0909341SAndroid Build Coastguard Worker    RET
4439*c0909341SAndroid Build Coastguard Worker
; The three invocations below pair flipadst with dct, adst and flipadst.
; INV_TXFM_16X16_FN is defined outside this view; presumably it emits the
; public 16x16 entry points that dispatch to the *_internal routines.
4440*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, dct
4441*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, adst
4442*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, flipadst
4443*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x16_internal_16bpc entry stub: the same prologue as the iadst
; variant. It saves registers to the stack, puts the address of this
; function's .main in t0 and jumps into the pass-1 driver of
; idct_16x16_internal_16bpc (outside this view).
4444*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
4445*c0909341SAndroid Build Coastguard Worker%if WIN64
; r7 is reloaded from this slot at the end of the iadst .pass2 this function jumps to
4446*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize], r7
4447*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
4448*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
4449*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
4450*c0909341SAndroid Build Coastguard Worker%endif
4451*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
4452*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_16bpc).pass1_full
4453*c0909341SAndroid Build Coastguard Worker
; .main: pass-1 kernel of the flipped ADST. It runs the plain iadst .main and
; then reverses the order of its eight packed result vectors. Each vector
; also has its two 64-bit halves swapped (pshufd q1032), which reverses the
; pair of packed results inside it.
4454*c0909341SAndroid Build Coastguard Worker.main:
4455*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_16bpc).main
4456*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; results are in m0, m2, ..., m14; the odd registers are free after the packs
; in iadst .main, so the low four vectors are parked there while the high
; four are moved down
4457*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
4458*c0909341SAndroid Build Coastguard Worker    mova                 m3, m2
4459*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4
4460*c0909341SAndroid Build Coastguard Worker    mova                 m7, m6
4461*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m14, q1032
4462*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m12, q1032
4463*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m10, q1032
4464*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m8, q1032
4465*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m7, q1032      ; old m6
4466*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m5, q1032      ; old m4
4467*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m3, q1032      ; old m2
4468*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m1, q1032      ; old m0
4469*c0909341SAndroid Build Coastguard Worker%else
; results are in m0/m2/m4/m6 and [r3+8..11*16]; swap the register set with
; the memory set in reverse order (m0 <-> slot 11, ..., m6 <-> slot 8)
4470*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q1032
4471*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m2, q1032
4472*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m4, q1032
4473*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m6, q1032
4474*c0909341SAndroid Build Coastguard Worker    pshufd               m0, [r3+11*16], q1032
4475*c0909341SAndroid Build Coastguard Worker    pshufd               m2, [r3+10*16], q1032
4476*c0909341SAndroid Build Coastguard Worker    pshufd               m4, [r3+9*16], q1032
4477*c0909341SAndroid Build Coastguard Worker    pshufd               m6, [r3+8*16], q1032
4478*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m1
4479*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m3
4480*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 9*16], m5
4481*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m7
4482*c0909341SAndroid Build Coastguard Worker%endif
4483*c0909341SAndroid Build Coastguard Worker    ret
4484*c0909341SAndroid Build Coastguard Worker
4485*c0909341SAndroid Build Coastguard Worker.pass2:
4486*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
4487*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3*5]
4488*c0909341SAndroid Build Coastguard Worker    add                dstq, r3
4489*c0909341SAndroid Build Coastguard Worker    neg             strideq
4490*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x16_internal_16bpc).pass2
4491*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X16_FN for (identity, dct) -- with the extra macro
; argument "h" -- and for (identity, identity).
; NOTE(review): the macro and the meaning of "h" are defined outside this
; chunk -- confirm before relying on them.
4492*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, dct, h
4493*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, identity
4494*c0909341SAndroid Build Coastguard Worker
; iidentity_16x16_internal_16bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:  same prologue as the other 16x16 internal functions in this chunk:
;           stash registers on the stack, t0 = .main, jump to .pass1_full.
;   .main:  scales 16 coefficient vectors as (x * pd_11586 + pd_10240) >> 14
;           and packs neighbouring pairs to saturated 16-bit words.
;   .pass2: processes the block as two 8-pixel-wide column halves, each in
;           four 8x4 steps, clearing the coefficients it has consumed.
4495*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
4496*c0909341SAndroid Build Coastguard Worker%if WIN64
; Win64: keep a copy of r7; .end reloads it from this slot.
4497*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize], r7
4498*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
; x86-32: keep copies of r1 and r6 in two consecutive stack slots.
4499*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*1], r1
4500*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize*2], r6
4501*c0909341SAndroid Build Coastguard Worker%endif
; t0 = address of this function's .main; the shared pass 1 code takes over.
; NOTE(review): presumably .pass1_full calls back through t0 -- confirm.
4502*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(.main)]
4503*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_16bpc).pass1_full
4504*c0909341SAndroid Build Coastguard Worker
4505*c0909341SAndroid Build Coastguard Worker.main:
4506*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64: all 16 vectors ([cq+i*64+r5], i = 0..15) fit in registers. m15
; first holds the multiplier; the last product overwrites it.
4507*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_11586)]
4508*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m15, [cq+ 0*64+r5]
4509*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m15, [cq+ 1*64+r5]
4510*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m15, [cq+ 2*64+r5]
4511*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m15, [cq+ 3*64+r5]
4512*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m15, [cq+ 4*64+r5]
4513*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m15, [cq+ 5*64+r5]
4514*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m15, [cq+ 6*64+r5]
4515*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m15, [cq+ 7*64+r5]
4516*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m15, [cq+ 8*64+r5]
4517*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m15, [cq+ 9*64+r5]
4518*c0909341SAndroid Build Coastguard Worker    pmulld              m10, m15, [cq+10*64+r5]
4519*c0909341SAndroid Build Coastguard Worker    pmulld              m11, m15, [cq+11*64+r5]
4520*c0909341SAndroid Build Coastguard Worker    pmulld              m12, m15, [cq+12*64+r5]
4521*c0909341SAndroid Build Coastguard Worker    pmulld              m13, m15, [cq+13*64+r5]
4522*c0909341SAndroid Build Coastguard Worker    pmulld              m14, m15, [cq+14*64+r5]
4523*c0909341SAndroid Build Coastguard Worker    pmulld              m15, [cq+15*64+r5]
; Park product 15 in [r3] so that m15 can carry the rounding constant; it is
; added back from memory right after the other 15 vectors have been biased.
4524*c0909341SAndroid Build Coastguard Worker    mova               [r3], m15
4525*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_10240)]
4526*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
4527*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14
4528*c0909341SAndroid Build Coastguard Worker    paddd               m15, [r3]
4529*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
4530*c0909341SAndroid Build Coastguard Worker                         m8, m9, m10, m11, m12, m13, m14, m15
; Pack dword pairs to signed-saturated words; results end up in the even
; registers m0, m2, ..., m14.
4531*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
4532*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
4533*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4534*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
4535*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
4536*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
4537*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
4538*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
4539*c0909341SAndroid Build Coastguard Worker%else
; x86-32: only m0-m7 are used, so the same computation is done in two halves
; of eight vectors, with m7 playing the role m15 has on x86-64.
; First half: vectors 0..7.
4540*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_11586)]
4541*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m7, [cq+ 0*64+r5]
4542*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m7, [cq+ 1*64+r5]
4543*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m7, [cq+ 2*64+r5]
4544*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m7, [cq+ 3*64+r5]
4545*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m7, [cq+ 4*64+r5]
4546*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m7, [cq+ 5*64+r5]
4547*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m7, [cq+ 6*64+r5]
4548*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [cq+ 7*64+r5]
4549*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
4550*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_10240)]
4551*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
4552*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3]
4553*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
4554*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
4555*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
4556*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4557*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
; The four packed results of the first half go to stack slots 8..11.
4558*c0909341SAndroid Build Coastguard Worker    mova          [r3+8*16], m0
4559*c0909341SAndroid Build Coastguard Worker    mova          [r3+9*16], m2
4560*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
4561*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m6
; Second half: vectors 8..15; these results stay in m0/m2/m4/m6.
4562*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_11586)]
4563*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m7, [cq+ 8*64+r5]
4564*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m7, [cq+ 9*64+r5]
4565*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m7, [cq+10*64+r5]
4566*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m7, [cq+11*64+r5]
4567*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m7, [cq+12*64+r5]
4568*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m7, [cq+13*64+r5]
4569*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m7, [cq+14*64+r5]
4570*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [cq+15*64+r5]
4571*c0909341SAndroid Build Coastguard Worker    mova               [r3], m7
4572*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_10240)]
4573*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
4574*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3]
4575*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
4576*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
4577*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
4578*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
4579*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
4580*c0909341SAndroid Build Coastguard Worker%endif
4581*c0909341SAndroid Build Coastguard Worker    ret
4582*c0909341SAndroid Build Coastguard Worker
4583*c0909341SAndroid Build Coastguard Worker.pass2:
4584*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64: load the constants once into m4/m5/m7, zero m6, and remember the
; original dst in r7.
; NOTE(review): the routines called in the loop presumably expect the
; constants in exactly these registers -- confirm against their definitions.
4585*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pw_2048)]
4586*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pixel_10bpc_max)]
4587*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
4588*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pw_1697x16)]
4589*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
4590*c0909341SAndroid Build Coastguard Worker%else
; x86-32: remember the original dst in a stack slot instead.
4591*c0909341SAndroid Build Coastguard Worker    mov [rsp+2*gprsize+16*16], dstq
4592*c0909341SAndroid Build Coastguard Worker%endif
; r5 low word = number of 8x4 steps left in this column half;
; r5 bit 16   = "second half already started" flag (toggled by btc below).
; r3 = stride*3.
4593*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 4
4594*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
4595*c0909341SAndroid Build Coastguard Worker.pass2_loop:
4596*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*64+0]
4597*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*64+0]
4598*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*64+0]
4599*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*64+0]
4600*c0909341SAndroid Build Coastguard Worker    call m(iidentity_8x16_internal_16bpc).main
4601*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4602*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
4603*c0909341SAndroid Build Coastguard Worker%else
4604*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
4605*c0909341SAndroid Build Coastguard Worker%endif
; Overwrite the four vectors loaded above (cq+0/64/128/192) with m6.
; m6 was zeroed on x86-64 at the top of .pass2.
; NOTE(review): on x86-32 m6 is not set in this function; presumably the
; callee leaves it zero -- confirm.
4606*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
4607*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
4608*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
; Only the low 16 bits are decremented, so the flag in bit 16 is untouched.
4609*c0909341SAndroid Build Coastguard Worker    dec                 r5w
4610*c0909341SAndroid Build Coastguard Worker    jg .pass2_loop
; Four steps done: cq has advanced 4*16 bytes; add 64*3 for a total of 256.
4611*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*3
; btc returns the old flag in CF and flips it: CF=0 after the first half
; (continue with the second), CF=1 after the second half (finished).
4612*c0909341SAndroid Build Coastguard Worker    btc                 r5d, 16
4613*c0909341SAndroid Build Coastguard Worker    jc .end
; Second half: back to the top row, 16 bytes (8 16-bit pixels) to the right.
4614*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4615*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r7+16]
4616*c0909341SAndroid Build Coastguard Worker%else
4617*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+2*gprsize+16*16]
4618*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
4619*c0909341SAndroid Build Coastguard Worker%endif
; Re-arm the step counter (low word is 0 here, bit 16 stays set).
4620*c0909341SAndroid Build Coastguard Worker    add                 r5d, 4
4621*c0909341SAndroid Build Coastguard Worker    jmp .pass2_loop
4622*c0909341SAndroid Build Coastguard Worker.end:
4623*c0909341SAndroid Build Coastguard Worker%if WIN64
; Win64: reload r7 from the slot written at function entry.
4624*c0909341SAndroid Build Coastguard Worker    mov                  r7, [rsp+16*16+gprsize]
4625*c0909341SAndroid Build Coastguard Worker%endif
4626*c0909341SAndroid Build Coastguard Worker    RET
4627*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_8x32_16bpc(dst, stride, c, eob)
;   Reads 32-bit coefficients from c, rounds them as (x + 5) >> 3 on packed
;   words, adds them to the 16-bit pixels at dst, clamps the result to
;   [0, pixel_10bpc_max] and clears the coefficients that were read.
;   Register roles set up here and relied on by .main (also called from the
;   other identity_identity functions in this file):
;     m6 = 0, m7 = pixel_10bpc_max, r4 = stride*3.
;   The cglobal numbers 4, 7, 8 follow the x86inc convention
;   (arguments loaded, GPRs used, vector registers used).
4628*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
4629*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; NOTE(review): presumably r6 = section start is the base used by o() for
; constant addressing on x86-32 -- o() is defined outside this chunk.
4630*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
4631*c0909341SAndroid Build Coastguard Worker%endif
4632*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pw_5)]
4633*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
4634*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
; Add 21 to the low byte of eob; if that byte addition carries, cmovc puts
; the original eob back.
4635*c0909341SAndroid Build Coastguard Worker    mov                 r5d, eobd
4636*c0909341SAndroid Build Coastguard Worker    add                eobb, 21
4637*c0909341SAndroid Build Coastguard Worker    cmovc              eobd, r5d ; 43, 107, 171 -> 64, 128, 192
4638*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
4639*c0909341SAndroid Build Coastguard Worker.loop:
; Load eight 16-byte vectors spaced 128 bytes apart and pack each pair of
; dword vectors into one vector of signed-saturated words.
4640*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128*0]
4641*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+128*1]
4642*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*2]
4643*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+128*3]
4644*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*4]
4645*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+128*5]
4646*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*6]
4647*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+128*7]
; (x + 5) >> 3 with a saturating add and an arithmetic shift.
4648*c0909341SAndroid Build Coastguard Worker    REPX     {paddsw x, m5}, m0, m1, m2, m3
4649*c0909341SAndroid Build Coastguard Worker    REPX     {psraw  x, 3 }, m0, m1, m2, m3
4650*c0909341SAndroid Build Coastguard Worker    call .main_zero
4651*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
4652*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
; Bit 16 of eobd is a one-bit toggle: the body runs twice (CF=0 the first
; time, CF=1 the second) and the bit is clear again when falling through.
4653*c0909341SAndroid Build Coastguard Worker    btc                eobd, 16
4654*c0909341SAndroid Build Coastguard Worker    jnc .loop
; Each pair of iterations accounts for 64 in eob; stop once it goes negative.
4655*c0909341SAndroid Build Coastguard Worker    sub                eobd, 64
4656*c0909341SAndroid Build Coastguard Worker    jge .loop
4657*c0909341SAndroid Build Coastguard Worker    RET
4658*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4659*c0909341SAndroid Build Coastguard Worker.main_zero:
; Clear the eight coefficient vectors just loaded (m6 = 0), then fall
; through into .main.
4660*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
4661*c0909341SAndroid Build Coastguard Worker.main:
; In:  m0-m3 = four vectors of packed words, m6 = 0, m7 = pixel maximum,
;      r4 = stride*3, dstq = first of four destination rows.
; Out: four 16-byte destination rows updated; m0-m4 clobbered.
; The word/qword interleaves below reshuffle m0-m3 before they are added to
; dst rows 0..3 (NOTE(review): this looks like the usual transpose network --
; confirm the exact element order if it matters).
4662*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0, m1
4663*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
4664*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
4665*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
4666*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m4
4667*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
4668*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m1
4669*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1
4670*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
4671*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
4672*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
4673*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
; Add the residual to the existing pixels of rows 0..3.
4674*c0909341SAndroid Build Coastguard Worker    paddw                m0, [dstq+strideq*0]
4675*c0909341SAndroid Build Coastguard Worker    paddw                m1, [dstq+strideq*1]
4676*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*2]
4677*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+r4       ]
; Clamp to [0, m7] as signed words, then write the rows back.
4678*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
4679*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m7}, m0, m1, m2, m3
4680*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
4681*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
4682*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
4683*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r4       ], m3
4684*c0909341SAndroid Build Coastguard Worker    ret
4685*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_32x8_16bpc(dst, stride, c, eob)
;   Same structure as the 8x32 variant, but the coefficient vectors are
;   32 bytes apart, the scaling is a pmulhrsw by pw_4096, and after every
;   eight destination rows the output moves 16 bytes to the right and back
;   to the top row. The add/clamp/store work is done by the 8x32 .main.
;   Register roles: m5 = pw_4096, m6 = 0, m7 = pixel_10bpc_max,
;   r4 = stride*3, r5 = dst at the top of the current 8-pixel column.
4686*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
4687*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; NOTE(review): presumably the base register used by o() on x86-32 -- o() is
; defined outside this chunk.
4688*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
4689*c0909341SAndroid Build Coastguard Worker%endif
4690*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pw_4096)]
4691*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
4692*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
; Add 21 to the low byte of eob; on a byte carry the original eob is put
; back. r4 is only a temporary here and is overwritten just below.
4693*c0909341SAndroid Build Coastguard Worker    mov                 r4d, eobd
4694*c0909341SAndroid Build Coastguard Worker    add                eobb, 21
4695*c0909341SAndroid Build Coastguard Worker    cmovc              eobd, r4d
4696*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
4697*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
4698*c0909341SAndroid Build Coastguard Worker.loop:
; Load eight 16-byte vectors spaced 32 bytes apart, packing dword pairs to
; signed-saturated words.
4699*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32*0]
4700*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+32*1]
4701*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32*2]
4702*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+32*3]
4703*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*4]
4704*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+32*5]
4705*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*6]
4706*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+32*7]
; Clear the coefficients just read (m6 = 0).
4707*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
; pmulhrsw by 4096 is a rounded arithmetic shift right by 3: (x + 4) >> 3.
4708*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
4709*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
4710*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4711*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
; Bit 16 of eobd toggles so that the body runs twice per column (8 rows).
4712*c0909341SAndroid Build Coastguard Worker    btc                eobd, 16
4713*c0909341SAndroid Build Coastguard Worker    jnc .loop
; cq has advanced 2*16 bytes in the two passes; this makes it 32*8 in total.
4714*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*8-32
; Next column: 16 bytes (8 16-bit pixels) to the right, starting at row 0.
4715*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
4716*c0909341SAndroid Build Coastguard Worker    mov                dstq, r5
; Each column accounts for 64 in eob; stop once it goes negative.
4717*c0909341SAndroid Build Coastguard Worker    sub                eobd, 64
4718*c0909341SAndroid Build Coastguard Worker    jge .loop
4719*c0909341SAndroid Build Coastguard Worker    RET
4720*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_16x32_16bpc(dst, stride, c, eob)
;   The driver below is fully unrolled: it calls .main for up to eight
;   8-pixel-wide by 8-row tiles and returns early as soon as the running eob
;   drops below the next threshold (36, 143, 271, 399 in original eob terms).
;   Each .main call consumes coefficients, scales them, and lets the 8x32
;   .main_zero helper clear them and add/clamp/store eight destination rows.
;   Register roles:
;     m6 = 0, m7 = pixel_10bpc_max, r4 = stride*3,
;     r5 = dst at the left edge of the current band of 8 rows,
;     x86-64 only: m8 = pw_2896x8, m9 = pw_1697x16, m10 = pw_16384,
;                  m11 = pw_8192 (x86-32 reloads the constants through m5/m6).
4721*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
4722*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; NOTE(review): presumably the base register used by o() on x86-32 -- o() is
; defined outside this chunk.
4723*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
4724*c0909341SAndroid Build Coastguard Worker%else
4725*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2896x8)]
4726*c0909341SAndroid Build Coastguard Worker    mova                 m9, [o(pw_1697x16)]
4727*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pw_8192)]
4728*c0909341SAndroid Build Coastguard Worker%endif
4729*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
4730*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
4731*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
4732*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4733*c0909341SAndroid Build Coastguard Worker    paddw               m10, m11, m11 ; pw_16384
4734*c0909341SAndroid Build Coastguard Worker%endif
4735*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
; Tile 1: rows 0-7, left half.
4736*c0909341SAndroid Build Coastguard Worker    call .main
4737*c0909341SAndroid Build Coastguard Worker    sub                eobd, 36
4738*c0909341SAndroid Build Coastguard Worker    jl .ret
; Tile 2: same rows, 16 bytes to the right. .main advanced cq by 32, so
; adding 128*8-32 lands 128*8 bytes past the previous tile's start.
4739*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*8-32
4740*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16]
4741*c0909341SAndroid Build Coastguard Worker    call .main
; Tile 3: step cq back by 128*8 (net +32 per band) and move the band origin
; eight rows down.
4742*c0909341SAndroid Build Coastguard Worker    sub                  cq, 128*8
4743*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+strideq*8]
4744*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
4745*c0909341SAndroid Build Coastguard Worker    call .main
4746*c0909341SAndroid Build Coastguard Worker    sub                eobd, 107 ; eob < 143
4747*c0909341SAndroid Build Coastguard Worker    jl .ret
; Tile 4: right half of the second band.
4748*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*8-32
4749*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16]
4750*c0909341SAndroid Build Coastguard Worker    call .main
; Tile 5: left half of the third band.
4751*c0909341SAndroid Build Coastguard Worker    sub                  cq, 128*8
4752*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+strideq*8]
4753*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
4754*c0909341SAndroid Build Coastguard Worker    call .main
4755*c0909341SAndroid Build Coastguard Worker    sub                eobd, 128 ; eob < 271
4756*c0909341SAndroid Build Coastguard Worker    jl .ret
; Tile 6: right half of the third band.
4757*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*8-32
4758*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16]
4759*c0909341SAndroid Build Coastguard Worker    call .main
; Tile 7: left half of the fourth band.
4760*c0909341SAndroid Build Coastguard Worker    sub                  cq, 128*8
4761*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+strideq*8]
4762*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
4763*c0909341SAndroid Build Coastguard Worker    call .main
4764*c0909341SAndroid Build Coastguard Worker    sub                eobd, 128 ; eob < 399
4765*c0909341SAndroid Build Coastguard Worker    jl .ret
; Tile 8: right half of the fourth band.
4766*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*8-32
4767*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16]
4768*c0909341SAndroid Build Coastguard Worker    call .main
4769*c0909341SAndroid Build Coastguard Worker.ret:
4770*c0909341SAndroid Build Coastguard Worker    RET
4771*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4772*c0909341SAndroid Build Coastguard Worker.main:
; One call runs this body twice (see the btc at the bottom), producing eight
; destination rows and advancing cq by 32 bytes and dstq by eight rows.
; Load eight 16-byte vectors spaced 128 bytes apart, packing dword pairs to
; signed-saturated words.
4773*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128*0]
4774*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+128*1]
4775*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*2]
4776*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+128*3]
4777*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*4]
4778*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+128*5]
4779*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*6]
4780*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+128*7]
; Step 1: x = pmulhrsw(x, pw_2896x8) for all four vectors.
; Step 2 (m0, m1): t = pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384).
4781*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4782*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
4783*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, m0
4784*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, m1
4785*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m4, m5
4786*c0909341SAndroid Build Coastguard Worker%else
; x86-32: m6 is borrowed for constants (it is re-zeroed before the helper
; call); the second multiply overwrites the pw_1697x16 copy in m5.
4787*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pw_2896x8)]
4788*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
4789*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pw_1697x16)]
4790*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5, m0
4791*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m1
4792*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pw_16384)]
4793*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m6 }, m4, m5
4794*c0909341SAndroid Build Coastguard Worker%endif
; x += t (saturating) for m0 and m1.
4795*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
4796*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
; Step 2 again for m2 and m3.
4797*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4798*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, m2
4799*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, m3
4800*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m4, m5
4801*c0909341SAndroid Build Coastguard Worker%else
; x86-32: reload pw_1697x16 into m5; m6 still holds pw_16384 from above.
4802*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pw_1697x16)]
4803*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5, m2
4804*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m3
4805*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m6 }, m4, m5
4806*c0909341SAndroid Build Coastguard Worker%endif
4807*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m4
4808*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m5
; Step 3: final pmulhrsw by pw_8192 on all four vectors.
4809*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4810*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
4811*c0909341SAndroid Build Coastguard Worker%else
; x86-32: derive pw_8192 from the pw_16384 in m6, then restore m6 = 0 because
; the helper called next uses m6 as its zero register.
4812*c0909341SAndroid Build Coastguard Worker    psrlw                m6, 1          ; pw_8192
4813*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
4814*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
4815*c0909341SAndroid Build Coastguard Worker%endif
; Clear the coefficients just read, then add/clamp/store four rows.
4816*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
4817*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4818*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
; Bit 16 of eobd toggles: loop once more on the first pass (CF=0), return on
; the second (CF=1), leaving the bit clear for the caller's eob arithmetic.
4819*c0909341SAndroid Build Coastguard Worker    btc                eobd, 16
4820*c0909341SAndroid Build Coastguard Worker    jnc .main
4821*c0909341SAndroid Build Coastguard Worker    ret
4822*c0909341SAndroid Build Coastguard Worker
4823*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
4824*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4825*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
4826*c0909341SAndroid Build Coastguard Worker%else
4827*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2896x8)]
4828*c0909341SAndroid Build Coastguard Worker    mova                 m9, [o(pw_1697x16)]
4829*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pw_2048)]
4830*c0909341SAndroid Build Coastguard Worker%endif
4831*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
4832*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
4833*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
4834*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
4835*c0909341SAndroid Build Coastguard Worker    call .main
4836*c0909341SAndroid Build Coastguard Worker    sub                eobd, 36
4837*c0909341SAndroid Build Coastguard Worker    jl .ret
4838*c0909341SAndroid Build Coastguard Worker    call .main
4839*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*8-64
4840*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16*1]
4841*c0909341SAndroid Build Coastguard Worker    call .main
4842*c0909341SAndroid Build Coastguard Worker    sub                eobd, 107 ; eob < 143
4843*c0909341SAndroid Build Coastguard Worker    jl .ret
4844*c0909341SAndroid Build Coastguard Worker    call .main
4845*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*8-64
4846*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16*2]
4847*c0909341SAndroid Build Coastguard Worker    call .main
4848*c0909341SAndroid Build Coastguard Worker    sub                eobd, 128 ; eob < 271
4849*c0909341SAndroid Build Coastguard Worker    jl .ret
4850*c0909341SAndroid Build Coastguard Worker    call .main
4851*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*8-64
4852*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16*3]
4853*c0909341SAndroid Build Coastguard Worker    call .main
4854*c0909341SAndroid Build Coastguard Worker    sub                eobd, 128 ; eob < 399
4855*c0909341SAndroid Build Coastguard Worker    jl .ret
4856*c0909341SAndroid Build Coastguard Worker    call .main
4857*c0909341SAndroid Build Coastguard Worker.ret:
4858*c0909341SAndroid Build Coastguard Worker    RET
; .main -- local helper of the function whose entry point precedes this
; point in the file. One pass of the body:
;   1. loads eight 16-byte rows, 64 bytes apart, from cq and packs each pair
;      of dword rows into one word vector (signed saturation) -> m0..m3
;   2. scales m0..m3 (see the step comments below)
;   3. stores m6 over the eight source rows
;   4. calls the .main helper of the 8x32 identity function
;   5. advances dstq by four rows and cq by 16 bytes
; The btc/jnc pair at the end makes the body run a second time.
; NOTE(review): on x86-64 this relies on m8, m9 and m10 being preloaded by
; the caller (presumably the same pw_2896x8 / pw_1697x16 / pw_2048 constants
; the x86-32 path loads explicitly) and on m6 already being zero -- confirm
; against the entry point.
4859*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4860*c0909341SAndroid Build Coastguard Worker.main:
; Gather the 8 input rows as 4 packed word vectors.
4861*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*0]
4862*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+64*1]
4863*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*2]
4864*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+64*3]
4865*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*4]
4866*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+64*5]
4867*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*6]
4868*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+64*7]
; First scaling step: rounded high multiply by m8 (x86-64) or by the
; pw_2896x8 constant loaded into m6 (x86-32).
4869*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4870*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
4871*c0909341SAndroid Build Coastguard Worker%else
4872*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pw_2896x8)]
4873*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
4874*c0909341SAndroid Build Coastguard Worker%endif
; Double every vector with signed saturation.
4875*c0909341SAndroid Build Coastguard Worker    REPX  {paddsw   x, x  }, m0, m1, m2, m3
; Second scaling step, applied to m0/m1 first:
;   x = sat(2*x) + pmulhrsw(x, k), with k = m9 (x86-64) or pw_1697x16 (x86-32).
4876*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4877*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, m0
4878*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, m1
4879*c0909341SAndroid Build Coastguard Worker%else
4880*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pw_1697x16)]
4881*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m6, m0
4882*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m6, m1
4883*c0909341SAndroid Build Coastguard Worker%endif
4884*c0909341SAndroid Build Coastguard Worker    REPX  {paddsw   x, x  }, m0, m1
4885*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
4886*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
; Same step for m2/m3. On x86-32 the product for m3 is written straight into
; m6: the constant it held is not needed again before m6 is reloaded below.
4887*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4888*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, m2
4889*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, m3
4890*c0909341SAndroid Build Coastguard Worker%else
4891*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m6, m2
4892*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m3
4893*c0909341SAndroid Build Coastguard Worker%endif
4894*c0909341SAndroid Build Coastguard Worker    REPX  {paddsw   x, x  }, m2, m3
4895*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m4
; Finish m3, then apply the final rounded multiply by m10 (x86-64) or pw_2048
; (x86-32). The x86-32 path then clears m6 so it can be used as the zero
; source for the stores that follow.
4896*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4897*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m5
4898*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3
4899*c0909341SAndroid Build Coastguard Worker%else
4900*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m6
4901*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pw_2048)]
4902*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
4903*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
4904*c0909341SAndroid Build Coastguard Worker%endif
; Overwrite the eight consumed coefficient rows with m6.
4905*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
; Hand m0..m3 to the shared 8x32 identity helper, then step to the next
; group: four rows down in dst, 16 bytes further in the coefficient buffer.
4906*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
4907*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4908*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
; btc copies bit 16 of eobd into CF and flips the bit; jnc loops when the bit
; was clear. With the bit clear on entry the body therefore runs exactly
; twice and eobd is left unchanged on return.
4909*c0909341SAndroid Build Coastguard Worker    btc                eobd, 16
4910*c0909341SAndroid Build Coastguard Worker    jnc .main
4911*c0909341SAndroid Build Coastguard Worker    ret
4912*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x32_16bpc(dst, stride, c, eob)
;
; Walks the coefficient buffer in tiles of 8 lines (128-byte pitch) by
; 32 bytes, one anti-diagonal of tiles per stage: 1, 2, 3, 4, 3, 2, 1
; tiles. After each stage eob is compared with a threshold
; (36, 136, 300, 535, 755, 911) and the function returns early when eob is
; below it, so later diagonals are skipped.
;
; Set up here:
;   m5 = pw_8192 (multiplier used by .main)
;   m6 = 0, m7 = pixel_10bpc_max, r4 = stride*3
;        (not referenced in this function's visible code; presumably
;         consumed by the 8x32 identity helper that .main calls)
;   r5 = saved dst position that the start of each diagonal is derived from
;
; Every .main call leaves cq advanced by 32 bytes and dstq by 8 rows;
; .main2 additionally rewinds cq by 8 lines and dstq by 16 bytes first.
; The `add cq, ...` before each stage undoes the accumulated offsets and
; moves to the first tile of the next diagonal.
; The numeric trailing comments are kept from the original source.
;-----------------------------------------------------------------------
4913*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
4914*c0909341SAndroid Build Coastguard Worker%undef cmp
; x86-32 only: r6 becomes the base register for the o() constant addressing.
4915*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4916*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
4917*c0909341SAndroid Build Coastguard Worker%endif
4918*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pw_8192)]
4919*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pixel_10bpc_max)]
4920*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
4921*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
4922*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
; Stage 1: single tile at the start of the buffer.
4923*c0909341SAndroid Build Coastguard Worker    call .main                              ; 0
4924*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
4925*c0909341SAndroid Build Coastguard Worker    jl .ret
; Stage 2: two tiles; starts 8 lines further in cq and 16 bytes right in dst.
4926*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*8-32       ; 0 1
4927*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16]        ; 1
4928*c0909341SAndroid Build Coastguard Worker    call .main
4929*c0909341SAndroid Build Coastguard Worker    call .main2
4930*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
4931*c0909341SAndroid Build Coastguard Worker    jl .ret
; Stage 3: three tiles; starts 16 lines into cq and 32 bytes right in dst.
4932*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*16-64      ; 0 1 2
4933*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16*2]      ; 1 2
4934*c0909341SAndroid Build Coastguard Worker    call .main                              ; 2
4935*c0909341SAndroid Build Coastguard Worker    call .main2
4936*c0909341SAndroid Build Coastguard Worker    call .main2
4937*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 300
4938*c0909341SAndroid Build Coastguard Worker    jl .ret
; Stage 4: four tiles; starts 24 lines into cq. r5 is moved 48 bytes right
; and stays at that horizontal position for the remaining stages.
4939*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*24-96      ; 0 1 2 3
4940*c0909341SAndroid Build Coastguard Worker    add                  r5, 16*3           ; 1 2 3
4941*c0909341SAndroid Build Coastguard Worker    mov                dstq, r5             ; 2 3
4942*c0909341SAndroid Build Coastguard Worker    call .main                              ; 3
4943*c0909341SAndroid Build Coastguard Worker    call .main2
4944*c0909341SAndroid Build Coastguard Worker    call .main2
4945*c0909341SAndroid Build Coastguard Worker    call .main2
4946*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 535
4947*c0909341SAndroid Build Coastguard Worker    jl .ret
; Stage 5: three tiles; same starting line group, 32 bytes further along it,
; and the dst start moves 8 rows down (r5 is updated to match).
4948*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*24-96      ; 0 1 2 3
4949*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+strideq*8] ; 1 2 3 4
4950*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq           ; 2 3 4
4951*c0909341SAndroid Build Coastguard Worker    call .main                              ; 3 4
4952*c0909341SAndroid Build Coastguard Worker    call .main2
4953*c0909341SAndroid Build Coastguard Worker    call .main2
4954*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 755
4955*c0909341SAndroid Build Coastguard Worker    jl .ret
; Stage 6: two tiles; dst start moves another 8 rows down.
4956*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*16-64      ; 0 1 2 3
4957*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+strideq*8] ; 1 2 3 4
4958*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq           ; 2 3 4 5
4959*c0909341SAndroid Build Coastguard Worker    call .main                              ; 3 4 5
4960*c0909341SAndroid Build Coastguard Worker    call .main2
4961*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 911
4962*c0909341SAndroid Build Coastguard Worker    jl .ret
; Stage 7: final single tile, another 8 rows down.
4963*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*8-32       ; 0 1 2 3
4964*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+strideq*8] ; 1 2 3 4
4965*c0909341SAndroid Build Coastguard Worker    call .main                              ; 2 3 4 5
; Common exit for every early-out above and for the full-length path.
4966*c0909341SAndroid Build Coastguard Worker.ret:                                       ; 3 4 5 6
4967*c0909341SAndroid Build Coastguard Worker    RET
; .main2 / .main -- tile worker of inv_txfm_add_identity_identity_32x32_16bpc.
;
; .main2: steps to the next tile of the current anti-diagonal by moving cq
;         back eight 128-byte lines and dstq back 16 bytes, then falls
;         through into .main.
; .main:  loads eight 16-byte rows at a 128-byte pitch from cq, packs each
;         pair of dword rows into one word vector with signed saturation
;         (m0..m3), applies a rounded high multiply by m5 (loaded with
;         pw_8192 by the entry point) and calls the .main_zero helper of the
;         8x32 identity function. Afterwards dstq moves four rows down and cq
;         16 bytes forward.
; The body runs twice per call: btc copies bit 16 of eobd into CF and flips
; it, and jnc loops when the bit was clear. With the bit clear on entry, the
; net effect of one call is cq += 32, dstq += 8 rows, eobd unchanged.
4968*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4969*c0909341SAndroid Build Coastguard Worker.main2:
4970*c0909341SAndroid Build Coastguard Worker    sub                  cq, 128*8
4971*c0909341SAndroid Build Coastguard Worker    sub                dstq, 16
4972*c0909341SAndroid Build Coastguard Worker.main:
4973*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128*0]
4974*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+128*1]
4975*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*2]
4976*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+128*3]
4977*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*4]
4978*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+128*5]
4979*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*6]
4980*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+128*7]
4981*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
; NOTE(review): the source rows are not cleared in this block; the helper's
; name suggests it does that itself -- confirm in the 8x32 function.
4982*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
4983*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4984*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
4985*c0909341SAndroid Build Coastguard Worker    btc                eobd, 16
4986*c0909341SAndroid Build Coastguard Worker    jnc .main
4987*c0909341SAndroid Build Coastguard Worker    ret
4988*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_8x32_16bpc(dst, stride, c, eob)
;
; Flow:
;   eob == 0      -> .dconly
;   zero loop     -> clears the stack slots of column groups that the
;                    tbl_8x32_2d thresholds mark as not reached by eob
;   .loop_pass1   -> first pass over the remaining groups, highest index
;                    first; results are packed to words, transposed and
;                    stored in the stack buffer; the consumed coefficients
;                    are zeroed
;   pass 2        -> picks one of three 8bpc SSSE3 idct_8x32 variants by
;                    eob and runs .pass2
;
; Stack slots (16 bytes each, relative to rsp in this block):
;   3 + n   : pass-1 output, n derived from r5 or from tbl_Nx32_odd_offset
;   11 + n  : pass-1 output
;   35      : saved eob at gprsize*0; saved r0 (x86-32) or r7 (WIN64) at
;             gprsize*1
;
; r5 is the loop index, kept as a byte offset into the word tables (index*2).
; t0/t1 are the temporaries selected by DECLARE_REG_TMP.
;-----------------------------------------------------------------------
4989*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
4990*c0909341SAndroid Build Coastguard Worker                                         dst, stride, c, eob
; r6 is the base pointer for constant addressing; `base` names the symbol it
; points at so that o2(x) can form r6-relative addresses.
4991*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4992*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
4993*c0909341SAndroid Build Coastguard Worker%define base $$
4994*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP       0, 4
4995*c0909341SAndroid Build Coastguard Worker%else
4996*c0909341SAndroid Build Coastguard Worker    lea                  r6, [tbl_Nx32_odd_offset]
4997*c0909341SAndroid Build Coastguard Worker%define base tbl_Nx32_odd_offset
4998*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP       4, 7
; WIN64: r7 is used as a temporary here, so it is saved on the stack and
; restored before RET. NOTE(review): presumably needed because only 7 GPRs
; are declared to cglobal and r7 is callee-saved on Win64 -- confirm.
4999*c0909341SAndroid Build Coastguard Worker%if WIN64
5000*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+35*16], r7
5001*c0909341SAndroid Build Coastguard Worker%endif
5002*c0909341SAndroid Build Coastguard Worker%endif
5003*c0909341SAndroid Build Coastguard Worker%define o2(x) r6-base+x
; eob == 0: take the DC-only path.
5004*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5005*c0909341SAndroid Build Coastguard Worker    jz .dconly
5006*c0909341SAndroid Build Coastguard Worker
; x86-32: r0 is saved here and reloaded before pass 2, because one of the
; temporaries declared above uses it.
5007*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
5008*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+35*16], r0
5009*c0909341SAndroid Build Coastguard Worker%endif
5010*c0909341SAndroid Build Coastguard Worker%undef cmp
5011*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
; Start at the last of the eight word entries (byte offset 7*2). If eob is
; already >= that threshold no group can be skipped.
5012*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 7*2
5013*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_8x32_2d)+r5]
5014*c0909341SAndroid Build Coastguard Worker    jge .end_zero_loop
5015*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
; For each skipped group, zero the four stack slots that pass 1 would have
; written. The tbl_Nx32_odd_offset word supplies two byte offsets:
; low byte -> t1, high byte -> t0.
5016*c0909341SAndroid Build Coastguard Worker.zero_loop:
5017*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5018*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
5019*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
5020*c0909341SAndroid Build Coastguard Worker    mova   [rsp+ 3*16+r5*8], m0
5021*c0909341SAndroid Build Coastguard Worker    mova   [rsp+11*16+r5*8], m0
5022*c0909341SAndroid Build Coastguard Worker    mova   [rsp+ 3*16+t0*8], m0
5023*c0909341SAndroid Build Coastguard Worker    mova   [rsp+ 3*16+t1*8], m0
; Step to the previous group and keep zeroing while eob is below its threshold.
5024*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
5025*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_8x32_2d)+r5]
5026*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
5027*c0909341SAndroid Build Coastguard Worker.end_zero_loop:
5028*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
; eob is stored on the stack and reloaded before pass 2; r3 keeps a copy of
; rsp and is the base register for the pass-1 stores below.
5029*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*0+35*16], eobd
5030*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
5031*c0909341SAndroid Build Coastguard Worker.loop_pass1:
; x86-64: load the constants into m11..m14 at the top of every iteration.
5032*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5033*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
5034*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
5035*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
5036*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
5037*c0909341SAndroid Build Coastguard Worker%endif
; Load 8 lines (128-byte pitch) x 16 bytes for the current group (offset r5*8).
5038*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+0*128+r5*8]
5039*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+1*128+r5*8]
5040*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+2*128+r5*8]
5041*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+3*128+r5*8]
5042*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+4*128+r5*8]
5043*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+5*128+r5*8]
5044*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+6*128+r5*8]
5045*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+7*128+r5*8]
; Shared 8x4 first-pass core. pd_2 is added to m0, m6, m5 and m3 before
; .round, and all eight outputs are then shifted right arithmetically by 2.
5046*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
5047*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(pd_2)]
5048*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m1}, m0, m6, m5, m3
5049*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
5050*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
; Pack dword pairs to words (signed saturation) and transpose.
5051*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
5052*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
5053*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
5054*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
5055*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5056*c0909341SAndroid Build Coastguard Worker
; Store the four result vectors: m0 and m2 go to slots indexed by r5,
; m1 and m3 to the slots named by the odd-offset table (t1 = low byte,
; t0 = high byte).
5057*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5058*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
5059*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
5060*c0909341SAndroid Build Coastguard Worker    mova    [r3+ 3*16+r5*8], m0
5061*c0909341SAndroid Build Coastguard Worker    mova    [r3+11*16+r5*8], m2
5062*c0909341SAndroid Build Coastguard Worker    mova    [r3+ 3*16+t1*8], m1
5063*c0909341SAndroid Build Coastguard Worker    mova    [r3+ 3*16+t0*8], m3
; Zero the coefficients just consumed, then move to the previous group;
; the loop ends once r5 goes negative.
5064*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
5065*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
5066*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
5067*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
5068*c0909341SAndroid Build Coastguard Worker
5069*c0909341SAndroid Build Coastguard Worker    ; pass 2 code starts here
5070*c0909341SAndroid Build Coastguard Worker    ; m0 is already loaded from last iteration of first pass
5071*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
5072*c0909341SAndroid Build Coastguard Worker    mov                  r0, [rsp+gprsize*1+35*16]
5073*c0909341SAndroid Build Coastguard Worker%endif
; Reload eob and choose the pass-2 core by how far the coefficients extend:
;   eob < 43  -> .main_veryfast
;   eob < 107 -> .main_fast
;   otherwise -> .main
; The chosen address is kept in r4 and called from .pass2.
5074*c0909341SAndroid Build Coastguard Worker    mov                eobd, [rsp+gprsize*0+35*16]
5075*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
5076*c0909341SAndroid Build Coastguard Worker    jl .load_veryfast
5077*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 107
5078*c0909341SAndroid Build Coastguard Worker    jl .load_fast
5079*c0909341SAndroid Build Coastguard Worker    ; load normal
5080*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
5081*c0909341SAndroid Build Coastguard Worker    jmp .run
5082*c0909341SAndroid Build Coastguard Worker.load_fast:
5083*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
5084*c0909341SAndroid Build Coastguard Worker    jmp .run
5085*c0909341SAndroid Build Coastguard Worker.load_veryfast:
5086*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
5087*c0909341SAndroid Build Coastguard Worker    ; fall-through
5088*c0909341SAndroid Build Coastguard Worker.run:
5089*c0909341SAndroid Build Coastguard Worker    call .pass2
; WIN64: restore the r7 value saved at function entry.
5090*c0909341SAndroid Build Coastguard Worker%if WIN64
5091*c0909341SAndroid Build Coastguard Worker    mov                  r7, [rsp+gprsize*1+35*16]
5092*c0909341SAndroid Build Coastguard Worker%endif
5093*c0909341SAndroid Build Coastguard Worker    RET
5094*c0909341SAndroid Build Coastguard Worker
; .pass2 -- second pass of inv_txfm_add_dct_dct_8x32_16bpc.
;
; Inputs:  the stack buffer filled by pass 1, m0 (left in place by the last
;          pass-1 iteration, per the caller's comment), r4 = address of the
;          idct_8x32 8bpc core variant to run, dstq/strideq.
; Steps:
;   1. slots 4..10 -> m1..m7, run the 8bpc idct_8x8 core, store m0..m6 back
;      to slots 3..9
;   2. slots 11..18 -> m0..m7, run the 8bpc idct_16x8 core, fetch m7 from
;      slot 0, store m0..m7 back to slots 11..18
;   3. call the core selected in r4
;   4. write 32 output rows as four 8x8 groups via round1_and_write_8x8,
;      advancing dstq by 8 rows between groups
; All stack offsets here carry an extra gprsize compared with the caller's.
; NOTE(review): presumably this skips the return address pushed by
; `call .pass2` -- confirm.
5095*c0909341SAndroid Build Coastguard Worker.pass2:
; x86-32: r5 is pointed at itx8_start before calling the 8bpc helpers.
; NOTE(review): presumably their constant base register -- confirm.
5096*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
5097*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
5098*c0909341SAndroid Build Coastguard Worker%endif
; Step 1: m0 is not loaded here; only m1..m7 come from the stack.
5099*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+16* 4]
5100*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+16* 5]
5101*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+16* 6]
5102*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+16* 7]
5103*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+16* 8]
5104*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+16* 9]
5105*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+16*10]
5106*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
; Only m0..m6 are written back (slots 3..9); m7 is not stored here.
5107*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 3*16], m0
5108*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 4*16], m1
5109*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 5*16], m2
5110*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 6*16], m3
5111*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 7*16], m4
5112*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 8*16], m5
5113*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 9*16], m6
; Step 2: run the 16x8 core on slots 11..18.
5114*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+11*16]
5115*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+12*16]
5116*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+13*16]
5117*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+14*16]
5118*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+15*16]
5119*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+16*16]
5120*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+17*16]
5121*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+18*16]
5122*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
; m7 is taken from slot 0 rather than from the register the helper returned.
; NOTE(review): presumably the helper leaves its eighth output there -- confirm.
5123*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+ 0*16]
5124*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+11*16], m0
5125*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+12*16], m1
5126*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+13*16], m2
5127*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+14*16], m3
5128*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+15*16], m4
5129*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*16], m5
5130*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+17*16], m6
5131*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+18*16], m7
; Step 3: indirect call to the idct_8x32 core variant chosen by the caller.
5132*c0909341SAndroid Build Coastguard Worker    call                 r4
; x86-64: load the registers used by the write-out helper
; (m8 = pw_2048, m9 = 0, m10 = pixel_10bpc_max).
5133*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5134*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
5135*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
5136*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
5137*c0909341SAndroid Build Coastguard Worker%endif
; Step 4: r3 = stride*3. The first group is written from m0..m7 exactly as
; the core left them, with no reload from the stack.
5138*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
5139*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
; Second group: slots 11..18.
5140*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
5141*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+11*16]
5142*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+12*16]
5143*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+13*16]
5144*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+14*16]
5145*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+15*16]
5146*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+16*16]
5147*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+17*16]
5148*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+18*16]
5149*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
; Third group: slots 19..26.
5150*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
5151*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+19*16]
5152*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+20*16]
5153*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+21*16]
5154*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+22*16]
5155*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+23*16]
5156*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+24*16]
5157*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+25*16]
5158*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+26*16]
5159*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
; Fourth group: slots 27..34.
5160*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
5161*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+27*16]
5162*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+28*16]
5163*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+29*16]
5164*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+30*16]
5165*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+31*16]
5166*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+32*16]
5167*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+33*16]
5168*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+34*16]
5169*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
5170*c0909341SAndroid Build Coastguard Worker    ret
; .dconly -- DC-only path of inv_txfm_add_dct_dct_8x32_16bpc, reached when
; eob == 0.
;   r5d = ([cq] * 181 + 640) >> 10   (arithmetic shift)
;   [cq] is overwritten with eobd, which is zero on this path
;   r3d = 8 is passed on to the shared tail
; Control then tail-jumps into the 8x8 function's .end2 label, so this
; function's own RET is not executed on this path.
5171*c0909341SAndroid Build Coastguard Worker.dconly:
5172*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
5173*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
5174*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8
5175*c0909341SAndroid Build Coastguard Worker    add                 r5d, 640
5176*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 10
; NOTE(review): rsp is moved up by (31 + 2*ARCH_X86_64)*16 bytes before the
; jump, presumably so the remaining frame matches what the 8x8 function's
; epilogue releases -- confirm against that function's stack size.
5177*c0909341SAndroid Build Coastguard Worker    add                 rsp, (31+2*ARCH_X86_64)*16
5178*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
5179*c0909341SAndroid Build Coastguard Worker
5180*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
5181*c0909341SAndroid Build Coastguard Worker                                          dst, stride, c, eob
5182*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
5183*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5184*c0909341SAndroid Build Coastguard Worker    jz .dconly
5185*c0909341SAndroid Build Coastguard Worker
5186*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
5187*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+76*16], r0
5188*c0909341SAndroid Build Coastguard Worker%elif WIN64
5189*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+76*16], r7
5190*c0909341SAndroid Build Coastguard Worker%endif
5191*c0909341SAndroid Build Coastguard Worker%undef cmp
5192*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
5193*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 7*2
5194*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
5195*c0909341SAndroid Build Coastguard Worker    jge .end_zero_loop
5196*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
5197*c0909341SAndroid Build Coastguard Worker.zero_loop:
5198*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5199*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
5200*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
5201*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+r5*8], m0
5202*c0909341SAndroid Build Coastguard Worker    mova   [rsp+20*16+r5*8], m0
5203*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t0*8], m0
5204*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t1*8], m0
5205*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+r5*8], m0
5206*c0909341SAndroid Build Coastguard Worker    mova   [rsp+52*16+r5*8], m0
5207*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+t0*8], m0
5208*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+t1*8], m0
5209*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
5210*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
5211*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
5212*c0909341SAndroid Build Coastguard Worker.end_zero_loop:
5213*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
5214*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*0+76*16], eobd
5215*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
5216*c0909341SAndroid Build Coastguard Worker.loop_pass1:
5217*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5218*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
5219*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
5220*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
5221*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
5222*c0909341SAndroid Build Coastguard Worker%endif
5223*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 1*128+r5*8]
5224*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 3*128+r5*8]
5225*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 5*128+r5*8]
5226*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 7*128+r5*8]
5227*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 9*128+r5*8]
5228*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+11*128+r5*8]
5229*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+13*128+r5*8]
5230*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*128+r5*8]
5231*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
5232*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
5233*c0909341SAndroid Build Coastguard Worker
5234*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*128+r5*8]
5235*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 2*128+r5*8]
5236*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*128+r5*8]
5237*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 6*128+r5*8]
5238*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*128+r5*8]
5239*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+10*128+r5*8]
5240*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*128+r5*8]
5241*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+14*128+r5*8]
5242*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
5243*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
5244*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
5245*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).round
5246*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5247*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
5248*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
5249*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
5250*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
5251*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
5252*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
5253*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
5254*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
5255*c0909341SAndroid Build Coastguard Worker%endif
5256*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5257*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5258*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
5259*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
5260*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5261*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+r5*8], m0
5262*c0909341SAndroid Build Coastguard Worker    mova   [rsp+20*16+r5*8], m2
5263*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t1*8], m1
5264*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t0*8], m3
5265*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5266*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+r5*8], m8
5267*c0909341SAndroid Build Coastguard Worker    mova   [rsp+52*16+r5*8], m10
5268*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+t1*8], m9
5269*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+t0*8], m11
5270*c0909341SAndroid Build Coastguard Worker%else
5271*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+r5*8], m0
5272*c0909341SAndroid Build Coastguard Worker    mova   [rsp+52*16+r5*8], m2
5273*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+t1*8], m1
5274*c0909341SAndroid Build Coastguard Worker    mova   [rsp+44*16+t0*8], m3
5275*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
5276*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 9*16]
5277*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+10*16]
5278*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+11*16]
5279*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5280*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+r5*8], m0
5281*c0909341SAndroid Build Coastguard Worker    mova   [rsp+20*16+r5*8], m2
5282*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t1*8], m1
5283*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t0*8], m3
5284*c0909341SAndroid Build Coastguard Worker%endif
5285*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
5286*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
5287*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
5288*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
5289*c0909341SAndroid Build Coastguard Worker
5290*c0909341SAndroid Build Coastguard Worker    ; pass=2
5291*c0909341SAndroid Build Coastguard Worker    add                 rsp, 9*16
5292*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5293*c0909341SAndroid Build Coastguard Worker    mov                  r6, dstq
5294*c0909341SAndroid Build Coastguard Worker%else
5295*c0909341SAndroid Build Coastguard Worker    mov                dstq, [rsp+gprsize*1+67*16]
5296*c0909341SAndroid Build Coastguard Worker%endif
5297*c0909341SAndroid Build Coastguard Worker    mov                eobd, [rsp+gprsize*0+67*16]
5298*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 44
5299*c0909341SAndroid Build Coastguard Worker    jl .load_veryfast
5300*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
5301*c0909341SAndroid Build Coastguard Worker    jl .load_fast
5302*c0909341SAndroid Build Coastguard Worker    ; load normal
5303*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
5304*c0909341SAndroid Build Coastguard Worker    jmp .run
5305*c0909341SAndroid Build Coastguard Worker.load_fast:
5306*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
5307*c0909341SAndroid Build Coastguard Worker    jmp .run
5308*c0909341SAndroid Build Coastguard Worker.load_veryfast:
5309*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
5310*c0909341SAndroid Build Coastguard Worker    ; fall-through
5311*c0909341SAndroid Build Coastguard Worker.run:
5312*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5313*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+32]
5314*c0909341SAndroid Build Coastguard Worker    mov                  r7, -4
5315*c0909341SAndroid Build Coastguard Worker%else
5316*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+67*16]
5317*c0909341SAndroid Build Coastguard Worker    mov dword [r2+0*gprsize], 2
5318*c0909341SAndroid Build Coastguard Worker%endif
5319*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass2_entry
5320*c0909341SAndroid Build Coastguard Worker.loop_pass2:
5321*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16* 3]
5322*c0909341SAndroid Build Coastguard Worker.loop_pass2_entry:
5323*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
5324*c0909341SAndroid Build Coastguard Worker    mov                dstq, [r2+1*gprsize]
5325*c0909341SAndroid Build Coastguard Worker%endif
5326*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
5327*c0909341SAndroid Build Coastguard Worker    add                 rsp, 32*16
5328*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5329*c0909341SAndroid Build Coastguard Worker    add                  r7, 2
5330*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r2+r7*8]
5331*c0909341SAndroid Build Coastguard Worker    jl .loop_pass2
5332*c0909341SAndroid Build Coastguard Worker%if WIN64
5333*c0909341SAndroid Build Coastguard Worker    mov                  r7, [rsp+gprsize*1+3*16]
5334*c0909341SAndroid Build Coastguard Worker%endif
5335*c0909341SAndroid Build Coastguard Worker%else
5336*c0909341SAndroid Build Coastguard Worker    add dword [r2+1*gprsize], 16
5337*c0909341SAndroid Build Coastguard Worker    dec dword [r2+0*gprsize]
5338*c0909341SAndroid Build Coastguard Worker    jg .loop_pass2
5339*c0909341SAndroid Build Coastguard Worker%endif
5340*c0909341SAndroid Build Coastguard Worker%assign stack_size (stack_size-73*16)
5341*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
5342*c0909341SAndroid Build Coastguard Worker%assign stack_size_padded (stack_size_padded-73*16)
5343*c0909341SAndroid Build Coastguard Worker%assign stack_offset (stack_offset-73*16)
5344*c0909341SAndroid Build Coastguard Worker%else
5345*c0909341SAndroid Build Coastguard Worker%xdefine rstkm [rsp + stack_size]
5346*c0909341SAndroid Build Coastguard Worker%endif
5347*c0909341SAndroid Build Coastguard Worker    RET
; DC-only exit path. It is reached by a jump from earlier in this function;
; the existing "; 0" note on the store below says eobd is 0 at this point.
; Computes r5d = ((dc * 181 + 128) >> 8) * 181 from the first coefficient,
; overwrites that coefficient with eobd, and finishes in the 16x4 function's
; .dconly code.
5348*c0909341SAndroid Build Coastguard Worker.dconly:
    ; r5d = dc * 181, dc being the dword at [cq]
5349*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
5350*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
    ; NOTE(review): r3d = 32 is presumably the row count consumed by the
    ; shared dconly tail -- confirm against the jump target
5351*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 32
    ; rounding shift by 8, then a second multiply by 181
5352*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
5353*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
5354*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
    ; release (65+4*ARCH_X86_64) 16-byte stack slots before leaving via jmp
    ; NOTE(review): presumably this makes rsp match what the jump target's
    ; epilogue expects -- verify against both functions' stack declarations
5355*c0909341SAndroid Build Coastguard Worker    add                 rsp, (65+4*ARCH_X86_64)*16
    ; tail-jump (no return to this function)
5356*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
5357*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_32x8_16bpc(dst, stride, c, eob)
;
; Entry point of the 32x8 DCT/DCT inverse transform. Declared with four named
; arguments (dst, stride, c, eob) and (24+8*ARCH_X86_32) 16-byte stack slots.
;
; Outline of the code below:
;   1. eob == 0 jumps to .dconly (defined further down in this function).
;   2. r5d = 2 when eob >= 10, else 0. The pass-1 loop runs for r5 = 2 and
;      then r5 = 0, or only for r5 = 0; each iteration works on the 16-byte
;      column group at byte offset r5*8 of every 32-byte coefficient row.
;   3. Per iteration the 32 rows are fed to four helpers:
;        rows 1,7,9,15,17,23,25,31   -> .main_oddhalf_part1
;        rows 3,5,11,13,19,21,27,29  -> .main_oddhalf_part2
;        rows 2,6,10,...,30          -> idct_16x4 .main_oddhalf
;        rows 0,4,8,...,28           -> idct_8x4 .main_pass1 + .round
;      followed by .round_dct32, with r3 pointing into stack scratch.
;   4. The transposed results are written back into rows 4..15 of cq (and
;      rows 0..3 when r5 != 0); rows 16..31 are zeroed.
;   5. Pass 2 is delegated to idct_16x8_internal_16bpc.pass2_main with r4d = 4.
5358*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
5359*c0909341SAndroid Build Coastguard Worker                                         dst, stride, c, eob
5360*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 only: r6 = $$ (start of the current section); presumably the base
    ; that the o() constant-addressing macro is relative to -- TODO confirm
5361*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
5362*c0909341SAndroid Build Coastguard Worker%endif
5363*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5364*c0909341SAndroid Build Coastguard Worker    jz .dconly
5365*c0909341SAndroid Build Coastguard Worker
5366*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
    ; %undef drops any single-line macro named cmp, so the compares below are
    ; the plain instruction
5367*c0909341SAndroid Build Coastguard Worker%undef cmp
5368*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; r5d = (eob >= 10) ? 1 : 0 (signed compare)
5369*c0909341SAndroid Build Coastguard Worker    xor                 r5d, r5d
5370*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 10
5371*c0909341SAndroid Build Coastguard Worker    setge               r5b
5372*c0909341SAndroid Build Coastguard Worker%else
    ; r5d = 1 - CF, where CF is set when eob < 10 (unsigned compare); same
    ; result as the x86-64 branch for non-negative eob
5373*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 1
5374*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 10
5375*c0909341SAndroid Build Coastguard Worker    sbb                 r5d, 0
5376*c0909341SAndroid Build Coastguard Worker%endif
    ; r5d = 0 or 2; used as r5*8 = byte offset 0 or 16 within each row
5377*c0909341SAndroid Build Coastguard Worker    add                 r5d, r5d
5378*c0909341SAndroid Build Coastguard Worker
5379*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
5380*c0909341SAndroid Build Coastguard Worker.loop_pass1:
    ; rows 1, 7, 9, 15, 17, 23, 25, 31 -> inputs of .main_oddhalf_part1
5381*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32* 1+r5*8]
5382*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32* 7+r5*8]
5383*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32* 9+r5*8]
5384*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*15+r5*8]
5385*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+32*17+r5*8]
5386*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+32*23+r5*8]
5387*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*25+r5*8]
5388*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+32*31+r5*8]
5389*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64 keeps the rounding constant (m11), the clamp bounds (m12/m13)
    ; and pd_2896 (m14) in registers for the helpers called below; they are
    ; reloaded each iteration because m8-m14 are overwritten further down
5390*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
5391*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
5392*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
5393*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
5394*c0909341SAndroid Build Coastguard Worker%endif
    ; r3 = base of the stack scratch area the helpers write their results to
5395*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
5396*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf_part1
    ; rows 3, 5, 11, 13, 19, 21, 27, 29 -> inputs of .main_oddhalf_part2
5397*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32* 3+r5*8]
5398*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32* 5+r5*8]
5399*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*11+r5*8]
5400*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*13+r5*8]
5401*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+32*19+r5*8]
5402*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+32*21+r5*8]
5403*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*27+r5*8]
5404*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+32*29+r5*8]
5405*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf_part2
    ; rows 2, 6, 10, ..., 30 -> odd half of the 16-point stage
5406*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32* 2+r5*8]
5407*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32* 6+r5*8]
5408*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*10+r5*8]
5409*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*14+r5*8]
5410*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+32*18+r5*8]
5411*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+32*22+r5*8]
5412*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*26+r5*8]
5413*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+32*30+r5*8]
    ; move r3 past the slots used above: 16 slots on x86-64, 20 on x86-32
5414*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*(16+4*ARCH_X86_32)
5415*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
    ; rows 0, 4, 8, ..., 28 -> 8-point stage
5416*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32* 0+r5*8]
5417*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32* 4+r5*8]
5418*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32* 8+r5*8]
5419*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*12+r5*8]
5420*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+32*16+r5*8]
5421*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+32*20+r5*8]
5422*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*24+r5*8]
5423*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+32*28+r5*8]
5424*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
5425*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
    ; restore r3 to the scratch base before the final 32-point rounding step
5426*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*(16+4*ARCH_X86_32)
5427*c0909341SAndroid Build Coastguard Worker    call .round_dct32
5428*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: m0-m7 and m8-m15 are transposed in registers; the remaining two
    ; groups are fetched from the scratch area at r3 (odd slots, then even
    ; slots). Results go to rows 8..11, 4..7 and 12..15 of cq respectively.
    ; NOTE(review): the trailing number pairs are kept from the original and
    ; presumably name the output rows packed in each register -- confirm
    ; against .round_dct32
5429*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5430*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5431*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 8+r5*8], m8
5432*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 9+r5*8], m9
5433*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*10+r5*8], m10
5434*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*11+r5*8], m11
5435*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 9] ;  8  9
5436*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*11] ; 10 11
5437*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*13] ; 12 13
5438*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*15] ; 14 15
5439*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5440*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 4+r5*8], m8
5441*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 5+r5*8], m9
5442*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 6+r5*8], m10
5443*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 7+r5*8], m11
5444*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 8] ; 24 25
5445*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*10] ; 26 27
5446*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*12] ; 28 29
5447*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*14] ; 30 31
5448*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5449*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*12+r5*8], m8
5450*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*13+r5*8], m9
5451*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*14+r5*8], m10
5452*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*15+r5*8], m11
5453*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: all 32 dword result vectors are in the scratch area. Each group
    ; of eight is packed to words pairwise with packssdw (signed saturation),
    ; transposed and stored. r3 is moved back by 8 slots first, so the slot
    ; numbers below are relative to that lowered base.
5454*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16
    ; slots 8..15 -> rows 4..7
5455*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
5456*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+10*16]
5457*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+12*16]
5458*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+14*16]
5459*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 9*16]
5460*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+11*16]
5461*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+13*16]
5462*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+15*16]
5463*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5464*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 4+r5*8], m0
5465*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 5+r5*8], m1
5466*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 6+r5*8], m2
5467*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 7+r5*8], m3
    ; slots 16..23 -> rows 8..11
5468*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+16*16]
5469*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+18*16]
5470*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+20*16]
5471*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+22*16]
5472*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+17*16]
5473*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+19*16]
5474*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+21*16]
5475*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+23*16]
5476*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5477*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 8+r5*8], m0
5478*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 9+r5*8], m1
5479*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*10+r5*8], m2
5480*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*11+r5*8], m3
    ; slots 31..24, read in descending order -> rows 12..15
5481*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+31*16]
5482*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+29*16]
5483*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+27*16]
5484*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+25*16]
5485*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+30*16]
5486*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+28*16]
5487*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+26*16]
5488*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+24*16]
5489*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5490*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*12+r5*8], m0
5491*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*13+r5*8], m1
5492*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*14+r5*8], m2
5493*c0909341SAndroid Build Coastguard Worker    mova    [cq+32*15+r5*8], m3
    ; slots 0..7 -> left in m0-m3 (stored to rows 0..3 below only if r5 != 0)
5494*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 0*16]
5495*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 2*16]
5496*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+ 4*16]
5497*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+ 6*16]
5498*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 1*16]
5499*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+ 3*16]
5500*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+ 5*16]
5501*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+ 7*16]
5502*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
5503*c0909341SAndroid Build Coastguard Worker%endif
5504*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
5505*c0909341SAndroid Build Coastguard Worker    ; zero rows 16..31 of [cq] for this column group
5506*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
5507*c0909341SAndroid Build Coastguard Worker                                    24, 25, 26, 27, 28, 29, 30, 31
    ; last iteration (r5 == 0): leave m0-m3 in registers and go to pass 2
    ; (presumably pass2_main picks them up from there -- TODO confirm);
    ; otherwise store them to rows 0..3 and process the r5 = 0 group next
5508*c0909341SAndroid Build Coastguard Worker    test                r5d, r5d
5509*c0909341SAndroid Build Coastguard Worker    jz .end_pass1
5510*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 0+r5*8], m0
5511*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 1+r5*8], m1
5512*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 2+r5*8], m2
5513*c0909341SAndroid Build Coastguard Worker    mova    [cq+32* 3+r5*8], m3
5514*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
5515*c0909341SAndroid Build Coastguard Worker    jmp .loop_pass1
5516*c0909341SAndroid Build Coastguard Worker.end_pass1:
5517*c0909341SAndroid Build Coastguard Worker
5518*c0909341SAndroid Build Coastguard Worker    ; pass=2: this must be a call (not a jump), otherwise the stack pointer
5519*c0909341SAndroid Build Coastguard Worker    ; has the wrong offset in the 8-bit code
    ; r4d = 4 is a parameter of pass2_main (presumably its iteration count --
    ; verify against idct_16x8_internal_16bpc)
5520*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 4
5521*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_16bpc).pass2_main
5522*c0909341SAndroid Build Coastguard Worker    RET
5523*c0909341SAndroid Build Coastguard Worker
; .main_oddhalf_part1_fast / .main_oddhalf_part1
;
; Two entry points that share the tail starting at .main_oddhalf_part1_fast2.
;   in:   m0-m7 = the eight inputs named on the .main_oddhalf_part1 label
;         (the _fast entry reads only m0-m3)
;         r3    = pointer to scratch memory organised in 16-byte slots
;         x86-64 only: m11 is added before every >>12, and m12/m13 are the
;         pmaxsd/pminsd clamp bounds, so the caller has to preload them
;   out:  [r3+16*0..7] = t16a, t17, t18a, t19, t28, t29a, t30, t31a
;         (names as used in the per-instruction comments below)
;   x86-64 overwrites m8-m10 and m15; x86-32 uses [r3+16*0..3] as spill space
;   while computing.
; NOTE(review): ITX_MULSUB_2D is defined outside this excerpt. The comments
; below assume it computes a rotation of its first two register arguments by
; the two trailing coefficients, and that its sixth argument selects the
; rounding register ("_" = no rounding inside the macro) -- confirm.
5524*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part1_fast: ; lower half zero
    ; Plain products with the same constants as the ITX_MULSUB_2D calls in
    ; .main_oddhalf_part1; only m0-m3 are read, the second input of each pair
    ; being treated as zero (see the label comment).
5525*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m0, [o(pd_4091)]
5526*c0909341SAndroid Build Coastguard Worker    pmulld               m0, [o(pd_201)]
5527*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3, [o(pd_m2751)]
5528*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: set up the register/spill layout that .main_oddhalf_part1_fast2
    ; expects: m0 and the 4091 product already rounded (+2048, >>12), the
    ; latter parked in [r3+3*16]; the 3035 product in m7; pd_2048 in m3.
5529*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3035)]
5530*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
5531*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m7
5532*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m7
5533*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
5534*c0909341SAndroid Build Coastguard Worker    mova                 m7, m3
5535*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
5536*c0909341SAndroid Build Coastguard Worker%else
5537*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3035)]
5538*c0909341SAndroid Build Coastguard Worker%endif
5539*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1, [o(pd_m1380)]
5540*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [o(pd_3857)]
5541*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m2, [o(pd_3703)]
5542*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_1751)]
5543*c0909341SAndroid Build Coastguard Worker    jmp .main_oddhalf_part1_fast2
5544*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
5545*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; first stage: four rotations, left unrounded (rounding is applied to all
    ; eight results right after the fast2 label)
5546*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  201, 4091 ; t16a, t31a
5547*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
5548*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
5549*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
5550*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part1_fast2:
    ; (x + m11) >> 12 on all eight first-stage values
5551*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
5552*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
    ; butterflies (sum/difference pairs)
5553*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m4 ; t17
5554*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t16
5555*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m2 ; t18
5556*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2     ; t19
5557*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m5 ; t29
5558*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t28
5559*c0909341SAndroid Build Coastguard Worker    psubd                m5, m7, m3 ; t30
5560*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t31
    ; clamp every intermediate to [m12, m13]
5561*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
5562*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
    ; rotations with the coefficients held in registers m10/m15 and m11 as
    ; rounding register
5563*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_4017)]
5564*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_799)]
5565*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 8, 3, 9, _, 11, 10, 15    ; t17a, t30a
5566*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
5567*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m6 ; t19a
5568*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; t16a
5569*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7, m1 ; t28a
5570*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1     ; t31a
5571*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5, m4 ; t18
5572*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t17
5573*c0909341SAndroid Build Coastguard Worker    psubd                m4, m8, m2 ; t29
5574*c0909341SAndroid Build Coastguard Worker    paddd                m8, m2     ; t30
5575*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
5576*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
5577*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_3784)]
5578*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_1567)]
5579*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
5580*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 3, 2, 9, _, 11, 10, 15 ; t19,  t28
    ; results: slots 0..7 = t16a, t17, t18a, t19, t28, t29a, t30, t31a
5581*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*0], m0
5582*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*1], m5
5583*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*2], m4
5584*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*3], m6
5585*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*4], m3
5586*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*5], m1
5587*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*6], m8
5588*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*7], m7
5589*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32 has only m0-m7, so m2-m5 are spilled to slots 0..3 to free
    ; temporaries for the first two rotations
5590*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m2
5591*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m3
5592*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m4
5593*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m5
    ; m3 = rounding constant; it stays live until .main_oddhalf_part1_fast2
5594*c0909341SAndroid Build Coastguard Worker    mova                  m3, [o(pd_2048)]
    ; the first rotation is rounded inside the macro (rounding register m3);
    ; the other three are rounded after the fast2 label
5595*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         0, 7, 2, 4, 5, 3,  201, 4091 ; t16a, t31a
5596*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
    ; swap the spilled inputs back in while parking the results just computed:
    ; slot0 = t16a, slot1 = t28a, slot2 = t19a, slot3 = t31a
5597*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+2*16]
5598*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+3*16]
5599*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m6
5600*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
5601*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+0*16]
5602*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+1*16]
5603*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
5604*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
5605*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
5606*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
    ; reload t16a, t28a, t19a; t31a stays in slot 3 until it is needed
5607*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
5608*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+1*16]
5609*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+2*16]
5610*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part1_fast2:
    ; state here: m3 = pd_2048, m0 and [r3+3*16] already rounded; round the
    ; six remaining first-stage values
5611*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m1, m2, m4, m5, m6, m7
5612*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m1, m2, m4, m5, m6, m7
    ; t17 is spilled to slot 0 immediately so that m3 can take the value
    ; parked in slot 3
5613*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m4 ; t17
5614*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m3
5615*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+3*16]
5616*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t16
5617*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m2 ; t18
5618*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2     ; t19
5619*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m5 ; t29
5620*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t28
5621*c0909341SAndroid Build Coastguard Worker    psubd                m5, m3, m7 ; t30
5622*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t31
    ; clamp the seven register values; the spilled t17 is clamped through
    ; memory: max against slot 0 and store back, then min against slot 0,
    ; which leaves the fully clamped t17 in m3
5623*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
5624*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5625*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+0*16]
5626*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m3
5627*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
5628*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5629*c0909341SAndroid Build Coastguard Worker    pminsd               m3, [r3+0*16]
    ; park t16, t28, t19, t31 to free registers for the next rotations
5630*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
5631*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m1
5632*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m6
5633*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
5634*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
    ; NOTE(review): in the second call the coefficient argument "7" appears to
    ; name register m7, reusing the 799 constant left there by the first call
    ; (the x86-64 branch passes both coefficients as registers) -- confirm
    ; against the macro definition
5635*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 3, 1, 6, 7, 0,  799, 4017    ; t17a, t30a
5636*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 4, 1, 6, _, 0,    7, 4017, 4 ; t29a, t18a
5637*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5, m4 ; t18
5638*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t17
5639*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m2 ; t29
5640*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2     ; t30
    ; bring back t16, t28, t19, t31; t30 takes over slot 0 while m3 is reused
5641*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
5642*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+1*16]
5643*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+2*16]
5644*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
5645*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m3
5646*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m6 ; t19a
5647*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; t16a
5648*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7, m2 ; t28a
5649*c0909341SAndroid Build Coastguard Worker    paddd                m7, m2     ; t31a
    ; same clamp-through-memory trick as above, this time for the spilled
    ; t30, which ends up clamped in m2
5650*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(clip_18b_min)]
5651*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5652*c0909341SAndroid Build Coastguard Worker    pmaxsd               m2, [r3+0*16]
5653*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m2
5654*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(clip_18b_max)]
5655*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5656*c0909341SAndroid Build Coastguard Worker    pminsd               m2, [r3+0*16]
    ; final slots 0, 1, 6, 7 = t16a, t17, t30, t31a
5657*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*0], m0
5658*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*1], m5
5659*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*6], m2
5660*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*7], m7
5661*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2048)]
5662*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
5663*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 3, 0, 5, 2, 7,    2, 3784 ; t19,  t28
    ; final slots 2..5 = t18a, t19, t28, t29a
5664*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*2], m4
5665*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*3], m6
5666*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*4], m3
5667*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*5], m1
5668*c0909341SAndroid Build Coastguard Worker%endif
5669*c0909341SAndroid Build Coastguard Worker    ret
; Shortcut entry into .main_oddhalf_part2 for the "lower half zero" case.
; Only m0-m3 are read as inputs. Each one is multiplied (pmulld) by two
; constants, which yields the same eight products in m0-m7 that the
; ITX_MULSUB_2D rotations of the full path produce when their other input
; is zero:
;   m7 = m0*pd_m601   m0 = m0*pd_4052
;   m4 = m3*pd_3290   m3 = m3*pd_2440
;   m6 = m1*pd_3973   m1 = m1*pd_995
;   m5 = m2*pd_m2106  m2 = m2*pd_3513
; (the pd_* tables are defined outside this view; the "m" prefix presumably
; marks a negated constant - confirm against the constant section).
; Control then continues at .main_oddhalf_part2_fast2.
5670*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part2_fast: ; lower half zero
5671*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m0, [o(pd_m601)]
5672*c0909341SAndroid Build Coastguard Worker    pmulld               m0, [o(pd_4052)]
5673*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3, [o(pd_3290)]
5674*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: reproduce the state expected at the x86-32 .main_oddhalf_part2_fast2
; label: m0 and the value destined for [r3+11*16] are already rounded
; (+pd_2048) and shifted right by 12, the still-unrounded m3 product is moved
; to m7, and m3 is left holding pd_2048 for the rounding done after the jump.
5675*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_2440)]
5676*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
5677*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m5}, m0, m7
5678*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m7
5679*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m7
5680*c0909341SAndroid Build Coastguard Worker    mova                 m7, m3
5681*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
5682*c0909341SAndroid Build Coastguard Worker%else
5683*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_2440)]
5684*c0909341SAndroid Build Coastguard Worker%endif
; m5 is free to be overwritten here: on x86-32 its pd_2048 copy now lives in m3.
5685*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1, [o(pd_3973)]
5686*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [o(pd_995)]
5687*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m2, [o(pd_m2106)]
5688*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_3513)]
5689*c0909341SAndroid Build Coastguard Worker    jmp .main_oddhalf_part2_fast2
; Second part of the idct32 odd half.
; Inputs: m0-m7 hold the coefficients named on the label line below, and
; [r3+16*0..7] hold the intermediates that the load comments in this routine
; label t16a, t17, t18a, t19, t28, t29a, t30 and t31a.
; Outputs: [r3+16*0..15] receive the values labelled t16 .. t31 in the
; comments (slot k holds t(16+k)).
; x86-64 path: m11 is used as the rounding addend, m12/m13 as the
; pmaxsd/pminsd clip bounds and m14 as the final multiplier. They are not
; loaded here - presumably pd_2048, clip_18b_min, clip_18b_max and pd_2896
; set up by the caller, mirroring what the x86-32 path loads explicitly
; (TODO confirm at the call sites). m8-m10 and m15 are overwritten.
; x86-32 path: only m0-m7 are used; [r3+8*16..11*16], [r3+14*16] and
; [r3+15*16] serve as spill slots before the final results are stored there.
; .main_oddhalf_part2_fast2 (one copy per architecture) is the join point
; for .main_oddhalf_part2_fast.
5690*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
5691*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5692*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 0, 8, 9, 10, _, 4052,  601 ; t23a, t24a
5693*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 6, 8, 9, 10, _,  995, 3973 ; t20a, t27a
5694*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
5695*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
; The fast path joins here with all eight products in m0-m7 still unrounded.
5696*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part2_fast2:
5697*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
5698*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
5699*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m4 ; t25
5700*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t24
5701*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m2 ; t26
5702*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2     ; t27
5703*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m5 ; t21
5704*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t20
5705*c0909341SAndroid Build Coastguard Worker    psubd                m5, m7, m3 ; t22
5706*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t23
5707*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
5708*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
5709*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_2276)]
5710*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_3406)]
5711*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 2, 3, 9, _, 11, 10, 15    ; t21a, t26a
5712*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
5713*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m6 ; t27a
5714*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; t24a
5715*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7, m1 ; t20a
5716*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1     ; t23a
5717*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5, m4 ; t21
5718*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t22
5719*c0909341SAndroid Build Coastguard Worker    psubd                m4, m8, m2 ; t26
5720*c0909341SAndroid Build Coastguard Worker    paddd                m8, m2     ; t25
5721*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
5722*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
5723*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_3784)]
5724*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_1567)]
5725*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
5726*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 6, 2, 9, _, 11, 10, 15, 4 ; t27,  t20
; Merge with the intermediates saved at [r3+16*0..7], two slots at a time.
5727*c0909341SAndroid Build Coastguard Worker    mova                 m9, [r3+16*0] ; t16a
5728*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*1] ; t17
5729*c0909341SAndroid Build Coastguard Worker    psubd                m2, m9, m7    ; t23
5730*c0909341SAndroid Build Coastguard Worker    paddd                m9, m7        ; t16
5731*c0909341SAndroid Build Coastguard Worker    psubd                m7, m10, m5   ; t22a
5732*c0909341SAndroid Build Coastguard Worker    paddd               m10, m5        ; t17a
5733*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m9, m10, m2, m7
5734*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m9, m10, m2, m7
5735*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*0], m9
5736*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*1], m10
5737*c0909341SAndroid Build Coastguard Worker    mova                 m9, [r3+16*2] ; t18a
5738*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*3] ; t19
5739*c0909341SAndroid Build Coastguard Worker    psubd                m5, m9, m1    ; t21
5740*c0909341SAndroid Build Coastguard Worker    paddd                m9, m1        ; t18
5741*c0909341SAndroid Build Coastguard Worker    psubd                m1, m10, m6   ; t20a
5742*c0909341SAndroid Build Coastguard Worker    paddd               m10, m6        ; t19a
5743*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m9, m10, m5, m1
5744*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m9, m10, m5, m1
5745*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*2], m9
5746*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*3], m10
5747*c0909341SAndroid Build Coastguard Worker    mova                 m9, [r3+16*4] ; t28
5748*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*5] ; t29a
5749*c0909341SAndroid Build Coastguard Worker    psubd                m6, m9, m3    ; t27a
5750*c0909341SAndroid Build Coastguard Worker    paddd                m9, m3        ; t28a
5751*c0909341SAndroid Build Coastguard Worker    psubd                m3, m10, m4   ; t26
5752*c0909341SAndroid Build Coastguard Worker    paddd               m10, m4        ; t29
5753*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m9, m10, m6, m3
5754*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m9, m10, m6, m3
; Each operand is multiplied by m14 first; the rounding addend is added to
; only one operand of each pair, so every sum/difference below gets it once
; before the shift by 12.
5755*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m6, m3, m1, m5
5756*c0909341SAndroid Build Coastguard Worker    paddd                m6, m11
5757*c0909341SAndroid Build Coastguard Worker    paddd                m3, m11
5758*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m1    ; t20
5759*c0909341SAndroid Build Coastguard Worker    paddd                m6, m1        ; t27
5760*c0909341SAndroid Build Coastguard Worker    psubd                m1, m3, m5    ; t21a
5761*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5        ; t26a
5762*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m4, m1, m3, m6
5763*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*4], m4
5764*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*5], m1
5765*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+16*6] ; t30
5766*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+16*7] ; t31a
5767*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4, m8    ; t25a
5768*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8        ; t30a
5769*c0909341SAndroid Build Coastguard Worker    psubd                m8, m1, m0    ; t24
5770*c0909341SAndroid Build Coastguard Worker    paddd                m1, m0        ; t31
5771*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m5, m4, m1
5772*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m5, m4, m1
5773*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m5, m8, m7, m2
5774*c0909341SAndroid Build Coastguard Worker    paddd                m5, m11
5775*c0909341SAndroid Build Coastguard Worker    paddd                m8, m11
5776*c0909341SAndroid Build Coastguard Worker    psubd                m0, m5, m7    ; t22
5777*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7        ; t25
5778*c0909341SAndroid Build Coastguard Worker    psubd                m7, m8, m2    ; t23a
5779*c0909341SAndroid Build Coastguard Worker    paddd                m2, m8        ; t24a
5780*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m0, m7, m2, m5
5781*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*6], m0
5782*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*7], m7
5783*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*8], m2
5784*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*9], m5
5785*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*10], m3
5786*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*11], m6
5787*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*12], m9
5788*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*13], m10
5789*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*14], m4
5790*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*15], m1
5791*c0909341SAndroid Build Coastguard Worker%else
; x86-32: with only eight vector registers, m2-m5 are spilled so they can be
; used as temporaries/rounding for the first two rotations, then swapped back.
5792*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m2
5793*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 9*16], m3
5794*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m4
5795*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m5
5796*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(pd_2048)]
; Only the first rotation is given a rounding register (m3); the others pass
; "_" and their results are rounded by the paddd/psrad at the fast2 label
; (presumably "_" disables rounding in ITX_MULSUB_2D - macro defined elsewhere).
5797*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 0, 2, 4, 5, 3, 4052,  601 ; t23a, t24a
5798*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 6, 2, 4, 5, _,  995, 3973 ; t20a, t27a
5799*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 8*16]
5800*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+10*16]
5801*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+11*16]
5802*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m0
5803*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m6
5804*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m7
5805*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+ 9*16]
5806*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 9*16], m1
5807*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
5808*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
5809*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
5810*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+ 9*16]
5811*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+10*16]
; Join point for the fast path. State here: m3 = pd_2048; m0 and the value at
; [r3+11*16] are used below without further rounding; m1, m2, m4-m7 are
; rounded and shifted now.
5812*c0909341SAndroid Build Coastguard Worker.main_oddhalf_part2_fast2:
5813*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m3}, m1, m2, m7, m4, m5, m6
5814*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m1, m2, m7, m4, m5, m6
5815*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m4 ; t25
5816*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m3
5817*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+11*16]
5818*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t24
5819*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m2 ; t26
5820*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2     ; t27
5821*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m5 ; t21
5822*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t20
5823*c0909341SAndroid Build Coastguard Worker    psubd                m5, m3, m7 ; t22
5824*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t23
; m3 carries the clip bound, so the spilled t25 at [r3+8*16] is clipped through
; memory: max(min_bound, t25) is written back, then pminsd against the max
; bound leaves the fully clipped t25 in m3.
5825*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
5826*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5827*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+ 8*16]
5828*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m3
5829*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
5830*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5831*c0909341SAndroid Build Coastguard Worker    pminsd               m3, [r3+ 8*16]
5832*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m0
5833*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 9*16], m1
5834*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m6
5835*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m7
5836*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2048)]
5837*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 2, 0, 1, 6, 7, 3406, 2276    ; t21a, t26a
5838*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 5, 0, 1, _, 7,    6, 2276, 4 ; t25a, t22a
5839*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5, m4 ; t21
5840*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t22
5841*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m2 ; t26
5842*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2     ; t25
5843*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
5844*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 9*16]
5845*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+10*16]
5846*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+11*16]
5847*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m3
5848*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m6 ; t27a
5849*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; t24a
5850*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7, m2 ; t20a
5851*c0909341SAndroid Build Coastguard Worker    paddd                m7, m2     ; t23a
; Same clip-through-memory sequence as above, this time with m2 as the bound
; register; the clipped t25 from [r3+8*16] ends up in m2.
5852*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(clip_18b_min)]
5853*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5854*c0909341SAndroid Build Coastguard Worker    pmaxsd               m2, [r3+ 8*16]
5855*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m2
5856*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(clip_18b_max)]
5857*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5858*c0909341SAndroid Build Coastguard Worker    pminsd               m2, [r3+ 8*16]
5859*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m0
5860*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 9*16], m2
5861*c0909341SAndroid Build Coastguard Worker    mova         [r3+14*16], m5
5862*c0909341SAndroid Build Coastguard Worker    mova         [r3+15*16], m7
5863*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
5864*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
5865*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 6, 2, 5, _, 0,    7, 3784, 4 ; t27,  t20
5866*c0909341SAndroid Build Coastguard Worker    mova         [r3+10*16], m3
; m0/m2 hold the clip bounds for the remaining butterflies of this path.
5867*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(clip_18b_min)]
5868*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(clip_18b_max)]
5869*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+16*2] ; t18a
5870*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+16*3] ; t19
5871*c0909341SAndroid Build Coastguard Worker    psubd                m3, m5, m1    ; t21
5872*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1        ; t18
5873*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m6    ; t20a
5874*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6        ; t19a
5875*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m5, m7, m3, m1
5876*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m2}, m5, m7, m3, m1
5877*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*2], m5
5878*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*3], m7
5879*c0909341SAndroid Build Coastguard Worker    mova         [r3+11*16], m3
5880*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+10*16]
5881*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+16*4] ; t28
5882*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+16*5] ; t29a
5883*c0909341SAndroid Build Coastguard Worker    psubd                m6, m5, m3    ; t27a
5884*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3        ; t28a
5885*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m4    ; t26
5886*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4        ; t29
5887*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m5, m7, m6, m3
5888*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m2}, m5, m7, m6, m3
5889*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*12], m5
5890*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*13], m7
; m7 = pd_2896 is not written again in this routine and is reused by the
; second pmulld group further down.
5891*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
5892*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2896)]
5893*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+11*16]
5894*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m7}, m6, m3, m1, m4
5895*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5
5896*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
5897*c0909341SAndroid Build Coastguard Worker    psubd                m5, m6, m1    ; t20
5898*c0909341SAndroid Build Coastguard Worker    paddd                m6, m1        ; t27
5899*c0909341SAndroid Build Coastguard Worker    psubd                m1, m3, m4    ; t21a
5900*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4        ; t26a
5901*c0909341SAndroid Build Coastguard Worker    REPX     {psrad  x, 12}, m5, m1, m3, m6
5902*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*4], m5
5903*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*5], m1
5904*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*10], m3
5905*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*11], m6
5906*c0909341SAndroid Build Coastguard Worker
; Reload the t22 / t23a values parked at [r3+14*16] / [r3+15*16] earlier.
5907*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+14*16]
5908*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+15*16]
5909*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+16*0] ; t16a
5910*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+16*1] ; t17
5911*c0909341SAndroid Build Coastguard Worker    psubd                m1, m3, m6    ; t23
5912*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6        ; t16
5913*c0909341SAndroid Build Coastguard Worker    psubd                m6, m4, m5    ; t22a
5914*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5        ; t17a
5915*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m3, m4, m1, m6
5916*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m2}, m3, m4, m1, m6
5917*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*0], m3
5918*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*1], m4
5919*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+ 8*16]
5920*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+ 9*16]
5921*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 8*16], m1
5922*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 9*16], m6
5923*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+16*6] ; t30
5924*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+16*7] ; t31a
5925*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m5    ; t24
5926*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5        ; t31
5927*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4, m3    ; t25a
5928*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3        ; t30a
5929*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m6, m5, m4, m1
5930*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m2}, m6, m5, m4, m1
5931*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*14], m4
5932*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*15], m1
5933*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_2048)]
5934*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+ 9*16]
5935*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 8*16]
5936*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m7}, m5, m6, m1, m2
5937*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
5938*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
5939*c0909341SAndroid Build Coastguard Worker    psubd                m0, m5, m1    ; t22
5940*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1        ; t25
5941*c0909341SAndroid Build Coastguard Worker    psubd                m1, m6, m2    ; t23a
5942*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6        ; t24a
5943*c0909341SAndroid Build Coastguard Worker    REPX     {psrad  x, 12}, m0, m1, m2, m5
5944*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*6], m0
5945*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*7], m1
5946*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*8], m2
5947*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*9], m5
5948*c0909341SAndroid Build Coastguard Worker%endif
5949*c0909341SAndroid Build Coastguard Worker    ret
5950*c0909341SAndroid Build Coastguard Worker
5951*c0909341SAndroid Build Coastguard Worker    ; final sumsub for idct16 as well as idct32, plus final downshift
; Arguments (all but the last are vector register numbers):
;   %1     in/out register; its number n is also the row index used to form
;          the [r3+16*(...)] addresses below
;   %2-%4  output registers
;   %5     scratch register
;   %6     immediate right-shift count applied to the four results
; Reads [r3+16*(23-n)], [r3+16*n] and [r3+16*(15-n)].
; Results, using the labels from the inline comments:
;   m%1 = out(0+n), m%2 = out(15-n), m%3 = out(16+n), m%4 = out(31-n)
; Implicit register inputs: m12/m13 are the pmaxsd/pminsd clip bounds and m11
; is added to both intermediate values before the final shift.
; %3 and %4 are written before m11-m13 are last read and must not alias them;
; %2 and %5 are written only afterwards, so a caller may pass 11-13 there if
; it no longer needs those constants.
5952*c0909341SAndroid Build Coastguard Worker%macro IDCT32_END 6 ; in/out1 (register number doubles as idx n), out2-4, tmp, shift
5953*c0909341SAndroid Build Coastguard Worker    mova                m%4, [r3+16*(23-%1)]
5954*c0909341SAndroid Build Coastguard Worker    pmaxsd              m%1, m12
5955*c0909341SAndroid Build Coastguard Worker    pminsd              m%1, m13
5956*c0909341SAndroid Build Coastguard Worker    psubd               m%3, m%1, m%4 ; idct16 out15 - n
5957*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%4      ; idct16 out0  + n
5958*c0909341SAndroid Build Coastguard Worker    pmaxsd              m%1, m12
5959*c0909341SAndroid Build Coastguard Worker    pmaxsd              m%3, m12
5960*c0909341SAndroid Build Coastguard Worker    pminsd              m%1, m13
5961*c0909341SAndroid Build Coastguard Worker    pminsd              m%3, m13
5962*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m11
5963*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m11
5964*c0909341SAndroid Build Coastguard Worker    mova                m%5, [r3+16*( 0+%1)]
5965*c0909341SAndroid Build Coastguard Worker    mova                m%2, [r3+16*(15-%1)]
5966*c0909341SAndroid Build Coastguard Worker    psubd               m%4, m%1, m%2 ; out31 - n
5967*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%2      ; out0  + n
5968*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%3, m%5 ; out15 - n
5969*c0909341SAndroid Build Coastguard Worker    psubd               m%3, m%5      ; out16 + n
5970*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, %6}, m%1, m%3, m%2, m%4
5971*c0909341SAndroid Build Coastguard Worker%endmacro
5972*c0909341SAndroid Build Coastguard Worker
; Final butterfly + rounding stage producing the 32 idct32 outputs, with a
; right shift of 2.
; Input: m0-m7 plus the values at [r3+16*0..15] and the eight further slots
; read per row (see each path below).
;
; x86-64: runs IDCT32_END for rows n = 0..7 and packs neighbouring output
; pairs to signed words (packssdw). On return:
;   registers: m0 = (0 1), m2 = (2 3), m4 = (4 5), m6 = (6 7),
;              m8 = (16 17), m10 = (18 19), m12 = (20 21), m14 = (22 23)
;   memory:    [r3+16*8]  = (24 25), [r3+16*9]  = (8 9),
;              [r3+16*10] = (26 27), [r3+16*11] = (10 11),
;              [r3+16*12] = (28 29), [r3+16*13] = (12 13),
;              [r3+16*14] = (30 31), [r3+16*15] = (14 15)
;   m11, m12 and m13 no longer hold their rounding/clip constants.
;
; x86-32: spills m0-m7 to [r3+16*16..23] and processes one row per loop
; iteration, leaving unpacked dword results in memory. r3 is advanced by 16
; per iteration (eight iterations), so it returns 8*16 bytes above its entry
; value; r4 is clobbered.
5973*c0909341SAndroid Build Coastguard Worker.round_dct32:
5974*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5975*c0909341SAndroid Build Coastguard Worker    psrld               m11, 10 ; pd_2
5976*c0909341SAndroid Build Coastguard Worker    IDCT32_END            0, 15, 8, 9, 10, 2    ; 0 15 16 31
; m6/m7 are needed as output registers next; park them in slots that row 0 has
; already consumed ([r3+16*0] and [r3+16*23]) until rows 6/7 are processed.
5977*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m6
5978*c0909341SAndroid Build Coastguard Worker    mova         [r3+23*16], m7
5979*c0909341SAndroid Build Coastguard Worker    IDCT32_END            1, 14, 6, 7, 10, 2    ; 1 14 17 30
5980*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1       ;  0  1
5981*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15      ; 14 15
5982*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m6       ; 16 17
5983*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m9       ; 30 31
5984*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*15], m14
5985*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*14], m7
5986*c0909341SAndroid Build Coastguard Worker    IDCT32_END            2, 15, 10, 7, 6, 2    ; 2 13 18 29
5987*c0909341SAndroid Build Coastguard Worker    IDCT32_END            3, 14,  1, 9, 6, 2    ; 3 12 19 28
5988*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3       ;  2  3
5989*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15      ; 12 13
5990*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m1       ; 18 19
5991*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m7       ; 28 29
5992*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*13], m14
5993*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*12], m9
5994*c0909341SAndroid Build Coastguard Worker    IDCT32_END            4, 15, 1, 7, 6, 2     ; 4 11 20 27
5995*c0909341SAndroid Build Coastguard Worker    IDCT32_END            5, 14, 3, 9, 6, 2     ; 5 10 21 26
5996*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5       ;  4  5
5997*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15      ; 10 11
5998*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3       ; 20 21
5999*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m7       ; 26 27
6000*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*11], m14
6001*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*10], m9
6002*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+ 0*16]
6003*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+23*16]
6004*c0909341SAndroid Build Coastguard Worker    IDCT32_END            6, 15, 14, 5,  3, 2   ; 6 9 22 25
; Last row: m11 (rounding) and m13 (clip max) are handed to the macro as
; out2/tmp, which it only writes after its final use of those constants.
6005*c0909341SAndroid Build Coastguard Worker    IDCT32_END            7, 11,  3, 9, 13, 2   ; 7 8 23 24
6006*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7       ;  6  7
6007*c0909341SAndroid Build Coastguard Worker    packssdw            m11, m15      ;  8  9
6008*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m3       ; 22 23
6009*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m5       ; 24 25
6010*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*9], m11
6011*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*8], m9
; Move the (20 21) pair out of m1 into m12, overwriting the clip-min constant.
6012*c0909341SAndroid Build Coastguard Worker    mova                m12, m1
6013*c0909341SAndroid Build Coastguard Worker    ret
6014*c0909341SAndroid Build Coastguard Worker%else
6015*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*16], m0
6016*c0909341SAndroid Build Coastguard Worker    mova         [r3+17*16], m1
6017*c0909341SAndroid Build Coastguard Worker    mova         [r3+18*16], m2
6018*c0909341SAndroid Build Coastguard Worker    mova         [r3+19*16], m3
6019*c0909341SAndroid Build Coastguard Worker    mova         [r3+20*16], m4
6020*c0909341SAndroid Build Coastguard Worker    mova         [r3+21*16], m5
6021*c0909341SAndroid Build Coastguard Worker    mova         [r3+22*16], m6
6022*c0909341SAndroid Build Coastguard Worker    mova         [r3+23*16], m7
; Loop-invariant constants: m1 = rounding addend, m2/m3 = clip bounds.
6023*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(pd_2)]
6024*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(clip_18b_min)]
6025*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
6026*c0909341SAndroid Build Coastguard Worker
; r4 is the byte offset from the moving r3 to the mirrored slot (15-n): it
; starts at 15*16 and drops by 32 per iteration while r3 rises by 16, so the
; loop runs for r4 = 240, 208, ..., 16 (eight iterations).
6027*c0909341SAndroid Build Coastguard Worker    mov                  r4, 15*16
6028*c0909341SAndroid Build Coastguard Worker.loop_dct32_end:
6029*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+16*16]
6030*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+16*24]
6031*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m2
6032*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m3
6033*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m6 ; idct16 out15 - n
6034*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; idct16 out0  + n
6035*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m2
6036*c0909341SAndroid Build Coastguard Worker    pmaxsd               m5, m2
6037*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m3
6038*c0909341SAndroid Build Coastguard Worker    pminsd               m5, m3
6039*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
6040*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
6041*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3]
6042*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+r4]
6043*c0909341SAndroid Build Coastguard Worker    psubd                m6, m0, m4 ; out31 - n
6044*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; out0  + n
6045*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5, m7 ; out15 - n
6046*c0909341SAndroid Build Coastguard Worker    psubd                m5, m7     ; out16 + n
6047*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0, m5, m4, m6
; Results overwrite the four slots that were just read.
6048*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
6049*c0909341SAndroid Build Coastguard Worker    mova            [r3+r4], m4
6050*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*16], m5
6051*c0909341SAndroid Build Coastguard Worker    mova         [r3+24*16], m6
6052*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
6053*c0909341SAndroid Build Coastguard Worker    sub                  r4, 32
6054*c0909341SAndroid Build Coastguard Worker    jg .loop_dct32_end
6055*c0909341SAndroid Build Coastguard Worker    ret
6056*c0909341SAndroid Build Coastguard Worker%endif
6057*c0909341SAndroid Build Coastguard Worker
6058*c0909341SAndroid Build Coastguard Worker.dconly:
6059*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
6060*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
6061*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8
6062*c0909341SAndroid Build Coastguard Worker.dconly1:
6063*c0909341SAndroid Build Coastguard Worker    add                 r5d, 640
6064*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 10
6065*c0909341SAndroid Build Coastguard Worker.dconly2:
6066*c0909341SAndroid Build Coastguard Worker    imul                r5d, 2896
6067*c0909341SAndroid Build Coastguard Worker    add                 r5d, 34816
6068*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5d
6069*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1111
6070*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
6071*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pixel_10bpc_max)]
6072*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
6073*c0909341SAndroid Build Coastguard Worker.dconly_loop:
6074*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*0]
6075*c0909341SAndroid Build Coastguard Worker    mova                 m2, [dstq+16*1]
6076*c0909341SAndroid Build Coastguard Worker    mova                 m3, [dstq+16*2]
6077*c0909341SAndroid Build Coastguard Worker    mova                 m4, [dstq+16*3]
6078*c0909341SAndroid Build Coastguard Worker    REPX     {paddw  x, m0}, m1, m2, m3, m4
6079*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m6}, m1, m2, m3, m4
6080*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
6081*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m1
6082*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m2
6083*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m3
6084*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m4
6085*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
6086*c0909341SAndroid Build Coastguard Worker    dec                 r3d
6087*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
6088*c0909341SAndroid Build Coastguard Worker    RET
6089*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x16_16bpc(dst, stride, c, eob)
;
; Inverse DCT/DCT transform-and-add for a 32x16 block, 16bpc variant.
; Named arguments: dst, stride, c (coefficient buffer, accessed as cq),
; eob (accessed as eobd/eobw).
; Reserves (24 + 8*ARCH_X86_32)*16 bytes of stack scratch, i.e. 24
; 16-byte slots on x86-64 and 32 slots on x86-32.
;
; Flow as visible in this block:
;   eob == 0  -> .dconly (DC-only shortcut, finishes in the 32x8 .dconly2)
;   otherwise -> .zero_loop picks the starting r5, .loop_pass1 runs the
;                first pass in place over cq, then .pass2 hands over to
;                idct_16x16_internal_16bpc.loop_pass2.
; NOTE(review): the exact meaning of the "4, 7, 16" cglobal arguments
; (arg/GPR/XMM counts) follows x86inc conventions - confirm there.
;-----------------------------------------------------------------------
6090*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
6091*c0909341SAndroid Build Coastguard Worker                                         dst, stride, c, eob
6092*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
6093*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
6094*c0909341SAndroid Build Coastguard Worker    jz .dconly
6095*c0909341SAndroid Build Coastguard Worker
6096*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
6097*c0909341SAndroid Build Coastguard Worker%undef cmp
; r5 is a byte offset into the word table tbl_32x16_2d. It starts at 6
; (8 minus the first decrement) and keeps stepping down by 2 while eob is
; (signed) below the threshold stored at that offset. The resulting r5 is
; the first, highest, index processed by .loop_pass1.
6098*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 8
6099*c0909341SAndroid Build Coastguard Worker.zero_loop:
6100*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
6101*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x16_2d)+r5]
6102*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
6103*c0909341SAndroid Build Coastguard Worker
6104*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
; Each iteration handles one 16-byte column slice at cq+r5*8 (r5 steps by
; 2, so slices are 16 bytes apart) across 32 coefficient rows spaced 64
; bytes apart. The rows are fed to the helpers in four groups of eight;
; every group goes through rect2_mul first.
; NOTE(review): rect2_mul is defined elsewhere; presumably the
; rectangular-block 1/sqrt(2) scaling - confirm.
6105*c0909341SAndroid Build Coastguard Worker.loop_pass1:
6106*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; Constants kept in high registers for the helpers called below.
6107*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
6108*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
6109*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
6110*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
6111*c0909341SAndroid Build Coastguard Worker%endif
; Group 1: rows 1, 7, 9, 15, 17, 23, 25, 31 -> main_oddhalf_part1.
6112*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 1+r5*8]
6113*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 7+r5*8]
6114*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 9+r5*8]
6115*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*15+r5*8]
6116*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*17+r5*8]
6117*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*23+r5*8]
6118*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*25+r5*8]
6119*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*31+r5*8]
; r3 = base of the stack scratch area used by the helpers.
6120*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
6121*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6122*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
6123*c0909341SAndroid Build Coastguard Worker
; Group 2: rows 3, 5, 11, 13, 19, 21, 27, 29 -> main_oddhalf_part2.
6124*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 3+r5*8]
6125*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 5+r5*8]
6126*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*11+r5*8]
6127*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*13+r5*8]
6128*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*19+r5*8]
6129*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*21+r5*8]
6130*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*27+r5*8]
6131*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*29+r5*8]
; On x86-32 only, r3 is moved up by 8 slots for the duration of this
; rect2_mul call and restored right after.
; NOTE(review): presumably so rect2_mul's own scratch use does not
; overwrite part1's results in the first 8 slots - confirm in rect2_mul.
6132*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6133*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*8
6134*c0909341SAndroid Build Coastguard Worker%endif
6135*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6136*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6137*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*8
6138*c0909341SAndroid Build Coastguard Worker%endif
6139*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
; Advance r3 by 16 slots (20 on x86-32) for the idct16/idct8 stages below;
; the same amount is subtracted again before .round_dct32.
6140*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*(16+4*ARCH_X86_32)
6141*c0909341SAndroid Build Coastguard Worker
; Group 3: rows 2, 6, 10, ..., 30 -> idct16 main_oddhalf.
6142*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 2+r5*8]
6143*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 6+r5*8]
6144*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*10+r5*8]
6145*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*14+r5*8]
6146*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*18+r5*8]
6147*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*22+r5*8]
6148*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*26+r5*8]
6149*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*30+r5*8]
6150*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6151*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
6152*c0909341SAndroid Build Coastguard Worker
; Group 4: rows 0, 4, 8, ..., 28 -> idct8 main_pass1 + round.
6153*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0+r5*8]
6154*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 4+r5*8]
6155*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 8+r5*8]
6156*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*12+r5*8]
6157*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*16+r5*8]
6158*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*20+r5*8]
6159*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*24+r5*8]
6160*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*28+r5*8]
6161*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6162*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
6163*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
6164*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*(16+4*ARCH_X86_32)
; Final idct32 combine/round step (local helper defined further down).
6165*c0909341SAndroid Build Coastguard Worker    call .round_dct32
6166*c0909341SAndroid Build Coastguard Worker
; Transpose the packed 16-bit results and write them back over the first
; 16 rows of cq (rows 0-3 are stored by the shared tail after %endif).
6167*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64: part of the output is still live in registers; the remaining
; packed pairs are reloaded from the slots .round_dct32 stored them in
; (see the pair numbers in the trailing comments).
6168*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6169*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6170*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 8+r5*8], m8
6171*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 9+r5*8], m9
6172*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*10+r5*8], m10
6173*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*11+r5*8], m11
6174*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 9] ;  8  9
6175*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*11] ; 10 11
6176*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*13] ; 12 13
6177*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*15] ; 14 15
6178*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6179*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 4+r5*8], m8
6180*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 5+r5*8], m9
6181*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 6+r5*8], m10
6182*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 7+r5*8], m11
6183*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 8] ; 24 25
6184*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*10] ; 26 27
6185*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*12] ; 28 29
6186*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*14] ; 30 31
6187*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6188*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*12+r5*8], m8
6189*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*13+r5*8], m9
6190*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*14+r5*8], m10
6191*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*15+r5*8], m11
6192*c0909341SAndroid Build Coastguard Worker%else
; x86-32: .round_dct32 left all 32 dword vectors in the scratch area and
; advanced r3 by 8 slots, so rewind r3 first. Each group of 8 vectors is
; then packed to 16 bit (packssdw with the neighbouring slot), transposed
; and stored to cq. The slots 24..31 group is read in descending order.
6193*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16
6194*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
6195*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+10*16]
6196*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+12*16]
6197*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+14*16]
6198*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 9*16]
6199*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+11*16]
6200*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+13*16]
6201*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+15*16]
6202*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6203*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 4+r5*8], m0
6204*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 5+r5*8], m1
6205*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 6+r5*8], m2
6206*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 7+r5*8], m3
6207*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+16*16]
6208*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+18*16]
6209*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+20*16]
6210*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+22*16]
6211*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+17*16]
6212*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+19*16]
6213*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+21*16]
6214*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+23*16]
6215*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6216*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 8+r5*8], m0
6217*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 9+r5*8], m1
6218*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*10+r5*8], m2
6219*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*11+r5*8], m3
6220*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+31*16]
6221*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+29*16]
6222*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+27*16]
6223*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+25*16]
6224*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+30*16]
6225*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+28*16]
6226*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+26*16]
6227*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+24*16]
6228*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6229*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*12+r5*8], m0
6230*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*13+r5*8], m1
6231*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*14+r5*8], m2
6232*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*15+r5*8], m3
6233*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 0*16]
6234*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 2*16]
6235*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+ 4*16]
6236*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+ 6*16]
6237*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 1*16]
6238*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+ 3*16]
6239*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+ 5*16]
6240*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+ 7*16]
6241*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6242*c0909341SAndroid Build Coastguard Worker%endif
; Shared tail for both architectures: store rows 0-3, then clear rows
; 16-31 of this column slice, which were consumed as input above.
6243*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 0+r5*8], m0
6244*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 1+r5*8], m1
6245*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 2+r5*8], m2
6246*c0909341SAndroid Build Coastguard Worker    mova    [cq+64* 3+r5*8], m3
6247*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
6248*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
6249*c0909341SAndroid Build Coastguard Worker                                    24, 25, 26, 27, 28, 29, 30, 31
; Next slice: step r5 down by 2 and loop while it is still >= 0.
6250*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
6251*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
6252*c0909341SAndroid Build Coastguard Worker
6253*c0909341SAndroid Build Coastguard Worker    ; pass=2, we need to call this otherwise the stack pointer has
6254*c0909341SAndroid Build Coastguard Worker    ; the wrong offset in the 8-bit code
6255*c0909341SAndroid Build Coastguard Worker    call .pass2
6256*c0909341SAndroid Build Coastguard Worker    RET
6257*c0909341SAndroid Build Coastguard Worker
; .pass2: set up registers for the second pass and tail-jump into the
; shared loop in idct_16x16_internal_16bpc. Reached via call, so the rsp
; offsets used here sit one return address above the function's frame.
;   x86-64: m8 = pw_2048, m9 = 0, m10 = pixel_10bpc_max, r7 = dst
;           (on WIN64 the old r7 is first saved at [rsp+16*16+gprsize]).
;   x86-32: dst is saved at [rsp+2*gprsize+16*16].
;   both:   r3 = stride*3, r4d = 4.
; NOTE(review): r4d is presumably the iteration count for loop_pass2 and
; the saved values are presumably restored there - confirm in that routine.
6258*c0909341SAndroid Build Coastguard Worker.pass2:
6259*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6260*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
6261*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
6262*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
6263*c0909341SAndroid Build Coastguard Worker%if WIN64
6264*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize], r7
6265*c0909341SAndroid Build Coastguard Worker%endif
6266*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
6267*c0909341SAndroid Build Coastguard Worker%else
6268*c0909341SAndroid Build Coastguard Worker    mov [rsp+2*gprsize+16*16], dstq
6269*c0909341SAndroid Build Coastguard Worker%endif
6270*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
6271*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 4
6272*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_16bpc).loop_pass2
6273*c0909341SAndroid Build Coastguard Worker
; .round_dct32: last idct32 stage of pass 1 for one column slice.
; Called with r3 = scratch base. Also called from the 32x32 function
; further down in this file.
;   x86-64: m11 (pd_2048 on entry from .loop_pass1) is shifted right by 11
;           to form pd_1, then IDCT32_END is applied per output quadruple
;           and the results are packed pairwise to 16 bit. Pairs 0..7 and
;           16..23 stay in registers; pairs 8..15 and 24..31 are written
;           to slots r3+16*8 .. r3+16*15 as annotated on each line.
;           Slots 0 and 23 hold m6/m7 temporarily across the macro calls.
;   x86-32: see the loop below.
; NOTE(review): IDCT32_END is a macro defined outside this chunk; its
; operand roles are not visible here.
6274*c0909341SAndroid Build Coastguard Worker.round_dct32:
6275*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6276*c0909341SAndroid Build Coastguard Worker    psrld               m11, 11 ; pd_1
6277*c0909341SAndroid Build Coastguard Worker    IDCT32_END            0, 15, 8, 9, 10, 1    ; 0 15 16 31
6278*c0909341SAndroid Build Coastguard Worker    mova         [r3+ 0*16], m6
6279*c0909341SAndroid Build Coastguard Worker    mova         [r3+23*16], m7
6280*c0909341SAndroid Build Coastguard Worker    IDCT32_END            1, 14, 6, 7, 10, 1    ; 1 14 17 30
6281*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1       ;  0  1
6282*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15      ; 14 15
6283*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m6       ; 16 17
6284*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m9       ; 30 31
6285*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*15], m14
6286*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*14], m7
6287*c0909341SAndroid Build Coastguard Worker    IDCT32_END            2, 15, 10, 7, 6, 1    ; 2 13 18 29
6288*c0909341SAndroid Build Coastguard Worker    IDCT32_END            3, 14,  1, 9, 6, 1    ; 3 12 19 28
6289*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3       ;  2  3
6290*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15      ; 12 13
6291*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m1       ; 18 19
6292*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m7       ; 28 29
6293*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*13], m14
6294*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*12], m9
6295*c0909341SAndroid Build Coastguard Worker    IDCT32_END            4, 15, 1, 7, 6, 1     ; 4 11 20 27
6296*c0909341SAndroid Build Coastguard Worker    IDCT32_END            5, 14, 3, 9, 6, 1     ; 5 10 21 26
6297*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5       ;  4  5
6298*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15      ; 10 11
6299*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3       ; 20 21
6300*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m7       ; 26 27
6301*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*11], m14
6302*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*10], m9
; Reload the m6/m7 values parked in slots 0 and 23 above.
6303*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+ 0*16]
6304*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+23*16]
6305*c0909341SAndroid Build Coastguard Worker    IDCT32_END            6, 15, 14, 5,  3, 1   ; 6 9 22 25
6306*c0909341SAndroid Build Coastguard Worker    IDCT32_END            7, 11,  3, 9, 13, 1   ; 7 8 23 24
6307*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7       ;  6  7
6308*c0909341SAndroid Build Coastguard Worker    packssdw            m11, m15      ;  8  9
6309*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m3       ; 22 23
6310*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m5       ; 24 25
6311*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*9], m11
6312*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*8], m9
; Pair 20/21 is moved from m1 into m12.
6313*c0909341SAndroid Build Coastguard Worker    mova                m12, m1
6314*c0909341SAndroid Build Coastguard Worker    ret
6315*c0909341SAndroid Build Coastguard Worker%else
; x86-32: spill the 8 live vectors to slots 16..23, then run 8 loop
; iterations (r4 = 15*16, 13*16, ..., 1*16 while r3 advances by one slot
; per iteration). r3 is therefore 8 slots higher on return.
; Per iteration:
;   - form sum/difference of slots [r3+16*16] and [r3+16*24]
;   - clamp both to [clip_18b_min, clip_18b_max]
;   - add 1 (psubd with m1 = -1) as the rounding bias
;   - add/subtract the slots at [r3+r4] and [r3]
;   - arithmetic shift right by 1 and write back to the same four slots
6316*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*16], m0
6317*c0909341SAndroid Build Coastguard Worker    mova         [r3+17*16], m1
6318*c0909341SAndroid Build Coastguard Worker    mova         [r3+18*16], m2
6319*c0909341SAndroid Build Coastguard Worker    mova         [r3+19*16], m3
6320*c0909341SAndroid Build Coastguard Worker    mova         [r3+20*16], m4
6321*c0909341SAndroid Build Coastguard Worker    mova         [r3+21*16], m5
6322*c0909341SAndroid Build Coastguard Worker    mova         [r3+22*16], m6
6323*c0909341SAndroid Build Coastguard Worker    mova         [r3+23*16], m7
6324*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m1, m1     ; -1
6325*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(clip_18b_min)]
6326*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
6327*c0909341SAndroid Build Coastguard Worker
6328*c0909341SAndroid Build Coastguard Worker    mov                  r4, 15*16
6329*c0909341SAndroid Build Coastguard Worker.loop_dct32_end:
6330*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+16*16]
6331*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+16*24]
6332*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m6 ; idct16 out15 - n
6333*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; idct16 out0  + n
6334*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m2
6335*c0909341SAndroid Build Coastguard Worker    pmaxsd               m5, m2
6336*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m3
6337*c0909341SAndroid Build Coastguard Worker    pminsd               m5, m3
; Subtracting -1 adds the +1 rounding bias for the >>1 below.
6338*c0909341SAndroid Build Coastguard Worker    psubd                m0, m1
6339*c0909341SAndroid Build Coastguard Worker    psubd                m5, m1
6340*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3]
6341*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+r4]
6342*c0909341SAndroid Build Coastguard Worker    psubd                m6, m0, m4 ; out31 - n
6343*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; out0  + n
6344*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5, m7 ; out15 - n
6345*c0909341SAndroid Build Coastguard Worker    psubd                m5, m7     ; out16 + n
6346*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0, m5, m4, m6
6347*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
6348*c0909341SAndroid Build Coastguard Worker    mova            [r3+r4], m4
6349*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*16], m5
6350*c0909341SAndroid Build Coastguard Worker    mova         [r3+24*16], m6
6351*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
6352*c0909341SAndroid Build Coastguard Worker    sub                  r4, 32
6353*c0909341SAndroid Build Coastguard Worker    jg .loop_dct32_end
6354*c0909341SAndroid Build Coastguard Worker    ret
6355*c0909341SAndroid Build Coastguard Worker%endif
6356*c0909341SAndroid Build Coastguard Worker
; .dconly: taken when eob == 0, so only the first coefficient is read.
; Scalar steps on r5d:
;   r5d = (c[0]*181 + 128) >> 8
;   r5d = (r5d*181 + 384) >> 9
; c[0] is cleared by storing eobd, which is 0 on this path. r3d = 16 is
; the row count consumed by the shared 32x8 .dconly2 / .dconly_loop tail,
; which does the final scaling, broadcast, add to dst and clamp.
; NOTE(review): 181/256 is approximately 1/sqrt(2); the first step is
; presumably the rect2 scale and the second the pass-1 scale with
; rounding - confirm against the C reference.
6357*c0909341SAndroid Build Coastguard Worker.dconly:
6358*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
6359*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
6360*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16
6361*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
6362*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
6363*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
6364*c0909341SAndroid Build Coastguard Worker    add                 r5d, 384
6365*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 9
6366*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
6367*c0909341SAndroid Build Coastguard Worker
6368*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
6369*c0909341SAndroid Build Coastguard Worker                                          dst, stride, c, eob
6370*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
6371*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
6372*c0909341SAndroid Build Coastguard Worker    jz .dconly
6373*c0909341SAndroid Build Coastguard Worker
6374*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
6375*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6376*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*32*16+1*gprsize], dstq
6377*c0909341SAndroid Build Coastguard Worker%elif WIN64
6378*c0909341SAndroid Build Coastguard Worker    mov [rsp+5*32*16+1*gprsize], r7
6379*c0909341SAndroid Build Coastguard Worker%endif
6380*c0909341SAndroid Build Coastguard Worker%undef cmp
6381*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 14
6382*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
6383*c0909341SAndroid Build Coastguard Worker    jge .end_zero_loop
6384*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
6385*c0909341SAndroid Build Coastguard Worker.zero_loop:
6386*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
6387*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
6388*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
6389*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+0*32*16], m0
6390*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+0*32*16], m0
6391*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+0*32*16], m0
6392*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+0*32*16], m0
6393*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+1*32*16], m0
6394*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+1*32*16], m0
6395*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+1*32*16], m0
6396*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+1*32*16], m0
6397*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+2*32*16], m0
6398*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+2*32*16], m0
6399*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+2*32*16], m0
6400*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+2*32*16], m0
6401*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+3*32*16], m0
6402*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+3*32*16], m0
6403*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+3*32*16], m0
6404*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+3*32*16], m0
6405*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
6406*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
6407*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
6408*c0909341SAndroid Build Coastguard Worker.end_zero_loop:
6409*c0909341SAndroid Build Coastguard Worker
6410*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
6411*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*0+5*32*16], eobd
6412*c0909341SAndroid Build Coastguard Worker.loop_pass1:
6413*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 1+r5*8]
6414*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 7+r5*8]
6415*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 9+r5*8]
6416*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*15+r5*8]
6417*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*17+r5*8]
6418*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*23+r5*8]
6419*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*25+r5*8]
6420*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*31+r5*8]
6421*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6422*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
6423*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
6424*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
6425*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
6426*c0909341SAndroid Build Coastguard Worker%endif
6427*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
6428*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
6429*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 3+r5*8]
6430*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 5+r5*8]
6431*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*11+r5*8]
6432*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*13+r5*8]
6433*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*19+r5*8]
6434*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*21+r5*8]
6435*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*27+r5*8]
6436*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*29+r5*8]
6437*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
6438*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 2+r5*8]
6439*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 6+r5*8]
6440*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*10+r5*8]
6441*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*14+r5*8]
6442*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*18+r5*8]
6443*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*22+r5*8]
6444*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*26+r5*8]
6445*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*30+r5*8]
6446*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*(16+4*ARCH_X86_32)
6447*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
6448*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0+r5*8]
6449*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 4+r5*8]
6450*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 8+r5*8]
6451*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*12+r5*8]
6452*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*16+r5*8]
6453*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*20+r5*8]
6454*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*24+r5*8]
6455*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*28+r5*8]
6456*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
6457*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
6458*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*(16+4*ARCH_X86_32)
6459*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
6460*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
6461*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
6462*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
6463*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6464*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6465*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6466*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+2*32*16], m8
6467*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+2*32*16], m10
6468*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+2*32*16], m9
6469*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+2*32*16], m11
6470*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 9] ;  8  9
6471*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*11] ; 10 11
6472*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*13] ; 12 13
6473*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*15] ; 14 15
6474*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6475*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+1*32*16], m8
6476*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+1*32*16], m10
6477*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+1*32*16], m9
6478*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+1*32*16], m11
6479*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 8] ; 24 25
6480*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*10] ; 26 27
6481*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*12] ; 28 29
6482*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*14] ; 30 31
6483*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6484*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+3*32*16], m8
6485*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+3*32*16], m10
6486*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+3*32*16], m9
6487*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+3*32*16], m11
6488*c0909341SAndroid Build Coastguard Worker%else
6489*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16
6490*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
6491*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+10*16]
6492*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+12*16]
6493*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+14*16]
6494*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 9*16]
6495*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+11*16]
6496*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+13*16]
6497*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+15*16]
6498*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6499*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+1*32*16], m0
6500*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+1*32*16], m2
6501*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+1*32*16], m1
6502*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+1*32*16], m3
6503*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+16*16]
6504*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+18*16]
6505*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+20*16]
6506*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+22*16]
6507*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+17*16]
6508*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+19*16]
6509*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+21*16]
6510*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+23*16]
6511*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6512*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+2*32*16], m0
6513*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+2*32*16], m2
6514*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+2*32*16], m1
6515*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+2*32*16], m3
6516*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+31*16]
6517*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+29*16]
6518*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+27*16]
6519*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+25*16]
6520*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+30*16]
6521*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+28*16]
6522*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+26*16]
6523*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+24*16]
6524*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6525*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+3*32*16], m0
6526*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+3*32*16], m2
6527*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+3*32*16], m1
6528*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+3*32*16], m3
6529*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 0*16]
6530*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 2*16]
6531*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+ 4*16]
6532*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+ 6*16]
6533*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 1*16]
6534*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+ 3*16]
6535*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+ 5*16]
6536*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+ 7*16]
6537*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6538*c0909341SAndroid Build Coastguard Worker%endif
6539*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
6540*c0909341SAndroid Build Coastguard Worker    ; clear lower half of [cq]
6541*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
6542*c0909341SAndroid Build Coastguard Worker                                     8, 9, 10, 11, 12, 13, 14, 15, \
6543*c0909341SAndroid Build Coastguard Worker                                     16, 17, 18, 19, 20, 21, 22, 23, \
6544*c0909341SAndroid Build Coastguard Worker                                     24, 25, 26, 27, 28, 29, 30, 31
6545*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+r5*8+0*32*16], m0
6546*c0909341SAndroid Build Coastguard Worker    mova   [rsp+40*16+r5*8+0*32*16], m2
6547*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t1*8+0*32*16], m1
6548*c0909341SAndroid Build Coastguard Worker    mova   [rsp+32*16+t0*8+0*32*16], m3
6549*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
6550*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
6551*c0909341SAndroid Build Coastguard Worker
6552*c0909341SAndroid Build Coastguard Worker    ; pass=2 code starts here
6553*c0909341SAndroid Build Coastguard Worker    mov                eobd, [rsp+gprsize*0+5*32*16]
6554*c0909341SAndroid Build Coastguard Worker    add                 rsp, 29*16
6555*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
6556*c0909341SAndroid Build Coastguard Worker    jl .load_veryfast
6557*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
6558*c0909341SAndroid Build Coastguard Worker    jl .load_fast
6559*c0909341SAndroid Build Coastguard Worker    ; load normal
6560*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
6561*c0909341SAndroid Build Coastguard Worker    jmp .run
6562*c0909341SAndroid Build Coastguard Worker.load_fast:
6563*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
6564*c0909341SAndroid Build Coastguard Worker    jmp .run
6565*c0909341SAndroid Build Coastguard Worker.load_veryfast:
6566*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
6567*c0909341SAndroid Build Coastguard Worker    ; fall-through
6568*c0909341SAndroid Build Coastguard Worker.run:
6569*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6570*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+64]
6571*c0909341SAndroid Build Coastguard Worker    mov                  r7, -8
6572*c0909341SAndroid Build Coastguard Worker%else
6573*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+(4*32+3)*16]
6574*c0909341SAndroid Build Coastguard Worker    mov dword [r2+0*gprsize], 4
6575*c0909341SAndroid Build Coastguard Worker%endif
6576*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
6577*c0909341SAndroid Build Coastguard Worker
6578*c0909341SAndroid Build Coastguard Worker.dconly:
6579*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
6580*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
6581*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 32
6582*c0909341SAndroid Build Coastguard Worker    add                 rsp, (5*32+1-(24+8*ARCH_X86_32))*16
6583*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
6584*c0909341SAndroid Build Coastguard Worker
6585*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
6586*c0909341SAndroid Build Coastguard Worker                                          0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
6587*c0909341SAndroid Build Coastguard Worker                                          dst, stride, c, eob
6588*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
6589*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
6590*c0909341SAndroid Build Coastguard Worker    jz .dconly
6591*c0909341SAndroid Build Coastguard Worker
6592*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6593*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4, 1, 2, 0
6594*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+(64*2+12)*16], r0
6595*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*2+(64*2+12)*16], r1
6596*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*2+12)*16], r2
6597*c0909341SAndroid Build Coastguard Worker%else
6598*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8, 9, 4, 7
6599*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+(64*2+12)*16], r9
6600*c0909341SAndroid Build Coastguard Worker%if WIN64
6601*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*2+(64*2+12)*16], r7
6602*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*2+12)*16], r8
6603*c0909341SAndroid Build Coastguard Worker%endif
6604*c0909341SAndroid Build Coastguard Worker%endif
6605*c0909341SAndroid Build Coastguard Worker%undef cmp
6606*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
6607*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 7*2
6608*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
6609*c0909341SAndroid Build Coastguard Worker    jge .end_zero_loop
6610*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
6611*c0909341SAndroid Build Coastguard Worker.zero_loop:
6612*c0909341SAndroid Build Coastguard Worker    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6613*c0909341SAndroid Build Coastguard Worker    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6614*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t1b
6615*c0909341SAndroid Build Coastguard Worker    movzx               t2d, t3b
6616*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 8
6617*c0909341SAndroid Build Coastguard Worker    shr                 t3d, 8
6618*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t0*8], m0
6619*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t1*8], m0
6620*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t2*8], m0
6621*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t3*8], m0
6622*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t0*8], m0
6623*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t1*8], m0
6624*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t2*8], m0
6625*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t3*8], m0
6626*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
6627*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
6628*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
6629*c0909341SAndroid Build Coastguard Worker.end_zero_loop:
6630*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
6631*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*0+(64*2+12)*16], eobd
6632*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
6633*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6634*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4, 1, 6, 0
6635*c0909341SAndroid Build Coastguard Worker    mov                  r2, [rsp+gprsize*3+(64*2+12)*16]
6636*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*2+12)*16], r6
6637*c0909341SAndroid Build Coastguard Worker%endif
6638*c0909341SAndroid Build Coastguard Worker.loop_pass1:
6639*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6640*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
6641*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
6642*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
6643*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
6644*c0909341SAndroid Build Coastguard Worker%endif
6645*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 1*128+r5*8]
6646*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 3*128+r5*8]
6647*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 5*128+r5*8]
6648*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 7*128+r5*8]
6649*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 9*128+r5*8]
6650*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+11*128+r5*8]
6651*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+13*128+r5*8]
6652*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+15*128+r5*8]
6653*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
6654*c0909341SAndroid Build Coastguard Worker
6655*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+ 0*128+r5*8]
6656*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+ 2*128+r5*8]
6657*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+ 4*128+r5*8]
6658*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+ 6*128+r5*8]
6659*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+ 8*128+r5*8]
6660*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+10*128+r5*8]
6661*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+12*128+r5*8]
6662*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+14*128+r5*8]
6663*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
6664*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
6665*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_16bpc).round
6666*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6667*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
6668*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
6669*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
6670*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
6671*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9
6672*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m11
6673*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
6674*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
6675*c0909341SAndroid Build Coastguard Worker%endif
6676*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6677*c0909341SAndroid Build Coastguard Worker    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6678*c0909341SAndroid Build Coastguard Worker    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6679*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t1b
6680*c0909341SAndroid Build Coastguard Worker    movzx               t2d, t3b
6681*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 8
6682*c0909341SAndroid Build Coastguard Worker    shr                 t3d, 8
6683*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6684*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6685*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t0*8], m8
6686*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t1*8], m9
6687*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t2*8], m10
6688*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t3*8], m11
6689*c0909341SAndroid Build Coastguard Worker%else
6690*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t0*8], m0
6691*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t1*8], m1
6692*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t2*8], m2
6693*c0909341SAndroid Build Coastguard Worker    mova   [rsp+76*16+t3*8], m3
6694*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+ 8*16]
6695*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+ 9*16]
6696*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+10*16]
6697*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+11*16]
6698*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6699*c0909341SAndroid Build Coastguard Worker%endif
6700*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t0*8], m0
6701*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t1*8], m1
6702*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t2*8], m2
6703*c0909341SAndroid Build Coastguard Worker    mova   [rsp+12*16+t3*8], m3
6704*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6705*c0909341SAndroid Build Coastguard Worker    mov                  r6, [rsp+gprsize*3+(64*2+12)*16]
6706*c0909341SAndroid Build Coastguard Worker%endif
6707*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
6708*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
6709*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
6710*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
6711*c0909341SAndroid Build Coastguard Worker
6712*c0909341SAndroid Build Coastguard Worker    ; pass=2
6713*c0909341SAndroid Build Coastguard Worker    mov                eobd, [rsp+gprsize*0+(64*2+12)*16]
6714*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
6715*c0909341SAndroid Build Coastguard Worker    jl .fast
6716*c0909341SAndroid Build Coastguard Worker    ; fall-through
6717*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6718*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8, 9
6719*c0909341SAndroid Build Coastguard Worker%else
6720*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 1, 5
6721*c0909341SAndroid Build Coastguard Worker%endif
6722*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
6723*c0909341SAndroid Build Coastguard Worker    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
6724*c0909341SAndroid Build Coastguard Worker    jmp .run
6725*c0909341SAndroid Build Coastguard Worker.fast:
6726*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
6727*c0909341SAndroid Build Coastguard Worker    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
6728*c0909341SAndroid Build Coastguard Worker.run:
6729*c0909341SAndroid Build Coastguard Worker    add                 rsp, 9*16
6730*c0909341SAndroid Build Coastguard Worker
6731*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6732*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+32]
6733*c0909341SAndroid Build Coastguard Worker    mov                  r7, -4
6734*c0909341SAndroid Build Coastguard Worker%else
6735*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+(64*2+3)*16]
6736*c0909341SAndroid Build Coastguard Worker    mov      [r2+4*gprsize], t0
6737*c0909341SAndroid Build Coastguard Worker    mov      [r2+5*gprsize], t1
6738*c0909341SAndroid Build Coastguard Worker    mov                  r1, [r2+2*gprsize]
6739*c0909341SAndroid Build Coastguard Worker    mov dword [r2+0*gprsize], 2
6740*c0909341SAndroid Build Coastguard Worker%endif
6741*c0909341SAndroid Build Coastguard Worker.loop_pass2:
6742*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6743*c0909341SAndroid Build Coastguard Worker    mov                dstq, [r2+1*gprsize]
6744*c0909341SAndroid Build Coastguard Worker%endif
6745*c0909341SAndroid Build Coastguard Worker    call .pass2
6746*c0909341SAndroid Build Coastguard Worker    add                 rsp, 64*16
6747*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6748*c0909341SAndroid Build Coastguard Worker    add                  r7, 2
6749*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r2+r7*8]
6750*c0909341SAndroid Build Coastguard Worker    jl .loop_pass2
6751*c0909341SAndroid Build Coastguard Worker%else
6752*c0909341SAndroid Build Coastguard Worker    add dword [r2+1*gprsize], 16
6753*c0909341SAndroid Build Coastguard Worker    dec dword [r2+0*gprsize]
6754*c0909341SAndroid Build Coastguard Worker    jg .loop_pass2
6755*c0909341SAndroid Build Coastguard Worker%endif
6756*c0909341SAndroid Build Coastguard Worker%assign stack_size (stack_size-(64*2+9)*16)
6757*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
6758*c0909341SAndroid Build Coastguard Worker%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
6759*c0909341SAndroid Build Coastguard Worker%assign stack_offset (stack_offset-(64*2+9)*16)
6760*c0909341SAndroid Build Coastguard Worker%else
6761*c0909341SAndroid Build Coastguard Worker%xdefine rstkm [rsp + stack_size]
6762*c0909341SAndroid Build Coastguard Worker%endif
6763*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6764*c0909341SAndroid Build Coastguard Worker    mov                  r9, [rsp+gprsize*1+3*16]
6765*c0909341SAndroid Build Coastguard Worker%if WIN64
6766*c0909341SAndroid Build Coastguard Worker    mov                  r7, [rsp+gprsize*2+3*16]
6767*c0909341SAndroid Build Coastguard Worker    mov                  r8, [rsp+gprsize*3+3*16]
6768*c0909341SAndroid Build Coastguard Worker%endif
6769*c0909341SAndroid Build Coastguard Worker%endif
6770*c0909341SAndroid Build Coastguard Worker    RET
6771*c0909341SAndroid Build Coastguard Worker
6772*c0909341SAndroid Build Coastguard Worker.pass2:
6773*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6774*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o(itx8_start)]
6775*c0909341SAndroid Build Coastguard Worker%endif
6776*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+16* 3]
6777*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+16* 4]
6778*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+16* 5]
6779*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+16* 6]
6780*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
6781*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7
6782*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
6783*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 3*16], m0
6784*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 4*16], m1
6785*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 5*16], m2
6786*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 6*16], m3
6787*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 7*16], m4
6788*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 8*16], m5
6789*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 9*16], m6
6790*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+10*16], m7
6791*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+16*11]
6792*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+16*12]
6793*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+16*13]
6794*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+16*14]
6795*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
6796*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7
6797*c0909341SAndroid Build Coastguard Worker    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
6798*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+ 0*16]
6799*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+11*16], m0
6800*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+12*16], m1
6801*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+13*16], m2
6802*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+14*16], m3
6803*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+15*16], m4
6804*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+16*16], m5
6805*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+17*16], m6
6806*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+18*16], m7
6807*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6808*c0909341SAndroid Build Coastguard Worker    call                  r8
6809*c0909341SAndroid Build Coastguard Worker%else
6810*c0909341SAndroid Build Coastguard Worker    call      [r2+4*gprsize]
6811*c0909341SAndroid Build Coastguard Worker%endif
6812*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 3*16], m0
6813*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 5*16], m2
6814*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+ 8*16], m5
6815*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+10*16], m7
6816*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6817*c0909341SAndroid Build Coastguard Worker    call                 r9
6818*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
6819*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
6820*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
6821*c0909341SAndroid Build Coastguard Worker%else
6822*c0909341SAndroid Build Coastguard Worker    call     [r2+5*gprsize]
6823*c0909341SAndroid Build Coastguard Worker%endif
6824*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
6825*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+gprsize+ 3*16]
6826*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6827*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 8
6828*c0909341SAndroid Build Coastguard Worker%else
6829*c0909341SAndroid Build Coastguard Worker    mov dword [r2+2*gprsize], 8
6830*c0909341SAndroid Build Coastguard Worker%endif
6831*c0909341SAndroid Build Coastguard Worker.loop_write:
6832*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r4+0*16]
6833*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4+1*16]
6834*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r4+2*16]
6835*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r4+3*16]
6836*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r4+4*16]
6837*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r4+5*16]
6838*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4+6*16]
6839*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r4+7*16]
6840*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
6841*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
6842*c0909341SAndroid Build Coastguard Worker    add                  r4, 8*16
6843*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6844*c0909341SAndroid Build Coastguard Worker    dec                 r6d
6845*c0909341SAndroid Build Coastguard Worker%else
6846*c0909341SAndroid Build Coastguard Worker    dec dword [r2+2*gprsize]
6847*c0909341SAndroid Build Coastguard Worker%endif
6848*c0909341SAndroid Build Coastguard Worker    jg .loop_write
6849*c0909341SAndroid Build Coastguard Worker    ret
6850*c0909341SAndroid Build Coastguard Worker
; DC-only tail: computes r5d = (c[0]*181 + 640) >> 10, overwrites c[0] with
; eobd, sets r3d = 64, moves rsp and tail-jumps into the 16x4 function's
; .dconly2 code.
; NOTE(review): the enclosing cglobal starts above this chunk; the
; (12+2*64)*16 stack term suggests inv_txfm_add_dct_dct_16x64_16bpc -- confirm.
6851*c0909341SAndroid Build Coastguard Worker.dconly:
    ; r5d = first 32-bit coefficient * 181
6852*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
6853*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
    ; r3d = 64 is consumed by .dconly2 (presumably the number of rows -- confirm)
6854*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 64
    ; single rounding step: add 640, then arithmetic shift right by 10
6855*c0909341SAndroid Build Coastguard Worker    add                 r5d, 640
6856*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 10
    ; rsp is moved before entering the other function's code; presumably this
    ; reconciles the two functions' stack-frame sizes -- confirm against both
    ; cglobal declarations.
6857*c0909341SAndroid Build Coastguard Worker    add                 rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
6858*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
6859*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_32x64_16bpc(dst, stride, c, eob)
;
; Flow, as visible in this block:
;   * eob == 0            -> .dconly (scale the first coefficient, tail-jump to
;                            the 32x8 function's .dconly2).
;   * .zero_loop          -> for every pass-1 iteration that the eob thresholds
;                            in tbl_32x32_2d mark as skippable, zero that
;                            iteration's destination slots in the stack buffer.
;   * .loop_pass1         -> r5 counts down in steps of 2 to 0; each iteration
;                            reads 32 coefficient vectors (cq+128*k+r5*8,
;                            k = 0..31), runs the first-pass helpers, stores the
;                            transposed results into the stack buffer at offsets
;                            taken from tbl_Nx64_offset, and zeroes the consumed
;                            coefficients.
;   * pass 2              -> picks two 8bpc SSSE3 helper addresses (t0/t1)
;                            depending on eob < 136, then jumps into
;                            inv_txfm_add_dct_dct_16x64_16bpc's .loop_pass2.
;
; Stack: the slots at [rsp + (64*4+32)*16 + gprsize*N] hold saved GPRs / eob.
; All helpers called here are defined outside this block.
6860*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
6861*c0909341SAndroid Build Coastguard Worker                                          0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
6862*c0909341SAndroid Build Coastguard Worker                                          dst, stride, c, eob
    ; r6 = 'base'; presumably the anchor used by the o()/o2() addressing macros
    ; (defined elsewhere) -- confirm.
6863*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
    ; eob == 0 takes the DC-only shortcut
6864*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
6865*c0909341SAndroid Build Coastguard Worker    jz .dconly
6866*c0909341SAndroid Build Coastguard Worker
    ; Set up t0..t3 (DECLARE_REG_TMP: tN aliases the listed rN, per x86inc) and
    ; spill the registers that are about to be reused as temporaries.
6867*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; t0..t3 = r4, r1, r2, r0; r0/r1/r2 are saved in slots 1..3 first
6868*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4, 1, 2, 0
6869*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+(64*4+32)*16], r0
6870*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*2+(64*4+32)*16], r1
6871*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*4+32)*16], r2
6872*c0909341SAndroid Build Coastguard Worker%else
    ; t0..t3 = r8, r9, r4, r7; r9 is always saved, r7/r8 only on WIN64
    ; (presumably because they are callee-saved there -- confirm)
6873*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8, 9, 4, 7
6874*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+(64*4+32)*16], r9
6875*c0909341SAndroid Build Coastguard Worker%if WIN64
6876*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*2+(64*4+32)*16], r7
6877*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*4+32)*16], r8
6878*c0909341SAndroid Build Coastguard Worker%endif
6879*c0909341SAndroid Build Coastguard Worker%endif
    ; Drop any macro named 'cmp' so the cmp lines below assemble as the plain
    ; instruction (the macro itself is not visible in this chunk).
6880*c0909341SAndroid Build Coastguard Worker%undef cmp
6881*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
    ; r5 starts at 7*2 and is used as a byte offset into the 16-bit threshold
    ; table; the compare is a signed 16-bit compare of eob against the entry.
6882*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 7*2
6883*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
    ; eob >= threshold: nothing to skip, go straight to pass 1
6884*c0909341SAndroid Build Coastguard Worker    jge .end_zero_loop
6885*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
6886*c0909341SAndroid Build Coastguard Worker.zero_loop:
    ; Two 16-bit table entries are split into four byte-sized indices:
    ; t0 = low byte / t1 = high byte of the first, t2/t3 likewise of the second.
6887*c0909341SAndroid Build Coastguard Worker    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6888*c0909341SAndroid Build Coastguard Worker    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6889*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t1b
6890*c0909341SAndroid Build Coastguard Worker    movzx               t2d, t3b
6891*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 8
6892*c0909341SAndroid Build Coastguard Worker    shr                 t3d, 8
    ; Zero the four slots (tN*8 bytes in) of each of the four buffer regions
    ; that start at rsp + 32*16, 96*16, 160*16 and 224*16.
6893*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t0*8], m0
6894*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t1*8], m0
6895*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t2*8], m0
6896*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t3*8], m0
6897*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t0*8], m0
6898*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t1*8], m0
6899*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t2*8], m0
6900*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t3*8], m0
6901*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t0*8], m0
6902*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t1*8], m0
6903*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t2*8], m0
6904*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t3*8], m0
6905*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t0*8], m0
6906*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t1*8], m0
6907*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t2*8], m0
6908*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t3*8], m0
    ; step to the previous iteration and keep zeroing while eob < its threshold
6909*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
6910*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
6911*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
6912*c0909341SAndroid Build Coastguard Worker.end_zero_loop:
6913*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
    ; eob is parked in slot 0; it is reloaded from there before pass 2
6914*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*0+(64*4+32)*16], eobd
6915*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
6916*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; From here t2 is r6 instead of r2: r2 gets back the value saved in slot 3
    ; above, and r6 is parked in that slot (restored at the end of each
    ; pass-1 iteration, after t2 has been used as scratch).
6917*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4, 1, 6, 0
6918*c0909341SAndroid Build Coastguard Worker    mov                  r2, [rsp+gprsize*3+(64*4+32)*16]
6919*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*4+32)*16], r6
6920*c0909341SAndroid Build Coastguard Worker%endif
6921*c0909341SAndroid Build Coastguard Worker.loop_pass1:
6922*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; Constants are reloaded on every iteration; m12 and m14 are overwritten by
    ; the loads further down in this loop body (presumably m11 as well, via the
    ; transpose helper -- confirm).
6923*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
6924*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
6925*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
6926*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
6927*c0909341SAndroid Build Coastguard Worker%endif
    ; Group 1: coefficient vectors k = 1, 7, 9, 15, 17, 23, 25, 31
    ; (k indexes 128-byte units of cq; r5*8 selects the current iteration).
6928*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 1+r5*8]
6929*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 7+r5*8]
6930*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 9+r5*8]
6931*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*15+r5*8]
6932*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*17+r5*8]
6933*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*23+r5*8]
6934*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*25+r5*8]
6935*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*31+r5*8]
    ; r3 = start of the stack buffer for this iteration
6936*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
6937*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6938*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
6939*c0909341SAndroid Build Coastguard Worker
    ; Group 2: k = 3, 5, 11, 13, 19, 21, 27, 29
6940*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 3+r5*8]
6941*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 5+r5*8]
6942*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*11+r5*8]
6943*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*13+r5*8]
6944*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*19+r5*8]
6945*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*21+r5*8]
6946*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*27+r5*8]
6947*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*29+r5*8]
    ; On x86-32 r3 is advanced by 8*16 only around the rect2_mul call
    ; (presumably that helper uses [r3] as scratch there -- confirm).
6948*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6949*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*8
6950*c0909341SAndroid Build Coastguard Worker%endif
6951*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6952*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6953*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*8
6954*c0909341SAndroid Build Coastguard Worker%endif
6955*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
    ; move r3 forward for the next two groups; undone before round_dct32
6956*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*(16+4*ARCH_X86_32)
6957*c0909341SAndroid Build Coastguard Worker
    ; Group 3: k = 2, 6, 10, ..., 30
6958*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 2+r5*8]
6959*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 6+r5*8]
6960*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*10+r5*8]
6961*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*14+r5*8]
6962*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*18+r5*8]
6963*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*22+r5*8]
6964*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*26+r5*8]
6965*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*30+r5*8]
6966*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6967*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf
6968*c0909341SAndroid Build Coastguard Worker
    ; Group 4: k = 0, 4, 8, ..., 28
6969*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0+r5*8]
6970*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 4+r5*8]
6971*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 8+r5*8]
6972*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*12+r5*8]
6973*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*16+r5*8]
6974*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*20+r5*8]
6975*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*24+r5*8]
6976*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*28+r5*8]
6977*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).rect2_mul
6978*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1
6979*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
    ; undo the earlier r3 adjustment before the final dct32 rounding helper
6980*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*(16+4*ARCH_X86_32)
6981*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
6982*c0909341SAndroid Build Coastguard Worker
    ; Recompute the four destination indices t0..t3 for this iteration (same
    ; table and byte split as in .zero_loop).
6983*c0909341SAndroid Build Coastguard Worker    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6984*c0909341SAndroid Build Coastguard Worker    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6985*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t1b
6986*c0909341SAndroid Build Coastguard Worker    movzx               t2d, t3b
6987*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 8
6988*c0909341SAndroid Build Coastguard Worker    shr                 t3d, 8
6989*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: transpose in registers; the _hi results are taken from m8..m11
    ; and stored, then m8/m10/m12/m14 are refilled from the buffer at r3 for
    ; the next transpose. m0..m3 from the first call are stored after %endif.
6990*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
6991*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6992*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t0*8], m8
6993*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t1*8], m9
6994*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t2*8], m10
6995*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t3*8], m11
6996*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 9] ;  8  9
6997*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*11] ; 10 11
6998*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*13] ; 12 13
6999*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*15] ; 14 15
7000*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
7001*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t0*8], m8
7002*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t1*8], m9
7003*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t2*8], m10
7004*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t3*8], m11
7005*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r3+16* 8] ; 24 25
7006*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+16*10] ; 26 27
7007*c0909341SAndroid Build Coastguard Worker    mova                m12, [r3+16*12] ; 28 29
7008*c0909341SAndroid Build Coastguard Worker    mova                m14, [r3+16*14] ; 30 31
7009*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
7010*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t0*8], m8
7011*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t1*8], m9
7012*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t2*8], m10
7013*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t3*8], m11
7014*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: the 32-bit results live in the buffer at r3. Each step loads four
    ; vectors, packs each with its neighbour to 16-bit (packssdw, signed
    ; saturation), transposes, and stores m0..m3 to one region. The last
    ; quarter is read in descending order (31..24).
7015*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16
7016*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 8*16]
7017*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+10*16]
7018*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+12*16]
7019*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+14*16]
7020*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 9*16]
7021*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+11*16]
7022*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+13*16]
7023*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+15*16]
7024*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
7025*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t0*8], m0
7026*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t1*8], m1
7027*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t2*8], m2
7028*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 96*16+t3*8], m3
7029*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+16*16]
7030*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+18*16]
7031*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+20*16]
7032*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+22*16]
7033*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+17*16]
7034*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+19*16]
7035*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+21*16]
7036*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+23*16]
7037*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
7038*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t0*8], m0
7039*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t1*8], m1
7040*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t2*8], m2
7041*c0909341SAndroid Build Coastguard Worker    mova  [rsp+160*16+t3*8], m3
7042*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+31*16]
7043*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+29*16]
7044*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+27*16]
7045*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+25*16]
7046*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+30*16]
7047*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+28*16]
7048*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+26*16]
7049*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+24*16]
7050*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
7051*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t0*8], m0
7052*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t1*8], m1
7053*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t2*8], m2
7054*c0909341SAndroid Build Coastguard Worker    mova  [rsp+224*16+t3*8], m3
7055*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+ 0*16]
7056*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+ 2*16]
7057*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+ 4*16]
7058*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+ 6*16]
7059*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [r3+ 1*16]
7060*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [r3+ 3*16]
7061*c0909341SAndroid Build Coastguard Worker    packssdw             m4, [r3+ 5*16]
7062*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [r3+ 7*16]
7063*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
7064*c0909341SAndroid Build Coastguard Worker%endif
    ; Both paths: m0..m3 go to the first region (rsp + 32*16).
7065*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t0*8], m0
7066*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t1*8], m1
7067*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t2*8], m2
7068*c0909341SAndroid Build Coastguard Worker    mova  [rsp+ 32*16+t3*8], m3
    ; Zero the 32 coefficient vectors this iteration consumed.
7069*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
7070*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
7071*c0909341SAndroid Build Coastguard Worker                                     8, 9, 10, 11, 12, 13, 14, 15, \
7072*c0909341SAndroid Build Coastguard Worker                                     16, 17, 18, 19, 20, 21, 22, 23, \
7073*c0909341SAndroid Build Coastguard Worker                                     24, 25, 26, 27, 28, 29, 30, 31
7074*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; r6 doubled as t2 above; reload the value parked in slot 3
7075*c0909341SAndroid Build Coastguard Worker    mov                  r6, [rsp+gprsize*3+(64*4+32)*16]
7076*c0909341SAndroid Build Coastguard Worker%endif
    ; next iteration; the loop runs through r5 == 0 inclusive
7077*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
7078*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
7079*c0909341SAndroid Build Coastguard Worker
7080*c0909341SAndroid Build Coastguard Worker    ; pass=2
    ; Reload eob from slot 0 and choose the helper pair: eob < 136 (signed)
    ; selects the reduced variants under .fast.
7081*c0909341SAndroid Build Coastguard Worker    mov                eobd, [rsp+gprsize*0+(64*4+32)*16]
7082*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
7083*c0909341SAndroid Build Coastguard Worker    jl .fast
7084*c0909341SAndroid Build Coastguard Worker    ; fall-through
    ; DECLARE_REG_TMP is an assembly-time alias, so the mapping chosen here
    ; also applies to the t0/t1 written under .fast below.
7085*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7086*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8, 9
7087*c0909341SAndroid Build Coastguard Worker%else
7088*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 1, 5
7089*c0909341SAndroid Build Coastguard Worker%endif
7090*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
7091*c0909341SAndroid Build Coastguard Worker    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
7092*c0909341SAndroid Build Coastguard Worker    jmp .run
7093*c0909341SAndroid Build Coastguard Worker.fast:
7094*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
7095*c0909341SAndroid Build Coastguard Worker    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
7096*c0909341SAndroid Build Coastguard Worker.run:
    ; rsp is advanced by 29*16 before entering the shared pass-2 loop;
    ; presumably this lines the buffer up with the layout that loop expects
    ; -- confirm against inv_txfm_add_dct_dct_16x64_16bpc.
7097*c0909341SAndroid Build Coastguard Worker    add                 rsp, 29*16
7098*c0909341SAndroid Build Coastguard Worker
7099*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; r2 = dst + 64 bytes and r7 = -8 are inputs to the shared loop (their
    ; exact roles are not visible in this block).
7100*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+64]
7101*c0909341SAndroid Build Coastguard Worker    mov                  r7, -8
7102*c0909341SAndroid Build Coastguard Worker%else
    ; r2 = rsp + (64*4+3)*16; with rsp already advanced by 29*16 this is the
    ; same slot base as (64*4+32)*16 above. Slots 4/5 receive t0/t1, r1 is
    ; reloaded from slot 2 (saved at function entry; t0 aliased r1 until now),
    ; and slot 0 is overwritten with 4 (presumably a loop counter -- confirm).
7103*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+(64*4+3)*16]
7104*c0909341SAndroid Build Coastguard Worker    mov      [r2+4*gprsize], t0
7105*c0909341SAndroid Build Coastguard Worker    mov      [r2+5*gprsize], t1
7106*c0909341SAndroid Build Coastguard Worker    mov                  r1, [r2+2*gprsize]
7107*c0909341SAndroid Build Coastguard Worker    mov dword [r2+0*gprsize], 4
7108*c0909341SAndroid Build Coastguard Worker%endif
7109*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
7110*c0909341SAndroid Build Coastguard Worker
    ; DC-only path (reached only when eob == 0, see the test at function entry):
    ;   r5d = (c[0]*181 + 128) >> 8
    ;   r5d = (r5d *181 + 384) >> 9
    ; 181/256 ~= 1/sqrt(2). c[0] is cleared by storing eobd, which is 0 here.
7111*c0909341SAndroid Build Coastguard Worker.dconly:
7112*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
7113*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
    ; r3d = 64 is consumed by .dconly2 (presumably the row count -- confirm)
7114*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 64
7115*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
7116*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
7117*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
7118*c0909341SAndroid Build Coastguard Worker    add                 r5d, 384
7119*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 9
    ; The first two terms equal the stack size declared in this function's
    ; cglobal; the subtracted term is presumably the 32x8 function's frame size,
    ; so that its epilogue unwinds correctly -- confirm.
7120*c0909341SAndroid Build Coastguard Worker    add                 rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
7121*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
7122*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_64x16_16bpc(dst, stride, c, eob) -- entry and first pass.
;
; Flow, as visible in this block:
;   * eob == 0     -> .dconly (defined outside this chunk).
;   * .zero_loop   -> lowers r5 in steps of 2 from 8 until eob is >= the
;                     tbl_32x16_2d threshold at that offset; nothing is stored,
;                     this only picks the first iteration that is processed.
;   * .loop_pass1  -> per iteration reads coefficient vectors cq+64*k+r5*8 for
;                     k = 0..31, runs the first-pass helpers, then calls
;                     .shift_transpose; r5 counts down by 2 through 0.
;   * pass 2       -> call .pass2, then RET.
; The local helpers .main_part1/.main_part2/.main_end_loop_start/
; .shift_transpose are defined outside this block.
7123*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
7124*c0909341SAndroid Build Coastguard Worker                                         dst, stride, c, eob
    ; r6 = 'base'; presumably the anchor used by the o()/o2() addressing macros
    ; (defined elsewhere) -- confirm.
7125*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
7126*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
7127*c0909341SAndroid Build Coastguard Worker    jz .dconly
7128*c0909341SAndroid Build Coastguard Worker
7129*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
    ; %undef drops any macro named 'cmp' so the cmp below is the plain
    ; instruction (the macro itself is not visible in this chunk).
7130*c0909341SAndroid Build Coastguard Worker%undef cmp
7131*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 8
7132*c0909341SAndroid Build Coastguard Worker.zero_loop:
    ; r5 is a byte offset into the 16-bit threshold table; keep stepping down
    ; while eob (signed 16-bit compare) is below the entry.
7133*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
7134*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x16_2d)+r5]
7135*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
7136*c0909341SAndroid Build Coastguard Worker
7137*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
7138*c0909341SAndroid Build Coastguard Worker.loop_pass1:
7139*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; constants kept in m11..m14 on x86-64, reloaded every iteration
7140*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
7141*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
7142*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
7143*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
7144*c0909341SAndroid Build Coastguard Worker%endif
7145*c0909341SAndroid Build Coastguard Worker
    ; r3 = stack buffer, r4 = idct64 multiplier table (read by .main_part1).
    ; Four .main_part1 calls with inputs k = 1/31/17/15, 7/25/23/9,
    ; 5/27/21/11 and 3/29/19/13, followed by .main_part2.
7146*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
7147*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(idct64_mul_16bpc)]
7148*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 1+r5*8]
7149*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*31+r5*8]
7150*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*17+r5*8]
7151*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*15+r5*8]
7152*c0909341SAndroid Build Coastguard Worker    call .main_part1
7153*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 7+r5*8]
7154*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*25+r5*8]
7155*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*23+r5*8]
7156*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 9+r5*8]
7157*c0909341SAndroid Build Coastguard Worker    call .main_part1
7158*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 5+r5*8]
7159*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*27+r5*8]
7160*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*21+r5*8]
7161*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*11+r5*8]
7162*c0909341SAndroid Build Coastguard Worker    call .main_part1
7163*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 3+r5*8]
7164*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*29+r5*8]
7165*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*19+r5*8]
7166*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*13+r5*8]
7167*c0909341SAndroid Build Coastguard Worker    call .main_part1
7168*c0909341SAndroid Build Coastguard Worker    call .main_part2
7169*c0909341SAndroid Build Coastguard Worker
    ; k = 2, 14, 18, 30 -> 32x8 odd-half helper, part 1 (fast variant)
7170*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 2+r5*8]
7171*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*14+r5*8]
7172*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*18+r5*8]
7173*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*30+r5*8]
7174*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
7175*c0909341SAndroid Build Coastguard Worker
    ; k = 6, 10, 22, 26 -> part 2 (fast variant)
7176*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 6+r5*8]
7177*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*10+r5*8]
7178*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*22+r5*8]
7179*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*26+r5*8]
7180*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
    ; advance r3 within the stack buffer for the remaining two groups
7181*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*(24+4*ARCH_X86_32)
7182*c0909341SAndroid Build Coastguard Worker
    ; k = 4, 12, 20, 28 -> 16x4 odd-half helper (fast variant)
7183*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 4+r5*8]
7184*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*12+r5*8]
7185*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*20+r5*8]
7186*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*28+r5*8]
7187*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
7188*c0909341SAndroid Build Coastguard Worker
    ; k = 0, 8, 16, 24 -> 8x4 first-pass helper (fast variant) plus rounding
7189*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0+r5*8]
7190*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 8+r5*8]
7191*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*16+r5*8]
7192*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*24+r5*8]
7193*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1_fast
7194*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
    ; Spill m1..m7 to the seven 16-byte slots just below r3 (shifted by 4 more
    ; slots on x86-32); m0 is left in its register.
7195*c0909341SAndroid Build Coastguard Worker    mova [r3-(7+4*ARCH_X86_32)*16], m1
7196*c0909341SAndroid Build Coastguard Worker    mova [r3-(6+4*ARCH_X86_32)*16], m2
7197*c0909341SAndroid Build Coastguard Worker    mova [r3-(5+4*ARCH_X86_32)*16], m3
7198*c0909341SAndroid Build Coastguard Worker    mova [r3-(4+4*ARCH_X86_32)*16], m4
7199*c0909341SAndroid Build Coastguard Worker    mova [r3-(3+4*ARCH_X86_32)*16], m5
7200*c0909341SAndroid Build Coastguard Worker    mova [r3-(2+4*ARCH_X86_32)*16], m6
7201*c0909341SAndroid Build Coastguard Worker    mova [r3-(1+4*ARCH_X86_32)*16], m7
    ; reposition r3 for .main_end_loop_start (layout defined by that helper)
7202*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*(40+4*ARCH_X86_32-4)
7203*c0909341SAndroid Build Coastguard Worker
    ; Rounding constant pd_2: derived on x86-64 from m11 (pd_2048 >> 10 = 2)
    ; into m15, loaded from memory into m7 on x86-32.
7204*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7205*c0909341SAndroid Build Coastguard Worker    psrld               m15, m11, 10 ; pd_2
7206*c0909341SAndroid Build Coastguard Worker%else
7207*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2)]
7208*c0909341SAndroid Build Coastguard Worker%endif
7209*c0909341SAndroid Build Coastguard Worker    call .main_end_loop_start
7210*c0909341SAndroid Build Coastguard Worker
    ; .shift_transpose is given r3 = rsp+56*16 and r4 = cq + r5*8 + 64*28;
    ; what it reads/writes through them is defined outside this chunk.
7211*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+56*16]
7212*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+r5*8+64*28]
7213*c0909341SAndroid Build Coastguard Worker    call .shift_transpose
    ; next iteration; the loop runs through r5 == 0 inclusive
7214*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
7215*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
7216*c0909341SAndroid Build Coastguard Worker
7217*c0909341SAndroid Build Coastguard Worker    ; pass=2, we need to call this otherwise the stack pointer has
7218*c0909341SAndroid Build Coastguard Worker    ; the wrong offset in the 8-bit code
7219*c0909341SAndroid Build Coastguard Worker    call .pass2
7220*c0909341SAndroid Build Coastguard Worker    RET
7221*c0909341SAndroid Build Coastguard Worker
; Second-pass trampoline for the 64x16 transform. It is entered via 'call'
; (see the note at the call site about the stack offset the 8-bit code expects),
; sets up the registers used by the shared write-out loop and tail-jumps into
; idct_16x16_internal_16bpc's .loop_pass2, whose 'ret' returns to the caller of
; .pass2 (presumably -- the target loop is not visible in this chunk).
7222*c0909341SAndroid Build Coastguard Worker.pass2:
7223*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; x86-64: constants for the shared loop -- m8 = pw_2048, m9 = 0,
    ; m10 = pixel_10bpc_max.
7224*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(pw_2048)]
7225*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
7226*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pixel_10bpc_max)]
7227*c0909341SAndroid Build Coastguard Worker%if WIN64
    ; r7 is overwritten just below, so WIN64 parks its old value on the stack
    ; (presumably because r7 is callee-saved there -- confirm).
7228*c0909341SAndroid Build Coastguard Worker    mov [rsp+16*16+gprsize], r7
7229*c0909341SAndroid Build Coastguard Worker%endif
    ; keep a copy of the destination pointer in r7
7230*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
7231*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: the destination pointer is kept in a stack slot instead
7232*c0909341SAndroid Build Coastguard Worker    mov [rsp+2*gprsize+16*16], dstq
7233*c0909341SAndroid Build Coastguard Worker%endif
    ; r3 = 3*stride; r4d = 8 (presumably the loop count used by .loop_pass2
    ; -- confirm).
7234*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
7235*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 8
7236*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_16bpc).loop_pass2
7237*c0909341SAndroid Build Coastguard Worker
; .main_part1: first stage of the idct64 odd half (steps 1-5, per the label
; comment). Each call processes one group of four inputs from the list below.
;   in:  m0-m3 = the four input vectors of the group
;        r4    = pointer to 12 dword multipliers (each is broadcast before use)
;        r3    = pointer to an 8-vector (8*16 byte) output area
;        x86-64 only: m11, m12 and m13 must be preloaded by the caller with the
;        rounding constant and the clip min/max (the x86-32 path loads pd_2048,
;        clip_18b_min and clip_18b_max for the same purposes)
;   out: [r3+16*0 .. r3+16*7] = the eight t-values, in the order listed for the
;        group below; r3 is advanced by 16*8 and r4 by 4*12 on return.
7238*c0909341SAndroid Build Coastguard Worker.main_part1: ; idct64 steps 1-5
7239*c0909341SAndroid Build Coastguard Worker    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
7240*c0909341SAndroid Build Coastguard Worker    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
7241*c0909341SAndroid Build Coastguard Worker    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
7242*c0909341SAndroid Build Coastguard Worker    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
7243*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7244*c0909341SAndroid Build Coastguard Worker    movd                 m7, [r4+4*0]
7245*c0909341SAndroid Build Coastguard Worker    movd                 m8, [r4+4*1]
7246*c0909341SAndroid Build Coastguard Worker    movd                 m6, [r4+4*2]
7247*c0909341SAndroid Build Coastguard Worker    movd                 m9, [r4+4*3]
7248*c0909341SAndroid Build Coastguard Worker    movd                 m5, [r4+4*4]
7249*c0909341SAndroid Build Coastguard Worker    movd                m10, [r4+4*5]
7250*c0909341SAndroid Build Coastguard Worker    movd                 m4, [r4+4*6]
7251*c0909341SAndroid Build Coastguard Worker    movd                m15, [r4+4*7]
7252*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15 ; broadcast each multiplier to all lanes
7253*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m0     ; t63a
7254*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m8     ; t32a
7255*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1     ; t62a
7256*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m9     ; t33a
7257*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m2     ; t61a
7258*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m10    ; t34a
7259*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3     ; t60a
7260*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m15    ; t35a
7261*c0909341SAndroid Build Coastguard Worker    movd                m10, [r4+4*8]
7262*c0909341SAndroid Build Coastguard Worker    movd                m15, [r4+4*9]
7263*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m10, m15 ; multipliers for the two ITX_MULSUB_2D calls below
7264*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 ; add rounding constant before the shift
7265*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
7266*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m1 ; t33
7267*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; t32
7268*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m6 ; t62
7269*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6     ; t63
7270*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m2 ; t34
7271*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2     ; t35
7272*c0909341SAndroid Build Coastguard Worker    psubd                m2, m4, m5 ; t61
7273*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5     ; t60
7274*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m1, m6, m2 ; clamp to [m12, m13] ...
7275*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m1, m6, m2 ; ... before the values are multiplied
7276*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 8, 5, 9, _, 11, 10, 15    ; t33a, t62a
7277*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
7278*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m0, m3, m7, m4
7279*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m0, m3, m7, m4
7280*c0909341SAndroid Build Coastguard Worker    movd                m10, [r4+4*10]
7281*c0909341SAndroid Build Coastguard Worker    movd                m15, [r4+4*11]
7282*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m10, m15 ; multipliers for the second pair of ITX_MULSUB_2D calls
7283*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m3 ; t35a
7284*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3     ; t32a
7285*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m4 ; t60a
7286*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4     ; t63a
7287*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m6 ; t34
7288*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6     ; t33
7289*c0909341SAndroid Build Coastguard Worker    psubd                m6, m8, m2 ; t61
7290*c0909341SAndroid Build Coastguard Worker    paddd                m8, m2     ; t62
7291*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m5, m3, m4, m6
7292*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m5, m3, m4, m6
7293*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 5, 2, 9, _, 11, 10, 15 ; t35,  t60
7294*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
7295*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m0, m7, m1, m8
7296*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m0, m7, m1, m8
7297*c0909341SAndroid Build Coastguard Worker    add                  r4, 4*12 ; step past the 12 multipliers consumed by this call
7298*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*0], m0
7299*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*7], m7
7300*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*1], m1
7301*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*6], m8
7302*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*2], m6
7303*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*5], m4
7304*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*3], m3
7305*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*4], m5
7306*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: same computation using only m0-m7. The output area at r3 is
    ; also used as spill space until the final stores.
7307*c0909341SAndroid Build Coastguard Worker    movd                 m7, [r4+4*0]
7308*c0909341SAndroid Build Coastguard Worker    movd                 m6, [r4+4*2]
7309*c0909341SAndroid Build Coastguard Worker    movd                 m5, [r4+4*4]
7310*c0909341SAndroid Build Coastguard Worker    movd                 m4, [r4+4*6]
7311*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m7, m6, m5, m4
7312*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m0     ; t63a
7313*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1     ; t62a
7314*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m2     ; t61a
7315*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3     ; t60a
7316*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m6 ; spill the t62a product, m6 is reused for multipliers
7317*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m7 ; spill the t63a product, m7 is reused for multipliers
7318*c0909341SAndroid Build Coastguard Worker    movd                 m6, [r4+4*1]
7319*c0909341SAndroid Build Coastguard Worker    movd                 m7, [r4+4*3]
7320*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m7, m6
7321*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m6     ; t32a
7322*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m7     ; t33a
7323*c0909341SAndroid Build Coastguard Worker    movd                 m6, [r4+4*5]
7324*c0909341SAndroid Build Coastguard Worker    movd                 m7, [r4+4*7]
7325*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m7, m6
7326*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m6     ; t34a
7327*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m7     ; t35a
7328*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+0*16] ; reload the t62a product
7329*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2048)]
7330*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
7331*c0909341SAndroid Build Coastguard Worker    paddd                m7, [r3+1*16] ; m7 = pd_2048 + spilled t63a product (rounds the 8th value in place)
7332*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
7333*c0909341SAndroid Build Coastguard Worker    mova           [r3+0*16], m5 ; park t61a so m5 can receive t33
7334*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m1 ; t33
7335*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; t32
7336*c0909341SAndroid Build Coastguard Worker    mova           [r3+1*16], m0 ; spill t32
7337*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16] ; m0 = t61a
7338*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m6 ; t62
7339*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6     ; t63
7340*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m2 ; t34
7341*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2     ; t35
7342*c0909341SAndroid Build Coastguard Worker    psubd                m2, m4, m0 ; t61
7343*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0     ; t60
7344*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(clip_18b_min)]
7345*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
7346*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, [r3+1*16] ; m0 = max(clip_min, spilled t32): clamps the 8th value without a free register
7347*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
7348*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(clip_18b_max)]
7349*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
7350*c0909341SAndroid Build Coastguard Worker    pminsd               m0, [r3+0*16] ; m0 = fully clipped t32
7351*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
7352*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m3
7353*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m4
7354*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
7355*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
7356*c0909341SAndroid Build Coastguard Worker    movd                 m3, [r4+4*8]
7357*c0909341SAndroid Build Coastguard Worker    movd                 m4, [r4+4*9]
7358*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m3, m4
7359*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m2 ; spill t61, m2 is passed to the macro call below
7360*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 5, 2, 7, _, 0, 3, 4    ; t33a, t62a
7361*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+4*16] ; reload t61
7362*c0909341SAndroid Build Coastguard Worker    mova          [r3+4*16], m5 ; spill t62a, m5 is passed to the macro call below
7363*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
7364*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
7365*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+1*16]
7366*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+2*16]
7367*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
7368*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m3 ; t35a
7369*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3     ; t32a
7370*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m5 ; spill t35a
7371*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+4*16] ; m5 = t62a
7372*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m4 ; t60a
7373*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4     ; t63a
7374*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m6 ; t34
7375*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6     ; t33
7376*c0909341SAndroid Build Coastguard Worker    psubd                m6, m5, m2 ; t61
7377*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5     ; t62
7378*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(clip_18b_min)]
7379*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
7380*c0909341SAndroid Build Coastguard Worker    pmaxsd               m5, [r3+0*16] ; m5 = max(clip_min, spilled t35a)
7381*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m5
7382*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(clip_18b_max)]
7383*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
7384*c0909341SAndroid Build Coastguard Worker    pminsd               m5, [r3+0*16] ; m5 = fully clipped t35a; slot 0 is free for the t32a store below
7385*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*0], m0
7386*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*7], m7
7387*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*1], m1
7388*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*6], m2
7389*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*2], m4 ; temporary: t34 is reloaded below after m4 has been used by the macro
7390*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2048)]
7391*c0909341SAndroid Build Coastguard Worker    movd                 m0, [r4+4*10]
7392*c0909341SAndroid Build Coastguard Worker    movd                 m1, [r4+4*11]
7393*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q0000}, m0, m1
7394*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 5, 2, 4, _, 7, 0, 1 ; t35,  t60
7395*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*3], m3
7396*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*4], m5
7397*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+2*16] ; reload t34
7398*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
7399*c0909341SAndroid Build Coastguard Worker    add                  r4, 4*12 ; step past the 12 multipliers consumed by this call
7400*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*2], m6
7401*c0909341SAndroid Build Coastguard Worker    mova          [r3+16*5], m4
7402*c0909341SAndroid Build Coastguard Worker%endif
7403*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*8 ; advance to the next 8-vector output area
7404*c0909341SAndroid Build Coastguard Worker    ret
7405*c0909341SAndroid Build Coastguard Worker
; .main_part2: idct64 steps 6-9 (per the label comment), performed in place on
; vectors stored at negative offsets from r3/r4. r3 walks upward and r4
; (= r3+16*7 on entry) walks downward by one vector per iteration until they
; cross, i.e. four iterations. r3 is restored to its entry value before
; returning; r4 is clobbered.
; x86-64 only: m11-m14 must be preloaded by the caller (the x86-32 path uses
; pd_2048, clip_18b_min, clip_18b_max and pd_2896 in the corresponding roles).
7406*c0909341SAndroid Build Coastguard Worker.main_part2: ; idct64 steps 6-9
7407*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r3+16*7] ; r4 = r3 + 7 vectors (mirror pointer, walks downward)
7408*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7409*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(pd_1567)]
7410*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(pd_3784)]
7411*c0909341SAndroid Build Coastguard Worker.main_part2_loop:
7412*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3-16*32] ; t32a
7413*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4-16*24] ; t39a
7414*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r4-16*32] ; t63a
7415*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3-16*24] ; t56a
7416*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3-16*16] ; t40a
7417*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r4-16* 8] ; t47a
7418*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4-16*16] ; t55a
7419*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3-16* 8] ; t48a
7420*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m1 ; t39
7421*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; t32
7422*c0909341SAndroid Build Coastguard Worker    psubd                m1, m2, m3 ; t56
7423*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3     ; t63
7424*c0909341SAndroid Build Coastguard Worker    psubd                m3, m5, m4 ; t40
7425*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t47
7426*c0909341SAndroid Build Coastguard Worker    psubd                m4, m7, m6 ; t55
7427*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6     ; t48
7428*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m1, m3, m4 ; clamp to [m12, m13] ...
7429*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m1, m3, m4 ; ... before the values are multiplied
7430*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 8, 6, 9, _, 11, 10, 15    ; t39a, t56a
7431*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
7432*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m0, m2, m5, m7
7433*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m0, m5, m2, m7
7434*c0909341SAndroid Build Coastguard Worker    psubd                m6, m2, m7 ; t48a
7435*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7     ; t63a
7436*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m5 ; t47a
7437*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5     ; t32a
7438*c0909341SAndroid Build Coastguard Worker    psubd                m5, m8, m4 ; t55
7439*c0909341SAndroid Build Coastguard Worker    paddd                m8, m4     ; t56
7440*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m3 ; t40
7441*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3     ; t39
7442*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m6, m7, m5, m4
7443*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m6, m7, m5, m4
7444*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m6, m7, m5, m4 ; scale by m14 (the x86-32 path uses pd_2896 here)
7445*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m2, m0, m8, m1
7446*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m2, m0, m8, m1
7447*c0909341SAndroid Build Coastguard Worker    paddd                m6, m11 ; rounding is added to one operand of each pair only,
7448*c0909341SAndroid Build Coastguard Worker    paddd                m5, m11 ; so both the sum and the difference below carry it once
7449*c0909341SAndroid Build Coastguard Worker    psubd                m3, m6, m7 ; t47
7450*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7     ; t48
7451*c0909341SAndroid Build Coastguard Worker    psubd                m7, m5, m4 ; t40a
7452*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t55a
7453*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m3, m6, m7, m5
7454*c0909341SAndroid Build Coastguard Worker    mova         [r4-16* 8], m2
7455*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*32], m0
7456*c0909341SAndroid Build Coastguard Worker    mova         [r3-16* 8], m8
7457*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*32], m1
7458*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*24], m3
7459*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*16], m6
7460*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*24], m7
7461*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*16], m5
7462*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: same computation using only m0-m7; [r3+0*16 .. r3+3*16] serves
    ; as scratch space within each iteration.
7463*c0909341SAndroid Build Coastguard Worker.main_part2_loop:
7464*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3-16*32] ; t32a
7465*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4-16*24] ; t39a
7466*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r4-16*32] ; t63a
7467*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3-16*24] ; t56a
7468*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3-16*16] ; t40a
7469*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r4-16* 8] ; t47a
7470*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4-16*16] ; t55a
7471*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m1 ; t39
7472*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; t32
7473*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m7 ; spill t39, m7 is needed for the t48a load
7474*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3-16* 8] ; t48a
7475*c0909341SAndroid Build Coastguard Worker    psubd                m1, m2, m3 ; t56
7476*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3     ; t63
7477*c0909341SAndroid Build Coastguard Worker    psubd                m3, m5, m4 ; t40
7478*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t47
7479*c0909341SAndroid Build Coastguard Worker    psubd                m4, m7, m6 ; t55
7480*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6     ; t48
7481*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(clip_18b_min)]
7482*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
7483*c0909341SAndroid Build Coastguard Worker    pmaxsd               m6, [r3+0*16] ; m6 = max(clip_min, spilled t39): clamps the 8th value without a free register
7484*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m6
7485*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(clip_18b_max)]
7486*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
7487*c0909341SAndroid Build Coastguard Worker    pminsd               m6, [r3+0*16] ; m6 = fully clipped t39
7488*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m0
7489*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m2
7490*c0909341SAndroid Build Coastguard Worker    mova          [r3+2*16], m5
7491*c0909341SAndroid Build Coastguard Worker    mova          [r3+3*16], m7
7492*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2048)]
7493*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 6, 2, 5, 7, 0, 1567, 3784    ; t39a, t56a
7494*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 3, 2, 5, _, 0,    7, 3784, 4 ; t55a, t40a
7495*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+1*16]
7496*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+3*16]
7497*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m7 ; t48a
7498*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7     ; t63a
7499*c0909341SAndroid Build Coastguard Worker    mova          [r3+1*16], m5 ; spill t48a
7500*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
7501*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+2*16]
7502*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m5 ; t47a
7503*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5     ; t32a
7504*c0909341SAndroid Build Coastguard Worker    psubd                m5, m6, m4 ; t55
7505*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4     ; t56
7506*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m3 ; t40
7507*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3     ; t39
7508*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_min)]
7509*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
7510*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, [r3+1*16] ; m3 = max(clip_min, spilled t48a)
7511*c0909341SAndroid Build Coastguard Worker    mova          [r3+0*16], m3
7512*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(clip_18b_max)]
7513*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
7514*c0909341SAndroid Build Coastguard Worker    pminsd               m3, [r3+0*16] ; m3 = fully clipped t48a
7515*c0909341SAndroid Build Coastguard Worker    mova         [r4-16* 8], m2
7516*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*32], m0
7517*c0909341SAndroid Build Coastguard Worker    mova         [r3-16* 8], m6
7518*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*32], m1
7519*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(pd_2896)]
7520*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(pd_2048)]
7521*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m0}, m3, m7, m5, m4
7522*c0909341SAndroid Build Coastguard Worker    REPX     {paddd  x, m1}, m3, m5 ; rounding added to one operand of each pair only
7523*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m7 ; t47
7524*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7     ; t48
7525*c0909341SAndroid Build Coastguard Worker    psubd                m7, m5, m4 ; t40a
7526*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t55a
7527*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m6, m3, m7, m5
7528*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*24], m6
7529*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*16], m3
7530*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*24], m7
7531*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*16], m5
7532*c0909341SAndroid Build Coastguard Worker%endif
7533*c0909341SAndroid Build Coastguard Worker    add                  r3, 16 ; next vector from the low end
7534*c0909341SAndroid Build Coastguard Worker    sub                  r4, 16 ; next vector from the high end
7535*c0909341SAndroid Build Coastguard Worker    cmp                  r3, r4
7536*c0909341SAndroid Build Coastguard Worker    jl .main_part2_loop ; repeat until the two pointers cross (signed compare)
7537*c0909341SAndroid Build Coastguard Worker    sub                  r3, 4*16 ; undo the four increments: r3 is back at its entry value
7538*c0909341SAndroid Build Coastguard Worker    ret
7539*c0909341SAndroid Build Coastguard Worker
; .main_end_loop: final butterflies. Per iteration it combines the idct8,
; idct16, idct32 and idct64 partial results named in the load comments and
; writes eight idct64 outputs back into the same buffer, unshifted.
; r3 walks upward and r4 downward by one vector per iteration; the loop ends
; once r3 >= r4 (signed compare).
; .main_end_loop_start is a second entry point for the first iteration: it
; skips the m0 load, so the caller must already have "idct8 0 + n" in m0.
; Caller-supplied state: on x86-64 the clip bounds in m12/m13 and the constant
; in m15; on x86-32 the constant in m7 (clip bounds are loaded locally). The
; constant is added to the idct32 outputs before the last butterfly --
; presumably the rounding offset for a shift applied later; confirm in caller.
7540*c0909341SAndroid Build Coastguard Worker.main_end_loop:
7541*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+16*28] ; idct8  0  + n
7542*c0909341SAndroid Build Coastguard Worker.main_end_loop_start:
7543*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+16*12] ; idct32 16 + n
7544*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r4+16*12] ; idct32 31 - n
7545*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7546*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4+16*28] ; idct16 15 - n
7547*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r4-16* 4] ; idct64 63 - n
7548*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3-16* 4] ; idct64 48 + n
7549*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4-16*20] ; idct64 47 - n
7550*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3-16*20] ; idct64 32 + n
7551*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m12
7552*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m13
7553*c0909341SAndroid Build Coastguard Worker    paddd                m8, m0, m1     ; idct16 out0  + n
7554*c0909341SAndroid Build Coastguard Worker    psubd                m0, m1         ; idct16 out15 - n
7555*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m8, m0
7556*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m8, m0
7557*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8, m3     ; idct32 out0  + n
7558*c0909341SAndroid Build Coastguard Worker    psubd                m8, m3         ; idct32 out31 - n
7559*c0909341SAndroid Build Coastguard Worker    paddd                m3, m0, m2     ; idct32 out15 - n
7560*c0909341SAndroid Build Coastguard Worker    psubd                m0, m2         ; idct32 out16 + n
7561*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m12}, m1, m8, m3, m0
7562*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m13}, m1, m3, m8, m0
7563*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m15}, m1, m3, m0, m8 ; add the caller-supplied constant once, ahead of the last butterfly
7564*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, m4     ; idct64 out0  + n (unshifted)
7565*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4         ; idct64 out63 - n (unshifted)
7566*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3, m5     ; idct64 out15 - n (unshifted)
7567*c0909341SAndroid Build Coastguard Worker    psubd                m3, m5         ; idct64 out48 + n (unshifted)
7568*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
7569*c0909341SAndroid Build Coastguard Worker    psubd                m0, m6         ; idct64 out47 - n (unshifted)
7570*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8, m7     ; idct64 out31 - n (unshifted)
7571*c0909341SAndroid Build Coastguard Worker    psubd                m8, m7         ; idct64 out32 + n (unshifted)
7572*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*20], m2
7573*c0909341SAndroid Build Coastguard Worker    mova         [r4+16*28], m1
7574*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*20], m4
7575*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*28], m3
7576*c0909341SAndroid Build Coastguard Worker    mova         [r3-16* 4], m5
7577*c0909341SAndroid Build Coastguard Worker    mova         [r4+16*12], m0
7578*c0909341SAndroid Build Coastguard Worker    mova         [r4-16* 4], m6
7579*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*12], m8
7580*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: same computation using only m0-m7; the idct64 inputs are loaded
    ; late and results are stored as soon as they are ready to free registers.
7581*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(clip_18b_min)]
7582*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(clip_18b_max)]
7583*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+16*44] ; idct16 15 - n (NOTE(review): the x86-64 path reads this from [r4+16*28] -- confirm the x86-32 buffer layout)
7584*c0909341SAndroid Build Coastguard Worker    pmaxsd               m0, m5
7585*c0909341SAndroid Build Coastguard Worker    pminsd               m0, m6
7586*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0, m1     ; idct16 out0  + n
7587*c0909341SAndroid Build Coastguard Worker    psubd                m0, m1         ; idct16 out15 - n
7588*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m5}, m4, m0
7589*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m6}, m4, m0
7590*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4, m3     ; idct32 out0  + n
7591*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3         ; idct32 out31 - n
7592*c0909341SAndroid Build Coastguard Worker    paddd                m3, m0, m2     ; idct32 out15 - n
7593*c0909341SAndroid Build Coastguard Worker    psubd                m0, m2         ; idct32 out16 + n
7594*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsd x, m5}, m1, m4, m3, m0
7595*c0909341SAndroid Build Coastguard Worker    REPX     {pminsd x, m6}, m1, m3, m4, m0
7596*c0909341SAndroid Build Coastguard Worker    REPX     {paddd  x, m7}, m1, m3, m0, m4 ; add the caller-supplied constant (m7 is never written in this path)
7597*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r4-16* 4] ; idct64 63 - n
7598*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3-16* 4] ; idct64 48 + n
7599*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, m5     ; idct64 out0  + n (unshifted)
7600*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5         ; idct64 out63 - n (unshifted)
7601*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3, m6     ; idct64 out15 - n (unshifted)
7602*c0909341SAndroid Build Coastguard Worker    psubd                m3, m6         ; idct64 out48 + n (unshifted)
7603*c0909341SAndroid Build Coastguard Worker    mova         [r4+16*28], m1
7604*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*28], m3
7605*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4-16*20] ; idct64 47 - n
7606*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3-16*20] ; idct64 32 + n
7607*c0909341SAndroid Build Coastguard Worker    mova         [r3-16*20], m2 ; safe to overwrite: the old value was loaded into m1 just above
7608*c0909341SAndroid Build Coastguard Worker    mova         [r4-16*20], m5 ; safe to overwrite: the old value was loaded into m6 just above
7609*c0909341SAndroid Build Coastguard Worker    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
7610*c0909341SAndroid Build Coastguard Worker    psubd                m0, m6         ; idct64 out47 - n (unshifted)
7611*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4, m1     ; idct64 out31 - n (unshifted)
7612*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1         ; idct64 out32 + n (unshifted)
7613*c0909341SAndroid Build Coastguard Worker    mova         [r3-16* 4], m5
7614*c0909341SAndroid Build Coastguard Worker    mova         [r4+16*12], m0
7615*c0909341SAndroid Build Coastguard Worker    mova         [r4-16* 4], m6
7616*c0909341SAndroid Build Coastguard Worker    mova         [r3+16*12], m4
7617*c0909341SAndroid Build Coastguard Worker%endif
7618*c0909341SAndroid Build Coastguard Worker    sub                  r4, 16 ; next vector from the high end
7619*c0909341SAndroid Build Coastguard Worker    add                  r3, 16 ; next vector from the low end
7620*c0909341SAndroid Build Coastguard Worker    cmp                  r3, r4
7621*c0909341SAndroid Build Coastguard Worker    jl .main_end_loop ; repeat until the two pointers meet or cross (signed compare)
7622*c0909341SAndroid Build Coastguard Worker    ret
7623*c0909341SAndroid Build Coastguard Worker
; .shift_transpose: convert groups of eight 16-byte dword vectors into
; packed, transposed word vectors.
;   In:  r3 = address of the highest 8*16-byte input group to process
;        r4 = address where that group's output is written
;   Both pointers step downwards; the loop repeats while r3 > rsp.
;   Uses m0-m7. Anything else touched by the transpose helper is not
;   visible in this chunk.
7624*c0909341SAndroid Build Coastguard Worker.shift_transpose:
7625*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
7626*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+1*16]
7627*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+2*16]
7628*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+3*16]
7629*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+4*16]
7630*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+5*16]
7631*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+6*16]
7632*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+7*16]
; arithmetic shift of every dword right by 2
7633*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
; narrow dwords to signed words with saturation: 8 registers -> 4
7634*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
7635*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
7636*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
7637*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
7638*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
; the four transposed registers are stored 64 bytes apart
7639*c0909341SAndroid Build Coastguard Worker    mova          [r4+0*64], m0
7640*c0909341SAndroid Build Coastguard Worker    mova          [r4+1*64], m1
7641*c0909341SAndroid Build Coastguard Worker    mova          [r4+2*64], m2
7642*c0909341SAndroid Build Coastguard Worker    mova          [r4+3*64], m3
; step both pointers down to the previous group
7643*c0909341SAndroid Build Coastguard Worker    sub                  r4, 4*64
7644*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16
7645*c0909341SAndroid Build Coastguard Worker    cmp                  r3, rsp
7646*c0909341SAndroid Build Coastguard Worker    jg .shift_transpose
7647*c0909341SAndroid Build Coastguard Worker    ret
7648*c0909341SAndroid Build Coastguard Worker
; DC-only path: derive one 16-bit offset from the coefficient at [cq], add
; it to every pixel of a 64-pixel-wide (128 bytes per row) block at dstq,
; and clamp each result to [0, pixel_10bpc_max].
;   .dconly  : r5d = [cq] * 181, eobd is written back to [cq], r3d = 16 rows.
;   .dconly1 : label reached with r5d and r3d already set; applies
;              r5d = (r5d + 640) >> 10. Callers other than the fall-through
;              are not visible in this chunk.
;   .dconly2 : label reached with r5d already scaled and r3d = row count
;              (the 64x32 .dconly further down jumps here).
7649*c0909341SAndroid Build Coastguard Worker.dconly:
7650*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
7651*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
7652*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16
7653*c0909341SAndroid Build Coastguard Worker.dconly1:
7654*c0909341SAndroid Build Coastguard Worker    add                 r5d, 640
7655*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 10
7656*c0909341SAndroid Build Coastguard Worker.dconly2:
; r5d = r5d * 2896 + 34816; only bits 16..31 of the result are used below
7657*c0909341SAndroid Build Coastguard Worker    imul                r5d, 2896
7658*c0909341SAndroid Build Coastguard Worker    add                 r5d, 34816
7659*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5d
; q1111 replicates word 1 (the high half of r5d) into the four low words;
; punpcklqdq then copies those into the upper half of the register
7660*c0909341SAndroid Build Coastguard Worker    pshuflw              m0, m0, q1111
7661*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0
; m6 = upper clamp, m5 = 0 = lower clamp
7662*c0909341SAndroid Build Coastguard Worker    mova                 m6, [o(pixel_10bpc_max)]
7663*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
; each pass handles 64 bytes (32 words) of the current row
7664*c0909341SAndroid Build Coastguard Worker.dconly_loop:
7665*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0, [dstq+16*0]
7666*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0, [dstq+16*1]
7667*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0, [dstq+16*2]
7668*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0, [dstq+16*3]
7669*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
7670*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m6}, m1, m2, m3, m4
7671*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m1
7672*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m2
7673*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m3
7674*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m4
7675*c0909341SAndroid Build Coastguard Worker    add                dstq, 64
; bit 16 of r3d is a half-row toggle: btc flips it and returns the old
; value in CF, so the first half of a row loops back (CF=0) and the second
; half falls through (CF=1), leaving the toggle bit clear again
7676*c0909341SAndroid Build Coastguard Worker    btc                 r3d, 16
7677*c0909341SAndroid Build Coastguard Worker    jnc .dconly_loop
; rewind the 128 bytes just written and advance by one stride to the next row
7678*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq-128]
; low bits of r3d = rows remaining
7679*c0909341SAndroid Build Coastguard Worker    dec                 r3d
7680*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
7681*c0909341SAndroid Build Coastguard Worker    RET
7682*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_64x32_16bpc(dst, stride, c, eob)
; Declared via cglobal with 4 arguments, 7 GPRs, 16 vector registers and a
; stack area sized by the expression on the continuation line.
; Flow visible in this block:
;   1. eob == 0 branches to .dconly.
;   2. Iterations whose input is entirely zero (decided by comparing eob with
;      tbl_32x32_2d) are skipped; their slots in the pass-2 buffer are
;      zero-filled instead.
;   3. .loop_pass1 runs once per remaining r5 value (stepping down by 2 to 0):
;      it loads 16-byte slices of cq, scales them with .rect2_mul_fast, runs
;      the shared transform helpers, then shifts/packs/transposes the result
;      into the pass-2 buffer and clears the consumed slices of cq.
;   4. Pass 2 picks an 8bpc SSSE3 idct_8x32 entry point based on eob and
;      tail-jumps into inv_txfm_add_dct_dct_16x32_16bpc.loop_pass2_entry.
7683*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
7684*c0909341SAndroid Build Coastguard Worker                                         0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
7685*c0909341SAndroid Build Coastguard Worker                                         dst, stride, c, eob
7686*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
7687*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
7688*c0909341SAndroid Build Coastguard Worker    jz .dconly
7689*c0909341SAndroid Build Coastguard Worker
; Temporaries t0-t2. On x86-32 they alias r0 (dst) and r1 (stride), so both
; arguments are spilled to the stack first. On x86-64 they are r4/r7/r8, and
; on WIN64 r7/r8 are saved by hand (presumably because they are callee-saved
; there and lie outside the 7 GPRs declared to cglobal - TODO confirm).
7690*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7691*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 0, 4, 1
7692*c0909341SAndroid Build Coastguard Worker    mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
7693*c0909341SAndroid Build Coastguard Worker    mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
7694*c0909341SAndroid Build Coastguard Worker%else
7695*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4, 7, 8
7696*c0909341SAndroid Build Coastguard Worker%if WIN64
7697*c0909341SAndroid Build Coastguard Worker    mov [rsp+(8*32+64+1)*16+1*gprsize], r7
7698*c0909341SAndroid Build Coastguard Worker    mov [rsp+64*16+0*gprsize], r8
7699*c0909341SAndroid Build Coastguard Worker%endif
7700*c0909341SAndroid Build Coastguard Worker%endif
7701*c0909341SAndroid Build Coastguard Worker%undef cmp
7702*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
; r5 is a byte offset into the word table tbl_32x32_2d. It starts at 14 and
; steps down by 2 while eob is below the table entry.
7703*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 14
7704*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7705*c0909341SAndroid Build Coastguard Worker    jge .end_zero_loop
7706*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
7707*c0909341SAndroid Build Coastguard Worker.zero_loop:
; t1 = low byte, t0 = high byte of the tbl_Nx32_odd_offset entry for r5
7708*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
7709*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
7710*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
7711*c0909341SAndroid Build Coastguard Worker    lea                  t2, [rsp+7*32*16]
; Zero the four 16-byte slots that .shift_transpose would have written for
; this r5, in each of the 8 buffer rows (32*16 bytes apart, walking t2 down
; to rsp).
7712*c0909341SAndroid Build Coastguard Worker.zero_loop_inner:
7713*c0909341SAndroid Build Coastguard Worker    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
7714*c0909341SAndroid Build Coastguard Worker    mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
7715*c0909341SAndroid Build Coastguard Worker    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
7716*c0909341SAndroid Build Coastguard Worker    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
7717*c0909341SAndroid Build Coastguard Worker    sub                  t2, 32*16
7718*c0909341SAndroid Build Coastguard Worker    cmp                  t2, rsp
7719*c0909341SAndroid Build Coastguard Worker    jge .zero_loop_inner
7720*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
7721*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7722*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
7723*c0909341SAndroid Build Coastguard Worker.end_zero_loop:
; keep eob on the stack; it is reloaded when pass 2 starts
7724*c0909341SAndroid Build Coastguard Worker    mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
7725*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
7726*c0909341SAndroid Build Coastguard Worker.loop_pass1:
; x86-64 keeps constants in m11-m14. .rect2_mul_fast reads m11 and m14;
; m12/m13 are presumably consumed by the shared helpers called below.
7727*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7728*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
7729*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
7730*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
7731*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
7732*c0909341SAndroid Build Coastguard Worker%endif
7733*c0909341SAndroid Build Coastguard Worker
; r3 = scratch pointer (start of the stack area), r4 = idct64_mul_16bpc.
; Every load below takes 16 bytes at offset r5*8 from cq+128*k. The four
; groups k = {1,31,17,15}, {7,25,23,9}, {5,27,21,11}, {3,29,19,13} are each
; scaled by .rect2_mul_fast and fed to the 64x16 main_part1 helper.
7734*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
7735*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(idct64_mul_16bpc)]
7736*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 1+r5*8]
7737*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*31+r5*8]
7738*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*17+r5*8]
7739*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*15+r5*8]
7740*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7741*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7742*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 7+r5*8]
7743*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*25+r5*8]
7744*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*23+r5*8]
7745*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128* 9+r5*8]
7746*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7747*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7748*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 5+r5*8]
7749*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*27+r5*8]
7750*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*21+r5*8]
7751*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*11+r5*8]
7752*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7753*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7754*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 3+r5*8]
7755*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*29+r5*8]
7756*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*19+r5*8]
7757*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*13+r5*8]
7758*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7759*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7760*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
7761*c0909341SAndroid Build Coastguard Worker
; groups k = {2,14,18,30} and {6,10,22,26} go to the 32x8 odd-half helpers
7762*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 2+r5*8]
7763*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*14+r5*8]
7764*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*18+r5*8]
7765*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*30+r5*8]
7766*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7767*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
7768*c0909341SAndroid Build Coastguard Worker
7769*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 6+r5*8]
7770*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*10+r5*8]
7771*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*22+r5*8]
7772*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*26+r5*8]
7773*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7774*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
7775*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*(24+4*ARCH_X86_32)
7776*c0909341SAndroid Build Coastguard Worker
; group k = {4,12,20,28} goes to the 16x4 odd-half helper
7777*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 4+r5*8]
7778*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*12+r5*8]
7779*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*20+r5*8]
7780*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*28+r5*8]
7781*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7782*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
7783*c0909341SAndroid Build Coastguard Worker
; group k = {0,8,16,24} goes to the 8x4 pass-1 helper and its rounding step
7784*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0+r5*8]
7785*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 8+r5*8]
7786*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*16+r5*8]
7787*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*24+r5*8]
7788*c0909341SAndroid Build Coastguard Worker    call .rect2_mul_fast
7789*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1_fast
7790*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
; m1-m7 are spilled just below r3. m0 is not stored here; presumably it is
; consumed in-register by main_end_loop_start (not visible in this chunk).
7791*c0909341SAndroid Build Coastguard Worker    mova [r3-(7+4*ARCH_X86_32)*16], m1
7792*c0909341SAndroid Build Coastguard Worker    mova [r3-(6+4*ARCH_X86_32)*16], m2
7793*c0909341SAndroid Build Coastguard Worker    mova [r3-(5+4*ARCH_X86_32)*16], m3
7794*c0909341SAndroid Build Coastguard Worker    mova [r3-(4+4*ARCH_X86_32)*16], m4
7795*c0909341SAndroid Build Coastguard Worker    mova [r3-(3+4*ARCH_X86_32)*16], m5
7796*c0909341SAndroid Build Coastguard Worker    mova [r3-(2+4*ARCH_X86_32)*16], m6
7797*c0909341SAndroid Build Coastguard Worker    mova [r3-(1+4*ARCH_X86_32)*16], m7
7798*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*(40+4*ARCH_X86_32-4)
7799*c0909341SAndroid Build Coastguard Worker
; Constant 1 in every dword: derived from m11 (2048 >> 11) on x86-64,
; loaded from memory into m7 on x86-32.
7800*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7801*c0909341SAndroid Build Coastguard Worker    psrld               m15, m11, 11 ; pd_1
7802*c0909341SAndroid Build Coastguard Worker%else
7803*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_1)]
7804*c0909341SAndroid Build Coastguard Worker%endif
7805*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
7806*c0909341SAndroid Build Coastguard Worker
; Set up .shift_transpose: r3 = highest 8-register group of the scratch
; area, t2 = last row of the pass-2 buffer, t0/t1 = high/low byte of the
; tbl_Nx32_odd_offset entry for this r5.
7807*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+56*16]
7808*c0909341SAndroid Build Coastguard Worker    lea                  t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
7809*c0909341SAndroid Build Coastguard Worker    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
7810*c0909341SAndroid Build Coastguard Worker    movzx               t1d, t0b
7811*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 8
7812*c0909341SAndroid Build Coastguard Worker    call .shift_transpose
7813*c0909341SAndroid Build Coastguard Worker    ; zero cq
; Clears the 16 bytes at offset r5*8 in each of the 32 groups
; cq+128*0 .. cq+128*31, four groups per iteration, from the top down.
7814*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
7815*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+30*128+r5*8]
7816*c0909341SAndroid Build Coastguard Worker.zero_cq_loop:
7817*c0909341SAndroid Build Coastguard Worker    REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
7818*c0909341SAndroid Build Coastguard Worker    sub                  r4, 4*128
7819*c0909341SAndroid Build Coastguard Worker    cmp                  r4, cq
7820*c0909341SAndroid Build Coastguard Worker    jg .zero_cq_loop
; next slice; the loop ends once r5 goes negative
7821*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
7822*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
7823*c0909341SAndroid Build Coastguard Worker
7824*c0909341SAndroid Build Coastguard Worker    ; pass=2 code starts here
; reload eob, then restore stride (x86-32) or r8 (WIN64) from their slots
7825*c0909341SAndroid Build Coastguard Worker    mov                eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
7826*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7827*c0909341SAndroid Build Coastguard Worker    mov             strideq, [rsp+gprsize*2+(8*32+64+8)*16]
7828*c0909341SAndroid Build Coastguard Worker%elif WIN64
7829*c0909341SAndroid Build Coastguard Worker    mov                  r8, [rsp+gprsize*0+64*16]
7830*c0909341SAndroid Build Coastguard Worker%endif
; Advance rsp so the pass-2 buffer now starts at rsp+3*16 (presumably the
; layout the 16x32 pass-2 code expects - TODO confirm against that function).
7831*c0909341SAndroid Build Coastguard Worker    add                 rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
; r4 = 8bpc idct_8x32 entry point chosen by eob:
;   eob < 36 -> main_veryfast, eob < 136 -> main_fast, otherwise main
7832*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
7833*c0909341SAndroid Build Coastguard Worker    jl .load_veryfast
7834*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
7835*c0909341SAndroid Build Coastguard Worker    jl .load_fast
7836*c0909341SAndroid Build Coastguard Worker    ; load normal
7837*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
7838*c0909341SAndroid Build Coastguard Worker    jmp .run
7839*c0909341SAndroid Build Coastguard Worker.load_fast:
7840*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
7841*c0909341SAndroid Build Coastguard Worker    jmp .run
7842*c0909341SAndroid Build Coastguard Worker.load_veryfast:
7843*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
7844*c0909341SAndroid Build Coastguard Worker    ; fall-through
7845*c0909341SAndroid Build Coastguard Worker.run:
; x86-64: r2 = dstq+128 and r7 = -16. x86-32: r2 points just past the
; pass-2 buffer (the slot that held eob) and the value 8 is stored there.
; How these are used is defined by loop_pass2_entry, which is not visible
; in this chunk.
7846*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7847*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+128]
7848*c0909341SAndroid Build Coastguard Worker    mov                  r7, -16
7849*c0909341SAndroid Build Coastguard Worker%else
7850*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+(8*32+3)*16]
7851*c0909341SAndroid Build Coastguard Worker    mov dword [r2+0*gprsize], 8
7852*c0909341SAndroid Build Coastguard Worker%endif
7853*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
7854*c0909341SAndroid Build Coastguard Worker
; .rect2_mul_fast: scale every dword of m0-m3 as (x * 2896 + 2048) >> 12,
; using an arithmetic shift.
;   x86-64: reads the constants from m14 (pd_2896) and m11 (pd_2048), which
;           .loop_pass1 loads before calling; no other register is modified.
;   x86-32: loads the constants itself, overwriting m4 and m5.
7855*c0909341SAndroid Build Coastguard Worker.rect2_mul_fast:
7856*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7857*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m14}, m0, m1, m2, m3
7858*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m11}, m0, m1, m2, m3
7859*c0909341SAndroid Build Coastguard Worker%else
7860*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(pd_2896)]
7861*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(pd_2048)]
7862*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m4 }, m0, m1, m2, m3
7863*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m5 }, m0, m1, m2, m3
7864*c0909341SAndroid Build Coastguard Worker%endif
7865*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m0, m1, m2, m3
7866*c0909341SAndroid Build Coastguard Worker    ret
7867*c0909341SAndroid Build Coastguard Worker
; .shift_transpose (64x32 variant): convert groups of eight 16-byte dword
; vectors into packed, transposed word vectors and scatter them into the
; pass-2 buffer.
;   In:  r3 = address of the highest 8*16-byte input group to process
;        t2 = base of the destination row for that group
;        r5, t0, t1 = slot indices (scaled by 8) within a destination row
;   r3 steps down by 8*16 and t2 by 16*32 per iteration; the loop repeats
;   while r3 > rsp.
;   Uses m0-m7. Anything else touched by the transpose helper is not
;   visible in this chunk.
7868*c0909341SAndroid Build Coastguard Worker.shift_transpose:
7869*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
7870*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+1*16]
7871*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+2*16]
7872*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+3*16]
7873*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+4*16]
7874*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+5*16]
7875*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+6*16]
7876*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+7*16]
; arithmetic shift of every dword right by 1 (the 64x16 variant shifts by 2)
7877*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
; narrow dwords to signed words with saturation: 8 registers -> 4
7878*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1
7879*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
7880*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
7881*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
7882*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed
; Note the register order: m0 and m2 go to the r5-indexed slots (the second
; one 8*16 bytes further on), m3 to the t0 slot and m1 to the t1 slot.
; These are the same four slots that .zero_loop_inner clears for skipped
; iterations.
7883*c0909341SAndroid Build Coastguard Worker    mova     [t2+0*16+r5*8], m0
7884*c0909341SAndroid Build Coastguard Worker    mova     [t2+8*16+r5*8], m2
7885*c0909341SAndroid Build Coastguard Worker    mova     [t2+0*16+t0*8], m3
7886*c0909341SAndroid Build Coastguard Worker    mova     [t2+0*16+t1*8], m1
7887*c0909341SAndroid Build Coastguard Worker    sub                  t2, 16*32
7888*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16
7889*c0909341SAndroid Build Coastguard Worker    cmp                  r3, rsp
7890*c0909341SAndroid Build Coastguard Worker    jg .shift_transpose
7891*c0909341SAndroid Build Coastguard Worker    ret
7892*c0909341SAndroid Build Coastguard Worker
; .dconly (64x32): reached from the function entry when eob == 0.
; Computes r5d = ((([cq] * 181 + 128) >> 8) * 181 + 384) >> 9, sets the row
; count r3d = 32, and tail-jumps into the 64x16 .dconly2 code, which performs
; the add-and-clamp over the destination rows.
7893*c0909341SAndroid Build Coastguard Worker.dconly:
7894*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181
; eobd is 0 on this path, so this clears the coefficient
7895*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
7896*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 32
7897*c0909341SAndroid Build Coastguard Worker    add                 r5d, 128
7898*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 8
7899*c0909341SAndroid Build Coastguard Worker    imul                r5d, 181
7900*c0909341SAndroid Build Coastguard Worker    add                 r5d, 384
7901*c0909341SAndroid Build Coastguard Worker    sar                 r5d, 9
; Release part of this function's stack allocation before the jump,
; presumably so the remaining frame matches what the 64x16 function's
; epilogue (RET) unwinds - TODO confirm against the 64x16 cglobal line.
7902*c0909341SAndroid Build Coastguard Worker    add                 rsp, (1+8*32+1*WIN64)*16
7903*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
7904*c0909341SAndroid Build Coastguard Worker
7905*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
7906*c0909341SAndroid Build Coastguard Worker                                         0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
7907*c0909341SAndroid Build Coastguard Worker                                         dst, stride, c, eob
7908*c0909341SAndroid Build Coastguard Worker    LEA                  r6, base
7909*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
7910*c0909341SAndroid Build Coastguard Worker    jz .dconly
7911*c0909341SAndroid Build Coastguard Worker
7912*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7913*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4, 1, 2, 0, 6
7914*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+(64*9+8)*16], r0
7915*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*2+(64*9+8)*16], r1
7916*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*9+8)*16], r2
7917*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*4+(64*9+8)*16], r6
7918*c0909341SAndroid Build Coastguard Worker%else
7919*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8, 9, 4, 7, 0
7920*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*1+(64*9+1)*16], r9
7921*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*0+64*16], r0
7922*c0909341SAndroid Build Coastguard Worker%if WIN64
7923*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*2+(64*9+1)*16], r7
7924*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*3+(64*9+1)*16], r8
7925*c0909341SAndroid Build Coastguard Worker%endif
7926*c0909341SAndroid Build Coastguard Worker%endif
7927*c0909341SAndroid Build Coastguard Worker%undef cmp
7928*c0909341SAndroid Build Coastguard Worker
7929*c0909341SAndroid Build Coastguard Worker    ; remove entirely-zero iterations
7930*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 14
7931*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7932*c0909341SAndroid Build Coastguard Worker    jge .end_zero_loop
7933*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
7934*c0909341SAndroid Build Coastguard Worker.zero_loop:
7935*c0909341SAndroid Build Coastguard Worker    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
7936*c0909341SAndroid Build Coastguard Worker    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
7937*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t1b
7938*c0909341SAndroid Build Coastguard Worker    movzx               t2d, t3b
7939*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 8
7940*c0909341SAndroid Build Coastguard Worker    shr                 t3d, 8
7941*c0909341SAndroid Build Coastguard Worker    lea                  t4, [rsp+7*64*16]
7942*c0909341SAndroid Build Coastguard Worker.zero_loop_inner:
7943*c0909341SAndroid Build Coastguard Worker    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
7944*c0909341SAndroid Build Coastguard Worker    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
7945*c0909341SAndroid Build Coastguard Worker    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
7946*c0909341SAndroid Build Coastguard Worker    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
7947*c0909341SAndroid Build Coastguard Worker    sub                  t4, 64*16
7948*c0909341SAndroid Build Coastguard Worker    cmp                  t4, rsp
7949*c0909341SAndroid Build Coastguard Worker    jge .zero_loop_inner
7950*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7951*c0909341SAndroid Build Coastguard Worker    mov                  r6, [rsp+gprsize*4+(64*9+8)*16]
7952*c0909341SAndroid Build Coastguard Worker%endif
7953*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
7954*c0909341SAndroid Build Coastguard Worker    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7955*c0909341SAndroid Build Coastguard Worker    jl .zero_loop
7956*c0909341SAndroid Build Coastguard Worker.end_zero_loop:
7957*c0909341SAndroid Build Coastguard Worker    mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
7958*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7959*c0909341SAndroid Build Coastguard Worker    mov                  cq, [rsp+gprsize*3+(64*9+8)*16]
7960*c0909341SAndroid Build Coastguard Worker%endif
7961*c0909341SAndroid Build Coastguard Worker    ; actual first pass after skipping all-zero data
7962*c0909341SAndroid Build Coastguard Worker.loop_pass1:
7963*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7964*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(pd_2048)]
7965*c0909341SAndroid Build Coastguard Worker    mova                m12, [o(clip_18b_min)]
7966*c0909341SAndroid Build Coastguard Worker    mova                m13, [o(clip_18b_max)]
7967*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(pd_2896)]
7968*c0909341SAndroid Build Coastguard Worker%endif
7969*c0909341SAndroid Build Coastguard Worker
7970*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
7971*c0909341SAndroid Build Coastguard Worker    lea                  r4, [o(idct64_mul_16bpc)]
7972*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 1+r5*8]
7973*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*31+r5*8]
7974*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*17+r5*8]
7975*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*15+r5*8]
7976*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7977*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 7+r5*8]
7978*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*25+r5*8]
7979*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*23+r5*8]
7980*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128* 9+r5*8]
7981*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7982*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 5+r5*8]
7983*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*27+r5*8]
7984*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*21+r5*8]
7985*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*11+r5*8]
7986*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7987*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 3+r5*8]
7988*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*29+r5*8]
7989*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*19+r5*8]
7990*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*13+r5*8]
7991*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7992*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
7993*c0909341SAndroid Build Coastguard Worker
7994*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 2+r5*8]
7995*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*14+r5*8]
7996*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*18+r5*8]
7997*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*30+r5*8]
7998*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
7999*c0909341SAndroid Build Coastguard Worker
8000*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 6+r5*8]
8001*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*10+r5*8]
8002*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*22+r5*8]
8003*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*26+r5*8]
8004*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
8005*c0909341SAndroid Build Coastguard Worker    add                  r3, 16*(24+4*ARCH_X86_32)
8006*c0909341SAndroid Build Coastguard Worker
8007*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 4+r5*8]
8008*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*12+r5*8]
8009*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*20+r5*8]
8010*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*28+r5*8]
8011*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
8012*c0909341SAndroid Build Coastguard Worker
8013*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0+r5*8]
8014*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 8+r5*8]
8015*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*16+r5*8]
8016*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*24+r5*8]
8017*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).main_pass1_fast
8018*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).round
8019*c0909341SAndroid Build Coastguard Worker    mova [r3-(7+4*ARCH_X86_32)*16], m1
8020*c0909341SAndroid Build Coastguard Worker    mova [r3-(6+4*ARCH_X86_32)*16], m2
8021*c0909341SAndroid Build Coastguard Worker    mova [r3-(5+4*ARCH_X86_32)*16], m3
8022*c0909341SAndroid Build Coastguard Worker    mova [r3-(4+4*ARCH_X86_32)*16], m4
8023*c0909341SAndroid Build Coastguard Worker    mova [r3-(3+4*ARCH_X86_32)*16], m5
8024*c0909341SAndroid Build Coastguard Worker    mova [r3-(2+4*ARCH_X86_32)*16], m6
8025*c0909341SAndroid Build Coastguard Worker    mova [r3-(1+4*ARCH_X86_32)*16], m7
8026*c0909341SAndroid Build Coastguard Worker    sub                  r3, 16*(40+4*ARCH_X86_32-4)
8027*c0909341SAndroid Build Coastguard Worker
8028*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8029*c0909341SAndroid Build Coastguard Worker    psrld               m15, m11, 10 ; pd_2
8030*c0909341SAndroid Build Coastguard Worker%else
8031*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(pd_2)]
8032*c0909341SAndroid Build Coastguard Worker%endif
8033*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
8034*c0909341SAndroid Build Coastguard Worker
8035*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+56*16]
8036*c0909341SAndroid Build Coastguard Worker    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
8037*c0909341SAndroid Build Coastguard Worker    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
8038*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t1b
8039*c0909341SAndroid Build Coastguard Worker    movzx               t2d, t3b
8040*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 8
8041*c0909341SAndroid Build Coastguard Worker    shr                 t3d, 8
8042*c0909341SAndroid Build Coastguard Worker    lea                  t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
8043*c0909341SAndroid Build Coastguard Worker    call .shift_transpose
8044*c0909341SAndroid Build Coastguard Worker    ; zero cq
8045*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
8046*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8047*c0909341SAndroid Build Coastguard Worker    mov                  cq, [rsp+gprsize*3+(64*9+8)*16]
8048*c0909341SAndroid Build Coastguard Worker%endif
8049*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+30*128+r5*8]
8050*c0909341SAndroid Build Coastguard Worker.zero_cq_loop:
8051*c0909341SAndroid Build Coastguard Worker    REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
8052*c0909341SAndroid Build Coastguard Worker    sub                  r4, 4*128
8053*c0909341SAndroid Build Coastguard Worker    cmp                  r4, cq
8054*c0909341SAndroid Build Coastguard Worker    jg .zero_cq_loop
8055*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8056*c0909341SAndroid Build Coastguard Worker    mov                  r6, [rsp+gprsize*4+(64*9+8)*16]
8057*c0909341SAndroid Build Coastguard Worker%endif
8058*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 2
8059*c0909341SAndroid Build Coastguard Worker    jge .loop_pass1
8060*c0909341SAndroid Build Coastguard Worker
8061*c0909341SAndroid Build Coastguard Worker    ; pass=2 code starts here
8062*c0909341SAndroid Build Coastguard Worker    mov                eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
8063*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8064*c0909341SAndroid Build Coastguard Worker    mov             strideq, [rsp+gprsize*2+(9*64+8)*16]
8065*c0909341SAndroid Build Coastguard Worker%else
8066*c0909341SAndroid Build Coastguard Worker    mov                  r0, [rsp+gprsize*0+64*16]
8067*c0909341SAndroid Build Coastguard Worker%endif
8068*c0909341SAndroid Build Coastguard Worker    add                 rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
8069*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
8070*c0909341SAndroid Build Coastguard Worker    jl .fast
8071*c0909341SAndroid Build Coastguard Worker    ; fall-through
8072*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8073*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8, 9
8074*c0909341SAndroid Build Coastguard Worker%else
8075*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 1, 5
8076*c0909341SAndroid Build Coastguard Worker%endif
8077*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
8078*c0909341SAndroid Build Coastguard Worker    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
8079*c0909341SAndroid Build Coastguard Worker    jmp .run
8080*c0909341SAndroid Build Coastguard Worker.fast:
8081*c0909341SAndroid Build Coastguard Worker    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
8082*c0909341SAndroid Build Coastguard Worker    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
8083*c0909341SAndroid Build Coastguard Worker.run:
8084*c0909341SAndroid Build Coastguard Worker
8085*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8086*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+128]
8087*c0909341SAndroid Build Coastguard Worker    mov                  r7, -16
8088*c0909341SAndroid Build Coastguard Worker%else
8089*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+(64*8+3)*16]
8090*c0909341SAndroid Build Coastguard Worker    mov      [r2+4*gprsize], t0
8091*c0909341SAndroid Build Coastguard Worker    mov      [r2+5*gprsize], t1
8092*c0909341SAndroid Build Coastguard Worker    mov                  r1, [r2+2*gprsize]
8093*c0909341SAndroid Build Coastguard Worker    mov dword [r2+0*gprsize], 8
8094*c0909341SAndroid Build Coastguard Worker%endif
8095*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
8096*c0909341SAndroid Build Coastguard Worker
8097*c0909341SAndroid Build Coastguard Worker    ; copy of pass=1 tmp-regs
8098*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8099*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 4, 1, 2, 0, 6
8100*c0909341SAndroid Build Coastguard Worker%else
8101*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 8, 9, 4, 7, 0
8102*c0909341SAndroid Build Coastguard Worker%endif
8103*c0909341SAndroid Build Coastguard Worker
; .shift_transpose: local subroutine (entered with `call`, leaves with `ret`).
; Per iteration: load eight 16-byte vectors of dwords from [r3], shift every
; dword right (arithmetic) by 2, pack dwords to words with signed saturation,
; run the packed 4x8 transpose helper, then store m0-m3 to [t4+t{0..3}*8].
; In:   r3    = address of the first (highest) 8-vector group to process
;       t4    = destination base for that group
;       t0-t3 = destination offsets, scaled by 8 when addressing (at the call
;               site above they are the bytes of two tbl_Nx64_offset words)
; Step: r3 -= 8*16 and t4 -= 16*64 per iteration; loops while r3 is above rsp.
;       rsp here already accounts for the return address pushed by `call`, so
;       for the caller above (r3 = rsp+56*16) the group located at the
;       caller's rsp is still processed, i.e. eight iterations.
; Clobbers m0-m7, r3, t4, flags (plus whatever transpose4x8packed touches;
; that helper is defined elsewhere - TODO confirm its clobber list).
8104*c0909341SAndroid Build Coastguard Worker.shift_transpose:
8105*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+0*16]
8106*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+1*16]
8107*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3+2*16]
8108*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r3+3*16]
8109*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3+4*16]
8110*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3+5*16]
8111*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+6*16]
8112*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+7*16]
8113*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 ; every dword >>= 2 (arithmetic)
8114*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1 ; m0 = words of m0 (low half) | m1 (high half), saturated
8115*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m3
8116*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
8117*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7
8118*c0909341SAndroid Build Coastguard Worker    call m(idct_8x4_internal_16bpc).transpose4x8packed ; presumably m0/m2/m4/m6 in, m0-m3 out - verify against helper
8119*c0909341SAndroid Build Coastguard Worker    mova          [t4+t0*8], m0
8120*c0909341SAndroid Build Coastguard Worker    mova          [t4+t1*8], m1
8121*c0909341SAndroid Build Coastguard Worker    mova          [t4+t2*8], m2
8122*c0909341SAndroid Build Coastguard Worker    mova          [t4+t3*8], m3
8123*c0909341SAndroid Build Coastguard Worker    sub                  t4, 16*64 ; step destination down by 64 vectors
8124*c0909341SAndroid Build Coastguard Worker    sub                  r3, 8*16  ; step source down to the next 8-vector group
8125*c0909341SAndroid Build Coastguard Worker    cmp                  r3, rsp
8126*c0909341SAndroid Build Coastguard Worker    ja .shift_transpose ; unsigned: r3/rsp are addresses; signed jg misorders them if they straddle the sign bit
8127*c0909341SAndroid Build Coastguard Worker    ret
8128*c0909341SAndroid Build Coastguard Worker
; .dconly: jump target only (the preceding routine ends in `ret`, so it is
; never reached by fall-through). Multiplies the first coefficient dword by
; 181, overwrites that coefficient with eobd, sets r3d = 64, adjusts rsp and
; tail-jumps into the 64x16 transform's .dconly1 path, which finishes the job.
; NOTE(review): 181/256 ~= 1/sqrt(2); presumably the matching shift/rounding
; happens in .dconly1 - confirm there.
8129*c0909341SAndroid Build Coastguard Worker.dconly:
8130*c0909341SAndroid Build Coastguard Worker    imul                r5d, [cq], 181 ; r5d = first coefficient dword * 181 (must read before the store below)
8131*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
8132*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 64 ; presumably the row count consumed by .dconly1 - TODO confirm
; The constant below reduces to
;   (8*64 + 1*ARCH_X86_64)*16 + (4 + 4*ARCH_X86_32)*gprsize.
; Presumably this is the difference between this function's stack reservation
; and the one .dconly1 expects - confirm against both prologues.
8133*c0909341SAndroid Build Coastguard Worker    add                 rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
8134*c0909341SAndroid Build Coastguard Worker                             (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
8135*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1 ; tail-jump: no return to this function
8136