; xref: /aosp_15_r20/external/libdav1d/src/x86/itx16_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2022-2023, VideoLAN and dav1d authors
; Copyright © 2022-2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; 64-byte index tables, one full ZMM register each. Within the code visible
; here:
;   idct8x8p - loaded whole and used as the byte-index operand of vpermb at
;              the end of the first idct 8x8 pass.
;   permB    - loaded whole by idct_8x8_internal_10bpc.load; the register and
;              a copy shifted right by 32 bits per qword serve as the index
;              operands of vpermi2q/vpermt2q, so several index sets are packed
;              into different bytes of each qword.
;   permC    - index operand of vpermt2q in the 8x8 second pass.
; The remaining tables are referenced outside this part of the file.
idct8x8p:      db  0,  1,  4,  5,  2,  3,  6,  7, 16, 17, 20, 21, 18, 19, 22, 23
               db  8,  9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
               db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
               db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
idtx8x8p:      db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
               db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
idct8x16p:     db 54, 55,  2,  3, 22, 23, 34, 35, 38, 39, 18, 19,  6,  7, 50, 51
               db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
               db 52, 53,  4,  5, 20, 21, 36, 37, 32, 33,  0,  1, 48, 49, 16, 17
               db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41,  8,  9, 56, 57, 24, 25
iadst8x16p:    db  0,  1, 54, 55, 48, 49,  6,  7, 16, 17, 38, 39, 32, 33, 22, 23
               db  8,  9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
               db  4,  5, 50, 51, 52, 53,  2,  3, 20, 21, 34, 35, 36, 37, 18, 19
               db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
permA:         db  0,  1,  0,  8,  4,  5,  1,  9,  8,  9,  4, 12, 12, 13,  5, 13
               db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
               db  2,  3,  2, 10,  6,  7,  3, 11, 10, 11,  6, 14, 14, 15,  7, 15
               db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
permB:         db  4,  2,  1,  8,  0,  0,  1,  0, 12,  3,  3, 10,  8,  1,  3,  2
               db  5, 10,  5, 12,  1,  8,  5,  4, 13, 11,  7, 14,  9,  9,  7,  6
               db  6,  6, 13,  4,  2,  4,  4,  5, 14,  7, 15,  6, 10,  5,  6,  7
               db  7, 14,  9,  0,  3, 12,  0,  1, 15, 15, 11,  2, 11, 13,  2,  3
permC:         db  0,  9,  0,  0,  0,  1,  4,  4,  2, 11,  2,  2,  2,  3,  6,  6
               db  1,  8,  1,  8,  4,  5,  5, 12,  3, 10,  3, 10,  6,  7,  7, 14
               db  9,  1,  8,  1,  1,  0, 12,  5, 11,  3, 10,  3,  3,  2, 14,  7
               db  8,  0,  9,  9,  5,  4, 13, 13, 10,  2, 11, 11,  7,  6, 15, 15
idct8x32p:     db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
               db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
               db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
idct32x8p:     db  2, 18,  0, 16,  3, 19,  1, 17, 10, 26,  8, 24, 11, 27,  9, 25
               db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
               db  6, 22,  4, 20,  7, 23,  5, 21, 14, 30, 12, 28, 15, 31, 13, 29
               db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
idtx32x8p:     db  0,  8, 16, 24,  4, 12, 20, 28,  2, 10, 18, 26,  6, 14, 22, 30
               db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
               db  1,  9, 17, 25,  5, 13, 21, 29,  3, 11, 19, 27,  7, 15, 23, 31
               db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63

; Three consecutive 32-byte runs: +2048, -2048, +2048. A 64-byte load from
; pw_2048_m2048 therefore yields {+2048 x16, -2048 x16} and one from
; pw_m2048_2048 yields {-2048 x16, +2048 x16}; do not reorder these lines.
; pw_2048 also anchors o_base (see the o_base define in the .text section).
pw_2048_m2048: times 16 dw  2048
pw_m2048_2048: times 16 dw -2048
pw_2048:       times 16 dw  2048
77*c0909341SAndroid Build Coastguard Worker
; COEF_PAIR a, b [, flags]
;
; Emits one 16-byte constant holding the dword pair (a, b), each value
; duplicated, and defines single-value aliases that point into it:
;   pd_<a>  (or pd_m<a> when negated) -> first  dword pair (offset 0)
;   pd_<b>  (or pd_m<b> when negated) -> second dword pair (offset 8)
; The combined label (e.g. pd_799_3406, pd_799_m2276) names the full 16 bytes.
;
; flags selects the signs stored:
;   0 = ++   ( a,  b)
;   1 = +-   ( a, -b)
;   2 = -+   (-a,  b)
;   3 = ++-  ( a,  b) followed by an extra (-b, -b) pair; pd_<b>_m<b> is
;            defined as an alias of pd_<b>, so a 16-byte load from it reads
;            b, b, -b, -b
;   4 = --   (-a, -b)
;
; The single-value aliases are %defines, so when the same value appears in
; several COEF_PAIR invocations the most recent definition is the one in
; effect; all of them point at the same numeric value.
%macro COEF_PAIR 2-3 0 ; a, b, flags
%if %3 == 1
pd_%1_m%2: dd %1, %1, -%2, -%2
%define pd_%1  (pd_%1_m%2 + 4*0)
%define pd_m%2 (pd_%1_m%2 + 4*2)
%elif %3 == 2
pd_m%1_%2: dd -%1, -%1, %2, %2
%define pd_m%1 (pd_m%1_%2 + 4*0)
%define pd_%2  (pd_m%1_%2 + 4*2)
%elif %3 == 4
pd_m%1_m%2: dd -%1, -%1, -%2, -%2
%define pd_m%1 (pd_m%1_m%2 + 4*0)
%define pd_m%2 (pd_m%1_m%2 + 4*2)
%else
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1  (pd_%1_%2 + 4*0)
%define pd_%2  (pd_%1_%2 + 4*2)
%if %3 == 3
%define pd_%2_m%2 pd_%2
dd -%2, -%2
%endif
%endif
%endmacro
102*c0909341SAndroid Build Coastguard Worker
; Coefficient pairs used by the transforms in this file. Each line emits
; 16 bytes of read-only data (24 bytes for flags == 3); see COEF_PAIR.
COEF_PAIR  101,  501
COEF_PAIR  201,  601, 1
COEF_PAIR  201,  995
COEF_PAIR  401, 1189, 1
COEF_PAIR  401, 1931
COEF_PAIR  401, 3920
COEF_PAIR  401, 4076
COEF_PAIR  700,  301, 4
COEF_PAIR  799, 2276, 1
COEF_PAIR  799, 3406
COEF_PAIR  799, 4017
COEF_PAIR 1380,  601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2598, 1931, 2
COEF_PAIR 2598, 3612
COEF_PAIR 2751, 2106
COEF_PAIR 2896, 1567, 3
COEF_PAIR 2896, 3784, 3
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 1931
COEF_PAIR 3166, 3612
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
COEF_PAIR 3857, 4052
COEF_PAIR 4017, 2276
COEF_PAIR 4017, 3406
COEF_PAIR 4036, 4085
COEF_PAIR 4076, 1189
COEF_PAIR 4076, 3612
COEF_PAIR 4076, 3920
COEF_PAIR 4091, 3973
COEF_PAIR 4091, 4052
COEF_PAIR 4095, 4065
137*c0909341SAndroid Build Coastguard Worker
; Scalar constants, each 4 bytes wide so they can be fetched with
; vpbroadcastd.
pb_32:           times 4 db 32
pw_5:            times 2 dw 5
pw_4096:         times 2 dw 4096
pw_8192:         times 2 dw 8192
pw_1697x16:      times 2 dw 1697*16
pw_2896x8:       times 2 dw 2896*8
pixel_10bpc_max: times 2 dw 0x03ff ; largest 10-bit pixel value
; Bias for the DC-only path: 0x7fff - 0x7c00 == 0x03ff, so a saturating signed
; add followed by an unsigned saturating subtract of this value clamps the
; result to [0, pixel_10bpc_max].
dconly_10bpc:    times 2 dw 0x7c00
clip_18b_min:    dd -0x20000 ; lower bound of a signed 18-bit value
clip_18b_max:    dd  0x1ffff ; upper bound of a signed 18-bit value
pd_1:            dd 1
pd_2:            dd 2
pd_1448:         dd 1448
pd_2048:         dd 2048 ; rounding term for a 12-bit right shift
pd_3071:         dd 3071 ; 1024 + 2048 - 1
pd_3072:         dd 3072 ; 1024 + 2048
pd_5119:         dd 5119 ; 1024 + 4096 - 1
pd_5120:         dd 5120 ; 1024 + 4096
pd_5793:         dd 5793
157*c0909341SAndroid Build Coastguard Worker
; Data tables and internal entry points imported from other objects. The
; *_8bpc_avx512icl.* symbols are sub-labels of the 8 bpc transform functions;
; the second pass of idct_8x8_internal_10bpc calls one of them after
; switching r5 to o_base_8bpc.
cextern dup16_perm
cextern int8_permA
cextern idct64_mul_16bpc
cextern idct_8x8_internal_8bpc_avx512icl.main
cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
cextern idct_8x16_internal_8bpc_avx512icl.main
cextern idct_8x16_internal_8bpc_avx512icl.main2
cextern idct_8x16_internal_8bpc_avx512icl.main_fast
cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
cextern iadst_8x16_internal_8bpc_avx512icl.main2
cextern idct_16x8_internal_8bpc_avx512icl.main
cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
cextern idct_16x16_internal_8bpc_avx512icl.main
cextern idct_16x16_internal_8bpc_avx512icl.main2
cextern idct_16x16_internal_8bpc_avx512icl.main_fast
cextern idct_16x16_internal_8bpc_avx512icl.main_fast2
cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2
cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2
197*c0909341SAndroid Build Coastguard Worker
SECTION .text

; Constant addressing scheme:
;   o_base      - anchor address inside this file's read-only data; the entry
;                 points load it into r5 with `lea r5, [o_base]`.
;   o_base_8bpc - equivalent anchor for code that calls into the 8 bpc
;                 helpers; loaded into r5 before such calls.
;   o(x)        - address of constant x expressed relative to r5.
;                 NOTE(review): the +4*128 bias presumably keeps most
;                 displacements within a signed byte - confirm.
;   m(x)        - mangled symbol name of function x for the current
;                 instruction-set suffix.
%define o_base (pw_2048+4*128)
%define o_base_8bpc (int8_permA+64*18)
%define o(x) (r5 - o_base + (x))
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

INIT_ZMM avx512icl
206*c0909341SAndroid Build Coastguard Worker
; ITX_MULSUB_2D dst/src1, dst/src2, tmp1, tmp2, tmp3, rnd, coef1, coef2 [, flags]
;
; 32-bit rotation of the register pair (src1, src2):
;   dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
;   dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
;
; Arguments (register arguments are register *numbers*, expanded as m<N>):
;   %1, %2  in/out registers (src1 -> dst1, src2 -> dst2)
;   %3-%5   scratch registers; %5 is only written when coef1 >= 32
;   %6      register holding the rounding constant. If it is not a number
;           (e.g. `_`), the rounding add and the final >> 12 are skipped and
;           the caller has to round and shift. With flags & 2 this argument
;           is used unconditionally, so it must be a register number.
;   %7, %8  coef1, coef2, interpreted by magnitude:
;             < 32     register number that already holds the coefficient
;             < 4096   constant pd_<coef>, broadcast as a single dword
;             >= 4096  constant pd_<coef>, broadcast as a 128-bit block.
;                      Pair names such as 799_3406 take this branch because
;                      NASM reads the underscore as a digit separator,
;                      giving a value >= 4096.
;   %9      flags: 1 = negate dst1, 2 = negate dst2.
;           With flag 1 and a numeric rnd, dst1 is computed as
;           src2*coef2 - (src1*coef1 + rnd), i.e. rnd enters negated.
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
    ; tmp2 = src1 * coef2, tmp1 = src2 * coef2
%if %8 < 32
    pmulld              m%4, m%1, m%8
    pmulld              m%3, m%2, m%8
%else
%if %8 < 4096
    vpbroadcastd        m%3, [o(pd_%8)]
%else
    vbroadcasti32x4     m%3, [o(pd_%8)]
%endif
    pmulld              m%4, m%1, m%3
    pmulld              m%3, m%2
%endif
    ; src1 *= coef1, src2 *= coef1 (in place)
%if %7 < 32
    pmulld              m%1, m%7
    pmulld              m%2, m%7
%else
%if %7 < 4096
    vpbroadcastd        m%5, [o(pd_%7)]
%else
    vbroadcasti32x4     m%5, [o(pd_%7)]
%endif
    pmulld              m%1, m%5
    pmulld              m%2, m%5
%endif
    ; dst2 = +/-(src1*coef2 + src2*coef1) + rnd
%if %9 & 2
    psubd               m%4, m%6, m%4
    psubd               m%2, m%4, m%2
%else
%ifnum %6
    paddd               m%4, m%6
%endif
    paddd               m%2, m%4
%endif
    ; dst1 = src1*coef1 - src2*coef2 (+ rnd), or the reverse subtraction
    ; when flag 1 is set
%ifnum %6
    paddd               m%1, m%6
%endif
%if %9 & 1
    psubd               m%1, m%3, m%1
%else
    psubd               m%1, m%3
%endif
%ifnum %6
    psrad               m%2, 12
    psrad               m%1, 12
%endif
%endmacro
258*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_FN type1, type2, eob_offset, size
;
; Declares the public entry point inv_txfm_add_<type1>_<type2>_<size>_10bpc
; (arguments: dst, stride, c, eob; tx2 is an extra named register) and emits
; its shared prologue:
;   * r5   = o_base, so constants can be addressed through o()
;   * tx2q = address of the .pass2 label of the <type2> internal function;
;            the first-pass function ends with `jmp tx2q`
;   * dct_dct: jump to the first-pass function when eob != 0, otherwise fall
;     through to the code the caller places after this macro (the DC-only
;     path)
;   * other combinations: add eob_offset to eob when it is non-zero, then
;     continue into the first-pass function
%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
    %define %%p1 m(i%1_%4_internal_10bpc)
    lea                  r5, [o_base]
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
%ifidn %1_%2, dct_dct
    test               eobd, eobd
    jnz %%p1
%else
%if %3
    add                eobd, %3
%endif
    ; jump to the 1st txfm function unless it's located directly after this
    ; The repeat count is bit 31 of (%%end - %%p1): 1 when the first-pass
    ; function starts beyond %%end (negative difference), 0 when it starts
    ; exactly at %%end, in which case execution simply falls through.
    ; NOTE(review): a first-pass function placed *before* this stub would
    ; also give a count of 0; the layout appears to rely on that never
    ; happening - confirm.
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro
279*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X8_FN type1, type2 [, eob_offset]
;
; Instantiates the 8x8 entry point for one transform-type pair. For dct_dct
; it also emits the DC-only path, which runs when eob == 0 (the `jnz` emitted
; by INV_TXFM_FN was not taken):
;   dc = (c[0] * 181 + 384) >> 9
;   dc = (dc   * 181 + 2176) >> 12
; and dc is then added to every pixel, clamped to the 10-bit range.
;
; Labels:
;   .dconly      expects r6d = c[0] * 181 and r3d = number of rows
;   .dconly2     expects r6d already reduced by the first rounding step
;   .dconly_loop processes two 8-pixel rows (16 bytes each) per iteration
; NOTE(review): .dconly/.dconly2 look like shared entry points for other
; block sizes defined elsewhere in the file - confirm.
%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN          %1, %2, %3, 8x8
%ifidn %1_%2, dct_dct
    imul                r6d, [cq], 181
    mov                [cq], eobd ; 0
    or                  r3d, 8     ; r3d (eob) is 0 here, so this sets rows = 8
.dconly:
    add                 r6d, 384
    sar                 r6d, 9
.dconly2:
    vpbroadcastd        ym2, [o(dconly_10bpc)]
    imul                r6d, 181
    add                 r6d, 2176
    sar                 r6d, 12
    vpbroadcastw        ym1, r6d
    paddsw              ym1, ym2   ; dc + 0x7c00, signed saturation
.dconly_loop:
    mova                xm0, [dstq+strideq*0]
    vinserti32x4        ym0, [dstq+strideq*1], 1
    ; pixel + dc + 0x7c00 saturates at 0x7fff; subtracting 0x7c00 with
    ; unsigned saturation then leaves a value clamped to [0, 0x3ff].
    paddsw              ym0, ym1
    psubusw             ym0, ym2
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    lea                dstq, [dstq+strideq*2]
    sub                 r3d, 2
    jg .dconly_loop
    RET
%endif
%endmacro
309*c0909341SAndroid Build Coastguard Worker
; 8x8 entry points whose first pass is the DCT. The last instantiation sits
; directly before idct_8x8_internal_10bpc so that it can fall through into it
; without a jump (see INV_TXFM_FN).
INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
INV_TXFM_8X8_FN dct, identity
314*c0909341SAndroid Build Coastguard Worker
315*c0909341SAndroid Build Coastguard Workercglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
316*c0909341SAndroid Build Coastguard Worker    call .load
317*c0909341SAndroid Build Coastguard Worker    vpermi2q             m1, m0, m2 ; 1 5
318*c0909341SAndroid Build Coastguard Worker    vpermi2q             m3, m6, m4 ; 7 3
319*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m5, m4 ; 0 2
320*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m5, m6 ; 4 6
321*c0909341SAndroid Build Coastguard Worker    call .main
322*c0909341SAndroid Build Coastguard Worker    call .main_end
323*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(idct8x8p)]
324*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2     ; 0 1 4 5
325*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3     ; 3 2 7 6
326*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m4, m0
327*c0909341SAndroid Build Coastguard Worker    vprolq               m1, 32
328*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m4, m1
329*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
330*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
331*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
332*c0909341SAndroid Build Coastguard Worker.pass2:
333*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
334*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym2, m0, 1
335*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m1, 1
336*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_8bpc).main
337*c0909341SAndroid Build Coastguard Worker    mova                m10, [permC]
338*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [pw_2048]
339*c0909341SAndroid Build Coastguard Worker.end:
340*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m10, m1
341*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m10, m3
342*c0909341SAndroid Build Coastguard Worker.end2:
343*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pixel_10bpc_max]
344*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
345*c0909341SAndroid Build Coastguard Worker    pxor                m10, m10
346*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m0
347*c0909341SAndroid Build Coastguard Worker    call .write_8x4_start
348*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m2
349*c0909341SAndroid Build Coastguard Worker.write_8x4:
350*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
351*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*2
352*c0909341SAndroid Build Coastguard Worker.write_8x4_start:
353*c0909341SAndroid Build Coastguard Worker    mova                xm9, [dstq+strideq*0]
354*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym9, [dstq+strideq*1], 1
355*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m9, [dstq+strideq*2], 2
356*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m9, [dstq+r6       ], 3
357*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*0], m10
358*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*1], m10
359*c0909341SAndroid Build Coastguard Worker    paddw                m9, m8
360*c0909341SAndroid Build Coastguard Worker    pmaxsw               m9, m10
361*c0909341SAndroid Build Coastguard Worker    pminsw               m9, m11
362*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm9
363*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym9, 1
364*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m9, 2
365*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r6       ], m9, 3
366*c0909341SAndroid Build Coastguard Worker    ret
367*c0909341SAndroid Build Coastguard WorkerALIGN function_align
368*c0909341SAndroid Build Coastguard Worker.load:
369*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*0] ; 0 1
370*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*1] ; 2 3
371*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(permB)]
372*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*2] ; 4 5
373*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*3] ; 6 7
374*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
375*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
376*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
377*c0909341SAndroid Build Coastguard Worker    psrlq                m5, m1, 32
378*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
379*c0909341SAndroid Build Coastguard Worker    mova                 m3, m1
380*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
381*c0909341SAndroid Build Coastguard Worker    ret
; 8-point inverse DCT core for the 10bpc first pass, operating on dwords.
; Entry points:
;   .main_fast - for inputs whose bottom half is zero (see the note on the
;                label): each rotation is replaced by one pmulld per register
;                against a broadcast 128-bit constant pair.
;   .main      - full version; both rotations go through ITX_MULSUB_2D
;                (macro defined outside this excerpt).
;   .main2     - shared rounding and butterfly stage.
;   .main3     - final t5/t6 stage; expects m3 to already hold the clamped
;                difference multiplied by m12.
; Expects m12/m13/m14/m15 = pd_2896/pd_2048/clip_18b_min/clip_18b_max, as set
; up by .load above (or by the 8x16 .round_input_fast helper).
; Out (per the annotations below): m0 = dct4 out0 out1, m2 = dct4 out3 out2,
;      m1 = t7 t6, m8 = t4 t5. m3 and m9 are overwritten as scratch.
382*c0909341SAndroid Build Coastguard WorkerALIGN function_align
383*c0909341SAndroid Build Coastguard Worker.main_fast: ; bottom half is zero
384*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [o(pd_4017_3406)]
385*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [o(pd_799_m2276)]
386*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [o(pd_2896_3784)]
387*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(pd_2896_1567)]
388*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m1     ; t4a  t5a
389*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m8     ; t7a  t6a
390*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m0     ; t0   t3
391*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m9     ; t1   t2
392*c0909341SAndroid Build Coastguard Worker    jmp .main2
393*c0909341SAndroid Build Coastguard Worker.main:
    ; NOTE(review): the `_` argument presumably tells ITX_MULSUB_2D to skip
    ; its own rounding, since .main2 adds m13 and shifts - confirm against
    ; the macro definition.
394*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 3, 8, 9, 10, _,  799_3406, 4017_2276
395*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
396*c0909341SAndroid Build Coastguard Worker.main2:
    ; Round the products: (x + m13) >> 12.
397*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m1, m3, m0, m2
398*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m1, m3, m0, m2
399*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m1, m3 ; t4a  t7a
400*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m3     ; t5a  t6a
401*c0909341SAndroid Build Coastguard Worker    psubd                m3, m8, m1 ; t5a  t6a
402*c0909341SAndroid Build Coastguard Worker    paddd                m8, m1     ; t4   t7
    ; The clamp of m3 to [m14, m15] is interleaved with the dct4 unpacks.
403*c0909341SAndroid Build Coastguard Worker    pmaxsd               m3, m14
404*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2, m0 ; t3   t2
405*c0909341SAndroid Build Coastguard Worker    pminsd               m3, m15
406*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m0     ; t0   t1
407*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12
408*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2, m1 ; dct4 out0 out1
409*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1     ; dct4 out3 out2
410*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m0, m2
411*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m0, m2
412*c0909341SAndroid Build Coastguard Worker.main3:
    ; m1 = m3 with the two qwords of each 128-bit lane swapped, so the
    ; sum and difference below pair each value with its lane neighbour.
413*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1032
414*c0909341SAndroid Build Coastguard Worker    paddd                m3, m13
415*c0909341SAndroid Build Coastguard Worker    psubd                m9, m3, m1
416*c0909341SAndroid Build Coastguard Worker    paddd                m3, m1
417*c0909341SAndroid Build Coastguard Worker    psrad                m9, 12
418*c0909341SAndroid Build Coastguard Worker    psrad                m3, 12
419*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m8, m3   ; t7   t6
420*c0909341SAndroid Build Coastguard Worker    shufpd               m8, m9, 0xaa ; t4   t5
421*c0909341SAndroid Build Coastguard Worker    ret
; .main_end: combine the dct4 half (m0, m2) with the odd half (m1, m8) into
; the eight outputs and apply the first-pass rounding shift.
; m11 is added to m0/m2 before the butterflies and then reused as the
; per-dword shift count of vpsravd, so with m11 = pd_1 (as loaded by .load)
; every output becomes (x + 1) >> 1.
; Out: m0 = out0 out1, m1 = out3 out2, m2 = out4 out5, m3 = out7 out6.
422*c0909341SAndroid Build Coastguard Worker.main_end:
423*c0909341SAndroid Build Coastguard Worker    paddd                m0, m11
424*c0909341SAndroid Build Coastguard Worker    paddd                m2, m11
425*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m1 ; out7 out6
426*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; out0 out1
427*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2, m8 ; out3 out2
428*c0909341SAndroid Build Coastguard Worker    psubd                m2, m8     ; out4 out5
429*c0909341SAndroid Build Coastguard Worker    REPX   {vpsravd x, m11}, m0, m2, m3, m1
430*c0909341SAndroid Build Coastguard Worker    ret
431*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first argument is adst
; (INV_TXFM_8X8_FN is defined outside this excerpt).
432*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, dct
433*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, flipadst
434*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, identity
435*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, adst
436*c0909341SAndroid Build Coastguard Worker
; iadst_8x8_internal_10bpc: 8-point inverse ADST for 8x8 blocks, 10 bpc.
; The first pass works on the dword coefficients read from cq and ends with
; `jmp tx2q`. .pass2 feeds the packed words into the 8bpc ADST code and then
; jumps to the 10bpc idct 8x8 `.end` label.
; Local labels also used from iflipadst_8x8_internal_10bpc: .main,
; .main_pass2 and .pass1_end.
437*c0909341SAndroid Build Coastguard Workercglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
438*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).load
    ; Regroup the loaded rows with the permB-derived qword indices from .load
    ; (m1 = m3 = permB, m5 = permB >> 32) into the order .main expects.
439*c0909341SAndroid Build Coastguard Worker    vpermi2q             m1, m6, m2 ; 7 5
440*c0909341SAndroid Build Coastguard Worker    vpermi2q             m3, m4, m0 ; 3 1
441*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m5, m4 ; 0 2
442*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m5, m6 ; 4 6
443*c0909341SAndroid Build Coastguard Worker    call .main
    ; Interleave dwords so that each register holds outputs of one sign.
444*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2, m4 ;  out4  out6
445*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m0     ; -out5 -out7
446*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m3     ;  out0  out2
447*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m3     ; -out1 -out3
    ; Rounding bias for the >> 1 in .pass1_end: x + m11 for the positive
    ; outputs, m11 - x for the negated ones (which also undoes the negation).
    ; m11 = pd_1 from .load.
448*c0909341SAndroid Build Coastguard Worker    paddd                m1, m11
449*c0909341SAndroid Build Coastguard Worker    psubd                m3, m11, m2
450*c0909341SAndroid Build Coastguard Worker    paddd                m0, m11
451*c0909341SAndroid Build Coastguard Worker    psubd                m4, m11, m4
; .pass1_end: shift by 1, pack the dwords to words with signed saturation,
; interleave, and reorder qwords before continuing at tx2q.
; In: m0/m1 = even outputs, m4/m3 = odd outputs (bias already added).
; Out: m0, m1 = packed first-pass result.
452*c0909341SAndroid Build Coastguard Worker.pass1_end:
453*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m1, m0, m3, m4
454*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m1     ; 0 2 4 6
455*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m3     ; 1 3 5 7
456*c0909341SAndroid Build Coastguard Worker    psrlq                m1, [o(permB)], 8 ; index set: permB qwords shifted right by one byte
457*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m4
458*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4
459*c0909341SAndroid Build Coastguard Worker    psrlq                m2, m1, 32 ; second index set for the vpermt2q below
460*c0909341SAndroid Build Coastguard Worker    vpermi2q             m1, m0, m3
461*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m2, m3
462*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
463*c0909341SAndroid Build Coastguard Worker.pass2:
464*c0909341SAndroid Build Coastguard Worker    call .main_pass2
    ; The constants below are addressed without o(); .main_pass2 has replaced
    ; r5 with o_base_8bpc.
    ; NOTE(review): presumably o() is r5-relative - confirm against its
    ; definition.
465*c0909341SAndroid Build Coastguard Worker    movu                m10, [permC+2]
466*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8     m12, [pw_2048_m2048+16]
467*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_10bpc).end
; .main_pass2: prepare registers for the 8bpc second-pass ADST and tail-jump
; into it. Splits m0/m1 into 256-bit halves (ym0/ym2, ym1/ym3), builds
; qword-swapped copies of ym0/ym1 in ym4/ym5, and points r5 at o_base_8bpc.
468*c0909341SAndroid Build Coastguard Worker.main_pass2:
469*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym2, m0, 1
470*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m1, 1
471*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
472*c0909341SAndroid Build Coastguard Worker    pshufd              ym4, ym0, q1032
473*c0909341SAndroid Build Coastguard Worker    pshufd              ym5, ym1, q1032
474*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_8bpc).main_pass2
; .main: first-pass ADST core on dwords.
; In:  m0 = 0 2, m1 = 7 5, m2 = 4 6, m3 = 3 1 (as arranged by the callers),
;      m12-m15 = constants from the idct 8x8 .load routine.
; Out: m0 = out0 -out7, m4 = out6 -out1, m2 = out4 -out5, m3 = out2 -out3
;      (m2/m3 include the final rounded >> 12).
; m1, m5 and m6 are overwritten.
475*c0909341SAndroid Build Coastguard WorkerALIGN function_align
476*c0909341SAndroid Build Coastguard Worker.main:
    ; NOTE(review): the sixth argument (13) presumably names the rounding
    ; register (m13 = pd_2048) used inside ITX_MULSUB_2D - confirm against
    ; the macro definition.
477*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 0, 4, 5, 6, 13,  401_1931, 4076_3612
478*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
479*c0909341SAndroid Build Coastguard Worker    psubd                m4, m0, m2   ; t4  t6
480*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2       ; t0  t2
481*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m3   ; t5  t7
482*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3       ; t1  t3
483*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m4, m2, m0, m1
484*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m4, m2, m0, m1
    ; m5 = -m4, so the second shuffle can pick up -t6.
485*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
486*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4
487*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m2, 0xaa ; t4  t7
488*c0909341SAndroid Build Coastguard Worker    shufpd               m2, m5, 0xaa ; t5 -t6
489*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 2, 3, 5, 6, 13, 1567, 3784
490*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m0, m1
491*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1
492*c0909341SAndroid Build Coastguard Worker    psubd                m1, m0, m3   ; t2  t3
493*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3       ; out0 -out7
494*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4, m2   ; t7a t6a
495*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m2       ; t5a t4a
496*c0909341SAndroid Build Coastguard Worker    psubd                m2, m4, m3   ; t7  t6
497*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3       ; out6 -out1
498*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m1, m2
499*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m1, m2
    ; Final stage: multiply both operands by m12, then take the rounded
    ; sum and difference and shift right by 12.
500*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m1, m2, 0xaa
501*c0909341SAndroid Build Coastguard Worker    shufpd               m1, m2, 0x55
502*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12
503*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12
504*c0909341SAndroid Build Coastguard Worker    paddd                m3, m13
505*c0909341SAndroid Build Coastguard Worker    psubd                m2, m3, m1
506*c0909341SAndroid Build Coastguard Worker    paddd                m3, m1
507*c0909341SAndroid Build Coastguard Worker    psrad                m2, 12       ; out4 -out5
508*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q1032
509*c0909341SAndroid Build Coastguard Worker    psrad                m3, 12       ; out2 -out3
510*c0909341SAndroid Build Coastguard Worker    ret
511*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first argument is flipadst
; (INV_TXFM_8X8_FN is defined outside this excerpt).
512*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, dct
513*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, adst
514*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, identity
515*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, flipadst
516*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x8_internal_10bpc: flipped inverse ADST for 8x8 blocks, 10 bpc.
; Reuses the iadst 8x8 .main core and .pass1_end tail. The first pass differs
; only in how the outputs are interleaved (reversed order, per the
; annotations below). .pass2 reuses the iadst .main_pass2 and then stores the
; result itself.
517*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
518*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).load
    ; Same input regrouping as the iadst first pass.
519*c0909341SAndroid Build Coastguard Worker    vpermi2q             m1, m6, m2 ; 7 5
520*c0909341SAndroid Build Coastguard Worker    vpermi2q             m3, m4, m0 ; 3 1
521*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m5, m4 ; 0 2
522*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m5, m6 ; 4 6
523*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_10bpc).main
524*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m3, m4 ; -out3 -out1
525*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0     ;  out2  out0
526*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m2     ; -out7 -out5
527*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m2     ;  out6  out4
    ; Rounding bias for the >> 1 in .pass1_end; m11 - x also undoes the
    ; negation of the odd outputs (m11 = pd_1 from .load).
528*c0909341SAndroid Build Coastguard Worker    psubd                m1, m11, m1
529*c0909341SAndroid Build Coastguard Worker    paddd                m3, m11
530*c0909341SAndroid Build Coastguard Worker    psubd                m0, m11, m0
531*c0909341SAndroid Build Coastguard Worker    paddd                m4, m11
532*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_10bpc).pass1_end
533*c0909341SAndroid Build Coastguard Worker.pass2:
534*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_10bpc).main_pass2
    ; Constants are addressed without o() here; .main_pass2 has replaced r5
    ; with o_base_8bpc.
535*c0909341SAndroid Build Coastguard Worker    movu                m10, [permC+1]
536*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8     m12, [pw_m2048_2048+16]
537*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
538*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m10, m1 ; 7 6 5 4
539*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pixel_10bpc_max]
540*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m10, m3 ; 3 2 1 0
541*c0909341SAndroid Build Coastguard Worker    pxor                m10, m10
    ; Scale each group of rows by m12 into m8 and hand it to the shared
    ; writer.
    ; NOTE(review): write_8x4_start/write_8x4 are defined outside this
    ; excerpt; presumably they consume m8, m10 (zero), m11 (pixel max) and
    ; r6 (stride*3) - confirm.
542*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m2
543*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).write_8x4_start
544*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m0
545*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_10bpc).write_8x4
546*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first argument is identity
; (INV_TXFM_8X8_FN is defined outside this excerpt).
547*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, dct
548*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, adst
549*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, flipadst
550*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, identity
551*c0909341SAndroid Build Coastguard Worker
; iidentity_8x8_internal_10bpc: identity transform for 8x8 blocks, 10 bpc.
; The first pass does no arithmetic: it packs the dword coefficients to
; words (signed saturation) and reorders them. .pass2 reorders again, loads
; the pw_4096 multiplier into m12 and continues at the idct 8x8 `.end2` label.
552*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
553*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*0]
554*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+64*2] ; 0 4   1 5
555*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*1] ; 2 6   3 7
556*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+64*3]
557*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(idtx8x8p)] ; byte-permutation indices
558*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m0, m1
559*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m0, m2
560*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2    ; 0 1   4 5
561*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2        ; 2 3   6 7
562*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
563*c0909341SAndroid Build Coastguard Worker.pass2:
564*c0909341SAndroid Build Coastguard Worker    movu                 m3, [o(permC+2)]
565*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_4096)]
566*c0909341SAndroid Build Coastguard Worker    psrlq                m2, m3, 32 ; second index set: upper dword of each qword of m3
567*c0909341SAndroid Build Coastguard Worker    vpermi2q             m2, m0, m1
568*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m3, m1
569*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_10bpc).end2
570*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X16_FN type1, type2[, eob_offset = 0]
; Forwards to INV_TXFM_FN (defined outside this excerpt) with the 8x16 size.
; For the dct_dct pair it also emits a DC-only tail: the first coefficient is
; multiplied by 181, rounded and shifted right by 8, multiplied by 181 again,
; and control passes to the 8x8 dct_dct function's .dconly label.
571*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
572*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 8x16
573*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
574*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
575*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
    ; NOTE(review): r3 is the eob register in the cglobal argument lists of
    ; this file, so with eobd == 0 this presumably leaves 16 (likely the row
    ; count for .dconly) - confirm against INV_TXFM_FN and .dconly.
576*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
577*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
578*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8
579*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
580*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
581*c0909341SAndroid Build Coastguard Worker%endif
582*c0909341SAndroid Build Coastguard Worker%endmacro
583*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points whose first argument is dct; the optional
; third argument is the macro's eob_offset.
584*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, dct
585*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, identity, 35
586*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, flipadst
587*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, adst
588*c0909341SAndroid Build Coastguard Worker
; idct_8x16_internal_10bpc: inverse DCT for 8x16 blocks, 10 bpc.
; First pass: with eob >= 43 all eight 64-byte rows are processed by the
; local .load/.main/.main_end; otherwise .fast loads only the low 256 bits
; of each row and reuses the 8x8 core. Both paths end with `jmp tx2q`.
; .pass2 rearranges the packed words, runs the 8bpc 16-point core, and
; writes the block as four 8x4 groups.
589*c0909341SAndroid Build Coastguard Workercglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    ; NOTE(review): `%undef cmp` presumably removes a preprocessor override
    ; of `cmp` so that the plain instruction is assembled - confirm.
590*c0909341SAndroid Build Coastguard Worker%undef cmp
591*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
592*c0909341SAndroid Build Coastguard Worker    jl .fast
593*c0909341SAndroid Build Coastguard Worker    call .load
594*c0909341SAndroid Build Coastguard Worker    call .main
595*c0909341SAndroid Build Coastguard Worker    call .main_end
; .pass1_end: pack the eight dword registers pairwise (m0|m4, m1|m5, m2|m6,
; m3|m7) into words with signed saturation and continue at tx2q.
; Also used by iadst_8x16_internal_10bpc.
596*c0909341SAndroid Build Coastguard Worker.pass1_end:
597*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4
598*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m5
599*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m6
600*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m7
601*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
602*c0909341SAndroid Build Coastguard Worker.pass2:
    ; Byte-permute all four registers with idct8x16p, then unpack into the
    ; row pairs annotated below.
603*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(idct8x16p)]
604*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m8, x}, m0, m1, m2, m3
605*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m0, m1
606*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1
607*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m2, m3
608*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3
609*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m0, m2 ; 15  1
610*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m2     ;  7  9
611*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m5, m4 ;  3 13
612*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m4     ; 11  5
    ; r5 is set to o_base_8bpc before the 8bpc routine is called; the
    ; constants loaded after the call are therefore addressed without o().
613*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
614*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym7, m8, 1  ; 14  2
615*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m0, 1  ;  6 10
616*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym6, m1, 1  ; 12  4
617*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym9, m5, 1  ;  8  0
618*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main2
619*c0909341SAndroid Build Coastguard Worker    mova                 m8, [permC]
620*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [pw_2048]
    ; Merge the register pairs m0/m1, m2/m3, m4/m5, m6/m7 with the permC
    ; qword indices; the setup for the writer (r6 = stride*3,
    ; m11 = pixel_10bpc_max, m10 = 0) is interleaved with the permutes.
621*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m8, m1
622*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
623*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m8, m3
624*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pixel_10bpc_max]
625*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m8, m5
626*c0909341SAndroid Build Coastguard Worker    pxor                m10, m10
627*c0909341SAndroid Build Coastguard Worker    vpermt2q             m6, m8, m7
    ; pmulhrsw by pw_2048 is a rounded >> 4 of each word. Each scaled
    ; group goes to the shared writer in m8.
628*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m0
629*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).write_8x4_start
630*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m2
631*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).write_8x4
632*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m4
633*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).write_8x4
634*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m6
635*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_10bpc).write_8x4
; .fast: first pass for eob < 43. Only the low 256 bits of each row are
; loaded; .round_input_fast merges them into m0-m3 and pre-scales them, and
; the 8x8 idct .main/.main_end do the transform.
636*c0909341SAndroid Build Coastguard Worker.fast:
637*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64*0]
638*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+64*2]
639*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+64*1]
640*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*5]
641*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+64*4]
642*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+64*6]
643*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64*7]
644*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+64*3]
645*c0909341SAndroid Build Coastguard Worker    call .round_input_fast
646*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main
647*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main_end
    ; Pack to words, swap the dwords within each qword of m3 (rotate by 32),
    ; and reorder dwords with the permC+3 indices before splitting each
    ; result into 256-bit halves.
648*c0909341SAndroid Build Coastguard Worker    movu                 m6, [o(permC+3)]
649*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m1, m3
650*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m0, m2
651*c0909341SAndroid Build Coastguard Worker    vprolq               m3, 32
652*c0909341SAndroid Build Coastguard Worker    vpermd               m1, m6, m1
653*c0909341SAndroid Build Coastguard Worker    vpermd               m3, m6, m3
654*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym1    ; 0 4
655*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym1, m1, 1  ; 1 5
656*c0909341SAndroid Build Coastguard Worker    mova                ym2, ym3    ; 2 6
657*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m3, 1  ; 3 7
658*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; .round_input_fast: input stage of the .fast path.
; In:  m0-m7 as loaded by .fast (only the low 256 bits of each are loaded).
; Out: m0-m3 = register pairs (m0,m4) (m1,m5) (m2,m6) (m3,m7) merged by
;      vpermt2q, then scaled as (x * 2896 + 2048) >> 12
;      m11 = pd_1, m12 = pd_2896, m13 = pd_2048,
;      m14 / m15 = clip_18b_min / clip_18b_max.
; m8 is overwritten with the permutation indices.
659*c0909341SAndroid Build Coastguard WorkerALIGN function_align
660*c0909341SAndroid Build Coastguard Worker.round_input_fast:
661*c0909341SAndroid Build Coastguard Worker    movshdup             m8, [o(permB)] ; indices: odd dwords of permB, duplicated
662*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
663*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m8, m4
664*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m8, m5
665*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m8, m6
666*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m8, m7
    ; The constant loads are interleaved with the multiply/add/shift chain.
667*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
668*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m0, m1, m2, m3
669*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
670*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
671*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m0, m1, m2, m3
672*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
673*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m0, m1, m2, m3
674*c0909341SAndroid Build Coastguard Worker    ret
; Full-size input stage for the 8x16 first pass. Entry points:
;   .load  - load the clip constants into m14/m15, then fall into .load2.
;   .load2 - m0-m7 = rows cq+64*0..7 multiplied by pd_2896 (m12); also
;            loads m13 = pd_2048 and falls into .round.
;   .round - m0-m7 = (x + m13) >> 12.
; After .load every coefficient is (c * 2896 + 2048) >> 12.
675*c0909341SAndroid Build Coastguard WorkerALIGN function_align
676*c0909341SAndroid Build Coastguard Worker.load:
677*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
678*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
679*c0909341SAndroid Build Coastguard Worker.load2:
680*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
681*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+64*0]
682*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+64*1]
683*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+64*2]
684*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+64*3]
685*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
686*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+64*4]
687*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+64*5]
688*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+64*6]
689*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+64*7]
690*c0909341SAndroid Build Coastguard Worker.round:
691*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m0, m1, m2, m3
692*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3
693*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m4, m5, m6, m7
694*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m4, m5, m6, m7
695*c0909341SAndroid Build Coastguard Worker    ret
; 8-point inverse DCT core with one coefficient index per register (dwords).
; Entry points, by how many input registers are read:
;   .main_fast2 - reads m0 and m1 only.
;   .main_fast  - reads m0-m3 only.
;   .main       - reads m0-m7; rotations via ITX_MULSUB_2D (defined outside
;                 this excerpt).
;   .main2      - shared tail of .main_fast and .main.
;   *_rect2     - first apply the rounded >> 12 to the inputs (the caller is
;                 expected to have done the multiply), then continue with
;                 the corresponding plain entry.
; Expects m12 = pd_2896, m13 = pd_2048, m14/m15 = clip min/max.
; Out (the layout consumed by .main_end2): m0-m3 = dct4 out0..out3,
;      m8 = t4, m4 and m5 = the rotated t5/t6 pair, m6 = t7.
696*c0909341SAndroid Build Coastguard WorkerALIGN function_align
697*c0909341SAndroid Build Coastguard Worker.main_fast2_rect2:
698*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m0, m1
699*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1
700*c0909341SAndroid Build Coastguard Worker.main_fast2:
701*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12
702*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1, [o(pd_4017)] {1to16} ; t7a
703*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m1, [o(pd_799)] {1to16}  ; t4a
704*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m0, m6, m8
705*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m0, m6, m8
    ; m4/m5 = rounded difference/sum of m6 and m8, each multiplied by m12.
706*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m6, m12
707*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m8, m12
708*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
709*c0909341SAndroid Build Coastguard Worker    psubd                m4, m5, m1
710*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
711*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m4, m5
    ; All four dct4 outputs are copies of m0 on this path.
712*c0909341SAndroid Build Coastguard Worker    REPX    {mova   x, m0 }, m1, m2, m3
713*c0909341SAndroid Build Coastguard Worker    ret
714*c0909341SAndroid Build Coastguard Worker.main_fast_rect2:
715*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m0, m1, m2, m3
716*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3
717*c0909341SAndroid Build Coastguard Worker.main_fast:
718*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12
719*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m3, [o(pd_2276)] {1to16} ; t5a
720*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3406)] {1to16}     ; t6a
721*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m1, [o(pd_4017)] {1to16} ; t7a
722*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [o(pd_799)] {1to16}      ; t4a
723*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m2, [o(pd_3784)] {1to16} ; t3
724*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_1567)] {1to16}     ; t2
725*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13
726*c0909341SAndroid Build Coastguard Worker    psubd                m5, m13, m5 ; negate t5a and add the rounding bias in one step
727*c0909341SAndroid Build Coastguard Worker    psrad                m0, 12                       ; t0
728*c0909341SAndroid Build Coastguard Worker    mova                 m9, m0                       ; t1
729*c0909341SAndroid Build Coastguard Worker    jmp .main2
730*c0909341SAndroid Build Coastguard Worker.main_rect2:
731*c0909341SAndroid Build Coastguard Worker    call .round
732*c0909341SAndroid Build Coastguard Worker.main:
733*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12
734*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
735*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 7, 8, 9, 10, _,  799, 4017 ; t4a t7a
736*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 6, 8, 9, 10, _, 1567, 3784 ; t2  t3
737*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12
738*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13
739*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
740*c0909341SAndroid Build Coastguard Worker    psubd                m9, m0, m4 ; t1
741*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t0
742*c0909341SAndroid Build Coastguard Worker    psrad                m9, 12
743*c0909341SAndroid Build Coastguard Worker    psrad                m0, 12
744*c0909341SAndroid Build Coastguard Worker.main2:
    ; m5 already carries the rounding bias from both entry paths, so only
    ; m3, m1 and m7 get m13 added here before all four are shifted.
745*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m3, m1, m7
746*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m5, m1, m3, m7
747*c0909341SAndroid Build Coastguard Worker    paddd                m8, m1, m5 ; t4
748*c0909341SAndroid Build Coastguard Worker    psubd                m1, m5     ; t5a
749*c0909341SAndroid Build Coastguard Worker    psubd                m5, m7, m3 ; t6a
750*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t7
    ; The clamps of t5a/t6a, the rounding of t2/t3 (m2/m6) and the clamps
    ; of t4/t7 are interleaved below.
751*c0909341SAndroid Build Coastguard Worker    pmaxsd               m5, m14
752*c0909341SAndroid Build Coastguard Worker    pmaxsd               m1, m14
753*c0909341SAndroid Build Coastguard Worker    paddd                m2, m13
754*c0909341SAndroid Build Coastguard Worker    paddd                m6, m13
755*c0909341SAndroid Build Coastguard Worker    pminsd               m5, m15
756*c0909341SAndroid Build Coastguard Worker    pminsd               m1, m15
757*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12
758*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12
759*c0909341SAndroid Build Coastguard Worker    pmaxsd               m8, m14
760*c0909341SAndroid Build Coastguard Worker    pmaxsd               m7, m14
761*c0909341SAndroid Build Coastguard Worker    pminsd               m8, m15
762*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
763*c0909341SAndroid Build Coastguard Worker    psubd                m4, m5, m1
764*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
765*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m2, m6, m5, m4
766*c0909341SAndroid Build Coastguard Worker    paddd                m1, m9, m2 ; dct4 out1
767*c0909341SAndroid Build Coastguard Worker    psubd                m2, m9, m2 ; dct4 out2
768*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m6 ; dct4 out3
769*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; dct4 out0
    ; m6 (t3) has been consumed above; it now receives t7 with the upper
    ; clamp applied (the lower clamp was the pmaxsd on m7 earlier).
770*c0909341SAndroid Build Coastguard Worker    pminsd               m6, m15, m7
771*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m1, m2, m3
772*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m1, m2, m3
773*c0909341SAndroid Build Coastguard Worker    ret
; .main_end / .main_end2: final butterflies of the 8-point DCT plus the
; first-pass rounding shift.
;   .main_end  - loads m11 = pd_1 first.
;   .main_end2 - entry for callers that have already set m11.
; In:  m0-m3 = dct4 out0..out3, m4/m5/m6/m8 = odd half as left by .main.
; Out: m0-m7 = out0..out7. m11 is added to m0-m3 before the butterflies and
;      then used as the vpsravd shift count, so with m11 = pd_1 each output
;      is (x + 1) >> 1.
774*c0909341SAndroid Build Coastguard Worker.main_end:
775*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
776*c0909341SAndroid Build Coastguard Worker.main_end2:
777*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m11}, m0, m1, m2, m3
778*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m6 ; out7
779*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6     ; out0
780*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m5 ; out6
781*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; out1
782*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m4 ; out5
783*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4     ; out2
784*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m8 ; out4
785*c0909341SAndroid Build Coastguard Worker    paddd                m3, m8     ; out3
786*c0909341SAndroid Build Coastguard Worker    REPX   {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
787*c0909341SAndroid Build Coastguard Worker    ret
788*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_8X16_FN with adst as the first transform type and each
; of dct / identity / flipadst / adst as the second type.
; NOTE(review): the macro is defined outside this section; the optional third
; argument (35, identity only) is presumably an eob offset -- confirm there.
789*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, dct
790*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, identity, 35
791*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, flipadst
792*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, adst
793*c0909341SAndroid Build Coastguard Worker
; iadst_8x16_internal_10bpc -- adst body of the 8x16 inverse transform, 10 bpc.
;   entry .. .fast_end   : first pass. The full path finishes in .pass1_end of
;                          idct_8x16_internal_10bpc; the fast path finishes by
;                          jumping through tx2q.
;   .pass2 .. .pass2_end : second pass; ends in the 8x4 writer of
;                          idct_8x8_internal_10bpc.
; The helper routines .main, .fast_main and .pass2_main follow this block.
794*c0909341SAndroid Build Coastguard Workercglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
795*c0909341SAndroid Build Coastguard Worker%undef cmp
    ; eob < 43 (signed compare): take .fast, whose loader (.fast_main) only
    ; reads the low 256 bits of each 64-byte coefficient row.
796*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
797*c0909341SAndroid Build Coastguard Worker    jl .fast
798*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).load
799*c0909341SAndroid Build Coastguard Worker    call .main
    ; .main returns its results biased but not yet shifted:
    ;   m0, m1, m10, m11 carry the pd_1 bias           -> shift right by 1
    ;   m2, m3, m8,  m9  carry the pd_3072/pd_3071 bias -> shift right by 12
    ; The shifts below also collect all eight results into m0-m7.
800*c0909341SAndroid Build Coastguard Worker    psrad                m0, 1
801*c0909341SAndroid Build Coastguard Worker    psrad                m1, 1
802*c0909341SAndroid Build Coastguard Worker    psrad                m6, m10, 1
803*c0909341SAndroid Build Coastguard Worker    psrad                m7, m11, 1
804*c0909341SAndroid Build Coastguard Worker    psrad                m2, 12
805*c0909341SAndroid Build Coastguard Worker    psrad                m3, 12
806*c0909341SAndroid Build Coastguard Worker    psrad                m4, m8, 12
807*c0909341SAndroid Build Coastguard Worker    psrad                m5, m9, 12
808*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_10bpc).pass1_end
809*c0909341SAndroid Build Coastguard Worker.fast:
    ; .fast_main tail-jumps into iadst_8x8_internal_10bpc.main, so its results
    ; arrive in the layout described by the comments below.
    ; NOTE(review): m11 is not written in this section; presumably the 8x8
    ; .main leaves the rounding constant there -- confirm.
    ; Lanes commented as negated are flipped back by computing m11 - x.
810*c0909341SAndroid Build Coastguard Worker    call .fast_main
811*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m2, m4 ;  out4  out6
812*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m0     ; -out5 -out7
813*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m3     ;  out0  out2
814*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m3     ; -out1 -out3
815*c0909341SAndroid Build Coastguard Worker    paddd                m1, m11
816*c0909341SAndroid Build Coastguard Worker    psubd                m3, m11, m2
817*c0909341SAndroid Build Coastguard Worker    paddd                m0, m11
818*c0909341SAndroid Build Coastguard Worker    psubd                m4, m11, m4
819*c0909341SAndroid Build Coastguard Worker.fast_end:
    ; Also entered from iflipadst_8x16_internal_10bpc.fast.
    ; Shift right by 1, pack the dwords to signed-saturated words, permute the
    ; packed dwords with the indices loaded from permC+3, then split each
    ; 512-bit result into two 256-bit registers (ym0/ym2 from m2, ym1/ym3 from
    ; m3) before continuing at the address held in tx2q.
820*c0909341SAndroid Build Coastguard Worker    movu                 m5, [o(permC+3)]
821*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m1, m0, m3, m4
822*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m0, m1 ; 0 2 4 6
823*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4, m3 ; 1 3 5 7
824*c0909341SAndroid Build Coastguard Worker    vpermd               m2, m5, m2
825*c0909341SAndroid Build Coastguard Worker    vpermd               m3, m5, m3
826*c0909341SAndroid Build Coastguard Worker    mova                ym0, ym2
827*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym2, m2, 1
828*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym3
829*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m3, 1
830*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
831*c0909341SAndroid Build Coastguard Worker.pass2:
    ; Second pass. .pass2_main leaves its results in m0-m3.
    ; NOTE(review): constants are addressed without o() from here on,
    ; presumably because .pass2_main repointed r5 at o_base_8bpc -- confirm
    ; against the definition of o().
    ; One unaligned load from permB+2 supplies all four qword-permute index
    ; vectors: the psrlq steps shift later bytes of each qword into the low
    ; position read by vpermi2q/vpermq.
    ; m12 is the per-lane multiplier applied with pmulhrsw in .pass2_end.
832*c0909341SAndroid Build Coastguard Worker    call .pass2_main
833*c0909341SAndroid Build Coastguard Worker    movu                 m4, [permB+2]
834*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8     m12, [pw_2048_m2048+16]
835*c0909341SAndroid Build Coastguard Worker    psrlq                m7, m4, 8
836*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m0, m3 ;  0  1  2  3
837*c0909341SAndroid Build Coastguard Worker    psrlq                m5, m7, 24
838*c0909341SAndroid Build Coastguard Worker    vpermi2q             m7, m0, m3 ; 12 13 14 15
839*c0909341SAndroid Build Coastguard Worker    psrlq                m6, m5, 8
840*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, m1 ;  4  5  6  7
841*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m6, m2 ;  8  9 10 11
842*c0909341SAndroid Build Coastguard Worker.pass2_end:
    ; Also entered from iflipadst_8x16_internal_10bpc.pass2.
    ; In: m4-m7 = four row groups, m12 = pmulhrsw multiplier.
    ; Sets m11 = broadcast pixel_10bpc_max, m10 = 0, r6 = 3*stride, then for
    ; each row group computes m8 = pmulhrsw(m12, rows) and calls the 8x4 writer
    ; of idct_8x8_internal_10bpc.
    ; NOTE(review): m8/m10/m11/r6 are presumably that writer's inputs; its body
    ; is outside this section.
    ; The last group is written with a tail jump, so the writer's ret ends the
    ; pass.
843*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pixel_10bpc_max]
844*c0909341SAndroid Build Coastguard Worker    pxor                m10, m10
845*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
846*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m4
847*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).write_8x4_start
848*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m5
849*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).write_8x4
850*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m6
851*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).write_8x4
852*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12, m7
853*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x8_internal_10bpc).write_8x4
; iadst_8x16_internal_10bpc.main -- adst core on 32-bit lanes.
; Called by the first pass of both iadst_8x16 and iflipadst_8x16.
; In:  m0-m7 = inputs
;      m13   = passed to every ITX_MULSUB_2D (presumably the rounding
;              constant -- confirm against the macro)
;      m14   = lower clamp bound (pmaxsd), m15 = upper clamp bound (pminsd)
; Out: every result is biased but NOT shifted; the caller applies the shift.
;      m0  = out0 + pd_1              (caller shifts right by 1)
;      m1  = pd_1 - (-out1)           (caller shifts right by 1)
;      m10 = out6 + pd_1              (caller shifts right by 1)
;      m11 = pd_1 - (-out7)           (caller shifts right by 1)
;      m2  = (t6 + t7) * 1448 + pd_3072   (caller shifts right by 12)
;      m8  = (t2 - t3) * 1448 + pd_3072   (caller shifts right by 12)
;      m3  = pd_3071 - (t2 + t3) * 1448   (caller shifts right by 12)
;      m9  = pd_3071 - (t6 - t7) * 1448   (caller shifts right by 12)
; Also overwritten: m4 (pd_1), m5, m6 (out6), m7 (-out7), m12 (pd_1448).
854*c0909341SAndroid Build Coastguard WorkerALIGN function_align
855*c0909341SAndroid Build Coastguard Worker.main:
    ; Stage 1: four rotations; m8/m9/m10 serve as scratch for the macro.
856*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 0, 8, 9, 10, 13,  401, 4076 ; t1a, t0a
857*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
858*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
859*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
    ; Stage 2: butterflies, then clamp all eight values to [m14, m15].
860*c0909341SAndroid Build Coastguard Worker    psubd                m8, m2, m6 ; t6
861*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t2
862*c0909341SAndroid Build Coastguard Worker    psubd                m6, m0, m4 ; t4
863*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t0
864*c0909341SAndroid Build Coastguard Worker    psubd                m4, m5, m1 ; t7
865*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1     ; t3
866*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m3 ; t5
867*c0909341SAndroid Build Coastguard Worker    paddd                m7, m3     ; t1
868*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
869*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
    ; Stage 3: two rotations whose coefficients are supplied in registers
    ; m10/m11 (pd_1567/pd_3784) instead of as immediates.
870*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_1567)]
871*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_3784)]
872*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
873*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
874*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_1448)]
    ; Stage 4: final butterflies. out0/out6/-out1/-out7 are complete here; the
    ; remaining four values are clamped and multiplied by pd_1448 below.
875*c0909341SAndroid Build Coastguard Worker    psubd                m9, m6, m8 ;  t7
876*c0909341SAndroid Build Coastguard Worker    paddd                m6, m8     ;  out6
877*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m5 ;  t3
878*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5     ; -out7
879*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m2 ;  t2
880*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ;  out0
881*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m4 ;  t6
882*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4     ; -out1
883*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m3, m2, m9
884*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m3, m2, m9
885*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m5, m3, m2, m9
886*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pd_1)]
887*c0909341SAndroid Build Coastguard Worker    psubd                m8, m5, m3 ; (t2 - t3) * 1448
888*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5     ; (t2 + t3) * 1448
889*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m9 ; (t6 - t7) * 1448
890*c0909341SAndroid Build Coastguard Worker    paddd                m2, m9     ; (t6 + t7) * 1448
    ; Apply the biases. The negated outputs (-out1, -out7) and the two values
    ; in m3/m5 are sign-flipped by subtracting them from the bias constant.
    ; m9 is reused as the bias register: first pd_3072, then pd_3071.
891*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_3072)]
892*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
893*c0909341SAndroid Build Coastguard Worker    psubd                m1, m4, m1
894*c0909341SAndroid Build Coastguard Worker    paddd               m10, m6, m4
895*c0909341SAndroid Build Coastguard Worker    psubd               m11, m4, m7
896*c0909341SAndroid Build Coastguard Worker    paddd                m2, m9
897*c0909341SAndroid Build Coastguard Worker    paddd                m8, m9
898*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_3071)]
899*c0909341SAndroid Build Coastguard Worker    psubd                m3, m9, m3
900*c0909341SAndroid Build Coastguard Worker    psubd                m9, m5
901*c0909341SAndroid Build Coastguard Worker    ret
; iadst_8x16_internal_10bpc.fast_main -- loader for the eob < 43 path.
; Called by the .fast path of both iadst_8x16 and iflipadst_8x16.
; Reads only the low 256 bits (ym) of the eight 64-byte coefficient rows at cq,
; in this register order:
;   ym0=row0 ym4=row2 ym1=row7 ym5=row5 ym2=row4 ym6=row6 ym3=row3 ym7=row1
; It then calls idct_8x16_internal_10bpc.round_input_fast and tail-jumps into
; iadst_8x8_internal_10bpc.main, whose ret returns to this routine's caller.
; NOTE(review): the row order presumably matches the input layout the 8x8 adst
; .main expects -- confirm against that routine.
902*c0909341SAndroid Build Coastguard WorkerALIGN function_align
903*c0909341SAndroid Build Coastguard Worker.fast_main:
904*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64*0]
905*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+64*2]
906*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+64*7]
907*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*5]
908*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+64*4]
909*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+64*6]
910*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64*3]
911*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+64*1]
912*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).round_input_fast
913*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_10bpc).main
; iadst_8x16_internal_10bpc.pass2_main -- second-pass core.
; Called by .pass2 of both iadst_8x16 and iflipadst_8x16.
; In:  m0-m3 = first-pass output.
; Steps:
;   1. byte-permute m0-m3 with the iadst8x16p table (vpermb)
;   2. interleave dwords, then qwords, into the row pairing shown in the
;      comments (m0, m4, m5, m6)
;   3. repoint r5 at o_base_8bpc and call iadst_8x16_internal_8bpc.main2
;   4. m1 = sat(m2 + m4), m2 = sat(m2 - m4), both scaled by pw_2896x8 (m10)
;      with pmulhrsw
; Side effect: r5 = o_base_8bpc on return, so callers reference constants
; without o() afterwards.
; NOTE(review): the contents of m0/m3 on return come from the 8bpc .main2,
; which is outside this section.
914*c0909341SAndroid Build Coastguard WorkerALIGN function_align
915*c0909341SAndroid Build Coastguard Worker.pass2_main:
916*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(iadst8x16p)]
917*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m8, x}, m0, m1, m2, m3
918*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_2896x8)]
919*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m0, m1
920*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1
921*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2, m3
922*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3
923*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
924*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m0, m2 ; 12  3   14  1
925*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2     ;  0 15    2 13
926*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m6, m5, m1 ;  8  7   10  5
927*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m1     ;  4 11    6  9
928*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main2
929*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m4
930*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m4
931*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10    ; -out7   out4   out6  -out5
932*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m10    ;  out8  -out11 -out9   out10
933*c0909341SAndroid Build Coastguard Worker    ret
934*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_8X16_FN with flipadst as the first transform type and
; each of dct / identity / adst / flipadst as the second type.
; NOTE(review): the macro is defined outside this section; the optional third
; argument (35, identity only) is presumably an eob offset -- confirm there.
935*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, dct
936*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, identity, 35
937*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, adst
938*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, flipadst
939*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x16_internal_10bpc -- flipadst body of the 8x16 inverse transform,
; 10 bpc. It reuses the .main, .fast_main, .fast_end, .pass2_main and
; .pass2_end routines of iadst_8x16_internal_10bpc and differs only in the
; order in which results are placed into registers.
940*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
941*c0909341SAndroid Build Coastguard Worker%undef cmp
    ; eob < 43 (signed compare): take the reduced-input .fast path.
942*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
943*c0909341SAndroid Build Coastguard Worker    jl .fast
944*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).load
945*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_10bpc).main
    ; Same shifts as the adst variant (1 for the pd_1-biased results, 12 for
    ; the pd_3072/pd_3071-biased ones), but the destinations are reversed:
    ;   m7<-m0  m0<-m11  m6<-m1  m1<-m10  m5<-m2  m2<-m9  m4<-m3  m3<-m8
    ; Each source is read before the instruction that overwrites it.
946*c0909341SAndroid Build Coastguard Worker    psrad                m7, m0, 1
947*c0909341SAndroid Build Coastguard Worker    psrad                m0, m11, 1
948*c0909341SAndroid Build Coastguard Worker    psrad                m6, m1, 1
949*c0909341SAndroid Build Coastguard Worker    psrad                m1, m10, 1
950*c0909341SAndroid Build Coastguard Worker    psrad                m5, m2, 12
951*c0909341SAndroid Build Coastguard Worker    psrad                m2, m9, 12
952*c0909341SAndroid Build Coastguard Worker    psrad                m4, m3, 12
953*c0909341SAndroid Build Coastguard Worker    psrad                m3, m8, 12
954*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_10bpc).pass1_end
955*c0909341SAndroid Build Coastguard Worker.fast:
    ; Same as the adst .fast path with the qword pairs gathered in reversed
    ; order; lanes commented as negated are flipped by computing m11 - x.
    ; NOTE(review): m11 is presumably the rounding constant left by the 8x8
    ; adst .main that .fast_main jumps into -- confirm.
956*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_10bpc).fast_main
957*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m3, m4 ; -out3 -out1
958*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m0     ;  out2  out0
959*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m2     ; -out7 -out5
960*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m2     ;  out6  out4
961*c0909341SAndroid Build Coastguard Worker    psubd                m1, m11, m1
962*c0909341SAndroid Build Coastguard Worker    paddd                m3, m11
963*c0909341SAndroid Build Coastguard Worker    psubd                m0, m11, m0
964*c0909341SAndroid Build Coastguard Worker    paddd                m4, m11
965*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x16_internal_10bpc).fast_end
966*c0909341SAndroid Build Coastguard Worker.pass2:
    ; Second pass: same structure as the adst .pass2, but the permute sources
    ; are swapped (m3,m0 and m2 before m1) so the row groups come out in
    ; reverse order, and m12 is loaded from pw_m2048_2048+16 rather than
    ; pw_2048_m2048+16.
    ; NOTE(review): constants are addressed without o() here, presumably
    ; because .pass2_main repointed r5 at o_base_8bpc -- confirm.
967*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_10bpc).pass2_main
968*c0909341SAndroid Build Coastguard Worker    movu                 m7, [permB+2]
969*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x8     m12, [pw_m2048_2048+16]
970*c0909341SAndroid Build Coastguard Worker    psrlq                m4, m7, 8
971*c0909341SAndroid Build Coastguard Worker    vpermi2q             m7, m3, m0 ;  3  2  1  0
972*c0909341SAndroid Build Coastguard Worker    psrlq                m5, m4, 24
973*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m3, m0 ; 15 14 13 12
974*c0909341SAndroid Build Coastguard Worker    psrlq                m6, m5, 8
975*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m5, m2 ; 11 10  9  8
976*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m6, m1 ;  7  6  5  4
977*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x16_internal_10bpc).pass2_end
978*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_8X16_FN with identity as the first transform type and
; each of dct / adst / flipadst / identity as the second type. No third
; argument is passed for any of these combinations.
; NOTE(review): the macro is defined outside this section.
979*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, dct
980*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, adst
981*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, flipadst
982*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, identity
983*c0909341SAndroid Build Coastguard Worker
; iidentity_8x16_internal_10bpc -- identity body of the 8x16 inverse transform,
; 10 bpc.
;   entry  : first pass; loads via idct_8x16_internal_10bpc.load2 and goes
;            straight to that function's .pass1_end (no butterfly stage here).
;   .pass2 : second pass on 16-bit lanes in m0-m3; scales, transposes and adds
;            the result to the destination through .write_8x4_start/.write_8x4.
984*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
985*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).load2
986*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_10bpc).pass1_end
987*c0909341SAndroid Build Coastguard Worker.pass2:
    ; x = sat(2*x) + pmulhrsw(x, pw_1697x16), computed with saturating adds.
    ; NOTE(review): if pw_1697x16 holds 1697*16 as its name suggests, this is
    ; about x * 2.8286, i.e. roughly 2*sqrt(2) -- confirm the constant.
988*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_1697x16)]
989*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8, m0
990*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m8, m1
991*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m8, m2
992*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m8, m3
993*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m0, m1, m2, m3
994*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
995*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
996*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6
997*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m7
    ; Transpose with word -> dword -> qword interleaves. The constants the
    ; writer needs are loaded in between:
    ;   m7 = pw_2048 (final pmulhrsw multiplier)
    ;   m6 = pixel_10bpc_max (upper clamp)
    ;   m5 = 0 (lower clamp, and the value stored to clear coefficients)
998*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_2048)]
999*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0, m1
1000*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1001*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
1002*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
1003*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pixel_10bpc_max)]
1004*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2
1005*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
1006*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m1
1007*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m1
1008*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
1009*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2 ;  1  5  9 13
1010*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2     ;  0  4  8 12
1011*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4 ;  2  6 10 14
1012*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4     ;  3  7 11 15
    ; r6 = 3*stride, so [dstq+r6*4] in the writer addresses row 12.
    ; Each register is scaled by m7 into m0 and written. The fourth one falls
    ; through into .write_8x4, whose ret therefore ends the pass.
1013*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
1014*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
1015*c0909341SAndroid Build Coastguard Worker    call .write_8x4_start
1016*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, m1
1017*c0909341SAndroid Build Coastguard Worker    call .write_8x4
1018*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, m2
1019*c0909341SAndroid Build Coastguard Worker    call .write_8x4
1020*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, m3
    ; .write_8x4: advance dst by one row and cq by 128 bytes, then fall into
    ; .write_8x4_start.
1021*c0909341SAndroid Build Coastguard Worker.write_8x4:
1022*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
1023*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*2
    ; .write_8x4_start
    ; In:  m0 = four rows of 16-bit residuals (one row per 128-bit lane)
    ;      m5 = 0, m6 = upper clamp, r6 = 3*stride
    ; Gathers the 16-byte destination rows at dstq + stride*{0,4,8,12} into m4,
    ; clears 128 bytes of coefficients at cq, adds m0, clamps to [m5, m6] and
    ; scatters the rows back.
    ; Over the four invocations this covers all 16 rows and clears 512 bytes at
    ; cq. Overwrites m4.
1024*c0909341SAndroid Build Coastguard Worker.write_8x4_start:
1025*c0909341SAndroid Build Coastguard Worker    mova                xm4, [dstq+strideq*0]
1026*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, [dstq+strideq*4], 1
1027*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [dstq+strideq*8], 2
1028*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [dstq+r6*4     ], 3
1029*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*0], m5
1030*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*1], m5
1031*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
1032*c0909341SAndroid Build Coastguard Worker    pmaxsw               m4, m5
1033*c0909341SAndroid Build Coastguard Worker    pminsw               m4, m6
1034*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm4
1035*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*4], ym4, 1
1036*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*8], m4, 2
1037*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r6*4     ], m4, 3
1038*c0909341SAndroid Build Coastguard Worker    ret
1039*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X8_FN type1, type2[, eob_offset = 0]
; Forwards to INV_TXFM_FN with size 16x8. For the dct_dct combination only, it
; additionally emits the DC-only path (.dconly, .dconly2, .dconly_loop), which
; adds one constant to every pixel of the block.
; DC-only register use:
;   r6d = running DC value
;   r3d = rows remaining (two rows are processed per iteration)
;   m1  = broadcast DC word plus the dconly_10bpc bias, m2 = dconly_10bpc
1040*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
1041*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 16x8
1042*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
    ; r6d = coef[0] * 181 (181/256 ~= 1/sqrt(2)); then clear the coefficient by
    ; storing eobd, which the existing comment marks as 0 here.
    ; NOTE(review): r3d is presumably the same register as eobd (4th argument),
    ; making the 'or' set the row count to 8 -- confirm against INV_TXFM_FN.
1043*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
1044*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
1045*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8
1046*c0909341SAndroid Build Coastguard Worker.dconly:
    ; r6d = (((r6d + 128) >> 8) * 181 + 384) >> 9
1047*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
1048*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8
1049*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
1050*c0909341SAndroid Build Coastguard Worker    add                 r6d, 384
1051*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 9
1052*c0909341SAndroid Build Coastguard Worker.dconly2:
    ; r6d = (r6d * 181 + 2176) >> 12, broadcast as a word into m1, then biased
    ; by m2 with a signed-saturating add.
1053*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(dconly_10bpc)]
1054*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
1055*c0909341SAndroid Build Coastguard Worker    add                 r6d, 2176
1056*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 12
1057*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m1, r6d
1058*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2
1059*c0909341SAndroid Build Coastguard Worker.dconly_loop:
    ; Two 32-byte rows per iteration: pixel = usat(ssat(pixel + m1) - m2).
    ; NOTE(review): the paddsw/psubusw pair presumably clamps to
    ; [0, pixel max] via the dconly_10bpc bias -- confirm the constant.
    ; Loops while r3d - 2 > 0 (signed).
1060*c0909341SAndroid Build Coastguard Worker    mova                ym0, [dstq+strideq*0]
1061*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [dstq+strideq*1], 1
1062*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1
1063*c0909341SAndroid Build Coastguard Worker    psubusw              m0, m2
1064*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
1065*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
1066*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1067*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 2
1068*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
1069*c0909341SAndroid Build Coastguard Worker    RET
1070*c0909341SAndroid Build Coastguard Worker%endif
1071*c0909341SAndroid Build Coastguard Worker%endmacro
1072*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X8_FN with dct as the first transform type and each of
; dct / identity / flipadst / adst as the second type. The dct, dct
; instantiation also emits the DC-only path; dct, identity passes an
; eob_offset of -21.
1073*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, dct
1074*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, identity, -21
1075*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, flipadst
1076*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, adst
1077*c0909341SAndroid Build Coastguard Worker
1078*c0909341SAndroid Build Coastguard Workercglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1079*c0909341SAndroid Build Coastguard Worker%undef cmp
1080*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
1081*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+64*0] ;  0  1
1082*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m12, [cq+64*1] ;  2  3
1083*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m12, [cq+64*2] ;  4  5
1084*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+64*3] ;  6  7
1085*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
1086*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1087*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(permB)]
1088*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
1089*c0909341SAndroid Build Coastguard Worker    psrlq                m0, m15, 32
1090*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m4, m9, m8, m7
1091*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
1092*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m4, m8, m9, m7
1093*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
1094*c0909341SAndroid Build Coastguard Worker    vpermi2q             m0, m4, m8   ;  0  4
1095*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
1096*c0909341SAndroid Build Coastguard Worker    jl .fast
1097*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+64*4] ;  8  9
1098*c0909341SAndroid Build Coastguard Worker    pmulld              m10, m12, [cq+64*5] ; 10 11
1099*c0909341SAndroid Build Coastguard Worker    pmulld              m11, m12, [cq+64*6] ; 12 13
1100*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+64*7] ; 14 15
1101*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
1102*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m5, m10, m11, m6
1103*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m10, m5, m11, m6
1104*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
1105*c0909341SAndroid Build Coastguard Worker    vpermi2q             m1, m9, m10  ;  2 10
1106*c0909341SAndroid Build Coastguard Worker    mova                 m3, m2
1107*c0909341SAndroid Build Coastguard Worker    vpermi2q             m2, m5, m11  ;  8 12
1108*c0909341SAndroid Build Coastguard Worker    vpermi2q             m3, m6, m7   ; 14  6
1109*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m15, m11 ;  1 13
1110*c0909341SAndroid Build Coastguard Worker    vpermt2q             m6, m15, m9  ; 15  3
1111*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m15, m8  ;  9  5
1112*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m15, m10 ;  7 11
1113*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
1114*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main
1115*c0909341SAndroid Build Coastguard Worker    call .main
1116*c0909341SAndroid Build Coastguard Worker    jmp .pass1_end
1117*c0909341SAndroid Build Coastguard Worker.fast:
1118*c0909341SAndroid Build Coastguard Worker    vpermi2q             m1, m9, m7   ;  2  6
1119*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m15, m9  ;  1  3
1120*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m15, m8  ;  7  5
1121*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
1122*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main_fast
1123*c0909341SAndroid Build Coastguard Worker    call .main_fast
1124*c0909341SAndroid Build Coastguard Worker.pass1_end:
1125*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_end
1126*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(permA)]
1127*c0909341SAndroid Build Coastguard Worker    psrlq                m9, m8, 8
1128*c0909341SAndroid Build Coastguard Worker.pass1_end2:
1129*c0909341SAndroid Build Coastguard Worker    mova                m10, m9
1130*c0909341SAndroid Build Coastguard Worker    mova                m11, m8
1131*c0909341SAndroid Build Coastguard Worker    call .transpose_16x8
1132*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1133*c0909341SAndroid Build Coastguard Worker.pass2:
1134*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
1135*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main
1136*c0909341SAndroid Build Coastguard Worker    movshdup             m4, [permC]
1137*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_2048]
1138*c0909341SAndroid Build Coastguard Worker    psrlq                m5, m4, 8
1139*c0909341SAndroid Build Coastguard Worker.end:
1140*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
1141*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
1142*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m4, m0
1143*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m5, m1
1144*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
1145*c0909341SAndroid Build Coastguard Worker    call .write_16x4
1146*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m4, m2
1147*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m5, m3
1148*c0909341SAndroid Build Coastguard Worker.write_16x4:
1149*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11
1150*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11
1151*c0909341SAndroid Build Coastguard Worker.write_16x4_noround:
1152*c0909341SAndroid Build Coastguard Worker    mova               ym10, [dstq+strideq*0]
1153*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m10, [dstq+strideq*1], 1
1154*c0909341SAndroid Build Coastguard Worker    paddw                m8, m10
1155*c0909341SAndroid Build Coastguard Worker    mova               ym10, [dstq+strideq*2]
1156*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m10, [dstq+r6       ], 1
1157*c0909341SAndroid Build Coastguard Worker    paddw                m9, m10
1158*c0909341SAndroid Build Coastguard Worker    pmaxsw               m8, m12
1159*c0909341SAndroid Build Coastguard Worker    pmaxsw               m9, m12
1160*c0909341SAndroid Build Coastguard Worker    pminsw               m8, m13
1161*c0909341SAndroid Build Coastguard Worker    pminsw               m9, m13
1162*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym8
1163*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m8, 1
1164*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym9
1165*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r6       ], m9, 1
1166*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1167*c0909341SAndroid Build Coastguard Worker    ret
1168*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Local helpers of idct_16x8_internal_10bpc (called from elsewhere in this
; file as m(idct_16x8_internal_10bpc).main_fast). Entry points:
;   .main_fast - only m4 and m7 are read as data inputs; m5 and m6 are
;                overwritten with constants, m3 and m9 are used as scratch.
;   .main      - full version: processes the (m4,m6) and (m5,m7) pairs with
;                ITX_MULSUB_2D (macro defined outside this chunk).
;   .main2     - rounding/butterfly tail shared by the two entries above.
;   .main3     - last stage; reads m0/m1/m2/m8 (labelled "dct8 out" below)
;                together with m4/m5/m6/m9 produced by the previous stage.
; Register contract visible in this code: m12 = multiplier of the final
; rotation, m13 = rounding term added before every ">> 12", m14/m15 = lower/
; upper clamp bounds. idct_16x16_internal_10bpc loads pd_2896, pd_2048,
; clip_18b_min and clip_18b_max into m12-m15 before calling .main_fast.
; Results are left in m0-m6 and m8; each register holds the two values named
; in the trailing lane comments.
1169*c0909341SAndroid Build Coastguard Worker.main_fast: ; bottom half is zero
1170*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [o(pd_4076_3920)]
1171*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [o(pd_401_m1189)]
1172*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(pd_m2598_1931)]
1173*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(pd_3166_3612)]
    ; single-operand multiplies: each product register starts out as a constant
1174*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m4    ; t15a t12a
1175*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3    ; t8a  t11a
1176*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m7    ; t9a  t10a
1177*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m9    ; t14a t13a
1178*c0909341SAndroid Build Coastguard Worker    jmp .main2
1179*c0909341SAndroid Build Coastguard Worker.main:
1180*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 6, 3, 9, 10, _,  401_3920, 4076_1189
1181*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
1182*c0909341SAndroid Build Coastguard Worker.main2:
    ; x = (x + m13) >> 12 for the four products, then butterflies + clamp
1183*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m4, m6, m5, m7
1184*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m4, m5, m6, m7
1185*c0909341SAndroid Build Coastguard Worker    paddd                m9, m4, m5 ; t8   t11
1186*c0909341SAndroid Build Coastguard Worker    psubd                m4, m5     ; t9   t10
1187*c0909341SAndroid Build Coastguard Worker    psubd                m5, m6, m7 ; t14  t13
1188*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7     ; t15  t12
1189*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m4, m9, m6
1190*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m4, m9, m6
1191*c0909341SAndroid Build Coastguard Worker.main3:
    ; The sums/differences of m0/m1 and m2/m8 are interleaved with the
    ; 1567/3784 rotation of m5 and m4 below.
1192*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m1 ; dct8 out7 out6
1193*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; dct8 out0 out1
1194*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [o(pd_3784_m3784)]
1195*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m5
1196*c0909341SAndroid Build Coastguard Worker    vpmulld              m5, [o(pd_1567)] {1to16}
1197*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2, m8 ; dct8 out3 out2
1198*c0909341SAndroid Build Coastguard Worker    psubd                m2, m8     ; dct8 out4 out5
1199*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [o(pd_1567_m1567)]
1200*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m4
1201*c0909341SAndroid Build Coastguard Worker    vpmulld              m4, [o(pd_3784)] {1to16}
1202*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m1
1203*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m1
    ; m7 = (m7 + m8 + m13) >> 12, m5 = (m5 - m4 + m13) >> 12
1204*c0909341SAndroid Build Coastguard Worker    paddd                m7, m13
1205*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
1206*c0909341SAndroid Build Coastguard Worker    paddd                m7, m8
1207*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4
1208*c0909341SAndroid Build Coastguard Worker    psrad                m7, 12     ; t14a t10a
1209*c0909341SAndroid Build Coastguard Worker    psrad                m5, 12     ; t9a  t13a
    ; regroup qword halves so the next butterflies pair matching terms
1210*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m9, m7
1211*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m9, m5
1212*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m6, m5
1213*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7
1214*c0909341SAndroid Build Coastguard Worker    psubd                m7, m8, m4 ; t11a t10
1215*c0909341SAndroid Build Coastguard Worker    paddd                m8, m4     ; t8a  t9
1216*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6, m5 ; t12a t13
1217*c0909341SAndroid Build Coastguard Worker    paddd                m6, m5     ; t15a t14
1218*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m4, m7
1219*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m4, m7
1220*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12
1221*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12
1222*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m2, m3, m6, m8
1223*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m2, m3, m6, m8
    ; m5 = (m4*m12 + m7*m12 + m13) >> 12, m4 = (m4*m12 - m7*m12 + m13) >> 12
1224*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
1225*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4, m7
1226*c0909341SAndroid Build Coastguard Worker    psubd                m4, m7
1227*c0909341SAndroid Build Coastguard Worker    psrad                m4, 12     ; t11 t10a
1228*c0909341SAndroid Build Coastguard Worker    psrad                m5, 12     ; t12 t13a
1229*c0909341SAndroid Build Coastguard Worker    ret
1230*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Narrows m0-m7 from dwords to words with signed saturation (m4-m7 are packed
; into the upper halves of m0-m3), then reorders the result with two-source
; dword permutes followed by word and dword interleaves.
; The caller must supply the permutation index vectors in m8, m9, m10 and m11;
; iidentity_16x8_internal_10bpc in this file derives them from permA.
; Output: m0-m3. Clobbers m4, m8, m10 and m11 (m9 is left unchanged).
1231*c0909341SAndroid Build Coastguard Worker.transpose_16x8:
1232*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4
1233*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m5
1234*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m6
1235*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m7
    ; vpermi2d overwrites its index register (m8/m10/m11) with the result,
    ; vpermt2d overwrites its first table operand (m0) instead
1236*c0909341SAndroid Build Coastguard Worker    vpermi2d             m8, m0, m2
1237*c0909341SAndroid Build Coastguard Worker    vpermt2d             m0, m9, m2
1238*c0909341SAndroid Build Coastguard Worker    vpermi2d            m10, m1, m3
1239*c0909341SAndroid Build Coastguard Worker    vpermi2d            m11, m1, m3
1240*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m8, m0
1241*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m8, m0
1242*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m10, m11
1243*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m10, m11
1244*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1245*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1246*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1247*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
1248*c0909341SAndroid Build Coastguard Worker    ret
1249*c0909341SAndroid Build Coastguard Worker
1250*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, dct
1251*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, identity, -21
1252*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, flipadst
1253*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, adst
1254*c0909341SAndroid Build Coastguard Worker
; The four lines above instantiate the 16x8 entry points whose first transform
; type is adst. INV_TXFM_16X8_FN is defined outside this chunk.
; NOTE(review): the optional third argument (-21) is presumably an eob
; offset, as documented on INV_TXFM_16X16_FN later in this file - confirm.
;
; iadst_16x8_internal_10bpc: 16x8 inverse ADST.
;   Entry (first pass): runs .main_pass1, then adds pd_1 to every output while
;   undoing the sign of the registers returned negated (computed as 1 - x) and
;   renumbering m5-m8 down to m4-m7. .pass1_end shifts everything right by 1
;   and continues in idct_16x8_internal_10bpc.pass1_end2.
;   .pass2: runs .main_pass2, then stores two groups of rows through
;   idct_16x8_internal_10bpc.write_16x4_noround, permuting each source
;   register with the qword indices held in m11.
1255*c0909341SAndroid Build Coastguard Workercglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
; NOTE(review): presumably removes a `cmp` redefinition made by the
; INV_TXFM macros so the literal eob compare in .main_pass1 is used - confirm.
1256*c0909341SAndroid Build Coastguard Worker%undef cmp
1257*c0909341SAndroid Build Coastguard Worker    call .main_pass1
1258*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_1)]
1259*c0909341SAndroid Build Coastguard Worker    paddd                m0, m9
1260*c0909341SAndroid Build Coastguard Worker    psubd                m1, m9, m1 ; m1 = 1 - m1
1261*c0909341SAndroid Build Coastguard Worker    paddd                m2, m9
1262*c0909341SAndroid Build Coastguard Worker    psubd                m3, m9, m3 ; m3 = 1 - m3
1263*c0909341SAndroid Build Coastguard Worker    paddd                m4, m9, m5 ; m4 = 1 + m5
1264*c0909341SAndroid Build Coastguard Worker    psubd                m5, m9, m6 ; m5 = 1 - m6
1265*c0909341SAndroid Build Coastguard Worker    paddd                m6, m9, m7 ; m6 = 1 + m7
1266*c0909341SAndroid Build Coastguard Worker    psubd                m7, m9, m8 ; m7 = 1 - m8
1267*c0909341SAndroid Build Coastguard Worker.pass1_end:
    ; m9 = permA, m8 = permA with every qword shifted right by 8 bits
1268*c0909341SAndroid Build Coastguard Worker    mova                 m9, [o(permA)]
1269*c0909341SAndroid Build Coastguard Worker    psrlq                m8, m9, 8
1270*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
1271*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_10bpc).pass1_end2
1272*c0909341SAndroid Build Coastguard Worker.pass2:
1273*c0909341SAndroid Build Coastguard Worker    call .main_pass2
1274*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m11, m0
1275*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m11, m1
1276*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4_noround
1277*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m11, m2
1278*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m11, m3
    ; tail call: the write routine's ret returns to our caller
1279*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
1280*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_pass1: first-pass coefficient load for the 16x8 inverse ADST.
;   Reads 64-byte coefficient rows from cq, scales each one by
;   (x*2896 + 2048) >> 12, clears the rows it has read and interleaves them
;   into the register pairs annotated below. When eob < 43 only rows 0-3 are
;   read and cleared, and the reduced .main_fast multiply stage is used.
; .main / .main_fast / .main2: the transform itself. .main2 loads
;   clip_18b_min / clip_18b_max into m14/m15 and expects m12 (multiplier) and
;   m13 (rounding term) to be set already; .main_pass1 sets them to
;   pd_2896 / pd_2048.
; Outputs: m0-m3 and m5-m8 (m4 is scratch on return). The lane comments show
; which half of each register still carries a negated output.
; Also clobbers m9-m11, r6d and k1.
1281*c0909341SAndroid Build Coastguard Worker.main_pass1:
1282*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
1283*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+64*0]
1284*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+64*1]
1285*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+64*2]
1286*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+64*3]
1287*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
1288*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1289*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(permB)]
    ; clear the coefficient rows that were just consumed
1290*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
1291*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m2, m7, m1, m5
    ; m6 = permB >> 32 (per qword): index vector for the vpermi2q merges
1292*c0909341SAndroid Build Coastguard Worker    psrlq                m6, m10, 32
1293*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m2, m7, m1, m5
1294*c0909341SAndroid Build Coastguard Worker    mova                 m0, m6
1295*c0909341SAndroid Build Coastguard Worker    vpermi2q             m0, m2, m7  ;  0  2
1296*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m10, m2 ;  3  1
1297*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
1298*c0909341SAndroid Build Coastguard Worker    vpermi2q             m2, m1, m5  ;  4  6
1299*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m10, m1 ;  7  5
    ; small eob: rows 4-7 are neither read nor cleared on the fast path
1300*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
1301*c0909341SAndroid Build Coastguard Worker    jl .main_fast
1302*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m12, [cq+64*4]
1303*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+64*5]
1304*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m12, [cq+64*6]
1305*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+64*7]
    ; m4 is still zero here
1306*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
1307*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m8, m3, m9, m1
1308*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m8, m3, m9, m1
1309*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
1310*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m8, m3  ;  8 10
1311*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m10, m8 ; 11  9
1312*c0909341SAndroid Build Coastguard Worker    vpermi2q             m6, m9, m1  ; 12 14
1313*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m9 ; 15 13
1314*c0909341SAndroid Build Coastguard Worker.main:
    ; full first stage on the pairs (m1,m0) (m3,m2) (m5,m4) (m7,m6);
    ; ITX_MULSUB_2D is defined outside this chunk
1315*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 0, 8, 9, 10, _,  201_995,  4091_3973, 1
1316*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
1317*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
1318*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 8, 9, 10, _, 3857_4052, 1380_601
1319*c0909341SAndroid Build Coastguard Worker    jmp .main2
1320*c0909341SAndroid Build Coastguard Worker.main_fast:
    ; reduced first stage: m1/m3/m4/m6 are loaded with constants and only
    ; m0, m2, m5 and m7 are read as data
1321*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [o(pd_4091_3973)]
1322*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [o(pd_201_995)]
1323*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [o(pd_3703_3290)]
1324*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(pd_1751_2440)]
1325*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [o(pd_2751_2106)]
1326*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [o(pd_3035_3513)]
1327*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [o(pd_1380_601)]
1328*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(pd_3857_4052)]
1329*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m0
1330*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m8
1331*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m2
1332*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m9
1333*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m5
1334*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m10
1335*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m7
1336*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m11
1337*c0909341SAndroid Build Coastguard Worker.main2:
1338*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
1339*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
    ; m1 and m3 become (m13 - x), the other six (x + m13); then all >> 12
1340*c0909341SAndroid Build Coastguard Worker    REPX  {psubd x, m13, x}, m1, m3
1341*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, m13   }, m0, m2, m4, m5, m6, m7
1342*c0909341SAndroid Build Coastguard Worker    REPX  {psrad x, 12    }, m0, m4, m1, m5, m2, m6, m3, m7
1343*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m4 ; t8a  t10a
1344*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4     ; t0a  t2a
1345*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m5 ; t9a  t11a
1346*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5     ; t1a  t3a
1347*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m6 ; t12a t14a
1348*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6     ; t4a  t6a
1349*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m7 ; t13a t15a
1350*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7     ; t5a  t7a
1351*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m4, m5, m6
1352*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m4, m5, m6
1353*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(pd_4017_2276)]
1354*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [o(pd_799_3406)]
1355*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         8, 4, 7, 9, _, 13, 10, 11
1356*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 5, 7, 9, _, 13, 11, 10
1357*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m2, m1, m3
1358*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m2, m1, m3
1359*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m2 ; t4   t6
1360*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2     ; t0   t2
1361*c0909341SAndroid Build Coastguard Worker    psubd                m2, m1, m3 ; t5   t7
1362*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3     ; t1   t3
1363*c0909341SAndroid Build Coastguard Worker    psubd                m3, m4, m6 ; t12a t14a
1364*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6     ; t8a  t10a
1365*c0909341SAndroid Build Coastguard Worker    psubd                m6, m8, m5 ; t13a t15a
1366*c0909341SAndroid Build Coastguard Worker    paddd                m8, m5     ; t9a  t11a
1367*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m7, m3, m2, m6
1368*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m7, m3, m2, m6
1369*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m3, m7 ; t12a t4
1370*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m7     ; t14a t6
1371*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m6, m2 ; t15a t7
1372*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m2     ; t13a t5
1373*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1567)]
1374*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_3784)]
1375*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 3, 2, 9, 10, 13, 10, 11
1376*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 6, 2, 9, 10, 13, 11, 10
1377*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m4, m1, m8
1378*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m4, m1, m8
1379*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m4, m0 ; t10a t2
1380*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m0     ; t8a  t0
1381*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m8, m1 ; t11a t3
1382*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m1     ; t9a  t1
1383*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6, m7 ;  out2  -out3
1384*c0909341SAndroid Build Coastguard Worker    psubd                m6, m7     ; t14a t6
1385*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5, m3 ; -out13  out12
1386*c0909341SAndroid Build Coastguard Worker    psubd                m5, m3     ; t15a t7
1387*c0909341SAndroid Build Coastguard Worker    psubd                m3, m8, m0 ; t11  t3a
1388*c0909341SAndroid Build Coastguard Worker    paddd                m8, m0     ;  out14 -out15
1389*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4, m2 ; -out1   out0
1390*c0909341SAndroid Build Coastguard Worker    psubd                m4, m2     ; t10  t2a
1391*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m6, m5, m3, m4
    ; k1 = 0x3333 selects dwords 0-1 (the low qword) of every 128-bit lane;
    ; the scalar setup is interleaved with the vector clamps
1392*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 0x3333
1393*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m6, m5, m3, m4
1394*c0909341SAndroid Build Coastguard Worker    kmovw                k1, r6d
1395*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m6, m5, m3, m4
1396*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
    ; x = 0 - x in the k1-selected dwords only; the other dwords are kept
1397*c0909341SAndroid Build Coastguard Worker    REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
1398*c0909341SAndroid Build Coastguard Worker    paddd                m6, m13
1399*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
1400*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6, m5 ; -out5   out4
1401*c0909341SAndroid Build Coastguard Worker    psubd                m6, m5     ;  out10 -out11
1402*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4, m3 ; -out9   out8
1403*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4     ;  out6  -out7
1404*c0909341SAndroid Build Coastguard Worker    REPX     {psrad  x, 12}, m2, m3, m5, m6
1405*c0909341SAndroid Build Coastguard Worker    REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
1406*c0909341SAndroid Build Coastguard Worker    ret
1407*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Second-pass core shared by the adst and flipadst 16x8 functions: delegates
; the transform to the 8bpc implementation and prepares the registers the
; store routine reads afterwards.
; In:  m0, m1 (copies with the qword halves of every 128-bit lane swapped are
;      passed along in m4, m5).
; Out: m0-m3 as consumed by the .pass2 callers, m11 = movshdup of permC,
;      m12 = 0, m13 = pixel_10bpc_max, r6 = stride*3.
; Clobbers r5, which is repointed at o_base_8bpc.
; NOTE(review): permC and pixel_10bpc_max are addressed without o() here,
; presumably because r5 no longer holds the base o() expects - confirm.
; NOTE(review): m6 is assumed to be a scale factor left behind by the 8bpc
; routine; only m0 and m1 are multiplied by it here - confirm against
; iadst_16x8_internal_8bpc.main_pass2.
1408*c0909341SAndroid Build Coastguard Worker.main_pass2:
1409*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
1410*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032
1411*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q1032
1412*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main_pass2
1413*c0909341SAndroid Build Coastguard Worker    movshdup            m11, [permC]
1414*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6
1415*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1416*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
1417*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
1418*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
1419*c0909341SAndroid Build Coastguard Worker    ret
1420*c0909341SAndroid Build Coastguard Worker
1421*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, dct
1422*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, identity, -21
1423*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, adst
1424*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, flipadst
1425*c0909341SAndroid Build Coastguard Worker
; The four lines above instantiate the 16x8 entry points whose first transform
; type is flipadst (INV_TXFM_16X8_FN is defined outside this chunk).
;
; iflipadst_16x8_internal_10bpc: 16x8 inverse flipped ADST.
;   Reuses the adst first-pass core and applies the same "+1 with sign
;   correction" step as iadst_16x8_internal_10bpc, but assigns the results to
;   m0-m7 in reverse order. The statement order below matters: every register
;   is read before it is overwritten.
;   .pass2 reuses the adst second-pass core, shifts every qword of the m11
;   index vector right by 8 bits and stores m3/m2 before m1/m0, i.e. in the
;   opposite register order to the adst version.
1426*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1427*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_10bpc).main_pass1
1428*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_1)]
1429*c0909341SAndroid Build Coastguard Worker    psubd                m4, m9, m3 ; m4 = 1 - m3
1430*c0909341SAndroid Build Coastguard Worker    paddd                m3, m9, m5 ; m3 = 1 + m5
1431*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9, m2 ; m5 = 1 + m2
1432*c0909341SAndroid Build Coastguard Worker    psubd                m2, m9, m6 ; m2 = 1 - m6
1433*c0909341SAndroid Build Coastguard Worker    psubd                m6, m9, m1 ; m6 = 1 - m1
1434*c0909341SAndroid Build Coastguard Worker    paddd                m1, m9, m7 ; m1 = 1 + m7
1435*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9, m0 ; m7 = 1 + m0
1436*c0909341SAndroid Build Coastguard Worker    psubd                m0, m9, m8 ; m0 = 1 - m8
1437*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x8_internal_10bpc).pass1_end
1438*c0909341SAndroid Build Coastguard Worker.pass2:
1439*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_10bpc).main_pass2
1440*c0909341SAndroid Build Coastguard Worker    psrlq               m11, 8
1441*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m11, m3
1442*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m11, m2
1443*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4_noround
1444*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m11, m1
1445*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m11, m0
    ; tail call: the write routine's ret returns to our caller
1446*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
1447*c0909341SAndroid Build Coastguard Worker
1448*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, dct
1449*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, adst
1450*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, flipadst
1451*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, identity
1452*c0909341SAndroid Build Coastguard Worker
; The four lines above instantiate the 16x8 entry points whose first transform
; type is identity (INV_TXFM_16X8_FN is defined outside this chunk).
;
; iidentity_16x8_internal_10bpc: 16x8 identity transform.
;   Entry (first pass): loads the coefficients through
;   idct_8x16_internal_10bpc.load2, multiplies m0-m7 by pd_5793, clears eight
;   64-byte rows of cq, rounds through idct_8x16_internal_10bpc.round with
;   m13 = pd_3072, builds the four permutation index vectors from permA and
;   transposes before jumping to the second-pass handler in tx2q.
;   .pass2: sets m4 = m5 = movshdup of permC and m11 = pw_4096, then finishes
;   in idct_16x8_internal_10bpc.end.
; NOTE(review): load2, round and end are outside this chunk, so the exact
; rounding shift and register layout they use are not visible here.
1453*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1454*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).load2
1455*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pd_5793)]
1456*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_3072)]
1457*c0909341SAndroid Build Coastguard Worker    pxor                m10, m10
1458*c0909341SAndroid Build Coastguard Worker    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
1459*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
1460*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).round
    ; index vectors for .transpose_16x8: m8 = m10 = permA >> 16 and
    ; m9 = m11 = permA >> 24 (shifts applied per qword)
1461*c0909341SAndroid Build Coastguard Worker    psrlq                m8, [o(permA)], 16
1462*c0909341SAndroid Build Coastguard Worker    psrlq                m9, m8, 8
1463*c0909341SAndroid Build Coastguard Worker    mova                m10, m8
1464*c0909341SAndroid Build Coastguard Worker    mova                m11, m9
1465*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).transpose_16x8
1466*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1467*c0909341SAndroid Build Coastguard Worker.pass2:
1468*c0909341SAndroid Build Coastguard Worker    movshdup             m4, [o(permC)]
1469*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_4096)]
1470*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4
1471*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_10bpc).end
1472*c0909341SAndroid Build Coastguard Worker
1473*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
; Emits the 16x16 entry point for the transform pair %1/%2 through INV_TXFM_FN
; (defined outside this chunk); eob_offset defaults to 0.
; For dct_dct only, a DC-only tail is appended:
;   r6d = ([cq] * 181 + 640) >> 10, [cq] is overwritten with eobd, and the
;   code continues in the 16x8 dct_dct function's .dconly2 label.
; NOTE(review): r3d presumably aliases eobd (the 4th cglobal argument), so
; "or r3d, 16" turns the zero eob into a row count of 16 for the shared
; dconly code - confirm against INV_TXFM_FN and .dconly2.
1474*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, %3, 16x16
1475*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
1476*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
1477*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
1478*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
1479*c0909341SAndroid Build Coastguard Worker    add                 r6d, 640
1480*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 10
1481*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
1482*c0909341SAndroid Build Coastguard Worker%endif
1483*c0909341SAndroid Build Coastguard Worker%endmacro
1484*c0909341SAndroid Build Coastguard Worker
1485*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, dct
1486*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, identity, 28
1487*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, flipadst
1488*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, adst
1489*c0909341SAndroid Build Coastguard Worker
; The four lines above instantiate the 16x16 entry points whose first
; transform type is dct; dct/identity passes an eob_offset of 28.
;
; idct_16x16_internal_10bpc: 16x16 inverse DCT.
;   Entry (first pass): loads pd_2048, pd_2896, clip_18b_min and clip_18b_max
;   into m13, m12, m14 and m15. With eob < 36 it takes the .fast path;
;   otherwise it loads the even rows into m0-m7 and the odd rows into
;   m16-m23 and runs idct_8x16_internal_10bpc.main, .main and .main_end
;   (the local helpers are defined further down, outside this chunk).
;   .pass1_end/.pass1_end2/.pass1_end3 finish the pass, clear the 16 rows
;   (1024 bytes) of cq and jump to the second-pass handler in tx2q.
;   .pass2 delegates to the 8bpc implementation and reorders the registers;
;   .pass2_end scales by m11 with pmulhrsw and stores 16 rows, four per call
;   to idct_16x8_internal_10bpc.write_16x4_noround.
1490*c0909341SAndroid Build Coastguard Workercglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
; NOTE(review): presumably removes a `cmp` redefinition made by the
; INV_TXFM macros so the literal eob compare below is used - confirm.
1491*c0909341SAndroid Build Coastguard Worker%undef cmp
1492*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
1493*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
1494*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
1495*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
1496*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
1497*c0909341SAndroid Build Coastguard Worker    jl .fast
1498*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0]
1499*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 2]
1500*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 4]
1501*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 6]
1502*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64* 8]
1503*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*10]
1504*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*12]
1505*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*14]
; On Win64, xmm6/xmm7 are parked in the first 32 bytes of cq. Row 0 has
; already been loaded into m0, and the buffer is cleared after the registers
; are restored at .pass1_end.
; NOTE(review): presumably needed because m16-m23 used below lie outside the
; 16 vector registers declared to cglobal, so x86inc does not preserve the
; callee-saved xmm6/xmm7 automatically - confirm.
1506*c0909341SAndroid Build Coastguard Worker%if WIN64
1507*c0909341SAndroid Build Coastguard Worker    movaps        [cq+16*0], xmm6
1508*c0909341SAndroid Build Coastguard Worker    movaps        [cq+16*1], xmm7
1509*c0909341SAndroid Build Coastguard Worker%endif
1510*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main
1511*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 1]
1512*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64* 3]
1513*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64* 5]
1514*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64* 7]
1515*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64* 9]
1516*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*11]
1517*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64*13]
1518*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+64*15]
1519*c0909341SAndroid Build Coastguard Worker    call .main
1520*c0909341SAndroid Build Coastguard Worker    call .main_end
1521*c0909341SAndroid Build Coastguard Worker.pass1_end:
1522*c0909341SAndroid Build Coastguard Worker%if WIN64
1523*c0909341SAndroid Build Coastguard Worker    movaps             xmm6, [cq+16*0]
1524*c0909341SAndroid Build Coastguard Worker    movaps             xmm7, [cq+16*1]
1525*c0909341SAndroid Build Coastguard Worker%endif
1526*c0909341SAndroid Build Coastguard Worker    vzeroupper
1527*c0909341SAndroid Build Coastguard Worker.pass1_end2:
1528*c0909341SAndroid Build Coastguard Worker    call .main_end3
1529*c0909341SAndroid Build Coastguard Worker.pass1_end3:
    ; clear cq from row 15 down to row 0, four 64-byte rows per iteration
1530*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64*12
1531*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
1532*c0909341SAndroid Build Coastguard Worker.zero_loop:
1533*c0909341SAndroid Build Coastguard Worker    mova       [cq+r6+64*3], m8
1534*c0909341SAndroid Build Coastguard Worker    mova       [cq+r6+64*2], m8
1535*c0909341SAndroid Build Coastguard Worker    mova       [cq+r6+64*1], m8
1536*c0909341SAndroid Build Coastguard Worker    mova       [cq+r6+64*0], m8
1537*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 64*4
1538*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
1539*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1540*c0909341SAndroid Build Coastguard Worker.pass2:
    ; r5 is repointed at o_base_8bpc; constants below are addressed without o()
1541*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
1542*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
1543*c0909341SAndroid Build Coastguard Worker    movshdup            m12, [permC]
1544*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_2048]
1545*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 8
    ; qword-permute the eight result registers into the order .pass2_end
    ; expects; m8 receives the permuted m0 so that m0 can be reused
1546*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m12, m0
1547*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m13, m7
1548*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m13, m1
1549*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m12, m6
1550*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m12, m2
1551*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m13, m5
1552*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m13, m3
1553*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m12, m4
1554*c0909341SAndroid Build Coastguard Worker.pass2_end:
    ; store setup: r6 = stride*3, m13 = pixel_10bpc_max, m12 = 0; each pair
    ; is scaled by m11 with pmulhrsw and handed to the writer in m8/m9
1555*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
1556*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
1557*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
1558*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11, m8
1559*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m7
1560*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4_noround
1561*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11, m6
1562*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m5
1563*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4_noround
1564*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11, m3
1565*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m2
1566*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4_noround
1567*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11, m1
1568*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m0
    ; tail call: the write routine's ret returns to our caller
1569*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
1570*c0909341SAndroid Build Coastguard Worker.fast:
    ; eob < 36: only the low 256 bits of rows 0-7 are loaded; pairs of rows
    ; are merged into one register with the movshdup(permB) index in m8
1571*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64*0]
1572*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+64*4]
1573*c0909341SAndroid Build Coastguard Worker    movshdup             m8, [o(permB)]
1574*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+64*2]
1575*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64*6]
1576*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+64*1]
1577*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*3]
1578*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+64*5]
1579*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+64*7]
1580*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m8, m2 ; 0 4
1581*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m8, m3 ; 2 6
1582*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m8, m5 ; 1 3
1583*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m8, m6 ; 7 5
1584*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main_fast
1585*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).main_fast
1586*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
1587*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_end2
    ; m8 = permA, m9 = permA >> 8 (per qword) for the shared fast-path tail
1588*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(permA)]
1589*c0909341SAndroid Build Coastguard Worker    psrlq                m9, m8, 8
1590*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
1591*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Odd-half stages of the 16-point transform on m16-m23, with several entries:
;   .main_fast2_rect2 / .main_fast2 : only m16 and m17 carry inputs
;   .main_fast_rect2  / .main_fast  : m16-m19 carry inputs
;   .main_rect2       / .main       : m16-m23 carry inputs
; Each *_rect2 entry first applies (x + m13) >> 12 to its inputs and then
; falls through to the plain entry. Every path ends in .main3, which also
; finishes the 8-point butterflies on m0-m8 and returns.
; Registers expected from the caller (not set in this block): m13 = rounding
; bias for the >>12 shifts, m14/m15 = lower/upper clamp bounds, m12 =
; multiplier used in .main3, m0-m6 and m8 = 8-point intermediate values.
; NOTE(review): presumably m13 = 2048 and m12 = 2896 - confirm with callers.
1592*c0909341SAndroid Build Coastguard Worker.main_fast2_rect2:
1593*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m16, m17
1594*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m16, m17
1595*c0909341SAndroid Build Coastguard Worker.main_fast2:
; Two-input case: every rotation has one zero input and reduces to a single
; multiply by a broadcast constant ({1to16} memory operand).
1596*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m16, [o(pd_4076)] {1to16} ; t15a
1597*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m16, [o(pd_401)] {1to16}  ; t8a
1598*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m17, [o(pd_1189)] {1to16} ; t11a
1599*c0909341SAndroid Build Coastguard Worker    pmulld              m17, [o(pd_3920)] {1to16}      ; t12a
; m18 = m13 - m18 negates the product and adds the rounding bias in one step,
; which is why m18 is absent from the paddd list below.
1600*c0909341SAndroid Build Coastguard Worker    psubd               m18, m13, m18
1601*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m22, m9, m17
1602*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m18, m22, m9, m17
1603*c0909341SAndroid Build Coastguard Worker
; With the other four terms equal to zero, the butterflies of .main2 would just
; duplicate each value, so copy instead and skip straight to .main3:
; m20 = m9, m16 = m18, m23 = m22, m19 = m17.
1604*c0909341SAndroid Build Coastguard Worker    mova                m20, m9
1605*c0909341SAndroid Build Coastguard Worker    mova                m16, m18
1606*c0909341SAndroid Build Coastguard Worker    mova                m23, m22
1607*c0909341SAndroid Build Coastguard Worker    mova                m19, m17
1608*c0909341SAndroid Build Coastguard Worker    jmp .main3
1609*c0909341SAndroid Build Coastguard Worker.main_fast_rect2:
1610*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m16, m17, m18, m19
1611*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m16, m17, m18, m19
1612*c0909341SAndroid Build Coastguard Worker.main_fast:
; Four-input case: each input is multiplied by both constants of its rotation.
1613*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m16, [o(pd_4076)] {1to16} ; t15a
1614*c0909341SAndroid Build Coastguard Worker    pmulld              m16, [o(pd_401)] {1to16}       ; t8a
1615*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m19, [o(pd_2598)] {1to16} ; t9a
1616*c0909341SAndroid Build Coastguard Worker    pmulld              m19, [o(pd_3166)] {1to16}      ; t14a
1617*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m17, [o(pd_1189)] {1to16} ; t11a
1618*c0909341SAndroid Build Coastguard Worker    pmulld              m17, [o(pd_3920)] {1to16}      ; t12a
1619*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m18, [o(pd_3612)] {1to16} ; t13a
1620*c0909341SAndroid Build Coastguard Worker    pmulld              m18, [o(pd_1931)] {1to16}      ; t10a
; Negate m20/m22 and add the rounding bias at the same time; .round2 therefore
; only shifts these two, while it biases and shifts the other six.
1621*c0909341SAndroid Build Coastguard Worker    psubd               m20, m13, m20
1622*c0909341SAndroid Build Coastguard Worker    psubd               m22, m13, m22
1623*c0909341SAndroid Build Coastguard Worker    call .round2
1624*c0909341SAndroid Build Coastguard Worker    jmp .main2
1625*c0909341SAndroid Build Coastguard Worker.main_rect2:
1626*c0909341SAndroid Build Coastguard Worker    call .round ; (x + m13) >> 12 on all of m16-m23
1627*c0909341SAndroid Build Coastguard Worker.main:
; Full eight-input case. NOTE(review): ITX_MULSUB_2D is defined outside this
; view; the "_" argument presumably disables rounding inside the macro, with
; .round applying bias and shift afterwards - confirm.
1628*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        16, 23, 7, 9, 10, _,  401, 4076 ; t8a,  t15a
1629*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        20, 19, 7, 9, 10, _, 3166, 2598 ; t9a,  t14a
1630*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
1631*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
1632*c0909341SAndroid Build Coastguard Worker    call .round
1633*c0909341SAndroid Build Coastguard Worker.main2:
; First butterfly stage, followed by clamping all eight results to [m14, m15].
1634*c0909341SAndroid Build Coastguard Worker    paddd                m9, m20, m16 ; t8
1635*c0909341SAndroid Build Coastguard Worker    psubd               m20, m16, m20 ; t9
1636*c0909341SAndroid Build Coastguard Worker    psubd               m16, m22, m18 ; t10
1637*c0909341SAndroid Build Coastguard Worker    paddd               m18, m22      ; t11
1638*c0909341SAndroid Build Coastguard Worker    paddd               m22, m23, m19 ; t15
1639*c0909341SAndroid Build Coastguard Worker    psubd               m23, m19      ; t14
1640*c0909341SAndroid Build Coastguard Worker    psubd               m19, m17, m21 ; t13
1641*c0909341SAndroid Build Coastguard Worker    paddd               m17, m21      ; t12
1642*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m20, m23, m16, m19
1643*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m20, m23, m16, m19
1644*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m9, m18, m22, m17
1645*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m9, m18, m22, m17
1646*c0909341SAndroid Build Coastguard Worker.main3:
; Rotations by the 1567/3784 constant pair (m10/m11) with m13 as rounding
; bias. NOTE(review): the meaning of the trailing "2" flag on the second
; invocation is defined by the macro, outside this view.
1647*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_3784)]
1648*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_1567)]
1649*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        23, 20, 21, 7, _, 13, 10, 11
1650*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        19, 16, 21, 7, _, 13, 10, 11, 2
1651*c0909341SAndroid Build Coastguard Worker    paddd               m21, m20, m19 ; t14
1652*c0909341SAndroid Build Coastguard Worker    psubd               m20, m19      ; t13
1653*c0909341SAndroid Build Coastguard Worker    psubd               m19, m9, m18  ; t11a
1654*c0909341SAndroid Build Coastguard Worker    paddd                m9, m18      ; t8a
1655*c0909341SAndroid Build Coastguard Worker    psubd               m18, m23, m16 ; t10
1656*c0909341SAndroid Build Coastguard Worker    paddd               m16, m23      ; t9
1657*c0909341SAndroid Build Coastguard Worker    psubd               m23, m22, m17 ; t12a
1658*c0909341SAndroid Build Coastguard Worker    paddd               m22, m17      ; t15a
; The four values that still need the final rotation are clamped and scaled by
; m12 here; their sums/differences are formed further down.
1659*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m20, m23, m18, m19
1660*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m20, m23, m18, m19
1661*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m20, m23, m18, m19
; Last butterflies of the 8-point half (m0-m6 and m8 in, m0-m7 out),
; interleaved with the remaining 16-point work; results are clamped to
; [m14, m15].
1662*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m6   ; dct8 out7
1663*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6       ; dct8 out0
1664*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m5   ; dct8 out6
1665*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5       ; dct8 out1
1666*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m7, m0, m6, m1
1667*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m4   ; dct8 out5
1668*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4       ; dct8 out2
1669*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m7, m0, m6, m1
1670*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m8   ; dct8 out4
1671*c0909341SAndroid Build Coastguard Worker    paddd                m3, m8       ; dct8 out3
1672*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m2, m4, m3
; Adding the bias to m20 and m23 once is enough: each feeds both the sum and
; the difference computed below.
1673*c0909341SAndroid Build Coastguard Worker    paddd               m20, m13
1674*c0909341SAndroid Build Coastguard Worker    paddd               m23, m13
1675*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m2, m4, m3
1676*c0909341SAndroid Build Coastguard Worker    psubd               m17, m20, m18 ; t10a
1677*c0909341SAndroid Build Coastguard Worker    paddd               m20, m18      ; t13a
1678*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m22, m21, m16, m9
1679*c0909341SAndroid Build Coastguard Worker    psubd               m18, m23, m19 ; t11
1680*c0909341SAndroid Build Coastguard Worker    paddd               m19, m23      ; t12
1681*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m22, m21, m16, m9
1682*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m20, m19, m18, m17
; On return: m0-m7 = dct8 out0-out7; m9 = t8a, m16 = t9, m17 = t10a,
; m18 = t11, m19 = t12, m20 = t13a, m21 = t14, m22 = t15a.
1683*c0909341SAndroid Build Coastguard Worker    ret
1684*c0909341SAndroid Build Coastguard Worker.main_end:
; Final 16-point butterflies: combines the even half in m0-m7 with the odd
; half in m9/m16-m22, rounds, shifts and packs to 16-bit.
; .main_end  uses m11 = 2.
; .main_end2 expects the caller to have loaded m11 already; m11 serves both as
;            rounding bias (paddd) and as per-lane shift count (vpsravd).
1685*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
1686*c0909341SAndroid Build Coastguard Worker.main_end2:
; Bias only m0-m7: each of them feeds both outN (sum) and out(15-N)
; (difference), so both results end up rounded.
1687*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
1688*c0909341SAndroid Build Coastguard Worker    psubd               m23, m0, m22 ; out15
1689*c0909341SAndroid Build Coastguard Worker    paddd                m0, m22     ; out0
1690*c0909341SAndroid Build Coastguard Worker    psubd               m22, m1, m21 ; out14
1691*c0909341SAndroid Build Coastguard Worker    paddd                m1, m21     ; out1
1692*c0909341SAndroid Build Coastguard Worker    psubd               m21, m2, m20 ; out13
1693*c0909341SAndroid Build Coastguard Worker    paddd                m2, m20     ; out2
1694*c0909341SAndroid Build Coastguard Worker    psubd               m20, m3, m19 ; out12
1695*c0909341SAndroid Build Coastguard Worker    paddd                m3, m19     ; out3
1696*c0909341SAndroid Build Coastguard Worker    psubd               m19, m4, m18 ; out11
1697*c0909341SAndroid Build Coastguard Worker    paddd                m4, m18     ; out4
1698*c0909341SAndroid Build Coastguard Worker    psubd               m18, m5, m17 ; out10
1699*c0909341SAndroid Build Coastguard Worker    paddd                m5, m17     ; out5
1700*c0909341SAndroid Build Coastguard Worker    psubd               m17, m6, m16 ; out9
1701*c0909341SAndroid Build Coastguard Worker    paddd                m6, m16     ; out6
1702*c0909341SAndroid Build Coastguard Worker    psubd               m16, m7, m9  ; out8
1703*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9      ; out7
; Arithmetic right shift of all 16 outputs by the per-lane count in m11.
1704*c0909341SAndroid Build Coastguard Worker    REPX   {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
1705*c0909341SAndroid Build Coastguard Worker                             m4, m20, m5, m21, m6, m22, m7, m23
; Saturating dword->word pack: register mN ends up holding outN together with
; out(N+8).
1706*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m16
1707*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m17
1708*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m18
1709*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m19
1710*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m20
1711*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m21
1712*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m22
1713*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m23
1714*c0909341SAndroid Build Coastguard Worker    ret
1715*c0909341SAndroid Build Coastguard Worker.main_end3:
; Rearranges the packed 16-bit data in m0-m7 using word interleaves, dword
; interleaves and 128-bit lane shuffles; m8 is used as scratch. The trailing
; comments on the last eight instructions give the row pair each register
; holds on return (m0 = rows 0/1 ... m7 = rows 14/15).
; NOTE(review): this looks like the transpose between the two passes -
; confirm against the callers, which are outside this view.
;
; Step 1: interleave 16-bit words of adjacent registers.
1716*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m1
1717*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1718*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
1719*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
1720*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4, m5
1721*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5
1722*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m7 ; low halves go to the new register for this pair only
1723*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7
; Step 2: interleave 32-bit dwords.
1724*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m0, m2
1725*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
1726*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m8, m1
1727*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m1
1728*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m4, m5
1729*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5
1730*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m3, m6
1731*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m6
; Step 3: regroup 256-bit halves (vinserti32x8 / vshufi32x4 q3232).
1732*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m0, m4, q3232
1733*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym4, 1
1734*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, m8, ym3, 1
1735*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m3, q3232
1736*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, m7, ym1, 1
1737*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m1, q3232
1738*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m2, m5, q3232
1739*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, ym5, 1
; Step 4: select even (q2020) / odd (q3131) 128-bit lanes to form the result.
1740*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m7, m1, q2020 ; 10 11
1741*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m1, q3131     ; 14 15
1742*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m3, m2, q2020 ;  2  3
1743*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m2, q3131     ;  6  7
1744*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m4, q3131 ;  4  5
1745*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m4, q2020     ;  0  1
1746*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m6, m8, q2020 ;  8  9
1747*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m8, q3131     ; 12 13
1748*c0909341SAndroid Build Coastguard Worker    ret
1749*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Rounding helper for m16-m23: result is (x + m13) >> 12 (arithmetic shift).
; The three entry points differ in which registers still need the bias m13:
;   .round  : none of m16-m23 is biased yet
;   .round2 : m20 and m22 already include the bias
;   .round3 : m16, m18, m20 and m22 already include the bias
; m17, m19, m21 and m23 are biased and shifted on every path.
1750*c0909341SAndroid Build Coastguard Worker.round:
1751*c0909341SAndroid Build Coastguard Worker    paddd               m20, m13
1752*c0909341SAndroid Build Coastguard Worker    paddd               m22, m13
1753*c0909341SAndroid Build Coastguard Worker.round2:
1754*c0909341SAndroid Build Coastguard Worker    paddd               m16, m13
1755*c0909341SAndroid Build Coastguard Worker    paddd               m18, m13
1756*c0909341SAndroid Build Coastguard Worker.round3:
1757*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m16, m18, m20, m22
1758*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m17, m19, m21, m23
1759*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m17, m19, m21, m23
1760*c0909341SAndroid Build Coastguard Worker    ret
1761*c0909341SAndroid Build Coastguard Worker
1762*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, dct
; NOTE(review): INV_TXFM_16X16_FN is defined outside this view. These three
; invocations share "adst" as first argument; presumably each one emits an
; entry point that pairs the iadst first pass below with the second-pass type
; named by the second argument - confirm against the macro definition.
1763*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, flipadst
1764*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, adst
1765*c0909341SAndroid Build Coastguard Worker
1766*c0909341SAndroid Build Coastguard Workercglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
; 16x16 inverse ADST: first-pass dispatch (full path or .fast), the shared
; first-pass end, and the second-pass entry .pass2.
; The first pass leaves packed 16-bit data in m0-m7 and continues at the
; address held in tx2q. The sub-routines .main_pass1 / .main_pass1_fast follow
; further down.
; NOTE(review): "%undef cmp" presumably removes a macro override of cmp that
; was defined earlier in the file - confirm.
1767*c0909341SAndroid Build Coastguard Worker%undef cmp
1768*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
1769*c0909341SAndroid Build Coastguard Worker    jl .fast ; eob < 36 (signed compare): take the reduced-input path
1770*c0909341SAndroid Build Coastguard Worker    call .main_pass1
; .main_pass1 returns out0-3 in m0-m3, out4-7 in m5-m8 and out8-15 in m16-m23.
; Pack with signed saturation so that mN holds outN together with out(N+8).
1771*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m16
1772*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m17
1773*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m18
1774*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m19
1775*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5, m20
1776*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6, m21
1777*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m7, m22
1778*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m8, m23
1779*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).pass1_end
1780*c0909341SAndroid Build Coastguard Worker.fast:
1781*c0909341SAndroid Build Coastguard Worker    call .main_pass1_fast
; Add the rounding bias 2 for the >>2 below. Registers m1, m3, m6 and m8 are
; subtracted from the bias instead, i.e. negated and biased in one step, and
; the results from m5-m8 are moved down into m4-m7.
; NOTE(review): presumably .main_pass1_fast returns those outputs negated, as
; .main_pass1 does internally - its tail is outside this view.
1782*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_2)]
1783*c0909341SAndroid Build Coastguard Worker    paddd                m0, m9
1784*c0909341SAndroid Build Coastguard Worker    psubd                m1, m9, m1
1785*c0909341SAndroid Build Coastguard Worker    paddd                m2, m9
1786*c0909341SAndroid Build Coastguard Worker    psubd                m3, m9, m3
1787*c0909341SAndroid Build Coastguard Worker    paddd                m4, m9, m5
1788*c0909341SAndroid Build Coastguard Worker    psubd                m5, m9, m6
1789*c0909341SAndroid Build Coastguard Worker    paddd                m6, m9, m7
1790*c0909341SAndroid Build Coastguard Worker    psubd                m7, m9, m8
1791*c0909341SAndroid Build Coastguard Worker.pass1_fast_end:
1792*c0909341SAndroid Build Coastguard Worker    mova                 m9, [o(permA)]
1793*c0909341SAndroid Build Coastguard Worker    psrlq                m8, m9, 8 ; every qword of permA shifted right by 8 bits
1794*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
1795*c0909341SAndroid Build Coastguard Worker.pass1_fast_end2:
; Also entered from idct_16x16_internal_10bpc.fast, which supplies its own
; m8/m9. The two vectors are handed to transpose_16x8 in m10/m11.
1796*c0909341SAndroid Build Coastguard Worker    mova                m10, m9
1797*c0909341SAndroid Build Coastguard Worker    mova                m11, m8
1798*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).transpose_16x8
; Zero m4-m7 and clear the low 32 bytes of coefficient rows 0-7, which is the
; region the fast path loaded its input from.
1799*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1800*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7
1801*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
1802*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; continue at the address held in tx2q
1803*c0909341SAndroid Build Coastguard Worker.pass2:
; Second pass: the transform itself is done by the 8 bpc implementation; this
; code then permutes the rows and shares the output tail of the 10 bpc IDCT.
; NOTE(review): r5 is presumably the constant base register the 8 bpc code
; addresses relative to; o_base_8bpc is defined outside this view.
1804*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
1805*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main_pass2b
1806*c0909341SAndroid Build Coastguard Worker    movshdup            m12, [permC] ; odd dwords of permC duplicated: qword permutation indices
; m11 is the pmulhrsw multiplier used by .pass2_end. NOTE(review): going by
; its name, pw_2048_m2048 alternates +2048 and -2048, which would also undo a
; sign inversion on every other output - confirm against the constant table.
1807*c0909341SAndroid Build Coastguard Worker    mova                m11, [pw_2048_m2048]
1808*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 8 ; second index vector: m12 with every qword shifted right by 8
; Qword permutes that also rotate the data through the registers (m8 receives
; permuted m0, m0 receives permuted m7, and so on), alternating between the
; two index vectors.
1809*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m13, m0
1810*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m12, m7
1811*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m13, m1
1812*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m12, m6
1813*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m13, m2
1814*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m12, m5
1815*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m13, m3
1816*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m12, m4
1817*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).pass2_end
1818*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Full first pass of the 16-point inverse ADST.
; Input : sixteen 64-byte coefficient rows at cq+64*0 .. cq+64*15.
; Output: out0-3 in m0-m3, out4-7 in m5-m8, out8-15 in m16-m23 (32-bit lanes,
;         already rounded and shifted, sign-corrected).
; Constants loaded here: m13 = 2048 (bias for the 12-bit rotations),
; m14/m15 = clip_18b_min/max (every butterfly result is clamped to this
; range), m12 = 1448, m4 = 2, m10 = 5120, m11 = 5119.
; Trailing comments name the intermediate held by the destination register; a
; leading "-" marks an output that is held negated until the final step.
1819*c0909341SAndroid Build Coastguard Worker.main_pass1:
1820*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0]
; WIN64 only: store xmm6/xmm7 into the first 32 bytes of the coefficient
; buffer. Row 0 was loaded into m0 just above, so no input is lost.
; NOTE(review): presumably this preserves callee-saved registers for a restore
; that happens outside this view - confirm.
1821*c0909341SAndroid Build Coastguard Worker%if WIN64
1822*c0909341SAndroid Build Coastguard Worker    movaps        [cq+16*0], xmm6
1823*c0909341SAndroid Build Coastguard Worker    movaps        [cq+16*1], xmm7
1824*c0909341SAndroid Build Coastguard Worker%endif
1825*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+64*15]
1826*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
; Stage 1: eight rotations, each pairing row k with row 15-k; loads are
; interleaved with the multiplies.
1827*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        23,  0, 8, 9, 10, 13,  201, 4091 ; t1  t0
1828*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64* 7]
1829*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 8]
1830*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 16, 8, 9, 10, 13, 3035, 2751 ; t9  t8
1831*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 2]
1832*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*13]
1833*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        21,  2, 8, 9, 10, 13,  995, 3973 ; t3  t2
1834*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64* 5]
1835*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64*10]
1836*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
1837*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64* 4]
1838*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64*11]
1839*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        19,  4, 8, 9, 10, 13, 1751, 3703 ; t5  t4
1840*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 3]
1841*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64*12]
1842*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
1843*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64* 6]
1844*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64* 9]
1845*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        17,  6, 8, 9, 10, 13, 2440, 3290 ; t7  t6
1846*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 1]
1847*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64*14]
1848*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 22, 8, 9, 10, 13, 4052,  601 ; t15 t14
1849*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
1850*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
; Stage 2: butterflies; the max/min clamps of one group are interleaved with
; the add/sub of the next group.
1851*c0909341SAndroid Build Coastguard Worker    psubd                m9, m23, m7  ; t9a
1852*c0909341SAndroid Build Coastguard Worker    paddd               m23, m7       ; t1a
1853*c0909341SAndroid Build Coastguard Worker    psubd                m7, m2, m18  ; t10a
1854*c0909341SAndroid Build Coastguard Worker    paddd               m18, m2       ; t2a
1855*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m9, m23, m7, m18
1856*c0909341SAndroid Build Coastguard Worker    psubd                m2, m17, m1  ; t15a
1857*c0909341SAndroid Build Coastguard Worker    paddd               m17, m1       ; t7a
1858*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m9, m23, m7, m18
1859*c0909341SAndroid Build Coastguard Worker    psubd                m1, m21, m5  ; t11a
1860*c0909341SAndroid Build Coastguard Worker    paddd               m21, m5       ; t3a
1861*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m2, m17, m1, m21
1862*c0909341SAndroid Build Coastguard Worker    psubd                m5, m4, m20  ; t12a
1863*c0909341SAndroid Build Coastguard Worker    paddd                m4, m20      ; t4a
1864*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m2, m17, m1, m21
1865*c0909341SAndroid Build Coastguard Worker    psubd               m20, m19, m3  ; t13a
1866*c0909341SAndroid Build Coastguard Worker    paddd               m19, m3       ; t5a
1867*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m4, m20, m19
1868*c0909341SAndroid Build Coastguard Worker    psubd                m8, m6, m22  ; t14a
1869*c0909341SAndroid Build Coastguard Worker    paddd                m6, m22      ; t6a
1870*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m4, m20, m19
1871*c0909341SAndroid Build Coastguard Worker    psubd               m22, m0, m16  ; t8a
1872*c0909341SAndroid Build Coastguard Worker    paddd               m16, m0       ; t0a
1873*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m6, m22, m16
; Stage 3: rotations with the 799/4017 and 3406/2276 constant pairs, passed
; to the macro in registers m10/m11.
1874*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_4017)]
1875*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_799)]
1876*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m6, m22, m16
1877*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        22,  9, 0, 3, _, 13, 10, 11 ; t9  t8
1878*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        20,  5, 0, 3, _, 13, 11, 10 ; t12 t13
1879*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2276)]
1880*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_3406)]
1881*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7,  1, 0, 3, _, 13, 10, 11 ; t11 t10
1882*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2,  8, 0, 3, _, 13, 11, 10 ; t14 t15
; Stage 4: butterflies with clamping.
1883*c0909341SAndroid Build Coastguard Worker    paddd                m0, m16, m4  ; t0
1884*c0909341SAndroid Build Coastguard Worker    psubd               m16, m4       ; t4
1885*c0909341SAndroid Build Coastguard Worker    psubd                m3, m23, m19 ; t5
1886*c0909341SAndroid Build Coastguard Worker    paddd               m23, m19      ; t1
1887*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m16, m3, m23
1888*c0909341SAndroid Build Coastguard Worker    psubd               m19, m18, m6  ; t6
1889*c0909341SAndroid Build Coastguard Worker    paddd               m18, m6       ; t2
1890*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m16, m3, m23
1891*c0909341SAndroid Build Coastguard Worker    psubd                m6, m21, m17 ; t7
1892*c0909341SAndroid Build Coastguard Worker    paddd               m21, m17      ; t3
1893*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m19, m18, m6, m21
1894*c0909341SAndroid Build Coastguard Worker    paddd               m17, m9, m20  ; t8a
1895*c0909341SAndroid Build Coastguard Worker    psubd                m9, m20      ; t12a
1896*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m19, m18, m6, m21
1897*c0909341SAndroid Build Coastguard Worker    psubd               m20, m22, m5  ; t13a
1898*c0909341SAndroid Build Coastguard Worker    paddd               m22, m5       ; t9a
1899*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m17, m9, m20, m22
1900*c0909341SAndroid Build Coastguard Worker    psubd                m5, m1, m2   ; t14a
1901*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2       ; t10a
1902*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m17, m9, m20, m22
1903*c0909341SAndroid Build Coastguard Worker    psubd                m2, m7, m8   ; t15a
1904*c0909341SAndroid Build Coastguard Worker    paddd                m7, m8       ; t11a
1905*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m1, m2, m7
; Stage 5: rotations with the 1567/3784 constant pair.
1906*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_3784)]
1907*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_1567)]
1908*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m1, m2, m7
1909*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        16,  3, 4, 8, _, 13, 10, 11 ; t5a t4a
1910*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
1911*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
1912*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2,  5, 4, 8, _, 13, 11, 10 ; t14 t15
; Stage 6: butterflies. Eight results are already final outputs (some held
; negated); the other eight intermediates are clamped and combined below.
1913*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m18  ; t2a
1914*c0909341SAndroid Build Coastguard Worker    paddd                m0, m18      ;  out0
1915*c0909341SAndroid Build Coastguard Worker    psubd               m18, m23, m21 ; t3a
1916*c0909341SAndroid Build Coastguard Worker    paddd               m23, m21      ; -out15
1917*c0909341SAndroid Build Coastguard Worker    paddd               m21, m9, m5   ; -out13
1918*c0909341SAndroid Build Coastguard Worker    psubd                m9, m5       ; t15a
1919*c0909341SAndroid Build Coastguard Worker    psubd                m5, m3, m6   ; t6
1920*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6       ; -out3
1921*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m18, m9, m5
1922*c0909341SAndroid Build Coastguard Worker    psubd                m6, m20, m2  ; t14a
1923*c0909341SAndroid Build Coastguard Worker    paddd                m2, m20      ;  out2
1924*c0909341SAndroid Build Coastguard Worker    paddd               m20, m16, m19 ;  out12
1925*c0909341SAndroid Build Coastguard Worker    psubd               m16, m19      ; t7
1926*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m18, m9, m5
1927*c0909341SAndroid Build Coastguard Worker    psubd               m19, m22, m7  ; t11
1928*c0909341SAndroid Build Coastguard Worker    paddd               m22, m7       ;  out14
1929*c0909341SAndroid Build Coastguard Worker    psubd                m7, m17, m1  ; t10
1930*c0909341SAndroid Build Coastguard Worker    paddd                m1, m17      ; -out1
1931*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m6, m16, m19, m7
; Constants for the final scaling; loaded here, between the two halves of the
; clamp of m6/m16/m19/m7.
1932*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_1448)]
1933*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pd_2)]
1934*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_5120)]
1935*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_5119)]
1936*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m6, m16, m19, m7
; Stage 7: the remaining eight outputs are sums/differences multiplied by
; m12 = 1448.
1937*c0909341SAndroid Build Coastguard Worker    psubd               m17, m7, m19  ; -out9
1938*c0909341SAndroid Build Coastguard Worker    paddd                m7, m19      ;  out6
1939*c0909341SAndroid Build Coastguard Worker    psubd               m19, m5, m16  ; -out11
1940*c0909341SAndroid Build Coastguard Worker    paddd                m5, m16      ;  out4
1941*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m17, m7, m19, m5
1942*c0909341SAndroid Build Coastguard Worker    psubd               m16, m8, m18  ;  out8
1943*c0909341SAndroid Build Coastguard Worker    paddd                m8, m18      ; -out7
1944*c0909341SAndroid Build Coastguard Worker    psubd               m18, m6, m9   ;  out10
1945*c0909341SAndroid Build Coastguard Worker    paddd                m6, m9       ; -out5
1946*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m16, m8, m18, m6
; Final rounding. Outputs held negated are subtracted from the bias, which
; flips the sign and rounds in one instruction:
;   unscaled group: (x + 2) >> 2           or  (2 - x) >> 2
;   scaled group  : (x*1448 + 5120) >> 13  or  (5119 - x*1448) >> 13
; NOTE(review): the bias for the negated scaled terms is 5119 rather than
; 5120; the reason is not stated in this file section - confirm before
; changing either constant.
1947*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, m4    }, m0, m2, m20, m22
1948*c0909341SAndroid Build Coastguard Worker    REPX  {psubd x, m4,  x}, m1, m3, m21, m23
1949*c0909341SAndroid Build Coastguard Worker    REPX  {paddd x, m10   }, m7, m5, m16, m18
1950*c0909341SAndroid Build Coastguard Worker    REPX  {psubd x, m11, x}, m17, m19, m8, m6
1951*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
1952*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
1953*c0909341SAndroid Build Coastguard Worker    ret
1954*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_pass1_fast: reduced-input loader for the first ADST pass.
; Reads only the low 256 bits (ym) of the eight rows at cq+64*0 .. cq+64*7,
; merges them pairwise into four zmm registers with vpermt2q, and loads the
; pd_2048 / pd_2896 constants into m13 / m12.
; It is entered with `call` (see .fast in iflipadst_16x16_internal_10bpc) and
; leaves with `jmp`, so the `ret` that returns to the caller is presumably the
; one inside iadst_16x8_internal_10bpc.main_fast (not visible here).
1955*c0909341SAndroid Build Coastguard Worker.main_pass1_fast:
1956*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64*0]
1957*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+64*2]
    ; m8 = permB with its odd dwords duplicated; used below as the index
    ; vector for every vpermt2q.
1958*c0909341SAndroid Build Coastguard Worker    movshdup             m8, [o(permB)]
1959*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+64*1]
1960*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+64*3]
1961*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+64*4]
1962*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64*6]
1963*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+64*5]
1964*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*7]
    ; Two-source permutes: the trailing comments give the row numbers that
    ; end up combined in each destination register.
1965*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m8, m1 ; 0 2
1966*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m8, m6 ; 3 1
1967*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m8, m3 ; 4 6
1968*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m8, m4 ; 7 5
1969*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
1970*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
    ; Tail-jump: the 16x8 ADST fast kernel does the arithmetic.
1971*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x8_internal_10bpc).main_fast
1972*c0909341SAndroid Build Coastguard Worker
; 16x16 transform entry points whose first-pass type is flipadst.
; NOTE(review): INV_TXFM_16X16_FN is defined outside this excerpt; reading the
; two arguments as (first-pass type, second-pass type) is inferred from the
; names -- confirm against the macro definition.
1973*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, dct
1974*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, adst
1975*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, flipadst
1976*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x16_internal_10bpc
; Named arguments: dst, stride, c, eob, tx2.
; Pass 1 reuses the iadst_16x16 kernels and then stores the results into
; m0-m7 in the opposite register order. eob < 36 selects the reduced .fast
; path. .pass2 runs the 8bpc ADST second pass and finishes in the shared
; idct_16x16 .pass2_end.
1977*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
1978*c0909341SAndroid Build Coastguard Worker%undef cmp
1979*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
1980*c0909341SAndroid Build Coastguard Worker    jl .fast
1981*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_10bpc).main_pass1
    ; Pack the 32-bit results to saturated 16-bit words, two source
    ; registers per destination. Judging by the out* comments at the end of
    ; .main_pass1, m0 receives the highest-numbered outputs and m7 the
    ; lowest, i.e. the flipped order.
    ; Ordering constraint: every destination from m3 onward was read as a
    ; source by the instruction just before it, so these eight packs must
    ; stay in this order.
1982*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m19, m3
1983*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m20, m5
1984*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m18, m2
1985*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m21, m6
1986*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m17, m1
1987*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m22, m7
1988*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m16, m0
1989*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m23, m8
1990*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).pass1_end
1991*c0909341SAndroid Build Coastguard Worker.fast:
1992*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_10bpc).main_pass1_fast
    ; m9 = 2 in every dword. Each line below computes either 2 + src or
    ; 2 - src, so the +2 bias and (for psubd) the negation are applied in the
    ; same instruction that moves the value to its flipped register.
    ; The same read-before-overwrite chain as in the full path applies, so
    ; the order of these eight instructions must be kept.
1993*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_2)]
1994*c0909341SAndroid Build Coastguard Worker    psubd                m4, m9, m3
1995*c0909341SAndroid Build Coastguard Worker    paddd                m3, m9, m5
1996*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9, m2
1997*c0909341SAndroid Build Coastguard Worker    psubd                m2, m9, m6
1998*c0909341SAndroid Build Coastguard Worker    psubd                m6, m9, m1
1999*c0909341SAndroid Build Coastguard Worker    paddd                m1, m9, m7
2000*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9, m0
2001*c0909341SAndroid Build Coastguard Worker    psubd                m0, m9, m8
    ; The remaining pass-1 work is shared with the non-flipped ADST.
2002*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
2003*c0909341SAndroid Build Coastguard Worker.pass2:
    ; r5 is re-pointed at the 8bpc base before calling the 8bpc kernel.
    ; The permC and pw_m2048_2048 loads that follow are written without the
    ; o() wrapper used elsewhere in this function.
2004*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
2005*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main_pass2b
2006*c0909341SAndroid Build Coastguard Worker    movshdup            m12, [permC]
2007*c0909341SAndroid Build Coastguard Worker    movu                m11, [pw_m2048_2048]
    ; m13 = m12 with every qword shifted right by 8 bits; it is the permute
    ; control for the upper four registers.
2008*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 8
    ; m4..m7 are permuted with m13 and moved up one register (into m5..m8).
    ; Each destination was the source of the previous line, so the order is
    ; significant.
2009*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m13, m7
2010*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m13, m6
2011*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m13, m5
2012*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m13, m4
    ; m0..m3 are permuted in place with m12.
2013*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m12, m3
2014*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m12, m2
2015*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m12, m1
2016*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m12, m0
2017*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).pass2_end
2018*c0909341SAndroid Build Coastguard Worker
; 16x16 transform entry points whose first-pass type is identity.
; NOTE(review): INV_TXFM_16X16_FN is defined outside this excerpt; the meaning
; of the optional third argument (-92) cannot be determined from here.
2019*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, dct, -92
2020*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, identity
2021*c0909341SAndroid Build Coastguard Worker
; iidentity_16x16_internal_10bpc
; Named arguments: dst, stride, c, eob, tx2.
; Pass 1 scales each 32-bit coefficient as (x * m10 + m11) >> 13 with
; m10 = pd_5793 and m11 = pd_5120, then packs to 16-bit words.
; eob < 36 selects .fast, which reads only 256-bit halves of the rows.
; r6 is a running read pointer into the coefficient buffer; cq itself is
; left untouched in pass 1.
2022*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
2023*c0909341SAndroid Build Coastguard Worker%undef cmp
2024*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_5793)]
2025*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_5120)]
2026*c0909341SAndroid Build Coastguard Worker    mov                  r6, cq
2027*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
2028*c0909341SAndroid Build Coastguard Worker    jl .fast
    ; Full path: each .pass1_main call returns four scaled rows in m6-m9
    ; and advances r6 by 128 bytes. They are packed as (m6,m8) and (m7,m9)
    ; into consecutive output registers m0..m7.
2029*c0909341SAndroid Build Coastguard Worker    call .pass1_main
2030*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m6, m8
2031*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m7, m9
2032*c0909341SAndroid Build Coastguard Worker    call .pass1_main
2033*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m6, m8
2034*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m7, m9
2035*c0909341SAndroid Build Coastguard Worker    call .pass1_main
2036*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m6, m8
2037*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m7, m9
2038*c0909341SAndroid Build Coastguard Worker    call .pass1_main
    ; The last pair is packed in place, since m6/m7 are the final outputs.
2039*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m8
2040*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m9
2041*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).pass1_end2
2042*c0909341SAndroid Build Coastguard Worker.fast:
    ; Reduced path: each .pass1_main_fast call returns two registers (m6, m7)
    ; and advances r6 by 64 bytes; four calls fill m0..m3.
2043*c0909341SAndroid Build Coastguard Worker    call .pass1_main_fast
2044*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m6, m7
2045*c0909341SAndroid Build Coastguard Worker    call .pass1_main_fast
2046*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m6, m7
2047*c0909341SAndroid Build Coastguard Worker    call .pass1_main_fast
2048*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m6, m7
2049*c0909341SAndroid Build Coastguard Worker    call .pass1_main_fast
2050*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m6, m7
    ; Interleave words, then dwords, of m0..m3 (in-register transpose step).
2051*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0, m1
2052*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
2053*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
2054*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
2055*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m4, m1
2056*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m1
2057*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
2058*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
    ; m7 = 0; it is copied into m4-m6 below, so m4..m7 are all zero on exit.
2059*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
    ; Regroup 128-bit lanes: q2020 selects lanes 0 and 2 of each source,
    ; q3131 selects lanes 1 and 3.
2060*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m3, q3131
2061*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q2020
2062*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m4, q3131
2063*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m4, q2020
2064*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m7}, m4, m5, m6
2065*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).pass1_end3
2066*c0909341SAndroid Build Coastguard Worker.pass2:
    ; Second-pass setup: m14 = qword permute control, m15 = pw_1697x16,
    ; m11 = pw_2048, m12 = 0, m13 = pixel_10bpc_max, r6 = stride*3.
    ; NOTE(review): m11/m12/m13/r6 are not used in this function's visible
    ; code; presumably they are inputs of idct_16x8 .write_16x4 -- confirm.
2067*c0909341SAndroid Build Coastguard Worker    movshdup            m14, [o(permC)]
2068*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_1697x16)]
2069*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
2070*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2048)]
2071*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
2072*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
    ; Process register pairs (m0,m1), (m2,m3), (m4,m5), (m6,m7): permute each
    ; into m8/m9 and hand it to .pass2_main. The first three pairs use call;
    ; the fourth falls through into .pass2_main.
2073*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m14, m0
2074*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m14, m1
2075*c0909341SAndroid Build Coastguard Worker    call .pass2_main
2076*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m14, m2
2077*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m14, m3
2078*c0909341SAndroid Build Coastguard Worker    call .pass2_main
2079*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m14, m4
2080*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m14, m5
2081*c0909341SAndroid Build Coastguard Worker    call .pass2_main
2082*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m14, m6
2083*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m14, m7
2084*c0909341SAndroid Build Coastguard Worker.pass2_main:
    ; For x in m8/m9 (16-bit lanes): x = sat(sat(2*x) + pmulhrsw(x, m15)).
    ; Clobbers m0 and m1 as temporaries.
2085*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15, m8
2086*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15, m9
2087*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m8
2088*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m9
2089*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m0
2090*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m1
    ; Tail-jump into the shared writer; for the three call sites above, the
    ; return presumably happens from inside that routine.
2091*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_10bpc).write_16x4
2092*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2093*c0909341SAndroid Build Coastguard Worker.pass1_main:
    ; In:  r6 = read pointer, m10 = multiplier, m11 = rounding bias.
    ; Out: m6..m9 = (row * m10 + m11) >> 13 for the 64-byte rows at
    ;      r6+64*0, +64*1, +64*8 and +64*9; r6 advanced by 128 bytes.
2094*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m10, [r6+64*0]
2095*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m10, [r6+64*1]
2096*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m10, [r6+64*8]
2097*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m10, [r6+64*9]
2098*c0909341SAndroid Build Coastguard Worker    add                  r6, 64*2
2099*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m11}, m6, m7, m8, m9
2100*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 13 }, m6, m8, m7, m9
2101*c0909341SAndroid Build Coastguard Worker    ret
2102*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2103*c0909341SAndroid Build Coastguard Worker.pass1_main_fast:
    ; In:  r6 = read pointer, m10 = multiplier, m11 = rounding bias.
    ; Out: m6 = low 256 bits of rows r6+64*0 (lower half) and r6+64*4 (upper
    ;      half), m7 = the same for r6+64*8 and r6+64*12, both scaled as
    ;      (x * m10 + m11) >> 13; r6 advanced by 64 bytes.
2104*c0909341SAndroid Build Coastguard Worker    mova                ym6, [r6+64* 0]
2105*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, [r6+64* 4], 1
2106*c0909341SAndroid Build Coastguard Worker    mova                ym7, [r6+64* 8]
2107*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m7, [r6+64*12], 1
2108*c0909341SAndroid Build Coastguard Worker    add                  r6, 64
2109*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m10}, m6, m7
2110*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m11}, m6, m7
2111*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 13 }, m6, m7
2112*c0909341SAndroid Build Coastguard Worker    ret
2113*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_8x32_10bpc
; Named arguments: dst, stride, c, eob.
; Dispatch on eob:
;   eob == 0    -> .dconly (DC-only shortcut)
;   eob <  43   -> .fast   (low 256 bits of 8 rows, 8x8 first pass)
;   eob <  107  -> one .pass1_main call, 8bpc *_fast second pass
;   otherwise   -> .full   (two .pass1_main calls, full 8bpc second pass)
; All non-DC paths meet at .end, which adds the result to dst with clamping
; to [0, pixel_10bpc_max].
2114*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
2115*c0909341SAndroid Build Coastguard Worker%undef cmp
2116*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
2117*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
2118*c0909341SAndroid Build Coastguard Worker    jz .dconly
    ; Constants for the first pass. m20 = idct8x32p (byte permutation used by
    ; vpermb after packing), m21 = 0 (used to clear the coefficient buffer).
2119*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
2120*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
2121*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
2122*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
2123*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
2124*c0909341SAndroid Build Coastguard Worker    mova                m20, [o(idct8x32p)]
2125*c0909341SAndroid Build Coastguard Worker    pxor                m21, m21
2126*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
2127*c0909341SAndroid Build Coastguard Worker    jl .fast
2128*c0909341SAndroid Build Coastguard Worker    call .pass1_main
    ; Word-interleave the first-half results into m16-m19. They are kept
    ; there because .full needs them after a second .pass1_main call.
2129*c0909341SAndroid Build Coastguard Worker    punpcklwd           m16, m0, m1
2130*c0909341SAndroid Build Coastguard Worker    punpcklwd           m17, m2, m3
2131*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m0, m1
2132*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m2, m3
2133*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 107
2134*c0909341SAndroid Build Coastguard Worker    jge .full
2135*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m16, m17 ;  0  2
2136*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m16, m17 ;  4  6
2137*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m18, m19 ;  8 10
2138*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m18, m19 ; 12 14
    ; Switch r5 to the 8bpc base before calling the 8bpc second-pass kernels.
2139*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
    ; Split off the upper 256 bits of m0-m3 into ym14-ym17.
2140*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym14, m0, 1
2141*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym15, m1, 1
2142*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym16, m2, 1
2143*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym17, m3, 1
2144*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main_fast
2145*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
2146*c0909341SAndroid Build Coastguard Worker    jmp .end
2147*c0909341SAndroid Build Coastguard Worker.full:
    ; Second first-pass call on the coefficients 64 bytes further on.
    ; This relies on m16-m19 surviving the call.
2148*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
2149*c0909341SAndroid Build Coastguard Worker    call .pass1_main
2150*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m0, m1
2151*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2, m3
2152*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0, m1
2153*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m2, m3
2154*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m16, m17 ;  0  2
2155*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m16, m17 ;  4  6
2156*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m18, m19 ;  8 10
2157*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m18, m19 ; 12 14
2158*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5, m6   ; 16 18
2159*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m6       ; 20 22
2160*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m8   ; 24 26
2161*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m8       ; 28 30
2162*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
    ; Upper 256 bits of m0-m7 go to ym14-ym21. This overwrites the first-pass
    ; constants in m14/m15 and the permutation/zero registers m20/m21.
2163*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym14, m0, 1
2164*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym15, m1, 1
2165*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym16, m2, 1
2166*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym17, m3, 1
2167*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym18, m4, 1
2168*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym19, m5, 1
2169*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym20, m6, 1
2170*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym21, m7, 1
2171*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main
    ; Swap the two 64-bit halves inside each 128-bit lane of ym18-ym21.
2172*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
2173*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
2174*c0909341SAndroid Build Coastguard Worker    jmp .end
2175*c0909341SAndroid Build Coastguard Worker.fast:
    ; Load the low 256 bits of the eight rows at 128-byte pitch and merge
    ; them pairwise into m0-m3 (row pairs are given in the trailing comments).
2176*c0909341SAndroid Build Coastguard Worker    movshdup             m8, [o(permB)]
2177*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+128*1]
2178*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+128*5]
2179*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+128*3]
2180*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+128*7]
2181*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*0]
2182*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+128*2]
2183*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+128*4]
2184*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+128*6]
2185*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m8, m5 ; 1 5
2186*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m8, m7 ; 7 3
2187*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m8, m4 ; 0 2
2188*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m8, m6 ; 4 6
    ; Clear exactly the 256-bit regions that were just read (ym21 = 0).
2189*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*0], ym21
2190*c0909341SAndroid Build Coastguard Worker    REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
2191*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main
2192*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main_end
2193*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
2194*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
2195*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m20, m0
    ; Rotating every dword of the permutation left by 16 bits gives the
    ; byte-index vector used for the second vpermb.
2196*c0909341SAndroid Build Coastguard Worker    vprold              m20, 16
2197*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m20, m1
2198*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
2199*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
2200*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
2201*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym14, m0, 1
2202*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym15, m1, 1
2203*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main_fast2
2204*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
2205*c0909341SAndroid Build Coastguard Worker.end:
2206*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
    ; Output setup: m12 = pixel_10bpc_max, m11 = 0 (clamp bounds),
    ; r6 = stride*3, r3 = dst + stride*16 (second write pointer).
    ; NOTE(review): m10 is never written in this function's visible code;
    ; presumably main_end leaves the pmulhrsw rounding multiplier there --
    ; confirm against the 8bpc routine.
2207*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*2]
2208*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [pixel_10bpc_max]
2209*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
2210*c0909341SAndroid Build Coastguard Worker    pxor                m11, m11
2211*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r3*8]
    ; Four writer invocations, one per register pair (m0,m1) .. (m6,m7);
    ; each pair is scaled by m10 into m0/m1 first. The fourth invocation
    ; falls through into .write_8x4x2, whose ret then ends the function.
2212*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10
2213*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10
2214*c0909341SAndroid Build Coastguard Worker    call .write_8x4x2
2215*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10, m2
2216*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10, m3
2217*c0909341SAndroid Build Coastguard Worker    call .write_8x4x2
2218*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10, m4
2219*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10, m5
2220*c0909341SAndroid Build Coastguard Worker    call .write_8x4x2
2221*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m10, m6
2222*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m10, m7
2223*c0909341SAndroid Build Coastguard Worker.write_8x4x2:
    ; Adds m0 to four 16-byte rows at dstq and m1 to four 16-byte rows at r3,
    ; clamps to [m11, m12] and stores them back.
    ; The dstq rows map to lanes 0..3 in ascending row order; the r3 rows are
    ; gathered in descending order (lane 0 = r3+stride*3 ... lane 3 = r3).
    ; Both pointers advance by four rows. Clobbers m8 and m9.
2224*c0909341SAndroid Build Coastguard Worker    mova                xm8, [dstq+strideq*0]
2225*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym8, [dstq+strideq*1], 1
2226*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m8, [dstq+strideq*2], 2
2227*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m8, [dstq+r6       ], 3
2228*c0909341SAndroid Build Coastguard Worker    mova                xm9, [r3  +r6       ]
2229*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym9, [r3  +strideq*2], 1
2230*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m9, [r3  +strideq*1], 2
2231*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m9, [r3  +strideq*0], 3
2232*c0909341SAndroid Build Coastguard Worker    paddw                m8, m0
2233*c0909341SAndroid Build Coastguard Worker    paddw                m9, m1
2234*c0909341SAndroid Build Coastguard Worker    pmaxsw               m8, m11
2235*c0909341SAndroid Build Coastguard Worker    pmaxsw               m9, m11
2236*c0909341SAndroid Build Coastguard Worker    pminsw               m8, m12
2237*c0909341SAndroid Build Coastguard Worker    pminsw               m9, m12
2238*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm8
2239*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym8, 1
2240*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m8, 2
2241*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r6       ], m8, 3
2242*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2243*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*0], m9, 3
2244*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*1], m9, 2
2245*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*2], ym9, 1
2246*c0909341SAndroid Build Coastguard Worker    mova          [r3  +r6       ], xm9
2247*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+strideq*4]
2248*c0909341SAndroid Build Coastguard Worker    ret
2249*c0909341SAndroid Build Coastguard Worker.dconly:
    ; Reached only when eob == 0. Computes r6d = (c[0] * 181 + 640) >> 10 and
    ; stores eobd (zero on this path) back to c[0], clearing the coefficient.
    ; NOTE(review): r3 is presumably the register behind the eob argument
    ; (4th cglobal argument), so `or r3d, 32` would leave 32 in it -- likely
    ; the row count expected by the shared 8x8 .dconly2 tail; confirm.
2250*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
2251*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
2252*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
2253*c0909341SAndroid Build Coastguard Worker    add                 r6d, 640
2254*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 10
2255*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
2256*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2257*c0909341SAndroid Build Coastguard Worker.pass1_main:
    ; First pass over eight full 64-byte rows at 128-byte pitch from cq.
    ; The rows are cleared (m21 = 0) right after loading. The 10bpc 8x16 DCT
    ; kernel is run, the 32-bit results are packed to 16-bit words, and
    ; m0-m3 are byte-permuted with m20.
    ; Out: m0..m3.
2258*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128*0]
2259*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*1]
2260*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*2]
2261*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*3]
2262*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*4]
2263*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*5]
2264*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*6]
2265*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*7]
2266*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
2267*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main
2268*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_end2
2269*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4
2270*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m5
2271*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m6
2272*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m7
2273*c0909341SAndroid Build Coastguard Worker    REPX {vpermb x, m20, x}, m0, m1, m2, m3
2274*c0909341SAndroid Build Coastguard Worker    ret
2275*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_8x32_10bpc
; Named arguments: dst, stride, c, eob.
; Each loop iteration handles 16 destination rows: coefficients are packed
; to 16-bit words, rounded as (x + 5) >> 3, transposed, added to dst and
; clamped to [0, pixel_10bpc_max]. The coefficient rows that are read are
; zeroed in the same iteration.
; The loop body runs twice when eob >= 107 and once otherwise (see the
; comment at the loop branch).
; Register roles: m9 = pw_5, m10 = 0, m11 = pixel_10bpc_max,
; r4 = stride*3, r5 = stride*5, r6 = stride*7, r7 = dst + stride*8.
2276*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
2277*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pw_5]
2278*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
2279*c0909341SAndroid Build Coastguard Worker    pxor                m10, m10
2280*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*5]
2281*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pixel_10bpc_max]
    ; eobd becomes negative (bit 31 set) exactly when eob < 107.
2282*c0909341SAndroid Build Coastguard Worker    sub                eobd, 107
2283*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq+r4*2]
2284*c0909341SAndroid Build Coastguard Worker.loop:
    ; Load eight 64-byte rows at 128-byte pitch, packing adjacent rows to
    ; saturated 16-bit words.
2285*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128*0]
2286*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+128*1]
2287*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*2]
2288*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+128*3]
2289*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*4]
2290*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+128*5]
2291*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*6]
2292*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+128*7]
2293*c0909341SAndroid Build Coastguard Worker    lea                  r7, [dstq+strideq*8]
    ; The buffer-clearing stores are interleaved with the rounding
    ; arithmetic: x = (x + 5) >> 3 (saturating add, arithmetic shift).
2294*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
2295*c0909341SAndroid Build Coastguard Worker    REPX     {paddsw x, m9}, m0, m1, m2, m3
2296*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
2297*c0909341SAndroid Build Coastguard Worker    REPX     {psraw  x, 3 }, m0, m1, m2, m3
    ; Move to the next 64-byte column block for a possible second iteration.
2298*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
    ; Gather 16 destination rows while transposing the coefficients.
    ; Lane layout after all inserts: m4 = rows 0,4,8,12; m5 = rows 1,5,9,13;
    ; m6 = rows 2,6,10,14; m7 = rows 3,7,11,15 (rows 8-15 are addressed
    ; through r7). The loads and the unpack instructions are independent of
    ; each other and are interleaved.
2299*c0909341SAndroid Build Coastguard Worker    mova                xm4, [dstq+strideq*0]
2300*c0909341SAndroid Build Coastguard Worker    mova                xm5, [dstq+strideq*1]
2301*c0909341SAndroid Build Coastguard Worker    mova                xm6, [dstq+strideq*2]
2302*c0909341SAndroid Build Coastguard Worker    mova                xm7, [dstq+r4     *1]
2303*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m1
2304*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, [dstq+strideq*4], 1
2305*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
2306*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym5, [dstq+r5     *1], 1
2307*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
2308*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym6, [dstq+r4     *2], 1
2309*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
2310*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym7, [dstq+r6     *1], 1
2311*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m8
2312*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [r7  +strideq*0], 2
2313*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m8
2314*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m5, [r7  +strideq*1], 2
2315*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m2, m1
2316*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [r7  +strideq*2], 2
2317*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1
2318*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m7, [r7  +r4     *1], 2
2319*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
2320*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [r7  +strideq*4], 3
2321*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
2322*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m5, [r7  +r5     *1], 3
2323*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m8
2324*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [r7  +r4     *2], 3
2325*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m8
2326*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m7, [r7  +r6     *1], 3
    ; Add the residual to the pixels and clamp to [0, pixel_10bpc_max].
2327*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
2328*c0909341SAndroid Build Coastguard Worker    paddw                m1, m5
2329*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
2330*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7
2331*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsw x, m10}, m0, m1, m2, m3
2332*c0909341SAndroid Build Coastguard Worker    REPX    {pminsw x, m11}, m0, m1, m2, m3
    ; Scatter the lanes back to the same 16 rows they were gathered from.
2333*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
2334*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*1], xm1
2335*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], xm2
2336*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r4     *1], xm3
2337*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*4], ym0, 1
2338*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r5     *1], ym1, 1
2339*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r4     *2], ym2, 1
2340*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r6     *1], ym3, 1
    ; dstq moves down 16 rows (r7 = old dstq + stride*8) for the next pass.
2341*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r7+strideq*8]
2342*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +strideq*0], m0, 2
2343*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +strideq*1], m1, 2
2344*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +strideq*2], m2, 2
2345*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +r4     *1], m3, 2
2346*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +strideq*4], m0, 3
2347*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +r5     *1], m1, 3
2348*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +r4     *2], m2, 3
2349*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r7  +r6     *1], m3, 3
    ; Loop control without a separate counter: adding 0x80000000 flips bit 31
    ; and sets the carry flag iff bit 31 was already set. If eob >= 107,
    ; bit 31 starts clear, so the first add gives no carry (loop again) and
    ; the second add carries (exit). If eob < 107, bit 31 starts set and the
    ; first add carries, so the loop body runs only once.
2350*c0909341SAndroid Build Coastguard Worker    add                eobd, 0x80000000
2351*c0909341SAndroid Build Coastguard Worker    jnc .loop
2352*c0909341SAndroid Build Coastguard Worker    RET
2353*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_32x8_10bpc(dst, stride, c, eob)
;
; Inverse DCT/DCT for a 32x8 block: transforms the coefficients at cq, adds the
; result to the pixels at dstq and clamps each pixel to [0, pixel_10bpc_max].
;
; Dispatch on eob:
;   eob == 0   -> .dconly (only the DC coefficient is used)
;   eob <  43  -> .fast   (reads cq+64*0..3 only)
;   eob <  107 -> reads cq+64*0..7 and uses the *_fast helper entry points
;   otherwise  -> .full   (reads all of cq+64*0..15)
; Each path leaves in r6d the starting offset for .zero_loop (0, 64*1 or 64*3)
; so that exactly the coefficient vectors it read are cleared afterwards.
; NOTE(review): this relies on the helper routines that live outside this
; function preserving r6 -- confirm.
;
; Constants set up for the first pass: m11 = permB, m10 = permB with every
; qword shifted right by 32, m12 = pd_2896, m13 = pd_2048 (rounding term),
; m14/m15 = clip_18b_min/clip_18b_max.
2354*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
2355*c0909341SAndroid Build Coastguard Worker%undef cmp
    ; r5 = base address for the o() constant references used below
2356*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
2357*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
2358*c0909341SAndroid Build Coastguard Worker    jz .dconly
2359*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(permB)]
2360*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0] ;  0  1
2361*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64* 1] ;  2  3
2362*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 2] ;  4  5
2363*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+64* 3] ;  6  7
2364*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
2365*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
2366*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
2367*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
2368*c0909341SAndroid Build Coastguard Worker    psrlq               m10, m11, 32
    ; Win64 only: stash xmm6/xmm7 in the first 32 bytes of the coefficient
    ; buffer (that data is already in m0); they are reloaded at .end.
    ; NOTE(review): presumably needed because m22/m23 alias these callee-saved
    ; registers under x86inc's AVX-512 register mapping while the cglobal line
    ; declares 0 vector registers -- confirm against x86inc.asm.
2369*c0909341SAndroid Build Coastguard Worker%if WIN64
2370*c0909341SAndroid Build Coastguard Worker    movaps        [cq+16*0], xmm6
2371*c0909341SAndroid Build Coastguard Worker    movaps        [cq+16*1], xmm7
2372*c0909341SAndroid Build Coastguard Worker%endif
    ; Gather the odd-numbered inputs needed by every path.
2373*c0909341SAndroid Build Coastguard Worker    mova                m16, m11
2374*c0909341SAndroid Build Coastguard Worker    vpermi2q            m16, m0, m1     ;  1  5
2375*c0909341SAndroid Build Coastguard Worker    mova                m17, m11
2376*c0909341SAndroid Build Coastguard Worker    vpermi2q            m17, m8, m4     ;  7  3
2377*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43
2378*c0909341SAndroid Build Coastguard Worker    jl .fast
    ; eob >= 43: also read the second group of four coefficient vectors.
2379*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64* 4] ;  8  9
2380*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64* 5] ; 10 11
2381*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64* 6] ; 12 13
2382*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64* 7] ; 14 15
2383*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m10, m18   ;  0  8
2384*c0909341SAndroid Build Coastguard Worker    vpermt2q            m18, m11, m6    ;  9 13
2385*c0909341SAndroid Build Coastguard Worker    mova                m19, m11
2386*c0909341SAndroid Build Coastguard Worker    vpermi2q            m19, m7, m20    ; 15 11
2387*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 107
2388*c0909341SAndroid Build Coastguard Worker    jge .full
    ; 43 <= eob < 107: inputs 16..31 are treated as zero.
2389*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m6    ;  4 12
2390*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m10, m8    ;  2  6
2391*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m10, m20   ; 14 10
    ; .zero_loop will run for r6 = 64, 0 and clear cq+64*0..7
2392*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64*1
2393*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main_fast
2394*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).main_fast
2395*c0909341SAndroid Build Coastguard Worker    call .main_fast
2396*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end
2397*c0909341SAndroid Build Coastguard Worker    jmp .end
    ; eob >= 107: read the remaining eight coefficient vectors and run the
    ; complete first pass.
2398*c0909341SAndroid Build Coastguard Worker.full:
2399*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 8] ; 16 17
2400*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64* 9] ; 18 19
2401*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+64*10] ; 20 21
2402*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*11] ; 22 23
2403*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m9    ;  4 20
2404*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m10, m21   ; 14 22
2405*c0909341SAndroid Build Coastguard Worker    vpermt2q            m21, m11, m5    ; 23 19
2406*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m10, m20   ; 18 10
2407*c0909341SAndroid Build Coastguard Worker    mova                m20, m11
2408*c0909341SAndroid Build Coastguard Worker    vpermi2q            m20, m2, m9     ; 17 21
2409*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64*12] ; 24 25
2410*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+64*13] ; 26 27
2411*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*14] ; 28 29
2412*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+64*15] ; 30 31
2413*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m10, m22   ; 16 24
2414*c0909341SAndroid Build Coastguard Worker    vpermt2q            m22, m11, m3    ; 25 29
2415*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m10, m6    ; 28 12
2416*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m10, m9    ;  2 26
2417*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
2418*c0909341SAndroid Build Coastguard Worker    vpermi2q             m6, m23, m8    ; 30  6
2419*c0909341SAndroid Build Coastguard Worker    vpermt2q            m23, m11, m9    ; 31 27
    ; .zero_loop will run for r6 = 192, 128, 64, 0 and clear cq+64*0..15
2420*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64*3
2421*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main
2422*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).main
2423*c0909341SAndroid Build Coastguard Worker    call .main
2424*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end
2425*c0909341SAndroid Build Coastguard Worker    jmp .end
    ; eob < 43: only the four vectors loaded at the top are used.
2426*c0909341SAndroid Build Coastguard Worker.fast:
2427*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m10, m0    ;  0  0
2428*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m10, m1    ;  4  4
2429*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m10, m8    ;  2  6
    ; r6d = 0: .zero_loop runs once and clears cq+64*0..3
2430*c0909341SAndroid Build Coastguard Worker    xor                 r6d, r6d
2431*c0909341SAndroid Build Coastguard Worker    call .main_fast2
2432*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end
    ; Common tail: restore Win64 registers, transpose, clear the coefficient
    ; buffer, run the second pass and write the pixels.
2433*c0909341SAndroid Build Coastguard Worker.end:
2434*c0909341SAndroid Build Coastguard Worker%if WIN64
2435*c0909341SAndroid Build Coastguard Worker    movaps             xmm6, [cq+16*0]
2436*c0909341SAndroid Build Coastguard Worker    movaps             xmm7, [cq+16*1]
2437*c0909341SAndroid Build Coastguard Worker%endif
    ; NOTE(review): vzeroupper clears bits 128 and up of zmm0-15 only, yet
    ; m0-m7 are still live and consumed by .transpose_8x32 below. This
    ; presumably works because x86inc maps m0-m15 to zmm16-31 for AVX-512
    ; code, so only m16+ (dead by now) are affected -- confirm before moving.
2438*c0909341SAndroid Build Coastguard Worker    vzeroupper
2439*c0909341SAndroid Build Coastguard Worker    call .transpose_8x32
2440*c0909341SAndroid Build Coastguard Worker    pxor                m14, m14
    ; Clear four 64-byte vectors per iteration, stepping r6 down by 64 (i.e.
    ; 256 bytes of cq) until it goes negative.
2441*c0909341SAndroid Build Coastguard Worker.zero_loop:
2442*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*4+64*3], m14
2443*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*4+64*2], m14
2444*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*4+64*1], m14
2445*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*4+64*0], m14
2446*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 64
2447*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
    ; Switch r5 to the 8bpc constant base before calling the 8bpc routine.
2448*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
    ; Qword-interleave the transpose outputs (m0, m2..m8) into m0..m7.
2449*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
2450*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
2451*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
2452*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
2453*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m7
2454*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7
2455*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m6, m8
2456*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m8
2457*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    ; m12 = 0 is the lower clamp bound used by .write_32x4.
2458*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
    ; .write_32x8_start: load the output scale (m11 = pw_2048), the upper clamp
    ; (m13 = pixel_10bpc_max) and r3 = stride*3, then fall into .write_32x8.
2459*c0909341SAndroid Build Coastguard Worker.write_32x8_start:
2460*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_2048]
2461*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
2462*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
    ; .write_32x8: scale m0-m3 with pmulhrsw and write them via a call to
    ; .write_32x4, then scale m4-m7 into m0-m3 and fall through into
    ; .write_32x4, whose ret returns to whoever entered this code.
2463*c0909341SAndroid Build Coastguard Worker.write_32x8:
2464*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11
2465*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11
2466*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11
2467*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11
2468*c0909341SAndroid Build Coastguard Worker    call .write_32x4
2469*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m4
2470*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m5
2471*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m6
2472*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m7
    ; .write_32x4: add m0-m3 to four destination rows, clamp to [m12, m13],
    ; store them back and advance dstq by four rows.
2473*c0909341SAndroid Build Coastguard Worker.write_32x4:
2474*c0909341SAndroid Build Coastguard Worker    paddw                m0, [dstq+strideq*0]
2475*c0909341SAndroid Build Coastguard Worker    paddw                m1, [dstq+strideq*1]
2476*c0909341SAndroid Build Coastguard Worker    paddw                m2, [dstq+strideq*2]
2477*c0909341SAndroid Build Coastguard Worker    paddw                m3, [dstq+r3       ]
2478*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsw x, m12}, m0, m1, m2, m3
2479*c0909341SAndroid Build Coastguard Worker    REPX    {pminsw x, m13}, m0, m1, m2, m3
2480*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
2481*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
2482*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*2], m2
2483*c0909341SAndroid Build Coastguard Worker    mova   [dstq+r3       ], m3
2484*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2485*c0909341SAndroid Build Coastguard Worker    ret
    ; .dconly: eob == 0. Computes r6d = (dc*181 + 640) >> 10 and tail-jumps to
    ; the shared 32x16 DC-only code. eobd is zero on this path (jz above), so
    ; storing it to [cq] clears the DC coefficient.
    ; NOTE(review): r3d presumably aliases eobd (4th argument), so "or r3d, 8"
    ; leaves 8 in it, likely the row count expected by .dconly2 -- confirm.
2486*c0909341SAndroid Build Coastguard Worker.dconly:
2487*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
2488*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
2489*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8
2490*c0909341SAndroid Build Coastguard Worker    add                 r6d, 640
2491*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 10
2492*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
2493*c0909341SAndroid Build Coastguard WorkerALIGN function_align
    ; .main_fast3: reduced first-pass variant for the input layout stated on
    ; the next comment line. Unlike the other .main* entry points it computes
    ; the dct16 output butterflies itself (against the single even term in m3)
    ; and then joins the shared code at .main4.
    ; It reads m12 (multiplier), m13 (rounding term) and m14/m15 (clip bounds)
    ; as inputs; the 32x8 entry code loads them with pd_2896, pd_2048 and
    ; clip_18b_min/max.
    ; No call to this label appears in the visible part of the file;
    ; presumably it is used by other transform sizes -- confirm.
2494*c0909341SAndroid Build Coastguard Worker.main_fast3:
2495*c0909341SAndroid Build Coastguard Worker    ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
2496*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(pd_401_4076)]
2497*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m0, m12
2498*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m5
2499*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m3, m4
2500*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m3, m4     ; m3=idct8:t0-7, m4=t8a t15a
2501*c0909341SAndroid Build Coastguard Worker
2502*c0909341SAndroid Build Coastguard Worker    ; t8a t15a -> t8/9 t14/15
2503*c0909341SAndroid Build Coastguard Worker
2504*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(pd_3784_m3784)]
2505*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m4, q1032
2506*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m4, [o(pd_1567)]{bcstd}
2507*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m7
2508*c0909341SAndroid Build Coastguard Worker    paddd                m6, m13
2509*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
2510*c0909341SAndroid Build Coastguard Worker    psrad                m5, 12         ; m5=t9a t14a
2511*c0909341SAndroid Build Coastguard Worker
2512*c0909341SAndroid Build Coastguard Worker    ; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4]
2513*c0909341SAndroid Build Coastguard Worker
2514*c0909341SAndroid Build Coastguard Worker    shufps               m6, m4, m5, q1032     ; t12  t13
2515*c0909341SAndroid Build Coastguard Worker    shufps               m8, m4, m5, q3210     ; t11a t10
2516*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m6, m12
2517*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m8, m12
2518*c0909341SAndroid Build Coastguard Worker    paddd                m9, m13
2519*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9, m7     ; t12 t13a
2520*c0909341SAndroid Build Coastguard Worker    psubd                m4, m9, m7     ; t11 t10a
2521*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m5, m4
2522*c0909341SAndroid Build Coastguard Worker
    ; dct16 outputs: every pair is m3 plus/minus one odd term, then clipped
    ; to [m14, m15].
2523*c0909341SAndroid Build Coastguard Worker    psubd                m7, m3, m6   ; dct16 out15 out14
2524*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3, m6   ; dct16 out0  out1
2525*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m5   ; dct16 out12 out13
2526*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3, m5   ; dct16 out3  out2
2527*c0909341SAndroid Build Coastguard Worker    psubd                m5, m3, m4   ; dct16 out11 out10
2528*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3, m4   ; dct16 out4  out5
2529*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m8   ; dct16 out8  out9
2530*c0909341SAndroid Build Coastguard Worker    paddd                m3, m8       ; dct16 out7  out6
2531*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
2532*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
2533*c0909341SAndroid Build Coastguard Worker
2534*c0909341SAndroid Build Coastguard Worker    ; idct32_bottomhalf
2535*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m18, [o(pd_201_m601)]
2536*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m19, [o(pd_4091_4052)]
2537*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m16, m19
2538*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m18
2539*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m17, m16
2540*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m17, m16
2541*c0909341SAndroid Build Coastguard Worker
2542*c0909341SAndroid Build Coastguard Worker    ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]
2543*c0909341SAndroid Build Coastguard Worker
2544*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [o(pd_799_m2276)]
2545*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(pd_4017_3406)]
2546*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m17, m10
2547*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m17, m11
2548*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m16, m11
2549*c0909341SAndroid Build Coastguard Worker    pmulld               m9, m16, m10
2550*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m18, m19
2551*c0909341SAndroid Build Coastguard Worker    psubd               m18, m8
2552*c0909341SAndroid Build Coastguard Worker    paddd               m19, m9
2553*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m18, m19
2554*c0909341SAndroid Build Coastguard Worker
2555*c0909341SAndroid Build Coastguard Worker    ; m17=t31  t24  -> t28/31a t24/27a, m16=t16  t23  -> t16/19a t20/23a
2556*c0909341SAndroid Build Coastguard Worker    ; m18=t17a t22a -> t17/18  t21/22,  m19=t30a t25a -> t29/30  t25/26
2557*c0909341SAndroid Build Coastguard Worker
2558*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m17, m19   ; t24a t25 [or t27a t26]
2559*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m20, m16, m18   ; t16a t17 [or t19a t18]
2560*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m22, m16, m18   ; t23a t22 [or t20a t21]
2561*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m17, m19   ; t28a t29 [or t31a t30]
    ; Duplicate the four vectors so that .main4 finds both its rotation inputs
    ; (m16/m18 and m21/m17) and its butterfly inputs (m20/m22 and m19/m23).
2562*c0909341SAndroid Build Coastguard Worker    mova                m21, m23
2563*c0909341SAndroid Build Coastguard Worker    mova                m18, m20
2564*c0909341SAndroid Build Coastguard Worker    mova                m17, m22
2565*c0909341SAndroid Build Coastguard Worker    mova                m19, m16
2566*c0909341SAndroid Build Coastguard Worker
2567*c0909341SAndroid Build Coastguard Worker    jmp .main4
    ; .main_fast2: first-pass entry used by the .fast path (eob < 43) of the
    ; 32x8 function. That caller provides m0 = in0 in0, m1 = in4 in4,
    ; m4 = in2 in6, m16 = in1 in5 and m17 = in7 in3; all other inputs are
    ; treated as zero. It also reads m12 (pd_2896 in that caller) and m13 (the
    ; rounding term added before each >>12). Runs the reduced dct8 and dct16
    ; stages through the .main3 entry points of the 8x8/16x8 routines, builds
    ; the odd-half terms, then joins the shared code at .main3.
2568*c0909341SAndroid Build Coastguard Worker.main_fast2: ; bottom three-quarters are zero
2569*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [o(pd_799_4017)]
2570*c0909341SAndroid Build Coastguard Worker    pmulld               m8, m1     ; t4  t7
2571*c0909341SAndroid Build Coastguard Worker    vpmulld              m0, [o(pd_2896)] {1to16} ; t0 t1
2572*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m8, m0
2573*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m8, m0
2574*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m8, m12
2575*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0       ;  t3   t2
2576*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main3
2577*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [o(pd_4076_3920)]
2578*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [o(pd_401_m1189)]
2579*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m4       ;  t15  t12
2580*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3       ;  t9   t10
2581*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m6, m4
2582*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m6, m4
2583*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6       ;  t14  t13
2584*c0909341SAndroid Build Coastguard Worker    mova                 m9, m4       ;  t8   t11
2585*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).main3
2586*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m23, [o(pd_4091_3973)]
2587*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [o(pd_201_995)]
2588*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m22, [o(pd_1380_601)]
2589*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(pd_3857_4052)]
    ; NOTE(review): the labels on the first two pmulld lines below look
    ; swapped relative to how .main3 names the same registers (m16 carries
    ; t16/t20, m23 carries t31/t27), and "-t25" on the third line is probably
    ; "-t23" (.main2 labels m22 as t19/t23). Comment-only inconsistency; the
    ; code is unaffected -- confirm.
2590*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m16      ;  t16  t20
2591*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m7       ;  t31  t27
2592*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m17      ; -t19 -t25
2593*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m9       ;  t28  t24
2594*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m23, m16, m17
    ; m22 holds the negated products: negate and add the rounding term at once.
2595*c0909341SAndroid Build Coastguard Worker    psubd               m22, m13, m22
2596*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m23, m16, m22, m17
    ; With the partner inputs zero, the sum/difference butterflies that .main2
    ; would perform reduce to plain copies.
2597*c0909341SAndroid Build Coastguard Worker    mova                m20, m23      ;  t30  t26
2598*c0909341SAndroid Build Coastguard Worker    mova                 m9, m16      ;  t17  t21
2599*c0909341SAndroid Build Coastguard Worker    mova                m19, m22      ;  t18  t22
2600*c0909341SAndroid Build Coastguard Worker    mova                m18, m17      ;  t29  t25
2601*c0909341SAndroid Build Coastguard Worker    jmp .main3
    ; .main_fast: first-pass odd-half entry used when 43 <= eob < 107. The
    ; 32x8 caller provides m16 = in1 in5, m17 = in7 in3, m18 = in9 in13 and
    ; m19 = in15 in11; the inputs 16..31 are treated as zero, so each rotation
    ; reduces to two plain multiplies. The 12-bit rounding shift is left to
    ; .main2.
2602*c0909341SAndroid Build Coastguard Worker.main_fast: ; bottom half is zero
2603*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m23, [o(pd_4091_3973)]
2604*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m7, [o(pd_201_995)]
2605*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m20, [o(pd_2751_2106)]
2606*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(pd_3035_3513)]
2607*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m21, [o(pd_3703_3290)]
2608*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [o(pd_1751_2440)]
2609*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m22, [o(pd_1380_601)]
2610*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(pd_3857_4052)]
    ; NOTE(review): judging by how .main2 pairs these registers (m16 with m20
    ; for t16/t17, m23 with m19 for t30/t31, m22 with m18 for t18/t19, m17
    ; with m21 for t28/t29), the labels on the m23/m16 lines and on the
    ; m21/m18 lines below look swapped, and "-t25a" on the m22 line is
    ; probably "-t23a". Comment-only inconsistency; the code is unaffected --
    ; confirm.
2611*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m16      ;  t16a  t20a
2612*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m7       ;  t31a  t27a
2613*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m19      ; -t17a -t21a
2614*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m9       ;  t30a  t26a
2615*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m18      ;  t18a  t22a
2616*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m10      ;  t29a  t25a
2617*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m17      ; -t19a -t25a
2618*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m11      ;  t28a  t24a
    ; m20/m22 hold negated products: negate them and add the rounding term in
    ; one step (.main2 does not add rounding to these two registers).
2619*c0909341SAndroid Build Coastguard Worker    psubd               m20, m13, m20
2620*c0909341SAndroid Build Coastguard Worker    psubd               m22, m13, m22
2621*c0909341SAndroid Build Coastguard Worker    jmp .main2
    ; .main: full first-pass odd half of the 32-point inverse DCT, written as
    ; a chain of fall-through entry points:
    ;   .main   rotates the four odd input pairs (ITX_MULSUB_2D is a macro
    ;           defined elsewhere in the file)
    ;   .main2  rounds and shifts by 12, does the first butterfly stage and
    ;           clips to [m14, m15]            (joined by .main_fast)
    ;   .main3  finishes the dct16 output butterflies in m0-m8 and runs the
    ;           next odd-half stage            (joined by .main_fast2)
    ;   .main4  last rotations and the pd_2896 scaling; the final ret returns
    ;           to the caller                  (joined by .main_fast3)
    ; m12 (multiplier), m13 (rounding term) and m14/m15 (clip bounds) are
    ; inputs; the 32x8 entry code loads them with pd_2896, pd_2048 and
    ; clip_18b_min/max.
2622*c0909341SAndroid Build Coastguard Worker.main:
2623*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        16, 23, 7, 9, 10, _,  201_995,  4091_3973
2624*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
2625*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
2626*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        22, 17, 7, 9, 10, _, 3857_4052, 1380_601
    ; .main2 adds the rounding term to every register it shifts except m20 and
    ; m22, so add it to those two here.
2627*c0909341SAndroid Build Coastguard Worker    paddd               m20, m13
2628*c0909341SAndroid Build Coastguard Worker    paddd               m22, m13
2629*c0909341SAndroid Build Coastguard Worker.main2:
2630*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m16, m23, m19
2631*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m16, m20, m23, m19
2632*c0909341SAndroid Build Coastguard Worker    psubd                m9, m16, m20 ; t17  t21
2633*c0909341SAndroid Build Coastguard Worker    paddd               m16, m20      ; t16  t20
2634*c0909341SAndroid Build Coastguard Worker    psubd               m20, m23, m19 ; t30  t26
2635*c0909341SAndroid Build Coastguard Worker    paddd               m23, m19      ; t31  t27
2636*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m9, m16, m20, m23
2637*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m21, m18, m17
2638*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m18, m22, m21, m17
2639*c0909341SAndroid Build Coastguard Worker    psubd               m19, m22, m18 ; t18  t22
2640*c0909341SAndroid Build Coastguard Worker    paddd               m22, m18      ; t19  t23
2641*c0909341SAndroid Build Coastguard Worker    psubd               m18, m17, m21 ; t29  t25
2642*c0909341SAndroid Build Coastguard Worker    paddd               m17, m21      ; t28  t24
2643*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m19, m22, m18, m17
2644*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
2645*c0909341SAndroid Build Coastguard Worker.main3:
2646*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(pd_4017_2276)]
2647*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [o(pd_799_3406)]
    ; dct16 output butterflies on the even half; their clipping is interleaved
    ; with the odd-half work further down.
2648*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m6   ; dct16 out15 out14
2649*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6       ; dct16 out0  out1
2650*c0909341SAndroid Build Coastguard Worker    psubd                m6, m1, m5   ; dct16 out12 out13
2651*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5       ; dct16 out3  out2
2652*c0909341SAndroid Build Coastguard Worker    psubd                m5, m2, m4   ; dct16 out11 out10
2653*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4       ; dct16 out4  out5
2654*c0909341SAndroid Build Coastguard Worker    psubd                m4, m3, m8   ; dct16 out8  out9
2655*c0909341SAndroid Build Coastguard Worker    paddd                m3, m8       ; dct16 out7  out6
2656*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        20,  9, 8, 21, _, 13, 10, 11
2657*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        18, 19, 8, 21, _, 13, 10, 11, 2
2658*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
2659*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m16, m20 ; t20  t21a
2660*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m20      ; t16  t17a
2661*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m20, m22, m19 ; t19  t18a
2662*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m22, m19      ; t23  t22a
2663*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
2664*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m19, m23, m9  ; t31  t30a
2665*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m9       ; t27  t26a
2666*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m9, m17, m18 ; t24  t25a
2667*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m17, m18      ; t28  t29a
2668*c0909341SAndroid Build Coastguard Worker    psubd               m18, m16, m20 ; t19a t18
2669*c0909341SAndroid Build Coastguard Worker    paddd               m20, m16      ; t16a t17
2670*c0909341SAndroid Build Coastguard Worker    psubd               m16, m19, m17 ; t28a t29
2671*c0909341SAndroid Build Coastguard Worker    paddd               m19, m17      ; t31a t30
2672*c0909341SAndroid Build Coastguard Worker    psubd               m17, m22, m21 ; t20a t21
2673*c0909341SAndroid Build Coastguard Worker    paddd               m22, m21      ; t23a t22
2674*c0909341SAndroid Build Coastguard Worker    psubd               m21, m9, m23  ; t27a t26
2675*c0909341SAndroid Build Coastguard Worker    paddd               m23, m9       ; t24a t25
2676*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m18, m16, m17, m21
2677*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m16, m18, m21, m17
2678*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m20, m22, m19, m23
2679*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m20, m22, m19, m23
2680*c0909341SAndroid Build Coastguard Worker.main4:
2681*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_3784)]
2682*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_1567)]
2683*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        16, 18, 8, 9, _, 13, 10, 11
2684*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        21, 17, 8, 9, _, 13, 10, 11, 2
2685*c0909341SAndroid Build Coastguard Worker    paddd                m9, m20, m22 ; t16  t17a
2686*c0909341SAndroid Build Coastguard Worker    psubd               m20, m22      ; t23  t22a
2687*c0909341SAndroid Build Coastguard Worker    paddd               m22, m19, m23 ; t31  t30a
2688*c0909341SAndroid Build Coastguard Worker    psubd               m19, m23      ; t24  t25a
2689*c0909341SAndroid Build Coastguard Worker    psubd               m23, m16, m17 ; t20a t21
2690*c0909341SAndroid Build Coastguard Worker    paddd               m16, m17      ; t19a t18
2691*c0909341SAndroid Build Coastguard Worker    psubd               m17, m18, m21 ; t27a t26
2692*c0909341SAndroid Build Coastguard Worker    paddd               m21, m18      ; t28a t29
2693*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m20, m19, m23, m17
2694*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m19, m20, m17, m23
    ; Scale the four middle terms by m12; the rounding term is added to only
    ; one operand of each sum/difference pair (m19, m17) before the >>12 below.
2695*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m19, m20, m17, m23
2696*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m22, m21, m16, m9
2697*c0909341SAndroid Build Coastguard Worker    paddd               m19, m13
2698*c0909341SAndroid Build Coastguard Worker    paddd               m17, m13
2699*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m22, m21, m16, m9
2700*c0909341SAndroid Build Coastguard Worker    psubd               m18, m19, m20 ; t23a t22
2701*c0909341SAndroid Build Coastguard Worker    paddd               m19, m20      ; t24a t25
2702*c0909341SAndroid Build Coastguard Worker    paddd               m20, m17, m23 ; t27  t26a
2703*c0909341SAndroid Build Coastguard Worker    psubd               m17, m23      ; t20  t21a
2704*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m20, m19, m18, m17
2705*c0909341SAndroid Build Coastguard Worker    ret
    ; .transpose_8x32: word-level transpose step between the two passes.
    ; In:  m0-m7 (treated as 16-bit words), r5 = base for o() addressing.
    ; Out: m0, m2, m3, m4, m5, m6, m7, m8; the 32x8 entry code then combines
    ;      them with punpck{l,h}qdq. m1, m9 and m10 are clobbered.
    ; All four word-permute index vectors come from the single idct32x8p
    ; table: one as loaded, one shifted right by 8 bits per word, and a
    ; 16-bit rotation of each of those two.
2706*c0909341SAndroid Build Coastguard Worker.transpose_8x32:
2707*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(idct32x8p)]
2708*c0909341SAndroid Build Coastguard Worker    psrlw                m8, m10, 8
2709*c0909341SAndroid Build Coastguard Worker    mova                 m9, m8
2710*c0909341SAndroid Build Coastguard Worker    vpermi2w             m8, m1, m5
2711*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m10, m5
2712*c0909341SAndroid Build Coastguard Worker    vprold               m5, m9, 16
2713*c0909341SAndroid Build Coastguard Worker    vpermi2w             m9, m3, m7
2714*c0909341SAndroid Build Coastguard Worker    vpermt2w             m3, m10, m7
2715*c0909341SAndroid Build Coastguard Worker    vprold              m10, 16
2716*c0909341SAndroid Build Coastguard Worker    mova                 m7, m5
2717*c0909341SAndroid Build Coastguard Worker    vpermi2w             m5, m0, m4
2718*c0909341SAndroid Build Coastguard Worker    vpermt2w             m0, m10, m4
2719*c0909341SAndroid Build Coastguard Worker    vpermi2w             m7, m2, m6
2720*c0909341SAndroid Build Coastguard Worker    vpermt2w             m2, m10, m6
    ; Dword interleave of the permuted pairs.
2721*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m5, m8
2722*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m8
2723*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m7, m9
2724*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m9
2725*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m2, m3
2726*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3
2727*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m1
2728*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1
2729*c0909341SAndroid Build Coastguard Worker    ret
2730*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_32x8_10bpc(dst, stride, c, eob)
;
; Reads coefficients from c, scales them, adds them to the pixels at dst,
; clamps the sums to [0, pixel_10bpc_max] and writes them back. The
; coefficient memory that was read is overwritten with zeros.
;
; Each loop iteration consumes 64*8 bytes of coefficients and updates 8 rows of
; 32 bytes of dst. The loop runs once when eob < 107 and twice otherwise (see
; the counter trick at the bottom of the loop).
;
; Register roles set up before the loop:
;   m5 = pw_4096 (pmulhrsw multiplier)   m8 = zero
;   m6 = idtx32x8p word-permute indices  m7 = m6 >> 8 per word (second index set)
;   m9 = pixel_10bpc_max                 r4/r5/r6 = stride*3 / stride*5 / stride*7
2731*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
2732*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [pw_4096]
2733*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
2734*c0909341SAndroid Build Coastguard Worker    mova                 m6, [idtx32x8p]
2735*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*5]
2736*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pixel_10bpc_max]
2737*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq+r4*2]
2738*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
    ; Bias eob so that its sign bit encodes "eob < 107"; the loop exit test
    ; below relies on this.
2739*c0909341SAndroid Build Coastguard Worker    sub                eobd, 107
2740*c0909341SAndroid Build Coastguard Worker    psrlw                m7, m6, 8
2741*c0909341SAndroid Build Coastguard Worker.loop:
    ; Load eight 64-byte coefficient vectors and pack adjacent pairs from
    ; dwords down to saturated words.
2742*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*0]
2743*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+64*1] ; 02 13
2744*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*2]
2745*c0909341SAndroid Build Coastguard Worker    packssdw             m1, [cq+64*3] ; 46 57
2746*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*4]
2747*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+64*5] ; 8a 9b
2748*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*6]
2749*c0909341SAndroid Build Coastguard Worker    packssdw             m3, [cq+64*7] ; ce df
    ; Scale by m5; assuming pw_4096 holds 4096 in every word, pmulhrsw is a
    ; rounded right shift by 3.
2750*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    ; Clear the coefficients just consumed (stores are interleaved with the
    ; permutes below).
2751*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
2752*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
2753*c0909341SAndroid Build Coastguard Worker    vpermi2w             m4, m1, m3
2754*c0909341SAndroid Build Coastguard Worker    vpermt2w             m1, m7, m3
2755*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
2756*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
2757*c0909341SAndroid Build Coastguard Worker    vpermi2w             m3, m0, m2
2758*c0909341SAndroid Build Coastguard Worker    vpermt2w             m0, m6, m2
2759*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*8
    ; After the qword interleaves each register holds two output rows, one per
    ; 256-bit half.
2760*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m1 ; 4 5
2761*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m1     ; 6 7
2762*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m4 ; 2 3
2763*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m4     ; 0 1
    ; Gather two dst rows per register (low/high 256 bits) and add them to the
    ; residual. Row offsets: 0,1 / 2,3 (r4) / 4,5 (r5) / 6 (r4*2), 7 (r6).
2764*c0909341SAndroid Build Coastguard Worker    mova                ym4, [dstq+strideq*0]
2765*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, [dstq+strideq*1], 1
2766*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
2767*c0909341SAndroid Build Coastguard Worker    mova                ym4, [dstq+strideq*2]
2768*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, [dstq+r4     *1], 1
2769*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
2770*c0909341SAndroid Build Coastguard Worker    mova                ym4, [dstq+strideq*4]
2771*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, [dstq+r5     *1], 1
2772*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
2773*c0909341SAndroid Build Coastguard Worker    mova                ym4, [dstq+r4     *2]
2774*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, [dstq+r6     *1], 1
2775*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
    ; Clamp to [0, pixel_10bpc_max].
2776*c0909341SAndroid Build Coastguard Worker    REPX     {pmaxsw x, m8}, m0, m1, m2, m3
2777*c0909341SAndroid Build Coastguard Worker    REPX     {pminsw x, m9}, m0, m1, m2, m3
    ; Scatter the halves back to the same eight dst rows.
2778*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
2779*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
2780*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym1
2781*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r4     *1], m1, 1
2782*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*4], ym2
2783*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r5     *1], m2, 1
2784*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r4     *2], ym3
2785*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r6     *1], m3, 1
    ; Advance 32 bytes to the right for the next iteration.
2786*c0909341SAndroid Build Coastguard Worker    add                dstq, 32
    ; Loop counter trick: adding 0x80000000 carries only if bit 31 of eobd is
    ; already set. With eob < 107 the bit is set by the "sub" above, so the
    ; loop exits after one pass; otherwise the first add sets the bit and the
    ; second add carries, giving exactly two passes.
2787*c0909341SAndroid Build Coastguard Worker    add                eobd, 0x80000000
2788*c0909341SAndroid Build Coastguard Worker    jnc .loop
2789*c0909341SAndroid Build Coastguard Worker    RET
2790*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_16x32_10bpc(dst, stride, c, eob)
;
; Entry point that dispatches on eob to one of four paths:
;   eob == 0        -> .dconly (tail-jumps into the 16x8 dconly routine)
;   eob <  36       -> .fast   (loads only 32-byte pieces of rows 0-7)
;   36 <= eob < 151 -> one .pass1 call, then the *_fast second-pass helpers
;   eob >= 151      -> .full   (two .pass1 calls, then the full helpers)
; All non-DC paths join at .pass2_end, which permutes the result registers and
; hands them to the shared write_16x4 routine eight times.
;
; The first pass uses 10bpc helpers addressed through o() with r5 = o_base;
; before calling the 8bpc second-pass helpers r5 is re-pointed at o_base_8bpc.
; The coefficient memory that was read is zeroed on every path.
;
; The cglobal line declares 0 vector registers; on WIN64 the code itself spills
; xmm6/xmm7 to [rsp+8]/[rsp+24] and reloads them before the final tail jump.
; NOTE(review): those slots are presumably the caller-provided shadow space --
; confirm against the x86inc prologue for 7 GPRs.
2791*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
    ; Drop any macro named "cmp" so the plain instruction is assembled below.
2792*c0909341SAndroid Build Coastguard Worker%undef cmp
2793*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
2794*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
2795*c0909341SAndroid Build Coastguard Worker    jz .dconly
    ; Constants kept live through the first pass:
    ; m12 = pd_2896, m13 = pd_2048, m14/m15 = clip_18b_min/max.
2796*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
2797*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
2798*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
2799*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
2800*c0909341SAndroid Build Coastguard Worker%if WIN64
2801*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+ 8], xmm6
2802*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+24], xmm7
2803*c0909341SAndroid Build Coastguard Worker%endif
2804*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
2805*c0909341SAndroid Build Coastguard Worker    jl .fast
2806*c0909341SAndroid Build Coastguard Worker    call .pass1
2807*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
2808*c0909341SAndroid Build Coastguard Worker    jge .full
    ; Medium path: only one .pass1 result set exists. Each punpck with the
    ; same source twice duplicates every word into both halves of a dword;
    ; rows 0 and 8 are instead paired with the zero register m9 so the value
    ; lands in the upper word ("__" in the comments marks the zero half).
2809*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
2810*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
2811*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m1, m1 ;  2
2812*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m1, m1 ;  3
2813*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m3 ;  6
2814*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m3, m3 ;  7
2815*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m6 ; 12
2816*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m6, m6 ; 13
2817*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m9, m4 ; __  8
2818*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m4, m4 ;  9
2819*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m5, m5 ; 11
2820*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m5     ; 10
2821*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m0     ; __  0
2822*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m0, m0 ;  1
2823*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7, m7 ; 14
2824*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m7, m7 ; 15
2825*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m2, m2 ;  4
2826*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m2, m2 ;  5
2827*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast
2828*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; Zero the first 64 bytes of each of the sixteen 128-byte coefficient rows
    ; (offsets r6*8 + 128*x cover 128*k for k = 0..15), i.e. exactly what
    ; .pass1 read.
2829*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64*3
2830*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
2831*c0909341SAndroid Build Coastguard Worker.zero_loop:
2832*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
2833*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 64
2834*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
2835*c0909341SAndroid Build Coastguard Worker    jmp .pass2_end
2836*c0909341SAndroid Build Coastguard Worker.full:
    ; Park the first .pass1 results in the coefficient buffer (the slots at
    ; 128*0..7 were already consumed), then run .pass1 again 64 bytes further
    ; in so it processes the other half of every 128-byte row.
2837*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*0], m0
2838*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*1], m1
2839*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*2], m2
2840*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*3], m3
2841*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*4], m4
2842*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*5], m5
2843*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*6], m6
2844*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*7], m7
2845*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
2846*c0909341SAndroid Build Coastguard Worker    call .pass1
    ; Reload the parked vectors. cq moved by +64, so [cq-64*1] is the slot
    ; written from m0 above, [cq+64*1] the one from m1, and so on.
2847*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq-64* 1] ;  0  1
2848*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64* 1] ;  2  3
2849*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64* 3] ;  4  5
2850*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64* 5] ;  6  7
2851*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64* 7] ;  8  9
2852*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 9] ; 10 11
2853*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64*11] ; 12 13
2854*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64*13] ; 14 15
2855*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
    ; Interleave words from the second-call results (m0-m7) with the reloaded
    ; first-call results; the trailing comments give the row pair held in each
    ; dword.
2856*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m7, m14   ; 30  2
2857*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m7, m9    ; 31  1
2858*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m6, m18   ; 28  4
2859*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m6        ;  3 29
2860*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m0, m9    ; 16  0
2861*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m19, m0   ; 15 17
2862*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m19, m1   ; 14 18
2863*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m1, m22   ; 19 13
2864*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m15, m5   ;  6 26
2865*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m5, m18   ; 27  5
2866*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m4, m20   ; 24  8
2867*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m4        ;  7 25
2868*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m3, m16   ; 22 10
2869*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m3, m20   ; 23  9
2870*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m22, m2   ; 12 20
2871*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m2        ; 11 21
2872*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main2
2873*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    ; Zero 2048 contiguous bytes starting at the original cq: cq is currently
    ; +64, and the offsets r6*8 + 64*{-1..2} cover every 64-byte slot.
2874*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 32*7
2875*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
2876*c0909341SAndroid Build Coastguard Worker.full_zero_loop:
2877*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
2878*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 32
2879*c0909341SAndroid Build Coastguard Worker    jge .full_zero_loop
2880*c0909341SAndroid Build Coastguard Worker    jmp .pass2_end
2881*c0909341SAndroid Build Coastguard Worker.fast:
    ; Small-eob path: fetch only the first 32 bytes of rows 0-7 and merge row
    ; pairs into single registers with a qword permute (m8 = odd dwords of
    ; permB, duplicated by movshdup).
2882*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*0]
2883*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+128*4]
2884*c0909341SAndroid Build Coastguard Worker    movshdup             m8, [o(permB)]
2885*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+128*2]
2886*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+128*6]
2887*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+128*1]
2888*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+128*3]
2889*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+128*5]
2890*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+128*7]
2891*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m8, m2 ; 0 4
2892*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m8, m3 ; 2 6
2893*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m8, m5 ; 1 3
2894*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m8, m6 ; 7 5
    ; x = (x * m12 + m13) >> 12 using the pd_2896 / pd_2048 constants loaded at
    ; entry; the stores that clear the 32-byte pieces just read are interleaved
    ; with the arithmetic.
2895*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m0, m1, m4, m7
2896*c0909341SAndroid Build Coastguard Worker    pxor               ym16, ym16
2897*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*0], ym16
2898*c0909341SAndroid Build Coastguard Worker    REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
2899*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m0, m1, m4, m7
2900*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m0, m1, m4, m7
2901*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main_fast
2902*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).main_fast
2903*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
2904*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_end2
    ; Narrow to 16-bit, then byte-permute and dword-interleave so that each of
    ; m21/m14/m18/m15 can be expanded by dup16_perm below (layout determined by
    ; the idct8x32p and dup16_perm tables, which are defined outside this
    ; excerpt).
2905*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(idct8x32p)]
2906*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4
2907*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m5
2908*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m6
2909*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m7
2910*c0909341SAndroid Build Coastguard Worker    mova                 m6, [dup16_perm]
2911*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0
2912*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m8, m2
2913*c0909341SAndroid Build Coastguard Worker    vprold               m8, 16
2914*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m8, m1
2915*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m8, m3
2916*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0, m2
2917*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m2
2918*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m1, m3
2919*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m3
2920*c0909341SAndroid Build Coastguard Worker    punpckldq           m21, m4, m2
2921*c0909341SAndroid Build Coastguard Worker    punpckhdq           m14, m4, m2
2922*c0909341SAndroid Build Coastguard Worker    punpckldq           m18, m0, m1
2923*c0909341SAndroid Build Coastguard Worker    punpckhdq           m15, m0, m1
2924*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m14 ; 2
2925*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m6, m15 ; 6
2926*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m6, m18 ; 4
    ; Row 0 is zero-extended to dwords here and shifted into the upper word by
    ; the pslld further down.
2927*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m9, ym21    ; 0
    ; OR a broadcast dword from pb_32 into the byte indices to form the
    ; permute used for the odd rows below.
2928*c0909341SAndroid Build Coastguard Worker    vpord                m6, [o(pb_32)] {1to16}
2929*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
2930*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m6, m21 ; 1
2931*c0909341SAndroid Build Coastguard Worker    vpermb              m15, m6, m15 ; 7
2932*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m6, m18 ; 5
2933*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m6, m14 ; 3
2934*c0909341SAndroid Build Coastguard Worker    pslld                m9, 16
2935*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast2
2936*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
2937*c0909341SAndroid Build Coastguard Worker.pass2_end:
    ; Common tail. Sets up m11 = pw_2048, m12 = zero, m13 = pixel_10bpc_max and
    ; r6 = stride*3 -- presumably the inputs write_16x4 expects (that routine
    ; is outside this excerpt; confirm). m22/m23 are the two qword permutes
    ; applied to each register pair before it is handed over in m8/m9.
2938*c0909341SAndroid Build Coastguard Worker    movshdup            m22, [permC]
2939*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_2048]
2940*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
2941*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
2942*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
2943*c0909341SAndroid Build Coastguard Worker    psrlq               m23, m22, 8
    ; Register pairs are written in this order:
    ; (m0,m1) (m2,m3) (m4,m5) (m6,m7) (m14,m15) (m16,m17) (m18,m19) (m20,m21).
2944*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m0
2945*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m1
2946*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
2947*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m2
2948*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m3
2949*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
2950*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m4
2951*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m5
2952*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
2953*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m6
2954*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m7
2955*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
2956*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m14
2957*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m15
2958*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
2959*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m16
2960*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m17
2961*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
2962*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m18
2963*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m19
2964*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
2965*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m22, m20
2966*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m23, m21
    ; Undo the manual WIN64 spill, then tail-jump so the final write_16x4's
    ; ret returns straight to this function's caller.
2967*c0909341SAndroid Build Coastguard Worker%if WIN64
2968*c0909341SAndroid Build Coastguard Worker    movaps             xmm6, [rsp+ 8]
2969*c0909341SAndroid Build Coastguard Worker    movaps             xmm7, [rsp+24]
2970*c0909341SAndroid Build Coastguard Worker%endif
2971*c0909341SAndroid Build Coastguard Worker    vzeroupper
2972*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_10bpc).write_16x4
2973*c0909341SAndroid Build Coastguard Worker.pass1:
    ; Local subroutine (entered via call). Multiplies the sixteen 64-byte
    ; vectors at [cq+128*k] by m12: even k into m0-m7, odd k into m16-m23, each
    ; group followed by the matching *_rect2 helper. Finishes through
    ; main_end2 and a tail jump to main_end3, whose ret returns to the caller
    ; of .pass1.
2974*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 0]
2975*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128* 2]
2976*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128* 4]
2977*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128* 6]
2978*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+128* 8]
2979*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+128*10]
2980*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+128*12]
2981*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+128*14]
2982*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_rect2
2983*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+128* 1]
2984*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+128* 3]
2985*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m12, [cq+128* 5]
2986*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m12, [cq+128* 7]
2987*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m12, [cq+128* 9]
2988*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m12, [cq+128*11]
2989*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m12, [cq+128*13]
2990*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m12, [cq+128*15]
2991*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_rect2
2992*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
2993*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end2
2994*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).main_end3
2995*c0909341SAndroid Build Coastguard Worker.dconly:
    ; eob == 0 path: r6d = DC coefficient * 181. eobd is known to be zero
    ; here, so storing it clears the DC coefficient. The "or" then leaves 32 in
    ; r3d (r3 is the register behind the eob argument) -- presumably the row
    ; count consumed by the shared 16x8 dconly code; confirm there.
2996*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
2997*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
2998*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
2999*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
3000*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_16x32_10bpc(dst, stride, c, eob)
;
; Scales the coefficients at c, adds them to the pixels at dst, clamps the sums
; to [0, pixel_10bpc_max] and stores them. The coefficient slots that were read
; are overwritten with zeros.
;
; One run of .main updates 16 dst rows (rows n and n+8 are handled together).
; With eob < 151 .main runs once; otherwise it runs twice, the second time
; 64 bytes further into the coefficient buffer and 16 rows further down dst.
;
; Constants: m10 = pw_2896x8, m11 = pw_1697x16, m12 = 2*pw_8192 (pw_16384),
;            m13 = pw_8192, m14 = zero, m15 = pixel_10bpc_max, r6 = stride*9.
3001*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
    ; Drop any macro named "cmp" so the plain instruction is assembled below.
3002*c0909341SAndroid Build Coastguard Worker%undef cmp
3003*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pw_2896x8]
3004*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_1697x16]
3005*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pw_8192]
3006*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [pixel_10bpc_max]
3007*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*9]
3008*c0909341SAndroid Build Coastguard Worker    pxor                m14, m14
3009*c0909341SAndroid Build Coastguard Worker    paddw               m12, m13, m13 ; pw_16384
    ; Small eob: jump into .main so that its final ret leaves the function.
    ; Otherwise call it once, then fall into it a second time.
3010*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
3011*c0909341SAndroid Build Coastguard Worker    jl .main
3012*c0909341SAndroid Build Coastguard Worker    call .main
    ; .main advanced cq by 128*4 and dst by 8 rows. Rewind cq to 64 bytes past
    ; its starting value and skip another 8 rows, so the second run covers the
    ; next 16 rows.
3013*c0909341SAndroid Build Coastguard Worker    add                  cq, 64-128*4
3014*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
3015*c0909341SAndroid Build Coastguard Worker.main:
    ; First .main_internal call leaves its results in m2/m4/m6/m8; scale them
    ; by m13 into m1/m3/m5/m7 so the second call can reuse the even registers.
3016*c0909341SAndroid Build Coastguard Worker    call .main_internal
3017*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*4
3018*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13, m2
3019*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13, m4
3020*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m13, m6
3021*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m13, m8
3022*c0909341SAndroid Build Coastguard Worker    call .main_internal
3023*c0909341SAndroid Build Coastguard Worker.main2:
    ; Scale the second result set in place, then pair qwords from both sets.
    ; Each register then carries rows n (low 256 bits) and n+8 (high 256 bits),
    ; as listed in the trailing comments.
3024*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13
3025*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m13
3026*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m13
3027*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m13
3028*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2 ;  0  8
3029*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2     ;  1  9
3030*c0909341SAndroid Build Coastguard Worker    call .write_16x2x2
3031*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m3, m4 ;  2 10
3032*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m3, m4 ;  3 11
3033*c0909341SAndroid Build Coastguard Worker    call .write_16x2x2
3034*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m5, m6 ;  4 12
3035*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m5, m6 ;  5 13
3036*c0909341SAndroid Build Coastguard Worker    call .write_16x2x2
3037*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m7, m8 ;  6 14
3038*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m7, m8 ;  7 15
    ; The last pair falls through into .write_16x2x2; its ret doubles as the
    ; return from .main.
3039*c0909341SAndroid Build Coastguard Worker.write_16x2x2:
    ; In: m0 = residual for rows 0 and 8, m1 = residual for rows 1 and 9
    ; (relative to dstq). Adds the dst pixels, clamps to [m14, m15], stores the
    ; four rows and advances dstq by two rows. Clobbers m2 and m9.
3040*c0909341SAndroid Build Coastguard Worker    mova                ym2, [dstq+strideq*0]
3041*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [dstq+strideq*8], 1
3042*c0909341SAndroid Build Coastguard Worker    mova                ym9, [dstq+strideq*1]
3043*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, [dstq+r6       ], 1
3044*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
3045*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
3046*c0909341SAndroid Build Coastguard Worker    pmaxsw               m0, m14
3047*c0909341SAndroid Build Coastguard Worker    pmaxsw               m1, m14
3048*c0909341SAndroid Build Coastguard Worker    pminsw               m0, m15
3049*c0909341SAndroid Build Coastguard Worker    pminsw               m1, m15
3050*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
3051*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*8], m0, 1
3052*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*1], ym1
3053*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r6       ], m1, 1
3054*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3055*c0909341SAndroid Build Coastguard Worker    ret
3056*c0909341SAndroid Build Coastguard Worker.main_internal:
    ; Loads the 64-byte vectors at [cq+128*k] for k = 0..3 and packs each with
    ; the one at k+8 (dwords -> saturated words), zeroes those eight slots, and
    ; returns the scaled, word/dword-interleaved result in m2, m4, m6, m8.
    ; Clobbers m0 and m9.
3057*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+128* 0]
3058*c0909341SAndroid Build Coastguard Worker    packssdw             m8, [cq+128* 8]
3059*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128* 1]
3060*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [cq+128* 9]
3061*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 2]
3062*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+128*10]
3063*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 3]
3064*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+128*11]
    ; Pre-scale by m10 (pw_2896x8) and reorder the qwords of every register.
3065*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m8, m6, m0, m2
3066*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    ; x += pmulhrsw(pmulhrsw(x, m11), m12) with a saturating add, first for
    ; m8/m6 and then for m0/m2; the zeroing stores are interleaved to hide
    ; latency.
3067*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m11, m8
3068*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m6
3069*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
3070*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12
3071*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m12
3072*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m4
3073*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m9
3074*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m11, m0
3075*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m2
3076*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
3077*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12
3078*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m12
3079*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
3080*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m9
    ; Word interleave followed by dword interleave of the four vectors.
3081*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m8, m6
3082*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m6
3083*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m2
3084*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
3085*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6 ; 0 1
3086*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6     ; 2 3
3087*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m8, m0 ; 4 5
3088*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m0     ; 6 7
3089*c0909341SAndroid Build Coastguard Worker    ret
3090*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x16_10bpc(dst, stride, c, eob)
;
; Dispatches on eob:
;   eob == 0   -> .dconly (only the 32-bit value at [cq] is read)
;   eob <  36  -> .fast   (256-bit loads from cq+64*0 .. cq+64*7 only)
;   eob <  151 -> the 64-byte vectors cq+64*0 .. cq+64*15 are consumed
;   otherwise  -> .full   (cq+64*0 .. cq+64*31 are consumed)
; r6d is set to 8*12 (plus 8*16 on the .full path) and drives .zero_loop,
; which clears cq+0 .. cq+r6*8+64*4-1 before the result is written out.
; ---------------------------------------------------------------------------
3091*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
; NOTE(review): presumably removes a `cmp` wrapper macro defined elsewhere so
; the plain instruction is emitted below -- confirm against the include file.
3092*c0909341SAndroid Build Coastguard Worker%undef cmp
; r5 is loaded with o_base; presumably the base used by the o(...) constant
; addressing macro (defined outside this chunk).
3093*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
3094*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3095*c0909341SAndroid Build Coastguard Worker    jz .dconly
; Constants kept in m12-m15 for the helper calls below: m12 is the multiplier
; applied to every loaded coefficient, m13 the rounding bias added before the
; ">> 12", m14/m15 the lower/upper clamp bounds used with pmaxsd/pminsd.
3096*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
3097*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
3098*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
3099*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
; cglobal requests 0 vector registers, so xmm6/xmm7 (callee-saved on Win64)
; are saved by hand here and restored at .end.
; NOTE(review): [rsp+8]/[rsp+24] is presumably the caller's shadow space.
3100*c0909341SAndroid Build Coastguard Worker%if WIN64
3101*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+ 8], xmm6
3102*c0909341SAndroid Build Coastguard Worker    movaps         [rsp+24], xmm7
3103*c0909341SAndroid Build Coastguard Worker%endif
; r6d = 8*12 -> .zero_loop will clear cq+64*0 .. cq+64*15.
3104*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 8*12
3105*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
3106*c0909341SAndroid Build Coastguard Worker    jl .fast
; Load the even-numbered 64-byte vectors 0..14, each multiplied by m12.
3107*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+64* 0]
3108*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+64* 4]
3109*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+64* 8]
3110*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+64*12]
3111*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+64* 2]
3112*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+64* 6]
3113*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m12, [cq+64*10]
3114*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m12, [cq+64*14]
3115*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
3116*c0909341SAndroid Build Coastguard Worker    jge .full
; 36 <= eob < 151: vectors 16..31 are never loaded; the "_fast" helper
; variants are used instead of the full ones.
3117*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast_rect2
3118*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast_rect2
3119*c0909341SAndroid Build Coastguard Worker    call .idct16_sumsub
; Spill the results into cq and fetch the odd-numbered vectors 1..15.
3120*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill
3121*c0909341SAndroid Build Coastguard Worker    call .main_fast_rect2
3122*c0909341SAndroid Build Coastguard Worker    jmp .pass1_end
3123*c0909341SAndroid Build Coastguard Worker.full:
; eob >= 151: additionally load the even-numbered vectors 16..30.
3124*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+64*16]
3125*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+64*20]
3126*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+64*24]
3127*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+64*28]
3128*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m12, [cq+64*18]
3129*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m12, [cq+64*22]
3130*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m12, [cq+64*26]
3131*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m12, [cq+64*30]
; r6d = 8*28 -> .zero_loop will clear cq+64*0 .. cq+64*31.
3132*c0909341SAndroid Build Coastguard Worker    add                 r6d, 8*16
3133*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_rect2
3134*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_rect2
3135*c0909341SAndroid Build Coastguard Worker    call .idct16_sumsub
; .pass1_load_spill loads odd vectors 1..15 into m0-m7; the odd vectors
; 17..31 are loaded here into m16-m23.
3136*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill
3137*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+64*17]
3138*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+64*19]
3139*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m12, [cq+64*21]
3140*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m12, [cq+64*23]
3141*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m12, [cq+64*25]
3142*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m12, [cq+64*27]
3143*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m12, [cq+64*29]
3144*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m12, [cq+64*31]
3145*c0909341SAndroid Build Coastguard Worker    call .main_rect2
3146*c0909341SAndroid Build Coastguard Worker.pass1_end:
; m11 = pd_1 is used by .idct32_pass1_end both as rounding bias and as the
; per-lane shift count. r4 = cq+64 so that [r4+128*n] addresses the
; odd-numbered 64-byte slots written by .pass1_load_spill.
3147*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
3148*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+64]
3149*c0909341SAndroid Build Coastguard Worker    call .idct32_pass1_end
; Switch r5 to o_base_8bpc before calling into the 8bpc routines below.
3150*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
; Qword interleave of the transposed data; the trailing numbers are the
; original author's labels for the resulting vectors.
3151*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m19, m5, m16  ; 11
3152*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m16      ; 10
3153*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m16, m2, m1   ;  5
3154*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m1       ;  4
3155*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m15, m4  ;  2
3156*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m4       ;  3
3157*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m14, m18 ;  8
3158*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m18, m14, m18 ;  9
3159*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m0, m20  ;  1
3160*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m20      ;  0
3161*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m20, m6, m17  ; 13
3162*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m17      ; 12
3163*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m3, m21  ;  7
3164*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m21      ;  6
3165*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m7, m8   ; 15
3166*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m8       ; 14
; The second stage is delegated to the 8bpc implementation (defined in
; another file).
3167*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
3168*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
3169*c0909341SAndroid Build Coastguard Worker    jmp .end
3170*c0909341SAndroid Build Coastguard Worker.fast:
; eob < 36: only the low 256 bits of vectors 0..7 are read. Vectors 0 and 4
; are scaled and rounded inline ((x*m12 + m13) >> 12) at ymm width and then
; permuted with m7; the remaining pairs are merged with vpermt2q first and
; scaled/rounded at full width afterwards.
3171*c0909341SAndroid Build Coastguard Worker    pmulld              ym0, ym12, [cq+64*0]
3172*c0909341SAndroid Build Coastguard Worker    pmulld              ym1, ym12, [cq+64*4]
3173*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
3174*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+64*2]
3175*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*6]
3176*c0909341SAndroid Build Coastguard Worker    mova               ym16, [cq+64*1]
3177*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+64*5]
3178*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64*3]
3179*c0909341SAndroid Build Coastguard Worker    mova               ym17, [cq+64*7]
3180*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m7, m5 ;  2  6
3181*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m7, m2 ;  1  5
3182*c0909341SAndroid Build Coastguard Worker    vpermt2q            m17, m7, m3 ;  7  3
3183*c0909341SAndroid Build Coastguard Worker    paddd               ym0, ym13
3184*c0909341SAndroid Build Coastguard Worker    paddd               ym1, ym13
3185*c0909341SAndroid Build Coastguard Worker    psrad               ym0, 12
3186*c0909341SAndroid Build Coastguard Worker    psrad               ym1, 12
3187*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0 ;  0  0
3188*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m7, m1 ;  4  4
3189*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m4, m16, m17
3190*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m4, m16, m17
3191*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m4, m16, m17
3192*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
3193*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
3194*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end2
3195*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
; Same hand-over to the 8bpc code as in .pass1_end, with half as many vectors.
3196*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
3197*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m0, m2 ; 1
3198*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2     ; 0
3199*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m3, m4 ; 2
3200*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m3, m4 ; 3
3201*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m5, m7 ; 4
3202*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m16, m5, m7 ; 5
3203*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m6, m8 ; 6
3204*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m6, m8 ; 7
3205*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
3206*c0909341SAndroid Build Coastguard Worker.end:
; Common tail of the .fast and normal paths: restore the Win64 registers
; saved at entry, clear the consumed coefficients, then write the result.
3207*c0909341SAndroid Build Coastguard Worker%if WIN64
3208*c0909341SAndroid Build Coastguard Worker    movaps             xmm6, [rsp+ 8]
3209*c0909341SAndroid Build Coastguard Worker    movaps             xmm7, [rsp+24]
3210*c0909341SAndroid Build Coastguard Worker%endif
3211*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
3212*c0909341SAndroid Build Coastguard Worker.zero_loop:
; Clears four 64-byte vectors per iteration, walking downwards from
; cq+r6*8 until r6d becomes negative.
3213*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*8+64*3], m12
3214*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*8+64*2], m12
3215*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*8+64*1], m12
3216*c0909341SAndroid Build Coastguard Worker    mova     [cq+r6*8+64*0], m12
3217*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 8*4
3218*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
; NOTE(review): m11 is presumably set up as the final rounding multiplier by
; write_32x8_start (not visible here) -- confirm. Each group of four vectors
; is multiplied by it with pmulhrsw and handed to write_32x4.
3219*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
3220*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m14
3221*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m15
3222*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m16
3223*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m17
3224*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3225*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m18
3226*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m19
3227*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m20
3228*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m21
; Tail call: write_32x4 returns directly to this function's caller.
3229*c0909341SAndroid Build Coastguard Worker    vzeroupper
3230*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3231*c0909341SAndroid Build Coastguard Worker.dconly:
; eob == 0. r6d = [cq] * 181; eobd is known to be zero here, so the store
; clears [cq] and the `or` leaves r3d = 16 (eobd is r3d under x86inc's
; argument numbering), which .dconly_loop uses as its row counter.
3232*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
3233*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
3234*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
3235*c0909341SAndroid Build Coastguard Worker.dconly3:
; r6d = (((r6d + 128) >> 8) * 181 + 384) >> 9
; NOTE(review): .dconly3/.dconly2 look like entry points shared with other
; transform sizes; no caller is visible in this chunk.
3236*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
3237*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8
3238*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
3239*c0909341SAndroid Build Coastguard Worker    add                 r6d, 384
3240*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 9
3241*c0909341SAndroid Build Coastguard Worker.dconly2:
; r6d = (r6d * 181 + 2176) >> 12, broadcast as 16-bit words into m2 and
; pre-biased by m3 (dconly_10bpc) using signed saturation.
3242*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(dconly_10bpc)]
3243*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
3244*c0909341SAndroid Build Coastguard Worker    add                 r6d, 2176
3245*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 12
3246*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, r6d
3247*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3
3248*c0909341SAndroid Build Coastguard Worker.dconly_loop:
; Two rows per iteration, one 64-byte vector per row: add the biased DC with
; signed saturation, then remove the bias with unsigned saturation.
; NOTE(review): the add/subtract pair presumably clamps the result to the
; valid pixel range; the exact bound depends on dconly_10bpc -- confirm.
3249*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2, [dstq+strideq*0]
3250*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, [dstq+strideq*1]
3251*c0909341SAndroid Build Coastguard Worker    psubusw              m0, m3
3252*c0909341SAndroid Build Coastguard Worker    psubusw              m1, m3
3253*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
3254*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m1
3255*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3256*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 2
3257*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
3258*c0909341SAndroid Build Coastguard Worker    RET
; ---------------------------------------------------------------------------
; .idct16_sumsub
; Sum/difference stage: pairs m0..m7 with m22, m21, m20, m19, m18, m17, m16
; and m9 (in that order).
;   In:  m0-m7, m9, m16-m22; m14/m15 = lower/upper clamp bounds
;   Out: m0-m7   = sums        (t0..t7)
;        m16-m23 = differences (t8..t15), m23 paired with m0 ... m16 with m7
; Every result is clamped to [m14, m15]. The pmaxsd/pminsd groups are
; interleaved with the adds/subs of the next pair of registers.
; ---------------------------------------------------------------------------
3259*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3260*c0909341SAndroid Build Coastguard Worker.idct16_sumsub:
3261*c0909341SAndroid Build Coastguard Worker    psubd               m23, m0, m22 ; t15
3262*c0909341SAndroid Build Coastguard Worker    paddd                m0, m22     ; t0
3263*c0909341SAndroid Build Coastguard Worker    psubd               m22, m1, m21 ; t14
3264*c0909341SAndroid Build Coastguard Worker    paddd                m1, m21     ; t1
3265*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m23, m0, m22, m1
3266*c0909341SAndroid Build Coastguard Worker    psubd               m21, m2, m20 ; t13
3267*c0909341SAndroid Build Coastguard Worker    paddd                m2, m20     ; t2
3268*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m23, m0, m22, m1
3269*c0909341SAndroid Build Coastguard Worker    psubd               m20, m3, m19 ; t12
3270*c0909341SAndroid Build Coastguard Worker    paddd                m3, m19     ; t3
3271*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m21, m2, m20, m3
3272*c0909341SAndroid Build Coastguard Worker    psubd               m19, m4, m18 ; t11
3273*c0909341SAndroid Build Coastguard Worker    paddd                m4, m18     ; t4
3274*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m21, m2, m20, m3
3275*c0909341SAndroid Build Coastguard Worker    psubd               m18, m5, m17 ; t10
3276*c0909341SAndroid Build Coastguard Worker    paddd                m5, m17     ; t5
3277*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m19, m4, m18, m5
3278*c0909341SAndroid Build Coastguard Worker    psubd               m17, m6, m16 ; t9
3279*c0909341SAndroid Build Coastguard Worker    paddd                m6, m16     ; t6
3280*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m19, m4, m18, m5
; The last pair takes its second operand from m9, not from the m16-m23 range.
3281*c0909341SAndroid Build Coastguard Worker    psubd               m16, m7, m9  ; t8
3282*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9      ; t7
3283*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m17, m6, m16, m7
3284*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m17, m6, m16, m7
3285*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .idct32_pass1_end  (falls through into .transpose_16x32)
; Combines the values spilled to cq by .pass1_load_spill with the register
; contents m0-m7 / m16-m23, rounds and shifts them, packs the results to
; 16-bit words and transposes them.
;   In:  m0-m7, m16-m23
;        cq  -> even 64-byte slots, read as [cq+128*n]
;        r4  -> odd  64-byte slots, read as [r4+128*n]
;               (.pass1_end passes r4 = cq+64)
;        m11 = rounding bias and per-lane shift count
;               (.pass1_end passes pd_1)
;   Clobbers: m8-m10, m12-m15
; ---------------------------------------------------------------------------
3286*c0909341SAndroid Build Coastguard Worker.idct32_pass1_end:
; Qword permutation indices for the transpose, derived from permC by shifts.
3287*c0909341SAndroid Build Coastguard Worker    psrlq               m12, [o(permC)], 24 ;  0  2  8 10  1  3  9 11
3288*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
; IDCT32_PASS1_END low, high
; Adds the bias m11 to the two spilled values for index `low`, forms the
; sum/difference with m<low> and m<high>, shifts all four results right by
; m11 and packs them (signed saturation) into m<low> and m<high>.
3289*c0909341SAndroid Build Coastguard Worker%macro IDCT32_PASS1_END 2 ; low, high
3290*c0909341SAndroid Build Coastguard Worker    paddd                m8, m11, [r4+128*%1]
3291*c0909341SAndroid Build Coastguard Worker    paddd                m9, m11, [cq+128*%1]
3292*c0909341SAndroid Build Coastguard Worker    psubd               m10, m8, m%1  ; out 16+n
3293*c0909341SAndroid Build Coastguard Worker    paddd                m8, m%1      ; out 15-n
3294*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m9, m%2  ; out  0+n
3295*c0909341SAndroid Build Coastguard Worker    psubd                m9, m%2      ; out 31-n
3296*c0909341SAndroid Build Coastguard Worker    REPX   {vpsravd x, m11}, m10, m%1, m8, m9
3297*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m10      ;  0+n 16+n
3298*c0909341SAndroid Build Coastguard Worker    packssdw            m%2, m8, m9   ; 15-n 31-n
3299*c0909341SAndroid Build Coastguard Worker%endmacro
3300*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      0, 23       ;  0 16, 15 31
3301*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      7, 16       ;  7 23,  8 24
3302*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      1, 22       ;  1 17, 14 30
3303*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      6, 17       ;  6 22,  9 25
3304*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      2, 21       ;  2 18, 13 29
3305*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      5, 18       ;  5 21, 10 26
3306*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      3, 20       ;  3 19, 12 28
3307*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      4, 19       ;  4 20, 11 27
3308*c0909341SAndroid Build Coastguard Worker.transpose_16x32:
; Step 1: qword shuffles. For each pair (m<n>, m<16+n>), n = 0..7, the
; m13-indexed selection goes to m<14+n> and the m12-indexed selection stays
; in m<n>. m14/m15 (the clamp bounds on entry) are overwritten here.
3309*c0909341SAndroid Build Coastguard Worker    mova                m14, m13
3310*c0909341SAndroid Build Coastguard Worker    vpermi2q            m14, m0, m16
3311*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m12, m16
3312*c0909341SAndroid Build Coastguard Worker    mova                m15, m13
3313*c0909341SAndroid Build Coastguard Worker    vpermi2q            m15, m1, m17
3314*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m12, m17
3315*c0909341SAndroid Build Coastguard Worker    mova                m16, m13
3316*c0909341SAndroid Build Coastguard Worker    vpermi2q            m16, m2, m18
3317*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m12, m18
3318*c0909341SAndroid Build Coastguard Worker    mova                m17, m13
3319*c0909341SAndroid Build Coastguard Worker    vpermi2q            m17, m3, m19
3320*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m12, m19
3321*c0909341SAndroid Build Coastguard Worker    mova                m18, m13
3322*c0909341SAndroid Build Coastguard Worker    vpermi2q            m18, m4, m20
3323*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m12, m20
3324*c0909341SAndroid Build Coastguard Worker    mova                m19, m13
3325*c0909341SAndroid Build Coastguard Worker    vpermi2q            m19, m5, m21
3326*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m12, m21
3327*c0909341SAndroid Build Coastguard Worker    mova                m20, m13
3328*c0909341SAndroid Build Coastguard Worker    vpermi2q            m20, m6, m22
3329*c0909341SAndroid Build Coastguard Worker    vpermt2q             m6, m12, m22
3330*c0909341SAndroid Build Coastguard Worker    mova                m21, m13
3331*c0909341SAndroid Build Coastguard Worker    vpermi2q            m21, m7, m23
3332*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m12, m23
; Step 2: interleave 16-bit words of adjacent vectors.
3333*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m2, m3   ; c04 d04 c05 d05 c06 d06 c07 d07
3334*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3       ; c00 d00 c01 d01 c02 d02 c03 d03
3335*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1   ; a04 b04 a05 b05 a06 b06 a07 b07
3336*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1       ; a00 b00 a01 b01 a02 b02 a03 b03
3337*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m4, m5   ; e04 f04 e05 f05 e06 f06 e07 f07
3338*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5       ; e00 f00 e01 f01 e02 f02 e03 f03
3339*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6, m7   ; g04 h04 g05 h05 g06 h06 g07 h07
3340*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7       ; g00 h00 g01 h01 g02 h02 g03 h03
3341*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15
3342*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15      ; a08 b08 a09 b09 a10 b10 a11 b11
3343*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15
3344*c0909341SAndroid Build Coastguard Worker    punpcklwd           m16, m17      ; c08 d08 c09 d09 c10 d10 c11 d11
3345*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15
3346*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m19      ; e08 f08 e09 f09 e10 f10 e11 f11
3347*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15
3348*c0909341SAndroid Build Coastguard Worker    punpcklwd           m20, m21      ; g08 h08 g09 h09 g10 h10 g11 h11
; Step 3: interleave dwords. The final qword interleave is done by the
; caller after this routine returns.
3349*c0909341SAndroid Build Coastguard Worker    punpckhdq           m21, m1, m5   ; e06 f06 g06 h06 e07 f07 g07 h07
3350*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m5       ; e04 f04 g04 h04 e05 f05 g05 h05
3351*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11
3352*c0909341SAndroid Build Coastguard Worker    punpckldq           m14, m16      ; a08 b08 c08 d08 a09 b09 c09 d09
3353*c0909341SAndroid Build Coastguard Worker    punpckhdq           m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11
3354*c0909341SAndroid Build Coastguard Worker    punpckldq           m18, m20      ; e08 f08 g08 h08 e09 f09 g09 h09
3355*c0909341SAndroid Build Coastguard Worker    punpckldq           m20, m4, m6   ; e00 f00 g00 h00 e01 f01 g01 h01
3356*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6       ; e02 f02 g02 h02 e03 f03 g03 h03
3357*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m15  ; a12 b12 c12 d12 a13 b13 c13 d13
3358*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m15      ; a14 b14 c14 d14 a15 b15 c15 d15
3359*c0909341SAndroid Build Coastguard Worker    punpckhdq           m15, m0, m2   ; a02 b02 c02 d02 a03 b03 c03 d03
3360*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2       ; a00 b00 c00 d00 a01 b01 c01 d01
3361*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m8   ; a04 b04 c04 d04 a05 b05 c05 d05
3362*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m8       ; a06 b06 c06 d06 a07 b07 c07 d07
3363*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15
3364*c0909341SAndroid Build Coastguard Worker    punpckldq           m17, m19      ; e12 f12 g12 h12 e13 f13 g13 h13
3365*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .pass1_load_spill
; Uses the coefficient buffer itself as spill space:
;   1. m0..m7   are stored to the even 64-byte slots 0, 2, ..., 14;
;   2. the odd slots 1, 3, ..., 15 are loaded into m0..m7, each multiplied
;      by m12;
;   3. m23..m16 (descending) are stored to the odd slots 1, 3, ..., 15.
; Step 2 must precede step 3 because the odd slots are overwritten there.
; The even slots were already read by the caller before this routine runs.
; ---------------------------------------------------------------------------
3366*c0909341SAndroid Build Coastguard Worker.pass1_load_spill:
3367*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m0
3368*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m1
3369*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m2
3370*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m3
3371*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m4
3372*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m5
3373*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m6
3374*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m7
3375*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+64* 1]
3376*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+64* 3]
3377*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+64* 5]
3378*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+64* 7]
3379*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+64* 9]
3380*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+64*11]
3381*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+64*13]
3382*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+64*15]
3383*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m23
3384*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m22
3385*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m21
3386*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m20
3387*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m19
3388*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m18
3389*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m17
3390*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m16
3391*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .main_fast2_rect2 / .main_fast2
; Entry points for the case where only m0-m3 carry input.
; .main_fast2_rect2 first rounds m0-m3 ((x + m13) >> 12) and then falls
; through into .main_fast2.
;   In:  m0-m3, m13 = rounding bias
;   Continues at .main3 (same function, further below).
; ---------------------------------------------------------------------------
3392*c0909341SAndroid Build Coastguard Worker.main_fast2_rect2:
3393*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m0, m1, m2, m3
3394*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3
3395*c0909341SAndroid Build Coastguard Worker.main_fast2: ; bottom 3/4 is zero
; Each input is multiplied by two broadcast constants ({1to16} memory
; operand), giving the eight products labelled on the right.
3396*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m0, [o(pd_4091)] {1to16} ; t31a
3397*c0909341SAndroid Build Coastguard Worker    pmulld               m0, [o(pd_201)] {1to16}      ; t16a
3398*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m3, [o(pd_1380)] {1to16} ; t19a
3399*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3857)] {1to16}     ; t28a
3400*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m2, [o(pd_3973)] {1to16} ; t27a
3401*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_995)] {1to16}      ; t20a
3402*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1, [o(pd_601)] {1to16}  ; t23a
3403*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m1, [o(pd_4052)] {1to16} ; t24a
; m20 and m6 become (m13 - x), i.e. they are negated while the rounding bias
; is applied; the other six products get the bias added. All eight are then
; shifted right by 12.
3404*c0909341SAndroid Build Coastguard Worker    REPX  {psubd x, m13, x}, m20, m6
3405*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m23, m0, m3, m21, m2, m17
3406*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17
; Duplicate each result into a second register before jumping to .main3.
; NOTE(review): presumably .main3 expects both members of each pair, which
; are equal when the skipped inputs are zero -- confirm against .main2.
3407*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
3408*c0909341SAndroid Build Coastguard Worker    mova                m16, m23
3409*c0909341SAndroid Build Coastguard Worker    mova                 m7, m20
3410*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
3411*c0909341SAndroid Build Coastguard Worker    mova                m19, m2
3412*c0909341SAndroid Build Coastguard Worker    mova                m18, m21
3413*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6
3414*c0909341SAndroid Build Coastguard Worker    mova                m22, m17
3415*c0909341SAndroid Build Coastguard Worker    jmp .main3
; ---------------------------------------------------------------------------
; .main_fast_rect2 / .main_fast
; Entry points for the case where only m0-m7 carry input.
; .main_fast_rect2 first calls idct_8x16_internal_10bpc's .round helper
; (defined outside this chunk) and then falls through into .main_fast.
;   In:  m0-m7, m13 = rounding bias
;   Continues at .main2 (same function, below).
; ---------------------------------------------------------------------------
3416*c0909341SAndroid Build Coastguard Worker.main_fast_rect2:
3417*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).round
3418*c0909341SAndroid Build Coastguard Worker.main_fast: ; bottom half is zero
; Each of m0-m7 is multiplied by two broadcast constants, giving the sixteen
; products t16a..t31a spread over m0-m7 and m16-m23.
3419*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m0, [o(pd_4091)] {1to16} ; t31a
3420*c0909341SAndroid Build Coastguard Worker    pmulld               m0, [o(pd_201)] {1to16}      ; t16a
3421*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m7, [o(pd_2751)] {1to16} ; t17a
3422*c0909341SAndroid Build Coastguard Worker    pmulld               m7, [o(pd_3035)] {1to16}     ; t30a
3423*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m4, [o(pd_3703)] {1to16} ; t29a
3424*c0909341SAndroid Build Coastguard Worker    pmulld               m4, [o(pd_1751)] {1to16}     ; t18a
3425*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m3, [o(pd_1380)] {1to16} ; t19a
3426*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [o(pd_3857)] {1to16}     ; t28a
3427*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m2, [o(pd_3973)] {1to16} ; t27a
3428*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [o(pd_995)] {1to16}      ; t20a
3429*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m5, [o(pd_2106)] {1to16} ; t21a
3430*c0909341SAndroid Build Coastguard Worker    pmulld               m5, [o(pd_3513)] {1to16}     ; t26a
3431*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m6, [o(pd_3290)] {1to16} ; t25a
3432*c0909341SAndroid Build Coastguard Worker    pmulld               m6, [o(pd_2440)] {1to16}     ; t22a
3433*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m1, [o(pd_601)] {1to16}  ; t23a
3434*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [o(pd_4052)] {1to16}     ; t24a
; m16, m20, m18 and m22 become (m13 - x): negated with the rounding bias
; already applied.
; NOTE(review): .round3 is defined elsewhere; presumably it biases the
; remaining registers and/or performs the shift -- confirm.
3435*c0909341SAndroid Build Coastguard Worker    REPX  {psubd x, m13, x}, m16, m20, m18, m22
3436*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).round3
3437*c0909341SAndroid Build Coastguard Worker    jmp .main2
3438*c0909341SAndroid Build Coastguard Worker.main_rect2:
3439*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).round
3440*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).round
3441*c0909341SAndroid Build Coastguard Worker.main:
3442*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         0, 23,  8,  9, 10, _,  201, 4091 ; t16a, t31a
3443*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        16,  7,  8,  9, 10, _, 3035, 2751 ; t17a, t30a
3444*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 19,  8,  9, 10, _, 1751, 3703 ; t18a, t29a
3445*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        20,  3,  8,  9, 10, _, 3857, 1380 ; t19a, t28a
3446*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 21,  8,  9, 10, _,  995, 3973 ; t20a, t27a
3447*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        18,  5,  8,  9, 10, _, 3513, 2106 ; t21a, t26a
3448*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 17,  8,  9, 10, _, 2440, 3290 ; t22a, t25a
3449*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        22,  1,  8,  9, 10, _, 4052,  601 ; t23a, t24a
3450*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).round
3451*c0909341SAndroid Build Coastguard Worker.main2:
3452*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).round
3453*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m16  ; t17
3454*c0909341SAndroid Build Coastguard Worker    paddd                m0, m16      ; t16
3455*c0909341SAndroid Build Coastguard Worker    psubd               m16, m23, m7  ; t30
3456*c0909341SAndroid Build Coastguard Worker    paddd               m23, m7       ; t31
3457*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m0, m16, m23
3458*c0909341SAndroid Build Coastguard Worker    paddd                m7, m20, m4  ; t19
3459*c0909341SAndroid Build Coastguard Worker    psubd               m20, m4       ; t18
3460*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m0, m16, m23
3461*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3, m19  ; t28
3462*c0909341SAndroid Build Coastguard Worker    psubd                m3, m19      ; t29
3463*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m7, m20, m4, m3
3464*c0909341SAndroid Build Coastguard Worker    psubd               m19, m2, m18  ; t21
3465*c0909341SAndroid Build Coastguard Worker    paddd                m2, m18      ; t20
3466*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m7, m20, m4, m3
3467*c0909341SAndroid Build Coastguard Worker    psubd               m18, m21, m5  ; t26
3468*c0909341SAndroid Build Coastguard Worker    paddd               m21, m5       ; t27
3469*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m19, m2, m18, m21
3470*c0909341SAndroid Build Coastguard Worker    psubd                m5, m22, m6  ; t22
3471*c0909341SAndroid Build Coastguard Worker    paddd                m6, m22      ; t23
3472*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m19, m2, m18, m21
3473*c0909341SAndroid Build Coastguard Worker    psubd               m22, m1, m17  ; t25
3474*c0909341SAndroid Build Coastguard Worker    paddd               m17, m1       ; t24
3475*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m6, m22, m17
3476*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m6, m22, m17
3477*c0909341SAndroid Build Coastguard Worker.main3:
3478*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_4017)]
3479*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_799)]
3480*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        16,  8, 9, 1, _, 13, 10, 11    ; t17a, t30a
3481*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
3482*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2276)]
3483*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_3406)]
3484*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        18, 19, 9, 1, _, 13, 10, 11    ; t21a, t26a
3485*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        22,  5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a
3486*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6, m2   ; t23a
3487*c0909341SAndroid Build Coastguard Worker    psubd                m6, m2       ; t20a
3488*c0909341SAndroid Build Coastguard Worker    psubd                m2, m17, m21 ; t27a
3489*c0909341SAndroid Build Coastguard Worker    paddd               m17, m21      ; t24a
3490*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m1, m6, m2, m17
3491*c0909341SAndroid Build Coastguard Worker    psubd               m21, m23, m4  ; t28a
3492*c0909341SAndroid Build Coastguard Worker    paddd               m23, m4       ; t31a
3493*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m1, m6, m2, m17
3494*c0909341SAndroid Build Coastguard Worker    psubd                m4, m16, m20 ; t18
3495*c0909341SAndroid Build Coastguard Worker    paddd               m16, m20      ; t17
3496*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m21, m23, m4, m16
3497*c0909341SAndroid Build Coastguard Worker    psubd               m20, m0, m7   ; t19a
3498*c0909341SAndroid Build Coastguard Worker    paddd                m0, m7       ; t16a
3499*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m21, m23, m4, m16
3500*c0909341SAndroid Build Coastguard Worker    psubd                m7, m8, m3   ; t29
3501*c0909341SAndroid Build Coastguard Worker    paddd                m3, m8       ; t30
3502*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m20, m0, m7, m3
3503*c0909341SAndroid Build Coastguard Worker    paddd                m8, m5, m18  ; t22
3504*c0909341SAndroid Build Coastguard Worker    psubd                m5, m18      ; t21
3505*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m20, m0, m7, m3
3506*c0909341SAndroid Build Coastguard Worker    psubd               m18, m22, m19 ; t26
3507*c0909341SAndroid Build Coastguard Worker    paddd               m22, m19      ; t25
3508*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m5, m18, m22
3509*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_3784)]
3510*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_1567)]
3511*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m5, m18, m22
3512*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        21, 20, 9, 19, _, 13, 10, 11    ; t19,  t28
3513*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2,  6, 9, 19, _, 13, 10, 11, 2 ; t27,  t20
3514*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7,  4, 9, 19, _, 13, 10, 11    ; t18a, t29a
3515*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        18,  5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
3516*c0909341SAndroid Build Coastguard Worker    psubd               m19, m0, m1   ; t23
3517*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1       ; t16
3518*c0909341SAndroid Build Coastguard Worker    paddd                m1, m8, m16  ; t17a
3519*c0909341SAndroid Build Coastguard Worker    psubd                m8, m16, m8  ; t22a
3520*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m19, m0, m1, m8
3521*c0909341SAndroid Build Coastguard Worker    psubd               m16, m23, m17 ; t24
3522*c0909341SAndroid Build Coastguard Worker    paddd               m23, m17      ; t31
3523*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m19, m0, m1, m8
3524*c0909341SAndroid Build Coastguard Worker    psubd               m17, m3, m22  ; t25a
3525*c0909341SAndroid Build Coastguard Worker    paddd               m22, m3       ; t30a
3526*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m16, m23, m17, m22
3527*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6, m21  ; t19a
3528*c0909341SAndroid Build Coastguard Worker    psubd                m6, m21, m6  ; t20a
3529*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m16, m23, m17, m22
3530*c0909341SAndroid Build Coastguard Worker    paddd               m21, m18, m4  ; t29
3531*c0909341SAndroid Build Coastguard Worker    psubd               m18, m4, m18  ; t26
3532*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m3, m6, m21, m18
3533*c0909341SAndroid Build Coastguard Worker    psubd                m4, m20, m2  ; t27a
3534*c0909341SAndroid Build Coastguard Worker    paddd               m20, m2       ; t28a
3535*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m3, m6, m21, m18
3536*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7, m5   ; t18
3537*c0909341SAndroid Build Coastguard Worker    psubd                m7, m5       ; t21
3538*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m4, m20, m2, m7
3539*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m4, m20, m2, m7
3540*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
3541*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m18, m16, m4, m17
3542*c0909341SAndroid Build Coastguard Worker    psubd                m5, m18, m7  ; t21a
3543*c0909341SAndroid Build Coastguard Worker    paddd               m18, m7       ; t26a
3544*c0909341SAndroid Build Coastguard Worker    psubd                m7, m16, m19 ; t23a
3545*c0909341SAndroid Build Coastguard Worker    paddd               m16, m19      ; t24a
3546*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m5, m18, m7, m16
3547*c0909341SAndroid Build Coastguard Worker    paddd               m19, m4, m6   ; t27
3548*c0909341SAndroid Build Coastguard Worker    psubd                m4, m6       ; t20
3549*c0909341SAndroid Build Coastguard Worker    psubd                m6, m17, m8  ; t22
3550*c0909341SAndroid Build Coastguard Worker    paddd               m17, m8       ; t25
3551*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m19, m4, m6, m17
3552*c0909341SAndroid Build Coastguard Worker    ret
3553*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x16_10bpc(dst, stride, c, eob)
;
; Entry point for the identity/identity 32x16 inverse transform + add.
; It loads 32-bit coefficients from c, packs them to 16-bit words,
; scales and partially transposes them in .main_internal, and then
; tail-jumps into the 16x32 identity routine's .main2 to finish.
; The coefficient rows that are read are overwritten with zero (m14).
;
; Register roles set up here (all remain live across the helper calls):
;   m10 = broadcast [pw_2896x8]      m11 = broadcast [pw_1697x16]
;   m13 = broadcast [pw_2048]        m15 = broadcast [pixel_10bpc_max]
;   m14 = all zeroes                 r6  = stride*9
; .main_internal returns its results in m2/m4/m6/m8 and clobbers m0/m9.
;
; NOTE(review): the cglobal operands "4, 7, 16" are presumably the
; x86inc argument / GPR / vector-register counts -- confirm in x86inc.
; NOTE(review): .main2 lives outside this chunk; it presumably consumes
; m1/m3/m5/m7 + m2/m4/m6/m8, r6, m14 and m15, writes the pixels, and
; ends in a ret -- verify against the 16x32 function.
;-----------------------------------------------------------------------
3554*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
; NOTE(review): presumably removes a macro override of `cmp` so the
; plain instruction is emitted below -- confirm where `cmp` is defined.
3555*c0909341SAndroid Build Coastguard Worker%undef cmp
3556*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pw_2896x8]
3557*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_1697x16]
3558*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pw_2048]
3559*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [pixel_10bpc_max]
3560*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*9]
3561*c0909341SAndroid Build Coastguard Worker    pxor                m14, m14
; eob < 151 (signed compare): run .main once; its tail-jump then ends
; the whole function.
3562*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
3563*c0909341SAndroid Build Coastguard Worker    jl .main
; Otherwise run .main twice: once as a subroutine on the original dst,
; then again by falling through with c and dst advanced.
3564*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq ; keep the original dst for the second pass
3565*c0909341SAndroid Build Coastguard Worker    call .main
; .main itself already added 64*4 to cq; this adds a further 64*12.
; NOTE(review): the net offset assumes .main2 leaves cq unchanged -- confirm.
3566*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*12
; Second pass writes 32 bytes to the right of the first pass
; (presumably 16 pixels of 16 bits each -- TODO confirm).
3567*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r4+32]
; .main: two .main_internal passes. Results of the first are rescaled
; into m1/m3/m5/m7 so that the second can reuse m2/m4/m6/m8.
3568*c0909341SAndroid Build Coastguard Worker.main:
3569*c0909341SAndroid Build Coastguard Worker    call .main_internal
3570*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*4 ; step to the next group of four 64-byte rows
; pmulhrsw by m13 is a rounding multiply; with the constant named
; pw_2048 this is presumably a rounded shift right by 4 -- confirm.
3571*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13, m2
3572*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13, m4
3573*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m13, m6
3574*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m13, m8
3575*c0909341SAndroid Build Coastguard Worker    call .main_internal
; Tail call: the second pass's m2/m4/m6/m8 are left unscaled here, so any
; further scaling and the store to dst happen in the 16x32 routine.
3576*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
; .main_internal: load rows 0-3 and 8-11 (64 bytes each) from cq, pack
; each pair to saturated 16-bit words, scale, zero the source rows and
; interleave the four registers.
;   Out:   m2, m4, m6, m8
;   Clobb: m0, m9
;   Reads: m10, m11, m14; does not modify cq
3577*c0909341SAndroid Build Coastguard Worker.main_internal:
; packssdw narrows dwords to words with signed saturation; row n ends up
; sharing a register with row n+8.
3578*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+64* 0]
3579*c0909341SAndroid Build Coastguard Worker    packssdw             m8, [cq+64* 8]
3580*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64* 1]
3581*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [cq+64* 9]
3582*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 2]
3583*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+64*10]
3584*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 3]
3585*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+64*11]
; Rounding multiply by m10, then a saturating doubling.
; NOTE(review): presumably the rectangular-block 1/sqrt(2) scale followed
; by the first-pass identity scale -- confirm against the C reference.
3586*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m8, m6, m0, m2
3587*c0909341SAndroid Build Coastguard Worker    REPX  {paddsw   x, x  }, m8, m6, m0, m2
; Reorder qwords within each register (q3120), presumably to undo the
; per-lane interleave left by packssdw -- TODO confirm.
3588*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
; For each register: x = 2*x + pmulhrsw(x, m11), with saturating adds.
; NOTE(review): presumably the 16-point identity scale -- confirm.
; The zeroing stores are interleaved with the arithmetic.
3589*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m11, m8
3590*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m6
3591*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m8
3592*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m6
3593*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m14}, 0, 1, 2, 3 ; clear the rows just consumed
3594*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m4
3595*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m9
3596*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m11, m0
3597*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m2
3598*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m0
3599*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m2
3600*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m14}, 8, 9, 10, 11 ; clear the paired rows too
3601*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
3602*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m9
; Word- then dword-interleave of the four registers (a 4x4 transpose
; step); the trailing numbers are the original authors' output labels.
3603*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m8, m6
3604*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m6
3605*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m2
3606*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
3607*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6 ; 0 1
3608*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6     ; 2 3
3609*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m8, m0 ; 4 5
3610*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m0     ; 6 7
3611*c0909341SAndroid Build Coastguard Worker    ret
3612*c0909341SAndroid Build Coastguard Worker
3613*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
3614*c0909341SAndroid Build Coastguard Worker%undef cmp
3615*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
3616*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3617*c0909341SAndroid Build Coastguard Worker    jz .dconly
3618*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
3619*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
3620*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
3621*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
3622*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      30
3623*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
3624*c0909341SAndroid Build Coastguard Worker    jl .fast
3625*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
3626*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 543
3627*c0909341SAndroid Build Coastguard Worker    jge .full
3628*c0909341SAndroid Build Coastguard Worker    call .pass1_fast ; bottomright 16x16 zero
3629*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 16*12
3630*c0909341SAndroid Build Coastguard Worker    jmp .lefthalf
3631*c0909341SAndroid Build Coastguard Worker.full:
3632*c0909341SAndroid Build Coastguard Worker    call .pass1
3633*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 16*28
3634*c0909341SAndroid Build Coastguard Worker.lefthalf:
3635*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m0
3636*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m1
3637*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m2
3638*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m3
3639*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m14
3640*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m15
3641*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m16
3642*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m17
3643*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m22
3644*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m23
3645*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m24
3646*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m25
3647*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m26
3648*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m27
3649*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m28
3650*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m29
3651*c0909341SAndroid Build Coastguard Worker    sub                  cq, 64
3652*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
3653*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
3654*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
3655*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
3656*c0909341SAndroid Build Coastguard Worker    call .pass1
3657*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
3658*c0909341SAndroid Build Coastguard Worker    call .pass2_start
3659*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
3660*c0909341SAndroid Build Coastguard Worker.right_zero_loop:
3661*c0909341SAndroid Build Coastguard Worker    mova [cq+r6*8+64+128*3], m12
3662*c0909341SAndroid Build Coastguard Worker    mova [cq+r6*8+64+128*2], m12
3663*c0909341SAndroid Build Coastguard Worker    mova [cq+r6*8+64+128*1], m12
3664*c0909341SAndroid Build Coastguard Worker    mova [cq+r6*8+64+128*0], m12
3665*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 16*4
3666*c0909341SAndroid Build Coastguard Worker    jge .right_zero_loop
3667*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 16*28
3668*c0909341SAndroid Build Coastguard Worker    jmp .end2
3669*c0909341SAndroid Build Coastguard Worker.pass2_start:
3670*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64+128* 0]
3671*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64+128* 1]
3672*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64+128* 2]
3673*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64+128* 3]
3674*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64+128* 4]
3675*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64+128* 5]
3676*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64+128* 6]
3677*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64+128* 7]
3678*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
3679*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
3680*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*0], m14
3681*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*1], m15
3682*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*2], m16
3683*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*3], m17
3684*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*4], m18
3685*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*5], m19
3686*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*6], m20
3687*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*7], m21
3688*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64+128* 8]
3689*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64+128* 9]
3690*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64+128*10]
3691*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64+128*11]
3692*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64+128*12]
3693*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64+128*13]
3694*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64+128*14]
3695*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64+128*15]
3696*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
3697*c0909341SAndroid Build Coastguard Worker.fast: ; topleft 16x16 nonzero
3698*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
3699*c0909341SAndroid Build Coastguard Worker    jl .fast2
3700*c0909341SAndroid Build Coastguard Worker    call .pass1_fast
3701*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
3702*c0909341SAndroid Build Coastguard Worker    call .pass2_fast_start
3703*c0909341SAndroid Build Coastguard Worker    jmp .end
3704*c0909341SAndroid Build Coastguard Worker.pass2_fast_start:
3705*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
3706*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*0], m14
3707*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*1], m15
3708*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*2], m16
3709*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*3], m17
3710*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*4], m18
3711*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*5], m19
3712*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*6], m20
3713*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*7], m21
3714*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
3715*c0909341SAndroid Build Coastguard Worker.fast2: ; topleft 8x8 nonzero
3716*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
3717*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*0]
3718*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+128*4]
3719*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+128*2]
3720*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+128*6]
3721*c0909341SAndroid Build Coastguard Worker    mova               ym16, [cq+128*1]
3722*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+128*5]
3723*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+128*3]
3724*c0909341SAndroid Build Coastguard Worker    mova               ym17, [cq+128*7]
3725*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 16*4
3726*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0 ;  0  0
3727*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m7, m1 ;  4  4
3728*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m7, m5 ;  2  6
3729*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m7, m2 ;  1  5
3730*c0909341SAndroid Build Coastguard Worker    vpermt2q            m17, m7, m3 ;  7  3
3731*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
3732*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end
3733*c0909341SAndroid Build Coastguard Worker    call .pass2_fast2_start
3734*c0909341SAndroid Build Coastguard Worker.end:
3735*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
3736*c0909341SAndroid Build Coastguard Worker.end2:
3737*c0909341SAndroid Build Coastguard Worker    call .pass2_end
3738*c0909341SAndroid Build Coastguard Worker.zero_loop:
3739*c0909341SAndroid Build Coastguard Worker    mova    [cq+r6*8+128*3], m12
3740*c0909341SAndroid Build Coastguard Worker    mova    [cq+r6*8+128*2], m12
3741*c0909341SAndroid Build Coastguard Worker    mova    [cq+r6*8+128*1], m12
3742*c0909341SAndroid Build Coastguard Worker    mova    [cq+r6*8+128*0], m12
3743*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 16*4
3744*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
3745*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
3746*c0909341SAndroid Build Coastguard Worker    vzeroupper
3747*c0909341SAndroid Build Coastguard Worker    ret
3748*c0909341SAndroid Build Coastguard Worker.pass2_fast2_start:
3749*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
3750*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
3751*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m22, m0, m2 ; 1
3752*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2     ; 0
3753*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m5, m7 ; 4
3754*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m24, m5, m7 ; 5
3755*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m3, m4 ; 2
3756*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m3, m4 ; 3
3757*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m6, m8 ; 6
3758*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m6, m8 ; 7
3759*c0909341SAndroid Build Coastguard Worker    mova                m10, m13
3760*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
3761*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*0], m14
3762*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*1], m15
3763*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*2], m16
3764*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*3], m17
3765*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*4], m18
3766*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*5], m19
3767*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*6], m20
3768*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*7], m21
3769*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
3770*c0909341SAndroid Build Coastguard Worker.pass2_end:
3771*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m0, m29 ; out31
3772*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m29     ; out0
3773*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m1, m28 ; out30
3774*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m28     ; out1
3775*c0909341SAndroid Build Coastguard Worker    psubsw              m28, m2, m27 ; out29
3776*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m27     ; out2
3777*c0909341SAndroid Build Coastguard Worker    psubsw              m27, m3, m26 ; out28
3778*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m26     ; out3
3779*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m4, m25 ; out27
3780*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m25     ; out4
3781*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m5, m24 ; out26
3782*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m24     ; out5
3783*c0909341SAndroid Build Coastguard Worker    psubsw              m24, m6, m23 ; out25
3784*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m23     ; out6
3785*c0909341SAndroid Build Coastguard Worker    psubsw              m23, m7, m22 ; out24
3786*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m22     ; out7
3787*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
3788*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128*0]
3789*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*1]
3790*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*2]
3791*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*3]
3792*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*4]
3793*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*5]
3794*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*6]
3795*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*7]
3796*c0909341SAndroid Build Coastguard Worker    psubsw              m22, m0, m21 ; out23
3797*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m21     ; out8
3798*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m1, m20 ; out22
3799*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m20     ; out9
3800*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m2, m19 ; out21
3801*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m19     ; out10
3802*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m3, m18 ; out20
3803*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m18     ; out11
3804*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m4, m17 ; out19
3805*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m17     ; out12
3806*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m5, m16 ; out18
3807*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m16     ; out13
3808*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m6, m15 ; out17
3809*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m15     ; out14
3810*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m7, m14 ; out16
3811*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m14     ; out15
3812*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
3813*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m15
3814*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m16
3815*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m17
3816*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m18
3817*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3818*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m19
3819*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m20
3820*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m21
3821*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m22
3822*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3823*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m23
3824*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m24
3825*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m25
3826*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m26
3827*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3828*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m27
3829*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m28
3830*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m29
3831*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m9
3832*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
3833*c0909341SAndroid Build Coastguard Worker.dconly:
3834*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
3835*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
3836*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
3837*c0909341SAndroid Build Coastguard Worker    add                 r6d, 640
3838*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 10
3839*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
3840*c0909341SAndroid Build Coastguard Worker.pass1_fast:
3841*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0]
3842*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 4]
3843*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 8]
3844*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*12]
3845*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 16*12
3846*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast
3847*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+128* 2]
3848*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+128* 6]
3849*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+128*10]
3850*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+128*14]
3851*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast
3852*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill
3853*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
3854*c0909341SAndroid Build Coastguard Worker    jmp .pass1_end
3855*c0909341SAndroid Build Coastguard Worker.pass1:
3856*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0]
3857*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 4]
3858*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 8]
3859*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*12]
3860*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*16]
3861*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*20]
3862*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*24]
3863*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*28]
3864*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main
3865*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+128* 2]
3866*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+128* 6]
3867*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+128*10]
3868*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+128*14]
3869*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+128*18]
3870*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+128*22]
3871*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+128*26]
3872*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+128*30]
3873*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main
3874*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill
3875*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+128*17]
3876*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+128*19]
3877*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+128*21]
3878*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+128*23]
3879*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+128*25]
3880*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+128*27]
3881*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+128*29]
3882*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+128*31]
3883*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main
3884*c0909341SAndroid Build Coastguard Worker.pass1_end:
3885*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
3886*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+128*8]
3887*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
3888*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m22, m0, m20  ;  1
3889*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m20      ;  0
3890*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m24, m2, m1   ;  5
3891*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m2, m1   ;  4
3892*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m14, m18 ;  8
3893*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m14, m18 ;  9
3894*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m15, m4  ;  2
3895*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m15, m4  ;  3
3896*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m3, m21  ;  7
3897*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m3, m21  ;  6
3898*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m28, m6, m17  ; 13
3899*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m6, m17  ; 12
3900*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m5, m16  ; 11
3901*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m5, m16  ; 10
3902*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m29, m7, m8   ; 15
3903*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m17, m7, m8   ; 14
3904*c0909341SAndroid Build Coastguard Worker    ret
3905*c0909341SAndroid Build Coastguard Worker.pass1_load_spill:
3906*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
3907*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m0
3908*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 1]
3909*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m1
3910*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m2
3911*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 3]
3912*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 5]
3913*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m3
3914*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m4
3915*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128* 7]
3916*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128* 9]
3917*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m5
3918*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m6
3919*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m7
3920*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*11]
3921*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*13]
3922*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*15]
3923*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m23
3924*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m22
3925*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m21
3926*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m20
3927*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m19
3928*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m18
3929*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m17
3930*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m16
3931*c0909341SAndroid Build Coastguard Worker    ret
3932*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x32_10bpc(dst, stride, c, eob)
;
; Driver for the 32x32 identity/identity inverse transform + add (10 bpc).
; It sets up constants and then runs .main between one and four times,
; selected by eob:
;   eob <  136 : one pass
;   eob <  543 : three passes
;   otherwise  : four passes
; Each .main pass gathers two groups of coefficients through
; .main_internal and tail-jumps to .main2 of the 16x32 identity function.
; .main2 is not part of this block; presumably it scales the second
; group, adds to dst with clamping and returns -- TODO confirm there.
;
; Register roles visible in this block:
;   m13 = broadcast of pw_8192, the pmulhrsw multiplier used in .main
;   m14 = zero, stored over coefficients once they have been read
;   m15 = broadcast of pixel_10bpc_max (not read here; presumably the
;         clamp limit consumed by .main2)
;   r6  = stride*9 (not read here; presumably consumed by .main2)
;   r4  = copy of the original dst, used to form dst+32 later
; The cq/dstq adjustments between passes assume .main2 leaves cq and
; dstq in a known state -- NOTE(review): verify against .main2.
;-----------------------------------------------------------------------
3933*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
; NOTE(review): presumably needed because the argument named "c" makes
; x86inc define a "cmp" alias that would shadow the cmp instruction used
; below -- confirm against x86inc.asm (DEFINE_ARGS).
3934*c0909341SAndroid Build Coastguard Worker%undef cmp
3935*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pw_8192]
3936*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [pixel_10bpc_max]
3937*c0909341SAndroid Build Coastguard Worker    pxor                m14, m14 ; m14 = 0, used to clear consumed coefficients
3938*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*9]
3939*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
3940*c0909341SAndroid Build Coastguard Worker    jl .main ; eob < 136: single pass, entered as a tail jump
3941*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq ; remember the original dst for the dst+32 step
3942*c0909341SAndroid Build Coastguard Worker    call .main ; pass 1
3943*c0909341SAndroid Build Coastguard Worker    add                  cq, 64-128*4 ; cq += 64 - 128*4 (.main itself added 128*4)
3944*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8] ; dst += 8*stride
3945*c0909341SAndroid Build Coastguard Worker    call .main ; pass 2
3946*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*12-64 ; cq += 128*12 - 64
3947*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r4+32] ; dst = original dst + 32 bytes
3948*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 543
3949*c0909341SAndroid Build Coastguard Worker    jl .main ; eob < 543: pass 3 is the last one (tail jump)
3950*c0909341SAndroid Build Coastguard Worker    call .main ; pass 3
3951*c0909341SAndroid Build Coastguard Worker    add                  cq, 64-128*4
3952*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
    ; fall through: pass 4
3953*c0909341SAndroid Build Coastguard Worker.main:
3954*c0909341SAndroid Build Coastguard Worker    call .main_internal ; first group -> m2, m4, m6, m8
3955*c0909341SAndroid Build Coastguard Worker    add                  cq, 128*4 ; step to the second group
    ; Scale the first group into m1/m3/m5/m7 so that the second
    ; .main_internal call can overwrite m2/m4/m6/m8. Assuming pw_8192
    ; holds words of 8192, pmulhrsw computes (x*8192 + 0x4000) >> 15,
    ; i.e. a rounded right shift by 2.
3956*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13, m2
3957*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13, m4
3958*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m13, m6
3959*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m13, m8
3960*c0909341SAndroid Build Coastguard Worker    call .main_internal ; second group -> m2, m4, m6, m8 (left unscaled here)
    ; Tail jump: the eventual return to this function's caller (or to the
    ; "call .main" site above) happens inside .main2, outside this view.
3961*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
; .main_internal
;   In:    cq = coefficient pointer, m14 = 0
;   Out:   m2, m4, m6, m8 = packed and interleaved coefficients
;   Clobb: m0
;   Reads one vector at each of cq+128*{0,1,2,3} and cq+128*{8,9,10,11},
;   and overwrites all eight locations with zero.
3962*c0909341SAndroid Build Coastguard Worker.main_internal:
    ; Pair offset 128*k with 128*(k+8); packssdw narrows the 32-bit
    ; values to 16 bits with signed saturation.
3963*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+128* 0]
3964*c0909341SAndroid Build Coastguard Worker    packssdw             m8, [cq+128* 8]
3965*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128* 1]
3966*c0909341SAndroid Build Coastguard Worker    packssdw             m6, [cq+128* 9]
3967*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 2]
3968*c0909341SAndroid Build Coastguard Worker    packssdw             m0, [cq+128*10]
3969*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 3]
3970*c0909341SAndroid Build Coastguard Worker    packssdw             m2, [cq+128*11]
    ; NOTE(review): q3120 is presumably the x86inc shuffle constant that
    ; picks qwords in the order 0,2,1,3, undoing the per-128-bit-lane
    ; interleave left by packssdw -- confirm.
3971*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    ; Clear the first four source vectors now that they are in registers.
3972*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
    ; Word interleave of the register pairs (m8,m6) and (m0,m2).
3973*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m8, m6
3974*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m6
3975*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m2
3976*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
    ; Clear the remaining four source vectors.
3977*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
    ; Dword interleave completes the reordering; results land in m2/m4/m6/m8.
3978*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6 ; 0 1
3979*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6     ; 2 3
3980*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m8, m0 ; 4 5
3981*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m0     ; 6 7
3982*c0909341SAndroid Build Coastguard Worker    ret
3983*c0909341SAndroid Build Coastguard Worker
3984*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
3985*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
3986*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3987*c0909341SAndroid Build Coastguard Worker    jz .dconly
3988*c0909341SAndroid Build Coastguard Worker
3989*c0909341SAndroid Build Coastguard Worker    PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
3990*c0909341SAndroid Build Coastguard Worker%undef cmp
3991*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
3992*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
3993*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
3994*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
3995*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
3996*c0909341SAndroid Build Coastguard Worker    jl .fast
3997*c0909341SAndroid Build Coastguard Worker    call .pass1
3998*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
3999*c0909341SAndroid Build Coastguard Worker    jge .full
4000*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
4001*c0909341SAndroid Build Coastguard Worker
4002*c0909341SAndroid Build Coastguard Worker    punpckhwd           m22, m0, m0
4003*c0909341SAndroid Build Coastguard Worker    punpckhwd           m23, m1, m1
4004*c0909341SAndroid Build Coastguard Worker    punpckhwd           m24, m2, m2
4005*c0909341SAndroid Build Coastguard Worker    punpckhwd           m25, m3, m3
4006*c0909341SAndroid Build Coastguard Worker    punpckhwd           m26, m4, m4
4007*c0909341SAndroid Build Coastguard Worker    punpckhwd           m27, m5, m5
4008*c0909341SAndroid Build Coastguard Worker    punpckhwd           m28, m6, m6
4009*c0909341SAndroid Build Coastguard Worker    punpckhwd           m29, m7, m7
4010*c0909341SAndroid Build Coastguard Worker    punpcklwd           m21, m1, m1
4011*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m3, m3
4012*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m5, m5
4013*c0909341SAndroid Build Coastguard Worker    punpcklwd           m15, m7, m7
4014*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
4015*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m9, m0
4016*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m2, m2
4017*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4, m4
4018*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m6, m6
4019*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast2
4020*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
4021*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*0], m14
4022*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*1], m15
4023*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*2], m16
4024*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*3], m17
4025*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*4], m18
4026*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*5], m19
4027*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*6], m20
4028*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*7], m21
4029*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
4030*c0909341SAndroid Build Coastguard Worker
4031*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
4032*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 64*3
4033*c0909341SAndroid Build Coastguard Worker.zero_loop:
4034*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
4035*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 64
4036*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
4037*c0909341SAndroid Build Coastguard Worker
4038*c0909341SAndroid Build Coastguard Worker    jmp .pass2_end
4039*c0909341SAndroid Build Coastguard Worker.full:
4040*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*0], m0
4041*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*1], m1
4042*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*2], m2
4043*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*3], m3
4044*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*4], m4
4045*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*5], m5
4046*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*6], m6
4047*c0909341SAndroid Build Coastguard Worker    mova         [cq+128*7], m7
4048*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
4049*c0909341SAndroid Build Coastguard Worker    call .pass1
4050*c0909341SAndroid Build Coastguard Worker    sub                  cq, 64
4051*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+128*0] ;  0  1
4052*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+128*1] ;  2  3
4053*c0909341SAndroid Build Coastguard Worker    mova                m24, [cq+128*2] ;  4  5
4054*c0909341SAndroid Build Coastguard Worker    mova                m25, [cq+128*3] ;  6  7
4055*c0909341SAndroid Build Coastguard Worker    mova                m26, [cq+128*4] ;  8  9
4056*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+128*5] ; 10 11
4057*c0909341SAndroid Build Coastguard Worker    mova                m28, [cq+128*6] ; 12 13
4058*c0909341SAndroid Build Coastguard Worker    mova                m29, [cq+128*7] ; 14 15
4059*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m0
4060*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m1
4061*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m2
4062*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m3
4063*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m4
4064*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m5
4065*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m6
4066*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m7
4067*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
4068*c0909341SAndroid Build Coastguard Worker
4069*c0909341SAndroid Build Coastguard Worker    punpcklwd           m20, m1, m1
4070*c0909341SAndroid Build Coastguard Worker    punpcklwd           m16, m3, m3
4071*c0909341SAndroid Build Coastguard Worker    punpcklwd           m19, m5, m5
4072*c0909341SAndroid Build Coastguard Worker    punpcklwd           m17, m7, m7
4073*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m24, m24 ;  4
4074*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m2, m2   ; 20
4075*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m28, m28 ; 12
4076*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m26, m26 ;  8
4077*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m4   ; 24
4078*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m6, m6   ; 28
4079*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
4080*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m9, m0   ; __ 16
4081*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
4082*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m9, m22  ; __  0
4083*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast
4084*c0909341SAndroid Build Coastguard Worker    punpcklwd           m21, m23, m23 ;  2
4085*c0909341SAndroid Build Coastguard Worker    punpcklwd           m15, m29, m29 ; 14
4086*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m27, m27 ; 10
4087*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m25, m25 ;  6
4088*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4089*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*0], m14
4090*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*1], m15
4091*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*2], m16
4092*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*3], m17
4093*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*4], m18
4094*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*5], m19
4095*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*6], m20
4096*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*7], m21
4097*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*15]
4098*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64* 8]
4099*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64*11]
4100*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64*12]
4101*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64*13]
4102*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64*10]
4103*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64* 9]
4104*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64*14]
4105*c0909341SAndroid Build Coastguard Worker    REPX   {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
4106*c0909341SAndroid Build Coastguard Worker                             m24, m19, m16, m27, m28, m15, m20, m23
4107*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
4108*c0909341SAndroid Build Coastguard Worker
4109*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
4110*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 32*7
4111*c0909341SAndroid Build Coastguard Worker.full_zero_loop:
4112*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
4113*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 32
4114*c0909341SAndroid Build Coastguard Worker    jge .full_zero_loop
4115*c0909341SAndroid Build Coastguard Worker
4116*c0909341SAndroid Build Coastguard Worker    jmp .pass2_end
4117*c0909341SAndroid Build Coastguard Worker.fast:
4118*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*0]
4119*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+128*4]
4120*c0909341SAndroid Build Coastguard Worker    movshdup             m8, [o(permB)]
4121*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+128*2]
4122*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+128*6]
4123*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+128*1]
4124*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+128*3]
4125*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+128*5]
4126*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+128*7]
4127*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m8, m2 ; 0 4
4128*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m8, m3 ; 2 6
4129*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m8, m5 ; 1 3
4130*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m8, m6 ; 7 5
4131*c0909341SAndroid Build Coastguard Worker    call m(idct_8x8_internal_10bpc).main_fast
4132*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).main_fast
4133*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
4134*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_end2
4135*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(idct8x32p)]
4136*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m4
4137*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m5
4138*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m6
4139*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m7
4140*c0909341SAndroid Build Coastguard Worker    mova                 m6, [dup16_perm]
4141*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m8, m0
4142*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m8, m2
4143*c0909341SAndroid Build Coastguard Worker    vprold               m8, 16
4144*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m8, m1
4145*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m8, m3
4146*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0, m2
4147*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m2
4148*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m1, m3
4149*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m3
4150*c0909341SAndroid Build Coastguard Worker    punpckldq           m21, m4, m2
4151*c0909341SAndroid Build Coastguard Worker    punpckhdq           m14, m4, m2
4152*c0909341SAndroid Build Coastguard Worker    punpckldq           m18, m0, m1
4153*c0909341SAndroid Build Coastguard Worker    punpckhdq           m15, m0, m1
4154*c0909341SAndroid Build Coastguard Worker    vpord                m7, m6, [o(pb_32)] {1to16}
4155*c0909341SAndroid Build Coastguard Worker    vpermb              m22, m7, m21 ; 1
4156*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m9, ym21    ; 0
4157*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m6, m18 ; 4
4158*c0909341SAndroid Build Coastguard Worker    vpermb              m24, m7, m18 ; 5
4159*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m6, m14 ; 2
4160*c0909341SAndroid Build Coastguard Worker    vpermb              m23, m7, m14 ; 3
4161*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m6, m15 ; 6
4162*c0909341SAndroid Build Coastguard Worker    vpermb              m25, m7, m15 ; 7
4163*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
4164*c0909341SAndroid Build Coastguard Worker    pslld                m9, 16
4165*c0909341SAndroid Build Coastguard Worker
4166*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
4167*c0909341SAndroid Build Coastguard Worker    REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
4168*c0909341SAndroid Build Coastguard Worker
4169*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast2
4170*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
4171*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*0], m14
4172*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*1], m15
4173*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*2], m16
4174*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*3], m17
4175*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*4], m18
4176*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*5], m19
4177*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*6], m20
4178*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*7], m21
4179*c0909341SAndroid Build Coastguard Worker
4180*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
4181*c0909341SAndroid Build Coastguard Worker
4182*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
4183*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
4184*c0909341SAndroid Build Coastguard Worker.pass2_end:
4185*c0909341SAndroid Build Coastguard Worker    movshdup            m30, [permC]
4186*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_2048]
4187*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
4188*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
4189*c0909341SAndroid Build Coastguard Worker    psrlq               m31, m30, 8
4190*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m0
4191*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m1
4192*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4193*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m2
4194*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m3
4195*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4196*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m4
4197*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m5
4198*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4199*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m6
4200*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m7
4201*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4202*c0909341SAndroid Build Coastguard Worker
4203*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+mmsize*0]
4204*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+mmsize*1]
4205*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+mmsize*2]
4206*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+mmsize*3]
4207*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+mmsize*4]
4208*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+mmsize*5]
4209*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+mmsize*6]
4210*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+mmsize*7]
4211*c0909341SAndroid Build Coastguard Worker
4212*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1, m21
4213*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m1, m21
4214*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m20
4215*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m2, m20
4216*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3, m19
4217*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m3, m19
4218*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m4, m18
4219*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m4, m18
4220*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5, m17
4221*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m5, m17
4222*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m6, m16
4223*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m6, m16
4224*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m7, m15
4225*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m7, m15
4226*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m8, m14
4227*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m8, m14
4228*c0909341SAndroid Build Coastguard Worker
4229*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m0
4230*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m1
4231*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4232*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m2
4233*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m3
4234*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4235*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m4
4236*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m5
4237*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4238*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m6
4239*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m7
4240*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4241*c0909341SAndroid Build Coastguard Worker
4242*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m14
4243*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m15
4244*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4245*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m16
4246*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m17
4247*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4248*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m18
4249*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m19
4250*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4251*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m20
4252*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m21
4253*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4254*c0909341SAndroid Build Coastguard Worker
4255*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m22
4256*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m23
4257*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4258*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m24
4259*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m25
4260*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4261*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m26
4262*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m27
4263*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4264*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m30, m28
4265*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m31, m29
4266*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_10bpc).write_16x4
4267*c0909341SAndroid Build Coastguard Worker    RET
; .pass1: loads 16 coefficient vectors from cq (128 bytes apart) and runs them
; through the shared 10bpc idct helpers. It ends in a tail jump, so the return
; to this routine's caller presumably happens inside .main_end3.
4268*c0909341SAndroid Build Coastguard Worker.pass1:
4269*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0] ; even-indexed vectors 0,2,...,14 -> m0-m7
4270*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 2]
4271*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 4]
4272*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128* 6]
4273*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128* 8]
4274*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*10]
4275*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*12]
4276*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*14]
4277*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main
4278*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+128* 1] ; odd-indexed vectors 1,3,...,15 -> m16-m23
4279*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+128* 3]
4280*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+128* 5]
4281*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+128* 7]
4282*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+128* 9]
4283*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+128*11]
4284*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+128*13]
4285*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+128*15]
4286*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main
4287*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end
4288*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_10bpc).main_end3 ; tail call
; .dconly: DC-only shortcut. Scales the first coefficient and hands off to the
; 16x8 .dconly2 tail. The branch that reaches this label is outside this chunk.
4289*c0909341SAndroid Build Coastguard Worker.dconly:
4290*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181 ; r6d = dc * 181
4291*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; overwrite the coefficient with eob (presumably 0 on this path, i.e. clears it; TODO confirm)
4292*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64 ; sets bit 6 of r3d; presumably a row count of 64 consumed by .dconly2 (confirm)
4293*c0909341SAndroid Build Coastguard Worker    add                 r6d, 640
4294*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 10 ; r6d = (dc*181 + 640) >> 10
4295*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
4296*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_32x64_10bpc(dst, stride, c, eob): entry and dispatch.
;   eob == 0        -> .dconly
;   eob <  136      -> .fast (which itself splits on eob < 36)
;   136 <= eob < 543 -> .pass1_fast on the +64 half of cq, then .pass1 on the +0 half
;   eob >= 543      -> .pass1 on both halves
; After pass 1 the consumed coefficients are cleared by the zeroing loops and
; the function finishes through .left_zero_loop / .pass2_end.
4297*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
4298*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base] ; r5 = constant base (presumably what o() addresses against)
4299*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4300*c0909341SAndroid Build Coastguard Worker    jz .dconly ; eob == 0: DC-only path, taken before the full PROLOGUE below
4301*c0909341SAndroid Build Coastguard Worker    PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob ; re-declare with 32 vector regs and a 64*40-byte stack argument
; NOTE(review): presumably removes a cmp macro override installed by x86inc so
; the plain cmp instructions below assemble as-is; confirm against x86inc.asm.
4302*c0909341SAndroid Build Coastguard Worker%undef cmp
4303*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)] ; constants consumed by the pass-1 routines
4304*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
4305*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
4306*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
4307*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
4308*c0909341SAndroid Build Coastguard Worker    jl .fast ; eob < 136
4309*c0909341SAndroid Build Coastguard Worker    add                  cq, 64 ; address the second 64 bytes of each 128-byte-spaced row
4310*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 543
4311*c0909341SAndroid Build Coastguard Worker    jge .full ; eob >= 543
4312*c0909341SAndroid Build Coastguard Worker    call .pass1_fast ; bottomright 16x16 zero (.pass1_fast also sets r3d = 16*12)
4313*c0909341SAndroid Build Coastguard Worker    jmp .lefthalf
4314*c0909341SAndroid Build Coastguard Worker.full:
4315*c0909341SAndroid Build Coastguard Worker    call .pass1
4316*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16*28 ; start offset for .right_zero_loop
4317*c0909341SAndroid Build Coastguard Worker.lefthalf:
4318*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m27 ; park the 16 result vectors of the first pass-1 call in cq (cq is still offset by 64)
4319*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m14
4320*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m28
4321*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m15
4322*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m22
4323*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m23
4324*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m24
4325*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m25
4326*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m0
4327*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m26
4328*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m20
4329*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m21
4330*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m18
4331*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m16
4332*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m17
4333*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m3
4334*c0909341SAndroid Build Coastguard Worker    sub                  cq, 64 ; undo the +64 offset
4335*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)] ; reload the constants (m14/m15 held results, see the stores above)
4336*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
4337*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
4338*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
4339*c0909341SAndroid Build Coastguard Worker    call .pass1 ; second pass-1 call, now on the +0 half
4340*c0909341SAndroid Build Coastguard Worker    call .pass2_start
4341*c0909341SAndroid Build Coastguard Worker
4342*c0909341SAndroid Build Coastguard Worker    pxor                m31, m31 ; m31 = 0: store value for the zeroing loops, later the lower clamp in .end_sumsub_write
4343*c0909341SAndroid Build Coastguard Worker.right_zero_loop:
4344*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3 ; clear four 64-byte vectors of the +64 half per iteration
4345*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 16*4
4346*c0909341SAndroid Build Coastguard Worker    jge .right_zero_loop ; repeat until r3d goes negative
4347*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16*28 ; the +0 half is then cleared starting from offset 16*28
4348*c0909341SAndroid Build Coastguard Worker    jmp .left_zero_loop
; .pass2_start: first part of pass 2 for the path that ran pass 1 on both
; halves. Feeds register contents and the vectors parked at cq+64 to the 8bpc
; 32x64 / 32x16 / 32x32 helper routines, and ends in a tail jump.
4349*c0909341SAndroid Build Coastguard Worker.pass2_start:
4350*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)] ; loaded before r5 is repointed on the next line
4351*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc] ; presumably the constant base expected by the 8bpc helpers called below
4352*c0909341SAndroid Build Coastguard Worker
4353*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+gprsize] ; r4 = stack scratch base; +gprsize presumably skips this routine's return address
4354*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*15+64] ; m0 and m3 are not written here; presumably still live from pass 1
4355*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 8+64]
4356*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4357*c0909341SAndroid Build Coastguard Worker    mova                 m0, m21 ; inputs for the second main_part1 call
4358*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*12+64]
4359*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*11+64]
4360*c0909341SAndroid Build Coastguard Worker    mova                 m3, m18
4361*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4362*c0909341SAndroid Build Coastguard Worker    mova                 m0, m20 ; inputs for the third main_part1 call
4363*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*13+64]
4364*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*10+64]
4365*c0909341SAndroid Build Coastguard Worker    mova                 m3, m16
4366*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4367*c0909341SAndroid Build Coastguard Worker    mova                 m0, m26 ; inputs for the fourth main_part1 call
4368*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*14+64]
4369*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 9+64]
4370*c0909341SAndroid Build Coastguard Worker    mova                 m3, m17
4371*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
4372*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
4373*c0909341SAndroid Build Coastguard Worker
4374*c0909341SAndroid Build Coastguard Worker    mova                 m0, m27 ; inputs for the 32x16 main_oddhalf_fast call: m0-m3, m16, m17
4375*c0909341SAndroid Build Coastguard Worker    mova                 m1, m28
4376*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128* 0+64]
4377*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128* 2+64]
4378*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+128* 1+64]
4379*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+128* 3+64]
4380*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
4381*c0909341SAndroid Build Coastguard Worker    mova                m26, [cq+128* 4+64] ; remaining parked vectors 4-7 -> m26-m29 for the final helper
4382*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+128* 5+64]
4383*c0909341SAndroid Build Coastguard Worker    mova                m28, [cq+128* 6+64]
4384*c0909341SAndroid Build Coastguard Worker    mova                m29, [cq+128* 7+64]
4385*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*32+gprsize], m14 ; spill m14-m21 to stack slots 32-39; .pass2_end reloads them into m0-m7
4386*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*33+gprsize], m15
4387*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*34+gprsize], m16
4388*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*35+gprsize], m17
4389*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*36+gprsize], m18
4390*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*37+gprsize], m19
4391*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*38+gprsize], m20
4392*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*39+gprsize], m21
4393*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast ; tail call
; .fast / .fast2: reduced paths for eob < 136 (entered from the dispatch code).
; Both join at .end, which clears the consumed coefficients and then writes the
; output via .pass2_end.
4394*c0909341SAndroid Build Coastguard Worker.fast: ; topleft 16x16 nonzero
4395*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
4396*c0909341SAndroid Build Coastguard Worker    jl .fast2 ; eob < 36
4397*c0909341SAndroid Build Coastguard Worker    call .pass1_fast ; also sets r3d = 16*12 for the zeroing loop
4398*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)] ; loaded here because .pass2_fast_start repoints r5 first thing
4399*c0909341SAndroid Build Coastguard Worker    call .pass2_fast_start
4400*c0909341SAndroid Build Coastguard Worker    jmp .end
4401*c0909341SAndroid Build Coastguard Worker.fast2: ; topleft 8x8 nonzero
4402*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)] ; m7 = odd dwords of permB duplicated; used as the qword permute index below
4403*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*0] ; only the low 32 bytes of vectors 0-7 are loaded
4404*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+128*4]
4405*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+128*2]
4406*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+128*6]
4407*c0909341SAndroid Build Coastguard Worker    mova               ym16, [cq+128*1]
4408*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+128*5]
4409*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+128*3]
4410*c0909341SAndroid Build Coastguard Worker    mova               ym17, [cq+128*7]
4411*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16*4 ; start offset for .left_zero_loop
4412*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0 ;  0  0
4413*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m7, m1 ;  4  4
4414*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m7, m5 ;  2  6
4415*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m7, m2 ;  1  5
4416*c0909341SAndroid Build Coastguard Worker    vpermt2q            m17, m7, m3 ;  7  3
4417*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m0, m1, m4, m16, m17 ; x = (x*m12 + m13) >> 12; entry code set m12 = pd_2896, m13 = pd_2048
4418*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m0, m1, m4, m16, m17
4419*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m0, m1, m4, m16, m17
4420*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
4421*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)] ; m11 = pd_1, presumably the rounding term used by .main_end2
4422*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_end2
4423*c0909341SAndroid Build Coastguard Worker
4424*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
4425*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m27, m0, m2 ; 0
4426*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m2     ; 1
4427*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m22, m3, m4 ; 2
4428*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m3, m4 ; 3
4429*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m5, m7 ; 4
4430*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m20, m5, m7 ; 5
4431*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m23, m6, m8 ; 6
4432*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m6, m8 ; 7
4433*c0909341SAndroid Build Coastguard Worker
4434*c0909341SAndroid Build Coastguard Worker    mova                m10, m13 ; m10 = m13, which presumably still holds pd_2048 (the other pass-2 paths load it from memory)
4435*c0909341SAndroid Build Coastguard Worker    call .pass2_fast2_start
4436*c0909341SAndroid Build Coastguard Worker.end:
4437*c0909341SAndroid Build Coastguard Worker
4438*c0909341SAndroid Build Coastguard Worker    pxor                m31, m31 ; m31 = 0: store value for the loop below, later the lower clamp in .end_sumsub_write
4439*c0909341SAndroid Build Coastguard Worker
4440*c0909341SAndroid Build Coastguard Worker.left_zero_loop:
4441*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3 ; clear four 64-byte vectors of cq per iteration
4442*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 16*4
4443*c0909341SAndroid Build Coastguard Worker    jge .left_zero_loop ; repeat until r3d goes negative
4444*c0909341SAndroid Build Coastguard Worker
4445*c0909341SAndroid Build Coastguard Worker    call .pass2_end ; final butterflies + add to dst
4446*c0909341SAndroid Build Coastguard Worker    RET
; .pass2_end: final output stage. Forms 16 sum/difference pairs in m8/m9 and
; passes each to .end_sumsub_write, which combines them with two stack vectors
; and adds the results into four destination rows. The 16th pair falls through
; into .end_sumsub_write, whose ret then returns to the caller of .pass2_end.
; Expects m31 == 0 (set by the caller before the zeroing loops).
4447*c0909341SAndroid Build Coastguard Worker.pass2_end:
4448*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi ; rename r3-r6: dst2, stride32, stklo, stkhi
4449*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m30, [pixel_10bpc_max] ; upper clamp for the output pixels
4450*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pw_2048] ; pmulhrsw multiplier (rounded >> 4)
4451*c0909341SAndroid Build Coastguard Worker
4452*c0909341SAndroid Build Coastguard Worker    mov           stride32q, strideq
4453*c0909341SAndroid Build Coastguard Worker    shl           stride32q, 5 ; stride32 = stride * 32
4454*c0909341SAndroid Build Coastguard Worker    lea              stkhiq, [rsp+31*mmsize+gprsize] ; stkhi -> stack slot 31, walks downward
4455*c0909341SAndroid Build Coastguard Worker    lea               dst2q, [dstq+stride32q]
4456*c0909341SAndroid Build Coastguard Worker    lea              stkloq, [rsp+gprsize] ; stklo -> stack slot 0, walks upward
4457*c0909341SAndroid Build Coastguard Worker    sub               dst2q, strideq    ; dst31
4458*c0909341SAndroid Build Coastguard Worker
4459*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m0, m29    ; t0[idct32]
4460*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m0, m29    ; t31[idct32]
4461*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4462*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m1, m28    ; t1[idct32]
4463*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m1, m28    ; t30[idct32]
4464*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4465*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m2, m27    ; t2[idct32]
4466*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m2, m27    ; t29[idct32]
4467*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4468*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m3, m26    ; t3[idct32]
4469*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m3, m26    ; t28[idct32]
4470*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4471*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m4, m25    ; t4[idct32]
4472*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m4, m25    ; t27[idct32]
4473*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4474*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m5, m24    ; t5[idct32]
4475*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m5, m24    ; t26[idct32]
4476*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4477*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m6, m23    ; t6[idct32]
4478*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m6, m23    ; t25[idct32]
4479*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4480*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m7, m22    ; t7[idct32]
4481*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m7, m22    ; t24[idct32]
4482*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4483*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+64*32+gprsize] ; reload the eight vectors spilled to slots 32-39 by the .pass2_*start routines
4484*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+64*33+gprsize]
4485*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+64*34+gprsize]
4486*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+64*35+gprsize]
4487*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+64*36+gprsize]
4488*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+64*37+gprsize]
4489*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+64*38+gprsize]
4490*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+64*39+gprsize]
4491*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m0, m21    ; t8[idct32]
4492*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m0, m21    ; t23[idct32]
4493*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4494*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m1, m20    ; t9[idct32]
4495*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m1, m20    ; t22[idct32]
4496*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4497*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m2, m19    ; t10[idct32]
4498*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m2, m19    ; t21[idct32]
4499*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4500*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m3, m18    ; t11[idct32]
4501*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m3, m18    ; t20[idct32]
4502*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4503*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m4, m17    ; t12[idct32]
4504*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m4, m17    ; t19[idct32]
4505*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4506*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m5, m16    ; t13[idct32]
4507*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m5, m16    ; t18[idct32]
4508*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4509*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m6, m15    ; t14[idct32]
4510*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m6, m15    ; t17[idct32]
4511*c0909341SAndroid Build Coastguard Worker    call .end_sumsub_write
4512*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m7, m14    ; t15[idct32]
4513*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m7, m14    ; t16[idct32]
4514*c0909341SAndroid Build Coastguard Worker    ; fall-through
; .end_sumsub_write: in m8/m9 = one sum/difference pair; reads [stklo] and
; [stkhi]; writes rows dst, dst2, dst+stride32, dst2+stride32; then advances
; stklo/dst forward and stkhi/dst2 backward by one slot/row.
; Clobbers m8-m12. Uses m13 (pw_2048), m30 (pixel max), m31 (zero).
4515*c0909341SAndroid Build Coastguard Worker.end_sumsub_write:
4516*c0909341SAndroid Build Coastguard Worker    mova                m10, [stkhiq]   ; t63-n
4517*c0909341SAndroid Build Coastguard Worker    mova                m12, [stkloq]   ; t32+n
4518*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m8, m10    ; out63-n
4519*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m10        ; out0 +n
4520*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m9, m12    ; out32+n
4521*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m12        ; out31-n (stored through dst2q, which starts at row 31 and moves up)
4522*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m13}, m11, m8, m10, m9 ; x = (x*2048 + 0x4000) >> 15, i.e. a rounded >> 4
4523*c0909341SAndroid Build Coastguard Worker    paddw                m8, [dstq] ; add the existing destination pixels
4524*c0909341SAndroid Build Coastguard Worker    paddw                m9, [dst2q]
4525*c0909341SAndroid Build Coastguard Worker    paddw               m10, [dstq+stride32q]
4526*c0909341SAndroid Build Coastguard Worker    paddw               m11, [dst2q+stride32q]
4527*c0909341SAndroid Build Coastguard Worker    REPX  {pminsw   x, m30}, m11, m8, m10, m9 ; clamp to pixel_10bpc_max
4528*c0909341SAndroid Build Coastguard Worker    REPX  {pmaxsw   x, m31}, m11, m8, m10, m9 ; clamp to >= m31 (zero)
4529*c0909341SAndroid Build Coastguard Worker    mova  [dstq           ], m8
4530*c0909341SAndroid Build Coastguard Worker    mova  [dst2q          ], m9
4531*c0909341SAndroid Build Coastguard Worker    mova  [dstq +stride32q], m10
4532*c0909341SAndroid Build Coastguard Worker    mova  [dst2q+stride32q], m11
4533*c0909341SAndroid Build Coastguard Worker    add              stkloq, mmsize ; next t32+n slot
4534*c0909341SAndroid Build Coastguard Worker    sub              stkhiq, mmsize ; next t63-n slot
4535*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq ; rows n and 32+n move down
4536*c0909341SAndroid Build Coastguard Worker    sub               dst2q, strideq ; rows 31-n and 63-n move up
4537*c0909341SAndroid Build Coastguard Worker    ret
; .pass2_fast_start: first part of pass 2 for the .fast path. Same call
; sequence as .pass2_start but using the *_fast / *_fast2 helper variants and
; only register inputs (nothing is read back from cq). m10 must already hold
; pd_2048 (the caller loads it before the call). Ends in a tail jump.
4538*c0909341SAndroid Build Coastguard Worker.pass2_fast_start:
4539*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc] ; presumably the constant base expected by the 8bpc helpers
4540*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+gprsize] ; r4 = stack scratch base; +gprsize presumably skips this routine's return address
4541*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast ; first call uses m0/m3 as left by pass 1
4542*c0909341SAndroid Build Coastguard Worker    mova                 m0, m21
4543*c0909341SAndroid Build Coastguard Worker    mova                 m3, m18
4544*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
4545*c0909341SAndroid Build Coastguard Worker    mova                 m0, m20
4546*c0909341SAndroid Build Coastguard Worker    mova                 m3, m16
4547*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
4548*c0909341SAndroid Build Coastguard Worker    mova                 m0, m26
4549*c0909341SAndroid Build Coastguard Worker    mova                 m3, m17
4550*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
4551*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
4552*c0909341SAndroid Build Coastguard Worker
4553*c0909341SAndroid Build Coastguard Worker    mova                 m0, m27 ; inputs for the 32x16 main_oddhalf_fast2 call
4554*c0909341SAndroid Build Coastguard Worker    mova                 m1, m28
4555*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
4556*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*32+gprsize], m14 ; spill m14-m21 to stack slots 32-39; .pass2_end reloads them into m0-m7
4557*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*33+gprsize], m15
4558*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*34+gprsize], m16
4559*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*35+gprsize], m17
4560*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*36+gprsize], m18
4561*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*37+gprsize], m19
4562*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*38+gprsize], m20
4563*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*39+gprsize], m21
4564*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 ; tail call
; .pass2_fast2_start: first part of pass 2 for the .fast2 path. Same shape as
; .pass2_fast_start but with the *_fast2 / *_fast3 helper variants and a single
; input register (m0) per call. m10 is set by the caller. Ends in a tail jump.
4565*c0909341SAndroid Build Coastguard Worker.pass2_fast2_start:
4566*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc] ; presumably the constant base expected by the 8bpc helpers
4567*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+gprsize] ; r4 = stack scratch base; +gprsize presumably skips this routine's return address
4568*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 ; first call uses m0 as left by the caller
4569*c0909341SAndroid Build Coastguard Worker    mova                 m0, m21
4570*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
4571*c0909341SAndroid Build Coastguard Worker    mova                 m0, m20
4572*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
4573*c0909341SAndroid Build Coastguard Worker    mova                 m0, m26
4574*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
4575*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
4576*c0909341SAndroid Build Coastguard Worker
4577*c0909341SAndroid Build Coastguard Worker    mova                 m0, m27 ; input for the 32x16 main_oddhalf_fast3 call
4578*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3
4579*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*32+gprsize], m14 ; spill m14-m21 to stack slots 32-39; .pass2_end reloads them into m0-m7
4580*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*33+gprsize], m15
4581*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*34+gprsize], m16
4582*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*35+gprsize], m17
4583*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*36+gprsize], m18
4584*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*37+gprsize], m19
4585*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*38+gprsize], m20
4586*c0909341SAndroid Build Coastguard Worker    mova        [rsp+64*39+gprsize], m21
4587*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3 ; tail call
; .dconly: DC-only shortcut, reached from the entry check when eob == 0 (before
; PROLOGUE has run). Pre-scales the DC coefficient and continues in the 32x16
; .dconly3 tail.
4588*c0909341SAndroid Build Coastguard Worker.dconly:
4589*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, stride, c, eob ; restore the argument names that .pass2_end's DEFINE_ARGS replaced
4590*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181 ; r6d = dc * 181
4591*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; eob is 0 on this path, so this clears the DC coefficient
4592*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64 ; r3d is eobd (== 0), so r3d = 64; presumably the row count used by .dconly3 (confirm)
4593*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3
; .pass1_fast: reduced pass 1 that reads only vectors 0-15 of cq directly
; (what .pass1_load_spill reads is not visible in this chunk). Inputs are
; multiplied by m12 (pd_2896, loaded by the caller) and fed to the *_fast_rect2
; helpers. Also sets r3d = 16*12 for the caller's zeroing loop. Joins .pass1_end.
4594*c0909341SAndroid Build Coastguard Worker.pass1_fast:
4595*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 0] ; vectors 0,4,8,12 -> m0-m3
4596*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128* 4]
4597*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128* 8]
4598*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*12]
4599*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 16*12 ; start offset later consumed by the zeroing loops
4600*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast_rect2
4601*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+128* 2] ; vectors 2,6,10,14 -> m16-m19
4602*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+128* 6]
4603*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m12, [cq+128*10]
4604*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m12, [cq+128*14]
4605*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast_rect2
4606*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill ; defined outside this chunk
4607*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
4608*c0909341SAndroid Build Coastguard Worker    jmp .pass1_end
; .pass1: full pass 1 over one 64-byte half of cq. Every input is multiplied
; by m12 (pd_2896, loaded by the caller) as it is read and fed to the *_rect2
; helpers. Falls through into .pass1_end.
4609*c0909341SAndroid Build Coastguard Worker.pass1:
4610*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 0] ; vectors 0,4,...,28 -> m0-m7
4611*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128* 4]
4612*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128* 8]
4613*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*12]
4614*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+128*16]
4615*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+128*20]
4616*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+128*24]
4617*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+128*28]
4618*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_rect2
4619*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+128* 2] ; vectors 2,6,...,30 -> m16-m23
4620*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+128* 6]
4621*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m12, [cq+128*10]
4622*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m12, [cq+128*14]
4623*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m12, [cq+128*18]
4624*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m12, [cq+128*22]
4625*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m12, [cq+128*26]
4626*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m12, [cq+128*30]
4627*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_rect2
4628*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill ; defined outside this chunk; presumably handles odd vectors 1-15 (confirm)
4629*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+128*17] ; odd vectors 17,19,...,31 -> m16-m23
4630*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+128*19]
4631*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m12, [cq+128*21]
4632*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m12, [cq+128*23]
4633*c0909341SAndroid Build Coastguard Worker    pmulld              m20, m12, [cq+128*25]
4634*c0909341SAndroid Build Coastguard Worker    pmulld              m21, m12, [cq+128*27]
4635*c0909341SAndroid Build Coastguard Worker    pmulld              m22, m12, [cq+128*29]
4636*c0909341SAndroid Build Coastguard Worker    pmulld              m23, m12, [cq+128*31]
4637*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2
4638*c0909341SAndroid Build Coastguard Worker.pass1_end:
4639*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
4640*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+128*8]
4641*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
4642*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m27, m0, m20  ;  0
4643*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m20      ;  1
4644*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m24, m5, m16  ; 10
4645*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m16, m5, m16  ; 11
4646*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m23, m3, m21  ;  6
4647*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m3, m21  ;  7
4648*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m25, m7, m8   ; 14
4649*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m7, m8   ; 15
4650*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m22, m15, m4  ;  2
4651*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m15, m4  ;  3
4652*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m6, m17  ; 12
4653*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m6, m17  ; 13
4654*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m28, m14, m18 ;  8
4655*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m18, m14, m18 ;  9
4656*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m2, m1   ;  4
4657*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m20, m2, m1   ;  5
4658*c0909341SAndroid Build Coastguard Worker    ret
; .pass1_load_spill: runs the shared idct16_sumsub helper, then swaps register
; contents with the coefficient buffer (row stride 128 bytes):
;   - m0-m7 are written to rows 0-7, and m23..m16 (descending) to rows 8-15;
;   - odd rows 1, 3, ..., 15 are read, multiplied by m12, and left in m0-m7.
; Loads and stores are interleaved so that each odd row is read before the
; store that overwrites it.
4659*c0909341SAndroid Build Coastguard Worker.pass1_load_spill:
4660*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
4661*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m0
4662*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 1]
4663*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m1
4664*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m2
4665*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128* 3]
4666*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128* 5]
4667*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m3
4668*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m4
4669*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128* 7]
4670*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+128* 9]
4671*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m5
4672*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m6
4673*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m7
4674*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+128*11]
4675*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+128*13]
4676*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+128*15]
    ; second half is stored in descending register order (m23 -> row 8 ... m16 -> row 15)
4677*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m23
4678*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m22
4679*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m21
4680*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m20
4681*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m19
4682*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m18
4683*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m17
4684*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m16
4685*c0909341SAndroid Build Coastguard Worker    ret
4686*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_64x16_10bpc(dst, stride, c, eob)
; Entry point and first-pass dispatch. The function is first declared with
; zero vector registers so the eob == 0 case can branch to .dconly right away;
; all other cases re-run PROLOGUE to request 32 vector registers and a
; 64*32-byte stack area.
; NOTE(review): the stack size is passed as a negative value; what that selects
; is defined by x86inc's PROLOGUE and is not visible here - confirm.
; Constants kept live for the helpers below:
;   m12 = pd_2896, m13 = pd_2048, m14 = clip_18b_min, m15 = clip_18b_max
; Dispatch on eob: < 36 -> .fast (defined outside this part of the file),
; >= 151 -> .full, anything else -> the reduced-input path that follows.
4687*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
4688*c0909341SAndroid Build Coastguard Worker%undef cmp
4689*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
4690*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4691*c0909341SAndroid Build Coastguard Worker    jz .dconly
4692*c0909341SAndroid Build Coastguard Worker
4693*c0909341SAndroid Build Coastguard Worker    PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob
4694*c0909341SAndroid Build Coastguard Worker%undef cmp
4695*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
4696*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
4697*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
4698*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
4699*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
4700*c0909341SAndroid Build Coastguard Worker    jl .fast ; 8x8
4701*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
4702*c0909341SAndroid Build Coastguard Worker    jge .full ; 16x16
    ; Reduced-input path: only rows 0-15 of the coefficient buffer (64-byte
    ; row stride) are read. r4 walks the idct64_mul_16bpc constant table and
    ; r6 walks the stack scratch area; both are advanced by .main_part1_fast.
4703*c0909341SAndroid Build Coastguard Worker    lea                  r4, [idct64_mul_16bpc]
4704*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+4*64]
4705*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 1]
4706*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*15]
4707*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
4708*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 7]
4709*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 9]
4710*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
4711*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 5]
4712*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*11]
4713*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
4714*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 3]
4715*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*13]
4716*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
4717*c0909341SAndroid Build Coastguard Worker    call .main_part2
    ; remaining even rows go through the shared 8/16/32-point helpers
4718*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0]
4719*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 8]
4720*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 4]
4721*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64*12]
4722*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast2
4723*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast2
4724*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
4725*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill
4726*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
    ; r6d is the start offset for .zero_loop in .idct64_end, which clears
    ; [cq+r6*8 ...] four 64-byte rows at a time while counting r6d down
4727*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 12*8
4728*c0909341SAndroid Build Coastguard Worker    jmp .idct64_end
; .full: same sequence as above but with all four inputs per .main_part1
; call, i.e. coefficient rows 0-31 are read.
4729*c0909341SAndroid Build Coastguard Worker.full:
4730*c0909341SAndroid Build Coastguard Worker    lea                  r4, [idct64_mul_16bpc]
4731*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+4*64]
4732*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 1]
4733*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*31]
4734*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*17]
4735*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*15]
4736*c0909341SAndroid Build Coastguard Worker    call .main_part1
4737*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 7]
4738*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*25]
4739*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*23]
4740*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 9]
4741*c0909341SAndroid Build Coastguard Worker    call .main_part1
4742*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 5]
4743*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*27]
4744*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*21]
4745*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*11]
4746*c0909341SAndroid Build Coastguard Worker    call .main_part1
4747*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 3]
4748*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*29]
4749*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*19]
4750*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*13]
4751*c0909341SAndroid Build Coastguard Worker    call .main_part1
4752*c0909341SAndroid Build Coastguard Worker    call .main_part2
4753*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0]
4754*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 8]
4755*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*16]
4756*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*24]
4757*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 4]
4758*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64*12]
4759*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64*20]
4760*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64*28]
4761*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast
4762*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast
4763*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
4764*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill
    ; .pass1_load_spill already fetched rows 2/6/10/14 into m0-m3; add rows 18-30
4765*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*18]
4766*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*22]
4767*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*26]
4768*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*30]
4769*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
    ; larger .zero_loop start offset than the reduced path: more rows were read
4770*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 28*8
4771*c0909341SAndroid Build Coastguard Worker    jmp .idct64_end
; .dconly: DC-only shortcut, taken from the function entry when eob == 0.
; The DC coefficient at [cq] is scaled in integer registers, broadcast, and
; added to every destination row; no transform is run.
; NOTE(review): .dconly1 and .dconly2 look like alternate entry points for
; other transform sizes (callers would preset r6d / r3d) - confirm against
; the rest of the file.
4772*c0909341SAndroid Build Coastguard Worker.dconly:
4773*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181
    ; eobd is zero on the path from this function's entry, so this clears the DC coefficient
4774*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
    ; r3 is the eob argument register (4th cglobal argument); with eob == 0
    ; this leaves r3d = 16, used below as the row counter
4775*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
4776*c0909341SAndroid Build Coastguard Worker.dconly1:
    ; r6d = (r6d + 640) >> 10, arithmetic shift
4777*c0909341SAndroid Build Coastguard Worker    add                 r6d, 640
4778*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 10
4779*c0909341SAndroid Build Coastguard Worker.dconly2:
    ; r6d = (r6d * 181 + 2176) >> 12, then broadcast as a 16-bit word into m2
    ; and pre-biased by m3 with signed saturation
4780*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(dconly_10bpc)]
4781*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
4782*c0909341SAndroid Build Coastguard Worker    add                 r6d, 2176
4783*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 12
4784*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m2, r6d
4785*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3
; Per row: two 64-byte vectors of 16-bit pixels are updated in place.
; The signed-saturating add of (dc + bias) followed by an unsigned-saturating
; subtract of the bias clamps the result.
; NOTE(review): the exact clamp range depends on the value of dconly_10bpc,
; which is not visible here.
4786*c0909341SAndroid Build Coastguard Worker.dconly_loop:
4787*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2, [dstq+64*0]
4788*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, [dstq+64*1]
4789*c0909341SAndroid Build Coastguard Worker    psubusw              m0, m3
4790*c0909341SAndroid Build Coastguard Worker    psubusw              m1, m3
4791*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*0], m0
4792*c0909341SAndroid Build Coastguard Worker    mova        [dstq+64*1], m1
4793*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
    ; loop while the remaining row count is still positive
4794*c0909341SAndroid Build Coastguard Worker    dec                 r3d
4795*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
4796*c0909341SAndroid Build Coastguard Worker    ret
; .pass1_load_spill (64x16 variant, row stride 64 bytes):
;   - writes m0-m7 to even rows 0, 2, ..., 14 of the coefficient buffer;
;   - writes m23..m16 (descending) to odd rows 1, 3, ..., 15;
;   - reads rows 2, 6, 10, 14 into m0-m3 for the next transform stage.
; Each of those four rows is read before the store that overwrites it.
; Unlike the 128-byte-stride helper of the same name earlier in the file,
; this one neither calls idct16_sumsub nor scales what it loads.
4797*c0909341SAndroid Build Coastguard Worker.pass1_load_spill:
4798*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m0
4799*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 2]
4800*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m1
4801*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 6]
4802*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m2
4803*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m3
4804*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*10]
4805*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*14]
4806*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m4
4807*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m5
4808*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m6
4809*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m7
4810*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m23
4811*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m22
4812*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m21
4813*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m20
4814*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m19
4815*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m18
4816*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m17
4817*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m16
4818*c0909341SAndroid Build Coastguard Worker    ret
; .main_part1 family: first half of the 64-point odd-input processing (the
; original label comment calls it "idct64 steps 1-5"). Four entry points
; share one tail (.main_part1b):
;   .main_part1_fast_rect2  pre-rounds m0/m3 ((x + m13) >> 12), then falls into
;   .main_part1_fast        two inputs only (m0, m3)
;   .main_part1_rect2       pre-rounds m0-m3, then falls into
;   .main_part1             four inputs (m0-m3)
; In:   m0-m3 = inputs, r4 = pointer into a table of dword multipliers,
;       r6 = output pointer, m13 = rounding constant, m14/m15 = clip min/max
; Out:  eight vectors stored at [r6-64*4] .. [r6+64*3];
;       r4 advanced by 12 dwords, r6 advanced by 64*8 bytes
; Writes m0-m8, m10 and m11; m9 is passed to ITX_MULSUB_2D.
; NOTE(review): m9 is presumably a scratch register of that macro - confirm.
4819*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4820*c0909341SAndroid Build Coastguard Worker.main_part1_fast_rect2:
4821*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m0, m3
4822*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m3
4823*c0909341SAndroid Build Coastguard Worker.main_part1_fast:
4824*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m0, [r4+4*0]{bcstd}    ; t63a
4825*c0909341SAndroid Build Coastguard Worker    pmulld               m0, [r4+4*1]{bcstd}        ; t32a
4826*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3, [r4+4*6]{bcstd}    ; t60a
4827*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [r4+4*7]{bcstd}        ; t35a
4828*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [r4+4*8]
4829*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [r4+4*9]
4830*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m7, m0, m4, m3
4831*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m7, m0, m4, m3
    ; with the other two inputs absent, the first add/sub stage of
    ; .main_part1 reduces to plain register copies
4832*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
4833*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
4834*c0909341SAndroid Build Coastguard Worker    mova                 m6, m3
4835*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4836*c0909341SAndroid Build Coastguard Worker    jmp .main_part1b
4837*c0909341SAndroid Build Coastguard Worker.main_part1_rect2:
4838*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m0, m1, m2, m3
4839*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m2, m3
4840*c0909341SAndroid Build Coastguard Worker.main_part1: ; idct64 steps 1-5
4841*c0909341SAndroid Build Coastguard Worker    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
4842*c0909341SAndroid Build Coastguard Worker    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
4843*c0909341SAndroid Build Coastguard Worker    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
4844*c0909341SAndroid Build Coastguard Worker    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    ; each input is multiplied by two table entries broadcast from [r4]
4845*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m0, [r4+4*0]{bcstd}    ; t63a
4846*c0909341SAndroid Build Coastguard Worker    pmulld               m0, [r4+4*1]{bcstd}        ; t32a
4847*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m1, [r4+4*2]{bcstd}    ; t62a
4848*c0909341SAndroid Build Coastguard Worker    pmulld               m1, [r4+4*3]{bcstd}        ; t33a
4849*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m2, [r4+4*4]{bcstd}    ; t61a
4850*c0909341SAndroid Build Coastguard Worker    pmulld               m2, [r4+4*5]{bcstd}        ; t34a
4851*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m3, [r4+4*6]{bcstd}    ; t60a
4852*c0909341SAndroid Build Coastguard Worker    pmulld               m3, [r4+4*7]{bcstd}        ; t35a
4853*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [r4+4*8]
4854*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [r4+4*9]
    ; round the eight products: (x + m13) >> 12
4855*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3
4856*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
4857*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m1 ; t33
4858*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; t32
4859*c0909341SAndroid Build Coastguard Worker    psubd                m1, m7, m6 ; t62
4860*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6     ; t63
4861*c0909341SAndroid Build Coastguard Worker    psubd                m6, m3, m2 ; t34
4862*c0909341SAndroid Build Coastguard Worker    paddd                m3, m2     ; t35
4863*c0909341SAndroid Build Coastguard Worker    psubd                m2, m4, m5 ; t61
4864*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5     ; t60
; .main_part1b: shared tail. Intermediate values are clamped to [m14, m15]
; before each ITX_MULSUB_2D stage and before the final stores.
4865*c0909341SAndroid Build Coastguard Worker.main_part1b:
4866*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m1, m6, m2
4867*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m1, m6, m2
4868*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 8, 5, 9, _, 13, 10, 11    ; t33a, t62a
4869*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a
4870*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m3, m7, m4
4871*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m3, m7, m4
    ; next coefficient pair for the second ITX_MULSUB_2D stage
4872*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [r4+4*10]
4873*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [r4+4*11]
4874*c0909341SAndroid Build Coastguard Worker    psubd                m5, m0, m3 ; t35a
4875*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3     ; t32a
4876*c0909341SAndroid Build Coastguard Worker    psubd                m3, m7, m4 ; t60a
4877*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4     ; t63a
4878*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m6 ; t34
4879*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6     ; t33
4880*c0909341SAndroid Build Coastguard Worker    psubd                m6, m8, m2 ; t61
4881*c0909341SAndroid Build Coastguard Worker    paddd                m8, m2     ; t62
4882*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m3, m4, m6
4883*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m3, m4, m6
4884*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 5, 2, 9, _, 13, 10, 11 ; t35,  t60
4885*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a
4886*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m7, m1, m8
4887*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m7, m1, m8
    ; this call used table entries 0-11; step to the next group of 12
4888*c0909341SAndroid Build Coastguard Worker    add                  r4, 4*12
    ; eight results go to the 64-byte slots r6-4 .. r6+3
4889*c0909341SAndroid Build Coastguard Worker    mova          [r6-64*4], m0
4890*c0909341SAndroid Build Coastguard Worker    mova          [r6+64*3], m7
4891*c0909341SAndroid Build Coastguard Worker    mova          [r6-64*3], m1
4892*c0909341SAndroid Build Coastguard Worker    mova          [r6+64*2], m8
4893*c0909341SAndroid Build Coastguard Worker    mova          [r6-64*2], m6
4894*c0909341SAndroid Build Coastguard Worker    mova          [r6+64*1], m4
4895*c0909341SAndroid Build Coastguard Worker    mova          [r6-64*1], m3
4896*c0909341SAndroid Build Coastguard Worker    mova          [r6+64*0], m5
4897*c0909341SAndroid Build Coastguard Worker    add                  r6, 64*8
4898*c0909341SAndroid Build Coastguard Worker    ret
; .main_part2: second half of the 64-point odd-input processing (the original
; label comment calls it "idct64 steps 6-9"). It works in place on the 32
; vectors written by the four preceding .main_part1* calls.
; In:   r6 = output pointer as left by the last .main_part1* call,
;       m12 = multiplier, m13 = rounding constant, m14/m15 = clip min/max
; Uses two cursors: r6 moves up by 64 bytes and r4 moves down by 64 bytes per
; iteration, and the loop runs while r6 < r4 (signed compare).
; Writes r4, r6, m0-m8, m10 and m11; m9 is passed to ITX_MULSUB_2D.
4899*c0909341SAndroid Build Coastguard Worker.main_part2: ; idct64 steps 6-9
4900*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r6+64*3]
4901*c0909341SAndroid Build Coastguard Worker    sub                  r6, 64*4
    ; coefficient pair for the ITX_MULSUB_2D calls below; same on every iteration
4902*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [pd_1567]
4903*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pd_3784]
4904*c0909341SAndroid Build Coastguard Worker.main_part2_loop:
4905*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r6-64*32] ; t32a
4906*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4-64*24] ; t39a
4907*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r4-64*32] ; t63a
4908*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r6-64*24] ; t56a
4909*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r6-64*16] ; t40a
4910*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r4-64* 8] ; t47a
4911*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4-64*16] ; t55a
4912*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r6-64* 8] ; t48a
4913*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m1 ; t39
4914*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1     ; t32
4915*c0909341SAndroid Build Coastguard Worker    psubd                m1, m2, m3 ; t56
4916*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3     ; t63
4917*c0909341SAndroid Build Coastguard Worker    psubd                m3, m5, m4 ; t40
4918*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t47
4919*c0909341SAndroid Build Coastguard Worker    psubd                m4, m7, m6 ; t55
4920*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6     ; t48
4921*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m1, m3, m4
4922*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m1, m3, m4
4923*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 8, 6, 9, _, 13, 10, 11    ; t39a, t56a
4924*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a
4925*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m0, m2, m5, m7
4926*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m0, m5, m2, m7
4927*c0909341SAndroid Build Coastguard Worker    psubd                m6, m2, m7 ; t48a
4928*c0909341SAndroid Build Coastguard Worker    paddd                m2, m7     ; t63a
4929*c0909341SAndroid Build Coastguard Worker    psubd                m7, m0, m5 ; t47a
4930*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5     ; t32a
4931*c0909341SAndroid Build Coastguard Worker    psubd                m5, m8, m4 ; t55
4932*c0909341SAndroid Build Coastguard Worker    paddd                m8, m4     ; t56
4933*c0909341SAndroid Build Coastguard Worker    psubd                m4, m1, m3 ; t40
4934*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3     ; t39
4935*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m6, m7, m5, m4
4936*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m6, m7, m5, m4
    ; final stage: scale the four difference terms by m12, add the rounding
    ; constant once per sum/difference pair, then shift right by 12
4937*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m6, m7, m5, m4
4938*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m2, m0, m8, m1
4939*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m2, m0, m8, m1
4940*c0909341SAndroid Build Coastguard Worker    paddd                m6, m13
4941*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
4942*c0909341SAndroid Build Coastguard Worker    psubd                m3, m6, m7 ; t47
4943*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7     ; t48
4944*c0909341SAndroid Build Coastguard Worker    psubd                m7, m5, m4 ; t40a
4945*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4     ; t55a
4946*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m3, m6, m7, m5
    ; write results back into the same eight slots that were loaded above
4947*c0909341SAndroid Build Coastguard Worker    mova         [r4-64* 8], m2
4948*c0909341SAndroid Build Coastguard Worker    mova         [r6-64*32], m0
4949*c0909341SAndroid Build Coastguard Worker    mova         [r6-64* 8], m8
4950*c0909341SAndroid Build Coastguard Worker    mova         [r4-64*32], m1
4951*c0909341SAndroid Build Coastguard Worker    mova         [r4-64*24], m3
4952*c0909341SAndroid Build Coastguard Worker    mova         [r6-64*16], m6
4953*c0909341SAndroid Build Coastguard Worker    mova         [r6-64*24], m7
4954*c0909341SAndroid Build Coastguard Worker    mova         [r4-64*16], m5
    ; cursors converge; stop once they meet or cross
4955*c0909341SAndroid Build Coastguard Worker    add                  r6, 64
4956*c0909341SAndroid Build Coastguard Worker    sub                  r4, 64
4957*c0909341SAndroid Build Coastguard Worker    cmp                  r6, r4
4958*c0909341SAndroid Build Coastguard Worker    jl .main_part2_loop
4959*c0909341SAndroid Build Coastguard Worker    ret
; .idct64_main_end: combines the 32-point results with the odd-half values
; stored at [r3] to form the first-pass outputs. The body is four expansions
; of IDCT64_PASS1_ENDx4 (n = 0..3).
; In:   cq and r4 = two base pointers into 128-byte-stride rows,
;       r3 = base of the 64-byte slots holding the odd-half values,
;       m14/m15 = clip min/max,
;       m11 = used both as the rounding addend (paddd) and as the per-lane
;             shift count (vpsravd); .idct64_end loads pd_2 into it
; Out:  packed 16-bit results in m0-m7 and m16-m23, plus eight packed vectors
;       stored to [r3 + k*mmsize] for k = 16..31
; Writes m24-m31 as temporaries.
;
; IDCT64_PASS1_END args:
;   %1    row index, read from [%9 + %1*128]
;   %2    register number of an input term; overwritten by the [r3+%3*64] load
;   %3/%4 64-byte slot indices relative to r3
;   %5-%8 register numbers receiving the four outputs
;   %9    base pointer register for the %1 load
4960*c0909341SAndroid Build Coastguard Worker.idct64_main_end:
4961*c0909341SAndroid Build Coastguard Worker%macro IDCT64_PASS1_END 9
4962*c0909341SAndroid Build Coastguard Worker    mova                m%5, [%9+%1*128]    ; t0+n [idct32] + idct64 rounding
4963*c0909341SAndroid Build Coastguard Worker    psubd               m%6, m%5, m%2       ; out31-n [idct32] = t31-n [idct64]
4964*c0909341SAndroid Build Coastguard Worker    paddd               m%5, m%2            ; out0+n [idct32] = t0+n [idct64]
4965*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m%6, m%5
4966*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m%6, m%5
4967*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m11}, m%6, m%5
4968*c0909341SAndroid Build Coastguard Worker    mova                m%2, [r3+%3*64]     ; t32+n [idct64]
4969*c0909341SAndroid Build Coastguard Worker    mova                m%7, [r3+%4*64]     ; t63-n [idct64]
4970*c0909341SAndroid Build Coastguard Worker    psubd               m%8, m%5, m%7       ; out63-n
4971*c0909341SAndroid Build Coastguard Worker    paddd               m%5, m%7            ; out0+n
4972*c0909341SAndroid Build Coastguard Worker    psubd               m%7, m%6, m%2       ; out32+n
4973*c0909341SAndroid Build Coastguard Worker    paddd               m%6, m%2            ; out31-n
4974*c0909341SAndroid Build Coastguard Worker    REPX   {vpsravd x, m11}, m%8, m%5, m%7, m%6
4975*c0909341SAndroid Build Coastguard Worker%endmacro
4976*c0909341SAndroid Build Coastguard Worker
; IDCT64_PASS1_ENDx4 n: four IDCT64_PASS1_END expansions for index n.
; Each pair of expansions is followed by four packssdw: two write the
; numbered registers m(%%r*) and two feed the stores to [r3 + %%m* * mmsize].
4977*c0909341SAndroid Build Coastguard Worker%macro IDCT64_PASS1_ENDx4 1
4978*c0909341SAndroid Build Coastguard Worker%assign %%m1 %1         ; t32+n
4979*c0909341SAndroid Build Coastguard Worker%assign %%m2 (7-%1)     ; t39-n
4980*c0909341SAndroid Build Coastguard Worker%assign %%m3 (8+%1)     ; t40+n
4981*c0909341SAndroid Build Coastguard Worker%assign %%m4 (15-%1)    ; t47-n
4982*c0909341SAndroid Build Coastguard Worker%assign %%m5 (16+%1)    ; t48+n
4983*c0909341SAndroid Build Coastguard Worker%assign %%m6 (23-%1)    ; t55-n
4984*c0909341SAndroid Build Coastguard Worker%assign %%m7 (24+%1)    ; t56+n
4985*c0909341SAndroid Build Coastguard Worker%assign %%m8 (31-%1)    ; t63-n
4986*c0909341SAndroid Build Coastguard Worker
4987*c0909341SAndroid Build Coastguard Worker%assign %%r1 %1         ; t16+n
4988*c0909341SAndroid Build Coastguard Worker%assign %%r2 (7-%1)     ; t23-n
4989*c0909341SAndroid Build Coastguard Worker%assign %%r3 (16+%1)    ; t24-n
4990*c0909341SAndroid Build Coastguard Worker%assign %%r4 (23-%1)    ; t31-n
4991*c0909341SAndroid Build Coastguard Worker
4992*c0909341SAndroid Build Coastguard Worker%assign %%c1 (%1)       ; t0/8+n
4993*c0909341SAndroid Build Coastguard Worker%assign %%c2 (7-%1)     ; t7/15-n
4994*c0909341SAndroid Build Coastguard Worker
4995*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_END   %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63
4996*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_END   %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48
4997*c0909341SAndroid Build Coastguard Worker    packssdw      m %+ %%r1, m24, m29
4998*c0909341SAndroid Build Coastguard Worker    packssdw      m %+ %%r4, m28, m25
4999*c0909341SAndroid Build Coastguard Worker    packssdw            m26, m31
5000*c0909341SAndroid Build Coastguard Worker    packssdw            m30, m27
5001*c0909341SAndroid Build Coastguard Worker    mova   [r3+%%m5*mmsize], m26
5002*c0909341SAndroid Build Coastguard Worker    mova   [r3+%%m8*mmsize], m30
5003*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_END   %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56
5004*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_END   %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55
5005*c0909341SAndroid Build Coastguard Worker    packssdw      m %+ %%r2, m24, m29
5006*c0909341SAndroid Build Coastguard Worker    packssdw      m %+ %%r3, m28, m25
5007*c0909341SAndroid Build Coastguard Worker    packssdw            m26, m31
5008*c0909341SAndroid Build Coastguard Worker    packssdw            m30, m27
5009*c0909341SAndroid Build Coastguard Worker    mova   [r3+%%m6*mmsize], m26
5010*c0909341SAndroid Build Coastguard Worker    mova   [r3+%%m7*mmsize], m30
5011*c0909341SAndroid Build Coastguard Worker%endmacro
5012*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_ENDx4    0
5013*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_ENDx4    1
5014*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_ENDx4    2
5015*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_ENDx4    3
5016*c0909341SAndroid Build Coastguard Worker    ret
; .idct64_end
; Finishes pass 1 through .idct64_main_end, clears the coefficient buffer and
; then runs .pass2 twice, advancing dstq by 64 bytes between the two calls.
; r6 is the counter of the zeroing loop; its initial value is set before this
; label is reached (outside this excerpt).
5017*c0909341SAndroid Build Coastguard Worker.idct64_end:
    ; m11 is broadcast from pd_2; r4 = cq+64, r3 = rsp and r5 = o_base_8bpc are
    ; set up before the call (presumably the inputs .idct64_main_end expects).
5018*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
5019*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+64]
5020*c0909341SAndroid Build Coastguard Worker    mov                  r3, rsp
5021*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
5022*c0909341SAndroid Build Coastguard Worker    call .idct64_main_end
5023*c0909341SAndroid Build Coastguard Worker
    ; Zero the input coefficients. Each iteration stores four 64-byte vectors
    ; (256 contiguous bytes) at cq+r6*8, then lowers r6 by 32, which is 256
    ; bytes once scaled by 8. The loop runs until r6 becomes negative.
5024*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5025*c0909341SAndroid Build Coastguard Worker.zero_loop:
5026*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3
5027*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 8*4
5028*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
5029*c0909341SAndroid Build Coastguard Worker
    ; First .pass2 call. r3 = 3*stride; r4 keeps the original dstq so that the
    ; second call can be positioned relative to it.
5030*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
5031*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq
5032*c0909341SAndroid Build Coastguard Worker    call .pass2
    ; Second .pass2 call: reload m0-m7 and m16-m23 from stack slots 16-31
    ; (presumably filled during pass 1 - the stores are not in this excerpt)
    ; and move dstq 64 bytes past the saved origin.
5033*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*mmsize]
5034*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+17*mmsize]
5035*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+18*mmsize]
5036*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+19*mmsize]
5037*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+20*mmsize]
5038*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+21*mmsize]
5039*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+22*mmsize]
5040*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+23*mmsize]
5041*c0909341SAndroid Build Coastguard Worker    mova                m16, [rsp+24*mmsize]
5042*c0909341SAndroid Build Coastguard Worker    mova                m17, [rsp+25*mmsize]
5043*c0909341SAndroid Build Coastguard Worker    mova                m18, [rsp+26*mmsize]
5044*c0909341SAndroid Build Coastguard Worker    mova                m19, [rsp+27*mmsize]
5045*c0909341SAndroid Build Coastguard Worker    mova                m20, [rsp+28*mmsize]
5046*c0909341SAndroid Build Coastguard Worker    mova                m21, [rsp+29*mmsize]
5047*c0909341SAndroid Build Coastguard Worker    mova                m22, [rsp+30*mmsize]
5048*c0909341SAndroid Build Coastguard Worker    mova                m23, [rsp+31*mmsize]
5049*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r4+64]
5050*c0909341SAndroid Build Coastguard Worker    call .pass2
5051*c0909341SAndroid Build Coastguard Worker    RET
; .pass2
; Second-pass front end: builds two permutation index vectors from permC,
; transposes through the 32x16 10bpc helper, regroups the result into 16
; numbered rows with 64-bit unpacks and then runs the 8bpc 32x8 .main and
; 32x16 .main_oddhalf routines. There is no ret here: execution falls through
; into .write.
5052*c0909341SAndroid Build Coastguard Worker.pass2:
    ; m12 = permC with every qword shifted right by 24 bits, m13 = m12 shifted
    ; right by a further 32 bits; presumably consumed by the transpose helper.
5053*c0909341SAndroid Build Coastguard Worker    psrlq               m12, [permC], 24    ;  0  2  8 10  1  3  9 11
5054*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
5055*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
5056*c0909341SAndroid Build Coastguard Worker
    ; Interleave low/high qwords of register pairs; the trailing number on each
    ; line is the row index the destination holds afterwards. The order of the
    ; unpacks matters: several destinations (m16, m1, m4, m18, m14, m20, m17,
    ; m21) are still needed as sources by an earlier or the same instruction.
5057*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m19, m5, m16  ; 11
5058*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m16      ; 10
5059*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m16, m2, m1   ;  5
5060*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m1       ;  4
5061*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m15, m4  ;  2
5062*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m4       ;  3
5063*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m14, m18 ;  8
5064*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m18, m14, m18 ;  9
5065*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m0, m20  ;  1
5066*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m20      ;  0
5067*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m20, m6, m17  ; 13
5068*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m17      ; 12
5069*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m3, m21  ;  7
5070*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m21      ;  6
5071*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m7, m8   ; 15
5072*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m8       ; 14
5073*c0909341SAndroid Build Coastguard Worker
    ; The column transform reuses the 8bpc routines (data is 16-bit here).
5074*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
5075*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
; .write
; Output stage shared by .pass2 (fall-through) and .pass2_fast (jmp).
; Loads the constants for the 10bpc write helpers, calls write_32x8 once and
; write_32x4 twice. The final write_32x4 is reached with a jmp (tail call), so
; its ret returns straight to the caller of .pass2 / .pass2_fast.
5076*c0909341SAndroid Build Coastguard Worker.write:
    ; m11 = pw_2048 scale factor, m12 = zero, m13 = pixel_10bpc_max
    ; (presumably the lower/upper pixel clamp used by the write helpers).
5077*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [pw_2048]
5078*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5079*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [pixel_10bpc_max]
5080*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
    ; Scale m14-m17 into m0-m3 for write_32x4. Assuming pw_2048 holds words of
    ; 2048, pmulhrsw acts as a rounding right shift by 4: (x + 8) >> 4.
5081*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m14
5082*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m15
5083*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m16
5084*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m17
5085*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    ; Same scaling for m18-m21, then tail-call the last write.
5086*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m11, m18
5087*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m11, m19
5088*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m11, m20
5089*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11, m21
5090*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
; .fast
; Reduced path that reads only the first 32 bytes (ymm loads) of coefficient
; rows 0-7. Sequence:
;   1. rows 1/5/7/3 -> .main_oddhalf_packed -> results saved in stack slots 0-15
;   2. rows 0/4/2/6 -> 32x8 10bpc .main_fast3
;   3. clear the coefficients that were read
;   4. .main_end combines both parts
;   5. .pass2_fast is run twice, with dstq advanced by 64 bytes in between
5091*c0909341SAndroid Build Coastguard Worker.fast: ; 8x8 packed
    ; m7 = movshdup of permB, used as the qword permutation index below.
5092*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
5093*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64*1]
5094*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+64*5]
5095*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64*3]
5096*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+64*7]
5097*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m7, m2 ;  1  5
5098*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m7, m3 ;  7  3
5099*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf_packed
    ; Save the odd-half results: m0-m7 go to slots 0-7 and m16-m23 to slots
    ; 8-15. .main_end reads these slots back (with a +gprsize adjustment
    ; because it is entered through a call).
5100*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 0*mmsize], m0
5101*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 1*mmsize], m1
5102*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 2*mmsize], m2
5103*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 3*mmsize], m3
5104*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 4*mmsize], m4
5105*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 5*mmsize], m5
5106*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 6*mmsize], m6
5107*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 7*mmsize], m7
5108*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 8*mmsize], m16
5109*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 9*mmsize], m17
5110*c0909341SAndroid Build Coastguard Worker    mova    [rsp+10*mmsize], m18
5111*c0909341SAndroid Build Coastguard Worker    mova    [rsp+11*mmsize], m19
5112*c0909341SAndroid Build Coastguard Worker    mova    [rsp+12*mmsize], m20
5113*c0909341SAndroid Build Coastguard Worker    mova    [rsp+13*mmsize], m21
5114*c0909341SAndroid Build Coastguard Worker    mova    [rsp+14*mmsize], m22
5115*c0909341SAndroid Build Coastguard Worker    mova    [rsp+15*mmsize], m23
5116*c0909341SAndroid Build Coastguard Worker
    ; m7 was overwritten by .main_oddhalf_packed, so the permutation index is
    ; reloaded before the even rows are gathered.
5117*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
5118*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64*0]
5119*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+64*4]
5120*c0909341SAndroid Build Coastguard Worker    mova               ym16, [cq+64*2]
5121*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*6]
5122*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m7, m5 ;  2  6
5123*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0 ;  0  0
5124*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m7, m4 ;  4  4
5125*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
5126*c0909341SAndroid Build Coastguard Worker    ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data
5127*c0909341SAndroid Build Coastguard Worker
    ; Only 32 bytes per row are cleared, matching the ymm-sized loads above.
5128*c0909341SAndroid Build Coastguard Worker    ; zero input coefs
5129*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5130*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
5131*c0909341SAndroid Build Coastguard Worker
    ; m11 = pd_2 is used by .main_end both as the rounding bias and as the
    ; per-element shift count.
5132*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
5133*c0909341SAndroid Build Coastguard Worker    call .main_end
    ; First .pass2_fast call works on m0-m7; r4 keeps the original dstq.
5134*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
5135*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq
5136*c0909341SAndroid Build Coastguard Worker    call .pass2_fast
    ; Second call: move the other eight result vectors (m24-m31) into m0-m7,
    ; advance dstq by 64 bytes and restore r5 to o_base, because .pass2_fast
    ; leaves r5 pointing at o_base_8bpc.
5137*c0909341SAndroid Build Coastguard Worker    mova                 m0, m24
5138*c0909341SAndroid Build Coastguard Worker    mova                 m1, m25
5139*c0909341SAndroid Build Coastguard Worker    mova                 m2, m26
5140*c0909341SAndroid Build Coastguard Worker    mova                 m3, m27
5141*c0909341SAndroid Build Coastguard Worker    mova                 m4, m28
5142*c0909341SAndroid Build Coastguard Worker    mova                 m5, m29
5143*c0909341SAndroid Build Coastguard Worker    mova                 m6, m30
5144*c0909341SAndroid Build Coastguard Worker    mova                 m7, m31
5145*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r4+64]
5146*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5147*c0909341SAndroid Build Coastguard Worker    call .pass2_fast
5148*c0909341SAndroid Build Coastguard Worker    RET
; .pass2_fast
; Second pass for the .fast path: transposes via the 32x8 10bpc helper, builds
; rows 0-7 with 64-bit unpacks, runs the 8bpc 32x16 .main_oddhalf_fast routine
; and jumps to .write for the output stage (.write returns to our caller).
; On return r5 points at o_base_8bpc, not o_base.
5149*c0909341SAndroid Build Coastguard Worker.pass2_fast:
5150*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
    ; Switch the constant base to the 8bpc tables only after the 10bpc
    ; transpose helper has run.
5151*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
    ; The trailing number on each line is the row index held by the destination.
5152*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m0, m2 ; 1
5153*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2     ; 0
5154*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m3, m4 ; 2
5155*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m3, m4 ; 3
5156*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m5, m7 ; 4
5157*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m16, m5, m7 ; 5
5158*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m6, m8 ; 6
5159*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m6, m8 ; 7
5160*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
5161*c0909341SAndroid Build Coastguard Worker    jmp .write
; .main_end
; Final pass-1 butterflies of the packed idct64 used by .fast. Combines the
; un-sumsub'ed dct32 data held in registers with the t32-t63 values saved in
; stack slots 0-15, rounds, shifts and packs the 32-bit results to 16 bits.
; In:  m11 = value used as both rounding bias and shift count,
;      m14/m15 = lower/upper clamp bounds (set up by the caller, not shown here),
;      stack slots 0-15 as written by .fast.
; Out: packed 16-bit results in m0-m7 and m24-m31.
; Clobbers: m9, m10, m12, m13, m16-m22.
5162*c0909341SAndroid Build Coastguard Worker.main_end:
5163*c0909341SAndroid Build Coastguard Worker
; IDCT64_PASS1_PACKED_END a, b, c, d, tmp, slot_lo, slot_hi
;   arg 1: register holding dct32 value n on entry, out0+n on exit
;   arg 2: register holding dct32 value 31-n on entry, out31-n on exit
;   arg 3: register receiving out32+n
;   arg 4: register receiving out63-n
;   arg 5: scratch register
;   arg 6: stack slot index of t32+n
;   arg 7: stack slot index of t63-n
; The stack offsets include gprsize because this code is entered through a
; call, which leaves a return address between rsp and the saved slots.
5164*c0909341SAndroid Build Coastguard Worker%macro IDCT64_PASS1_PACKED_END 7
5165*c0909341SAndroid Build Coastguard Worker    psubd               m%5, m%1, m%2       ; out31-n [idct32] = t31-n [idct64]
5166*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%2            ; out0+n [idct32] = t0+n [idct64]
    ; clamp to [m14, m15], then add the rounding bias once so that both the
    ; sums and the differences formed below inherit it
5167*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m%5, m%1
5168*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m%5, m%1
5169*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m11}, m%5, m%1
5170*c0909341SAndroid Build Coastguard Worker    mova                m%2, [rsp+%6*64+gprsize]    ; t32+n [idct64]
5171*c0909341SAndroid Build Coastguard Worker    mova                m%3, [rsp+%7*64+gprsize]    ; t63-n [idct64]
5172*c0909341SAndroid Build Coastguard Worker    psubd               m%4, m%1, m%3       ; out63-n
5173*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%3            ; out0+n
5174*c0909341SAndroid Build Coastguard Worker    psubd               m%3, m%5, m%2       ; out32+n
5175*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%5            ; out31-n
    ; arithmetic right shift by the per-element counts held in m11
5176*c0909341SAndroid Build Coastguard Worker    REPX   {vpsravd x, m11}, m%4, m%1, m%3, m%2
5177*c0909341SAndroid Build Coastguard Worker%endmacro
5178*c0909341SAndroid Build Coastguard Worker
    ; The macro is applied in pairs; after each pair the eight 32-bit result
    ; vectors are packed with signed saturation into four 16-bit vectors.
    ; m10 and m13 are the temporary "out63-n" registers of the first and second
    ; invocation of every pair, m12 is the shared scratch register.
5179*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  0, 22, 24, 10, 12, 0, 15   ; out0/1,31/30,32/33,63/62
5180*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  7,  9, 31, 13, 12, 7,  8   ; out15/14,16/17,47/46,48/49
5181*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m9
5182*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m22
5183*c0909341SAndroid Build Coastguard Worker    packssdw            m24, m13
5184*c0909341SAndroid Build Coastguard Worker    packssdw            m31, m10
5185*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  1, 21, 25, 10, 12, 1, 14   ; out3/2,28/29,35/34,60/61
5186*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  6, 16, 30, 13, 12, 6,  9   ; out12/13,19/18,44/45,51/50
5187*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m16
5188*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m21
5189*c0909341SAndroid Build Coastguard Worker    packssdw            m25, m13
5190*c0909341SAndroid Build Coastguard Worker    packssdw            m30, m10
5191*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  2, 20, 26, 10, 12, 2, 13   ; out4/5,27/26,36/37,59/58
5192*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  5, 17, 29, 13, 12, 5, 10   ; out11/10,20/21,43/42,52/53
5193*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m17
5194*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m20
5195*c0909341SAndroid Build Coastguard Worker    packssdw            m26, m13
5196*c0909341SAndroid Build Coastguard Worker    packssdw            m29, m10
5197*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  3, 19, 27, 10, 12, 3, 12   ; out7/6,24/25,39/38,56/57
5198*c0909341SAndroid Build Coastguard Worker    IDCT64_PASS1_PACKED_END  4, 18, 28, 13, 12, 4, 11   ; out8/9,23/22,40/41,55/54
5199*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m18
5200*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m19
5201*c0909341SAndroid Build Coastguard Worker    packssdw            m27, m13
5202*c0909341SAndroid Build Coastguard Worker    packssdw            m28, m10
5203*c0909341SAndroid Build Coastguard Worker    ret
; .main_oddhalf_packed_rect2 / .main_oddhalf_packed
; Odd half of the packed idct64: turns four odd input coefficients into the
; t32..t63 terms, following the numbered steps in the comments below.
; In:  m0 = in1 in5, m1 = in7 in3
;      m12 = multiplier applied in step 9
;      m13 = rounding bias added before every >>12
;      m14/m15 = lower/upper clamp bounds
;      (m12-m15 are set up by the caller; r5 must be valid for o())
; Out: m0-3 = t32-39[a], m4-7 = t40-47[a], m16-19 = t48-55[a], m20-23 = t56-63[a]
; Clobbers: m8-m11.
; The _rect2 entry first rounds m0/m1 and shifts them right by 12, then falls
; through; presumably the caller has pre-multiplied them by the rect2 scale
; factor - confirm at the call sites.
5204*c0909341SAndroid Build Coastguard Worker.main_oddhalf_packed_rect2:
5205*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m0, m1
5206*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m0, m1
5207*c0909341SAndroid Build Coastguard Worker.main_oddhalf_packed:
5208*c0909341SAndroid Build Coastguard Worker    ; m0=in1 in5, m1=in7 in3
    ; step 1-2: each input is multiplied by two 128-bit-broadcast coefficient
    ; sets, rounded with m13 and shifted right by 12
5209*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [o(pd_101_501)]
5210*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m3, [o(pd_m700_m301)]
5211*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [o(pd_4095_4065)]
5212*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(pd_4036_4085)]
5213*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m0
5214*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m1
5215*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m4
5216*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m5
5217*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m2, m3, m0, m1
5218*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m2, m3, m0, m1
5219*c0909341SAndroid Build Coastguard Worker
5220*c0909341SAndroid Build Coastguard Worker    ; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47
5221*c0909341SAndroid Build Coastguard Worker    ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49
5222*c0909341SAndroid Build Coastguard Worker    ; end of step 1-2
5223*c0909341SAndroid Build Coastguard Worker
    ; step 3-4: rotate copies of the step-2 values (ITX_MULSUB_2D is defined
    ; elsewhere in the file; m10/m11 carry its coefficients, m8/m9 are scratch)
5224*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [o(pd_401_1931)]
5225*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(pd_4076_3612)]
5226*c0909341SAndroid Build Coastguard Worker    mova                 m4, m0
5227*c0909341SAndroid Build Coastguard Worker    mova                 m5, m2
5228*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 5, 8, 9, _, 13, 10, 11
5229*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m10, [o(pd_3166_3920)]
5230*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(pd_2598_1189)]
5231*c0909341SAndroid Build Coastguard Worker    mova                 m6, m3
5232*c0909341SAndroid Build Coastguard Worker    mova                 m7, m1
5233*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         7, 6, 8, 9, _, 13, 10, 11, 2
5234*c0909341SAndroid Build Coastguard Worker
5235*c0909341SAndroid Build Coastguard Worker    ; m4=t33a t41a -> t41/42  t33/34,  m5=t63a t54a -> t61/62  t53/54
5236*c0909341SAndroid Build Coastguard Worker    ; m6=t38a t46a -> t37/38  t45/46,  m7=t57a t49a -> t57/58  t49/50
5237*c0909341SAndroid Build Coastguard Worker    ; and from earlier:
5238*c0909341SAndroid Build Coastguard Worker    ; m0=t63  t55  -> t60/63a t52/55a, m1=t56  t48  -> t56/59a t48/51a
5239*c0909341SAndroid Build Coastguard Worker    ; m2=t32  t40  -> t32/35a t40/43a, m3=t39  t47  -> t36/39a t44/47a
5240*c0909341SAndroid Build Coastguard Worker    ; end of step 3-4
5241*c0909341SAndroid Build Coastguard Worker
    ; Regroup by interleaving 64-bit halves, then duplicate every vector so
    ; that one copy can be rotated in step 5 while the other is kept as is.
5242*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m22, m2, m4     ; t32a/33 or t35a/34
5243*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m21, m3, m6     ; t36a/37 or t39a/38
5244*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m18, m2, m4     ; t40a/41 or t43a/42
5245*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m3, m6     ; t44a/45 or t47a/46
5246*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m6, m1, m7     ; t48a/49 or t51a/50
5247*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m19, m0, m5     ; t52a/53 or t55a/54
5248*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m1, m7     ; t56a/57 or t59a/58
5249*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m23, m0, m5     ; t60a/61 or t63a/62
5250*c0909341SAndroid Build Coastguard Worker    mova                 m0, m22
5251*c0909341SAndroid Build Coastguard Worker    mova                 m7, m21
5252*c0909341SAndroid Build Coastguard Worker    mova                 m3, m18
5253*c0909341SAndroid Build Coastguard Worker    mova                m16, m17
5254*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6
5255*c0909341SAndroid Build Coastguard Worker    mova                 m4, m19
5256*c0909341SAndroid Build Coastguard Worker    mova                 m2, m8
5257*c0909341SAndroid Build Coastguard Worker    mova                 m1, m23
5258*c0909341SAndroid Build Coastguard Worker    ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a]
5259*c0909341SAndroid Build Coastguard Worker
5260*c0909341SAndroid Build Coastguard Worker    ; step5
5261*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_799)]
5262*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_4017)]
5263*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         1, 22, 20, 9, _, 13, 10, 11    ; t35/34a, t60/61a
5264*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         8,  7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a
5265*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_3406)]
5266*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2276)]
5267*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        19,  3, 20, 9, _, 13, 10, 11    ; t43/42a, t52/53a
5268*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a
5269*c0909341SAndroid Build Coastguard Worker    ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a]
5270*c0909341SAndroid Build Coastguard Worker    ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a]
5271*c0909341SAndroid Build Coastguard Worker
    ; Add/sub butterflies; the clamps against m14/m15 are interleaved with the
    ; arithmetic of the following group (instruction scheduling).
5272*c0909341SAndroid Build Coastguard Worker    ; step6
5273*c0909341SAndroid Build Coastguard Worker    psubd               m20, m0, m21    ; t39/38a
5274*c0909341SAndroid Build Coastguard Worker    paddd                m0, m21        ; t32/33a
5275*c0909341SAndroid Build Coastguard Worker    psubd               m21, m1, m7     ; t36a/37
5276*c0909341SAndroid Build Coastguard Worker    paddd                m1, m7         ; t35a/34
5277*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m20, m0, m21, m1
5278*c0909341SAndroid Build Coastguard Worker    psubd                m7, m16, m18   ; t40/41a
5279*c0909341SAndroid Build Coastguard Worker    paddd               m16, m18        ; t47/46a
5280*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m20, m0, m21, m1
5281*c0909341SAndroid Build Coastguard Worker    psubd               m18, m17, m19   ; t43a/42
5282*c0909341SAndroid Build Coastguard Worker    paddd               m17, m19        ; t44a/45
5283*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m7, m16, m18, m17
5284*c0909341SAndroid Build Coastguard Worker    psubd               m19, m6, m4     ; t55/54a
5285*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4         ; t48/49a
5286*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m7, m16, m18, m17
5287*c0909341SAndroid Build Coastguard Worker    psubd                m4, m5, m3     ; t52a/53
5288*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3         ; t51a/50
5289*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m19, m6, m4, m5
5290*c0909341SAndroid Build Coastguard Worker    psubd                m3, m23, m2    ; t56/57a
5291*c0909341SAndroid Build Coastguard Worker    paddd               m23, m2         ; t63/62a
5292*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m19, m6, m4, m5
5293*c0909341SAndroid Build Coastguard Worker    psubd                m2, m22, m8    ; t59a/58
5294*c0909341SAndroid Build Coastguard Worker    paddd               m22, m8         ; t60a/61
5295*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m3, m23, m2, m22
5296*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m3, m23, m2, m22
5297*c0909341SAndroid Build Coastguard Worker    ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a]
5298*c0909341SAndroid Build Coastguard Worker    ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a]
5299*c0909341SAndroid Build Coastguard Worker
5300*c0909341SAndroid Build Coastguard Worker    ; step7
5301*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_1567)]
5302*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_3784)]
5303*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         2, 21, 8, 9, _, 13, 10, 11    ; t36/37a, t59/58a
5304*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         3, 20, 8, 9, _, 13, 10, 11    ; t39a/38, t56a/57
5305*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D        19,  7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41
5306*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2D         4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a
5307*c0909341SAndroid Build Coastguard Worker    ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a]
5308*c0909341SAndroid Build Coastguard Worker
5309*c0909341SAndroid Build Coastguard Worker    ; step8
5310*c0909341SAndroid Build Coastguard Worker    psubd                m8, m0, m16    ; t47a/46
5311*c0909341SAndroid Build Coastguard Worker    paddd                m0, m16        ; t32a/33
5312*c0909341SAndroid Build Coastguard Worker    psubd               m16, m1, m17    ; t44/45a
5313*c0909341SAndroid Build Coastguard Worker    paddd                m1, m17        ; t35/34a
5314*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m8, m0, m16, m1
5315*c0909341SAndroid Build Coastguard Worker    psubd               m17, m2, m18    ; t43a/42
5316*c0909341SAndroid Build Coastguard Worker    paddd                m2, m18        ; t36a/37
5317*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m8, m0, m16, m1
5318*c0909341SAndroid Build Coastguard Worker    psubd               m18, m3, m7     ; t40/41a
5319*c0909341SAndroid Build Coastguard Worker    paddd                m3, m7         ; t39/38a
5320*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m17, m2, m18, m3
5321*c0909341SAndroid Build Coastguard Worker    psubd                m7, m23, m6    ; t48a/49
5322*c0909341SAndroid Build Coastguard Worker    paddd               m23, m6         ; t63a/62
5323*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m17, m2, m18, m3
5324*c0909341SAndroid Build Coastguard Worker    psubd                m6, m22, m5    ; t51/50a
5325*c0909341SAndroid Build Coastguard Worker    paddd               m22, m5         ; t60/61a
5326*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m7, m23, m6, m22
5327*c0909341SAndroid Build Coastguard Worker    psubd                m5, m21, m4    ; t52a/53
5328*c0909341SAndroid Build Coastguard Worker    paddd               m21, m4         ; t59a/58
5329*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m7, m23, m6, m22
5330*c0909341SAndroid Build Coastguard Worker    psubd                m4, m20, m19   ; t55/54a
5331*c0909341SAndroid Build Coastguard Worker    paddd               m20, m19        ; t56/57a
5332*c0909341SAndroid Build Coastguard Worker    REPX    {pmaxsd x, m14}, m5, m21, m4, m20
5333*c0909341SAndroid Build Coastguard Worker    REPX    {pminsd x, m15}, m5, m21, m4, m20
5334*c0909341SAndroid Build Coastguard Worker    ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a]
5335*c0909341SAndroid Build Coastguard Worker
    ; All eight inputs are scaled by m12. The rounding bias m13 is added only
    ; to m4-m7, which is sufficient because every sum and difference below
    ; includes exactly one of those four registers.
5336*c0909341SAndroid Build Coastguard Worker    ; step9
5337*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8
5338*c0909341SAndroid Build Coastguard Worker    REPX    {paddd  x, m13}, m4, m5, m6, m7
5339*c0909341SAndroid Build Coastguard Worker    paddd               m19, m4, m18    ; t55a/54
5340*c0909341SAndroid Build Coastguard Worker    psubd                m4, m18        ; t40a/41
5341*c0909341SAndroid Build Coastguard Worker    paddd               m18, m5, m17    ; t52/53a
5342*c0909341SAndroid Build Coastguard Worker    psubd                m5, m17        ; t43/42a
5343*c0909341SAndroid Build Coastguard Worker    paddd               m17, m6, m16    ; t51a/50
5344*c0909341SAndroid Build Coastguard Worker    psubd                m6, m16        ; t44a/45
5345*c0909341SAndroid Build Coastguard Worker    paddd               m16, m7, m8     ; t48/49a
5346*c0909341SAndroid Build Coastguard Worker    psubd                m7, m8         ; t47/46a
5347*c0909341SAndroid Build Coastguard Worker    REPX    {psrad  x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7
5348*c0909341SAndroid Build Coastguard Worker    ; m4-7=t40-47[a], m16-19=t48-55[a]
5349*c0909341SAndroid Build Coastguard Worker    ret
5350*c0909341SAndroid Build Coastguard Worker
5351*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
5352*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5353*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5354*c0909341SAndroid Build Coastguard Worker    jz .dconly
5355*c0909341SAndroid Build Coastguard Worker
5356*c0909341SAndroid Build Coastguard Worker    PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob
5357*c0909341SAndroid Build Coastguard Worker%undef cmp
5358*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
5359*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
5360*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
5361*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
5362*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
5363*c0909341SAndroid Build Coastguard Worker    jl .fast
5364*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
5365*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 543
5366*c0909341SAndroid Build Coastguard Worker    jge .full
5367*c0909341SAndroid Build Coastguard Worker    call .pass1_fast ; bottomright 16x16 zero
5368*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*12
5369*c0909341SAndroid Build Coastguard Worker    jmp .lefthalf
5370*c0909341SAndroid Build Coastguard Worker.full:
5371*c0909341SAndroid Build Coastguard Worker    call .pass1
5372*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*28
5373*c0909341SAndroid Build Coastguard Worker.lefthalf:
5374*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m0
5375*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m1
5376*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m2
5377*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m3
5378*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m14
5379*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m15
5380*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m16
5381*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m17
5382*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m22
5383*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m23
5384*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m24
5385*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m25
5386*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m26
5387*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m27
5388*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m28
5389*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m29
5390*c0909341SAndroid Build Coastguard Worker    sub                  cq, 64
5391*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
5392*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
5393*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
5394*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
5395*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 16*64
5396*c0909341SAndroid Build Coastguard Worker    call .pass1
5397*c0909341SAndroid Build Coastguard Worker    add                 rsp, 16*64
5398*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
5399*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
5400*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq
5401*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5402*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5403*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r4+64]
5404*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*mmsize]
5405*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+17*mmsize]
5406*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+18*mmsize]
5407*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+19*mmsize]
5408*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+20*mmsize]
5409*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+21*mmsize]
5410*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+22*mmsize]
5411*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+23*mmsize]
5412*c0909341SAndroid Build Coastguard Worker    mova                m16, [rsp+24*mmsize]
5413*c0909341SAndroid Build Coastguard Worker    mova                m17, [rsp+25*mmsize]
5414*c0909341SAndroid Build Coastguard Worker    mova                m18, [rsp+26*mmsize]
5415*c0909341SAndroid Build Coastguard Worker    mova                m19, [rsp+27*mmsize]
5416*c0909341SAndroid Build Coastguard Worker    mova                m20, [rsp+28*mmsize]
5417*c0909341SAndroid Build Coastguard Worker    mova                m21, [rsp+29*mmsize]
5418*c0909341SAndroid Build Coastguard Worker    mova                m22, [rsp+30*mmsize]
5419*c0909341SAndroid Build Coastguard Worker    mova                m23, [rsp+31*mmsize]
5420*c0909341SAndroid Build Coastguard Worker    call .transpose
5421*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 0+64], m0
5422*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 1+64], m1
5423*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 2+64], m2
5424*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 3+64], m3
5425*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 4+64], m14
5426*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 5+64], m15
5427*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 6+64], m16
5428*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 7+64], m17
5429*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 8+64], m22
5430*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 9+64], m23
5431*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*10+64], m24
5432*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*11+64], m25
5433*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*12+64], m26
5434*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*13+64], m27
5435*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*14+64], m28
5436*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*15+64], m29
5437*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+ 0*mmsize]
5438*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+ 1*mmsize]
5439*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+ 2*mmsize]
5440*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+ 3*mmsize]
5441*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+ 4*mmsize]
5442*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+ 5*mmsize]
5443*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+ 6*mmsize]
5444*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+ 7*mmsize]
5445*c0909341SAndroid Build Coastguard Worker    mova                m16, [rsp+ 8*mmsize]
5446*c0909341SAndroid Build Coastguard Worker    mova                m17, [rsp+ 9*mmsize]
5447*c0909341SAndroid Build Coastguard Worker    mova                m18, [rsp+10*mmsize]
5448*c0909341SAndroid Build Coastguard Worker    mova                m19, [rsp+11*mmsize]
5449*c0909341SAndroid Build Coastguard Worker    mova                m20, [rsp+12*mmsize]
5450*c0909341SAndroid Build Coastguard Worker    mova                m21, [rsp+13*mmsize]
5451*c0909341SAndroid Build Coastguard Worker    mova                m22, [rsp+14*mmsize]
5452*c0909341SAndroid Build Coastguard Worker    mova                m23, [rsp+15*mmsize]
5453*c0909341SAndroid Build Coastguard Worker    call .transpose
5454*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
5455*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5456*c0909341SAndroid Build Coastguard Worker.right_zero_loop:
5457*c0909341SAndroid Build Coastguard Worker    mova [cq+r7*8+64+128*3], m12
5458*c0909341SAndroid Build Coastguard Worker    mova [cq+r7*8+64+128*2], m12
5459*c0909341SAndroid Build Coastguard Worker    mova [cq+r7*8+64+128*1], m12
5460*c0909341SAndroid Build Coastguard Worker    mova [cq+r7*8+64+128*0], m12
5461*c0909341SAndroid Build Coastguard Worker    sub                 r7d, 16*4
5462*c0909341SAndroid Build Coastguard Worker    jge .right_zero_loop
5463*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*28
5464*c0909341SAndroid Build Coastguard Worker    jmp .end
5465*c0909341SAndroid Build Coastguard Worker.fast: ; topleft 16x16 nonzero
5466*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
5467*c0909341SAndroid Build Coastguard Worker    jl .fast2
5468*c0909341SAndroid Build Coastguard Worker    call .pass1_fast
5469*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base_8bpc]
5470*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
5471*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq
5472*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5473*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5474*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r4+64]
5475*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*mmsize]
5476*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+17*mmsize]
5477*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+18*mmsize]
5478*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+19*mmsize]
5479*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+20*mmsize]
5480*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+21*mmsize]
5481*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+22*mmsize]
5482*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+23*mmsize]
5483*c0909341SAndroid Build Coastguard Worker    mova                m16, [rsp+24*mmsize]
5484*c0909341SAndroid Build Coastguard Worker    mova                m17, [rsp+25*mmsize]
5485*c0909341SAndroid Build Coastguard Worker    mova                m18, [rsp+26*mmsize]
5486*c0909341SAndroid Build Coastguard Worker    mova                m19, [rsp+27*mmsize]
5487*c0909341SAndroid Build Coastguard Worker    mova                m20, [rsp+28*mmsize]
5488*c0909341SAndroid Build Coastguard Worker    mova                m21, [rsp+29*mmsize]
5489*c0909341SAndroid Build Coastguard Worker    mova                m22, [rsp+30*mmsize]
5490*c0909341SAndroid Build Coastguard Worker    mova                m23, [rsp+31*mmsize]
5491*c0909341SAndroid Build Coastguard Worker    call .transpose
5492*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
5493*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*12
5494*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5495*c0909341SAndroid Build Coastguard Worker    jmp .end
5496*c0909341SAndroid Build Coastguard Worker.fast2: ; topleft 8x8 nonzero
5497*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
5498*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*1]
5499*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+128*5]
5500*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+128*3]
5501*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+128*7]
5502*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m7, m2 ;  1  5
5503*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m7, m3 ;  7  3
5504*c0909341SAndroid Build Coastguard Worker    REPX    {pmulld x, m12}, m0, m1
5505*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2
5506*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 0*mmsize], m0
5507*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 1*mmsize], m1
5508*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 2*mmsize], m2
5509*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 3*mmsize], m3
5510*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 4*mmsize], m4
5511*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 5*mmsize], m5
5512*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 6*mmsize], m6
5513*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 7*mmsize], m7
5514*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 8*mmsize], m16
5515*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 9*mmsize], m17
5516*c0909341SAndroid Build Coastguard Worker    mova    [rsp+10*mmsize], m18
5517*c0909341SAndroid Build Coastguard Worker    mova    [rsp+11*mmsize], m19
5518*c0909341SAndroid Build Coastguard Worker    mova    [rsp+12*mmsize], m20
5519*c0909341SAndroid Build Coastguard Worker    mova    [rsp+13*mmsize], m21
5520*c0909341SAndroid Build Coastguard Worker    mova    [rsp+14*mmsize], m22
5521*c0909341SAndroid Build Coastguard Worker    mova    [rsp+15*mmsize], m23
5522*c0909341SAndroid Build Coastguard Worker
5523*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
5524*c0909341SAndroid Build Coastguard Worker    pmulld              ym0, ym12, [cq+128*0]
5525*c0909341SAndroid Build Coastguard Worker    pmulld              ym4, ym12, [cq+128*4]
5526*c0909341SAndroid Build Coastguard Worker    mova               ym16, [cq+128*2]
5527*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+128*6]
5528*c0909341SAndroid Build Coastguard Worker    REPX    {paddd x, ym13}, ym0, ym4
5529*c0909341SAndroid Build Coastguard Worker    REPX    {psrad x, 12  }, ym0, ym4
5530*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m7, m5 ;  2  6
5531*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0 ;  0  0
5532*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m7, m4 ;  4  4
5533*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12
5534*c0909341SAndroid Build Coastguard Worker    paddd               m16, m13
5535*c0909341SAndroid Build Coastguard Worker    psrad               m16, 12
5536*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
5537*c0909341SAndroid Build Coastguard Worker
5538*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
5539*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
5540*c0909341SAndroid Build Coastguard Worker    mova    [rsp+16*mmsize], m24
5541*c0909341SAndroid Build Coastguard Worker    mova    [rsp+17*mmsize], m25
5542*c0909341SAndroid Build Coastguard Worker    mova    [rsp+18*mmsize], m26
5543*c0909341SAndroid Build Coastguard Worker    mova    [rsp+19*mmsize], m27
5544*c0909341SAndroid Build Coastguard Worker    mova    [rsp+20*mmsize], m28
5545*c0909341SAndroid Build Coastguard Worker    mova    [rsp+21*mmsize], m29
5546*c0909341SAndroid Build Coastguard Worker    mova    [rsp+22*mmsize], m30
5547*c0909341SAndroid Build Coastguard Worker    mova    [rsp+23*mmsize], m31
5548*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
5549*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
5550*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*4
5551*c0909341SAndroid Build Coastguard Worker    mov                  r4, dstq
5552*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5553*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5554*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r4+64]
5555*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+16*mmsize]
5556*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+17*mmsize]
5557*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+18*mmsize]
5558*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+19*mmsize]
5559*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+20*mmsize]
5560*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+21*mmsize]
5561*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+22*mmsize]
5562*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+23*mmsize]
5563*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5564*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
5565*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
5566*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5567*c0909341SAndroid Build Coastguard Worker.end:
5568*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
5569*c0909341SAndroid Build Coastguard Worker.zero_loop:
5570*c0909341SAndroid Build Coastguard Worker    mova    [cq+r7*8+128*3], m12
5571*c0909341SAndroid Build Coastguard Worker    mova    [cq+r7*8+128*2], m12
5572*c0909341SAndroid Build Coastguard Worker    mova    [cq+r7*8+128*1], m12
5573*c0909341SAndroid Build Coastguard Worker    mova    [cq+r7*8+128*0], m12
5574*c0909341SAndroid Build Coastguard Worker    sub                 r7d, 16*4
5575*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
5576*c0909341SAndroid Build Coastguard Worker    RET
; .dconly: DC-only shortcut. Scales the first coefficient [cq] with two
; rounded fixed-point multiplies by 181 (181/256 ~= 0.7071) and tail-jumps
; into the shared .dconly2 code of the 64x16 transform.
; NOTE(review): the branch that reaches this label is outside this view;
; presumably it is taken when eob == 0 (the following 64x64 function uses
; `test eobd, eobd` / `jz .dconly`) - confirm.
5577*c0909341SAndroid Build Coastguard Worker.dconly:
5578*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181 ; r6d = c[0] * 181
5579*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; overwrite c[0] with eob (presumably 0 here, i.e. clears it - confirm)
5580*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32 ; set bit 5 of r3d; NOTE(review): presumably the row count consumed by .dconly2 - confirm
5581*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
5582*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8 ; r6d = (c[0]*181 + 128) >> 8
5583*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
5584*c0909341SAndroid Build Coastguard Worker    add                 r6d, 384
5585*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 9 ; r6d = (r6d*181 + 384) >> 9
5586*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 ; tail-jump, no return to this function
; .pass1_fast: reduced first pass that only reads coefficient rows 0-15
; (row stride is 128 bytes from cq). Odd rows 1..15 and rows 0/4/8/12 are
; loaded here; rows 2/6/10/14 are loaded inside .pass1_load_spill.
; Every loaded row is multiplied by m12 with pmulld (dword lanes).
; NOTE(review): m12 is presumably the pd_2896 broadcast set up by the caller
; (as done before `call .pass1` in the main body) - confirm.
; Ends by jumping to .pass1_end, so it returns through the `ret` at the end
; of .transpose.
5587*c0909341SAndroid Build Coastguard Worker.pass1_fast:
5588*c0909341SAndroid Build Coastguard Worker    lea                  r4, [idct64_mul_16bpc] ; r4 -> idct64_mul_16bpc table (presumably read by main_part1 helpers)
5589*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+4*64+gprsize] ; r6 -> stack scratch; NOTE(review): gprsize presumably skips our return address
; four calls to main_part1_fast_rect2, each fed two odd rows in m0/m3
5590*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 1]
5591*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*15]
5592*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5593*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 7]
5594*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128* 9]
5595*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5596*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 5]
5597*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*11]
5598*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5599*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 3]
5600*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*13]
5601*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
5602*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
; rows 0/8 go to m0/m1 and rows 4/12 to m16/m17 for the *_fast2_rect2 helpers
5603*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 0]
5604*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128* 8]
5605*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+128* 4]
5606*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+128*12]
5607*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast2_rect2
5608*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast2_rect2
5609*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill ; spills m0-7/m16-23 to cq, loads rows 2/6/10/14 into m0-3
5610*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2
5611*c0909341SAndroid Build Coastguard Worker    jmp .pass1_end ; shared tail with .pass1
; .pass1: full first pass. Reads all 32 coefficient rows (128-byte stride
; from cq), each multiplied by m12 with pmulld: the 16 odd rows in four
; groups of four, rows 0/8/16/24 and 4/12/20/28, rows 2/6/10/14 (inside
; .pass1_load_spill) and rows 18/22/26/30.
; The main body loads m12 with the pd_2896 broadcast before calling here.
; Falls through .pass1_end into .transpose and returns from its `ret`.
5612*c0909341SAndroid Build Coastguard Worker.pass1:
5613*c0909341SAndroid Build Coastguard Worker    lea                  r4, [idct64_mul_16bpc] ; r4 -> idct64_mul_16bpc table (presumably read by main_part1 helpers)
5614*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+4*64+gprsize] ; r6 -> stack scratch; NOTE(review): gprsize presumably skips our return address
; four calls to main_part1_rect2, each fed four odd rows in m0-m3
5615*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 1]
5616*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128*31]
5617*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128*17]
5618*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*15]
5619*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5620*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 7]
5621*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128*25]
5622*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128*23]
5623*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128* 9]
5624*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5625*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 5]
5626*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128*27]
5627*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128*21]
5628*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*11]
5629*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5630*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 3]
5631*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128*29]
5632*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128*19]
5633*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*13]
5634*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
5635*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
; rows 0/8/16/24 go to m0-m3 and rows 4/12/20/28 to m16-m19 for the *_fast_rect2 helpers
5636*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 0]
5637*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128* 8]
5638*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128*16]
5639*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*24]
5640*c0909341SAndroid Build Coastguard Worker    pmulld              m16, m12, [cq+128* 4]
5641*c0909341SAndroid Build Coastguard Worker    pmulld              m17, m12, [cq+128*12]
5642*c0909341SAndroid Build Coastguard Worker    pmulld              m18, m12, [cq+128*20]
5643*c0909341SAndroid Build Coastguard Worker    pmulld              m19, m12, [cq+128*28]
5644*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast_rect2
5645*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast_rect2
5646*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill ; spills m0-7/m16-23 to cq, loads rows 2/6/10/14 into m0-3
; remaining rows 18/22/26/30 go to m4-m7 (rows 16-31 are not overwritten by the spill)
5647*c0909341SAndroid Build Coastguard Worker    pmulld               m4, m12, [cq+128*18]
5648*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m12, [cq+128*22]
5649*c0909341SAndroid Build Coastguard Worker    pmulld               m6, m12, [cq+128*26]
5650*c0909341SAndroid Build Coastguard Worker    pmulld               m7, m12, [cq+128*30]
5651*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
; .pass1_end: tail shared by .pass1 and .pass1_fast. Sets m11 to the pd_1
; broadcast and r3/r4 to rsp+gprsize and cq+8*128 before idct64_main_end
; (presumably its rounding constant and pointer arguments - confirm in the
; 64x16 function). Note that r3 is overwritten here.
5652*c0909341SAndroid Build Coastguard Worker.pass1_end:
5653*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_1)]
5654*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
5655*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+8*128]
5656*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
5657*c0909341SAndroid Build Coastguard Worker    ; transpose one half immediately, we can transpose lower half later
; .transpose: also an entry point of its own; the main body calls it after
; reloading m0-7/m16-23 from the stack. Builds the permutation vectors in
; m12/m13 from permC, calls the shared transpose_16x32 helper and then
; interleaves qwords into the 16 result registers
; m0-3, m14-17 and m22-29 (the registers the main body stores to cq).
; Overwrites m12-m15, which is why the main body re-broadcasts its constants
; after calling in. Reads m8, presumably produced by transpose_16x32.
; The trailing numbers on the punpck lines are the original output indices.
5658*c0909341SAndroid Build Coastguard Worker.transpose:
5659*c0909341SAndroid Build Coastguard Worker    ; transpose m0-7,16-23
5660*c0909341SAndroid Build Coastguard Worker    psrlq               m12, [permC], 24    ;  0  2  8 10  1  3  9 11
5661*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
5662*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
5663*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m22, m0, m20  ;  1
5664*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m20      ;  0
5665*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m24, m2, m1   ;  5
5666*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m2, m1   ;  4
5667*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m14, m18 ;  8
5668*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m14, m18 ;  9
5669*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m15, m4  ;  2
5670*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m15, m4  ;  3
5671*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m3, m21  ;  7
5672*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m3, m21  ;  6
5673*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m28, m6, m17  ; 13
5674*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m6, m17  ; 12
5675*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m5, m16  ; 11
5676*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m5, m16  ; 10
5677*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m29, m7, m8   ; 15
5678*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m17, m7, m8   ; 14
5679*c0909341SAndroid Build Coastguard Worker    ret
; .pass1_load_spill: helper shared by .pass1 and .pass1_fast. After the
; idct16_sumsub helper it spills m0-m7 to cq rows 0-7 and m23..m16 (in
; descending register order) to cq rows 8-15, reusing the coefficient buffer
; as scratch. Interleaved with the spill it loads rows 2/6/10/14, each
; multiplied by m12, into m0-m3 for the caller's next stage.
; Ordering matters: each of rows 2/6/10/14 is read before the store that
; overwrites it, and m0-m3 are stored before being reloaded.
5680*c0909341SAndroid Build Coastguard Worker.pass1_load_spill:
5681*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
5682*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m0
5683*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m1
5684*c0909341SAndroid Build Coastguard Worker    pmulld               m0, m12, [cq+128* 2] ; read row 2 before it is overwritten below
5685*c0909341SAndroid Build Coastguard Worker    pmulld               m1, m12, [cq+128* 6] ; read row 6 before it is overwritten below
5686*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m2
5687*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m3
5688*c0909341SAndroid Build Coastguard Worker    pmulld               m2, m12, [cq+128*10] ; read row 10 before it is overwritten below
5689*c0909341SAndroid Build Coastguard Worker    pmulld               m3, m12, [cq+128*14] ; read row 14 before it is overwritten below
5690*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m4
5691*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m5
5692*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m6
5693*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m7
5694*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m23
5695*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m22
5696*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m21
5697*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m20
5698*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m19
5699*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m18
5700*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m17
5701*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m16
5702*c0909341SAndroid Build Coastguard Worker    ret
5703*c0909341SAndroid Build Coastguard Worker
5704*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
5705*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5706*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5707*c0909341SAndroid Build Coastguard Worker    jz .dconly
5708*c0909341SAndroid Build Coastguard Worker
5709*c0909341SAndroid Build Coastguard Worker    PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob
5710*c0909341SAndroid Build Coastguard Worker%undef cmp
5711*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
5712*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
5713*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
5714*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
5715*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
5716*c0909341SAndroid Build Coastguard Worker    jl .fast
5717*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
5718*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 543
5719*c0909341SAndroid Build Coastguard Worker    jge .full
5720*c0909341SAndroid Build Coastguard Worker    call .pass1_fast ; bottomright 16x16 zero
5721*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*12
5722*c0909341SAndroid Build Coastguard Worker    jmp .lefthalf
5723*c0909341SAndroid Build Coastguard Worker.full:
5724*c0909341SAndroid Build Coastguard Worker    call .pass1
5725*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*28
5726*c0909341SAndroid Build Coastguard Worker.lefthalf:
5727*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m27
5728*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m14
5729*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m28
5730*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m15
5731*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m22
5732*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m23
5733*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m24
5734*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m25
5735*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m0
5736*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m26
5737*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m20
5738*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m21
5739*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m18
5740*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m16
5741*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m17
5742*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m3
5743*c0909341SAndroid Build Coastguard Worker    sub                  cq, 64
5744*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pd_2896)]
5745*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pd_2048)]
5746*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(clip_18b_min)]
5747*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(clip_18b_max)]
5748*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 16*64
5749*c0909341SAndroid Build Coastguard Worker    call .pass1
5750*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 24*64
5751*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
5752*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
5753*c0909341SAndroid Build Coastguard Worker    pxor                m31, m31
5754*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5755*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r8+64]
5756*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+56*mmsize]
5757*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+57*mmsize]
5758*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+58*mmsize]
5759*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+59*mmsize]
5760*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+60*mmsize]
5761*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+61*mmsize]
5762*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+62*mmsize]
5763*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+63*mmsize]
5764*c0909341SAndroid Build Coastguard Worker    mova                m16, [rsp+64*mmsize]
5765*c0909341SAndroid Build Coastguard Worker    mova                m17, [rsp+65*mmsize]
5766*c0909341SAndroid Build Coastguard Worker    mova                m18, [rsp+66*mmsize]
5767*c0909341SAndroid Build Coastguard Worker    mova                m19, [rsp+67*mmsize]
5768*c0909341SAndroid Build Coastguard Worker    mova                m20, [rsp+68*mmsize]
5769*c0909341SAndroid Build Coastguard Worker    mova                m21, [rsp+69*mmsize]
5770*c0909341SAndroid Build Coastguard Worker    mova                m22, [rsp+70*mmsize]
5771*c0909341SAndroid Build Coastguard Worker    mova                m23, [rsp+71*mmsize]
5772*c0909341SAndroid Build Coastguard Worker    call .transpose
5773*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 0+64], m27
5774*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 1+64], m14
5775*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 2+64], m28
5776*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 3+64], m15
5777*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 4+64], m22
5778*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 5+64], m23
5779*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 6+64], m24
5780*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 7+64], m25
5781*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 8+64], m0
5782*c0909341SAndroid Build Coastguard Worker    mova     [cq+128* 9+64], m26
5783*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*10+64], m20
5784*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*11+64], m21
5785*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*12+64], m18
5786*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*13+64], m16
5787*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*14+64], m17
5788*c0909341SAndroid Build Coastguard Worker    mova     [cq+128*15+64], m3
5789*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+40*mmsize]
5790*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+41*mmsize]
5791*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+42*mmsize]
5792*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+43*mmsize]
5793*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+44*mmsize]
5794*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+45*mmsize]
5795*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+46*mmsize]
5796*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+47*mmsize]
5797*c0909341SAndroid Build Coastguard Worker    mova                m16, [rsp+48*mmsize]
5798*c0909341SAndroid Build Coastguard Worker    mova                m17, [rsp+49*mmsize]
5799*c0909341SAndroid Build Coastguard Worker    mova                m18, [rsp+50*mmsize]
5800*c0909341SAndroid Build Coastguard Worker    mova                m19, [rsp+51*mmsize]
5801*c0909341SAndroid Build Coastguard Worker    mova                m20, [rsp+52*mmsize]
5802*c0909341SAndroid Build Coastguard Worker    mova                m21, [rsp+53*mmsize]
5803*c0909341SAndroid Build Coastguard Worker    mova                m22, [rsp+54*mmsize]
5804*c0909341SAndroid Build Coastguard Worker    mova                m23, [rsp+55*mmsize]
5805*c0909341SAndroid Build Coastguard Worker    add                 rsp, 32*64
5806*c0909341SAndroid Build Coastguard Worker    call .transpose
5807*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5808*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
5809*c0909341SAndroid Build Coastguard Worker.right_zero_loop:
5810*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3
5811*c0909341SAndroid Build Coastguard Worker    sub                 r7d, 16*4
5812*c0909341SAndroid Build Coastguard Worker    jge .right_zero_loop
5813*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*28
5814*c0909341SAndroid Build Coastguard Worker    jmp .end
5815*c0909341SAndroid Build Coastguard Worker.fast: ; topleft 16x16 nonzero
5816*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 36
5817*c0909341SAndroid Build Coastguard Worker    jl .fast2
5818*c0909341SAndroid Build Coastguard Worker    call .pass1_fast
5819*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 24*64
5820*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
5821*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
5822*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
5823*c0909341SAndroid Build Coastguard Worker    pxor                m31, m31
5824*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5825*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r8+64]
5826*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+40*mmsize]
5827*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+41*mmsize]
5828*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+42*mmsize]
5829*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+43*mmsize]
5830*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+44*mmsize]
5831*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+45*mmsize]
5832*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+46*mmsize]
5833*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+47*mmsize]
5834*c0909341SAndroid Build Coastguard Worker    mova                m16, [rsp+48*mmsize]
5835*c0909341SAndroid Build Coastguard Worker    mova                m17, [rsp+49*mmsize]
5836*c0909341SAndroid Build Coastguard Worker    mova                m18, [rsp+50*mmsize]
5837*c0909341SAndroid Build Coastguard Worker    mova                m19, [rsp+51*mmsize]
5838*c0909341SAndroid Build Coastguard Worker    mova                m20, [rsp+52*mmsize]
5839*c0909341SAndroid Build Coastguard Worker    mova                m21, [rsp+53*mmsize]
5840*c0909341SAndroid Build Coastguard Worker    mova                m22, [rsp+54*mmsize]
5841*c0909341SAndroid Build Coastguard Worker    mova                m23, [rsp+55*mmsize]
5842*c0909341SAndroid Build Coastguard Worker    add                 rsp, 16*64
5843*c0909341SAndroid Build Coastguard Worker    call .transpose
5844*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5845*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
5846*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
5847*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*12
5848*c0909341SAndroid Build Coastguard Worker    jmp .end
5849*c0909341SAndroid Build Coastguard Worker.fast2: ; topleft 8x8 nonzero
5850*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
5851*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*1]
5852*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+128*5]
5853*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+128*3]
5854*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+128*7]
5855*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m7, m2 ;  1  5
5856*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m7, m3 ;  7  3
5857*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed
5858*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 0*mmsize], m0
5859*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 1*mmsize], m1
5860*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 2*mmsize], m2
5861*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 3*mmsize], m3
5862*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 4*mmsize], m4
5863*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 5*mmsize], m5
5864*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 6*mmsize], m6
5865*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 7*mmsize], m7
5866*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 8*mmsize], m16
5867*c0909341SAndroid Build Coastguard Worker    mova    [rsp+ 9*mmsize], m17
5868*c0909341SAndroid Build Coastguard Worker    mova    [rsp+10*mmsize], m18
5869*c0909341SAndroid Build Coastguard Worker    mova    [rsp+11*mmsize], m19
5870*c0909341SAndroid Build Coastguard Worker    mova    [rsp+12*mmsize], m20
5871*c0909341SAndroid Build Coastguard Worker    mova    [rsp+13*mmsize], m21
5872*c0909341SAndroid Build Coastguard Worker    mova    [rsp+14*mmsize], m22
5873*c0909341SAndroid Build Coastguard Worker    mova    [rsp+15*mmsize], m23
5874*c0909341SAndroid Build Coastguard Worker
5875*c0909341SAndroid Build Coastguard Worker    movshdup             m7, [o(permB)]
5876*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+128*0]
5877*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+128*4]
5878*c0909341SAndroid Build Coastguard Worker    mova               ym16, [cq+128*2]
5879*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+128*6]
5880*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m7, m5 ;  2  6
5881*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m0 ;  0  0
5882*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m7, m4 ;  4  4
5883*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
5884*c0909341SAndroid Build Coastguard Worker
5885*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
5886*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
5887*c0909341SAndroid Build Coastguard Worker    sub                 rsp, 16*64
5888*c0909341SAndroid Build Coastguard Worker    mova    [rsp+40*mmsize], m24
5889*c0909341SAndroid Build Coastguard Worker    mova    [rsp+41*mmsize], m25
5890*c0909341SAndroid Build Coastguard Worker    mova    [rsp+42*mmsize], m26
5891*c0909341SAndroid Build Coastguard Worker    mova    [rsp+43*mmsize], m27
5892*c0909341SAndroid Build Coastguard Worker    mova    [rsp+44*mmsize], m28
5893*c0909341SAndroid Build Coastguard Worker    mova    [rsp+45*mmsize], m29
5894*c0909341SAndroid Build Coastguard Worker    mova    [rsp+46*mmsize], m30
5895*c0909341SAndroid Build Coastguard Worker    mova    [rsp+47*mmsize], m31
5896*c0909341SAndroid Build Coastguard Worker    call .pass2_fast2_start
5897*c0909341SAndroid Build Coastguard Worker    mov                 r7d, 16*4
5898*c0909341SAndroid Build Coastguard Worker    mov                  r8, dstq
5899*c0909341SAndroid Build Coastguard Worker    pxor                m31, m31
5900*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5901*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r8+64]
5902*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+40*mmsize]
5903*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+41*mmsize]
5904*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+42*mmsize]
5905*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+43*mmsize]
5906*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+44*mmsize]
5907*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+45*mmsize]
5908*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+46*mmsize]
5909*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+47*mmsize]
5910*c0909341SAndroid Build Coastguard Worker    add                 rsp, 8*64
5911*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5912*c0909341SAndroid Build Coastguard Worker    call .pass2_fast2_start
5913*c0909341SAndroid Build Coastguard Worker.end:
5914*c0909341SAndroid Build Coastguard Worker    pxor                m31, m31
5915*c0909341SAndroid Build Coastguard Worker.zero_loop:
5916*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3
5917*c0909341SAndroid Build Coastguard Worker    sub                 r7d, 16*4
5918*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
5919*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
5920*c0909341SAndroid Build Coastguard Worker    add                 rsp, 8*64  ; FIXME adjust stack_size_padded instead?
5921*c0909341SAndroid Build Coastguard Worker    RET
; .pass2_fast2_start: local helper, reached via `call` from the .fast2
; ("topleft 8x8 nonzero") path. It runs the 32x8 transpose_8x32 helper,
; merges the results pairwise by qword into eight registers (numbered 0-7
; in the inline comments: m27, m0, m22, m26, m14, m20, m23, m21), loads the
; pd_2048 constant into m10 and then tail-jumps into the identically named
; label of the 32x64 function, so a `ret` there returns to this helper's
; caller.
; NOTE(review): the register inputs of transpose_8x32 are not visible here;
; the second call site reloads m0-m7 from the stack right before calling,
; so presumably those are the inputs - confirm against the helper.
5922*c0909341SAndroid Build Coastguard Worker.pass2_fast2_start:
5923*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
; punpcklqdq/punpckhqdq interleave the low/high qword of every 128-bit lane
; of the two sources. m0 is both source and destination of line "1", so it
; must be consumed by line "0" first.
5924*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m27, m0, m2 ; 0
5925*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m2     ; 1
5926*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m22, m3, m4 ; 2
5927*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m3, m4 ; 3
5928*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m5, m7 ; 4
5929*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m20, m5, m7 ; 5
5930*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m23, m6, m8 ; 6
5931*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m6, m8 ; 7
5932*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)] ; every dword of m10 = pd_2048 constant
5933*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start ; tail call, no return here
; .dconly: DC-only shortcut. Multiplies the first coefficient dword by 181
; into r6d, overwrites that coefficient with eobd, ORs 64 into r3d and
; tail-jumps to the .dconly1 label of the 64x16 function, which does the
; rest of the work.
; NOTE(review): the branch that reaches this label lies outside this view.
; Presumably it is only taken when eob == 0, in which case the store below
; clears the coefficient - confirm at the function entry.
; NOTE(review): if eobd aliases r3d (as it would for a 4th cglobal
; argument), the OR leaves exactly 64 in r3d; presumably that is the row
; count .dconly1 expects - confirm.
5934*c0909341SAndroid Build Coastguard Worker.dconly:
5935*c0909341SAndroid Build Coastguard Worker    imul                r6d, [cq], 181 ; r6d = dword [cq] * 181 (181 ~= 128*sqrt(2))
5936*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; overwrite the coefficient that was just read
5937*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
5938*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1
; .pass1_fast: first-pass helper for the reduced-input case. Together with
; .pass1_load_spill it reads only the coefficient vectors at cq+128*0 ..
; cq+128*15 (one full-width vector per 128-byte step).
; Sequence:
;   1. four main_part1_fast calls, each fed one pair of odd-indexed vectors
;      in m0/m3: (1,15), (7,9), (5,11), (3,13);
;   2. main_part2;
;   3. vectors 0, 8, 4, 12 through the idct_8x16 / idct_16x16 main_fast2
;      helpers;
;   4. .pass1_load_spill (spills results to cq, reloads vectors 2/6/10/14
;      into m0-m3), then the 32x16 main_fast2 helper;
;   5. jump to .pass1_end, the tail shared with .pass1.
; r4 is pointed at the idct64_mul_16bpc table and r6 at rsp+4*64+gprsize
; before the first call.
; NOTE(review): both are presumably consumed (and possibly advanced) by the
; main_part1_fast/main_part2 helpers, and gprsize presumably skips the
; return address pushed by the `call` that got us here - confirm in the
; 64x16 code.
5939*c0909341SAndroid Build Coastguard Worker.pass1_fast:
5940*c0909341SAndroid Build Coastguard Worker    lea                  r4, [idct64_mul_16bpc]
5941*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+4*64+gprsize]
5942*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 1]
5943*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*15]
5944*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5945*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 7]
5946*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128* 9]
5947*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5948*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 5]
5949*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*11]
5950*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5951*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 3]
5952*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*13]
5953*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast
5954*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
; even-indexed inputs: vectors 0, 8 in m0/m1 and 4, 12 in m16/m17
5955*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0]
5956*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 8]
5957*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+128* 4]
5958*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+128*12]
5959*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast2
5960*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast2
5961*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill ; returns with m0-m3 = vectors 2, 6, 10, 14
5962*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
5963*c0909341SAndroid Build Coastguard Worker    jmp .pass1_end
; .pass1: first-pass helper for the general case. Together with
; .pass1_load_spill it reads all 32 coefficient vectors at cq+128*0 ..
; cq+128*31 (one full-width vector per 128-byte step).
; Sequence:
;   1. four main_part1 calls, each fed four odd-indexed vectors in m0-m3:
;      (1,31,17,15), (7,25,23,9), (5,27,21,11), (3,29,19,13);
;   2. main_part2;
;   3. vectors 0/8/16/24 (m0-m3) and 4/12/20/28 (m16-m19) through the
;      idct_8x16 / idct_16x16 main_fast helpers;
;   4. .pass1_load_spill (spills results to cq+128*0..15, reloads vectors
;      2/6/10/14 into m0-m3), then vectors 18/22/26/30 into m4-m7 and the
;      32x16 main_fast helper;
;   5. .pass1_end: broadcast pd_2 into m11, point r3 at rsp+gprsize and r4
;      at cq+8*128, call the 64x16 idct64_main_end helper.
; Execution then falls through into .transpose, whose `ret` returns to the
; caller of .pass1 / .pass1_fast.
; NOTE(review): r4 (idct64_mul_16bpc table) and r6 (rsp+4*64+gprsize) are
; presumably consumed by the main_part1/main_part2 helpers, and gprsize
; presumably skips the return address pushed by the `call` - confirm in
; the 64x16 code.
5964*c0909341SAndroid Build Coastguard Worker.pass1:
5965*c0909341SAndroid Build Coastguard Worker    lea                  r4, [idct64_mul_16bpc]
5966*c0909341SAndroid Build Coastguard Worker    lea                  r6, [rsp+4*64+gprsize]
5967*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 1]
5968*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*31]
5969*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*17]
5970*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*15]
5971*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5972*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 7]
5973*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*25]
5974*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*23]
5975*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128* 9]
5976*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5977*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 5]
5978*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*27]
5979*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*21]
5980*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*11]
5981*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5982*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 3]
5983*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128*29]
5984*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*19]
5985*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*13]
5986*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1
5987*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
; even-indexed inputs that are multiples of four
5988*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 0]
5989*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 8]
5990*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*16]
5991*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*24]
5992*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+128* 4]
5993*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+128*12]
5994*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+128*20]
5995*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+128*28]
5996*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_10bpc).main_fast
5997*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_10bpc).main_fast
5998*c0909341SAndroid Build Coastguard Worker    call .pass1_load_spill ; returns with m0-m3 = vectors 2, 6, 10, 14
; .pass1_load_spill only overwrites cq+128*0..15, so slots 18/22/26/30
; still hold their original coefficients at this point.
5999*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+128*18]
6000*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+128*22]
6001*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+128*26]
6002*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+128*30]
6003*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
; shared tail, also reached by `jmp` from .pass1_fast
6004*c0909341SAndroid Build Coastguard Worker.pass1_end:
6005*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pd_2)]
6006*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+gprsize]
6007*c0909341SAndroid Build Coastguard Worker    lea                  r4, [cq+8*128]
6008*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
6009*c0909341SAndroid Build Coastguard Worker    ; transpose one half immediately, we can transpose lower half later
; .transpose: transposes the sixteen registers m0-m7 and m16-m23. It is
; entered both by fall-through from .pass1_end and by direct `call` from
; the main function body.
; It builds two permutation vectors from permC (m12 = each qword shifted
; right by 24 bits, m13 = m12 shifted right by a further 32 bits), calls
; the 32x16 transpose_16x32 helper, then finishes with qword unpacks.
; Results, using the index written in each inline comment:
;    0: m27    1: m0     2: m22    3: m26    4: m14    5: m20
;    6: m23    7: m21    8: m28    9: m18   10: m24   11: m16
;   12: m15   13: m17   14: m25   15: m3
; The unpack order is significant: m3, m14, m15 and m20 are overwritten
; only after the earlier unpacks that still read them as sources.
; NOTE(review): how transpose_16x32 uses m12/m13 and which registers it
; leaves its output in are not visible here - confirm in the 32x16 code.
6010*c0909341SAndroid Build Coastguard Worker.transpose:
6011*c0909341SAndroid Build Coastguard Worker    ; transpose m0-7,16-23
6012*c0909341SAndroid Build Coastguard Worker    psrlq               m12, [permC], 24 ;  0  2  8 10  1  3  9 11
6013*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
6014*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
6015*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m27, m0, m20  ;  0
6016*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m20      ;  1
6017*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m24, m5, m16  ; 10
6018*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m16, m5, m16  ; 11
6019*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m23, m3, m21  ;  6
6020*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m3, m21  ;  7
6021*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m25, m7, m8   ; 14
6022*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m7, m8   ; 15
6023*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m22, m15, m4  ;  2
6024*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m15, m4  ;  3
6025*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m6, m17  ; 12
6026*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m6, m17  ; 13
6027*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m28, m14, m18 ;  8
6028*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m18, m14, m18 ;  9
6029*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m2, m1   ;  4
6030*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m20, m2, m1   ;  5
6031*c0909341SAndroid Build Coastguard Worker    ret
; .pass1_load_spill: helper shared by .pass1 and .pass1_fast. It calls the
; 32x16 idct16_sumsub helper and then uses the first sixteen 128-byte
; slots of the coefficient buffer as spill space, while fetching the next
; batch of inputs from that same buffer:
;   - m0-m7 are stored to slots 0-7;
;   - m23, m22, ..., m16 (descending) are stored to slots 8-15;
;   - the original contents of slots 2, 6, 10, 14 are returned in m0-m3.
; Each of those four inputs is loaded before the store that overwrites its
; slot, so the interleaving of loads and stores below must be kept.
6032*c0909341SAndroid Build Coastguard Worker.pass1_load_spill:
6033*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
6034*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 0], m0
6035*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 1], m1
6036*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+128* 2] ; fetch inputs 2 and 6 before their slots are reused
6037*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+128* 6]
6038*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 2], m2
6039*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 3], m3
6040*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+128*10] ; fetch inputs 10 and 14 before their slots are reused
6041*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+128*14]
6042*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 4], m4
6043*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 5], m5
6044*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 6], m6
6045*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 7], m7
6046*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 8], m23
6047*c0909341SAndroid Build Coastguard Worker    mova        [cq+128* 9], m22
6048*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*10], m21
6049*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*11], m20
6050*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*12], m19
6051*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*13], m18
6052*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*14], m17
6053*c0909341SAndroid Build Coastguard Worker    mova        [cq+128*15], m16
6054*c0909341SAndroid Build Coastguard Worker    ret
6055*c0909341SAndroid Build Coastguard Worker
6056*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
6057