xref: /aosp_15_r20/external/libdav1d/src/x86/itx_avx512.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2020-2023, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2020-2023, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
; Read-only constant pool. SECTION_RODATA is given 64, presumably the section
; alignment in bytes (macro defined outside this file - confirm in x86inc.asm).
; Every table in this group is 4 rows of 16 bytes = 64 bytes and holds byte
; values within 0..63.
; dup16_perm and int8_permA are declared through the "const" macro instead of
; a plain label; presumably this makes them visible to other object files -
; TODO confirm against x86inc.asm.
; int8_permA is also the anchor for o_base (int8_permA+64*18) used by o().
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 64
32*c0909341SAndroid Build Coastguard Workerconst \
33*c0909341SAndroid Build Coastguard Workerdup16_perm,  db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
34*c0909341SAndroid Build Coastguard Worker             db  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
35*c0909341SAndroid Build Coastguard Worker             db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
36*c0909341SAndroid Build Coastguard Worker             db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
37*c0909341SAndroid Build Coastguard Workerconst \
38*c0909341SAndroid Build Coastguard Workerint8_permA,  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
39*c0909341SAndroid Build Coastguard Worker             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
40*c0909341SAndroid Build Coastguard Worker             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
41*c0909341SAndroid Build Coastguard Worker             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
42*c0909341SAndroid Build Coastguard Workerint8_permB:  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
43*c0909341SAndroid Build Coastguard Worker             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
44*c0909341SAndroid Build Coastguard Worker             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
45*c0909341SAndroid Build Coastguard Worker             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
46*c0909341SAndroid Build Coastguard Workerint16_perm:  db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
47*c0909341SAndroid Build Coastguard Worker             db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
48*c0909341SAndroid Build Coastguard Worker             db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
49*c0909341SAndroid Build Coastguard Worker             db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
50*c0909341SAndroid Build Coastguard Workeridtx_16x4p:  db  0,  1,  4,  5, 16, 17, 20, 21,  2,  3,  6,  7, 18, 19, 22, 23
51*c0909341SAndroid Build Coastguard Worker             db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
52*c0909341SAndroid Build Coastguard Worker             db  8,  9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
53*c0909341SAndroid Build Coastguard Worker             db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
54*c0909341SAndroid Build Coastguard Workeridct_8x32p:  db 60, 61,  4,  5, 32, 33,  0,  1, 28, 29, 36, 37, 56, 57,  8,  9
55*c0909341SAndroid Build Coastguard Worker             db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
56*c0909341SAndroid Build Coastguard Worker             db 62, 63,  2,  3,  6,  7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
57*c0909341SAndroid Build Coastguard Worker             db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
58*c0909341SAndroid Build Coastguard Workeridct_16x32p: db  6,  7, 58, 59, 38, 39, 26, 27, 32, 33,  0,  1, 30, 31, 34, 35
59*c0909341SAndroid Build Coastguard Worker             db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
60*c0909341SAndroid Build Coastguard Worker             db 62, 63,  2,  3, 48, 49, 16, 17, 56, 57,  8,  9, 14, 15, 50, 51
61*c0909341SAndroid Build Coastguard Worker             db 54, 55, 10, 11, 60, 61,  4,  5, 12, 13, 52, 53, 28, 29, 36, 37
62*c0909341SAndroid Build Coastguard Workerend_16x32p:  db  0, 32,  1, 48,  2, 36,  3, 52, 16, 40, 17, 56, 18, 44, 19, 60
63*c0909341SAndroid Build Coastguard Worker             db  4, 33,  5, 49,  6, 37,  7, 53, 20, 41, 21, 57, 22, 45, 23, 61
64*c0909341SAndroid Build Coastguard Worker             db  8, 35,  9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
65*c0909341SAndroid Build Coastguard Worker             db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
66*c0909341SAndroid Build Coastguard Worker
; permA..permD: four tables of eight qwords (64 bytes) each.
; permD does double duty: the upper 32 bits of each of its qwords are also read
; as ordinary word-pair / dword constants through the pw_* and pd_m1 aliases
; (permD+4, permD+12, ... permD+60) defined further down. For example the upper
; dword of the first qword, 0x0cda0988, is the word pair (2440, 3290), and the
; upper dword of the last qword, 0xffffffff, is the dword -1.
; Do not change those upper halves without updating the aliases.
67*c0909341SAndroid Build Coastguard Worker; packed 4-bit qword shuffle indices
68*c0909341SAndroid Build Coastguard WorkerpermA:       dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
69*c0909341SAndroid Build Coastguard Worker             dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
70*c0909341SAndroid Build Coastguard Worker             dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
71*c0909341SAndroid Build Coastguard Worker             dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
72*c0909341SAndroid Build Coastguard WorkerpermB:       dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
73*c0909341SAndroid Build Coastguard Worker             dq 0xc824352d56128751, 0xd906171e74301e15
74*c0909341SAndroid Build Coastguard Worker             dq 0x6271604b03472d62, 0x735342782165b426
75*c0909341SAndroid Build Coastguard Worker             dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
76*c0909341SAndroid Build Coastguard WorkerpermC:       dq 0x9d409d041551c2e0, 0xbf62bf263773a486
77*c0909341SAndroid Build Coastguard Worker             dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
78*c0909341SAndroid Build Coastguard Worker             dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
79*c0909341SAndroid Build Coastguard Worker             dq 0x5115049dd9045b79, 0x733726bffb263d1f
80*c0909341SAndroid Build Coastguard WorkerpermD:       dq 0x0cda098800041504, 0x0edb09b2028c3726
81*c0909341SAndroid Build Coastguard Worker             dq 0x0f11fa9c01150415, 0x0988f326039d2637
82*c0909341SAndroid Build Coastguard Worker             dq 0x05640f1108269d8c, 0x05290edb0aaebfae
83*c0909341SAndroid Build Coastguard Worker             dq 0x0005000509378c9d, 0xffffffff0bbfaebf
84*c0909341SAndroid Build Coastguard Worker
; Dword index tables: pd_0to15 is the sequence 0..15; gather8a-d are 8-entry
; dword lists (gather8d contains the values 16..19, so it addresses more than
; 16 elements).
; int_shuf1-4 and deint_shuf are 16-byte tables of byte indices 0..15, always
; in (2k, 2k+1) pairs, i.e. they move whole words - presumably pshufb control
; masks; confirm at the use sites.
; int_mshift is 8 bytes; presumably the bit-offset control for the
; vpmultishiftqb path of ITX_MUL2X_PACK (flag 4) - TODO confirm where m13 is
; loaded.
85*c0909341SAndroid Build Coastguard Workerpd_0to15:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
86*c0909341SAndroid Build Coastguard Workergather8a:    dd  0,  2,  1,  3,  8, 10,  9, 11
87*c0909341SAndroid Build Coastguard Workergather8b:    dd  0,  1,  4,  5,  8,  9, 12, 13
88*c0909341SAndroid Build Coastguard Workergather8c:    dd  0,  4,  2,  6, 12,  8, 14, 10
89*c0909341SAndroid Build Coastguard Workergather8d:    dd  0, 19,  1, 18,  2, 17,  3, 16
90*c0909341SAndroid Build Coastguard Worker
91*c0909341SAndroid Build Coastguard Workerint_shuf1:   db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
92*c0909341SAndroid Build Coastguard Workerint_shuf2:   db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
93*c0909341SAndroid Build Coastguard Workerint_shuf3:   db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
94*c0909341SAndroid Build Coastguard Workerint_shuf4:   db  8,  9,  0,  1, 12, 13,  4,  5, 10, 11,  2,  3, 14, 15,  6,  7
95*c0909341SAndroid Build Coastguard Workerdeint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
96*c0909341SAndroid Build Coastguard Workerint_mshift:  db 12, 20,  0,  0, 44, 52,  0,  0
97*c0909341SAndroid Build Coastguard Worker
; Small broadcast constants. Every entry occupies exactly 4 bytes (4 bytes,
; 2 words or 1 dword), which is the element size read by the dword broadcasts
; used in this file (vpbroadcastd and the bcstd memory form).
; pw_1697x16, pw_1697x8 and pw_2896x8 store the named value already multiplied
; by 16 or 8.
98*c0909341SAndroid Build Coastguard Workerpb_32:           times 4 db 32
99*c0909341SAndroid Build Coastguard Workerpw_2048:         times 2 dw 2048
100*c0909341SAndroid Build Coastguard Workerpw_4096:         times 2 dw 4096
101*c0909341SAndroid Build Coastguard Workerpw_8192:         times 2 dw 8192
102*c0909341SAndroid Build Coastguard Workerpw_16384:        times 2 dw 16384
103*c0909341SAndroid Build Coastguard Workerpw_1697x16:      times 2 dw 1697*16
104*c0909341SAndroid Build Coastguard Workerpw_1697x8:       times 2 dw 1697*8
105*c0909341SAndroid Build Coastguard Workerpw_2896x8:       times 2 dw 2896*8
106*c0909341SAndroid Build Coastguard Workerpd_2048:         dd  2048
107*c0909341SAndroid Build Coastguard Worker
; These constants have no storage of their own: each name aliases 4 bytes
; inside permD, namely the upper dword of one of its eight qwords (byte
; offsets 4, 12, 20, ... 60). Read little-endian, those bytes match the names:
; permD+52 is the word pair (5, 5), permD+60 is the dword 0xffffffff (-1),
; permD+4 is the word pair (2440, 3290), and so on.
; Any change to the permD data must keep these dwords intact.
108*c0909341SAndroid Build Coastguard Worker%define pw_5          (permD+52)
109*c0909341SAndroid Build Coastguard Worker%define pd_m1         (permD+60)
110*c0909341SAndroid Build Coastguard Worker%define pw_3803_1321  (permD+44)
111*c0909341SAndroid Build Coastguard Worker%define pw_2482_3803  (permD+12)
112*c0909341SAndroid Build Coastguard Worker%define pw_2440_3290  (permD+ 4)
113*c0909341SAndroid Build Coastguard Worker%define pw_m3290_2440 (permD+28)
114*c0909341SAndroid Build Coastguard Worker%define pw_3857_1380  (permD+36)
115*c0909341SAndroid Build Coastguard Worker%define pw_m1380_3857 (permD+20)
116*c0909341SAndroid Build Coastguard Worker
; Hand-written word-pair constants, 4 bytes each, named pw_<first>_<second>
; with an "m" prefix marking a negative value; pd_3344 is a single dword.
; These are sign/value combinations that COEF_PAIR (below) does not generate,
; which is why they are spelled out here.
117*c0909341SAndroid Build Coastguard Workerpw_8192_m8192:   dw   8192,  -8192
118*c0909341SAndroid Build Coastguard Workerpw_m8192_8192:   dw  -8192,   8192
119*c0909341SAndroid Build Coastguard Workerpw_16384_m16384: dw  16384, -16384
120*c0909341SAndroid Build Coastguard Workerpw_m16384_16384: dw -16384,  16384
121*c0909341SAndroid Build Coastguard Worker
122*c0909341SAndroid Build Coastguard Workerpw_m1321_2482:   dw  -1321,  2482
123*c0909341SAndroid Build Coastguard Workerpw_m3344_3344:   dw  -3344,  3344
124*c0909341SAndroid Build Coastguard Workerpw_2482_3344:    dw   2482,  3344
125*c0909341SAndroid Build Coastguard Workerpw_m3803_3344:   dw  -3803,  3344
126*c0909341SAndroid Build Coastguard Workerpd_3344:         dd   3344
127*c0909341SAndroid Build Coastguard Workerpw_m1321_m3344:  dw  -1321, -3344
128*c0909341SAndroid Build Coastguard Workerpw_2896_m2896:   dw   2896, -2896
129*c0909341SAndroid Build Coastguard Worker
130*c0909341SAndroid Build Coastguard Workerpw_1567_m3784:   dw   1567, -3784
131*c0909341SAndroid Build Coastguard Workerpw_3784_m1567:   dw   3784, -1567
132*c0909341SAndroid Build Coastguard Workerpw_4017_m799:    dw   4017,  -799
133*c0909341SAndroid Build Coastguard Workerpw_2276_m3406:   dw   2276, -3406
134*c0909341SAndroid Build Coastguard Workerpw_m799_m4017:   dw   -799, -4017
135*c0909341SAndroid Build Coastguard Workerpw_m3406_m2276:  dw  -3406, -2276
136*c0909341SAndroid Build Coastguard Worker
; COEF_PAIR a, b [, emit_neg = 0]
; Emits labelled word pairs for the coefficient pair (a, b):
;   pw_<a>_<b>    = ( a,  b)
;   pw_m<b>_<a>   = (-b,  a)
;   pw_m<a>_m<b>  = (-a, -b)   only when the third argument is non-zero
; These are the label names that the ITX_MUL* macros construct from their
; numeric coefficient arguments.
137*c0909341SAndroid Build Coastguard Worker%macro COEF_PAIR 2-3 0
138*c0909341SAndroid Build Coastguard Workerpw_%1_%2:   dw  %1,  %2
139*c0909341SAndroid Build Coastguard Workerpw_m%2_%1:  dw -%2,  %1
140*c0909341SAndroid Build Coastguard Worker%if %3
141*c0909341SAndroid Build Coastguard Workerpw_m%1_m%2: dw -%1, -%2
142*c0909341SAndroid Build Coastguard Worker%endif
143*c0909341SAndroid Build Coastguard Worker%endmacro
144*c0909341SAndroid Build Coastguard Worker
; Instantiate the coefficient pairs. Each line yields pw_<a>_<b> and
; pw_m<b>_<a>; a trailing ", 1" additionally yields pw_m<a>_m<b>.
145*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 2896, 2896
146*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 1567, 3784, 1
147*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3784, 1567
148*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  201, 4091
149*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  995, 3973
150*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 1751, 3703
151*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3035, 2751
152*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3513, 2106
153*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 4052,  601
154*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3166, 2598, 1
155*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3920, 1189, 1
156*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 2276, 3406
157*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 4017,  799
158*c0909341SAndroid Build Coastguard Worker
; COEF_X8 c1 [, c2, ...]
; For each argument, in order, emits two words that both equal the argument
; multiplied by 8 (4 bytes per argument). The rotate step advances to the next
; argument on every repetition. No label is emitted; the caller supplies one.
159*c0909341SAndroid Build Coastguard Worker%macro COEF_X8 1-*
160*c0909341SAndroid Build Coastguard Worker%rep %0
161*c0909341SAndroid Build Coastguard Worker    dw %1*8, %1*8
162*c0909341SAndroid Build Coastguard Worker    %rotate 1
163*c0909341SAndroid Build Coastguard Worker%endrep
164*c0909341SAndroid Build Coastguard Worker%endmacro
165*c0909341SAndroid Build Coastguard Worker
; Single coefficients multiplied by 8 and duplicated into a word pair, one
; 4-byte entry per label. An "m" in the label name marks a negative value.
166*c0909341SAndroid Build Coastguard Workerpw_m2276x8: COEF_X8 -2276
167*c0909341SAndroid Build Coastguard Workerpw_3406x8:  COEF_X8  3406
168*c0909341SAndroid Build Coastguard Workerpw_4017x8:  COEF_X8  4017
169*c0909341SAndroid Build Coastguard Workerpw_799x8:   COEF_X8   799
170*c0909341SAndroid Build Coastguard Workerpw_3784x8:  COEF_X8  3784
171*c0909341SAndroid Build Coastguard Workerpw_1567x8:  COEF_X8  1567
172*c0909341SAndroid Build Coastguard Worker
173*c0909341SAndroid Build Coastguard Workerpw_4076x8:  COEF_X8  4076
174*c0909341SAndroid Build Coastguard Workerpw_401x8:   COEF_X8   401
175*c0909341SAndroid Build Coastguard Workerpw_m2598x8: COEF_X8 -2598
176*c0909341SAndroid Build Coastguard Workerpw_3166x8:  COEF_X8  3166
177*c0909341SAndroid Build Coastguard Workerpw_3612x8:  COEF_X8  3612
178*c0909341SAndroid Build Coastguard Workerpw_1931x8:  COEF_X8  1931
179*c0909341SAndroid Build Coastguard Workerpw_m1189x8: COEF_X8 -1189
180*c0909341SAndroid Build Coastguard Workerpw_3920x8:  COEF_X8  3920
181*c0909341SAndroid Build Coastguard Worker
182*c0909341SAndroid Build Coastguard Workerpw_4091x8:  COEF_X8  4091
183*c0909341SAndroid Build Coastguard Workerpw_201x8:   COEF_X8   201
184*c0909341SAndroid Build Coastguard Workerpw_m2751x8: COEF_X8 -2751
185*c0909341SAndroid Build Coastguard Workerpw_3035x8:  COEF_X8  3035
186*c0909341SAndroid Build Coastguard Workerpw_3703x8:  COEF_X8  3703
187*c0909341SAndroid Build Coastguard Workerpw_1751x8:  COEF_X8  1751
188*c0909341SAndroid Build Coastguard Workerpw_m1380x8: COEF_X8 -1380
189*c0909341SAndroid Build Coastguard Workerpw_3857x8:  COEF_X8  3857
190*c0909341SAndroid Build Coastguard Workerpw_3973x8:  COEF_X8  3973
191*c0909341SAndroid Build Coastguard Workerpw_995x8:   COEF_X8   995
192*c0909341SAndroid Build Coastguard Workerpw_m2106x8: COEF_X8 -2106
193*c0909341SAndroid Build Coastguard Workerpw_3513x8:  COEF_X8  3513
194*c0909341SAndroid Build Coastguard Workerpw_3290x8:  COEF_X8  3290
195*c0909341SAndroid Build Coastguard Workerpw_2440x8:  COEF_X8  2440
196*c0909341SAndroid Build Coastguard Workerpw_m601x8:  COEF_X8  -601
197*c0909341SAndroid Build Coastguard Workerpw_4052x8:  COEF_X8  4052
198*c0909341SAndroid Build Coastguard Worker
; Word pairs made of two different coefficients, each multiplied by 8
; (4 bytes per label). They are written out with dw because COEF_X8 can only
; duplicate one value into both words.
199*c0909341SAndroid Build Coastguard Workerpw_401_4076x8:   dw   401*8, 4076*8
200*c0909341SAndroid Build Coastguard Workerpw_m2598_3166x8: dw -2598*8, 3166*8
201*c0909341SAndroid Build Coastguard Workerpw_1931_3612x8:  dw  1931*8, 3612*8
202*c0909341SAndroid Build Coastguard Workerpw_m1189_3920x8: dw -1189*8, 3920*8
203*c0909341SAndroid Build Coastguard Workerpw_799_4017x8:   dw   799*8, 4017*8
204*c0909341SAndroid Build Coastguard Workerpw_m2276_3406x8: dw -2276*8, 3406*8
205*c0909341SAndroid Build Coastguard Worker
206*c0909341SAndroid Build Coastguard Workerpw_201_4091x8:   dw   201*8, 4091*8
207*c0909341SAndroid Build Coastguard Workerpw_m601_4052x8:  dw  -601*8, 4052*8
208*c0909341SAndroid Build Coastguard Workerpw_995_3973x8:   dw   995*8, 3973*8
209*c0909341SAndroid Build Coastguard Workerpw_m1380_3857x8: dw -1380*8, 3857*8
210*c0909341SAndroid Build Coastguard Workerpw_1751_3703x8:  dw  1751*8, 3703*8
211*c0909341SAndroid Build Coastguard Workerpw_m2106_3513x8: dw -2106*8, 3513*8
212*c0909341SAndroid Build Coastguard Workerpw_2440_3290x8:  dw  2440*8, 3290*8
213*c0909341SAndroid Build Coastguard Workerpw_m2751_3035x8: dw -2751*8, 3035*8
214*c0909341SAndroid Build Coastguard Worker
215*c0909341SAndroid Build Coastguard Workerpw_101_4095x8:   dw   101*8, 4095*8
216*c0909341SAndroid Build Coastguard Workerpw_m2824_2967x8: dw -2824*8, 2967*8
217*c0909341SAndroid Build Coastguard Workerpw_1660_3745x8:  dw  1660*8, 3745*8
218*c0909341SAndroid Build Coastguard Workerpw_m1474_3822x8: dw -1474*8, 3822*8
219*c0909341SAndroid Build Coastguard Workerpw_897_3996x8:   dw   897*8, 3996*8
220*c0909341SAndroid Build Coastguard Workerpw_m2191_3461x8: dw -2191*8, 3461*8
221*c0909341SAndroid Build Coastguard Workerpw_2359_3349x8:  dw  2359*8, 3349*8
222*c0909341SAndroid Build Coastguard Workerpw_m700_4036x8:  dw  -700*8, 4036*8
223*c0909341SAndroid Build Coastguard Workerpw_501_4065x8:   dw   501*8, 4065*8
224*c0909341SAndroid Build Coastguard Workerpw_m2520_3229x8: dw -2520*8, 3229*8
225*c0909341SAndroid Build Coastguard Workerpw_2019_3564x8:  dw  2019*8, 3564*8
226*c0909341SAndroid Build Coastguard Workerpw_m1092_3948x8: dw -1092*8, 3948*8
227*c0909341SAndroid Build Coastguard Workerpw_1285_3889x8:  dw  1285*8, 3889*8
228*c0909341SAndroid Build Coastguard Workerpw_m1842_3659x8: dw -1842*8, 3659*8
229*c0909341SAndroid Build Coastguard Workerpw_2675_3102x8:  dw  2675*8, 3102*8
230*c0909341SAndroid Build Coastguard Workerpw_m301_4085x8:  dw  -301*8, 4085*8
231*c0909341SAndroid Build Coastguard Worker
; idct64_mul: one contiguous table made of 4 groups of 52 bytes (208 bytes).
; Every group is 8 COEF_X8 entries (32 bytes) followed by 5 word pairs
; (20 bytes). In groups 1 and 3 the pairs come from COEF_PAIR, which also
; defines the usual pw_* labels inside the table; in groups 2 and 4 they are
; a raw, unlabelled dw line.
; The entries are presumably addressed by byte offset from idct64_mul, so the
; order and size of every line matter - TODO confirm at the use sites before
; touching this table.
232*c0909341SAndroid Build Coastguard Workeridct64_mul: COEF_X8  4095,   101,  2967, -2824,  3745,  1660,  3822, -1474
233*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  401, 4076, 1
234*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  799, 4017
235*c0909341SAndroid Build Coastguard Worker            COEF_X8  -700,  4036,  2359,  3349, -2191,  3461,   897,  3996
236*c0909341SAndroid Build Coastguard Workerdw    -2598, -3166,  3166, -2598,  2598,  3166, -4017,  -799,   799, -4017
237*c0909341SAndroid Build Coastguard Worker            COEF_X8  4065,   501,  3229, -2520,  3564,  2019,  3948, -1092
238*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 1931, 3612, 1
239*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3406, 2276
240*c0909341SAndroid Build Coastguard Worker            COEF_X8  -301,  4085,  2675,  3102, -1842,  3659,  1285,  3889
241*c0909341SAndroid Build Coastguard Workerdw    -1189, -3920,  3920, -1189,  1189,  3920, -2276, -3406,  3406, -2276
242*c0909341SAndroid Build Coastguard Worker
; Constant addressing helpers for the code section.
; o_base is a fixed anchor 64*18 bytes past int8_permA. o(x) addresses the
; constant x as r5 plus the assemble/link-time difference (x - o_base), so r5
; must hold the address of o_base; INV_TXFM_FN loads it with
; "lea baseq, [o_base]" (base is the sixth named register of its cglobal line,
; presumably r5 under x86inc naming - confirm).
; The anchor is presumably placed inside the constant pool so that more
; displacements fit a short signed encoding - TODO confirm.
; m(x) builds the mangled symbol for x from private_prefix, x and SUFFIX.
243*c0909341SAndroid Build Coastguard WorkerSECTION .text
244*c0909341SAndroid Build Coastguard Worker
245*c0909341SAndroid Build Coastguard Worker%define o_base int8_permA+64*18
246*c0909341SAndroid Build Coastguard Worker%define o(x) (r5 - (o_base) + (x))
247*c0909341SAndroid Build Coastguard Worker%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
248*c0909341SAndroid Build Coastguard Worker
; ITX_MUL2X_PACK dst/src, tmp1, tmp2, rnd, coef1, coef2 [, flags = 0]
; Computes two word-pair dot products of dst/src with vpdpwssd, each
; accumulated onto a fresh copy of the rnd register:
;   tmp1 = rnd + dot(dst/src, first pair)
;   tmp2 = rnd + dot(dst/src, second pair)
; Where the coefficient pairs come from (first matching case wins):
;   flag 16        : first pair is the memory constant pw_<coef1> (broadcast);
;                    the second pair is pw_<coef2> from memory if flag 32 is
;                    also set, otherwise the register numbered coef2
;   flag 32 only   : first pair is the register numbered coef1, second pair is
;                    the memory constant pw_<coef2>
;   coef2 below 32 : coef1 and coef2 are both register numbers
;   otherwise      : coef1 and coef2 are coefficient values and the pairs are
;                    the memory constants pw_<coef1>_<coef2> and
;                    pw_m<coef2>_<coef1>; flag 1 (swap) gives the former to
;                    tmp1, without it the latter goes to tmp1
; Output stage:
;   flag 2  : shift-based interleave - tmp1 is shifted right by 12, tmp2 left
;             by 4, and vpshrdd by 16 merges them into dst
;   flag 4  : dst = tmp1 shifted left by 4, then vpmultishiftqb under opmask k7
;             with control register m13 merges in tmp2; k7 and m13 must have
;             been set up by the caller
;   neither : both are arithmetically shifted right by 12 and packed into dst
;             with packssdw (tmp2 first), unless flag 8 (no_pack) is set, in
;             which case the results are left in tmp1/tmp2 and dst is untouched
; tmp1 and tmp2 are always clobbered.
249*c0909341SAndroid Build Coastguard Worker; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
250*c0909341SAndroid Build Coastguard Worker;        16 = special_mul1, 32 = special_mul2
251*c0909341SAndroid Build Coastguard Worker%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
252*c0909341SAndroid Build Coastguard Worker    mova                m%2, m%4
253*c0909341SAndroid Build Coastguard Worker%if %7 & 16
254*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%2, m%1, [o(pw_%5)] {bcstd}
255*c0909341SAndroid Build Coastguard Worker    mova                m%3, m%4
256*c0909341SAndroid Build Coastguard Worker%if %7 & 32
257*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
258*c0909341SAndroid Build Coastguard Worker%else
259*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%3, m%1, m%6
260*c0909341SAndroid Build Coastguard Worker%endif
261*c0909341SAndroid Build Coastguard Worker%elif %7 & 32
262*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%2, m%1, m%5
263*c0909341SAndroid Build Coastguard Worker    mova                m%3, m%4
264*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
265*c0909341SAndroid Build Coastguard Worker%elif %6 < 32
266*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%2, m%1, m%5
267*c0909341SAndroid Build Coastguard Worker    mova                m%3, m%4
268*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%3, m%1, m%6
269*c0909341SAndroid Build Coastguard Worker%elif %7 & 1
270*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%2, m%1, [o(pw_%5_%6)] {bcstd}
271*c0909341SAndroid Build Coastguard Worker    mova                m%3, m%4
272*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%3, m%1, [o(pw_m%6_%5)] {bcstd}
273*c0909341SAndroid Build Coastguard Worker%else
274*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%2, m%1, [o(pw_m%6_%5)] {bcstd}
275*c0909341SAndroid Build Coastguard Worker    mova                m%3, m%4
276*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%3, m%1, [o(pw_%5_%6)] {bcstd}
277*c0909341SAndroid Build Coastguard Worker%endif
278*c0909341SAndroid Build Coastguard Worker%if %7 & 2
279*c0909341SAndroid Build Coastguard Worker    psrld               m%2, 12
280*c0909341SAndroid Build Coastguard Worker    pslld               m%3, 4
281*c0909341SAndroid Build Coastguard Worker    vpshrdd             m%1, m%3, m%2, 16
282*c0909341SAndroid Build Coastguard Worker%elif %7 & 4
283*c0909341SAndroid Build Coastguard Worker    ; compared to using shifts (as above) this has better throughput,
284*c0909341SAndroid Build Coastguard Worker    ; but worse latency and requires setting up the opmask/index
285*c0909341SAndroid Build Coastguard Worker    ; registers, so only use this method for the larger transforms
286*c0909341SAndroid Build Coastguard Worker    pslld               m%1, m%2, 4
287*c0909341SAndroid Build Coastguard Worker    vpmultishiftqb  m%1{k7}, m13, m%3
288*c0909341SAndroid Build Coastguard Worker%else
289*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 12
290*c0909341SAndroid Build Coastguard Worker    psrad               m%3, 12
291*c0909341SAndroid Build Coastguard Worker%if %7 & 8 == 0
292*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%3, m%2
293*c0909341SAndroid Build Coastguard Worker%endif
294*c0909341SAndroid Build Coastguard Worker%endif
295*c0909341SAndroid Build Coastguard Worker%endmacro
296*c0909341SAndroid Build Coastguard Worker
; ITX_MUL4X_PACK dst/src, tmp1, tmp2, coef_tmp1, coef_tmp2, rnd,
;                coef1, coef2, coef3, coef4 [, flags = 0]
; Variant of ITX_MUL2X_PACK that applies two different coefficient pairs,
; chosen per dword lane by opmask k1 (which the caller must have set up).
; coef_tmp1 and coef_tmp2 are first filled with the broadcast constants built
; from coef3/coef4; the lanes enabled in k1 are then overwritten with the
; constants built from coef1/coef2. Both registers are clobbered.
; Flag 1 selects which constant type goes into which register: with it,
; coef_tmp1 receives the pw_<a>_<b> form and coef_tmp2 the pw_m<b>_<a> form;
; without it the two are exchanged.
; The two registers are finally passed to ITX_MUL2X_PACK as its coef1/coef2
; together with the unchanged flags, so that macro takes its register-operand
; path as long as flags 16 and 32 are clear.
297*c0909341SAndroid Build Coastguard Worker; flags: same as ITX_MUL2X_PACK
298*c0909341SAndroid Build Coastguard Worker%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
299*c0909341SAndroid Build Coastguard Worker%if %11 & 1
300*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%4, [o(pw_%9_%10)]
301*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m%4{k1}, [o(pw_%7_%8)]
302*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%5, [o(pw_m%10_%9)]
303*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m%5{k1}, [o(pw_m%8_%7)]
304*c0909341SAndroid Build Coastguard Worker%else
305*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%4, [o(pw_m%10_%9)]
306*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m%4{k1}, [o(pw_m%8_%7)]
307*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%5, [o(pw_%9_%10)]
308*c0909341SAndroid Build Coastguard Worker    vpbroadcastd    m%5{k1}, [o(pw_%7_%8)]
309*c0909341SAndroid Build Coastguard Worker%endif
310*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       %1, %2, %3, %6, %4, %5, %11
311*c0909341SAndroid Build Coastguard Worker%endmacro
312*c0909341SAndroid Build Coastguard Worker
; ITX_MULSUB_2W src1, src2, tmp1, tmp2, rnd, coef1, coef2 [, dst2]
; Rotation of two registers of packed words, as given by the formulas below.
; src2 and src1 are interleaved word by word (punpcklwd/punpckhwd, with the
; src2 word in the low half of each dword); each half is then fed to vpdpwssd
; against a coefficient pair, starting from a copy of rnd.
;   coef2 below 32 : coef1/coef2 are register numbers; the register numbered
;                    coef2 is the multiplier for dst1 and the one numbered
;                    coef1 the multiplier for dst2
;   otherwise      : coefficient values; pw_m<coef2>_<coef1> (dst1) and
;                    pw_<coef1>_<coef2> (dst2) are broadcast from memory
; The 32-bit sums are shifted right arithmetically by 12 and packed back to
; words with signed saturation.
; dst1 overwrites src1. dst2 overwrites src2 unless the optional 8th argument
; names another register; src2 is clobbered in either case, as are tmp1/tmp2.
313*c0909341SAndroid Build Coastguard Worker; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
314*c0909341SAndroid Build Coastguard Worker; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
315*c0909341SAndroid Build Coastguard Worker%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
316*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%3, m%2, m%1
317*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%2, m%1
318*c0909341SAndroid Build Coastguard Worker%if %7 < 32
319*c0909341SAndroid Build Coastguard Worker    mova                m%1, m%5
320*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%1, m%3, m%7
321*c0909341SAndroid Build Coastguard Worker    mova                m%4, m%5
322*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%4, m%2, m%7
323*c0909341SAndroid Build Coastguard Worker%else
324*c0909341SAndroid Build Coastguard Worker    mova                m%1, m%5
325*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%1, m%3, [o(pw_m%7_%6)] {bcstd}
326*c0909341SAndroid Build Coastguard Worker    mova                m%4, m%5
327*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%4, m%2, [o(pw_m%7_%6)] {bcstd}
328*c0909341SAndroid Build Coastguard Worker%endif
329*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 12
330*c0909341SAndroid Build Coastguard Worker    psrad               m%4, 12
331*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%4
332*c0909341SAndroid Build Coastguard Worker    mova                m%4, m%5
333*c0909341SAndroid Build Coastguard Worker%if %7 < 32
334*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%4, m%2, m%6
335*c0909341SAndroid Build Coastguard Worker    mova                m%2, m%5
336*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%2, m%3, m%6
337*c0909341SAndroid Build Coastguard Worker%else
338*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%4, m%2, [o(pw_%6_%7)] {bcstd}
339*c0909341SAndroid Build Coastguard Worker    mova                m%2, m%5
340*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m%2, m%3, [o(pw_%6_%7)] {bcstd}
341*c0909341SAndroid Build Coastguard Worker%endif
342*c0909341SAndroid Build Coastguard Worker    psrad               m%4, 12
343*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 12
344*c0909341SAndroid Build Coastguard Worker%if %0 == 8
345*c0909341SAndroid Build Coastguard Worker    packssdw            m%8, m%2, m%4
346*c0909341SAndroid Build Coastguard Worker%else
347*c0909341SAndroid Build Coastguard Worker    packssdw            m%2, m%4
348*c0909341SAndroid Build Coastguard Worker%endif
349*c0909341SAndroid Build Coastguard Worker%endmacro
350*c0909341SAndroid Build Coastguard Worker
; WRAP_XMM <one line of code>
; Assembles the given line with the register names switched to xmm, then
; restores the register setup that was active before.
; The restore command is captured with xdefine as the very first step, so it
; holds the expansion RESET_MM_PERMUTATION had on entry rather than the one
; installed by INIT_XMM - presumably this is what returns the caller to its
; original vector width; confirm against x86inc.asm.
; INIT_XMM, DEFINE_MMREGS, AVX512_MM_PERMUTATION and RESET_MM_PERMUTATION are
; defined outside this file.
351*c0909341SAndroid Build Coastguard Worker%macro WRAP_XMM 1+
352*c0909341SAndroid Build Coastguard Worker    %xdefine %%reset RESET_MM_PERMUTATION
353*c0909341SAndroid Build Coastguard Worker    INIT_XMM cpuname
354*c0909341SAndroid Build Coastguard Worker    DEFINE_MMREGS xmm
355*c0909341SAndroid Build Coastguard Worker    AVX512_MM_PERMUTATION
356*c0909341SAndroid Build Coastguard Worker    %1
357*c0909341SAndroid Build Coastguard Worker    %%reset
358*c0909341SAndroid Build Coastguard Worker%endmacro
359*c0909341SAndroid Build Coastguard Worker
; WRAP_YMM <one line of code>
; Assembles the given line in ymm mode and then switches to zmm mode.
; Unlike WRAP_XMM it does not remember the previous mode: it always ends with
; INIT_ZMM, so it is only appropriate inside zmm code.
360*c0909341SAndroid Build Coastguard Worker%macro WRAP_YMM 1+
361*c0909341SAndroid Build Coastguard Worker    INIT_YMM cpuname
362*c0909341SAndroid Build Coastguard Worker    %1
363*c0909341SAndroid Build Coastguard Worker    INIT_ZMM cpuname
364*c0909341SAndroid Build Coastguard Worker%endmacro
365*c0909341SAndroid Build Coastguard Worker
; ITX4_END row1, row2, row3, row4 [, rnd = 2048]
; Epilogue that adds the words in m0/m1 to four 4-byte rows of the destination,
; stores the result and returns.
;   rowN : destination row number for the Nth dword of the result. Bit 1 of
;          the number selects the base pointer (r2 = dst + 2*stride when set,
;          dst otherwise) and bit 0 adds one more stride.
;   rnd  : when non-zero, m0 and m1 are first multiplied with pmulhrsw by the
;          broadcast constant pw_<rnd>; passing 0 skips that step.
; Rows 1-2 are loaded into m2 and rows 3-4 into m3 as dwords, zero-extended
; from bytes to words, added to m0 and m1 respectively, packed back to bytes
; with unsigned saturation and written to the same four addresses.
; Modifies r2 and m0-m3. The macro ends with ret, so code placed after an
; invocation is not reached by fall-through.
366*c0909341SAndroid Build Coastguard Worker%macro ITX4_END 4-5 2048 ; row[1-4], rnd
367*c0909341SAndroid Build Coastguard Worker%if %5
368*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_%5)]
369*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
370*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
371*c0909341SAndroid Build Coastguard Worker%endif
372*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+strideq*2]
373*c0909341SAndroid Build Coastguard Worker%assign %%i 1
374*c0909341SAndroid Build Coastguard Worker%rep 4
375*c0909341SAndroid Build Coastguard Worker    %if %1 & 2
376*c0909341SAndroid Build Coastguard Worker        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
377*c0909341SAndroid Build Coastguard Worker    %else
378*c0909341SAndroid Build Coastguard Worker        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
379*c0909341SAndroid Build Coastguard Worker    %endif
380*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i + 1
381*c0909341SAndroid Build Coastguard Worker    %rotate 1
382*c0909341SAndroid Build Coastguard Worker%endrep
383*c0909341SAndroid Build Coastguard Worker    movd                 m2, [%%row_adr1]
384*c0909341SAndroid Build Coastguard Worker    pinsrd               m2, [%%row_adr2], 1
385*c0909341SAndroid Build Coastguard Worker    movd                 m3, [%%row_adr3]
386*c0909341SAndroid Build Coastguard Worker    pinsrd               m3, [%%row_adr4], 1
387*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, m2
388*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, m3
389*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
390*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
391*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
392*c0909341SAndroid Build Coastguard Worker    movd       [%%row_adr1], m0
393*c0909341SAndroid Build Coastguard Worker    pextrd     [%%row_adr2], m0, 1
394*c0909341SAndroid Build Coastguard Worker    pextrd     [%%row_adr3], m0, 2
395*c0909341SAndroid Build Coastguard Worker    pextrd     [%%row_adr4], m0, 3
396*c0909341SAndroid Build Coastguard Worker    ret
397*c0909341SAndroid Build Coastguard Worker%endmacro
398*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_FN type1, type2, size
; Emits the entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc with the named
; registers dst, stride, c, eob, tx2 and base.
; The entry point loads baseq with the address of o_base (the anchor that o()
; addresses constants from) and tx2q with the address of the .pass2 label of
; the type2 transform, then hands over to the first-pass function
; i<type1>_<size>_internal_8bpc:
;   dct_dct      : jumps there only when eob is non-zero. With eob equal to 0
;                  execution falls through into whatever code follows the macro
;                  invocation.
;   anything else: the jmp is emitted through a "times" count taken from
;                  bit 31 of (end label - first-pass address). The count is 0
;                  when the first-pass function starts exactly at the aligned
;                  end label (plain fall-through) and 1 when it lies further
;                  ahead, where the difference is negative.
; NOTE(review): this relies on the first-pass function being placed at or
; after the end label. If it were defined earlier in the file the difference
; would be a small positive number, the count would be 0 and no jmp would be
; emitted.
399*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_FN 3 ; type1, type2, size
400*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
401*c0909341SAndroid Build Coastguard Worker    %define %%p1 m(i%1_%3_internal_8bpc)
402*c0909341SAndroid Build Coastguard Worker    lea               baseq, [o_base]
403*c0909341SAndroid Build Coastguard Worker    ; Jump to the 1st txfm function if we're not taking the fast path, which
404*c0909341SAndroid Build Coastguard Worker    ; in turn performs an indirect jump to the 2nd txfm function.
405*c0909341SAndroid Build Coastguard Worker    lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
406*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
407*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
408*c0909341SAndroid Build Coastguard Worker    jnz %%p1
409*c0909341SAndroid Build Coastguard Worker%else
410*c0909341SAndroid Build Coastguard Worker    ; jump to the 1st txfm function unless it's located directly after this
411*c0909341SAndroid Build Coastguard Worker    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
412*c0909341SAndroid Build Coastguard WorkerALIGN function_align
413*c0909341SAndroid Build Coastguard Worker%%end:
414*c0909341SAndroid Build Coastguard Worker%endif
415*c0909341SAndroid Build Coastguard Worker%endmacro
416*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X4_FN type1, type2
; Instantiates the 4x4 entry point through INV_TXFM_FN. For dct_dct it also
; emits the code that INV_TXFM_FN falls through to when eob == 0: the first
; coefficient word is broadcast, scaled twice by pw_2896x8 with pmulhrsw and
; duplicated into m0/m1, then control jumps to the .end2 tail of
; iadst_4x4_internal_8bpc (ITX4_END), bypassing the coefficient clear at .end.
417*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X4_FN 2 ; type1, type2
418*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 4x4
419*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
420*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [cq] ; first coefficient word in every lane
421*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_2896x8)]
422*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m1
423*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; eobd == 0 on this path: writes a zero dword over the coefficient just read
424*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m1
425*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0 ; the tail expects the residual in both m0 and m1
426*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x4_internal_8bpc).end2
427*c0909341SAndroid Build Coastguard Worker%endif
428*c0909341SAndroid Build Coastguard Worker%endmacro
429*c0909341SAndroid Build Coastguard Worker
; IDCT4_1D_PACKED
; One 4-point inverse DCT pass over the packed word data in m0/m1.
; Out:  m0 = out0 out1, m1 = out3 out2 (note the swapped order in m1).
; Overwrites m2 and m4 (m4 is loaded from pd_2048). m0 and m3 are also passed
; to ITX_MUL2X_PACK, which is defined outside this section -- presumably as
; scratch registers; TODO confirm against the macro definition.
430*c0909341SAndroid Build Coastguard Worker%macro IDCT4_1D_PACKED 0
431*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pd_2048)]
432*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m0
433*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0
434*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
435*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
436*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1, m2 ; out0 out1
437*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2     ; out3 out2
438*c0909341SAndroid Build Coastguard Worker%endmacro
439*c0909341SAndroid Build Coastguard Worker
; IADST4_1D_PACKED
; One 4-point inverse ADST pass over the packed word data in m0/m1.
; The inputs are interleaved into m4 (in2 in0) and m5 (in3 in1). Each of the
; four dword accumulators (m0, m2, m1, m3) starts from pd_2048 and receives two
; vpdpwssd word-pair dot products against broadcast constant pairs, one with
; m4 and one with m5. The sums are shifted right by 12 and packed back to
; words with signed saturation.
; (By its name pd_2048 is half of 1 << 12, so the shift rounds to nearest.)
; Out:  m0 = out0 out1, m1 = out2 out3. Overwrites m2-m5.
; The .main2 label sits after the interleave, so code entering there must
; already have m4/m5 prepared. Being a local label, it attaches to whichever
; function expands this macro.
440*c0909341SAndroid Build Coastguard Worker%macro IADST4_1D_PACKED 0
441*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m1, m0 ; in2 in0
442*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m1, m0 ; in3 in1
443*c0909341SAndroid Build Coastguard Worker.main2:
444*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pd_2048)]
445*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3
446*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m4, [o(pw_3803_1321)] {bcstd}
447*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
448*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m4, [o(pw_m1321_2482)] {bcstd}
449*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
450*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, [o(pw_m3344_3344)] {bcstd}
451*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m4, [o(pw_2482_3803)] {bcstd}
452*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m0, m5, [o(pw_2482_3344)] {bcstd}
453*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m5, [o(pw_m3803_3344)] {bcstd}
454*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m5, [o(pd_3344)] {bcstd}
455*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m5, [o(pw_m1321_m3344)] {bcstd}
456*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m0, m2, m1, m3
457*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2 ; out0 out1
458*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3 ; out2 out3
459*c0909341SAndroid Build Coastguard Worker%endmacro
460*c0909341SAndroid Build Coastguard Worker
; 4x4 transforms are assembled with 128-bit registers (INIT_XMM).
; Entry points whose first pass is the DCT. They are instantiated right in
; front of idct_4x4_internal_8bpc so the last of them can fall through into it.
461*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx512icl
462*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, dct
463*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, adst
464*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, flipadst
465*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, identity
466*c0909341SAndroid Build Coastguard Worker
; idct_4x4_internal_8bpc
; Pass 1: load the 32 coefficient bytes at cq into m0/m1, run the 4-point IDCT,
;         reorder the result (shufps + pshufb with deint_shuf) and jump to the
;         second pass that the entry point stored in tx2q.
; .pass2: run the 4-point IDCT on m0/m1, clear the 32 coefficient bytes at cq
;         and add the result to dst through ITX4_END, which ends in ret.
467*c0909341SAndroid Build Coastguard Workercglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
468*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
469*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
470*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
471*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(deint_shuf)]
472*c0909341SAndroid Build Coastguard Worker    shufps               m3, m0, m1, q1331
473*c0909341SAndroid Build Coastguard Worker    shufps               m0, m0, m1, q0220
474*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
475*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m2
476*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
477*c0909341SAndroid Build Coastguard Worker.pass2:
478*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
479*c0909341SAndroid Build Coastguard Worker    pxor              ymm16, ymm16
480*c0909341SAndroid Build Coastguard Worker    mova               [cq], ymm16 ; zero all 32 coefficient bytes
481*c0909341SAndroid Build Coastguard Worker    ITX4_END              0, 1, 3, 2 ; m1 holds out3 out2, hence the 3, 2 order
482*c0909341SAndroid Build Coastguard Worker
; Entry points whose first pass is the 4x4 ADST.
483*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, dct
484*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, adst
485*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, flipadst
486*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, identity
487*c0909341SAndroid Build Coastguard Worker
; iadst_4x4_internal_8bpc
; Pass 1: load the 32 coefficient bytes at cq, run the 4-point ADST (.main),
;         transpose the 4x4 word block with two rounds of punpck{l,h}wd and
;         jump to the second pass held in tx2q.
; .pass2: run the ADST again and fall into the shared tail.
; .end:   clears the 32 coefficient bytes at cq, then falls into .end2.
; .end2:  ITX4_END adds m0/m1 to dst and returns. The 4x4 dct_dct fast path
;         jumps here directly, the identity second pass jumps to .end.
; .main:  callable ADST kernel (IADST4_1D_PACKED + ret), also used by
;         iflipadst_4x4_internal_8bpc.
488*c0909341SAndroid Build Coastguard Workercglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
489*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
490*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
491*c0909341SAndroid Build Coastguard Worker    call .main
492*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
493*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
494*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
495*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
496*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
497*c0909341SAndroid Build Coastguard Worker.pass2:
498*c0909341SAndroid Build Coastguard Worker    call .main
499*c0909341SAndroid Build Coastguard Worker.end:
500*c0909341SAndroid Build Coastguard Worker    pxor              ymm16, ymm16
501*c0909341SAndroid Build Coastguard Worker    mova               [cq], ymm16 ; zero all 32 coefficient bytes
502*c0909341SAndroid Build Coastguard Worker.end2:
503*c0909341SAndroid Build Coastguard Worker    ITX4_END              0, 1, 2, 3
504*c0909341SAndroid Build Coastguard WorkerALIGN function_align
505*c0909341SAndroid Build Coastguard Worker.main:
506*c0909341SAndroid Build Coastguard Worker    IADST4_1D_PACKED
507*c0909341SAndroid Build Coastguard Worker    ret
508*c0909341SAndroid Build Coastguard Worker
; Entry points whose first pass is the 4x4 flipped ADST.
509*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, dct
510*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, adst
511*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, flipadst
512*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, identity
513*c0909341SAndroid Build Coastguard Worker
; iflipadst_4x4_internal_8bpc
; Reuses the ADST kernel of iadst_4x4_internal_8bpc (.main) and applies the
; flip when reordering the results.
; Pass 1: the transpose interleaves with swapped operands (m1 before m0), which
;         reverses the order of the four outputs while transposing.
; .pass2: the flip is done by passing the row order 3, 2, 1, 0 to ITX4_END.
514*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
515*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
516*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
517*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_8bpc).main
518*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
519*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
520*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
521*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
522*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
523*c0909341SAndroid Build Coastguard Worker.pass2:
524*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_8bpc).main
525*c0909341SAndroid Build Coastguard Worker.end:
526*c0909341SAndroid Build Coastguard Worker    pxor              ymm16, ymm16
527*c0909341SAndroid Build Coastguard Worker    mova               [cq], ymm16 ; zero all 32 coefficient bytes
528*c0909341SAndroid Build Coastguard Worker.end2:
529*c0909341SAndroid Build Coastguard Worker    ITX4_END              3, 2, 1, 0
530*c0909341SAndroid Build Coastguard Worker
; Entry points whose first pass is the 4x4 identity transform.
531*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, dct
532*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, adst
533*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, flipadst
534*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, identity
535*c0909341SAndroid Build Coastguard Worker
; iidentity_4x4_internal_8bpc
; Both passes scale every coefficient as x + pmulhrsw(x, pw_1697x8), using a
; saturating add.
; Pass 1: additionally transposes the 4x4 word block with two rounds of
;         punpck{l,h}wd before jumping to the second pass in tx2q.
; .pass2: scales m0/m1 and jumps to the .end tail of iadst_4x4_internal_8bpc,
;         which clears the coefficients and adds the result to dst.
536*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
537*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
538*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
539*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x8)]
540*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
541*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
542*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
543*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
544*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
545*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
546*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
547*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
548*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
549*c0909341SAndroid Build Coastguard Worker.pass2:
550*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x8)]
551*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
552*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
553*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
554*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
555*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x4_internal_8bpc).end
556*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X8_FN type1, type2
; Instantiates the 4x8 entry point through INV_TXFM_FN. For dct_dct it also
; emits the code that INV_TXFM_FN falls through to when eob == 0: the data at
; [cq] is scaled three times by pw_2896x8 and once by pw_2048 (pmulhrsw each
; time), the low word of the result is broadcast into ym0 and copied to ym1,
; and control jumps to the .end3 tail of iadst_4x8_internal_8bpc.
; This path does not write to [cq] itself; the 64-byte store to [cq] is part
; of the .end3 tail.
; Explicit xmm names are used for the scalar part, so it assembles the same
; way whichever INIT_* vector width is active at the invocation.
557*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X8_FN 2 ; type1, type2
558*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 4x8
559*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
560*c0909341SAndroid Build Coastguard Worker    movd               xmm1, [o(pw_2896x8)]
561*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xmm1, [cq]
562*c0909341SAndroid Build Coastguard Worker    movd               xmm2, [o(pw_2048)]
563*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xmm1
564*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xmm1
565*c0909341SAndroid Build Coastguard Worker    pmulhrsw           xmm0, xmm2
566*c0909341SAndroid Build Coastguard Worker    vpbroadcastw        ym0, xmm0 ; only word 0 (derived from the first coefficient) is kept
567*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym0
568*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end3
569*c0909341SAndroid Build Coastguard Worker%endif
570*c0909341SAndroid Build Coastguard Worker%endmacro
571*c0909341SAndroid Build Coastguard Worker
; IDCT8_1D_PACKED
; One 8-point inverse DCT pass over packed word data in m0-m3.
; In:   m0-m3. The four punpck instructions pair the inputs up as
;       in7/in1, in3/in5, in6/in2 and in4/in0.
; Out:  m0 = out0 out1, m1 = out3 out2, m2 = out4 out5, m3 = out7 out6.
; Overwrites m4-m6 (m6 is loaded from pd_2048).
; The .main2 label sits after the interleave, so code entering there must
; already have m2-m5 paired up. Being a local label, it attaches to whichever
; function expands this macro.
; The deint_shuf shuffle mask is broadcast to a register when the vector width
; exceeds 16 bytes and used as a direct memory operand otherwise.
572*c0909341SAndroid Build Coastguard Worker%macro IDCT8_1D_PACKED 0
573*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3, m0 ; in7 in1
574*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1, m2 ; in3 in5
575*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1     ; in6 in2
576*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m0     ; in4 in0
577*c0909341SAndroid Build Coastguard Worker.main2:
578*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pd_2048)]
579*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
580*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
581*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
582*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
583*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5     ; t4  t7  (interleaved)
584*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
585*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
586*c0909341SAndroid Build Coastguard Worker%if mmsize > 16
587*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m1, [o(deint_shuf)]
588*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m1
589*c0909341SAndroid Build Coastguard Worker%else
590*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [o(deint_shuf)]
591*c0909341SAndroid Build Coastguard Worker%endif
592*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2, m3 ; tmp3 tmp2
593*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2     ; tmp0 tmp1
594*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m4, m0 ; t7 t6
595*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m0     ; t4 t5
596*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3, m2 ; out0 out1
597*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m2     ; out7 out6
598*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m1, m4 ; out4 out5
599*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4     ; out3 out2
600*c0909341SAndroid Build Coastguard Worker%endmacro
601*c0909341SAndroid Build Coastguard Worker
; IADST8_1D_PACKED pass
; One 8-point inverse ADST pass over packed word data. The argument selects
; one of two variants that differ in input layout and in the final stage.
;   pass == 1: expects the inputs already paired up in m0-m3 by the caller.
;              The last butterfly is computed with vpdpwssd against
;              pw_m2896_2896 / pw_2896_2896 on top of the pd_2048 bias,
;              followed by a shift right by 12 and a saturating pack.
;              Results: m0 (-out7 out0, then pshufb), m4 (out6 -out1, then
;              pshufb), m2 = out4 -out5, m1 = out2 -out3. The caller still
;              has to combine m0/m4 into their final register layout.
;   otherwise: pairs up the inputs itself from m2-m5 (punpck{l,h}wd) and
;              finishes the last butterfly with pmulhrsw by pw_2896x8.
;              Results: m0 = out0 -out1, m1 = out2 -out3, m2 = out4 -out5,
;              m3 = out6 -out7.
; In both variants every second output is negated, as the existing per-line
; comments show; the caller is responsible for fixing the signs.
; Overwrites m0-m6 (m6 is loaded from pd_2048).
602*c0909341SAndroid Build Coastguard Worker%macro IADST8_1D_PACKED 1 ; pass
603*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pd_2048)]
604*c0909341SAndroid Build Coastguard Worker%if %1 == 1
605*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
606*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
607*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
608*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
609*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m2 ; t5 t4
610*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2     ; t1 t0
611*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m3 ; t6 t7
612*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3     ; t2 t3
613*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
614*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
615*c0909341SAndroid Build Coastguard Worker%if mmsize > 16
616*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [o(deint_shuf)]
617*c0909341SAndroid Build Coastguard Worker%else
618*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(deint_shuf)]
619*c0909341SAndroid Build Coastguard Worker%endif
620*c0909341SAndroid Build Coastguard Worker    vprord               m1, 16
621*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m0, m1 ; t3 t2
622*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1     ; -out7  out0
623*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m4, m5 ; t7 t6
624*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5     ;  out6 -out1
625*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
626*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2
627*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
628*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m3, [o(pw_m2896_2896)] {bcstd}
629*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6
630*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m1, [o(pw_m2896_2896)] {bcstd}
631*c0909341SAndroid Build Coastguard Worker    psrad                m2, 12
632*c0909341SAndroid Build Coastguard Worker    psrad                m5, 12
633*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m5     ; out4 -out5
634*c0909341SAndroid Build Coastguard Worker    mova                 m5, m6
635*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m3, [o(pw_2896_2896)] {bcstd}
636*c0909341SAndroid Build Coastguard Worker    mova                 m3, m6
637*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m3, m1, [o(pw_2896_2896)] {bcstd}
638*c0909341SAndroid Build Coastguard Worker    psrad                m5, 12
639*c0909341SAndroid Build Coastguard Worker    psrad                m3, 12
640*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3, m5 ; out2 -out3
641*c0909341SAndroid Build Coastguard Worker%else
642*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m4, m3 ; 0 7
643*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m5, m2 ; 2 5
644*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5     ; 4 3
645*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4     ; 6 1
646*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
647*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
648*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
649*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
650*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m2 ; t4 t5
651*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2     ; t0 t1
652*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m3 ; t6 t7
653*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3     ; t2 t3
654*c0909341SAndroid Build Coastguard Worker    shufps               m2, m5, m4, q1032
655*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2
656*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m2
657*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784    ; t4a t5a
658*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
659*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m0, m1 ; t2 t3
660*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1     ; out0 -out7
661*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m4, m5 ; t6 t7
662*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5     ; -out1 out6
663*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896x8)]
664*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m2, m1 ; t3 t7
665*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m1     ; t2 t6
666*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m3 ; t2+t3 t6+t7
667*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m3     ; t2-t3 t6-t7
668*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4, m0 ; out6 -out7
669*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m4     ; out0 -out1
670*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5     ; out4 -out5
671*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032
672*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5     ; out2 -out3
673*c0909341SAndroid Build Coastguard Worker%endif
674*c0909341SAndroid Build Coastguard Worker%endmacro
675*c0909341SAndroid Build Coastguard Worker
; 4x8 transforms from here on are assembled with 256-bit registers (INIT_YMM).
; Entry points whose first pass is the DCT.
676*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl
677*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, dct
678*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, identity
679*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, adst
680*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, flipadst
681*c0909341SAndroid Build Coastguard Worker
; idct_4x8_internal_8bpc
; Pass 1: load the 64 coefficient bytes at cq as two qword-permuted 256-bit
;         halves, scale them by pw_2896x8, run the 4-point IDCT on both
;         128-bit lanes, reorder (shufps + pshufb with deint_shuf) and jump to
;         the second pass held in tx2q.
; .pass2: split the upper 128-bit lanes of m0/m1 into xm2/xm3 so the 8-point
;         IDCT can run at xmm width (.main), then merge the four xmm results
;         back into m0/m1, load pw_2048 into m4 and finish in the .end2 tail
;         of iadst_4x8_internal_8bpc.
; .main:  callable 8-point IDCT kernel: IDCT8_1D_PACKED under WRAP_XMM + ret.
682*c0909341SAndroid Build Coastguard Workercglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
683*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
684*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120
685*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2896x8)]
686*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
687*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
688*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
689*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m2, [o(deint_shuf)]
690*c0909341SAndroid Build Coastguard Worker    shufps               m3, m0, m1, q1331
691*c0909341SAndroid Build Coastguard Worker    shufps               m0, m0, m1, q0220
692*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
693*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m2
694*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
695*c0909341SAndroid Build Coastguard Worker.pass2:
696*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 1
697*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m1, 1
698*c0909341SAndroid Build Coastguard Worker    call .main
699*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
700*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, m0, xm2, 1 ; m0 = out0 out1 | out4 out5
701*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, m1, xm3, 1 ; m1 = out3 out2 | out7 out6
702*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032 ; swap qwords per lane: out2 out3 | out6 out7
703*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end2
704*c0909341SAndroid Build Coastguard WorkerALIGN function_align
705*c0909341SAndroid Build Coastguard Worker.main:
706*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IDCT8_1D_PACKED
707*c0909341SAndroid Build Coastguard Worker    ret
708*c0909341SAndroid Build Coastguard Worker
; Entry points whose first pass is the 4x8 ADST.
709*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, dct
710*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, adst
711*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, flipadst
712*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, identity
713*c0909341SAndroid Build Coastguard Worker
; iadst_4x8_internal_8bpc
; Pass 1: load the 64 coefficient bytes at cq as two qword-permuted 256-bit
;         halves, scale them by pw_2896x8, run the ADST kernel of
;         iadst_8x4_internal_8bpc (.main), transpose with two rounds of
;         punpck{l,h}wd and jump to the second pass held in tx2q.
; .pass2: split m0/m1 into the four xmm inputs expected by .main_pass2
;         (xm2-xm5), run the 8-point ADST, merge the results back into m0/m1
;         and build the +pw_2048 / -pw_2048 multipliers that undo the
;         alternating output signs.
; Shared tails (also entered from the other 4x8 transforms in this file):
;   .end:  m4 = low qword of m4 | low qword of m5 per lane (+2048 / -2048).
;   .end2: m0/m1 = pmulhrsw(m0/m1, m4).
;   .end3: gather 8 dst rows of 4 pixels, clear the coefficient buffer, add the
;          residual in m0/m1, pack with unsigned saturation and scatter the
;          rows back. Ends in RET.
; .main_pass1 / .main_pass2: callable 8-point ADST kernels at xmm width.
714*c0909341SAndroid Build Coastguard Workercglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
715*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
716*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120
717*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2896x8)]
718*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
719*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
720*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_8bpc).main
721*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
722*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
723*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
724*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
725*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
726*c0909341SAndroid Build Coastguard Worker.pass2:
727*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 1
728*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m1, 1
729*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm0, q1032
730*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm1, q1032
731*c0909341SAndroid Build Coastguard Worker    call .main_pass2
732*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
733*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, xm2, 1
734*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, xm3, 1
735*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
736*c0909341SAndroid Build Coastguard Worker    psubw                m5, m4 ; m5 = -pw_2048
737*c0909341SAndroid Build Coastguard Worker.end:
738*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5
739*c0909341SAndroid Build Coastguard Worker.end2:
740*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
741*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
742*c0909341SAndroid Build Coastguard Worker.end3:
743*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, strided
744*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m3, [o(pd_0to15)] ; per-row byte offsets: stride * index (pd_0to15 presumably holds 0..15)
745*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1 ; all-ones mask for the 8 dword lanes
746*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1 ; second copy: the gather clears k1, the scatter needs its own mask
747*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3{k1}, [dstq+m5]
748*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4 ; zero: used for the byte->word unpack and for clearing cq
; Clear the whole 64-byte coefficient block (pass 1 reads cq+32*0 and cq+32*1).
; The store has to use zmm4: the ymm-width pxor above also zeroes the upper
; half of zmm4. zmm20 is never written by this code, so storing it would fill
; the buffer with stale register contents instead of zeros.
749*c0909341SAndroid Build Coastguard Worker    mova               [cq], zmm4
750*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
751*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
752*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
753*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
754*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
755*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+m5]{k2}, m0
756*c0909341SAndroid Build Coastguard Worker    RET
757*c0909341SAndroid Build Coastguard WorkerALIGN function_align
758*c0909341SAndroid Build Coastguard Worker.main_pass1:
759*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm0, xm4, xm3 ; 0 7
760*c0909341SAndroid Build Coastguard Worker    punpckhwd           xm1, xm5, xm2 ; 2 5
761*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm2, xm5      ; 4 3
762*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm4      ; 6 1
763*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IADST8_1D_PACKED 1
764*c0909341SAndroid Build Coastguard Worker    punpcklqdq          xm3, xm4, xm0 ; out6 -out7
765*c0909341SAndroid Build Coastguard Worker    punpckhqdq          xm0, xm4      ; out0 -out1
766*c0909341SAndroid Build Coastguard Worker    ret
767*c0909341SAndroid Build Coastguard WorkerALIGN function_align
768*c0909341SAndroid Build Coastguard Worker.main_pass2:
769*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IADST8_1D_PACKED 2
770*c0909341SAndroid Build Coastguard Worker    ret
771*c0909341SAndroid Build Coastguard Worker
; Entry points whose first pass is the 4x8 flipped ADST.
772*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, dct
773*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, adst
774*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, flipadst
775*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, identity
776*c0909341SAndroid Build Coastguard Worker
; iflipadst_4x8_internal_8bpc
; Reuses the ADST kernels of the non-flipped transforms and applies the flip
; while reordering their results.
; Pass 1: same load/scale/kernel sequence as iadst_4x8_internal_8bpc, but the
;         transpose interleaves with swapped operands (m1 before m0).
; .pass2: runs .main_pass2 of iadst_4x8_internal_8bpc, then rebuilds m0/m1
;         from the results in reverse register order (xm3/xm1 and xm2/xm0)
;         with the qwords of each lane swapped. m5 = +pw_2048 and
;         m4 = -pw_2048 here, the opposite assignment to the non-flipped
;         second pass, before joining the shared .end tail.
777*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
778*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
779*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120
780*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2896x8)]
781*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
782*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
783*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_8bpc).main
784*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1, m0
785*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
786*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m3
787*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3
788*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
789*c0909341SAndroid Build Coastguard Worker.pass2:
790*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, m0, 1
791*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, m1, 1
792*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm0, q1032
793*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm1, q1032
794*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x8_internal_8bpc).main_pass2
795*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
796*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, xm1, 1
797*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, xm0, 1
798*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
799*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5 ; m4 = -pw_2048
800*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q1032
801*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1032
802*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end
803*c0909341SAndroid Build Coastguard Worker
; The identity first pass handles all 64 coefficient bytes in one 512-bit
; register, so these entry points and the function below use INIT_ZMM.
804*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
805*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, dct
806*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, adst
807*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, flipadst
808*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, identity
809*c0909341SAndroid Build Coastguard Worker
; iidentity_4x8_internal_8bpc
; Pass 1: scale the whole coefficient block at cq by pw_2896x8, permute its
;         bytes with vpermb using the int8_permB table (defined outside this
;         section), apply the identity scaling x + pmulhrsw(x, pw_1697x8) and
;         copy the upper 256 bits into ym1, so the second pass finds its input
;         in ym0/ym1 like after the other first passes.
; .pass2: only loads pw_4096 into ym4; the multiply by it and the add to dst
;         happen in the .end2 tail of iadst_4x8_internal_8bpc.
810*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
811*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pw_2896x8)]
812*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, [cq]
813*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(int8_permB)]
814*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_1697x8)]
815*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m1, m0
816*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m0
817*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
818*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym1, m0, 1
819*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
820*c0909341SAndroid Build Coastguard Worker.pass2:
821*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym4, [o(pw_4096)]
822*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end2
823*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X16_FN type1, type2
; Instantiates the 4x16 entry point for the given first-pass (type1) and
; second-pass (type2) transform via INV_TXFM_FN (defined outside this chunk).
; For the dct_dct combination it additionally emits a DC-only path that
; computes one value from the first coefficient, replicates it into m0 and m1
; and jumps straight to the shared add-to-destination tail at
; iadst_4x16 .end3.
824*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X16_FN 2 ; type1, type2
825*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 4x16
826*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
; r6d = sign-extended DC coefficient; the coefficient slot is then overwritten
; with eobd.
; NOTE(review): presumably eob is 0 whenever this path is reached, so the
; store clears the DC coefficient - confirm against INV_TXFM_FN.
827*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
828*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
; r6d = (dc*181 + 384) >> 9
829*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
830*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+256
831*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+1
; r6d = (r6d*181 + 2176) >> 12
832*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
833*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+2048
834*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+4
; Every word of m0 and m1 receives the same residual value.
835*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, r6d
836*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
837*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end3
838*c0909341SAndroid Build Coastguard Worker%endif
839*c0909341SAndroid Build Coastguard Worker%endmacro
840*c0909341SAndroid Build Coastguard Worker
; IDCT16_1D_PACKED
; One-dimensional 16-point inverse DCT on packed 16-bit data.
; Register usage as visible in the body:
;   in/out:  m0-m7 (the per-line comments give the packed layout)
;   scratch: m8, m9, m11, m12
;   m10 = pd_2048 broadcast (loaded at .main2)
;   m13 = int_mshift broadcast, k7 = byte-compare mask derived from m13/m10
; .main2/.main3/.main4/.main5 are additional entry labels into the body;
; .main2 is exported through cglobal_label.
; NOTE(review): k7 and m13 are not referenced again in this body; they are
; presumably consumed inside ITX_MUL2X_PACK (defined outside this chunk).
841*c0909341SAndroid Build Coastguard Worker%macro IDCT16_1D_PACKED 0
; Pair up the inputs so each register holds the two operands of one rotation.
842*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m7, m0 ; dct16 in15 in1
843*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m4, m0 ; dct4  in2  in0
844*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3, m4 ; dct16 in7  in9
845*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m1     ; dct8  in7  in1
846*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6     ; dct16 in3  in13
847*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5     ; dct8  in3  in5
848*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m2     ; dct16 in11 in5
849*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2     ; dct4  in3  in1
850*c0909341SAndroid Build Coastguard Workercglobal_label .main2
851*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
852*c0909341SAndroid Build Coastguard Worker.main3:
853*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m13, [o(int_mshift)]
854*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, m13, m10, 6 ; 0x33...
; First stage: rotations producing t4a-t7a and t8a-t15a.
855*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 5 ; t8a  t15a
856*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 5 ; t9a  t14a
857*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
858*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
859*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 5 ; t4a  t7a
860*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 5 ; t5a  t6a
861*c0909341SAndroid Build Coastguard Worker.main4:
; Saturating butterflies on the odd half, interleaved with the dct4 rotation.
862*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m8, m0 ; t9  t14
863*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m0     ; t8  t15
864*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m1, m5 ; t10 t13
865*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5     ; t11 t12
866*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        6, 0, 5, 10, 1567,  3784    ; t3   t2
867*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m8, m1 ; t11a t12a
868*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m1     ; t8a  t15a
869*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m7, m3 ; t5a  t6a
870*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m3     ; t4   t7
871*c0909341SAndroid Build Coastguard Worker.main5:
872*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 3, 5, 10, 1567,  3784, 5 ; t9a  t14a
873*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
; deint_shuf is a 16-byte table: replicate it to every 128-bit lane when the
; macro is expanded for registers wider than XMM, otherwise load it directly.
874*c0909341SAndroid Build Coastguard Worker%if mmsize > 16
875*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(deint_shuf)]
876*c0909341SAndroid Build Coastguard Worker%else
877*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(deint_shuf)]
878*c0909341SAndroid Build Coastguard Worker%endif
879*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m2896_2896)]
880*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2896_2896)]
881*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2, m4 ; t9   t14
882*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m4     ; t10  t13
883*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m5
884*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m5
885*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5
; The last two invocations pass m11 as a scratch register even though it is
; also a coefficient operand; the results are packed manually by the
; packssdw instructions further down.
886*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        9, 4,  5, 10, 11, 12    ; t0   t1
887*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 4,  5, 10, 12, 11    ; t5   t6
888*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 4,  5, 10, 11, 12, 8 ; t11  t12
889*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 0, 11, 10, 11, 12, 8 ; t10a t13a
890*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m7, m1 ; t7 t6
891*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m1     ; t4 t5
892*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m9, m6 ; dct4 out3 out2
893*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m6     ; dct4 out0 out1
894*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m11    ; t12  t13a
895*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m0     ; t11  t10a
896*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m8, m3 ; t15a t14
897*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m3     ; t8a  t9
; Final butterflies: dct4 -> dct8 -> dct16 outputs, left in m0-m7.
898*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m9, m2 ; dct8 out7 out6
899*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m2     ; dct8 out0 out1
900*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m1, m7 ; dct8 out4 out5
901*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m7     ; dct8 out3 out2
902*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m9, m0 ; out15 out14
903*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m9     ; out0  out1
904*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m1, m5 ; out12 out13
905*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5     ; out3  out2
906*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2, m4 ; out11 out10
907*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m4     ; out4  out5
908*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m3, m8 ; out8  out9
909*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8     ; out7  out6
910*c0909341SAndroid Build Coastguard Worker%endmacro
911*c0909341SAndroid Build Coastguard Worker
; 4x16 inverse transforms whose first pass is a DCT: one entry point per
; second-pass type, followed by the shared DCT pass code.
912*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, dct
913*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, identity
914*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, adst
915*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, flipadst
916*c0909341SAndroid Build Coastguard Worker
; Pass 1: 4-point DCT over the 128 bytes of coefficients at cq.
; The four 32-byte quarters are loaded as
;   m1 = { low: cq+32*2, high: cq+32*0 },  m2 = { low: cq+32*3, high: cq+32*1 }
; then reordered with the int16_perm byte permutation so that each register
; holds the operand pairs shown in the vpermb comments.
917*c0909341SAndroid Build Coastguard Workercglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
918*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+32*2]
919*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [cq+32*0], 1
920*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(int16_perm)]
921*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+32*3]
922*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [cq+32*1], 1
923*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pd_2048)]
924*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
925*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
926*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896, 2
927*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784, 2
; m4 is reused: it held pd_2048 for the rotations above and now holds the
; pw_16384 multiplier for the pmulhrsw scaling below.
928*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_16384)]
929*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m1, m2
930*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2     ; out0 out1
; Rotating each dword by 16 bits swaps its two words.
931*c0909341SAndroid Build Coastguard Worker    vprord               m3, 16     ; out2 out3
932*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m3
933*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m3
934*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
935*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
936*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Pass 2: 16-point DCT. The 128-bit lanes 1..3 of m0/m1 are split out so that
; xm0..xm7 hold the eight inputs of .main (lane 0 stays in xm0/xm1):
;   m0 lanes 1,2,3 -> xm2, xm4, xm6     m1 lanes 1,2,3 -> xm3, xm5, xm7
937*c0909341SAndroid Build Coastguard Worker.pass2:
938*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm2, ym0, 1
939*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm3, ym1, 1
940*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm4, m0, 2
941*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm5, m1, 2
942*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm6, m0, 3
943*c0909341SAndroid Build Coastguard Worker    vextracti32x4       xm7, m1, 3
944*c0909341SAndroid Build Coastguard Worker    call .main
; Reassemble the results: m0 = { xm0, xm2, xm4, xm6 }, m1 = { xm1, xm3, xm5, xm7 }.
945*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, xm2, 1
946*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, xm3, 1
947*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, xm6, 1
948*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym5, xm7, 1
949*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym4, 1
950*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, ym5, 1
; m5 is the multiplier applied to m0/m1 at iadst_4x16 .end2; the pshufd swaps
; the two qwords of every 128-bit lane of m1 before the shared store tail.
951*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
952*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032
953*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end2
954*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; 16-point DCT helper: IDCT16_1D_PACKED expanded under WRAP_XMM.
; NOTE(review): WRAP_XMM is defined outside this chunk; presumably it expands
; the macro with 128-bit registers - confirm.
955*c0909341SAndroid Build Coastguard Worker.main:
956*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IDCT16_1D_PACKED
957*c0909341SAndroid Build Coastguard Worker    ret
958*c0909341SAndroid Build Coastguard Worker
; 4x16 inverse transforms whose first pass is an ADST: one entry point per
; second-pass type, followed by the shared ADST code. This function also
; hosts the .end/.end2/.end3 store tails that the other 4x16 transforms in
; this file jump to.
959*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, dct
960*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, adst
961*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, flipadst
962*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, identity
963*c0909341SAndroid Build Coastguard Worker
; Pass 1: both 64-byte halves of the coefficients are qword-permuted with the
; permB index table, transformed by iadst_16x4 .main (outside this chunk),
; then word-interleaved, scaled with pmulhrsw by pw_16384 and interleaved
; again before jumping to the second pass through tx2q.
964*c0909341SAndroid Build Coastguard Workercglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
965*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(permB)]
966*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m1, [cq+64*0]
967*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, [cq+64*1]
968*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_8bpc).main
969*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_16384)]
970*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
971*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
972*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
973*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
974*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
975*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
976*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Pass 2: 16-point ADST via .main, then set up the operands of .end:
;   m5 = +2048 words, m6 = m8 - m5 = -2048 words (.main leaves m8 zeroed),
;   m10 = the permute index left by .main, shifted right by a further 4 bits.
977*c0909341SAndroid Build Coastguard Worker.pass2:
978*c0909341SAndroid Build Coastguard Worker    call .main
979*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
980*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 4
981*c0909341SAndroid Build Coastguard Worker    psubw                m6, m8, m5
; .end: finish the transform from the values .main left in ym2/ym4/m3.
; The sum and difference of ym2 and ym4 are combined into m1 and scaled with
; pmulhrsw by pw_2896x8; the two-source qword permutes then distribute m1 and
; m3 into m0 and m1 using indices taken from m10. m5 becomes interleaved
; qwords of the original m5 and m6.
; NOTE(review): the mixed +/-2048 multiplier in m5 presumably undoes the
; negated outputs listed in the comments below - confirm the lane mapping.
982*c0909341SAndroid Build Coastguard Worker.end:
983*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_2896x8)]
984*c0909341SAndroid Build Coastguard Worker    paddsw              ym1, ym2, ym4
985*c0909341SAndroid Build Coastguard Worker    psubsw              ym2, ym4
986*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, ym2, 1
987*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7      ; -out7   out4   out6  -out5   out8  -out11 -out9   out10
988*c0909341SAndroid Build Coastguard Worker    psrlq                m0, m10, 4
989*c0909341SAndroid Build Coastguard Worker    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
990*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
991*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m6
; .end2: entry for callers that already have the outputs in m0/m1 and a
; pmulhrsw multiplier in m5.
992*c0909341SAndroid Build Coastguard Worker.end2:
993*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
994*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
; .end3: add the residual in m0/m1 to the destination and store it.
; m5 = stride * {pd_0to15} gives one byte offset per gathered dword, so the
; gather reads 16 dwords from dstq at multiples of the stride. Two all-ones
; 16-bit masks are prepared because the gather needs its own mask (k1) and
; the scatter uses k2. The 128 bytes at cq are zeroed along the way.
995*c0909341SAndroid Build Coastguard Worker.end3:
996*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, strided
997*c0909341SAndroid Build Coastguard Worker    pmulld               m5, m3, [o(pd_0to15)]
998*c0909341SAndroid Build Coastguard Worker    kxnorw               k1, k1, k1
999*c0909341SAndroid Build Coastguard Worker    kmovw                k2, k1
1000*c0909341SAndroid Build Coastguard Worker    vpgatherdd       m3{k1}, [dstq+m5]
1001*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1002*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*0], m4
1003*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*1], m4
; Widen the gathered bytes to words (m4 is zero), add the residual, and pack
; back to bytes with unsigned saturation before scattering to the same offsets.
1004*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
1005*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
1006*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1007*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1008*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1009*c0909341SAndroid Build Coastguard Worker    vpscatterdd [dstq+m5]{k2}, m0
1010*c0909341SAndroid Build Coastguard Worker    RET
1011*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: 16-point ADST core shared with iflipadst_4x16.
; In:  m0, m1 = pass-1 output.
; Out (as consumed by .end): m3, ym2, ym4 hold the partial results annotated
;      below, m8 = 0, m10 = qword permute index (permB+1 shifted right by 4).
; .main2 is an alternative entry for callers that set up m3/m10 themselves.
; The permute index is loaded unaligned from one byte past permB.
1012*c0909341SAndroid Build Coastguard Worker.main:
1013*c0909341SAndroid Build Coastguard Worker    movu                 m3, [o(permB+1)]
1014*c0909341SAndroid Build Coastguard Worker    psrlq               m10, m3, 4
1015*c0909341SAndroid Build Coastguard Worker.main2:
1016*c0909341SAndroid Build Coastguard Worker    vpermi2q             m3, m0, m1  ; in15 in12 in13 in14 in11 in8  in9  in10
1017*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m10, m1 ; in0  in3  in2  in1  in4  in7  in6  in5
1018*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_2048)]
1019*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ym13, [o(int_mshift)]
; k1 = 0xff here and 0x0f after the kshiftrb below; it is used further down
; to merge a second constant into the low four dwords of m6.
1020*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1
1021*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m0  ; in12 in3  in14 in1
1022*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3      ; in0  in15 in2  in13
1023*c0909341SAndroid Build Coastguard Worker    kshiftrb             k1, k1, 4
1024*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m4, 1   ; in8  in7  in10 in5
1025*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym1, m0, 1   ; in4  in11 in6  in9
; The code between here and the INIT_ZMM below is assembled under INIT_YMM.
1026*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl
1027*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, m13, m9, 6 ; 0x33...
1028*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
1029*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        0, 2, 5, 6, 7, 9,  201, 4091,  995, 3973, 5
1030*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
1031*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
1032*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        4, 2, 5, 6, 7, 9, 3857, 1380, 4052,  601, 5
1033*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
1034*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
1035*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
1036*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m1     ; t5a  t4a  t7a  t6a
1037*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        2, 1, 5, 6, 7, 9,  799, 4017, 3406, 2276, 5
; m7 = 0 - m7 (m8 is zero).
; NOTE(review): m6/m7 are presumably the coefficient registers left behind by
; the preceding ITX_MUL4X_PACK and reused by the next macro call - confirm
; against the macro definition, which is outside this chunk.
1038*c0909341SAndroid Build Coastguard Worker    psubw                m7, m8, m7
1039*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 1, 5, 9, 7, 6, 4
; Low four dwords of m6 (k1 = 0x0f) get pw_m3784_1567, the rest pw_3784_m1567.
1040*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_3784_m1567)]
1041*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
1042*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m0, m4 ; t5   t4   t7   t6
1043*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4     ; t1   t0   t3   t2
1044*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
1045*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3     ; t9a  t8a  t11a t10a
1046*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
1047*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
1048*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(deint_shuf)]
1049*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
1050*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
; Regroup the 128-bit lanes so matching terms of the next butterfly line up.
1051*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m0, m2, 0x03  ; t3   t2   t11a t10a
1052*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, xm2, 1        ; t1   t0   t9a  t8a
1053*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m1, m4, 0x03  ; t7a  t6a  t15  t14
1054*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, xm4, 1        ; t4a  t5a  t12  t13
1055*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
1056*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m3        ; t3a t2a t11 t10
1057*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3            ; -out15  out0   out14 -out1
1058*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
1059*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2            ; t7 t6 t15a t14a
1060*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
1061*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
; Back to ZMM mode: ym0 is merged into the upper half of m3 for the caller.
1062*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
1063*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, ym0, 1        ; out12 -out3  -out13  out2  -out15  out0   out14 -out1
1064*c0909341SAndroid Build Coastguard Worker    ret
1065*c0909341SAndroid Build Coastguard Worker
; 4x16 inverse transforms whose first pass is a flipped ADST: one entry point
; per second-pass type, followed by the shared flipadst code.
1066*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, dct
1067*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, adst
1068*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, flipadst
1069*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, identity
1070*c0909341SAndroid Build Coastguard Worker
; Pass 1: same load, permute and iadst_16x4 .main call as
; iadst_4x16_internal_8bpc, but the word interleaves take their operands in
; the opposite order (m1 first, low/high swapped).
1071*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1072*c0909341SAndroid Build Coastguard Worker    mova                 m1, [o(permB)]
1073*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m1, [cq+64*0]
1074*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, [cq+64*1]
1075*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_8bpc).main
1076*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_16384)]
1077*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
1078*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
1079*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1080*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1081*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
1082*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
1083*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Pass 2: reuse the ADST core and its .end tail. Relative to the plain ADST
; pass 2, the roles of m5 and m6 are swapped (m6 = +2048, m5 = m8 - m6 =
; -2048, with m8 zeroed by .main) and m10 is shifted by 12 instead of 4, so
; .end picks different permute indices and signs.
; NOTE(review): this is presumably what produces the flipped output order -
; confirm against the permB table layout.
1084*c0909341SAndroid Build Coastguard Worker.pass2:
1085*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main
1086*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_2048)]
1087*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 12
1088*c0909341SAndroid Build Coastguard Worker    psubw                m5, m8, m6
1089*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end
1090*c0909341SAndroid Build Coastguard Worker
; 4x16 inverse transforms whose first pass is the identity transform: one
; entry point per second-pass type, followed by the shared identity code.
1091*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, dct
1092*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, adst
1093*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, flipadst
1094*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, identity
1095*c0909341SAndroid Build Coastguard Worker
; Pass 1: the two 64-byte coefficient halves are byte-permuted with
; int16_perm, then each word x is replaced by avg(x, pmulhrsw(x, pw_1697x8)).
; m0 holds all-ones (pd_m1), and the compares with predicate 4 (not-equal)
; build masks of the words that are not -1; the zero-masked vpavgw then
; forces the result to 0 for inputs equal to -1, as explained in the inline
; comment below.
1096*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1097*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(int16_perm)]
1098*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m2, [cq+64*0]
1099*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m2, [cq+64*1]
1100*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_1697x8)]
1101*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pd_m1)]
1102*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4, m1    ; we want to do a signed avg, but pavgw is
1103*c0909341SAndroid Build Coastguard Worker    vpcmpw               k1, m1, m0, 4 ; unsigned. as long as both signs are equal
1104*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m2        ; it still works, but if the input is -1 the
1105*c0909341SAndroid Build Coastguard Worker    vpcmpw               k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
1106*c0909341SAndroid Build Coastguard Worker    vpavgw        m1{k1}{z}, m3        ; pavgw to output -32768 instead of 0 unless
1107*c0909341SAndroid Build Coastguard Worker    vpavgw        m2{k2}{z}, m4        ; we explicitly deal with that case here.
1108*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1109*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1110*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Pass 2: each word becomes 2*x + pmulhrsw(x, pw_1697x16), using saturating
; adds. m5 = pw_2048 is the multiplier applied at iadst_4x16 .end2, which also
; adds the result to the destination and stores it.
1111*c0909341SAndroid Build Coastguard Worker.pass2:
1112*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x16)]
1113*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
1114*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
1115*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
1116*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m0
1117*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m1
1118*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
1119*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
1120*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end2
1121*c0909341SAndroid Build Coastguard Worker
; WRITE_8X4 coef1, coef2, tmp1, tmp2 [, off1, off2, off3]
; Adds two registers of 16-bit residuals to four 8-byte destination rows and
; stores the rows back with unsigned saturation.
;   %1, %2  coefficients: a register number (used as m%1/m%2) or any other
;           operand accepted by paddw (selected by the %ifnum tests)
;   %3, %4  register numbers used as temporaries (clobbered)
;   %5-%7   row offsets from dstq; default strideq*1, strideq*2, r6
; Rows are loaded pairwise: tmp1 = { [dstq], [dstq+%5] },
; tmp2 = { [dstq+%6], [dstq+%7] }.
; NOTE(review): the load/store pairing described here assumes 256-bit m
; registers (INIT_YMM), where packuswb works per 128-bit lane - confirm at
; the call sites.
1122*c0909341SAndroid Build Coastguard Worker%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
1123*c0909341SAndroid Build Coastguard Worker    movq               xm%3, [dstq   ]
1124*c0909341SAndroid Build Coastguard Worker    movhps             xm%3, [dstq+%5]
1125*c0909341SAndroid Build Coastguard Worker    movq               xm%4, [dstq+%6]
1126*c0909341SAndroid Build Coastguard Worker    movhps             xm%4, [dstq+%7]
; Zero-extend the loaded bytes to words.
1127*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%3, xm%3
1128*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%4, xm%4
1129*c0909341SAndroid Build Coastguard Worker%ifnum %1
1130*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m%1
1131*c0909341SAndroid Build Coastguard Worker%else
1132*c0909341SAndroid Build Coastguard Worker    paddw               m%3, %1
1133*c0909341SAndroid Build Coastguard Worker%endif
1134*c0909341SAndroid Build Coastguard Worker%ifnum %2
1135*c0909341SAndroid Build Coastguard Worker    paddw               m%4, m%2
1136*c0909341SAndroid Build Coastguard Worker%else
1137*c0909341SAndroid Build Coastguard Worker    paddw               m%4, %2
1138*c0909341SAndroid Build Coastguard Worker%endif
; After the lane-wise pack, lane 0 holds rows { [dstq], [dstq+%6] } and lane 1
; holds rows { [dstq+%5], [dstq+%7] }, which is why the store order below
; differs from the load order above.
1139*c0909341SAndroid Build Coastguard Worker    packuswb            m%3, m%4
1140*c0909341SAndroid Build Coastguard Worker    vextracti32x4      xm%4, m%3, 1
1141*c0909341SAndroid Build Coastguard Worker    movq          [dstq   ], xm%3
1142*c0909341SAndroid Build Coastguard Worker    movhps        [dstq+%6], xm%3
1143*c0909341SAndroid Build Coastguard Worker    movq          [dstq+%5], xm%4
1144*c0909341SAndroid Build Coastguard Worker    movhps        [dstq+%7], xm%4
1145*c0909341SAndroid Build Coastguard Worker%endmacro
1146*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X4_FN type1, type2
; Instantiates the 8x4 entry point for the given first-pass (type1) and
; second-pass (type2) transform via INV_TXFM_FN (defined outside this chunk).
; For dct_dct it additionally emits a DC-only path: the first coefficient is
; scaled by pw_2896x8 three times and then by pw_2048 (all with pmulhrsw),
; replicated to every word of m0 and m1, and control jumps to the shared
; tail at iadst_8x4 .end3.
; NOTE(review): the body of .end3 lies beyond this chunk; presumably it adds
; m0/m1 to the destination and clears the coefficients - confirm.
1147*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X4_FN 2 ; type1, type2
1148*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 8x4
1149*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
; Only the low word of xm0 matters from here on: it is the one broadcast below.
1150*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
1151*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
1152*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_2048)]
1153*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
1154*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
1155*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
1156*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
1157*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
1158*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x4_internal_8bpc).end3
1159*c0909341SAndroid Build Coastguard Worker%endif
1160*c0909341SAndroid Build Coastguard Worker%endmacro
1161*c0909341SAndroid Build Coastguard Worker
; 8x4 inverse transforms whose first pass is a DCT: one entry point per
; second-pass type, followed by the shared DCT code. Assembled under INIT_YMM.
1162*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl
1163*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, dct
1164*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, adst
1165*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, flipadst
1166*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, identity
1167*c0909341SAndroid Build Coastguard Worker
; Pass 1: the four 16-byte coefficient rows are pre-scaled by pw_2896x8 with
; pmulhrsw and run through idct_4x8 .main (outside this chunk). The four xmm
; results are then merged into two ymm registers
;   m3 = { xm1, xm3 },  m1 = { xm0, xm2 }
; and shuffled (shufps + deint_shuf byte shuffle) into the layout handed to
; the second pass through tx2q.
1168*c0909341SAndroid Build Coastguard Workercglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1169*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [o(pw_2896x8)]
1170*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3, [cq+16*0]
1171*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm3, [cq+16*1]
1172*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm3, [cq+16*2]
1173*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3,      [cq+16*3]
1174*c0909341SAndroid Build Coastguard Worker    call m(idct_4x8_internal_8bpc).main
1175*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [o(deint_shuf)]
1176*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, m1, xm3, 1
1177*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, m0, xm2, 1
1178*c0909341SAndroid Build Coastguard Worker    shufps               m0, m1, m3, q0220
1179*c0909341SAndroid Build Coastguard Worker    shufps               m1, m3, q1331
1180*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1181*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1182*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Pass 2: 4-point DCT via IDCT4_1D_PACKED (defined outside this chunk), then
; qword reordering of m0/m1 before the shared tail at iadst_8x4 .end2, which
; multiplies both registers by pw_2048 with pmulhrsw and falls through to
; .end3.
1183*c0909341SAndroid Build Coastguard Worker.pass2:
1184*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
1185*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1186*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q2031
1187*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x4_internal_8bpc).end2
1188*c0909341SAndroid Build Coastguard Worker
; 8x4 inverse-transform instantiations whose first listed type is adst.
; NOTE(review): INV_TXFM_8X4_FN is defined outside this excerpt; presumably
; each invocation emits one public entry point that dispatches into the
; *_internal routine below - confirm against the macro definition.
1189*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, dct
1190*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, adst
1191*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, flipadst
1192*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, identity
1193*c0909341SAndroid Build Coastguard Worker
; iadst_8x4_internal_8bpc
;   Pass 1 (code right after the cglobal line): reads the four 16-byte
;   coefficient rows at cq, scales them by pw_2896x8, runs the 4x8 ADST
;   helper, interleaves the result into m0/m1 and jumps through tx2q.
;   Pass 2 (.pass2): runs IADST4_1D_PACKED via .main and falls into the
;   .end/.end2/.end3 epilogue. The other 8x4 routines in this file jump
;   into that epilogue as well, so its labels are a shared interface.
1194*c0909341SAndroid Build Coastguard Workercglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1195*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [o(pw_2896x8)]
    ; Rows 0 and 1 are loaded with their 64-bit halves swapped (q1032);
    ; rows 2 and 3 are scaled straight from memory.
1196*c0909341SAndroid Build Coastguard Worker    pshufd              xm4,      [cq+16*0], q1032
1197*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3, xm0, [cq+16*3]
1198*c0909341SAndroid Build Coastguard Worker    pshufd              xm5,      [cq+16*1], q1032
1199*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm0, [cq+16*2]
1200*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm0
1201*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm0
    ; The helper is defined outside this excerpt; the code below consumes
    ; its results from xm0..xm3.
1202*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x8_internal_8bpc).main_pass1
    ; Pack the four 128-bit results into two 256-bit registers.
1203*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, xm2, 1
1204*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, xm3, 1
    ; Word-interleave m0 with m1; the high interleave is negated
    ; (m3 = 0 - m2, signed saturation) before the second interleave.
1205*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
1206*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
1207*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1208*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m2
1209*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
1210*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
    ; NOTE(review): tx2q presumably holds the second-pass address chosen by
    ; the entry point - confirm against INV_TXFM_FN.
1211*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1212*c0909341SAndroid Build Coastguard Worker.pass2:
1213*c0909341SAndroid Build Coastguard Worker    call .main
; .end: swap the two middle 64-bit lanes of m0 and m1 (q3120).
1214*c0909341SAndroid Build Coastguard Worker.end:
1215*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1216*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120
; .end2: final scaling of both result registers by pw_2048 via pmulhrsw
; (a rounding shift right by 4 if the constant is the word value its name
; suggests - confirm against the constant table).
1217*c0909341SAndroid Build Coastguard Worker.end2:
1218*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2048)]
1219*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
1220*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
; .end3: clear the coefficient buffer and write the block out.
1221*c0909341SAndroid Build Coastguard Worker.end3:
1222*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
    ; NOTE(review): the store names zmm18 rather than m2. This looks like it
    ; relies on the AVX-512 register mapping of x86inc making m2 the low
    ; part of zmm18 (with the pxor above zeroing the full register), so that
    ; one 64-byte store clears all 8x4 coefficients - confirm.
1223*c0909341SAndroid Build Coastguard Worker    mova               [cq], zmm18
    ; r6 = 3*stride; WRITE_8X4 is defined outside this excerpt and
    ; presumably uses r6 to address the fourth row - confirm.
1224*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
1225*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0, 1, 4, 5
1226*c0909341SAndroid Build Coastguard Worker    RET
1227*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: callable wrapper around the IADST4_1D_PACKED macro (defined outside
; this excerpt); also called by the flipadst 8x4 second pass.
1228*c0909341SAndroid Build Coastguard Worker.main:
1229*c0909341SAndroid Build Coastguard Worker    IADST4_1D_PACKED
1230*c0909341SAndroid Build Coastguard Worker    ret
1231*c0909341SAndroid Build Coastguard Worker
; 8x4 inverse-transform instantiations whose first listed type is flipadst.
; NOTE(review): INV_TXFM_8X4_FN is defined outside this excerpt - see its
; definition for what each invocation emits.
1232*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, dct
1233*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, adst
1234*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, flipadst
1235*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, identity
1236*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x4_internal_8bpc
;   Pass 1 uses the same load/scale sequence and the same 4x8 ADST helper as
;   iadst_8x4_internal_8bpc, but combines the helper outputs in the opposite
;   register order (m3/m2 first, m1/m0 inserted on top) and negates the
;   other interleave half.
;   Pass 2 reuses the ADST .main, exchanges m0/m1 while permuting their
;   64-bit lanes, and finishes in the ADST routine's .end2 epilogue.
1237*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1238*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [o(pw_2896x8)]
1239*c0909341SAndroid Build Coastguard Worker    pshufd              xm4,      [cq+16*0], q1032
1240*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3, xm0, [cq+16*3]
1241*c0909341SAndroid Build Coastguard Worker    pshufd              xm5,      [cq+16*1], q1032
1242*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm0, [cq+16*2]
1243*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm0
1244*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm0
1245*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x8_internal_8bpc).main_pass1
    ; m3 = {xm3, xm1}, m2 = {xm2, xm0} (low half, high half).
1246*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, m3, xm1, 1
1247*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, m2, xm0, 1
1248*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3, m2
1249*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2
    ; m0 = 0 - m1 with signed saturation (negated high interleave).
1250*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
1251*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m1
1252*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
1253*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
1254*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1255*c0909341SAndroid Build Coastguard Worker.pass2:
1256*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_8bpc).main
    ; Swap m0 and m1 through m2 while applying the q2031 lane permutation,
    ; then skip the ADST routine's own lane permutation by entering at .end2.
1257*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
1258*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m0, q2031
1259*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m2, q2031
1260*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x4_internal_8bpc).end2
1261*c0909341SAndroid Build Coastguard Worker
; 8x4 inverse-transform instantiations whose first listed type is identity.
; NOTE(review): INV_TXFM_8X4_FN is defined outside this excerpt - see its
; definition for what each invocation emits.
1262*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, dct
1263*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, adst
1264*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, flipadst
1265*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, identity
1266*c0909341SAndroid Build Coastguard Worker
; iidentity_8x4_internal_8bpc
;   Pass 1: gathers the four 16-byte rows at cq into two 256-bit registers
;   (rows 0/2 in m2, rows 1/3 in m0), word-interleaves them twice with a
;   pw_2896x8 scaling in between, and doubles the result with saturation.
;   Pass 2: adds pmulhrsw(x, pw_1697x8) to each value, then finishes in the
;   ADST 8x4 epilogue starting at .end.
1267*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1268*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*0]
1269*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16*1]
1270*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [cq+16*2], 1
1271*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [cq+16*3], 1
1272*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_2896x8)]
1273*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0
1274*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0
1275*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1276*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1277*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
1278*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
    ; x + x with signed saturation, i.e. a saturating multiply by two.
1279*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m0
1280*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m1
1281*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1282*c0909341SAndroid Build Coastguard Worker.pass2:
    ; m0 += pmulhrsw(m0, pw_1697x8); m1 += pmulhrsw(m1, pw_1697x8),
    ; both additions saturating.
1283*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x8)]
1284*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
1285*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
1286*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
1287*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
1288*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x4_internal_8bpc).end
1289*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X8_FN type1, type2
;   Instantiates an 8x8 transform through INV_TXFM_FN (defined outside this
;   excerpt). For the dct_dct pair only, it also emits a DC-only path that
;   adds one constant to every pixel instead of running the transform. The
;   .dconly label of the dct_dct instance is reused by the 8x16 macro below.
;   The DC path is assembled in ZMM mode; YMM mode is restored afterwards.
1290*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X8_FN 2 ; type1, type2
1291*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 8x8
1292*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
1293*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
    ; r6d = sign-extended first coefficient; the coefficient slot is then
    ; overwritten with eobd.
    ; NOTE(review): presumably eob is zero whenever this path is reached, so
    ; the store clears the coefficient - confirm against INV_TXFM_FN.
1294*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
1295*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
1296*c0909341SAndroid Build Coastguard Worker.dconly:
    ; r6d = (r6d*181 + 128 + 256) >> 9
1297*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
1298*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+256
1299*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+1
1300*c0909341SAndroid Build Coastguard Worker.dconly2:
    ; ym5 = stride * pd_0to15 (first eight dwords): byte offsets used as
    ; gather/scatter indices. The scalar work is interleaved with it:
    ; r6d = (r6d*181 + 128 + 2048) >> 12.
1301*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, strided
1302*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
1303*c0909341SAndroid Build Coastguard Worker    pmulld              ym5, ym2, [o(pd_0to15)]
    ; k1 = all ones in its low 8 bits (one bit per gathered qword).
1304*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1
1305*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+2048
1306*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+4
1307*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
    ; m4 = DC offset broadcast to every 16-bit lane.
1308*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m4, r6d
1309*c0909341SAndroid Build Coastguard Worker.dconly_loop:
    ; Gather and scatter clear their mask register as they complete, so the
    ; all-ones mask is copied back and forth between k1 and k2.
1310*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
    ; Fetch eight 8-byte groups from dstq + ym5[i].
1311*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m2{k1}, [dstq+ym5]
    ; Widen bytes to words against zero, add the DC offset, and pack back to
    ; bytes with unsigned saturation.
1312*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m2, m3
1313*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2, m3
1314*c0909341SAndroid Build Coastguard Worker    paddw                m0, m4
1315*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
1316*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1317*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
1318*c0909341SAndroid Build Coastguard Worker    vpscatterdq [dstq+ym5]{k2}, m0
    ; Advance by eight strides and loop while the counter in r3d stays
    ; positive after subtracting 8.
    ; NOTE(review): r3d presumably aliases eobd, so the 8x8 case runs the
    ; loop once and callers that OR a height into r3d run it more often -
    ; confirm.
1319*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
1320*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 8
1321*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
1322*c0909341SAndroid Build Coastguard Worker    RET
1323*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl
1324*c0909341SAndroid Build Coastguard Worker%endif
1325*c0909341SAndroid Build Coastguard Worker%endmacro
1326*c0909341SAndroid Build Coastguard Worker
; 8x8 inverse-transform instantiations whose first listed type is dct
; (the dct, dct one also carries the DC-only path from INV_TXFM_8X8_FN).
1327*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, dct
1328*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, identity
1329*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, adst
1330*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, flipadst
1331*c0909341SAndroid Build Coastguard Worker
; idct_8x8_internal_8bpc
;   Pass 1: loads the four 32-byte coefficient vectors at cq with their
;   middle 64-bit lanes swapped (two rows per register, see the per-line
;   comments), runs .main, then shuffles, scales by pw_16384 and regroups
;   128-bit halves into m0..m3 before jumping through tx2q.
;   Pass 2: runs .main again, loads pw_2048 into m4, permutes lanes and
;   finishes in the ADST 8x8 epilogue at .end2, which uses m4 as the scale.
1332*c0909341SAndroid Build Coastguard Workercglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1333*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120 ; 0 1
1334*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q3120 ; 6 7
1335*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q3120 ; 4 5
1336*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120 ; 2 3
1337*c0909341SAndroid Build Coastguard Worker    call .main
    ; Dword shuffles across register pairs, followed by a byte shuffle with
    ; the deint_shuf table and a pmulhrsw by pw_16384 on all four results.
1338*c0909341SAndroid Build Coastguard Worker    shufps               m4, m0, m1, q0220
1339*c0909341SAndroid Build Coastguard Worker    shufps               m5, m0, m1, q1331
1340*c0909341SAndroid Build Coastguard Worker    shufps               m1, m2, m3, q0220
1341*c0909341SAndroid Build Coastguard Worker    shufps               m3, m2, m3, q1331
1342*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m0, [o(deint_shuf)]
1343*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_16384)]
1344*c0909341SAndroid Build Coastguard Worker    REPX   {pshufb   x, m0}, m4, m5, m1, m3
1345*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
    ; m0 = {low m4, low m1}, m2 = {high m4, high m1};
    ; m1 = {low m5, low m3}, m3 = {high m5, high m3}.
1346*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, m4, xm1, 1
1347*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m4, m1, 0x03
1348*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, m5, xm3, 1
1349*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m5, m3, 0x03
1350*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1351*c0909341SAndroid Build Coastguard Worker.pass2:
1352*c0909341SAndroid Build Coastguard Worker    call .main
    ; m4 must hold the output scale before entering the shared epilogue.
1353*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
1354*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1355*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q2031
1356*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120
1357*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q2031
1358*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_8bpc).end2
1359*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: callable wrapper around the IDCT8_1D_PACKED macro (defined outside
; this excerpt). Declared with cglobal_label rather than as a plain local
; label.
1360*c0909341SAndroid Build Coastguard Workercglobal_label .main
1361*c0909341SAndroid Build Coastguard Worker    IDCT8_1D_PACKED
1362*c0909341SAndroid Build Coastguard Worker    ret
1363*c0909341SAndroid Build Coastguard Worker
; 8x8 inverse-transform instantiations whose first listed type is adst.
1364*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, dct
1365*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, adst
1366*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, flipadst
1367*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, identity
1368*c0909341SAndroid Build Coastguard Worker
; iadst_8x8_internal_8bpc
;   Pass 1: loads the four 32-byte coefficient vectors with lane
;   permutations (see per-line comments), runs .main_pass1, transposes with
;   word unpacks, scales by pw_16384_m16384 and regroups 128-bit halves.
;   Pass 2: runs .main_pass2 and builds a per-half scale in m4.
;   The .end/.end2/.end3/.end4 epilogue is shared: the dct, flipadst and
;   identity 8x8 routines in this file jump into it with m4 holding the
;   scale they want applied.
1369*c0909341SAndroid Build Coastguard Workercglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1370*c0909341SAndroid Build Coastguard Worker    vpermq               m4, [cq+32*0], q1302 ; 1 0
1371*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q3120 ; 6 7
1372*c0909341SAndroid Build Coastguard Worker    vpermq               m5, [cq+32*1], q1302 ; 3 2
1373*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q3120 ; 4 5
1374*c0909341SAndroid Build Coastguard Worker    call .main_pass1
    ; Alternating-sign scale: per the comments in .main_pass1 some outputs
    ; are produced negated, and the constant's name suggests +16384/-16384
    ; word pairs that undo that while scaling (TODO confirm constant).
1375*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_16384_m16384)]
1376*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0, m1
1377*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
1378*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m3
1379*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3
1380*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0
1381*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
1382*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
1383*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
1384*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m3, m4, m0, m1
    ; Regroup 128-bit halves: m0 = {low m3, low m0}, m2 = {high m3, high m0},
    ; m1 = {low m4, low m1}, m3 = {high m4, high m1}.
1385*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m3, m0, 0x03
1386*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, m3, xm0, 1
1387*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m4, m1, 0x03
1388*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, m4, xm1, 1
1389*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1390*c0909341SAndroid Build Coastguard Worker.pass2:
    ; m4/m5 = m0/m1 with the dwords of each 64-bit group swapped in pairs
    ; (q1032), as inputs for .main_pass2.
1391*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032
1392*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q1032
1393*c0909341SAndroid Build Coastguard Worker    call .main_pass2
    ; The xmm-sized broadcast leaves the upper 128 bits of m4 zero, so the
    ; subtraction yields different scales for the two halves.
1394*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
1395*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [o(pw_4096)]
1396*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5 ; lower half = 2048, upper half = -2048
; .end: swap the middle 64-bit lanes of all four result registers.
1397*c0909341SAndroid Build Coastguard Worker.end:
1398*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
; .end2: scale m0/m1 by m4.
1399*c0909341SAndroid Build Coastguard Worker.end2:
1400*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
1401*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
; .end3: scale m2/m3 by m4 (entered directly by callers that have already
; scaled m0/m1 themselves).
1402*c0909341SAndroid Build Coastguard Worker.end3:
1403*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
1404*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
; .end4: zero the 4*32 = 128-byte coefficient buffer, then write the result
; in two groups of four rows (m0/m1, then m2/m3 four strides further down).
1405*c0909341SAndroid Build Coastguard Worker.end4:
1406*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1407*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m4
1408*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m4
1409*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m4
1410*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*3], m4
    ; r6 = 3*stride; WRITE_8X4 is defined outside this excerpt and
    ; presumably uses r6 for the fourth row of each group - confirm.
1411*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq*3]
1412*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0, 1, 4, 5
1413*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1414*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             2, 3, 4, 5
1415*c0909341SAndroid Build Coastguard Worker    RET
1416*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_pass1: pairs up the input rows by word interleaving (pairings in the
; per-line comments), runs IADST8_1D_PACKED in its pass-1 form, and
; rearranges qwords so m0 and m3 hold the outputs noted below. Also called
; by the flipadst 8x8 routine.
1417*c0909341SAndroid Build Coastguard Worker.main_pass1:
1418*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m4, m3 ; 0 7
1419*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m5, m2 ; 2 5
1420*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5     ; 4 3
1421*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4     ; 6 1
1422*c0909341SAndroid Build Coastguard Worker    IADST8_1D_PACKED 1
1423*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m4, m0        ; out6 -out7
1424*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m4            ; out0 -out1
1425*c0909341SAndroid Build Coastguard Worker    ret
1426*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_pass2: callable wrapper around the pass-2 form of IADST8_1D_PACKED
; (macro defined outside this excerpt).
1427*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2
1428*c0909341SAndroid Build Coastguard Worker    IADST8_1D_PACKED 2
1429*c0909341SAndroid Build Coastguard Worker    ret
1430*c0909341SAndroid Build Coastguard Worker
; 8x8 inverse-transform instantiations whose first listed type is flipadst.
1431*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, dct
1432*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, adst
1433*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, flipadst
1434*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, identity
1435*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x8_internal_8bpc
;   Pass 1: same loads and ADST helper as iadst_8x8_internal_8bpc, but the
;   transpose consumes the helper outputs in reverse register order
;   (m3, m2, m1, m0) and the scale constant is pw_m16384_16384.
;   Pass 2: runs the ADST .main_pass2, builds the opposite per-half scale in
;   m4, rotates the results through m5 while permuting lanes, scales m0/m1
;   itself and joins the ADST epilogue at .end3 for m2/m3.
1436*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1437*c0909341SAndroid Build Coastguard Worker    vpermq               m4, [cq+32*0], q1302 ; 1 0
1438*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q3120 ; 6 7
1439*c0909341SAndroid Build Coastguard Worker    vpermq               m5, [cq+32*1], q1302 ; 3 2
1440*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q3120 ; 4 5
1441*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_8bpc).main_pass1
1442*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_m16384_16384)]
1443*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m2
1444*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2
1445*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m0
1446*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0
1447*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m4, m3
1448*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m3
1449*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m1
1450*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1
1451*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m4, m3, m2
    ; Regroup 128-bit halves: m1 = {low m0, low m3}, m3 = {high m0, high m3},
    ; m0 = {low m4, low m2}, m2 = {high m4, high m2}.
1452*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, m0, xm3, 1
1453*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m0, m3, 0x03
1454*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, m4, xm2, 1
1455*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m4, m2, 0x03
1456*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1457*c0909341SAndroid Build Coastguard Worker.pass2:
1458*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032
1459*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q1032
1460*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_8bpc).main_pass2
    ; The xmm-sized broadcast leaves the upper 128 bits of m5 zero, so the
    ; subtraction only affects the lower half of m4.
1461*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
1462*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [o(pw_4096)]
1463*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5 ; lower half = -2048, upper half = 2048
    ; Reverse the register order (old m3, m2, m1, m0 end up in m0, m1, m2,
    ; m3) while applying the q2031 lane permutation; m5 and m0 act as
    ; temporaries until the two pmulhrsw below land the final m1 and m0.
1464*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m3, q2031
1465*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m0, q2031
1466*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m2, q2031
1467*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m1, q2031
1468*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m0, m4
1469*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5, m4
    ; m0/m1 are already scaled, so enter the epilogue after its .end2 step.
1470*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_8bpc).end3
1471*c0909341SAndroid Build Coastguard Worker
; 8x8 inverse-transform instantiations whose first listed type is identity.
1472*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, dct
1473*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, adst
1474*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, flipadst
1475*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, identity
1476*c0909341SAndroid Build Coastguard Worker
; iidentity_8x8_internal_8bpc
;   Pass 1: loads the eight 16-byte coefficient rows at cq, pairing row n
;   with row n+4 in one 256-bit register, and rearranges them with word and
;   dword unpacks. No arithmetic is applied to the values in this pass.
;   Pass 2: only selects the output scale (pw_4096 in m4) and enters the
;   ADST 8x8 epilogue at .end, which permutes, scales, clears the
;   coefficient buffer and writes the block.
1477*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    ; m3 = rows 0|4, m2 = rows 1|5, m4 = rows 2|6, m0 = rows 3|7
    ; (low half | high half).
1478*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+16*0]
1479*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*1]
1480*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, [cq+16*4], 1
1481*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m2, [cq+16*5], 1
1482*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+16*2]
1483*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16*3]
1484*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m4, [cq+16*6], 1
1485*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m0, [cq+16*7], 1
    ; Word interleave of neighbouring rows, then dword interleave of the
    ; resulting pairs; results land in m0..m3.
1486*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
1487*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
1488*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0
1489*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
1490*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1491*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1492*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1493*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
1494*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1495*c0909341SAndroid Build Coastguard Worker.pass2:
1496*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_4096)]
1497*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_8bpc).end
1498*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X16_FN type1, type2
;   Instantiates an 8x16 transform through INV_TXFM_FN (defined outside this
;   excerpt). For the dct_dct pair only, it adds a DC-only path that
;   pre-scales the DC value and then reuses the 8x8 DC-only code.
1499*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X16_FN 2 ; type1, type2
1500*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 8x16
1501*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
    ; r6d = sign-extended first coefficient; the slot is then overwritten
    ; with eobd (presumably zero on this path - confirm in INV_TXFM_FN).
1502*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
1503*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
    ; Set bit 4 of r3d; the shared loop subtracts 8 from r3d per iteration.
    ; NOTE(review): with r3d == 0 beforehand this gives 16, i.e. two
    ; iterations of eight rows - confirm that r3d is zero here.
    ; This must stay after the store above if r3d aliases eobd.
1504*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
    ; r6d = (r6d*181 + 128) >> 8, an extra scaling step applied before the
    ; two scaling steps of the shared 8x8 .dconly code.
1505*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
1506*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
1507*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8
1508*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
1509*c0909341SAndroid Build Coastguard Worker%endif
1510*c0909341SAndroid Build Coastguard Worker%endmacro
1511*c0909341SAndroid Build Coastguard Worker
; ITX_8X16_LOAD_COEFS
;   Loads eight coefficient vectors spaced 32 bytes apart, starting at cq,
;   into m0..m7, each multiplied by pw_2896x8 with pmulhrsw: register mN
;   receives the vector at original offset 32*N.
;   Side effect: cq is advanced by 32*4 bytes and is NOT restored, so code
;   following the macro must address the buffer relative to the new cq.
;   m4 doubles as the constant register and is overwritten last.
;   NOTE(review): the mid-buffer bias presumably keeps all displacements
;   small and symmetric for shorter encodings - confirm intent.
1512*c0909341SAndroid Build Coastguard Worker%macro ITX_8X16_LOAD_COEFS 0
1513*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2896x8)]
1514*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4, [cq+32*0]
1515*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*4
1516*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m4, [cq+32*3]
1517*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4, [cq-32*3]
1518*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m4, [cq+32*2]
1519*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, [cq-32*2]
1520*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m4, [cq+32*1]
1521*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4, [cq-32*1]
1522*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4,     [cq+32*0]
1523*c0909341SAndroid Build Coastguard Worker%endmacro
1524*c0909341SAndroid Build Coastguard Worker
; From here on the code is assembled in ZMM mode (mN are 512-bit registers).
; 8x16 inverse-transform instantiations whose first listed type is dct.
1525*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
1526*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, dct
1527*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, identity
1528*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, adst
1529*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, flipadst
1530*c0909341SAndroid Build Coastguard Worker
; idct_8x16_internal_8bpc
;   Pass 1: loads the four 64-byte coefficient vectors at cq through a qword
;   permutation taken from the permB table, scales them by pw_2896x8, runs
;   the 16x8 IDCT main routine (outside this excerpt), scales by pw_16384
;   and transposes with word/dword unpacks (layouts in per-line comments).
;   Pass 2: reorders the data for .main2 (defined later in the file, outside
;   this excerpt), then permutes the results for output.
;   The .end/.end2/.end3/.end4 labels form an epilogue that scales by
;   pw_2048, clears the coefficient buffer and adds the residual to the
;   destination using gathers and scatters.
1531*c0909341SAndroid Build Coastguard Workercglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    ; m3 holds the permutation indices until its own load overwrites it.
1532*c0909341SAndroid Build Coastguard Worker    mova                 m3, [o(permB)]
1533*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m3, [cq+64*0]
1534*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2896x8)]
1535*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m3, [cq+64*1]
1536*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m3, [cq+64*2]
1537*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, [cq+64*3]
1538*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
1539*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main
1540*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_16384)]
1541*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
1542*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2     ; a0 e0 a1 e1 a2 e2 a3 e3
1543*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
1544*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3     ; d0 h0 d1 h1 d2 h2 d3 h3
1545*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m4, m0, m2, m1
1546*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
1547*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4     ; a0 b0 e0 f0 a1 b1 e1 f1
1548*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
1549*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1     ; c0 d0 g0 h0 c1 d1 g1 h1
1550*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2 ;  1  5  9 13
1551*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2     ;  0  4  8 12
1552*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4 ;  2  6 10 14
1553*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4     ;  3  7 11 15
1554*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1555*c0909341SAndroid Build Coastguard Worker.pass2:
    ; m5 = int16_perm with every dword rotated by 16 bits; used below as the
    ; byte-permutation index for vpermb.
1556*c0909341SAndroid Build Coastguard Worker    vprord               m5, [o(int16_perm)], 16
    ; Reorder 128-bit lanes (resulting row order in the per-line comments).
1557*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m2, q1320     ;  2 10 14  6
1558*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m1, m3, q2310 ;  1  5 15 11
1559*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m3, q0132     ;  9 13  7  3
1560*c0909341SAndroid Build Coastguard Worker    vpermb               m9, m5, m0
1561*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m5, m2
1562*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m5, m4
1563*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m5, m1
    ; Split off the upper 256 bits of each permuted vector into ym6, ym3,
    ; ym5 and ym1; the lower halves stay in m9, m7, m8 and m0.
1564*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym6, m9, 1
1565*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m7, 1
1566*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym5, m8, 1
1567*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym1, m0, 1
1568*c0909341SAndroid Build Coastguard Worker    call .main2
    ; ym8 = gather8a table; m9 = the same dwords zero-extended to qwords and
    ; used as the vpermt2q index that merges m0..m3 with m4..m7. ym8 itself
    ; is dword-shuffled and later becomes the gather/scatter index in .end4.
    ; r3 = dst + 4*stride, the base address of the second gather/scatter
    ; (this overwrites the eob argument register).
1569*c0909341SAndroid Build Coastguard Worker    mova                ym8, [o(gather8a)]
1570*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*4]
1571*c0909341SAndroid Build Coastguard Worker    pmovzxdq             m9, ym8
1572*c0909341SAndroid Build Coastguard Worker    pshufd              ym8, ym8, q1230
1573*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m9, m4
1574*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m9, m5
1575*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m9, m6
1576*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m9, m7
; .end: load the output scale into m7.
1577*c0909341SAndroid Build Coastguard Worker.end:
1578*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_2048)]
; .end2: scale m0/m1 by m7.
1579*c0909341SAndroid Build Coastguard Worker.end2:
1580*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
1581*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
; .end3: scale m2/m3 by m7.
1582*c0909341SAndroid Build Coastguard Worker.end3:
1583*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7
1584*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m7
; .end4: expects residuals in m0..m3, row indices in ym8 and r3 pointing at
; the second destination base. Gathers destination pixels, clears the
; 4*64 = 256-byte coefficient buffer, adds the residuals and scatters the
; result back.
1585*c0909341SAndroid Build Coastguard Worker.end4:
    ; ym8 = row indices * stride = byte offsets for gather/scatter.
1586*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym6, strided
1587*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1
1588*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1589*c0909341SAndroid Build Coastguard Worker    pmulld              ym8, ym6
    ; Gathers and scatters clear their mask register on completion, so the
    ; all-ones mask is copied between k1 and k2 before each use.
1590*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
1591*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m6{k1}, [dstq+ym8]
1592*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
1593*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m7{k2}, [r3+ym8]
    ; Zero the coefficient buffer (interleaved with the mask copy).
1594*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*0], m4
1595*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*1], m4
1596*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
1597*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*2], m4
1598*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*3], m4
    ; First group: widen gathered bytes to words against zero, add the
    ; residuals, pack with unsigned saturation and scatter to dstq.
1599*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6, m4
1600*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m4
1601*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5
1602*c0909341SAndroid Build Coastguard Worker    paddw                m1, m6
1603*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1604*c0909341SAndroid Build Coastguard Worker    vpscatterdq [dstq+ym8]{k1}, m0
    ; Second group: same sequence for the rows addressed from r3.
1605*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m7, m4
1606*c0909341SAndroid Build Coastguard Worker    punpckhbw            m7, m4
1607*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
1608*c0909341SAndroid Build Coastguard Worker    paddw                m3, m7
1609*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
1610*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r3+ym8]{k2}, m2
1611*c0909341SAndroid Build Coastguard Worker    RET
; Reduced-input entry to the 16-point IDCT core (original note: the bottom
; three-quarters of the input are zero). Only ym0 and ym1 are read as inputs.
; It loads ym10, ym13 and k7 itself and then joins the shared code at .main5,
; a label not visible in this chunk (presumably emitted by IDCT16_1D_PACKED
; under .main -- confirm against the macro definition).
1612*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1613*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast2 ; bottom three-quarters are zero
1614*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym10, [o(pd_2048)]
1615*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ym13, [o(int_mshift)]
1616*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym3, [o(pw_401_4076x8)]
1617*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym5, [o(pw_799_4017x8)]
1618*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym4, [o(pw_m1189_3920x8)]
1619*c0909341SAndroid Build Coastguard Worker    pxor                ym6, ym6 ; ym6 = 0
1620*c0909341SAndroid Build Coastguard Worker    punpckhwd           ym2, ym0, ym0
1621*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym2, ym3      ; t8a  t15a
1622*c0909341SAndroid Build Coastguard Worker    punpcklwd           ym7, ym1, ym1
1623*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym7, ym5      ; t4a  t7a
1624*c0909341SAndroid Build Coastguard Worker    punpckhwd           ym1, ym1
1625*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym4, ym1      ; t11a t12a
1626*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, ym13, ym10, 6 ; predicate 6: unsigned byte ym13 > ym10
1627*c0909341SAndroid Build Coastguard Worker    punpcklwd           ym9, ym6, ym0 ; low words of ym0 placed in the high half of each dword
1628*c0909341SAndroid Build Coastguard Worker    psubsw              ym0, ym2, ym4 ; t11a t12a
1629*c0909341SAndroid Build Coastguard Worker    paddsw              ym8, ym2, ym4 ; t8a  t15a
1630*c0909341SAndroid Build Coastguard Worker    mova                ym1, ym7
1631*c0909341SAndroid Build Coastguard Worker    jmp .main5
; Reduced-input entry to the 16-point IDCT core (original note: the bottom
; half of the input is zero). Reads ym0-ym3, duplicates or zero-interleaves
; their words, applies the first-stage multipliers with pmulhrsw and then
; joins the shared code at .main4, a label not visible in this chunk
; (presumably emitted by IDCT16_1D_PACKED under .main -- confirm).
1632*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1633*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast ; bottom half is zero
1634*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym10, [o(pd_2048)]
1635*c0909341SAndroid Build Coastguard Worker    vpbroadcastq       ym13, [o(int_mshift)]
1636*c0909341SAndroid Build Coastguard Worker    pxor                ym6, ym6 ; ym6 = 0
1637*c0909341SAndroid Build Coastguard Worker    punpckhwd           ym8, ym0, ym0
1638*c0909341SAndroid Build Coastguard Worker    punpckhwd           ym4, ym3, ym3
1639*c0909341SAndroid Build Coastguard Worker    punpckhwd           ym5, ym2, ym2
1640*c0909341SAndroid Build Coastguard Worker    punpcklwd           ym7, ym1, ym1
1641*c0909341SAndroid Build Coastguard Worker    punpckhwd           ym1, ym1
1642*c0909341SAndroid Build Coastguard Worker    punpcklwd           ym3, ym3
1643*c0909341SAndroid Build Coastguard Worker    punpcklwd           ym9, ym6, ym0 ; low words of ym0 placed in the high half of each dword
1644*c0909341SAndroid Build Coastguard Worker    punpcklwd           ym6, ym2 ; same for ym2; ym6 was zero before this line
1645*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [o(pw_401_4076x8)]
1646*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym0, [o(pw_m2598_3166x8)]
1647*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym11, [o(pw_1931_3612x8)]
1648*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       ym12, [o(pw_m1189_3920x8)]
1649*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym8, ym2  ; t8a  t15a
1650*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym2, [o(pw_799_4017x8)]
1651*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym0, ym4  ; t9a  t14a
1652*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym4, [o(pw_m2276_3406x8)]
1653*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym5, ym11 ; t10a t13a
1654*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym12 ; t11a t12a
1655*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym7, ym2  ; t4a  t7a
1656*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym3, ym4  ; t5a  t6a
1657*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, ym13, ym10, 6 ; predicate 6: unsigned byte ym13 > ym10
1658*c0909341SAndroid Build Coastguard Worker    jmp .main4
; Full-input entry: expands IDCT16_1D_PACKED through WRAP_YMM and returns to
; the caller. Both macros are defined outside this chunk. The .main4/.main5
; labels targeted by .main_fast/.main_fast2 are presumably part of this
; expansion -- confirm against the macro definition.
1659*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1660*c0909341SAndroid Build Coastguard Workercglobal_label .main
1661*c0909341SAndroid Build Coastguard Worker    WRAP_YMM IDCT16_1D_PACKED
1662*c0909341SAndroid Build Coastguard Worker    ret
1663*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points with type1 = adst (INV_TXFM_8X16_FN is
; defined outside this chunk).
1664*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, dct
1665*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, adst
1666*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, flipadst
1667*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, identity
1668*c0909341SAndroid Build Coastguard Worker
; iadst_8x16_internal_8bpc(dst, stride, c, eob, tx2)
; First pass: runs the 16x8 ADST helper, interleaves and scales its outputs,
; transposes them into m0-m3 and jumps to the second-pass entry held in tx2q.
; .pass1_end is also the join point for iflipadst_8x16_internal_8bpc.
; Second pass (.pass2): runs .main_pass2, prepares scale factors in m6/m7 and
; the permute indices in m10, then .pass2_end reorders the outputs and hands
; over to the shared store tail of idct_8x16_internal_8bpc at .end3.
1669*c0909341SAndroid Build Coastguard Workercglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1670*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main_pass1 ; the 16x8 helper, not the .main_pass1 defined below
1671*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [o(int_shuf1)]
1672*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_16384_m16384)]
1673*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
1674*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
1675*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
1676*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6     ; e0 f0 e1 f1 e2 f2 e3 f3
1677*c0909341SAndroid Build Coastguard Worker.pass1_end:
1678*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m3, m5, m4, m2
1679*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
1680*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m5     ; a2 b2 c2 d2 a3 b3 c3 d3
1681*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
1682*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4     ; e0 f0 g0 h0 e1 f1 g1 h1
1683*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
1684*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
1685*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m5
1686*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m5
1687*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; continue at the second-pass entry held in tx2q
1688*c0909341SAndroid Build Coastguard Worker.pass2:
1689*c0909341SAndroid Build Coastguard Worker    call .main_pass2
1690*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_2048)]
1691*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 4 ; step the packed permute indices left in m10 by .main_pass2
1692*c0909341SAndroid Build Coastguard Worker    psubw                m7, m8, m6 ; m7 = m8 - m6 (m8 is cleared with pxor in .main2)
1693*c0909341SAndroid Build Coastguard Worker.pass2_end:
1694*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896x8)]
1695*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m4
1696*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m4
1697*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5      ; -out7   out4   out6  -out5
1698*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m2      ;  out8  -out11 -out9   out10
1699*c0909341SAndroid Build Coastguard Worker    mova                ym8, [o(gather8c)] ; row indices consumed by the gather/scatter at .end4
1700*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq] ; second gather/scatter base: one row below dst
1701*c0909341SAndroid Build Coastguard Worker    psrlq                m2, m10, 4
1702*c0909341SAndroid Build Coastguard Worker    vpermi2q             m2, m0, m3  ;  1  3 13 15
1703*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m10, m3 ;  0  2 12 14
1704*c0909341SAndroid Build Coastguard Worker    psrlq                m3, m10, 8
1705*c0909341SAndroid Build Coastguard Worker    vpermi2q             m3, m1, m5  ;  5  7  9 11
1706*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 12
1707*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m5 ;  4  6  8 10
1708*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6 ; m0/m1 use m6; m2/m3 are scaled by m7 at .end3
1709*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1710*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_8bpc).end3
; .main_pass1: loads the four 64-byte coefficient rows at cq, scales each by
; pw_2896x8 with pmulhrsw, permutes them into the input order expected by
; .main and tail-jumps there (so .main's ret returns to this helper's caller).
; It is not called from the code visible in this chunk; the first pass of
; iadst_8x16 above uses the iadst_16x8 variant instead.
1711*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1712*c0909341SAndroid Build Coastguard Worker.main_pass1:
1713*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2896x8)]
1714*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m2, [cq+64*0]
1715*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m2, [cq+64*3]
1716*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2, [cq+64*1]
1717*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2,     [cq+64*2]
1718*c0909341SAndroid Build Coastguard Worker    movu                 m4, [o(permA+3)] ; unaligned load: table read at a 3-byte offset
1719*c0909341SAndroid Build Coastguard Worker    psrlq               m10, m4, 4 ; second index set: each qword shifted right by 4 bits
1720*c0909341SAndroid Build Coastguard Worker    mova                 m6, m4 ; vpermi2q overwrites its index register, so keep a copy
1721*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m5, m3  ; in0  in12 in2  in14
1722*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m10, m3 ; in15 in3  in13 in1
1723*c0909341SAndroid Build Coastguard Worker    vpermi2q             m6, m1, m2  ; in4  in8  in6  in10
1724*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m2 ; in11 in7  in9  in5
1725*c0909341SAndroid Build Coastguard Worker    jmp .main
; 16-point ADST core with three entry points that fall through to each other:
;   .main_pass2 - permutes m0-m3 into input order using the permC table,
;                 leaving the last index set in m10 for the caller
;   .main       - interleaves the permuted inputs into word pairs
;   .main2      - exported entry for callers that supply m0, m4, m5 and m6
;                 already paired
; The ITX_MUL4X_PACK / ITX_MUL2X_PACK macros are defined outside this chunk.
; Results are returned in m0-m4 as annotated on the final instructions.
1726*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1727*c0909341SAndroid Build Coastguard Worker.main_pass2:
1728*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(permC)]
1729*c0909341SAndroid Build Coastguard Worker    psrlq                m5, m4, 4 ; each further >>4 yields the next index vector from permC
1730*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m0, m2  ; in0  in12 in2  in14
1731*c0909341SAndroid Build Coastguard Worker    psrlq                m6, m5, 4
1732*c0909341SAndroid Build Coastguard Worker    vpermi2q             m5, m1, m3  ; in15 in3  in13 in1
1733*c0909341SAndroid Build Coastguard Worker    psrlq               m10, m6, 4
1734*c0909341SAndroid Build Coastguard Worker    vpermi2q             m6, m0, m2  ; in4  in8  in6  in10
1735*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m3 ; in11 in7  in9  in5
1736*c0909341SAndroid Build Coastguard Worker.main:
1737*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4, m5  ; in0  in15 in2  in13
1738*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5      ; in12 in3  in14 in1
1739*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m1  ; in4  in11 in6  in9
1740*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m1      ; in8  in7  in10 in5
1741*c0909341SAndroid Build Coastguard Workercglobal_label .main2
1742*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pd_2048)]
1743*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m13, [o(int_mshift)]
1744*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1 ; k1 = 0xff
1745*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, m13, m9, 6 ; 0x33...
1746*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8 ; m8 = 0; callers also read it after the return
1747*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        0, 1, 2, 3, 7, 9,  201, 4091,  995, 3973, 5
1748*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
1749*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        4, 1, 2, 3, 7, 9, 3857, 1380, 4052,  601, 5
1750*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
1751*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m0, m6 ; t9a  t8a  t11a t10a
1752*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m6     ; t1a  t0a  t3a  t2a
1753*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m5, m4 ; t13a t12a t15a t14a
1754*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m4     ; t5a  t4a  t7a  t6a
1755*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        2, 4, 1, 6, 7, 9,  799, 4017, 3406, 2276, 5
1756*c0909341SAndroid Build Coastguard Worker    psubw                m7, m8, m7 ; m7 = m8 - m7 (a negation, assuming m8 is still zero here)
1757*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 4, 1, 9, 7, 6, 4
1758*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_3784_m1567)]
1759*c0909341SAndroid Build Coastguard Worker    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)] ; overwrite only the dword lanes selected by k1
1760*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m0, m5 ; t5   t4   t7   t6
1761*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m5     ; t1   t0   t3   t2
1762*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
1763*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3     ; t9a  t8a  t11a t10a
1764*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
1765*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
1766*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(deint_shuf)]
1767*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
1768*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
1769*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m0, m2, q3232 ; t3   t2   t11a t10a
1770*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym2, 1        ; t1   t0   t9a  t8a
1771*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m1, m4, q3232 ; t6a  t7a  t14  t15
1772*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, ym4, 1        ; t5a  t4a  t13  t12
1773*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q1032     ; t7a  t6a  t15  t14
1774*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m3        ; t3a t2a t11 t10
1775*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3            ; -out15  out0   out14 -out1
1776*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
1777*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2            ; t7 t6 t15a t14a
1778*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
1779*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
1780*c0909341SAndroid Build Coastguard Worker    ret
1781*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points with type1 = flipadst (INV_TXFM_8X16_FN
; is defined outside this chunk).
1782*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, dct
1783*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, adst
1784*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, flipadst
1785*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, identity
1786*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x16_internal_8bpc(dst, stride, c, eob, tx2)
; Reuses the ADST code with a different setup. The first pass swaps the
; punpckl/punpckh and m1/m2 roles relative to iadst_8x16 and uses int_shuf2
; and pw_m16384_16384 before joining iadst_8x16's .pass1_end. The second pass
; swaps the roles of m6 and m7 and shifts m10 by 36 instead of 4 before
; joining iadst_8x16's .pass2_end.
1787*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1788*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main_pass1
1789*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m6, [o(int_shuf2)]
1790*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_m16384_16384)]
1791*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
1792*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
1793*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
1794*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
1795*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x16_internal_8bpc).pass1_end
1796*c0909341SAndroid Build Coastguard Worker.pass2:
1797*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main_pass2
1798*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_2048)]
1799*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 36 ; select a different packed index set than the non-flipped path
1800*c0909341SAndroid Build Coastguard Worker    psubw                m6, m8, m7 ; m6 = m8 - m7 (m8 is cleared with pxor in .main2)
1801*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x16_internal_8bpc).pass2_end
1802*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points with type1 = identity (INV_TXFM_8X16_FN
; is defined outside this chunk).
1803*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, dct
1804*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, adst
1805*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, flipadst
1806*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, identity
1807*c0909341SAndroid Build Coastguard Worker
; iidentity_8x16_internal_8bpc(dst, stride, c, eob, tx2)
; First pass: reorders each 64-byte coefficient row with vpermb (int16_perm),
; transposes, scales by pw_2896x8 and jumps to the second-pass entry in tx2q.
; Second pass (.pass2): computes 2*x + pmulhrsw(x, pw_1697x16) with saturating
; adds for m0-m3, then jumps to the shared scale-and-store tail at
; idct_8x16_internal_8bpc.end.
1808*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1809*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(int16_perm)]
1810*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
1811*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
1812*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
1813*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
1814*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896x8)]
1815*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3, m2        ; a0 b0 c0 d0 a1 b1 c1 d1
1816*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m2            ; a2 b2 c2 d2 a3 b3 c3 d3
1817*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m0        ; e0 f0 g0 h0 e1 f1 g1 h1
1818*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m0            ; e2 f2 g2 h2 e3 f3 g3 h3
1819*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m1, m2, m3, m4
1820*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2        ; a0 b0 c0 d0 e0 f0 g0 h0
1821*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2            ; a1 b1 c1 d1 e1 f1 g1 h1
1822*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4        ; a2 b2 c2 d2 e2 f2 g2 h2
1823*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4            ; a3 b3 c3 d3 e3 f3 g3 h3
1824*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1825*c0909341SAndroid Build Coastguard Worker.pass2:
1826*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_1697x16)]
1827*c0909341SAndroid Build Coastguard Worker    mova                ym8, [o(gather8b)] ; row indices consumed by the gather/scatter at .end4
1828*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*2] ; second gather/scatter base: two rows below dst
1829*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7, m0
1830*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7, m1
1831*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, m2
1832*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3
1833*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m0, m1, m2, m3 ; x *= 2 with signed saturation
1834*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
1835*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
1836*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6
1837*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m7
1838*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_8bpc).end
1839*c0909341SAndroid Build Coastguard Worker
; WRITE_16X2 coef1, coef2, tmp1, tmp2, offset1, offset2
; Adds two rows of word coefficients to the pixels at [dstq+offset1] and
; [dstq+offset2] and stores the saturated bytes back to the same addresses.
;   %1, %2 - coefficient operands: a bare number N is used as register mN,
;            anything else is passed to paddw verbatim
;   %3, %4 - register numbers of two temporaries; both are clobbered
;   %5, %6 - address offsets of the two rows relative to dstq
1840*c0909341SAndroid Build Coastguard Worker%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
1841*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%3, [dstq+%5] ; zero-extend the first row's pixels to words
1842*c0909341SAndroid Build Coastguard Worker%ifnum %1
1843*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m%1
1844*c0909341SAndroid Build Coastguard Worker%else
1845*c0909341SAndroid Build Coastguard Worker    paddw               m%3, %1
1846*c0909341SAndroid Build Coastguard Worker%endif
1847*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%4, [dstq+%6] ; zero-extend the second row's pixels to words
1848*c0909341SAndroid Build Coastguard Worker%ifnum %2
1849*c0909341SAndroid Build Coastguard Worker    paddw               m%4, m%2
1850*c0909341SAndroid Build Coastguard Worker%else
1851*c0909341SAndroid Build Coastguard Worker    paddw               m%4, %2
1852*c0909341SAndroid Build Coastguard Worker%endif
1853*c0909341SAndroid Build Coastguard Worker    packuswb            m%3, m%4 ; pack per 128-bit lane, which interleaves the two rows
1854*c0909341SAndroid Build Coastguard Worker    vpermq              m%3, m%3, q3120 ; regroup qwords so each row is contiguous again
1855*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%5], xm%3 ; low 128 bits -> first row
1856*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+%6], m%3, 1 ; 128-bit lane 1 -> second row
1857*c0909341SAndroid Build Coastguard Worker%endmacro
1858*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X4_FN type1, type2
; Emits the 16x4 entry point for the given transform pair through INV_TXFM_FN
; (defined outside this chunk). For the dct/dct pair it also emits the DC-only
; path: the first coefficient is loaded sign-extended into r6d, eobd is stored
; over it and control transfers to the 16x8 function's .dconly2 label.
1859*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X4_FN 2 ; type1, type2
1860*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 16x4
1861*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
1862*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
1863*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; NOTE(review): presumably eob is 0 here, so this clears the DC coefficient -- confirm in INV_TXFM_FN
1864*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
1865*c0909341SAndroid Build Coastguard Worker%endif
1866*c0909341SAndroid Build Coastguard Worker%endmacro
1867*c0909341SAndroid Build Coastguard Worker
; Select the zmm register mapping and the avx512icl suffix for what follows,
; then instantiate the 16x4 entry points with type1 = dct.
1868*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
1869*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, dct
1870*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, adst
1871*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, flipadst
1872*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, identity
1873*c0909341SAndroid Build Coastguard Worker
; idct_16x4_internal_8bpc(dst, stride, c, eob, tx2)
; First pass: loads eight 16-byte coefficient rows, runs the idct_4x16 .main
; helper on them, merges the xmm results into m0/m1, scales by pw_16384,
; transposes the words and jumps to the second-pass entry held in tx2q.
; Second pass (.pass2): expands IDCT4_1D_PACKED, loads the output permutation
; from permA into m2 and finishes in iadst_16x4_internal_8bpc.end.
1874*c0909341SAndroid Build Coastguard Workercglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1875*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16*0]
1876*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+16*1]
1877*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*2]
1878*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+16*3]
1879*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+16*4]
1880*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq+16*5]
1881*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq+16*6]
1882*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq+16*7]
1883*c0909341SAndroid Build Coastguard Worker    call m(idct_4x16_internal_8bpc).main
1884*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_16384)]
1885*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, xm3, 1 ; 3 2   7 6
1886*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym5, xm7, 1 ; b a   f e
1887*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, xm2, 1 ; 0 1   4 5
1888*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym4, xm6, 1 ; 8 9   c d
1889*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, ym5, 1 ; 3 2   7 6   b a   f e
1890*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym4, 1 ; 0 1   4 5   8 9   c d
1891*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
1892*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8
1893*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032 ; swap the qwords within each 128-bit lane
1894*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
1895*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1896*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
1897*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
1898*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; continue at the second-pass entry held in tx2q
1899*c0909341SAndroid Build Coastguard Worker.pass2:
1900*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
1901*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(permA)] ; output permutation consumed at .end2
1902*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).end
1903*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x4 entry points with type1 = adst.
1904*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, dct
1905*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, adst
1906*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, flipadst
1907*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, identity
1908*c0909341SAndroid Build Coastguard Worker
; iadst_16x4_internal_8bpc(dst, stride, c, eob, tx2)
; First pass: loads two 64-byte coefficient rows, runs the iadst_4x16 .main2
; helper, then .pass1_end (shared with iflipadst_16x4) derives the middle
; outputs with two dot products, permutes, transposes and scales by m6 before
; jumping to the second-pass entry held in tx2q.
; Second pass (.pass2): runs .main and falls into the shared store tail:
;   .end  - scale m0/m1 by pw_2048
;   .end2 - apply the output permutation held in m2
;   .end3 - add to four 16-pixel destination rows, clear 128 bytes at cq, store
; .main is the local 4-point ADST helper (IADST4_1D_PACKED, defined elsewhere).
1909*c0909341SAndroid Build Coastguard Workercglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1910*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*0]
1911*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*1]
1912*c0909341SAndroid Build Coastguard Worker    movshdup             m3, [o(permB)] ; load permB with each odd dword duplicated into the even slot
1913*c0909341SAndroid Build Coastguard Worker    psrlq               m10, m3, 4
1914*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main2
1915*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_16384_m16384)]
1916*c0909341SAndroid Build Coastguard Worker    psrlq                m0, m10, 4 ; index vectors for the two permutes in .pass1_end
1917*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 8
1918*c0909341SAndroid Build Coastguard Worker.pass1_end:
1919*c0909341SAndroid Build Coastguard Worker    punpcklwd           ym5, ym4, ym2
1920*c0909341SAndroid Build Coastguard Worker    punpckhwd           ym4, ym2
1921*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, ym4, 1
1922*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9 ; accumulator start value; presumably the rounding bias left in m9 by .main2 -- confirm
1923*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16} ; m1 += word-pair dot product with the broadcast constant
1924*c0909341SAndroid Build Coastguard Worker    mova                 m4, m9
1925*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m4, m5, [o(pw_2896_2896)] {1to16}
1926*c0909341SAndroid Build Coastguard Worker    psrad                m1, 12
1927*c0909341SAndroid Build Coastguard Worker    psrad                m4, 12
1928*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m4 ;  out8  -out7  -out9   out6  -out11  out4   out10 -out5
1929*c0909341SAndroid Build Coastguard Worker    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
1930*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
1931*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
1932*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1933*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
1934*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
1935*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6
1936*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1937*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; continue at the second-pass entry held in tx2q
1938*c0909341SAndroid Build Coastguard Worker.pass2:
1939*c0909341SAndroid Build Coastguard Worker    call .main
1940*c0909341SAndroid Build Coastguard Worker    movu                 m2, [o(permA+1)] ; unaligned load: table read at a 1-byte offset
1941*c0909341SAndroid Build Coastguard Worker.end:
1942*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_2048)]
1943*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
1944*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1945*c0909341SAndroid Build Coastguard Worker.end2:
1946*c0909341SAndroid Build Coastguard Worker    psrlq                m3, m2, 4 ; second index vector from the same table
1947*c0909341SAndroid Build Coastguard Worker    vpermi2q             m2, m0, m1
1948*c0909341SAndroid Build Coastguard Worker    vpermi2q             m3, m0, m1
1949*c0909341SAndroid Build Coastguard Worker.end3:
1950*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*2]
1951*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+strideq*0] ; gather four 16-byte rows into one zmm
1952*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [dstq+strideq*1], 1
1953*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [r3  +strideq*0], 2
1954*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [r3  +strideq*1], 3
1955*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1956*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*0], m4 ; clear the 128-byte coefficient block
1957*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*1], m4
1958*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4 ; widen pixels to words by interleaving with zero
1959*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
1960*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
1961*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
1962*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; back to bytes with unsigned saturation
1963*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
1964*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
1965*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*0], m0, 2
1966*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*1], m0, 3
1967*c0909341SAndroid Build Coastguard Worker    RET
1968*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1969*c0909341SAndroid Build Coastguard Worker.main:
1970*c0909341SAndroid Build Coastguard Worker    IADST4_1D_PACKED
1971*c0909341SAndroid Build Coastguard Worker    ret
1972*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x4 entry points with type1 = flipadst.
1973*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, dct
1974*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, adst
1975*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, flipadst
1976*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, identity
1977*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x4_internal_8bpc(dst, stride, c, eob, tx2)
; Same structure as iadst_16x4_internal_8bpc, with a different setup. The
; first pass uses pw_m16384_16384 and shifts m10 by 12/16 instead of 4/8
; before joining the shared .pass1_end. The second pass reads its output
; permutation from permA+2 instead of permA+1 before joining the shared .end.
1978*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1979*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*0]
1980*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*1]
1981*c0909341SAndroid Build Coastguard Worker    movshdup             m3, [o(permB)] ; load permB with each odd dword duplicated into the even slot
1982*c0909341SAndroid Build Coastguard Worker    psrlq               m10, m3, 4
1983*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main2
1984*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_m16384_16384)]
1985*c0909341SAndroid Build Coastguard Worker    psrlq                m0, m10, 12 ; index vectors for the two permutes in .pass1_end
1986*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 16
1987*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).pass1_end
1988*c0909341SAndroid Build Coastguard Worker.pass2:
1989*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_8bpc).main
1990*c0909341SAndroid Build Coastguard Worker    movu                m2, [o(permA+2)] ; unaligned load: table read at a 2-byte offset
1991*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).end
1992*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x4 entry points whose first macro argument is identity,
; one per second-argument type.
; NOTE(review): INV_TXFM_16X4_FN is defined outside this excerpt.
1993*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, dct
1994*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, adst
1995*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, flipadst
1996*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, identity
1997*c0909341SAndroid Build Coastguard Worker
; iidentity_16x4_internal_8bpc
;   Function entry (pass 1): reads 2x64 bytes of coefficients from cq,
;   regroups their dwords, computes
;       x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)
;   with a saturating add, reorders the bytes through the idtx_16x4p table
;   and jumps to the address held in tx2q.
;   .pass2: computes x + pmulhrsw(x, pw_1697x8) (saturating) on m0/m1,
;   loads a permutation from permA+1 into m2 and finishes in iadst_16x4's
;   .end.
; NOTE(review): tx2q presumably holds the .pass2 address of the second
; transform, set up by the entry point generated by INV_TXFM_16X4_FN —
; confirm there.
1998*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
1999*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*0]
2000*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*1]
2001*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x16)]
2002*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_16384)]
2003*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(idtx_16x4p)] ; byte indices for the vpermb below
2004*c0909341SAndroid Build Coastguard Worker    shufps               m0, m1, m2, q2020 ; even dwords of m1 and m2 (per 128-bit lane)
2005*c0909341SAndroid Build Coastguard Worker    shufps               m1, m2, q3131 ; odd dwords of m1 and m2 (per 128-bit lane)
2006*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
2007*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
2008*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
2009*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
2010*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2 ; saturating: x + scaled x
2011*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
2012*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m5, m0
2013*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m5, m1
2014*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; indirect jump to the second pass
2015*c0909341SAndroid Build Coastguard Worker.pass2:
2016*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x8)]
2017*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
2018*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
2019*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
2020*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
2021*c0909341SAndroid Build Coastguard Worker    movu                 m2, [o(permA+1)] ; unaligned load, 1 byte into permA
2022*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).end
2023*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X8_FN type1, type2
;   Expands INV_TXFM_FN (defined outside this excerpt) for the 16x8 size.
;   For the dct_dct combination only, it additionally emits the DC-only path
;   below: the first coefficient is scaled in r6d, broadcast to every word
;   of m3 and added to r3d rows of 16 destination pixels, four rows per
;   loop iteration.
;   The labels .dconly, .dconly2 and .dconly3 enter the scaling chain at
;   different stages; INV_TXFM_16X16_FN jumps to .dconly3 after doing its
;   own scaling and setting r3d.
; NOTE(review): this path is presumably reached only when eob == 0 (decided
; in INV_TXFM_FN), and r3d presumably aliases eobd — confirm both there.
2024*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X8_FN 2 ; type1, type2
2025*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 16x8
2026*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
2027*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq] ; r6d = sign-extended first coefficient
2028*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; overwrite it with eobd (zero if eob == 0, see note above)
2029*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8 ; r3d is the row counter of .dconly_loop
2030*c0909341SAndroid Build Coastguard Worker.dconly:
2031*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181 ; 181/256 ~= 1/sqrt(2)
2032*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
2033*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8 ; r6d = (r6d*181 + 128) >> 8
2034*c0909341SAndroid Build Coastguard Worker.dconly2:
2035*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
2036*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+256 ; rounding for >>8 and for the extra >>1, folded together
2037*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+1
2038*c0909341SAndroid Build Coastguard Worker.dconly3:
2039*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
2040*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3] ; r2 = offset of the 4th row in each group
2041*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+2048 ; rounding for >>8 and for the extra >>4, folded together
2042*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+4
2043*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2 ; zero, used to widen bytes to words
2044*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, r6d ; low 16 bits of r6d in every word
2045*c0909341SAndroid Build Coastguard Worker.dconly_loop:
; Gather four 16-pixel rows into the four 128-bit lanes of m1.
2046*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+strideq*0]
2047*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym1, [dstq+strideq*1], 1
2048*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [dstq+strideq*2], 2
2049*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m1, [dstq+r2       ], 3
2050*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2 ; low 8 pixels of each row as words
2051*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2 ; high 8 pixels of each row as words
2052*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
2053*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2054*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; back to bytes with unsigned saturation
2055*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
2056*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
2057*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m0, 2
2058*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r2       ], m0, 3
2059*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4] ; advance four rows
2060*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 4
2061*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop ; loop while rows remain
2062*c0909341SAndroid Build Coastguard Worker    RET
2063*c0909341SAndroid Build Coastguard Worker%endif
2064*c0909341SAndroid Build Coastguard Worker%endmacro
2065*c0909341SAndroid Build Coastguard Worker
; ITX_16X8_LOAD_COEFS shuf_odd
;   Loads eight coefficient blocks spaced 32 bytes apart (offsets 0..7*32
;   from the incoming cq) into m0-m7 through vpermq, then multiplies all of
;   them by pw_2896x8 with pmulhrsw (rounded high multiply).
;   Even-numbered blocks use the fixed qword order q3120; odd-numbered
;   blocks use the order given by the macro argument (q%1).
;   Side effect: cq is advanced by 32*4 bytes and m8 is overwritten.
;   This macro is not invoked anywhere inside this excerpt.
2066*c0909341SAndroid Build Coastguard Worker%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
2067*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896x8)]
2068*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
2069*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*4 ; re-centre cq so the offsets below span -3*32..+3*32
2070*c0909341SAndroid Build Coastguard Worker    vpermq               m7, [cq+32*3], q%1
2071*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq-32*3], q%1
2072*c0909341SAndroid Build Coastguard Worker    vpermq               m6, [cq+32*2], q3120
2073*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq-32*2], q3120
2074*c0909341SAndroid Build Coastguard Worker    vpermq               m5, [cq+32*1], q%1
2075*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq-32*1], q%1
2076*c0909341SAndroid Build Coastguard Worker    vpermq               m4, [cq+32*0], q3120
2077*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
2078*c0909341SAndroid Build Coastguard Worker%endmacro
2079*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x8 entry points with dct as type1, one per type2. The
; dct, dct instantiation also emits the DC-only path of INV_TXFM_16X8_FN.
2080*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, dct
2081*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, identity
2082*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, adst
2083*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, flipadst
2084*c0909341SAndroid Build Coastguard Worker
; idct_16x8_internal_8bpc
;   Function entry (pass 1): loads 4x64 bytes of coefficients from cq,
;   scales them by pw_2896x8, splits each register into 256-bit halves
;   (m0-m7), runs idct_8x16's .main, then re-packs, scales by pw_16384 and
;   interleaves the result into m2-m5 before jumping to the address in tx2q.
;   .pass2: regroups m2-m5 into m0-m3, runs .main (IDCT8_1D_PACKED),
;   permutes with indices derived from permC and falls into .end.
;   .end:  multiplies m0, m4, m1, m5 by the scale held in m6.
;   .end2: adds m0, m4, m1, m5 to eight rows of 16 destination pixels
;          (unsigned-saturated), and zeroes the 4x64 coefficient bytes at cq.
;   .main: shared IDCT8_1D_PACKED body (macro defined outside this excerpt).
;   .end and .end2 are also jump targets for the other 16x8 transforms in
;   this file.
; NOTE(review): tx2q presumably holds the second transform's .pass2 address,
; set up by INV_TXFM_FN — confirm there.
2085*c0909341SAndroid Build Coastguard Workercglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2086*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_2896x8)]
2087*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+64*0], q3120
2088*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+64*1], q3120
2089*c0909341SAndroid Build Coastguard Worker    vpermq               m4, [cq+64*2], q3120
2090*c0909341SAndroid Build Coastguard Worker    vpermq               m6, [cq+64*3], q3120
2091*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6
; Move the upper 256 bits of each register into the next odd register.
2092*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym1, m0, 1
2093*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m2, 1
2094*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym5, m4, 1
2095*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym7, m6, 1
2096*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main
2097*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [o(int_shuf1)]
2098*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(int_shuf2)]
2099*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
2100*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
2101*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
2102*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
2103*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_16384)]
2104*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8     ; a0 b0 a1 b1 a2 b2 a3 b3
2105*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9     ; c0 d0 c1 d1 c2 d2 c3 d3
2106*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
2107*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
2108*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m2}, m0, m1, m6, m7
2109*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
2110*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
2111*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
2112*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
2113*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; indirect jump to the second pass
2114*c0909341SAndroid Build Coastguard Worker.pass2:
2115*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m2, m4, q2020 ; 0 1
2116*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m4, q3131     ; 4 5
2117*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m3, m5, q2020 ; 2 3
2118*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m5, q3131     ; 6 7
2119*c0909341SAndroid Build Coastguard Worker    call .main
; m4/m6 and m5/m6 become qword index vectors for the two-source permutes;
; each is a shifted or permuted copy of the permC-derived vector.
2120*c0909341SAndroid Build Coastguard Worker    movshdup             m4, [o(permC)]
2121*c0909341SAndroid Build Coastguard Worker    psrlq                m6, m4, 4
2122*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m4, q1032
2123*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m0, m2 ; a2 a3   b2 b3   e2 e3   f2 f3
2124*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m6, m2 ; a0 a1   b0 b1   e0 e1   f0 f1
2125*c0909341SAndroid Build Coastguard Worker    psrlq                m6, m5, 4
2126*c0909341SAndroid Build Coastguard Worker    vpermi2q             m5, m1, m3 ; c2 c3   d2 d3   g2 g3   h2 h3
2127*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m6, m3 ; c0 c1   d0 d1   g0 g1   h0 h1
2128*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_2048)]
2129*c0909341SAndroid Build Coastguard Worker.end:
; Entry with m6 = final scale factor (pw_2048 when falling through from above).
2130*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m6}, m0, m4, m1, m5
2131*c0909341SAndroid Build Coastguard Worker.end2:
; Entry with already-scaled residuals in m0, m4 (added to rows 0, 1, 4, 5)
; and m1, m5 (added to rows 2, 3, 6, 7).
2132*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*4] ; r3 = address of row 4
2133*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3] ; r4 = 3 * stride
2134*c0909341SAndroid Build Coastguard Worker    mova                xm3, [dstq+strideq*0] ; m3 lanes: rows 0, 1, 4, 5
2135*c0909341SAndroid Build Coastguard Worker    mova                xm6, [dstq+strideq*2] ; m6 lanes: rows 2, 3, 6, 7
2136*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, [dstq+strideq*1], 1
2137*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym6, [dstq+r4       ], 1
2138*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, [r3  +strideq*0], 2
2139*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [r3  +strideq*2], 2
2140*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, [r3  +strideq*1], 3
2141*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [r3  +r4       ], 3
2142*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
; Zero the 4x64 bytes of coefficients at cq.
2143*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*0], m7
2144*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*1], m7
2145*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*2], m7
2146*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*3], m7
2147*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m7 ; widen pixels to words (m7 is zero)
2148*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m7
2149*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2150*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
2151*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m4 ; back to bytes with unsigned saturation
2152*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
2153*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
2154*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*0], m0, 2
2155*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*1], m0, 3
2156*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m6, m7
2157*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m7
2158*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
2159*c0909341SAndroid Build Coastguard Worker    paddw                m5, m6
2160*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m5
2161*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], xm1
2162*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r4       ], ym1, 1
2163*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +strideq*2], m1, 2
2164*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r3  +r4       ], m1, 3
2165*c0909341SAndroid Build Coastguard Worker    RET
2166*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; NOTE(review): cglobal_label presumably exports this label beyond the file
; (plain local labels are used elsewhere) — confirm in x86inc.
2167*c0909341SAndroid Build Coastguard Workercglobal_label .main
2168*c0909341SAndroid Build Coastguard Worker    IDCT8_1D_PACKED
2169*c0909341SAndroid Build Coastguard Worker    ret
2170*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x8 entry points with adst as type1, one per type2.
2171*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, dct
2172*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, adst
2173*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, flipadst
2174*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, identity
2175*c0909341SAndroid Build Coastguard Worker
; iadst_16x8_internal_8bpc
;   Function entry (pass 1): runs iadst_8x16's .main_pass1, sets the final
;   scale m7 = pw_16384_m16384, shifts the index vector m10 and falls into
;   .pass1_end.
;   .pass1_end: shared with iflipadst_16x8, which enters with a different
;   m7 and m10. Builds the middle outputs with +-2896 dot products, permutes
;   and transposes everything into m2-m5, scales by m7 and jumps to the
;   address in tx2q.
;   .pass2: regroups m2-m5, runs .main_pass2, scales m0/m1 by m6, permutes
;   with indices derived from permC and finishes in idct_16x8's .end2
;   (skipping the multiply at .end, since scaling is already applied).
;   .main_pass1: coefficient load + IADST8_1D_PACKED 1. It is not called
;   from within this excerpt; the entry above calls iadst_8x16's label of
;   the same name instead.
;   .main_pass2: IADST8_1D_PACKED 2 followed by scaling of m2/m3 with a
;   +m6/-m6 word pattern.
; NOTE(review): on entry to .pass1_end, m9 is used as the initial value of
; every dword accumulator and m0-m4/m10 as inputs; they are presumably left
; by .main_pass1 (m9 likely a rounding constant) — confirm in that routine.
2176*c0909341SAndroid Build Coastguard Workercglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2177*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main_pass1
2178*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_16384_m16384)]
2179*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 4
2180*c0909341SAndroid Build Coastguard Worker.pass1_end:
2181*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4, m2 ; interleave words of m4 and m2 for the dot products
2182*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2
; Each vpdpwssd adds, per dword, the dot product of a word pair with the
; broadcast constant pair to the accumulator (seeded with m9).
2183*c0909341SAndroid Build Coastguard Worker    mova                 m1, m9
2184*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
2185*c0909341SAndroid Build Coastguard Worker    mova                 m6, m9
2186*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m5, [o(pw_2896_2896)] {1to16}
2187*c0909341SAndroid Build Coastguard Worker    mova                 m2, m9
2188*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m4, [o(pw_m2896_2896)] {1to16}
2189*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m4, [o(pw_2896_2896)] {1to16}
2190*c0909341SAndroid Build Coastguard Worker    psrad                m1, 12
2191*c0909341SAndroid Build Coastguard Worker    psrad                m6, 12
2192*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m6 ;  out8  -out7  -out9   out6
2193*c0909341SAndroid Build Coastguard Worker    psrad                m2, 12
2194*c0909341SAndroid Build Coastguard Worker    psrad                m9, 12
2195*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m9 ; -out11  out4   out10 -out5
; Four two-source qword permutes; each index vector is m10 shifted by a
; further 0, 4, 8 or 12 bits.
2196*c0909341SAndroid Build Coastguard Worker    psrlq                m4, m10, 4
2197*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m0, m2
2198*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m10, m2
2199*c0909341SAndroid Build Coastguard Worker    psrlq                m5, m10, 8
2200*c0909341SAndroid Build Coastguard Worker    vpermi2q             m5, m1, m3
2201*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 12
2202*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m10, m3
2203*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
2204*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5     ; b0 d0 b1 d1 b2 d2 b3 d3
2205*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
2206*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0     ; j0 l0 j1 l1 j2 l2 j3 l3
2207*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
2208*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; a2 b2 c2 d2 a3 b3 c3 d3
2209*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
2210*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m1     ; i2 j2 k2 l2 i3 j3 k3 l3
2211*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m2, m3, m4, m5
2212*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; indirect jump to the second pass
2213*c0909341SAndroid Build Coastguard Worker.pass2:
2214*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m2, m4, q2020
2215*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m4, q3131     ; 4 5
2216*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m3, m5, q2020
2217*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m5, q3131     ; 6 7
2218*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032     ; 1 0
2219*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q1032     ; 3 2
2220*c0909341SAndroid Build Coastguard Worker    call .main_pass2
2221*c0909341SAndroid Build Coastguard Worker    movshdup             m4, [o(permC)]
2222*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6 ; m6 = +/- scale pattern built at the end of .main_pass2
2223*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
2224*c0909341SAndroid Build Coastguard Worker    psrlq                m6, m4, 4
2225*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4
2226*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m0, m2
2227*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m6, m2
2228*c0909341SAndroid Build Coastguard Worker    vpermi2q             m5, m1, m3
2229*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m6, m3
2230*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_8bpc).end2
2231*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2232*c0909341SAndroid Build Coastguard Worker.main_pass1:
2233*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2896x8)]
2234*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4, [cq+64*0]
2235*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4, [cq+64*3]
2236*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, [cq+64*1]
2237*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, [cq+64*2]
2238*c0909341SAndroid Build Coastguard Worker    mova                 m5, [o(int16_perm)]
2239*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1 ; k1 = 0xff: selects the low eight dwords (low 256 bits)
2240*c0909341SAndroid Build Coastguard Worker    vpblendmd        m0{k1}, m1, m3 ; 0 7
2241*c0909341SAndroid Build Coastguard Worker    vmovdqa32        m3{k1}, m1     ; 6 1
2242*c0909341SAndroid Build Coastguard Worker    vpblendmd        m1{k1}, m4, m2 ; 2 5
2243*c0909341SAndroid Build Coastguard Worker    vmovdqa32        m2{k1}, m4     ; 4 3
2244*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m5, x}, m0, m1, m2, m3
2245*c0909341SAndroid Build Coastguard Worker    IADST8_1D_PACKED 1
2246*c0909341SAndroid Build Coastguard Worker    ret
2247*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2248*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2
2249*c0909341SAndroid Build Coastguard Worker    IADST8_1D_PACKED 2
; Build m6 = { +m6 words, -m6 words } per 128-bit lane and scale m2/m3 with
; it. NOTE(review): m6 is presumably left holding a dword scale constant by
; the macro above — confirm in IADST8_1D_PACKED.
2250*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
2251*c0909341SAndroid Build Coastguard Worker    psubd                m5, m6 ; m5 = -m6 (per dword)
2252*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m5
2253*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
2254*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
2255*c0909341SAndroid Build Coastguard Worker    ret
2256*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x8 entry points with flipadst as type1, one per type2.
2257*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, dct
2258*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, adst
2259*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, flipadst
2260*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, identity
2261*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x8_internal_8bpc
;   Function entry (pass 1): identical to iadst_16x8's pass 1 except for the
;   scale constant (pw_m16384_16384 instead of pw_16384_m16384) and the
;   shift applied to the index vector m10 (20 instead of 4); the rest is
;   shared through iadst_16x8's .pass1_end.
;   .pass2: same regrouping and .main_pass2 call as iadst_16x8's .pass2,
;   then scales into swapped registers and permutes with index vectors taken
;   at different bit offsets of the permC-derived vector (8 and 12), before
;   finishing in idct_16x8's .end2.
; NOTE(review): the swapped registers and different index offsets presumably
; implement the reversed (flipped) output order — confirm against permC.
2262*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2263*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main_pass1
2264*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_m16384_16384)]
2265*c0909341SAndroid Build Coastguard Worker    psrlq               m10, 20
2266*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x8_internal_8bpc).pass1_end
2267*c0909341SAndroid Build Coastguard Worker.pass2:
2268*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m2, m4, q2020
2269*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m4, q3131     ; 4 5
2270*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m3, m5, q2020
2271*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m5, q3131     ; 6 7
2272*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032     ; 1 0
2273*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q1032     ; 3 2
2274*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main_pass2
2275*c0909341SAndroid Build Coastguard Worker    movshdup             m4, [o(permC)]
2276*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m6, m0 ; scaled m0 goes to m5
2277*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6, m1 ; scaled m1 goes to m0
2278*c0909341SAndroid Build Coastguard Worker    psrlq                m1, m4, 12 ; index vector: permC-derived >> 12
2279*c0909341SAndroid Build Coastguard Worker    psrlq                m4, 8 ; index vector: permC-derived >> 8
2280*c0909341SAndroid Build Coastguard Worker    mova                 m7, m4 ; keep a copy; vpermi2q overwrites its index register
2281*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m0, m3
2282*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m1, m3
2283*c0909341SAndroid Build Coastguard Worker    vpermi2q             m1, m5, m2
2284*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m7, m2
2285*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_8bpc).end2
2286*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x8 entry points with identity as type1, one per type2.
2287*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, dct
2288*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, adst
2289*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, flipadst
2290*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, identity
2291*c0909341SAndroid Build Coastguard Worker
; iidentity_16x8_internal_8bpc
;   Function entry (pass 1): loads 4x64 bytes of coefficients from cq while
;   scaling them by pw_2896x8, regroups their dwords into m2-m5, computes
;       x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)
;   with a saturating add, reorders bytes through the int8_permA table and
;   jumps to the address in tx2q.
;   .pass2: permutes the qwords of m2-m5 with permB into m0, m4, m1, m5,
;   loads m6 = pw_4096 and finishes in idct_16x8's .end, which multiplies
;   those four registers by m6 before adding them to the destination.
2292*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2293*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pw_2896x8)]
2294*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m0, [cq+64*0]
2295*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m0, [cq+64*1]
2296*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m0, [cq+64*2]
2297*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0,     [cq+64*3]
2298*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_1697x16)]
2299*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_16384)]
2300*c0909341SAndroid Build Coastguard Worker    shufps               m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
2301*c0909341SAndroid Build Coastguard Worker    shufps               m3, m4, q3131     ; a2 a3 a6 a7 e2 e3 e6 e7
2302*c0909341SAndroid Build Coastguard Worker    shufps               m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
2303*c0909341SAndroid Build Coastguard Worker    shufps               m5, m0, q3131     ; i2 i3 i6 i7 m2 m3 m6 m7
2304*c0909341SAndroid Build Coastguard Worker    mova                 m9, [o(int8_permA)] ; byte indices for the vpermb below
2305*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, m2
2306*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7, m3
2307*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, m4
2308*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m5
2309*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m1, m6, m7
2310*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m0 ; saturating: x + scaled x
2311*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m1
2312*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m6
2313*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m7
2314*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m9, x}, m2, m3, m4, m5
2315*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; indirect jump to the second pass
2316*c0909341SAndroid Build Coastguard Worker.pass2:
2317*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(permB)] ; qword indices for the vpermq below
2318*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_4096)] ; scale consumed at idct_16x8's .end
2319*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, m2
2320*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m7, m4
2321*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m7, m3
2322*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m7, m5
2323*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_8bpc).end
2324*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X16_FN type1, type2
;   Expands INV_TXFM_FN (defined outside this excerpt) for the 16x16 size.
;   For the dct_dct combination only, it additionally emits a DC-only path:
;   the first coefficient is scaled once here as (dc*181 + 128+512) >> 10,
;   r3d is set up as a 16-row counter, and control transfers to the .dconly3
;   label of the 16x8 dct_dct function, which applies the final scaling
;   stage and adds the DC value to the destination rows.
; NOTE(review): this path is presumably reached only when eob == 0 (decided
; in INV_TXFM_FN), and r3d presumably aliases eobd — confirm both there.
2325*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X16_FN 2 ; type1, type2
2326*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 16x16
2327*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
2328*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq] ; r6d = sign-extended first coefficient
2329*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; overwrite it with eobd (zero if eob == 0, see note above)
2330*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16 ; r3d is the row counter of the shared loop
2331*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181 ; 181/256 ~= 1/sqrt(2)
2332*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+512 ; rounding for >>8 and for the extra >>2, folded together
2333*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+2
2334*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
2335*c0909341SAndroid Build Coastguard Worker%endif
2336*c0909341SAndroid Build Coastguard Worker%endmacro
2337*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x16 entry points with dct as type1, one per type2. The
; dct, dct instantiation also emits the DC-only path above.
2338*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, dct
2339*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, identity
2340*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, adst
2341*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, flipadst
2342*c0909341SAndroid Build Coastguard Worker
2343*c0909341SAndroid Build Coastguard Workercglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2344*c0909341SAndroid Build Coastguard Worker    mova                 m7, [o(permB)]
2345*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, [cq+64*0]
2346*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m7, [cq+64*1]
2347*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m7, [cq+64*2]
2348*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m7, [cq+64*3]
2349*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m7, [cq+64*4]
2350*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m7, [cq+64*5]
2351*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m7, [cq+64*6]
2352*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m7, [cq+64*7]
2353*c0909341SAndroid Build Coastguard Worker    call .main
2354*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m12, [o(int_shuf1)]
2355*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m11, [o(int_shuf2)]
2356*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_8192)]
2357*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m12
2358*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m1, m11
2359*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m12
2360*c0909341SAndroid Build Coastguard Worker    pshufb               m9, m3, m11
2361*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
2362*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m5, m11
2363*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
2364*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m7, m11
2365*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
2366*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m8
2367*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m8
2368*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m2, m9
2369*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m9
2370*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m4, m10
2371*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m10
2372*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m6, m11
2373*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m11
2374*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Second pass of the 16x16 inverse DCT (8 bpc).
; Reads the first-pass results from m0-m7, regroups their 128-bit lanes with
; vshufi32x4/vinserti32x8 (the trailing comments give the resulting layout),
; calls .main, then interleaves the results with qword permutes before
; falling through to the shared add-and-store code at .end.
2375*c0909341SAndroid Build Coastguard Worker.pass2:
2376*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
2377*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
2378*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
2379*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
2380*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
2381*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
2382*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
2383*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
2384*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m4, q3131 ;  4  5
2385*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m4, q2020     ;  0  1
2386*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m6, m8, q2020 ;  8  9
2387*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m8, q3131     ; 12 13
2388*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m5, q3131 ;  6  7
2389*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m5, q2020     ;  2  3
2390*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m7, m9, q2020 ; 10 11
2391*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m9, q3131     ; 14 15
2392*c0909341SAndroid Build Coastguard Worker    call .main
; permD supplies the qword indices for vpermi2q/vpermt2q. m12, m9 and m13 are
; the same table shifted right by 4, 8 and 12 bits; m10/m11 are copies of
; m8/m9 because vpermi2q overwrites its index register with the result.
2393*c0909341SAndroid Build Coastguard Worker    mova                  m8, [o(permD)]
2394*c0909341SAndroid Build Coastguard Worker    psrlq                m12, m8, 4
2395*c0909341SAndroid Build Coastguard Worker    psrlq                 m9, m8, 8
2396*c0909341SAndroid Build Coastguard Worker    psrlq                m13, m8, 12
2397*c0909341SAndroid Build Coastguard Worker    mova                 m10, m8
2398*c0909341SAndroid Build Coastguard Worker    vpermi2q              m8, m0, m2 ;  0  1  4  5
2399*c0909341SAndroid Build Coastguard Worker    vpermt2q              m0, m12, m2
2400*c0909341SAndroid Build Coastguard Worker    mova                 m11, m9
2401*c0909341SAndroid Build Coastguard Worker    vpermi2q              m9, m1, m3 ;  2  3  6  7
2402*c0909341SAndroid Build Coastguard Worker    vpermt2q              m1, m13, m3
2403*c0909341SAndroid Build Coastguard Worker    vpermi2q             m10, m4, m6 ;  8  9 12 13
2404*c0909341SAndroid Build Coastguard Worker    vpermt2q              m4, m12, m6
2405*c0909341SAndroid Build Coastguard Worker    vpermi2q             m11, m5, m7 ; 10 11 14 15
2406*c0909341SAndroid Build Coastguard Worker    vpermt2q              m5, m13, m7
; Shared output stage. Three entry points:
;   .end  - loads the scale factor pw_2048 into m12 (pmulhrsw by 2048 is a
;           rounded arithmetic shift right by 4).
;   .end2 - for callers that have already prepared m12 themselves
;           (iadst_16x16 pass 2 jumps here with signs folded into m12).
;   .end3 - for callers that have also already scaled m0/m1/m4/m5
;           (iflipadst_16x16 pass 2 jumps here).
; The residual is expected in m0/m8, m1/m9, m4/m10 and m5/m11 as word pairs
; matching the low/high byte halves of the destination rows loaded below.
2407*c0909341SAndroid Build Coastguard Worker.end:
2408*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2048)]
2409*c0909341SAndroid Build Coastguard Worker.end2:
2410*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m0, m1, m4, m5
2411*c0909341SAndroid Build Coastguard Worker.end3:
2412*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11
; Row pointers: r3 = 3*stride, r4 = dst + 4*stride, r5 = dst + 8*stride,
; r6 = dst + 12*stride.
2413*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2414*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*4]
2415*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*8]
2416*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4  +strideq*8]
; Gather sixteen 16-byte destination rows, four rows per zmm register:
;   m3  = rows 0, 1, 4, 5       m6  = rows 2, 3, 6, 7
;   m12 = rows 8, 9, 12, 13     m13 = rows 10, 11, 14, 15
; (m12 is free again here because the scaling above has completed.)
2417*c0909341SAndroid Build Coastguard Worker    mova                xm3, [dstq+strideq*0]
2418*c0909341SAndroid Build Coastguard Worker    mova                xm6, [dstq+strideq*2]
2419*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym3, [dstq+strideq*1], 1
2420*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym6, [dstq+r3       ], 1
2421*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, [r4+strideq*0], 2
2422*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [r4+strideq*2], 2
2423*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m3, [r4+strideq*1], 3
2424*c0909341SAndroid Build Coastguard Worker    vinserti32x4         m6, [r4+r3       ], 3
2425*c0909341SAndroid Build Coastguard Worker    mova               xm12, [r5+strideq*0]
2426*c0909341SAndroid Build Coastguard Worker    mova               xm13, [r5+strideq*2]
2427*c0909341SAndroid Build Coastguard Worker    vinserti32x4       ym12, [r5+strideq*1], 1
2428*c0909341SAndroid Build Coastguard Worker    vinserti32x4       ym13, [r5+r3       ], 1
2429*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m12, [r6+strideq*0], 2
2430*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m13, [r6+strideq*2], 2
2431*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m12, [r6+strideq*1], 3
2432*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m13, [r6+r3       ], 3
; m7 = 0: used both to clear the 8*64 = 512-byte coefficient buffer at cq and
; as the zero operand that widens destination bytes to words below.
2433*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
2434*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
; For each row group: zero-extend the low/high 8 bytes of every row to words,
; add the residual, then repack to bytes with unsigned saturation.
2435*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m7
2436*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m7
2437*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
2438*c0909341SAndroid Build Coastguard Worker    paddw                m8, m3
2439*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m8            ; rows 0, 1, 4, 5
2440*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m6, m7
2441*c0909341SAndroid Build Coastguard Worker    punpckhbw            m6, m7
2442*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
2443*c0909341SAndroid Build Coastguard Worker    paddw                m9, m6
2444*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m9            ; rows 2, 3, 6, 7
2445*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m12, m7
2446*c0909341SAndroid Build Coastguard Worker    punpckhbw           m12, m7
2447*c0909341SAndroid Build Coastguard Worker    paddw                m2, m4
2448*c0909341SAndroid Build Coastguard Worker    paddw               m10, m12
2449*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m10           ; rows 8, 9, 12, 13
2450*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m13, m7
2451*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m7
2452*c0909341SAndroid Build Coastguard Worker    paddw                m3, m5
2453*c0909341SAndroid Build Coastguard Worker    paddw               m11, m13
2454*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m11           ; rows 10, 11, 14, 15
; Scatter the four 128-bit lanes of each result back to the rows they were
; loaded from.
2455*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm0
2456*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym0, 1
2457*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], xm1
2458*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r3       ], ym1, 1
2459*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r4+strideq*0], m0, 2
2460*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r4+strideq*1], m0, 3
2461*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r4+strideq*2], m1, 2
2462*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r4+r3       ], m1, 3
2463*c0909341SAndroid Build Coastguard Worker    mova          [r5+strideq*0], xm2
2464*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r5+strideq*1], ym2, 1
2465*c0909341SAndroid Build Coastguard Worker    mova          [r5+strideq*2], xm3
2466*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r5+r3       ], ym3, 1
2467*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6+strideq*0], m2, 2
2468*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6+strideq*1], m2, 3
2469*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6+strideq*2], m3, 2
2470*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6+r3       ], m3, 3
2471*c0909341SAndroid Build Coastguard Worker    RET
; Reduced idct16 entry for inputs whose bottom three-quarters are zero.
; .main_fast2 performs the same m10 (pd_2048) / m13 (int_mshift) / k7 setup as
; .main_fast; .main_fast4 is a second entry that skips that setup.
; Reads m8, m1 and m7; produces m0, m8, m1 and a zeroed m6, then continues at
; .main5, a label not defined in this chunk (presumably inside
; IDCT16_1D_PACKED -- confirm).
2472*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2473*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast2 ; bottom three-quarters are zero
2474*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
2475*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m13, [o(int_mshift)]
2476*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, m13, m10, 6 ; k7 = bytes where m13 > m10 (unsigned)
2477*c0909341SAndroid Build Coastguard Worker.main_fast4:
2478*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_401_4076x8)]
2479*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_m1189_3920x8)]
2480*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_799_4017x8)]
2481*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8     ; t8a  t15a
2482*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m1     ; t11a t12a
2483*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3     ; t4a  t7a
2484*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
2485*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m2, m4 ; t11a t12a
2486*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m2, m4 ; t8a  t15a
2487*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
2488*c0909341SAndroid Build Coastguard Worker    jmp .main5
; Reduced idct16 entry for inputs whose bottom half is zero.
; Entry points differ only in how much constant setup they perform:
;   .main_fast  - loads m10 (pd_2048), m13 (int_mshift) and the k7 mask
;   .main_fast3 - m10 must already be loaded
;   .main_fast5 - m10, m13 and k7 must already be set up
; Each of m8, m0, m5, m1, m7 and m3 is scaled by one packed constant with
; pmulhrsw; execution then continues at .main4, a label not defined in this
; chunk (presumably inside IDCT16_1D_PACKED -- confirm).
2489*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2490*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast ; bottom half is zero
2491*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
2492*c0909341SAndroid Build Coastguard Worker.main_fast3:
2493*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m13, [o(int_mshift)]
2494*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, m13, m10, 6 ; k7 = bytes where m13 > m10 (unsigned)
2495*c0909341SAndroid Build Coastguard Worker.main_fast5:
2496*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_401_4076x8)]
2497*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_m2598_3166x8)]
2498*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1931_3612x8)]
2499*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m1189_3920x8)]
2500*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m2  ; t8a  t15a
2501*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_799_4017x8)]   ; m2 reused for the next constant
2502*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4  ; t9a  t14a
2503*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_m2276_3406x8)] ; m4 reused for the next constant
2504*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m11 ; t10a t13a
2505*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12 ; t11a t12a
2506*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m2  ; t4a  t7a
2507*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4  ; t5a  t6a
2508*c0909341SAndroid Build Coastguard Worker    jmp .main4
; Full idct16 kernel: the body is the expansion of the IDCT16_1D_PACKED macro
; (defined outside this chunk), followed by a plain near return. It is called
; from both passes of idct_16x16_internal_8bpc.
2509*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2510*c0909341SAndroid Build Coastguard Workercglobal_label .main
2511*c0909341SAndroid Build Coastguard Worker    IDCT16_1D_PACKED
2512*c0909341SAndroid Build Coastguard Worker    ret
2513*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x16 transform combinations whose first type is adst.
; INV_TXFM_16X16_FN is defined outside this chunk; presumably it emits the
; public entry points that dispatch into the *_internal routines -- confirm.
2514*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, dct
2515*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, adst
2516*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, flipadst
2517*c0909341SAndroid Build Coastguard Worker
; iadst_16x16_internal_8bpc: first pass of the 16x16 inverse ADST (8 bpc).
; .main_pass1 leaves the 16 outputs packed two per register with alternating
; signs (see its trailing comments). The punpck{l,h}wd ladder below interleaves
; them into groups of four (a..p name the 16 outputs, the digit is the element
; index within a 128-bit lane); comments list the low words first.
; m10 = pw_8192_m8192, so the pmulhrsw at .pass1_end both applies a rounded
; shift right by 2 and negates every second word, undoing the alternating
; signs. Control then transfers to the second pass through tx2q.
2518*c0909341SAndroid Build Coastguard Workercglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2519*c0909341SAndroid Build Coastguard Worker    call .main_pass1
2520*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_8192_m8192)]
2521*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
2522*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1     ; a0 c0 a1 c1 a2 c2 a3 c3
2523*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
2524*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m8     ; a0 b0 c0 d0 a1 b1 c1 d1
2525*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
2526*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3     ; e0 g0 e1 g1 e2 g2 e3 g3
2527*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
2528*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m8     ; e0 f0 g0 h0 e1 f1 g1 h1
2529*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3
2530*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5     ; i0 k0 i1 k1 i2 k2 i3 k3
2531*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
2532*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m8     ; i0 j0 k0 l0 i1 j1 k1 l1
2533*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3
2534*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7     ; m0 o0 m1 o1 m2 o2 m3 o3
2535*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
2536*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m8     ; m0 n0 o0 p0 m1 n1 o1 p1
; Shared tail: iflipadst_16x16 pass 1 also jumps here with its own m10.
2537*c0909341SAndroid Build Coastguard Worker.pass1_end:
2538*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
2539*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Second pass of the 16x16 inverse ADST.
; Calls .main_pass2, then uses permD-derived qword permutes (m8 = permD >> 8,
; m12 = permD >> 12, m13 = permD >> 4, m10 = permD) to arrange the results in
; m0/m1/m4/m5 and m8-m11 as expected by the idct_16x16 output stage.
; m12 is then rebuilt as the scale factor: pw_2048 everywhere, negated in the
; words selected by k1 = 0xff00ff00 (words 8-15 and 24-31, i.e. 128-bit lanes
; 1 and 3), so the pmulhrsw at idct .end2 also corrects the output signs.
2540*c0909341SAndroid Build Coastguard Worker.pass2:
2541*c0909341SAndroid Build Coastguard Worker    call .main_pass2
2542*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(permD)]
2543*c0909341SAndroid Build Coastguard Worker    psrlq                m8, m10, 8
2544*c0909341SAndroid Build Coastguard Worker    psrlq               m12, m10, 12
2545*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m10, 4
2546*c0909341SAndroid Build Coastguard Worker    mova                 m9, m8
2547*c0909341SAndroid Build Coastguard Worker    vpermi2q             m8, m0, m2 ;  0  1  4  5
2548*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m12, m2
2549*c0909341SAndroid Build Coastguard Worker    vpermi2q             m9, m1, m3 ;  2  3  6  7
2550*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m12, m3
2551*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2048)] ; permD >> 12 no longer needed
2552*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0xff00ff00
2553*c0909341SAndroid Build Coastguard Worker    mova                m11, m10
2554*c0909341SAndroid Build Coastguard Worker    vpermi2q            m10, m4, m6 ;  8  9 12 13
2555*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m13, m6
2556*c0909341SAndroid Build Coastguard Worker    kmovd                k1, r3d
2557*c0909341SAndroid Build Coastguard Worker    vpermi2q            m11, m5, m7 ; 10 11 14 15
2558*c0909341SAndroid Build Coastguard Worker    vpermt2q             m5, m13, m7
2559*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7            ; m7 is dead after the permute above
2560*c0909341SAndroid Build Coastguard Worker    vpsubw          m12{k1}, m7, m12       ; masked words: 0 - 2048 = -2048
2561*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).end2
; First-pass ADST helper (also called by iflipadst_16x16 pass 1).
; Loads the eight 64-byte coefficient rows from cq through qword permutes:
; rows 0/2/4/6 use permB (m4) as index, rows 1/3/5/7 use permB >> 4 (m3).
; m4 and m3 are loaded last because they hold the indices until then.
; After .main, the eight middle outputs (out4..out11) are produced with
; vpdpwssd dot products against +/-2896 word pairs, accumulating on top of a
; copy of m10, shifting right by 12 and packing back to words.
; NOTE(review): this relies on m10 still holding the pd_2048 broadcast loaded
; at the top of .main, i.e. on ITX_MUL2X_PACK not modifying it -- confirm.
; Returns m0-m7 with the signs and pairing given in the trailing comments.
2562*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2563*c0909341SAndroid Build Coastguard Worker.main_pass1:
2564*c0909341SAndroid Build Coastguard Worker    mova                 m4, [o(permB)]
2565*c0909341SAndroid Build Coastguard Worker    psrlq                m3, m4, 4
2566*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m4, [cq+64*0]
2567*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m3, [cq+64*7]
2568*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m4, [cq+64*6]
2569*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m3, [cq+64*1]
2570*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m4, [cq+64*2]
2571*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m3, [cq+64*5]
2572*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m4, [cq+64*4]
2573*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, [cq+64*3]
2574*c0909341SAndroid Build Coastguard Worker    call .main
2575*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_2896_2896)]
2576*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m2896_2896)]
2577*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
2578*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m5, m13       ; -out5
2579*c0909341SAndroid Build Coastguard Worker    mova                 m8, m10
2580*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m11, m13      ;  out4
2581*c0909341SAndroid Build Coastguard Worker    mova                 m9, m10
2582*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m5, m12       ;  out10
2583*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
2584*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m11, m12      ; -out11
2585*c0909341SAndroid Build Coastguard Worker    mova                m11, m10
2586*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m11, m3, m13       ; -out7
2587*c0909341SAndroid Build Coastguard Worker    mova                m14, m10
2588*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m14, m4, m13       ;  out6
; m13 (pw_2896_2896) is dead from here on and is reused as an accumulator;
; the last product takes its multiplier from memory via embedded broadcast and
; accumulates directly into m10.
2589*c0909341SAndroid Build Coastguard Worker    mova                m13, m10
2590*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m13, m3, m12       ;  out8
2591*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
2592*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
2593*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m8            ; -out5   out4
2594*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m9, m5        ;  out10 -out11
2595*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m11, m14      ; -out7   out6
2596*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m13, m10      ;  out8  -out9
2597*c0909341SAndroid Build Coastguard Worker    ret
; Second-pass ADST helper (also called by iflipadst_16x16 pass 2).
; .main_pass2 first applies the same 128-bit lane regrouping as the prologue
; of idct_16x16 .pass2; .main_pass2b is a second entry that skips the
; regrouping.
; Both then swap the two qwords of each 128-bit lane in m1/m3/m5/m7 (pshufd
; q1032), call .main, and build the eight middle outputs with saturating
; add/sub followed by pmulhrsw with pw_2896x8 (presumably 2896*8, i.e. a
; multiply by 2896/4096 with rounding -- confirm against the constant table).
; m12 is used as a pshufb control right after the call; .main loads it with
; deint_shuf.
2598*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2599*c0909341SAndroid Build Coastguard Worker.main_pass2:
2600*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
2601*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
2602*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
2603*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
2604*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
2605*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
2606*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
2607*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
2608*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m4, q3131 ;  4  5
2609*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m4, q2020     ;  0  1
2610*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m6, m8, q2020 ;  8  9
2611*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m8, q3131     ; 12 13
2612*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m5, q3131 ;  6  7
2613*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m5, q2020     ;  2  3
2614*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m7, m9, q2020 ; 10 11
2615*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m9, q3131     ; 14 15
2616*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2b
2617*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q1032}, m1, m3, m5, m7
2618*c0909341SAndroid Build Coastguard Worker    call .main
2619*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896x8)]
2620*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m11, m12
2621*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m12
2622*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m12
2623*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
2624*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m9, m5, m2        ;  t15a   t7
2625*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m2            ;  t14a   t6
2626*c0909341SAndroid Build Coastguard Worker    shufps               m2, m3, m4, q1032 ;  t2a    t10
2627*c0909341SAndroid Build Coastguard Worker    shufps               m3, m4, q3210     ;  t3a    t11
2628*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m2, m3        ;  out8  -out9
2629*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2            ; -out7   out6
2630*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m5, m9        ; -out5   out4
2631*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m9            ;  out10 -out11
2632*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
2633*c0909341SAndroid Build Coastguard Worker    ret
; Shared 16-point inverse ADST kernel for both passes.
; In:  m0-m7 hold the sixteen inputs; the punpck{l,h}wd block pairs them up as
;      (in14,in1) (in0,in15) (in12,in3) (in2,in13) (in10,in5) (in4,in11)
;      (in8,in7) (in6,in9).
; Constants: m10 = pd_2048 (passed to every ITX_MUL2X_PACK invocation),
;      m13 = int_mshift, k7 = byte mask from the unsigned compare m13 > m10.
;      ITX_MUL2X_PACK is defined outside this chunk; m4/m8/m6 and m9 are handed
;      to it as its 2nd and 3rd arguments (presumably scratch -- confirm).
; Out: m0 = -out1 out0, m1 = -out3 out2, m6 = out12 -out13, m7 = out14 -out15.
;      The remaining outputs are left unfinished for the caller:
;      m3 = t3a t2a, m4 = t10 t11, m5 = t15a t14a, m11 = t7 t6.
;      m12 = deint_shuf on return (used by .main_pass2b).
2634*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2635*c0909341SAndroid Build Coastguard Worker.main:
2636*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
2637*c0909341SAndroid Build Coastguard Worker    vpbroadcastq        m13, [o(int_mshift)]
2638*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m7, m0 ; in14 in1
2639*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7     ; in0  in15
2640*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m6, m1 ; in12 in3
2641*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6     ; in2  in13
2642*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5, m2 ; in10 in5
2643*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5     ; in4  in11
2644*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4, m3 ; in8  in7
2645*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; in6  in9
2646*c0909341SAndroid Build Coastguard Worker    vpcmpub              k7, m13, m10, 6 ; 0x33...
; Stage 1: one rotation per input pair.
2647*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 5 ; t0  t1
2648*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 5 ; t2  t3
2649*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 5 ; t4  t5
2650*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 5 ; t6  t7
2651*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 5 ; t8  t9
2652*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
2653*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
2654*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 5 ; t14 t15
; Stage 2: saturating butterflies, then rotate the difference terms.
2655*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m5 ; t9a  t8a
2656*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m5     ; t1a  t0a
2657*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m6 ; t11a t10a
2658*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m6     ; t3a  t2a
2659*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m2, m7 ; t13a t12a
2660*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m7     ; t5a  t4a
2661*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m3, m8 ; t15a t14a
2662*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8     ; t7a  t6a
2663*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 8, 9, 10, 799,       4017,        4 ; t8  t9
2664*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        6, 8, 9, 10, 799_4017,  4017_m799,  52 ; t12 t13
2665*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 8, 9, 10, 3406,      2276,        4 ; t10 t11
2666*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
; Stage 3: second round of butterflies and rotations.
2667*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m1, m3 ; t7   t6
2668*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3     ; t3   t2
2669*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m0, m2 ; t5   t4
2670*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2     ; t1   t0
2671*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m5, m7 ; t14a t15a
2672*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m5     ; t10a t11a
2673*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m4, m6 ; t12a t13a
2674*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m6     ; t8a  t9a
2675*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 6, 9, 10, 1567,       3784,        5 ; t5a t4a
2676*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 6, 9, 10, 3784_m1567, 1567_3784,  52 ; t7a t6a
2677*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 6, 9, 10, 3784,       1567,        4 ; t15 t14
2678*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 6, 9, 10, 3784_1567,  1567_m3784, 52 ; t13 t12
; Final butterflies. Eight outputs are finished here; the other eight still
; need the 2896 scaling that the pass-specific callers apply.
2679*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m12, [o(deint_shuf)]
2680*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m4, m7        ; -out1  out14
2681*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m7            ;  t10    t11
2682*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m3, m8        ;  t7     t6
2683*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m3            ;  out12 -out3
2684*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m0, m1        ;  t3a    t2a
2685*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1            ; -out15  out0
2686*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m5        ; -out13  out2
2687*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2            ;  t15a   t14a
2688*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m12
2689*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
2690*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m12
2691*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m12
2692*c0909341SAndroid Build Coastguard Worker    shufps               m7, m6, m0, q1032 ;  out14 -out15
2693*c0909341SAndroid Build Coastguard Worker    shufps               m0, m6, m0, q3210 ; -out1   out0
2694*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m8, m1        ;  out12 -out13
2695*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m8, m1        ; -out3   out2
2696*c0909341SAndroid Build Coastguard Worker    ret
2697*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x16 transform combinations whose first type is flipadst.
; INV_TXFM_16X16_FN is defined outside this chunk; presumably it emits the
; public entry points that dispatch into the *_internal routines -- confirm.
2698*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, dct
2699*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, adst
2700*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, flipadst
2701*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x16_internal_8bpc: first pass of the flipped 16x16 inverse ADST.
; Reuses iadst_16x16 .main_pass1, then interleaves its packed outputs with the
; source registers taken in reverse order (m1,m0 / m7,m6 / m3,m2 / m5,m4), so
; the letters a..p in the comments name the outputs in flipped order.
; m10 = pw_m8192_8192 is the sign pattern opposite to the one iadst uses; the
; scaling and the jump to the second pass are shared via iadst .pass1_end.
2702*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
2703*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main_pass1
2704*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_m8192_8192)]
2705*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
2706*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
2707*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
2708*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m6     ; b0 d0 b1 d1 b2 d2 b3 d3
2709*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
2710*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m7     ; a2 b2 c2 d2 a3 b3 c3 d3
2711*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
2712*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
2713*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
2714*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
2715*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
2716*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4     ; f0 h0 f1 h1 f2 h2 f3 h3
2717*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
2718*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m5     ; e2 f2 g2 h2 e3 f3 g3 h3
2719*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
2720*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
2721*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x16_internal_8bpc).pass1_end
; Second pass of the flipped 16x16 inverse ADST.
; Same structure as iadst_16x16 .pass2, with two differences:
;   * the permute sources are taken in reverse register order (m7,m5 / m6,m4 /
;     m3,m1 / m2,m0), presumably implementing the vertical flip -- confirm;
;   * the sign mask is 0x00ff00ff, so pw_2048 is negated in words 0-7 and
;     16-23 (128-bit lanes 0 and 2) instead of lanes 1 and 3.
; The vpermt2q results land in m7/m6/m3/m2, so they are scaled here directly
; into m0/m1/m4/m5 and the shared output stage is entered at idct .end3,
; which scales only m8-m11.
2722*c0909341SAndroid Build Coastguard Worker.pass2:
2723*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main_pass2
2724*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(permD)]
2725*c0909341SAndroid Build Coastguard Worker    psrlq                m8, m10, 8
2726*c0909341SAndroid Build Coastguard Worker    psrlq               m12, m10, 12
2727*c0909341SAndroid Build Coastguard Worker    psrlq               m13, m10, 4
2728*c0909341SAndroid Build Coastguard Worker    mova                 m9, m8
2729*c0909341SAndroid Build Coastguard Worker    vpermi2q             m8, m7, m5 ;  0  1  4  5
2730*c0909341SAndroid Build Coastguard Worker    vpermt2q             m7, m12, m5
2731*c0909341SAndroid Build Coastguard Worker    vpermi2q             m9, m6, m4 ;  2  3  6  7
2732*c0909341SAndroid Build Coastguard Worker    vpermt2q             m6, m12, m4
2733*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2048)] ; permD >> 12 no longer needed
2734*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 0x00ff00ff
2735*c0909341SAndroid Build Coastguard Worker    mova                m11, m10
2736*c0909341SAndroid Build Coastguard Worker    vpermi2q            m10, m3, m1 ;  8  9 12 13
2737*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m13, m1
2738*c0909341SAndroid Build Coastguard Worker    kmovd                k1, r3d
2739*c0909341SAndroid Build Coastguard Worker    vpermi2q            m11, m2, m0 ; 10 11 14 15
2740*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m13, m0
2741*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0            ; m0 is dead after the permute above
2742*c0909341SAndroid Build Coastguard Worker    vpsubw          m12{k1}, m0, m12       ; masked words: 0 - 2048 = -2048
2743*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, m12
2744*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6, m12
2745*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m3, m12
2746*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m2, m12
2747*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).end3
2748*c0909341SAndroid Build Coastguard Worker
; Instantiate the identity/dct and identity/identity 16x16 variants.
; NOTE(review): INV_TXFM_16X16_FN is defined outside this chunk; presumably it
; emits the public entry points that dispatch into the internal routine
; below -- confirm against the macro definition.
2749*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, dct
2750*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, identity
2751*c0909341SAndroid Build Coastguard Worker
; iidentity_16x16_internal_8bpc -- identity transform, 16x16, 8 bpc.
;
; First pass (falls through from the cglobal label):
;   * reads the eight 64-byte coefficient rows at cq, each through a vpermb
;     byte shuffle controlled by int16_perm (lane layout in the comments),
;   * turns every word x into pavgw(x, pmulhrsw(x, pw_1697x16) >> 1),
;   * interleaves dword pairs into m0-m7 and tail-jumps to the second-pass
;     handler whose address is held in tx2q.
; Second pass (.pass2):
;   * turns every word x of m0-m7 into
;     paddsw(paddsw(x, x), pmulhrsw(x, pw_1697x16)), i.e. a saturating 2*x
;     plus the scaled term,
;   * regroups qwords with index vectors derived from permD and jumps to
;     idct_16x16_internal_8bpc.end (outside this chunk; presumably the shared
;     rounding/store tail -- confirm).
; o(...) addresses constants; NOTE(review): presumably relative to a base
; pointer set up by the caller -- confirm.
2752*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
; m8 is the shuffle control for all eight loads; the last load overwrites it.
2753*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(int16_perm)]
2754*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
2755*c0909341SAndroid Build Coastguard Worker    vpermb               m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
2756*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pw_1697x16)]
2757*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
2758*c0909341SAndroid Build Coastguard Worker    vpermb               m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
2759*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
2760*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
2761*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
2762*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
; Scaled copies of m1-m8 go to m9-m15 and m0; the constant in m0 is
; consumed by the last multiply.
2763*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m0, m1
2764*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m0, m2
2765*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m0, m3
2766*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m0, m4
2767*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m0, m5
2768*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m0, m6
2769*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m0, m7
2770*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8
; Halve the scaled term, then average it with the unscaled input.
2771*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 1}, m9, m10, m11, m12
2772*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m9
2773*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m10
2774*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m11
2775*c0909341SAndroid Build Coastguard Worker    pavgw                m4, m12
2776*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 1}, m13, m14, m15, m0
2777*c0909341SAndroid Build Coastguard Worker    pavgw                m5, m13
2778*c0909341SAndroid Build Coastguard Worker    pavgw                m6, m14
2779*c0909341SAndroid Build Coastguard Worker    pavgw                m7, m15
2780*c0909341SAndroid Build Coastguard Worker    pavgw                m8, m0
; Interleave dword (word-pair) groups of neighbouring registers -> m0-m7.
2781*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
2782*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2     ; a2 b2 c2 d2 a3 b3 c3 d3
2783*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
2784*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4     ; e2 f2 g2 h2 e3 f3 g3 h3
2785*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
2786*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m6     ; i2 j2 k2 l2 i3 j3 k3 l3
2787*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
2788*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m8     ; m2 n2 o2 p2 m3 n3 o3 p3
; Continue with whichever second pass the caller selected.
2789*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
2790*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2791*c0909341SAndroid Build Coastguard Worker.pass2:
; m11 holds the constant and is itself overwritten by the final multiply.
2792*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1697x16)]
2793*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m11, m0
2794*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m11, m1
2795*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m11, m2
2796*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m11, m3
2797*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11, m4
2798*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m5
2799*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m11, m6
2800*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m7
; Saturating doubling of every input, then add the scaled term.
2801*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
2802*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m12
2803*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m13
2804*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m14
2805*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m15
; The sums for the m4/m5 inputs are accumulated into m8/m9 so that m4/m5 can
; be recycled for the permute indices loaded next.
2806*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m4
; movu: the +2 byte offset makes this an unaligned load.
2807*c0909341SAndroid Build Coastguard Worker    movu                 m4, [o(permD+2)]
2808*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m5
2809*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m10
2810*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m11
; m12 = same index vector shifted down one nibble per qword, giving a second
; qword selection for the vpermt2q forms. vpermi2q overwrites its index
; operand, hence the copies in m5/m10/m11.
2811*c0909341SAndroid Build Coastguard Worker    psrlq               m12, m4, 4
2812*c0909341SAndroid Build Coastguard Worker    mova                 m5, m4
2813*c0909341SAndroid Build Coastguard Worker    mova                m10, m4
2814*c0909341SAndroid Build Coastguard Worker    mova                m11, m4
; Results end up in m4/m0/m5/m1 and m10/m8/m11/m9.
2815*c0909341SAndroid Build Coastguard Worker    vpermi2q             m4, m0, m2  ;  8  9 12 13
2816*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m12, m2 ;  0  1  4  5
2817*c0909341SAndroid Build Coastguard Worker    vpermi2q             m5, m1, m3  ; 10 11 14 15
2818*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m12, m3 ;  2  3  6  7
2819*c0909341SAndroid Build Coastguard Worker    vpermi2q            m10, m8, m6
2820*c0909341SAndroid Build Coastguard Worker    vpermt2q             m8, m12, m6
2821*c0909341SAndroid Build Coastguard Worker    vpermi2q            m11, m9, m7
2822*c0909341SAndroid Build Coastguard Worker    vpermt2q             m9, m12, m7
2823*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).end
2824*c0909341SAndroid Build Coastguard Worker
; ITX_UNPACK_MULHRSW dst1, dst2, src, tmp, coef1, coef2, coef3, coef4
;   dst1 = pmulhrsw(low  words of src, each duplicated, pw_<coef1>_<coef2>x8)
;   dst2 = pmulhrsw(high words of src, each duplicated, pw_<coef3>_<coef4>x8)
; "low"/"high" are per 128-bit lane (punpcklwd/punpckhwd semantics).
; The constant names are built by token pasting, so a matching
; pw_<a>_<b>x8 symbol must exist for every coefficient pair used.
; NOTE(review): presumably each constant is a word pair, so that every source
; word is multiplied by both coefficients -- the tables are outside this
; chunk; confirm.
; Clobbers tmp. dst2 may be the same register as src (callers in this file
; pass e.g. 12, 14, 14): src is read for dst1 before dst2 is written, so the
; statement order must be kept.
2825*c0909341SAndroid Build Coastguard Worker%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
2826*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%4, [o(pw_%5_%6x8)]
2827*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m%3, m%3
2828*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m%4
2829*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%4, [o(pw_%7_%8x8)]
2830*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%2, m%3, m%3
2831*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m%4
2832*c0909341SAndroid Build Coastguard Worker%endmacro
2833*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_8x32_8bpc(dst, stride, c, eob)
;
; Inverse DCT/DCT for an 8x32 block of 8-bit pixels (per the function name):
; transforms the coefficients at cq, adds the result to the pixels at dst and
; clears the coefficient buffer.
;
; Dispatch on eob:
;   eob == 0   -> .dconly  (DC-only shortcut)
;   eob <  107 -> .fast    (unsigned compare; only the first 32 bytes of each
;                           64-byte coefficient row are read)
;   otherwise  -> full path below
; Both transform paths join at .end, which reads the destination rows with
; gathers, adds the residual and writes them back with scatters.
;
; With x86inc's argument naming, dstq/strideq/cq/eobd are r0/r1/r2/r3; the
; code relies on that when it reuses r1 and r3 as scratch further down.
2834*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
2835*c0909341SAndroid Build Coastguard Worker%undef cmp
; r5 = base address used by the o(...) constant references.
2836*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
2837*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
2838*c0909341SAndroid Build Coastguard Worker    jz .dconly
2839*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 107
2840*c0909341SAndroid Build Coastguard Worker    jb .fast
; ---- full path: load all eight 64-byte coefficient rows into m0-m7 ----
2841*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*5]
2842*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*3]
2843*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*1]
2844*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*7]
2845*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*2]
2846*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*6]
2847*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*0]
2848*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*4]
; First pass, shared with the 32x8 transform (.main is outside this chunk).
2849*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
; Byte-permute the results (idct_8x32p), interleave dwords and apply the
; intermediate rounding multiply by pw_8192.
; NOTE(review): presumably pw_8192 holds words of 8192, making the pmulhrsw
; a rounded >> 2 -- the constant is defined outside this chunk; confirm.
2850*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(idct_8x32p)]
2851*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_8192)]
2852*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
2853*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m0, m1 ; ab
2854*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m1
2855*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2, m3 ; cd
2856*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m3
2857*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m4, m5 ; ef
2858*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m5
2859*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m6, m7 ; gh
2860*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m7
2861*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
; Interleave qwords; the numbers in the comments are the second-pass input
; rows carried by each register.
2862*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m18, m8, m1 ; 30  2    6 26   31  1   23  9
2863*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m8, m1 ; 16  0   12 20    3 29   11 21
2864*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m21, m0, m2 ; 14 18   22 10   27  5   19 13
2865*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m0, m2 ; 18  4   24  8    7 25   15 17
; NOTE(review): the leading "18" on the line above is probably a typo for
; "28": across the four comments every even row 0..30 is listed once except
; 18 (listed twice) and 28 (never listed) -- confirm.
2866*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m20, m3, m5
2867*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m16, m3, m5
2868*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m19, m4, m6
2869*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m4, m6
; Assemble the 256-bit arguments of idct_8x16 .main2 from the 128-bit lanes
; of the registers above (ym8/ym1, ym9/ym3, ym0/ym5, ym7/ym6).
2870*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym8, ym18, xm20, 1
2871*c0909341SAndroid Build Coastguard Worker    vshufi32x4          ym1, ym18, ym20, 0x03
2872*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym9, ym14, xm16, 1
2873*c0909341SAndroid Build Coastguard Worker    vshufi32x4          ym3, ym14, ym16, 0x03
2874*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, ym21, xm19, 1
2875*c0909341SAndroid Build Coastguard Worker    vshufi32x4          ym5, ym21, ym19, 0x03
2876*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym7, ym15, xm17, 1
2877*c0909341SAndroid Build Coastguard Worker    vshufi32x4          ym6, ym15, ym17, 0x03
2878*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main2
; permB >> 60 leaves only the top nibble of every qword, which serves as the
; qword index for the two-source permutes. The upper 256 bits of each
; permuted register are then split off as the remaining .main2 inputs.
2879*c0909341SAndroid Build Coastguard Worker    psrlq               m12, [o(permB)], 60
2880*c0909341SAndroid Build Coastguard Worker    vpermt2q            m14, m12, m16
2881*c0909341SAndroid Build Coastguard Worker    vpermt2q            m21, m12, m19
2882*c0909341SAndroid Build Coastguard Worker    vpermt2q            m15, m12, m17
2883*c0909341SAndroid Build Coastguard Worker    vpermi2q            m12, m18, m20
2884*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym16, m14, 1
2885*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym19, m21, 1
2886*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym17, m15, 1
2887*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym20, m12, 1
2888*c0909341SAndroid Build Coastguard Worker    call .main2
2889*c0909341SAndroid Build Coastguard Worker    jmp .end
; ---- fast path: two half rows (32 bytes each) are packed per register ----
2890*c0909341SAndroid Build Coastguard Worker.fast: ; right half is zero
2891*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(int16_perm)]
2892*c0909341SAndroid Build Coastguard Worker    mova                ym2, [cq+64*4]
2893*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, [cq+64*0], 1
2894*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64*6]
2895*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [cq+64*2], 1
2896*c0909341SAndroid Build Coastguard Worker    mova                ym4, [cq+64*3]
2897*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, [cq+64*5], 1
2898*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*7]
2899*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [cq+64*1], 1
2900*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m0, x}, m2, m3, m4, m5
; First pass, shared with the 16x8 transform (.main2 is outside this chunk).
2901*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main2
2902*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m4, [o(int_shuf3)]
2903*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m5, [o(int_shuf4)]
2904*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4     ; e0 f0 e2 f2 e1 f1 e3 f3
2905*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5     ; g0 h0 g2 h2 g1 h1 g3 h3
2906*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4     ; a0 b0 a2 b2 a1 b1 a3 b3
2907*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m5     ; c0 d0 c2 d2 c1 d1 c3 d3
; m4/m5 are recycled: m4 = intermediate rounding constant, m5 = qword
; permute index (top nibble of each permB qword).
2908*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_8192)]
2909*c0909341SAndroid Build Coastguard Worker    psrlq                m5, [o(permB)], 60
2910*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
2911*c0909341SAndroid Build Coastguard Worker    punpckhdq           m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
2912*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
2913*c0909341SAndroid Build Coastguard Worker    punpckhdq           m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
2914*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m6, m17, m2, m16
; Distribute the 16 second-pass inputs (row numbers in the comments):
; even rows for idct_8x16 .main_fast, odd rows (ym14-ym17) for .main_fast.
2915*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym0, ym2, xm6, 1      ;  0  2
2916*c0909341SAndroid Build Coastguard Worker    vshufi32x4          ym1, ym2, ym6, 0x03   ;  4  6
2917*c0909341SAndroid Build Coastguard Worker    vinserti32x4       ym14, ym16, xm17, 1    ;  1  3
2918*c0909341SAndroid Build Coastguard Worker    vshufi32x4         ym15, ym16, ym17, 0x03 ;  5  7
2919*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m5, m6           ;  8 10
2920*c0909341SAndroid Build Coastguard Worker    vpermt2q            m16, m5, m17          ;  9 11
2921*c0909341SAndroid Build Coastguard Worker    vextracti32x8       ym3, m2, 1            ; 12 14
2922*c0909341SAndroid Build Coastguard Worker    vextracti32x8      ym17, m16, 1           ; 13 15
2923*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main_fast
2924*c0909341SAndroid Build Coastguard Worker    call .main_fast
; ---- common tail: final butterflies, add to destination, clear coefs ----
2925*c0909341SAndroid Build Coastguard Worker.end:
; ym8 = stride * gather8d: one dword byte offset per gathered qword.
; NOTE(review): gather8d is defined outside this chunk; presumably a table of
; row multipliers -- confirm.
2926*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym8, strided
2927*c0909341SAndroid Build Coastguard Worker    pmulld              ym8, [o(gather8d)]
; .main_end leaves out0..out31 in m0-m7 and the pw_2048 constant in m10.
2928*c0909341SAndroid Build Coastguard Worker    call .main_end
; Four base pointers: r0 = dst, r3 = dst + 4*stride, r4 = dst + 8*stride,
; r1 = dst + 12*stride (r1 was strideq; this is its last use). Each base is
; combined with the eight offsets in ym8, so each gather/scatter touches
; eight 8-byte rows.
; Gathers and scatters clear their mask register as they complete, so k1 and
; k2 are ping-ponged to always keep one all-ones copy alive.
2929*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*4]
2930*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1
2931*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*8]
2932*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
2933*c0909341SAndroid Build Coastguard Worker    lea                  r1, [r3+strideq*8]
2934*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
2935*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m12{k1}, [r0+ym8]
2936*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
2937*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m13{k2}, [r3+ym8]
2938*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
2939*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m14{k1}, [r4+ym8]
2940*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
2941*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m15{k2}, [r1+ym8]
; Final rounding multiply of the residual by m10 (pw_2048).
2942*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
; Clear the whole 8*64-byte coefficient buffer (m9 is zero).
2943*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7
; For each group: widen the gathered pixel bytes to words against zero, add
; the residual, pack back to bytes with unsigned saturation and scatter to
; the same addresses that were gathered from.
2944*c0909341SAndroid Build Coastguard Worker    punpcklbw           m11, m12, m9
2945*c0909341SAndroid Build Coastguard Worker    punpckhbw           m12, m9
2946*c0909341SAndroid Build Coastguard Worker    paddw                m0, m11
2947*c0909341SAndroid Build Coastguard Worker    paddw                m1, m12
2948*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2949*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
2950*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r0+ym8]{k1}, m0
2951*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m13, m9
2952*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m9
2953*c0909341SAndroid Build Coastguard Worker    paddw                m2, m12
2954*c0909341SAndroid Build Coastguard Worker    paddw                m3, m13
2955*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
2956*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
2957*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r3+ym8]{k2}, m2
2958*c0909341SAndroid Build Coastguard Worker    punpcklbw           m13, m14, m9
2959*c0909341SAndroid Build Coastguard Worker    punpckhbw           m14, m9
2960*c0909341SAndroid Build Coastguard Worker    paddw                m4, m13
2961*c0909341SAndroid Build Coastguard Worker    paddw                m5, m14
2962*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
2963*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
2964*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r4+ym8]{k1}, m4
2965*c0909341SAndroid Build Coastguard Worker    punpcklbw           m14, m15, m9
2966*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m9
2967*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
2968*c0909341SAndroid Build Coastguard Worker    paddw                m7, m15
2969*c0909341SAndroid Build Coastguard Worker    packuswb             m6, m7
2970*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r1+ym8]{k2}, m6
2971*c0909341SAndroid Build Coastguard Worker    RET
; ---- DC-only shortcut (eob == 0) ----
; r6d = (dc * 181 + 128 + 512) >> 10, then continue in the shared 8x8 DC-only
; tail. eobd is zero on this path, so storing it clears the DC coefficient
; (a 4-byte store).
; NOTE(review): r3d is the eob register (zero here), so the OR sets it to 32;
; presumably that is the row count consumed by .dconly2 -- confirm.
2972*c0909341SAndroid Build Coastguard Worker.dconly:
2973*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
2974*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
2975*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
2976*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
2977*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+512
2978*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+2
2979*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
; Helper chain producing the t16..t31 terms of the 32-point inverse DCT from
; the odd-numbered inputs (term and input names are those used in the lane
; comments below). The code between INIT_YMM and INIT_ZMM is assembled for
; 256-bit registers, so "mN" means ymmN here.
;
; Entry points, from the most to the least reduced input set:
;   .main_fast2  reads m14, m15 only             -> joins at .main4
;   .main_fast   reads m14-m17                   -> joins at .main3
;   .main        reads m14-m21, interleaves them -> falls into .main2
;   .main2       inputs already interleaved in m12, m14-m17, m19-m21
; On return (per the final lane comments): m21 = t16 t17a, m13 = t19a t18,
; m16 = t20 t21a, m18 = t23a t22, m12 = t24a t25, m14 = t27 t26a,
; m19 = t28a t29, m20 = t31 t30a, and m15 = permA as a full 64-byte vector.
; m8-m11 and m17 are used as scratch.
; NOTE(review): ITX_MUL2X_PACK is defined outside this chunk; presumably a
; packed two-coefficient rotation with rounding register m10 -- confirm
; before relying on its operand conventions.
2980*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl
2981*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2982*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast2 ; bottom three-quarters are zero
2983*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   12, 14, 14, 8,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
2984*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   21, 20, 15, 8,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
; These copies take the place of the .main3 butterflies, which is skipped.
; NOTE(review): presumably valid because the other operand of each butterfly
; is zero on this path -- confirm.
2985*c0909341SAndroid Build Coastguard Worker    mova                m11, m12
2986*c0909341SAndroid Build Coastguard Worker    mova                m17, m20
2987*c0909341SAndroid Build Coastguard Worker    mova                m15, m21
2988*c0909341SAndroid Build Coastguard Worker    mova                m16, m14
2989*c0909341SAndroid Build Coastguard Worker    jmp .main4
2990*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2991*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast ; bottom half is zero
; Each input register is scaled in place: the second destination of every
; macro call is also its source.
2992*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   12, 14, 14, 8,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
2993*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   21, 15, 15, 8,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
2994*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
2995*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
2996*c0909341SAndroid Build Coastguard Worker    jmp .main3
2997*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2998*c0909341SAndroid Build Coastguard Workercglobal_label .main
; Pair up the inputs word-wise so that each register holds the two inputs of
; one rotation.
2999*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m21, m14 ; in31 in1
3000*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m21      ; in3  in29
3001*c0909341SAndroid Build Coastguard Worker    punpcklwd           m21, m20, m15 ; in27 in5
3002*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m20      ; in7  in25
3003*c0909341SAndroid Build Coastguard Worker    punpcklwd           m20, m19, m16 ; in23 in9
3004*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m19      ; in11 in21
3005*c0909341SAndroid Build Coastguard Worker    punpcklwd           m19, m18, m17 ; in19 in13
3006*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m18      ; in15 in17
; Stage 1: one rotation per input pair.
3007*c0909341SAndroid Build Coastguard Worker.main2:
3008*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       12, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
3009*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
3010*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       21, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
3011*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
3012*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
3013*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
3014*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
3015*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
; Stage 2: saturating add/sub butterflies.
3016*c0909341SAndroid Build Coastguard Worker.main3:
3017*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m12, m17 ; t17 t30
3018*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m17      ; t16 t31
3019*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m15, m20 ; t18 t29
3020*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m15      ; t19 t28
3021*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m21, m16 ; t21 t26
3022*c0909341SAndroid Build Coastguard Worker    paddsw              m21, m16      ; t20 t27
3023*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m14, m19 ; t22 t25
3024*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m19      ; t23 t24
; Stage 3: rotations of the difference terms, followed by butterflies.
3025*c0909341SAndroid Build Coastguard Worker.main4:
3026*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       11, 18, 19, 10,   799, 4017, 5 ; t17a t30a
3027*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 18, 19, 10, m4017,  799, 5 ; t18a t29a
3028*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15, 18, 19, 10,  3406, 2276, 5 ; t21a t26a
3029*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
; m8 is preloaded here and passed as a coefficient operand ("8") to the four
; ITX_MUL2X_PACK calls after the butterflies.
3030*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_m3784_1567)]
3031*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m12, m20 ; t19a t28a
3032*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m12      ; t16a t31a
3033*c0909341SAndroid Build Coastguard Worker    psubsw              m12, m14, m21 ; t20a t27a
3034*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m21      ; t23a t24a
3035*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m11, m17 ; t18  t29
3036*c0909341SAndroid Build Coastguard Worker    paddsw              m11, m17      ; t17  t30
3037*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m16, m15 ; t21  t26
3038*c0909341SAndroid Build Coastguard Worker    paddsw              m16, m15      ; t22  t25
3039*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       21, 18, 15, 10, 1567_3784, 8,   20 ; t18a t29a
3040*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19, 18, 15, 10, 1567_3784, 8,   20 ; t19  t28
3041*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       12, 18, 15, 10, 8, m1567_m3784, 36 ; t20  t27
3042*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
; Stage 4: last butterflies and the pw_2896 rotations. m18 is the deint_shuf
; byte shuffle until the REPX below has used it, after which it is reused as
; an output register; m8/m9 hold the two coefficient vectors.
3043*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m18, [o(deint_shuf)]
3044*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_m2896_2896)]
3045*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2896_2896)]
3046*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m20, m14 ; t23  t24
3047*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m14      ; t16  t31
3048*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m11, m16 ; t22a t25a
3049*c0909341SAndroid Build Coastguard Worker    paddsw              m11, m16      ; t17a t30a
3050*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m21, m17 ; t21  t26
3051*c0909341SAndroid Build Coastguard Worker    paddsw              m21, m17      ; t18  t29
3052*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m19, m12 ; t20a t27a
3053*c0909341SAndroid Build Coastguard Worker    paddsw              m19, m12      ; t19a t28a
3054*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m18}, m20, m11, m21, m19
3055*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15, 18, 12, 10, 8, 9, 8 ; t23a t22a
3056*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14, 13, 15, 10, 8, 9, 8 ; t22  t25
; The rotations above left their results unpacked; combine them pairwise
; into saturated words.
3057*c0909341SAndroid Build Coastguard Worker    packssdw            m18, m13      ; t23a t22
3058*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m15      ; t24a t25
3059*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 13, 15, 10, 8, 9, 8 ; t21a t26a
3060*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 16, 14, 10, 8, 9, 8 ; t20  t27
3061*c0909341SAndroid Build Coastguard Worker    packssdw            m16, m13      ; t20  t21a
3062*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15      ; t27  t26a
; Regroup the shuffled sum terms by qword into the output layout.
3063*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m13, m19, m21 ; t19a t18
3064*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m19, m21      ; t28a t29
3065*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m21, m20, m11 ; t16  t17a
3066*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m20, m11      ; t31  t30a
; Back to 512-bit mode so that the permA load below is a full zmm load; m15
; is the permute index that .main_end expects.
3067*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl
3068*c0909341SAndroid Build Coastguard Worker    mova                m15, [o(permA)]
3069*c0909341SAndroid Build Coastguard Worker    ret
; .main_end -- final stage of the 32-point inverse DCT.
; In:  m0-m7 = t0..t15 (two registers per group of four, per the comments),
;      m12-m14, m16, m18-m21 = t16..t31 terms,
;      m15 = qword permute index (permA).
; Out: m0 = out0-3,   m2 = out4-7,   m4 = out8-11,  m6 = out12-15,
;      m1 = out19-16, m3 = out23-20, m5 = out27-24, m7 = out31-28,
;      m10 = pw_2048 broadcast, for the caller's final rounding multiply.
; Each vpermt2q merges a register pair into a single vector using the m15
; index; the even and odd halves are then combined with saturating add/sub.
3070*c0909341SAndroid Build Coastguard Workercglobal_label .main_end
3071*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_2048)]
3072*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m15, m1  ; t0   t1   t2   t3
3073*c0909341SAndroid Build Coastguard Worker    vpermt2q            m20, m15, m19 ; t31  t30a t29  t28a
3074*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m15, m3  ; t4   t5   t6   t7
3075*c0909341SAndroid Build Coastguard Worker    vpermt2q            m14, m15, m12 ; t27  t26a t25  t24a
3076*c0909341SAndroid Build Coastguard Worker    vpermt2q             m4, m15, m5  ; t8   t9   t10  t11
3077*c0909341SAndroid Build Coastguard Worker    vpermt2q            m18, m15, m16 ; t23a t22  t21a t20
3078*c0909341SAndroid Build Coastguard Worker    vpermt2q             m6, m15, m7  ; t12  t13  t14  t15
3079*c0909341SAndroid Build Coastguard Worker    vpermt2q            m13, m15, m21 ; t19a t18  t17a t16
3080*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m0, m20  ; out31 out30 out29 out28
3081*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m20      ; out0  out1  out2  out3
3082*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2, m14  ; out27 out26 out25 out24
3083*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m14      ; out4  out5  out6  out7
3084*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m4, m18  ; out23 out22 out21 out20
3085*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m18      ; out8  out9  out10 out11
3086*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m6, m13  ; out19 out18 out17 out16
3087*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m13      ; out12 out13 out14 out15
; NOTE(review): vzeroupper zeroes bits 128 and up of physical zmm0-zmm15, so
; the full-width results in m0-m7 only survive if those names map to other
; physical registers. Presumably x86inc's AVX-512 register permutation maps
; m0-m15 to zmm16-zmm31 for this function, so only the no-longer-needed
; m16-m31 are affected -- confirm in x86inc before moving this instruction.
3088*c0909341SAndroid Build Coastguard Worker    vzeroupper
3089*c0909341SAndroid Build Coastguard Worker    ret
3090*c0909341SAndroid Build Coastguard Worker
; LOAD_PACKED_16X2 dst, row1, row2
; Loads two 16-byte coefficient groups, at cq+16*row1 and cq+16*row2, and
; packs them into ym<dst>. Each group is first broadcast to both 128-bit
; lanes; shufpd with immediate 0x0c then selects, per lane:
;   low  lane = { low  8 bytes of row1, low  8 bytes of row2 }
;   high lane = { high 8 bytes of row1, high 8 bytes of row2 }
; Always operates on ymm registers regardless of the current INIT_* mode,
; and clobbers ym8, so 8 must not be passed as dst.
3091*c0909341SAndroid Build Coastguard Worker%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
3092*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4    ym%1, [cq+16*%2]
3093*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     ym8, [cq+16*%3]
3094*c0909341SAndroid Build Coastguard Worker    shufpd             ym%1, ym8, 0x0c
3095*c0909341SAndroid Build Coastguard Worker%endmacro
3096*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_32x8_8bpc(dst, stride, c, eob)
; Inverse DCT/DCT transform-and-add for a block 32 pixels wide and 8 rows tall:
; the residual derived from the coefficients at cq is added to the 8-bit pixels
; at dstq (with unsigned saturation) and the coefficients that were read are
; overwritten with zeros.
;   eob == 0   -> .dconly: only the first (DC) coefficient is used.
;   eob <  107 -> .fast:   16-byte coefficient rows 16-31 are not read.
; The sub-labels .dconly/.dconly2/.dconly3 and .main/.main2/.main3 are separate
; entry points (.main is exported through cglobal_label); presumably they are
; shared with other transform sizes - confirm against the callers.
3097*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
3098*c0909341SAndroid Build Coastguard Worker%undef cmp
3099*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3100*c0909341SAndroid Build Coastguard Worker    jz .dconly
; r5 = base address, presumably the one the o() constant references are
; relative to (o() is defined outside this view).
3101*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
; Load 16-byte coefficient rows 0-15, two rows per register (m8 is scratch).
3102*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      0,  0,  2 ; in0  in2
3103*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      1,  4,  6 ; in4  in6
3104*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      2,  8, 10 ; in8  in10
3105*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      3, 12, 14 ; in12 in14
3106*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     14,  1,  3 ; in1  in3
3107*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     15,  5,  7 ; in5  in7
3108*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     16,  9, 11 ; in9  in11
3109*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     17, 13, 15 ; in13 in15
; Zero the first 256 bytes of the coefficient buffer (the rows just loaded).
3110*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
3111*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
; Unsigned compare: eob below 107 takes the reduced path.
3112*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 107
3113*c0909341SAndroid Build Coastguard Worker    jb .fast
; Full path: also load 16-byte coefficient rows 16-31.
3114*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      4, 16, 18 ; in16 in18
3115*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      5, 20, 22 ; in20 in22
3116*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      6, 24, 26 ; in24 in26
3117*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      7, 28, 30 ; in28 in30
3118*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main
3119*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     18, 19, 17 ; in19 in17
3120*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     19, 23, 21 ; in23 in21
3121*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     20, 27, 25 ; in27 in25
3122*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     21, 31, 29 ; in31 in29
; m8 was clobbered by LOAD_PACKED_16X2, so it is re-zeroed before clearing
; coefficient bytes 256-511.
3123*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
3124*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
3125*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
3126*c0909341SAndroid Build Coastguard Worker    jmp .pass2
3127*c0909341SAndroid Build Coastguard Worker.fast: ; bottom half is zero
; m4 is still all-zero from the pxor above; copying its low 256 bits clears
; m5-m7 (a ymm-sized write also zeroes the upper half of the zmm register).
3128*c0909341SAndroid Build Coastguard Worker    mova                ym5, ym4
3129*c0909341SAndroid Build Coastguard Worker    mova                ym6, ym4
3130*c0909341SAndroid Build Coastguard Worker    mova                ym7, ym4
3131*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main
3132*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
3133*c0909341SAndroid Build Coastguard Worker.pass2:
; m10 = first-pass scale factor. With pmulhrsw, a multiplier of 8192 is a
; rounding shift right by 2 (assumes pw_8192 holds 8192 in every word).
; m15 is used as the vpermt2q index; it is presumably left in place by the
; routines called above - not visible here, confirm.
3134*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_8192)]
3135*c0909341SAndroid Build Coastguard Worker    vpermt2q             m0, m15, m4       ; t0   t1   t9   t8
3136*c0909341SAndroid Build Coastguard Worker    vpermt2q            m20, m15, m18      ; t31  t30a t23a t22
3137*c0909341SAndroid Build Coastguard Worker    vpermt2q             m3, m15, m7       ; t7   t6   t14  t15
3138*c0909341SAndroid Build Coastguard Worker    vpermt2q            m12, m15, m21      ; t25  t24a t17a t16
3139*c0909341SAndroid Build Coastguard Worker    vpermt2q             m2, m15, m6       ; t4   t5   t13  t12
3140*c0909341SAndroid Build Coastguard Worker    vpermt2q            m14, m15, m13      ; t23a t22  t21a t20
3141*c0909341SAndroid Build Coastguard Worker    vpermt2q             m1, m15, m5       ; t3   t2   t10  t11
3142*c0909341SAndroid Build Coastguard Worker    vpermt2q            m19, m15, m16      ; t27  t26a t19a t18
; Final add/sub stage of the first pass (saturating).
3143*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m0, m20       ; out31 out30 out22 out23
3144*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m20           ; out0  out1  out9  out8
3145*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m3, m12       ; out7  out6  out14 out15
3146*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m12           ; out24 out25 out17 out16
3147*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2, m14       ; out27 out26 out18 out19
3148*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m2, m14       ; out4  out5  out13 out12
3149*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m1, m19       ; out28 out29 out21 out20
3150*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m1, m19       ; out3  out2  out10 out11
; NOTE(review): vzeroupper is issued mid-function, after the last use of
; m16-m21. This is presumably safe for m0-m15 because of x86inc's AVX-512
; register renumbering (low-numbered m registers mapped away from zmm0-15) -
; confirm against x86inc.asm.
3151*c0909341SAndroid Build Coastguard Worker    vzeroupper
; Regroup 128-bit lanes so each register holds the outputs listed on its line.
3152*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m0, m3, q1221 ; out1  out9  out17 out25
3153*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q0330     ; out0  out8  out16 out24
3154*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m2, m5, q0330 ; out3  out11 out19 out27
3155*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m5, q1221     ; out2  out10 out18 out26
3156*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m4, m7, q1221 ; out5  out13 out21 out29
3157*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m7, q0330     ; out4  out12 out20 out28
3158*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m6, m8, q0330 ; out7  out15 out23 out31
3159*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m8, q1221     ; out6  out14 out22 out30
; Apply the first-pass scale, transpose, then run the 8-point pass (.main below).
3160*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
3161*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
3162*c0909341SAndroid Build Coastguard Worker    call .main
; Final scale: pmulhrsw by 2048 is a rounding shift right by 4 (assumes
; pw_2048 holds 2048 in every word).
3163*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2048)]
3164*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
; r2 = stride*3 and r3 = dst + stride*4 (rows 4-7). r2/r3 are presumably the
; registers behind cq/eob, which are not referenced again on this path.
3165*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
3166*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*4]
; m12 = qword permutation for the vpermq after packing (built from the permD
; table, whose contents are defined outside this view).
3167*c0909341SAndroid Build Coastguard Worker    movshdup            m12, [o(permD)]
; Rows 0-3: load 32 pixels per row, zero-extended to words, and add the residual.
3168*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m8, [dstq+strideq*0]
3169*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m9, [dstq+strideq*1]
3170*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, [dstq+strideq*2]
3171*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m11, [dstq+r2       ]
3172*c0909341SAndroid Build Coastguard Worker    paddw                m0, m8
3173*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
3174*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
3175*c0909341SAndroid Build Coastguard Worker    paddw                m3, m11
; Rows 4-7.
3176*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m8, [r3+strideq*0]
3177*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m9, [r3+strideq*1]
3178*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, [r3+strideq*2]
3179*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m11, [r3+r2       ]
3180*c0909341SAndroid Build Coastguard Worker    paddw                m4, m8
3181*c0909341SAndroid Build Coastguard Worker    paddw                m5, m9
3182*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10
3183*c0909341SAndroid Build Coastguard Worker    paddw                m7, m11
; Pack two rows per register with unsigned saturation; vpermq then reorders
; the qwords so that each 256-bit half is one complete row, as stored below.
3184*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3185*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
3186*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m12, m0
3187*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m12, m2
3188*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
3189*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
3190*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym2
3191*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r2       ], m2, 1
3192*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3193*c0909341SAndroid Build Coastguard Worker    packuswb             m6, m7
3194*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m12, m4
3195*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m12, m6
3196*c0909341SAndroid Build Coastguard Worker    mova          [r3+strideq*0], ym4
3197*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r3+strideq*1], m4, 1
3198*c0909341SAndroid Build Coastguard Worker    mova          [r3+strideq*2], ym6
3199*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r3+r2       ], m6, 1
3200*c0909341SAndroid Build Coastguard Worker    RET
; DC-only path (eob == 0 when entered from the top of this function).
; r6d = sign-extended first coefficient. eobd is zero here, so storing it
; clears that coefficient, and OR-ing 8 into r3d (eobd) sets the row count to 8.
3201*c0909341SAndroid Build Coastguard Worker.dconly:
3202*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
3203*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
3204*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8
; Entry with r6d = DC value and r3d = row count: r6d = (r6d*181 + 640) >> 10.
3205*c0909341SAndroid Build Coastguard Worker.dconly2:
3206*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
3207*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+512
3208*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+2
; Entry that skips the first scaling step: r6d = (r6d*181 + 2176) >> 12, then
; the low word of the result is broadcast to every word of m3 (m2 = zero).
3209*c0909341SAndroid Build Coastguard Worker.dconly3:
3210*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
3211*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+2048
3212*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+4
3213*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
3214*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, r6d
; Each iteration handles two rows of 32 pixels: widen bytes to words, add the
; DC term, pack back with unsigned saturation, store. r3d counts remaining
; rows; the loop ends once it is no longer positive.
3215*c0909341SAndroid Build Coastguard Worker.dconly_loop:
3216*c0909341SAndroid Build Coastguard Worker    mova                ym1, [dstq+strideq*0]
3217*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [dstq+strideq*1], 1
3218*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
3219*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
3220*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
3221*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
3222*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3223*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
3224*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
3225*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
3226*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 2
3227*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
3228*c0909341SAndroid Build Coastguard Worker    RET
; .main: 8-point inverse DCT stage operating on m0-m7 in place; on return
; m0..m7 hold out0..out7 (see the per-line comments). Uses m8-m12 as
; temporaries/constants and r5 through o().
;   .main  loads the rounding constant pd_2048 into m10 first.
;   .main2 skips that load, so m10 must already be set by the caller.
;   .main3 skips the first rotation stage; it passes m10-m12 to
;          ITX_MULSUB_2W, so presumably the caller must have set them up.
; ITX_MULSUB_2W is defined outside this view.
3229*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3230*c0909341SAndroid Build Coastguard Workercglobal_label .main
3231*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       m10, [o(pd_2048)]
3232*c0909341SAndroid Build Coastguard Worker.main2:
3233*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
3234*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        1, 7, 8, 9, 10,  799, 4017 ; t4a, t7a
3235*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
3236*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       m11, [o(pw_2896_2896)]
3237*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       m12, [o(pw_m2896_2896)]
3238*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        0, 4, 8, 9, 10, 11, 12 ; t1, t0
3239*c0909341SAndroid Build Coastguard Worker.main3:
3240*c0909341SAndroid Build Coastguard Worker    paddsw              m8, m1, m5 ; t4
3241*c0909341SAndroid Build Coastguard Worker    psubsw              m1, m5     ; t5a
3242*c0909341SAndroid Build Coastguard Worker    paddsw              m9, m7, m3 ; t7
3243*c0909341SAndroid Build Coastguard Worker    psubsw              m7, m3     ; t6a
3244*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        7, 1, 3, 5, 10, 11, 12 ; t5, t6
3245*c0909341SAndroid Build Coastguard Worker    psubsw              m5, m0, m2 ; dct4 out2
3246*c0909341SAndroid Build Coastguard Worker    paddsw              m2, m0     ; dct4 out1
3247*c0909341SAndroid Build Coastguard Worker    paddsw              m0, m4, m6 ; dct4 out0
3248*c0909341SAndroid Build Coastguard Worker    psubsw              m4, m6     ; dct4 out3
3249*c0909341SAndroid Build Coastguard Worker    psubsw              m6, m2, m1 ; out6
3250*c0909341SAndroid Build Coastguard Worker    paddsw              m1, m2     ; out1
3251*c0909341SAndroid Build Coastguard Worker    paddsw              m2, m5, m7 ; out2
3252*c0909341SAndroid Build Coastguard Worker    psubsw              m5, m7     ; out5
3253*c0909341SAndroid Build Coastguard Worker    psubsw              m7, m0, m9 ; out7
3254*c0909341SAndroid Build Coastguard Worker    paddsw              m0, m9     ; out0
3255*c0909341SAndroid Build Coastguard Worker    paddsw              m3, m4, m8 ; out3
3256*c0909341SAndroid Build Coastguard Worker    psubsw              m4, m8     ; out4
3257*c0909341SAndroid Build Coastguard Worker    ret
3258*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_8x32_8bpc(dst, stride, c)
; Identity/identity transform-and-add for a block 8 pixels wide and 32 rows
; tall. The 512 bytes of coefficients at cq get pw_5 added (saturating), are
; transposed and arithmetically shifted right by 3, then are added to the dst
; pixels and packed back with unsigned saturation. The coefficient buffer is
; cleared to zero.
; The destination is accessed with qword gathers/scatters: each zmm collects
; the 8-byte rows at offsets 4*stride*index from one of four base pointers
; (index values come from pd_0to15, presumably 0,1,2,... - confirm):
;   r0 = dst           -> rows 0, 4, ..., 28
;   r3 = dst+stride    -> rows 1, 5, ..., 29
;   r4 = dst+stride*2  -> rows 2, 6, ..., 30
;   r1 = dst+stride*3  -> rows 3, 7, ..., 31   (r1/strideq is overwritten)
; Gather and scatter instructions clear their mask register as they complete,
; so the all-ones mask is kept alive by copying it back and forth between k1
; and k2 before each use.
3259*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
3260*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [pw_5]
3261*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m7, [cq+64*0]
3262*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m7, [cq+64*1]
; ym9 = stride in every dword; it must be captured before r1 is modified below.
3263*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        ym9, strided
3264*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m7, [cq+64*2]
3265*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m7, [cq+64*3]
3266*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m7, [cq+64*4]
3267*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m7, [cq+64*5]
3268*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m7, [cq+64*6]
3269*c0909341SAndroid Build Coastguard Worker    paddsw               m7,     [cq+64*7]
; ym14 = per-element gather/scatter index: stride * pd_0to15[i]. The memory
; operands below scale it by 4.
3270*c0909341SAndroid Build Coastguard Worker    pmulld             ym14, ym9, [pd_0to15]
3271*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*1]
3272*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*2]
; k1 = all eight mask bits set; m13 = zero (used for unpacking and for
; clearing the coefficients).
3273*c0909341SAndroid Build Coastguard Worker    kxnorb               k1, k1, k1
3274*c0909341SAndroid Build Coastguard Worker    pxor                m13, m13
3275*c0909341SAndroid Build Coastguard Worker    add                  r1, r4 ; dstq+strideq*3
3276*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
3277*c0909341SAndroid Build Coastguard Worker    vpgatherdq       m9{k1}, [r0+ym14*4]
3278*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
3279*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m10{k2}, [r3+ym14*4]
3280*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
; The transpose and shift of the coefficients are interleaved with the
; remaining two gathers.
3281*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
3282*c0909341SAndroid Build Coastguard Worker    REPX       {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
3283*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m11{k1}, [r4+ym14*4]
3284*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
3285*c0909341SAndroid Build Coastguard Worker    vpgatherdq      m12{k2}, [r1+ym14*4]
; Clear all 512 bytes of coefficients.
3286*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
; For each of the four row groups: widen pixels to words, add the residual,
; pack with unsigned saturation, and scatter back to the same addresses.
3287*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m9, m13  ;  0  8 16 24
3288*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m13      ;  4 12 20 28
3289*c0909341SAndroid Build Coastguard Worker    paddw                m0, m8
3290*c0909341SAndroid Build Coastguard Worker    paddw                m4, m9
3291*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m4
3292*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
3293*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r0+ym14*4]{k1}, m0
3294*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m10, m13 ;  1  9 17 25
3295*c0909341SAndroid Build Coastguard Worker    punpckhbw           m10, m13      ;  5 13 21 29
3296*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
3297*c0909341SAndroid Build Coastguard Worker    paddw                m5, m10
3298*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m5
3299*c0909341SAndroid Build Coastguard Worker    kmovb                k1, k2
3300*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r3+ym14*4]{k2}, m1
3301*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m11, m13 ;  2 10 18 26
3302*c0909341SAndroid Build Coastguard Worker    punpckhbw           m11, m13      ;  6 14 22 30
3303*c0909341SAndroid Build Coastguard Worker    paddw                m2, m8
3304*c0909341SAndroid Build Coastguard Worker    paddw                m6, m11
3305*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m6
3306*c0909341SAndroid Build Coastguard Worker    kmovb                k2, k1
3307*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r4+ym14*4]{k1}, m2
3308*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m12, m13 ;  3 11 19 27
3309*c0909341SAndroid Build Coastguard Worker    punpckhbw           m12, m13      ;  7 15 23 31
3310*c0909341SAndroid Build Coastguard Worker    paddw                m3, m8
3311*c0909341SAndroid Build Coastguard Worker    paddw                m7, m12
3312*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m7
3313*c0909341SAndroid Build Coastguard Worker    vpscatterdq [r1+ym14*4]{k2}, m3
3314*c0909341SAndroid Build Coastguard Worker    RET
3315*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_32x8_8bpc(dst, stride, c)
; Identity/identity transform-and-add for a block 32 pixels wide and 8 rows
; tall. The 512 bytes of coefficients at cq are scaled with pmulhrsw by
; pw_4096 (a rounding shift right by 3, assuming the constant holds 4096 in
; every word), rearranged into row order, added to the dst pixels and packed
; back with unsigned saturation. The coefficient buffer is cleared to zero.
; Each zmm of pixels holds two 32-byte rows:
;   m9 = rows 0|2, m10 = rows 1|3, m11 = rows 4|6, m12 = rows 5|7
; with r3 = stride*3 and r4 = dst + stride*4.
3316*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
3317*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [pw_4096]
; Scale the eight 64-byte coefficient chunks. The destination registers are
; chosen so that chunks n and n+4 can be interleaved below.
3318*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m0, [cq+64*0]
3319*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m0, [cq+64*4]
3320*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m0, [cq+64*1]
3321*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m0, [cq+64*5]
3322*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m0, [cq+64*2]
3323*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m0, [cq+64*6]
3324*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m0, [cq+64*3]
3325*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0,     [cq+64*7]
; m13 = byte permutation for the vpermb step (table defined outside this view).
3326*c0909341SAndroid Build Coastguard Worker    mova                m13, [int8_permA]
3327*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3328*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*4]
; Interleave dwords of chunk pairs (0,4), (1,5), (2,6), (3,7).
3329*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3, m4
3330*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
3331*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m6, m5
3332*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m5
3333*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m7, m2
3334*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m2
3335*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m8, m0
3336*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m0
; Load the eight destination rows, two per register.
3337*c0909341SAndroid Build Coastguard Worker    mova                ym9, [dstq+strideq*0]
3338*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, [dstq+strideq*2], 1
3339*c0909341SAndroid Build Coastguard Worker    mova               ym10, [dstq+strideq*1]
3340*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m10, [dstq+r3       ], 1
3341*c0909341SAndroid Build Coastguard Worker    mova               ym11, [r4+strideq*0]
3342*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m11, [r4+strideq*2], 1
3343*c0909341SAndroid Build Coastguard Worker    mova               ym12, [r4+strideq*1]
3344*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m12, [r4+r3       ], 1
3345*c0909341SAndroid Build Coastguard Worker    REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
; The permutation is no longer needed: m13 becomes the zero register used to
; clear the coefficients and to widen the pixels.
3346*c0909341SAndroid Build Coastguard Worker    pxor                m13, m13
3347*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
3348*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m4 ; a0 a2   c0 c2
3349*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m4     ; b0 b2   d0 d2
3350*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m2 ; a1 a3   c1 c3
3351*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m2     ; b1 b3   d1 d3
3352*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m6 ; e0 e2   g0 g2
3353*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m6     ; f0 f2   h0 h2
3354*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7, m8 ; e1 e3   g1 g3
3355*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m8     ; f1 f3   h1 h3
; Rows 0 and 2: widen pixels to words, add, pack with unsigned saturation, store.
3356*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m9, m13
3357*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m13
3358*c0909341SAndroid Build Coastguard Worker    paddw                m0, m8
3359*c0909341SAndroid Build Coastguard Worker    paddw                m4, m9
3360*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m4
3361*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
3362*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*2], m0, 1
; Rows 1 and 3.
3363*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m10, m13
3364*c0909341SAndroid Build Coastguard Worker    punpckhbw           m10, m13
3365*c0909341SAndroid Build Coastguard Worker    paddw                m1, m8
3366*c0909341SAndroid Build Coastguard Worker    paddw                m5, m10
3367*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m5
3368*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*1], ym1
3369*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r3       ], m1, 1
; Rows 4 and 6.
3370*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m11, m13
3371*c0909341SAndroid Build Coastguard Worker    punpckhbw           m11, m13
3372*c0909341SAndroid Build Coastguard Worker    paddw                m2, m8
3373*c0909341SAndroid Build Coastguard Worker    paddw                m6, m11
3374*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m6
3375*c0909341SAndroid Build Coastguard Worker    mova          [r4+strideq*0], ym2
3376*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r4+strideq*2], m2, 1
; Rows 5 and 7.
3377*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m12, m13
3378*c0909341SAndroid Build Coastguard Worker    punpckhbw           m12, m13
3379*c0909341SAndroid Build Coastguard Worker    paddw                m3, m8
3380*c0909341SAndroid Build Coastguard Worker    paddw                m7, m12
3381*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m7
3382*c0909341SAndroid Build Coastguard Worker    mova          [r4+strideq*1], ym3
3383*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r4+r3       ], m3, 1
3384*c0909341SAndroid Build Coastguard Worker    RET
3385*c0909341SAndroid Build Coastguard Worker
; IDCT_16x32_END src1, src2, row
; Output step for four destination rows of 16 pixels each: scales m<src1> and
; m<src2> by m10 (pmulhrsw), adds them to the pixels loaded from dstq, packs
; with unsigned saturation and stores the four rows. It also clears the two
; 64-byte coefficient chunks cq+64*(row*2) and cq+64*(row*2+1), and advances
; dstq by four rows unless src1 is 20 (presumably the final invocation).
; Expects the caller to have set up (none of this is established inside the
; macro - confirm against the call sites):
;   r3  = stride*3 (presumably)
;   m10 = pmulhrsw scale factor
;   m11 = vpermb byte permutation applied to the loaded pixels (presumably
;         expanding them to words in the order of the src registers)
;   m12 = vpermd dword permutation applied before the stores
;   m13 = value written to the coefficient buffer (presumably zero)
; Clobbers m8 and m9.
3386*c0909341SAndroid Build Coastguard Worker%macro IDCT_16x32_END 3 ; src[1-2], row
; m8 = rows 0|1, m9 = rows 3|2 (note the order), 16 bytes per row.
3387*c0909341SAndroid Build Coastguard Worker    mova                xm8, [dstq+strideq*0]
3388*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym8, [dstq+strideq*1], 1
3389*c0909341SAndroid Build Coastguard Worker    mova                xm9, [dstq+r3       ]
3390*c0909341SAndroid Build Coastguard Worker    vinserti32x4        ym9, [dstq+strideq*2], 1
3391*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m10
3392*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m10
3393*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m11, m8
3394*c0909341SAndroid Build Coastguard Worker    vpermb               m9, m11, m9
3395*c0909341SAndroid Build Coastguard Worker    mova   [cq+64*(%3*2+0)], m13
3396*c0909341SAndroid Build Coastguard Worker    mova   [cq+64*(%3*2+1)], m13
3397*c0909341SAndroid Build Coastguard Worker    paddw                m8, m%1
3398*c0909341SAndroid Build Coastguard Worker    paddw                m9, m%2
3399*c0909341SAndroid Build Coastguard Worker    packuswb             m8, m9
; After vpermd, the four 128-bit lanes of m8 are rows 0, 1, 2, 3 in order.
3400*c0909341SAndroid Build Coastguard Worker    vpermd               m8, m12, m8
3401*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], xm8
3402*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*1], ym8, 1
3403*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+strideq*2], m8, 2
3404*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r3       ], m8, 3
3405*c0909341SAndroid Build Coastguard Worker%if %1 != 20
3406*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3407*c0909341SAndroid Build Coastguard Worker%endif
3408*c0909341SAndroid Build Coastguard Worker%endmacro
3409*c0909341SAndroid Build Coastguard Worker
3410*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
3411*c0909341SAndroid Build Coastguard Worker%undef cmp
3412*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
3413*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3414*c0909341SAndroid Build Coastguard Worker    jz .dconly
3415*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2896x8)]
3416*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
3417*c0909341SAndroid Build Coastguard Worker    jb .fast
3418*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m15, [cq+64*10]
3419*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m15, [cq+64* 6]
3420*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15, [cq+64* 2]
3421*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m15, [cq+64*14]
3422*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m15, [cq+64* 4]
3423*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m15, [cq+64*12]
3424*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15, [cq+64* 0]
3425*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m15, [cq+64* 8]
3426*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
3427*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m15, [cq+64* 1]
3428*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m15, [cq+64*15]
3429*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m15, [cq+64* 9]
3430*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m15, [cq+64* 7]
3431*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m15, [cq+64* 5]
3432*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m15, [cq+64*11]
3433*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m15, [cq+64*13]
3434*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15,      [cq+64* 3]
3435*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
3436*c0909341SAndroid Build Coastguard Worker    mova                 m8, [o(idct_16x32p)]
3437*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_16384)]
3438*c0909341SAndroid Build Coastguard Worker    REPX {vpermb x, m8, x}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3439*c0909341SAndroid Build Coastguard Worker                            m14, m15, m16, m17, m18, m19, m20, m21
3440*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m0, m1
3441*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m1
3442*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2, m3
3443*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m3
3444*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2
3445*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m4, m5
3446*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m5
3447*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m6, m7
3448*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m7
3449*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m3, m4, m5, m6
3450*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m14, m15
3451*c0909341SAndroid Build Coastguard Worker    punpckhdq           m14, m15
3452*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m16, m17
3453*c0909341SAndroid Build Coastguard Worker    punpckhdq           m16, m17
3454*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m7, m14, m15, m16
3455*c0909341SAndroid Build Coastguard Worker    punpckldq           m17, m18, m19
3456*c0909341SAndroid Build Coastguard Worker    punpckhdq           m18, m19
3457*c0909341SAndroid Build Coastguard Worker    punpckldq           m19, m20, m21
3458*c0909341SAndroid Build Coastguard Worker    punpckhdq           m20, m21
3459*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m17, m18, m19, m20
3460*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m21, m8, m1
3461*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m8, m1
3462*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m0, m2
3463*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m2
3464*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m5
3465*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m5
3466*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m4, m6
3467*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m6
3468*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7, m15
3469*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m15
3470*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m14, m16
3471*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m16
3472*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m17, m19
3473*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m19
3474*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m19, m18, m20
3475*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m18, m20
3476*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, m21, ym2, 1
3477*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m2, q3232
3478*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, m8, ym3, 1
3479*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m3, q3232
3480*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, m1, ym5, 1
3481*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m5, q3232
3482*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, m0, ym4, 1
3483*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m4, q3232
3484*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, m6, ym16, 1
3485*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m16, q3232
3486*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, m7, ym17, 1
3487*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m17, q3232
3488*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, m15, ym19, 1
3489*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m19, q3232
3490*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, m14, ym18, 1
3491*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m18, q3232
3492*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m21, m6, q3131 ; 27  5
3493*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m6, q2020      ; 31  1
3494*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m8, m7, q2020  ; 24  8
3495*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m7, q3131      ; 30  2
3496*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m1, m15, q2020 ; 28  4
3497*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m15, q3131     ;  6 26
3498*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m0, m14, q2020 ;  7 25
3499*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m14, q3131     ; 14 18
3500*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m20, m4, q2020 ;  3 29
3501*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m4, q3131      ; 23  9
3502*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m9, m3, m17, q2020 ; 16  0
3503*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m17, q3131     ; 12 20
3504*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m5, m19, q2020 ; 15 17
3505*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m19, q3131     ; 22 10
3506*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m2, m16, q2020 ; 19 13
3507*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m2, m16, q3131 ; 11 21
3508*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main3
3509*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf
3510*c0909341SAndroid Build Coastguard Worker    jmp .pass2
; .fast (inv_txfm_add_dct_dct_16x32_8bpc): reduced path for the case noted on
; the label line. Only the low 32 bytes of each 64-byte chunk at cq are read,
; two chunks are packed into each zmm register, and the "fast" variants of the
; shared helpers are called. Execution falls through into .pass2.
; NOTE(review): the branch that selects this path lies before this view.
3511*c0909341SAndroid Build Coastguard Worker.fast: ; right half is zero
    ; Each register: low 256 bits from one chunk (mova ymN), high 256 bits from
    ; another chunk (vinserti32x8 ..., 1).
3512*c0909341SAndroid Build Coastguard Worker    mova                ym8, [cq+64*15]
3513*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m8, [cq+64* 1], 1
3514*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(int16_perm)]
3515*c0909341SAndroid Build Coastguard Worker    mova                ym9, [cq+64* 8]
3516*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, [cq+64* 0], 1
3517*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64* 7]
3518*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [cq+64* 9], 1
3519*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+64*14]
3520*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m7, [cq+64* 2], 1
3521*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+64* 3]
3522*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [cq+64*13], 1
3523*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64* 6]
3524*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [cq+64*10], 1
3525*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*11]
3526*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [cq+64* 5], 1
3527*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+64*12]
3528*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, [cq+64* 4], 1
    ; m15 is a multiplier loaded before this view.
    ; NOTE(review): presumably pw_2896x8, as used by the 32x16 entry point
    ; further down in this file -- confirm.
3529*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
    ; Byte-permute every input register with the int16_perm table held in m2.
3530*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
3531*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main2
    ; Reorder the helper's results in m0-m7 with the int_shuf3/int_shuf4 byte
    ; shuffles (alternating registers) and scale them with pmulhrsw by pw_16384.
    ; NOTE(review): if the constant matches its name this is a rounded halving,
    ; (x + 1) >> 1 -- confirm against the constant table.
3532*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [o(int_shuf3)]
3533*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(int_shuf4)]
3534*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_16384)]
3535*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
3536*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9
3537*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m8
3538*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9
3539*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
3540*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m8
3541*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m9
3542*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m8
3543*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m9
3544*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
    ; Transpose: dword interleave, then 256-bit and 128-bit lane shuffles. The
    ; existing trailing comments give the indices that end up in each register.
3545*c0909341SAndroid Build Coastguard Worker    punpckhdq           m17, m0, m1
3546*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1
3547*c0909341SAndroid Build Coastguard Worker    punpckhdq           m16, m2, m3
3548*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3
3549*c0909341SAndroid Build Coastguard Worker    punpckhdq           m18, m4, m5
3550*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5
3551*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m6, m7
3552*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7
3553*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m0, ym2, 1
3554*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m0, m2, q3232
3555*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, m4, ym6, 1
3556*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m6, q3232
3557*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m15, m17, ym16, 1
3558*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m16, q3232
3559*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, m18, ym5, 1
3560*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m5, q3232
3561*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m1, m2, q2020   ;  0  2
3562*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m2, q3131       ;  4  6
3563*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m3, m4, q2020   ;  8 10
3564*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m4, q3131       ; 12 14
3565*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m15, m16, q2020 ;  1  3
3566*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m16, q3131      ;  5  7
3567*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m17, m18, q2020 ;  9 11
3568*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m18, q3131      ; 13 15
    ; Expand words for the next helper: each input is either duplicated
    ; (punpck?wd x, x) or interleaved with the zero register m6. m6 is both the
    ; zero source and the final destination, so it is written last.
3569*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
3570*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m0
3571*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m6, m0
3572*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3, m3
3573*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m2, m2
3574*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m1, m1
3575*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m1
3576*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3
3577*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2
3578*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast5
    ; Duplicate the words of m14-m17 into the eight input registers that
    ; .main_oddhalf_fast reads (m21, m14, m18, m15, m20, m16, m19, m17).
3579*c0909341SAndroid Build Coastguard Worker    punpcklwd           m21, m14, m14
3580*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m14
3581*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m15, m15
3582*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m15
3583*c0909341SAndroid Build Coastguard Worker    punpcklwd           m20, m16, m16
3584*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m16
3585*c0909341SAndroid Build Coastguard Worker    punpcklwd           m19, m17, m17
3586*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m17
3587*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf_fast
; .pass2 (inv_txfm_add_dct_dct_16x32_8bpc): output stage reached by both the
; full path (jmp .pass2) and the .fast path (fall-through). Sets up m10-m13 and
; r3, invokes IDCT_16x32_END once per register pair, then returns.
; NOTE(review): IDCT_16x32_END is defined outside this view; how it consumes
; m10-m13, r3 and its third argument cannot be seen here -- confirm against the
; macro definition.
3588*c0909341SAndroid Build Coastguard Worker.pass2:
3589*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_2048)] ; m10 = pw_2048 broadcast
3590*c0909341SAndroid Build Coastguard Worker    mova                m11, [o(end_16x32p)] ; m11 = end_16x32p table
3591*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3] ; r3 = 3*stride
3592*c0909341SAndroid Build Coastguard Worker    pxor                m13, m13 ; m13 = 0
3593*c0909341SAndroid Build Coastguard Worker    psrld               m12, m11, 8 ; m12 = m11 with every dword shifted right by 8 bits
    ; Arguments: two register numbers followed by a running index 0..7. The
    ; register pairs are the sixteen registers .main_oddhalf's trailing comments
    ; label out0..out31 (m0-m7, m14-m21).
3594*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END        0,  1,  0
3595*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END        2,  3,  1
3596*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END        4,  5,  2
3597*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END        6,  7,  3
3598*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END       14, 15,  4
3599*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END       16, 17,  5
3600*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END       18, 19,  6
3601*c0909341SAndroid Build Coastguard Worker    IDCT_16x32_END       20, 21,  7
3602*c0909341SAndroid Build Coastguard Worker    RET
3603*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .dconly (inv_txfm_add_dct_dct_16x32_8bpc): reads the first 16-bit coefficient,
; overwrites its slot, and tail-jumps into the .dconly handler of the 16x8
; transform, which does the rest of the work.
; NOTE(review): the branch that reaches this label lies before this view.
3604*c0909341SAndroid Build Coastguard Worker.dconly:
3605*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq] ; r6d = sign-extended first coefficient
3606*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; NOTE(review): presumably eobd is 0 here (cf. the test/jz pair in the 32x16 entry point), so this clears the coefficient -- confirm
3607*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32 ; NOTE(review): presumably hands the row count (32) to the shared handler -- confirm
3608*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
3609*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_oddhalf_fast2: entry into the odd-half computation when only four of
; the eight inputs are non-zero. Inputs are read from m21, m15, m18 and m14;
; each is scaled by one broadcast constant with pmulhrsw. The products are then
; copied into m8, m17, m15 and m16 -- exactly the values the .main2 butterflies
; (a - 0 / a + 0) would have produced -- and control continues at .main3.
; Overwrites m8, m9, m16, m20 with constants before the inputs are consumed.
; NOTE(review): the "x8" constants are presumably pre-scaled so that pmulhrsw's
; rounding shift matches the scale of the ITX_MUL2X_PACK path -- confirm
; against the constant table.
3610*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
3611*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_201_4091x8)]
3612*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m20, [o(pw_m1380_3857x8)]
3613*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_995_3973x8)]
3614*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [o(pw_m601_4052x8)]
3615*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m8  ; t16a, t31a
    ; Here the constant sits in the destination (m20) and the input in m15, so
    ; the product lands in m20, where .main3 expects this term.
3616*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m15 ; t19a, t28a
3617*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m9  ; t20a, t27a
3618*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m16 ; t23a, t24a
    ; Stand-ins for the .main2 differences when the subtracted partner is zero.
3619*c0909341SAndroid Build Coastguard Worker    mova                 m8, m21
3620*c0909341SAndroid Build Coastguard Worker    mova                m17, m20
3621*c0909341SAndroid Build Coastguard Worker    mova                m15, m18
3622*c0909341SAndroid Build Coastguard Worker    mova                m16, m14
3623*c0909341SAndroid Build Coastguard Worker    jmp .main3
3624*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_oddhalf_fast: entry into the odd-half computation when each of the
; eight inputs (m21, m17, m20, m15, m18, m16, m19, m14) needs only a single
; multiply. Every input is scaled in place by one broadcast constant with
; pmulhrsw, then control continues at .main2.
; m8, m9, m11 and m12 carry the constants; each is reloaded with its second
; constant as soon as its first multiply has been issued.
; NOTE(review): the "x8" constants are presumably pre-scaled so that pmulhrsw's
; rounding shift matches the scale of the ITX_MUL2X_PACK path -- confirm
; against the constant table.
3625*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; bottom half is zero
3626*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_201_4091x8)]
3627*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m2751_3035x8)]
3628*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1751_3703x8)]
3629*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m1380_3857x8)]
3630*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m8  ; t16a, t31a
3631*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_995_3973x8)]
3632*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m9  ; t17a, t30a
3633*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m2106_3513x8)]
3634*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m11 ; t18a, t29a
3635*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2440_3290x8)]
3636*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m12 ; t19a, t28a
3637*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m601_4052x8)]
3638*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m8  ; t20a, t27a
3639*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m9  ; t21a, t26a
3640*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m11 ; t22a, t25a
3641*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m12 ; t23a, t24a
3642*c0909341SAndroid Build Coastguard Worker    jmp .main2
3643*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_oddhalf: full odd-half computation, also the common tail of
; .main_oddhalf_fast (enters at .main2) and .main_oddhalf_fast2 (enters at
; .main3). Called as a subroutine; returns with ret.
;   in:      m14-m21  odd-half inputs (terms named by the trailing comments)
;            m0-m7    values combined with the odd half in the last stage
;            m10      seed added before the >>12 in the vpdpwssd sequences
;   out:     m0-m7, m14-m21 = out0..out31, paired as the final comments show
;   scratch: m8, m9, m11, m12
; NOTE(review): m0-m7 are presumably the even-half results left by the
; idct_16x16 helper each caller invokes first, and m10 presumably the 12-bit
; rounding bias (2048 per dword) -- neither is set in this view; confirm.
; NOTE(review): ITX_MUL2X_PACK is defined outside this view; its argument
; order and mode numbers (5, 20, 36, 8) are not documented here.
3644*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf
3645*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       21, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
3646*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
3647*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
3648*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
3649*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       18, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
3650*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
3651*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
3652*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
    ; First butterfly stage: saturating sum/difference of neighbouring terms.
    ; The differences go to m8, m17, m15, m16; the sums stay in m21, m20, m18,
    ; m14. m19 is free afterwards and serves as a macro temporary in .main3.
3653*c0909341SAndroid Build Coastguard Worker.main2:
3654*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m21, m17 ; t17 t30
3655*c0909341SAndroid Build Coastguard Worker    paddsw              m21, m17      ; t16 t31
3656*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m15, m20 ; t18 t29
3657*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m15      ; t19 t28
3658*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m18, m16 ; t21 t26
3659*c0909341SAndroid Build Coastguard Worker    paddsw              m18, m16      ; t20 t27
3660*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m14, m19 ; t22 t25
3661*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m19      ; t23 t24
    ; Second stage: rotate the four difference terms, then butterfly again.
3662*c0909341SAndroid Build Coastguard Worker.main3:
3663*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 9, 19, 10,   799, 4017, 5 ; t17a t30a
3664*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 9, 19, 10, m4017,  799, 5 ; t18a t29a
3665*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15, 9, 19, 10,  3406, 2276, 5 ; t21a t26a
3666*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
3667*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m3784_1567)]
3668*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m21, m20 ; t19a t28a
3669*c0909341SAndroid Build Coastguard Worker    paddsw              m21, m20      ; t16a t31a
3670*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m14, m18 ; t20a t27a
3671*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m18      ; t23a t24a
3672*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m8, m17  ; t18  t29
3673*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m17      ; t17  t30
3674*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m16, m15 ; t21  t26
3675*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m16      ; t22  t25
    ; Third stage rotations. The bare "11" argument below refers to register
    ; m11, loaded above with pw_m3784_1567.
    ; NOTE(review): inferred from the matching vpbroadcastd -- confirm against
    ; the macro definition.
3676*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       18, 9, 16, 10, 1567_3784, 11,   20 ; t18a t29a
3677*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19, 9, 16, 10, 1567_3784, 11,   20 ; t19  t28
3678*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       20, 9, 16, 10, 11, m1567_m3784, 36 ; t20  t27
3679*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
3680*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(deint_shuf)]
3681*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m21, m14 ; t23  t24
3682*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m21      ; t16  t31
3683*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m8, m15  ; t22a t25a
3684*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m8       ; t17a t30a
3685*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m18, m17 ; t21  t26
3686*c0909341SAndroid Build Coastguard Worker    paddsw              m18, m17      ; t18  t29
3687*c0909341SAndroid Build Coastguard Worker    paddsw              m17, m19, m20 ; t19a t28a
3688*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m20      ; t20a t27a
3689*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m2896_2896)]
3690*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2896_2896)]
    ; The four finished sum terms only need the deint_shuf byte shuffle.
3691*c0909341SAndroid Build Coastguard Worker    REPX     {pshufb x, m9}, m14, m15, m18, m17
    ; Hand-expanded rotations by the 2896 pair: accumulator = m10 + dot product
    ; of word pairs (vpdpwssd), arithmetic shift right by 12, then pack the
    ; dwords back to saturated words. m11 (pw_m2896_2896) gives the first pair
    ; of results, m12 (pw_2896_2896) the second.
3692*c0909341SAndroid Build Coastguard Worker    mova                 m9, m10
3693*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m16, m11
3694*c0909341SAndroid Build Coastguard Worker    mova                m20, m10
3695*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m21, m11
3696*c0909341SAndroid Build Coastguard Worker    psrad                m9, 12
3697*c0909341SAndroid Build Coastguard Worker    psrad               m20, 12
3698*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m20      ; t23a t22
3699*c0909341SAndroid Build Coastguard Worker    mova                m20, m10
3700*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m16, m12
3701*c0909341SAndroid Build Coastguard Worker    mova                m16, m10
3702*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m21, m12
3703*c0909341SAndroid Build Coastguard Worker    psrad               m20, 12
3704*c0909341SAndroid Build Coastguard Worker    psrad               m16, 12
3705*c0909341SAndroid Build Coastguard Worker    packssdw            m16, m20, m16 ; t24a t25
    ; NOTE(review): with mode 8 the macro appears to leave its two results as
    ; dwords in the temporary registers (m21/m20, then m8/m11), which the
    ; packssdw pair below merges -- confirm against the macro definition.
3706*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 21, 20, 10, 11, 12, 8 ; t21a t26a
3707*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19,  8, 11, 10, 11, 12, 8 ; t20  t27
3708*c0909341SAndroid Build Coastguard Worker    packssdw            m11, m20      ; t27  t26a
3709*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m21      ; t20  t21a
    ; Regroup the shuffled sum terms by qword so every register holds the pair
    ; of terms named in its trailing comment.
3710*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m20, m14, m15 ; t16  t17a
3711*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m15      ; t31  t30a
3712*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m17, m18 ; t28a t29
3713*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m17, m18      ; t19a t18
    ; Final stage: saturating add/subtract of each odd-half pair against
    ; m0-m7. Sums stay in m0-m7; differences go to m14-m21.
3714*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m0, m14  ; out31 out30
3715*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m14      ; out0  out1
3716*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m7, m20  ; out16 out17
3717*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m20      ; out15 out14
3718*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m1, m15  ; out28 out29
3719*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m15      ; out3  out2
3720*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m6, m17  ; out19 out18
3721*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m17      ; out12 out13
3722*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m4, m9   ; out23 out22
3723*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m9       ; out8  out9
3724*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m3, m16  ; out24 out25
3725*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m16      ; out7  out6
3726*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m5, m8   ; out20 out21
3727*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m8       ; out11 out10
3728*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m2, m11  ; out27 out26
3729*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m11      ; out4  out5
3730*c0909341SAndroid Build Coastguard Worker    ret
3731*c0909341SAndroid Build Coastguard Worker
3732*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
3733*c0909341SAndroid Build Coastguard Worker%undef cmp
3734*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
3735*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3736*c0909341SAndroid Build Coastguard Worker    jz .dconly
3737*c0909341SAndroid Build Coastguard Worker    mova                m21, [o(permB)]
3738*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m21, [cq+64* 0] ;  0  1
3739*c0909341SAndroid Build Coastguard Worker    vpermq              m14, m21, [cq+64* 1] ;  2  3
3740*c0909341SAndroid Build Coastguard Worker    vpermq              m20, m21, [cq+64* 2] ;  4  5
3741*c0909341SAndroid Build Coastguard Worker    vpermq              m15, m21, [cq+64* 3] ;  6  7
3742*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896x8)]
3743*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m21, [cq+64* 4] ;  8  9
3744*c0909341SAndroid Build Coastguard Worker    vpermq              m16, m21, [cq+64* 5] ; 10 11
3745*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m21, [cq+64* 6] ; 12 13
3746*c0909341SAndroid Build Coastguard Worker    vpermq              m17, m21, [cq+64* 7] ; 14 15
3747*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
3748*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
3749*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7
3750*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
3751*c0909341SAndroid Build Coastguard Worker    jb .fast
3752*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m21, [cq+64* 8] ; 16 17
3753*c0909341SAndroid Build Coastguard Worker    vpermq              m19, m21, [cq+64* 9] ; 18 19
3754*c0909341SAndroid Build Coastguard Worker    vpermq               m4, m21, [cq+64*10] ; 20 21
3755*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m21, [cq+64*11] ; 22 23
3756*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m21, [cq+64*12] ; 24 25
3757*c0909341SAndroid Build Coastguard Worker    vpermq              m18, m21, [cq+64*13] ; 26 27
3758*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m21, [cq+64*14] ; 28 29
3759*c0909341SAndroid Build Coastguard Worker    vpermq              m21, m21, [cq+64*15] ; 30 31
3760*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
3761*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
3762*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m21, m14 ; 30  2
3763*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m1       ; 31  1
3764*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m17, m19 ; 14 18
3765*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m9       ; 15 17
3766*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m1       ; 16  0
3767*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m7       ;  3 29
3768*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m15, m18 ;  6 26
3769*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m6       ;  7 25
3770*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2       ; 24  8
3771*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m3       ; 19 13
3772*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4       ; 12 20
3773*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m20      ; 27  5
3774*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m20      ; 28  4
3775*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m5, m2   ; 23  9
3776*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m16      ; 22 10
3777*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m4       ; 11 21
3778*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main2
3779*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
3780*c0909341SAndroid Build Coastguard Worker    jmp .pass2
3781*c0909341SAndroid Build Coastguard Worker.fast: ; bottom half zero
3782*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m14, m14 ;  2
3783*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m17, m17 ; 14
3784*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m16, m16 ; 10
3785*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m12, m1  ; __  0
3786*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m1, m1   ;  1
3787*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m15, m15 ;  6
3788*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m20, m20 ;  4
3789*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m3, m3   ; 13
3790*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3       ; 12
3791*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m12, m2  ; __  8
3792*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m20, m20 ;  5
3793*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m2, m2   ;  9
3794*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast
3795*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m15      ;  7
3796*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m14      ;  3
3797*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m16      ; 11
3798*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m17      ; 15
3799*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
3800*c0909341SAndroid Build Coastguard Worker.pass2:
3801*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_16384)]
3802*c0909341SAndroid Build Coastguard Worker    call .transpose_round
3803*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m14, m2, q3131 ;  5
3804*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m2, q2020      ;  1
3805*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m3, q3131  ;  4
3806*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q2020      ;  0
3807*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m18, q3131 ;  6
3808*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m18, q2020     ;  2
3809*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m20, m6, q2020 ;  9
3810*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m6, q3131      ; 13
3811*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m21, m4, q3131 ; 12
3812*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m21, m4, q2020 ;  8
3813*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m19, m7, q3131 ; 15
3814*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m7, q2020      ; 11
3815*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m5, m15, q3131 ; 14
3816*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m15, q2020     ; 10
3817*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m17, m9, q2020 ;  3
3818*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m9, q3131      ;  7
3819*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
3820*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf
3821*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2048)]
3822*c0909341SAndroid Build Coastguard Worker    movshdup            m13, [o(permD)]
3823*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
3824*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m8, [dstq+strideq*0]
3825*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m9, [dstq+strideq*1]
3826*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, [dstq+strideq*2]
3827*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m11, [dstq+r2       ]
3828*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3
3829*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*4]
3830*c0909341SAndroid Build Coastguard Worker    paddw                m0, m8
3831*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
3832*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
3833*c0909341SAndroid Build Coastguard Worker    paddw                m3, m11
3834*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m8, [r3+strideq*0]
3835*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m9, [r3+strideq*1]
3836*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, [r3+strideq*2]
3837*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m11, [r3+r2       ]
3838*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m4, m5, m6, m7
3839*c0909341SAndroid Build Coastguard Worker    lea                  r4, [dstq+strideq*8]
3840*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
3841*c0909341SAndroid Build Coastguard Worker    paddw                m4, m8
3842*c0909341SAndroid Build Coastguard Worker    paddw                m5, m9
3843*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
3844*c0909341SAndroid Build Coastguard Worker    paddw                m6, m10
3845*c0909341SAndroid Build Coastguard Worker    paddw                m7, m11
3846*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m8, [r4+strideq*0]
3847*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m9, [r4+strideq*1]
3848*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, [r4+strideq*2]
3849*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m11, [r4+r2       ]
3850*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m14, m15, m16, m17
3851*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r3+strideq*8]
3852*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
3853*c0909341SAndroid Build Coastguard Worker    paddw               m14, m8
3854*c0909341SAndroid Build Coastguard Worker    paddw               m15, m9
3855*c0909341SAndroid Build Coastguard Worker    packuswb             m6, m7
3856*c0909341SAndroid Build Coastguard Worker    paddw               m16, m10
3857*c0909341SAndroid Build Coastguard Worker    paddw               m17, m11
3858*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m8, [r5+strideq*0]
3859*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m9, [r5+strideq*1]
3860*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, [r5+strideq*2]
3861*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m11, [r5+r2       ]
3862*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m18, m19, m20, m21
3863*c0909341SAndroid Build Coastguard Worker    packuswb            m14, m15
3864*c0909341SAndroid Build Coastguard Worker    paddw               m18, m8
3865*c0909341SAndroid Build Coastguard Worker    paddw               m19, m9
3866*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17
3867*c0909341SAndroid Build Coastguard Worker    paddw               m20, m10
3868*c0909341SAndroid Build Coastguard Worker    paddw               m21, m11
3869*c0909341SAndroid Build Coastguard Worker    packuswb            m18, m19
3870*c0909341SAndroid Build Coastguard Worker    packuswb            m20, m21
3871*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
3872*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym0
3873*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*1], m0, 1
3874*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*2], ym2
3875*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+r2       ], m2, 1
3876*c0909341SAndroid Build Coastguard Worker    mova          [r3+strideq*0], ym4
3877*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r3+strideq*1], m4, 1
3878*c0909341SAndroid Build Coastguard Worker    mova          [r3+strideq*2], ym6
3879*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r3+r2       ], m6, 1
3880*c0909341SAndroid Build Coastguard Worker    mova          [r4+strideq*0], ym14
3881*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r4+strideq*1], m14, 1
3882*c0909341SAndroid Build Coastguard Worker    mova          [r4+strideq*2], ym16
3883*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r4+r2       ], m16, 1
3884*c0909341SAndroid Build Coastguard Worker    mova          [r5+strideq*0], ym18
3885*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r5+strideq*1], m18, 1
3886*c0909341SAndroid Build Coastguard Worker    mova          [r5+strideq*2], ym20
3887*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r5+r2       ], m20, 1
3888*c0909341SAndroid Build Coastguard Worker    RET
3889*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3890*c0909341SAndroid Build Coastguard Worker.dconly:
    ; DC-only path: scales the first coefficient twice by 181/256 (~1/sqrt(2))
    ; with rounding, then tail-jumps into the DC code of the 32x8 function.
3891*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq] ; r6d = first coefficient, sign-extended from 16 bits
3892*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 32-bit store over the first coefficients (presumably eob == 0 here, i.e. clears them - confirm at the branch site)
3893*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16 ; NOTE(review): looks like the row count consumed by .dconly3, assuming r3d is 0 on entry - confirm
3894*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
3895*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
3896*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8 ; r6d = (dc*181 + 128) >> 8
3897*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
3898*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+256
3899*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+1 ; r6d = (r6d*181 + 384) >> 9
3900*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 ; shared code, defined outside this chunk
3901*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3902*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
    ; Reduced variant for the case named above: only m0 and m14 are read as
    ; inputs. m10 seeds the dword accumulators and is passed to
    ; ITX_MULSUB_2W; it is not loaded here (presumably pd_2048, set by the
    ; caller - confirm). Per the out* comments, results are out0-out7 in
    ; m0-m7 and out8-out15 in m14-m21.
3903*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896x8)]
3904*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_4076x8)]
3905*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_401x8)]
3906*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m0  ; t0
3907*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m14 ; t15a
3908*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m14 ; t8a
3909*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m3, m4 ; interleaved (t8a, t15a) word pairs, low halves
3910*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3, m4 ; same, high halves
    ; Each accumulator starts as a copy of m10, then vpdpwssd adds the
    ; signed word-pair dot product with the broadcast constant pair.
3911*c0909341SAndroid Build Coastguard Worker    mova                 m2, m10
3912*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m2, m9, [o(pw_m3784_1567)] {bcstd}
3913*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
3914*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m5, [o(pw_m3784_1567)] {bcstd}
3915*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
3916*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m5, [o(pw_1567_3784)] {bcstd}
3917*c0909341SAndroid Build Coastguard Worker    mova                 m5, m10
3918*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m5, m9, [o(pw_1567_3784)] {bcstd}
3919*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2896_2896)]
3920*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m2896_2896)]
3921*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m8, m4 ; out15
3922*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m8, m4 ; out0
3923*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m8, m3 ; out8
3924*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m8, m3 ; out7
3925*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m2, m1, m6, m5 ; shift all four dword accumulators right by 12
3926*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1     ; t9a
3927*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6     ; t14a
3928*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         4, 3, 16, 17, 10, 11, 12 ; t11,  t12
3929*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m8, m5 ; out14
3930*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m8, m5 ; out1
3931*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m8, m2 ; out9
3932*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m8, m2 ; out6
3933*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         5, 2, 16, 17, 10, 11, 12 ; t10a, t13a
3934*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m8, m3 ; out12
3935*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8     ; out3
3936*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m8, m4 ; out11
3937*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m8     ; out4
3938*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m8, m2 ; out13
3939*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m8     ; out2
3940*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m8, m5 ; out10
3941*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m8     ; out5
3942*c0909341SAndroid Build Coastguard Worker    ret
3943*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    ; Reduced variant for the case named above: reads m0, m1, m14 and m15
    ; only. Builds the idct8 outputs in m0-m7, places the odd-half terms in
    ; the registers .main3 reads, and finishes by jumping to .main3.
    ; m10 is passed to ITX_MULSUB_2W and is not loaded here (presumably
    ; pd_2048, set by the caller - confirm).
3944*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2896x8)]
3945*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_4017x8)]
3946*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_799x8)]
3947*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [o(pw_4076x8)]
3948*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [o(pw_401x8)]
3949*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m20, [o(pw_m1189x8)]
3950*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [o(pw_3920x8)]
3951*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m0  ; t0
3952*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m1  ; t7a
3953*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3  ; t4a
3954*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m14 ; t15a
3955*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m19 ; t8a
3956*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m15 ; t11a
3957*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m16 ; t12a
3958*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m9, m2 ; idct8 out7
3959*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m9, m2 ; idct8 out0
3960*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m9, m1 ; idct8 out4
3961*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m9, m1 ; idct8 out3
3962*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
    ; The four copies below give the same register contents the .main2
    ; add/sub pairs would produce if t9a, t10a, t13a and t14a were zero.
3963*c0909341SAndroid Build Coastguard Worker    mova                m21, m18 ; t15 = t15a (m18 keeps it as t14)
3964*c0909341SAndroid Build Coastguard Worker    mova                m19, m14 ; t9  = t8a  (m14 keeps it as t8)
3965*c0909341SAndroid Build Coastguard Worker    mova                m16, m15 ; t12 = t12a (m15 keeps it as t13)
3966*c0909341SAndroid Build Coastguard Worker    mova                 m8, m20 ; t11 = t11a (m20 keeps it as t10)
3967*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m9, m1 ; idct8 out6
3968*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m9     ; idct8 out1
3969*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m9, m2 ; idct8 out5
3970*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m9     ; idct8 out2
3971*c0909341SAndroid Build Coastguard Worker    jmp .main3 ; continue with the shared rotations and output butterflies
3972*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3973*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; bottom half is zero
    ; Reduced variant for the case named above. The first part scales
    ; m0-m3 into the t0..t7a terms and calls .main3 of the 32x8 function
    ; (defined outside this chunk; presumably it finishes the even part and
    ; leaves its results in m0-m7 - confirm). The second part scales
    ; m14-m17 into t8a..t15a and continues at .main2.
3974*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_m2276x8)]
3975*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_3406x8)]
3976*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_4017x8)]
3977*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_799x8)]
3978*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_3784x8)]
3979*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_1567x8)]
3980*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2896x8)]
3981*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m3  ; t5a
3982*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m11 ; t6a
3983*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m1  ; t7a
3984*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12 ; t4a
3985*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m2  ; t3
3986*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m10 ; t2
3987*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m0  ; t0
    ; m10-m12 were used as temporary multipliers above; reload them with
    ; the constants set up for the call below.
3988*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2896_2896)]
3989*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m2896_2896)]
3990*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
3991*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4  ; t1
3992*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
    ; Odd-half inputs m14-m17: each is multiplied by two constants to give a
    ; pair of t*a terms. m8, m9, m11 and m12 are used as temporaries.
3993*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m21, [o(pw_4076x8)]
3994*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_401x8)]
3995*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [o(pw_m2598x8)]
3996*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_3166x8)]
3997*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [o(pw_3612x8)]
3998*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1931x8)]
3999*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m20, [o(pw_m1189x8)]
4000*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_3920x8)]
4001*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m14 ; t15a
4002*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m8  ; t8a
4003*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m17 ; t9a
4004*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m9  ; t14a
4005*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m16 ; t13a
4006*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m11 ; t10a
4007*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m15 ; t11a
4008*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m12 ; t12a
4009*c0909341SAndroid Build Coastguard Worker    jmp .main2 ; t8a..t15a are ready; skip the full ITX_MULSUB_2W stage
4010*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4011*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf
    ; Full odd half plus the final output butterflies.
    ; Reads the eight odd-half inputs from m14-m21 and combines the
    ; resulting t8..t15 terms with m0-m7. Per the out* comments, results
    ; are out0-out7 in m0-m7 and out8-out15 in m14-m21.
    ; m10 is passed to every ITX_MULSUB_2W and is not loaded here
    ; (presumably the pd_2048 rounding constant - confirm against the
    ; macro). m8, m9, m11 and m12 are overwritten.
4012*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        14, 21, 8, 9, 10,  401, 4076 ; t8a,  t15a
4013*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        18, 17, 8, 9, 10, 3166, 2598 ; t9a,  t14a
4014*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
4015*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
    ; Entry point used by .main_oddhalf_fast, which has already produced
    ; t8a..t15a in m14, m18, m16, m20, m15, m19, m17, m21.
4016*c0909341SAndroid Build Coastguard Worker.main2:
4017*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m20, m16 ; t11
4018*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m16      ; t10
4019*c0909341SAndroid Build Coastguard Worker    paddsw              m16, m15, m19 ; t12
4020*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m19      ; t13
4021*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m14, m18 ; t9
4022*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m18      ; t8
4023*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m21, m17 ; t14
4024*c0909341SAndroid Build Coastguard Worker    paddsw              m21, m17      ; t15
    ; Entry point used by .main_oddhalf_fast2. Expects t8..t15 in
    ; m14, m19, m20, m8, m16, m15, m18, m21 respectively.
4025*c0909341SAndroid Build Coastguard Worker.main3:
4026*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1567_3784)]
4027*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m3784_1567)]
4028*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        18, 19, 9, 17, 10, 11, 12 ; t9a,  t14a
4029*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m1567_m3784)]
4030*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
4031*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2896_2896)]
4032*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m2896_2896)]
4033*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m14, m8  ; t11a
4034*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m14      ; t8a
4035*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m18, m15 ; t9
4036*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m15      ; t10
4037*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m19, m20 ; t13
4038*c0909341SAndroid Build Coastguard Worker    paddsw              m19, m20      ; t14
4039*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m21, m16 ; t15a
4040*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m21, m16 ; t12a
4041*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
4042*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        16, 17, 9, 21, 10, 11, 12 ; t11,  t12
    ; Final stage: out[n] = m[n] + t[15-n] and out[15-n] = m[n] - t[15-n],
    ; with saturation, for n = 0..7.
4043*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m0, m20 ; out15
4044*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m20     ; out0
4045*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m1, m19 ; out14
4046*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m19     ; out1
4047*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m2, m18 ; out13
4048*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m18     ; out2
4049*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m3, m17 ; out12
4050*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m17     ; out3
4051*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m4, m16 ; out11
4052*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m16     ; out4
4053*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m5, m15 ; out10
4054*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m15     ; out5
4055*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m6, m14 ; out9
4056*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m14     ; out6
4057*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m7, m8  ; out8
4058*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m8      ; out7
4059*c0909341SAndroid Build Coastguard Worker    ret
4060*c0909341SAndroid Build Coastguard Worker.transpose_round:
    ; Word-interleaves the 16 registers m0-m7 and m14-m21 in three
    ; punpck{l,h}wd passes, multiplying all 16 intermediates by m9 with
    ; pmulhrsw along the way (m9 must hold the rounding factor on entry),
    ; then recombines 256-bit halves of register pairs.
    ; m9 is overwritten with data after its last use as a multiplier.
    ; The resulting layout is given by the comments on the final shuffles.
    ; Pass 1: interleave rows two apart (0/2, 1/3, 4/6, 5/7, ...).
4061*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m0, m2
4062*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
4063*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m3
4064*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3
4065*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m6
4066*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6
4067*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m5, m7
4068*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m7
4069*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m14, m16
4070*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m16
4071*c0909341SAndroid Build Coastguard Worker    punpcklwd           m16, m15, m17
4072*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m17
4073*c0909341SAndroid Build Coastguard Worker    punpcklwd           m17, m19, m21
4074*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m21
4075*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m18, m20
4076*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m20
    ; Pass 2, with the pmulhrsw-by-m9 scaling interleaved between shuffles.
4077*c0909341SAndroid Build Coastguard Worker    punpcklwd           m20, m8, m1
4078*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m1
4079*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0, m2
4080*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
4081*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m5
4082*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m5
4083*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4, m6
4084*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6
4085*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m20, m8, m1, m0
4086*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m15
4087*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m15
4088*c0909341SAndroid Build Coastguard Worker    punpcklwd           m15, m14, m16
4089*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m16
4090*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m2, m3, m5, m4
4091*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m18, m19
4092*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m19
4093*c0909341SAndroid Build Coastguard Worker    punpcklwd           m19, m21, m17
4094*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m17
4095*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m6, m7, m15, m14
    ; Pass 3: a/b/c/d name the four register groups in the comments.
4096*c0909341SAndroid Build Coastguard Worker    punpcklwd           m17, m8, m0         ; a2   a6   aa   ae
4097*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0             ; a3   a7   ab   af
4098*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m20, m1        ; a0   a4   a8   ac
4099*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m1             ; a1   a5   a9   ad
4100*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m16, m18, m19, m21
4101*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m5         ; b0   b4   b8   bc
4102*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m5             ; b1   b5   b9   bd
4103*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m3, m4         ; b2   b6   ba   be
4104*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4             ; b3   b7   bb   bf
4105*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m6, m15        ; c0   c4   c8   cc
4106*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m15            ; c1   c5   c9   cd
4107*c0909341SAndroid Build Coastguard Worker    punpcklwd           m15, m7, m14        ; c2   c6   ca   ce
4108*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m14            ; c3   c7   cb   cf
4109*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m18, m19       ; d0   d4   d8   dc
4110*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m19            ; d1   d5   d9   dd
4111*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m16, m21       ; d2   d6   da   de
4112*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m21            ; d3   d7   db   df
    ; Pair a with b and c with d: vinserti32x8 joins the two low 256-bit
    ; halves, vshufi32x4 q3232 joins the two high 256-bit halves.
4113*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m0, m1, q3232  ; a8   ac   b8   bc
4114*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym1, 1         ; a0   a4   b0   b4
4115*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m17, ym5, 1    ; a2   a6   b2   b6
4116*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m17, m5, q3232 ; aa   ae   ba   be
4117*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, m8, ym3, 1     ; a3   a7   b3   b7
4118*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m8, m3, q3232  ; ab   af   bb   bf
4119*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, m4, ym14, 1    ; c0   c4   d0   d4
4120*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m14, q3232     ; c8   cc   d8   dc
4121*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m14, m20, ym2, 1    ; a1   a5   b1   b5
4122*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m2, q3232      ; a9   ad   b9   bd
4123*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, m6, ym18, 1    ; c1   c5   d1   d5
4124*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m18, q3232     ; c9   cd   d9   dd
4125*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, m15, ym9, 1    ; c2   c6   d2   d6
4126*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m9, q3232      ; ca   ce   da   de
4127*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, m7, ym16, 1    ; c3   c7   d3   d7
4128*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m16, q3232     ; cb   cf   db   df
4129*c0909341SAndroid Build Coastguard Worker    ret
4130*c0909341SAndroid Build Coastguard Worker
4131*c0909341SAndroid Build Coastguard Worker%macro IDTX_16x32 4 ; src/dst[1-4]
    ; For each of the four indices n = %1..%4:
    ;   m<n> = pmulhrsw(m15, [cq+64*n])
    ;   m<n> = paddsw(m<n>, pmulhrsw(pmulhrsw(m16, m<n>), m17))
    ; Expects m15, m16 and m17 to be preloaded by the caller; clobbers
    ; m18-m21. The user of this macro in this file loads pw_2896x8,
    ; pw_1697x16 and pw_16384 into them; going by those names this is
    ; presumably x/sqrt(2) followed by x + x*1697/4096 (~sqrt(2)*x) -
    ; confirm against the constant table.
4132*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m15, [cq+64*%1]
4133*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m15, [cq+64*%2]
4134*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%3, m15, [cq+64*%3]
4135*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%4, m15, [cq+64*%4]
4136*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m16, m%1
4137*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m16, m%2
4138*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m16, m%3
4139*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m16, m%4
4140*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m17}, m18, m19, m20, m21
4141*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m18
4142*c0909341SAndroid Build Coastguard Worker    paddsw              m%2, m19
4143*c0909341SAndroid Build Coastguard Worker    paddsw              m%3, m20
4144*c0909341SAndroid Build Coastguard Worker    paddsw              m%4, m21
4145*c0909341SAndroid Build Coastguard Worker%endmacro
4146*c0909341SAndroid Build Coastguard Worker
4147*c0909341SAndroid Build Coastguard Worker%macro IDTX_16x32_STORE 2 ; src[1-2]
    ; Adds the residual words in m%1 / m%2 to four 16-byte destination rows
    ; and stores the result back with unsigned saturation.
    ; Expects m18 == 0 (used both to zero-extend pixels and to clear
    ; coefficients) and r3 / r4 as row-offset bases; the user of this macro
    ; in this file sets r3 = stride*2 and r4 = stride*3, so the four rows
    ; are 0, 8, 16 and 24 lines below dstq. Clobbers m16 and m17.
    ; Gather the four rows into the four 128-bit lanes of m17.
4148*c0909341SAndroid Build Coastguard Worker    mova               xm17, [dstq+r3*0]
4149*c0909341SAndroid Build Coastguard Worker    vinserti128        ym17, [dstq+r3*4], 1
4150*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m17, [dstq+r3*8], 2
4151*c0909341SAndroid Build Coastguard Worker    vinserti32x4        m17, [dstq+r4*8], 3
    ; Write m18 over two 64-byte coefficient rows, selected by %1.
4152*c0909341SAndroid Build Coastguard Worker    mova   [cq+64*(%1*2+0)], m18
4153*c0909341SAndroid Build Coastguard Worker    mova   [cq+64*(%1*2+1)], m18
4154*c0909341SAndroid Build Coastguard Worker    punpcklbw           m16, m17, m18 ; low 8 pixels of each lane -> words
4155*c0909341SAndroid Build Coastguard Worker    punpckhbw           m17, m18 ; high 8 pixels of each lane -> words
4156*c0909341SAndroid Build Coastguard Worker    paddw               m16, m%1
4157*c0909341SAndroid Build Coastguard Worker    paddw               m17, m%2
4158*c0909341SAndroid Build Coastguard Worker    packuswb            m16, m17 ; back to bytes, saturated to 0..255
4159*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r3*0], xm16
4160*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+r3*4], ym16, 1
4161*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r3*8], m16, 2
4162*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [dstq+r4*8], m16, 3
    ; Advance to the next line, except after the last invocation (%1 == 7).
4163*c0909341SAndroid Build Coastguard Worker%if %1 != 7
4164*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4165*c0909341SAndroid Build Coastguard Worker%endif
4166*c0909341SAndroid Build Coastguard Worker%endmacro
4167*c0909341SAndroid Build Coastguard Worker
4168*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
    ; Identity/identity 16x32 inverse transform with add, 8 bpc.
    ; Named arguments: dst, stride, c (coefficient buffer, cleared as it is
    ; consumed by IDTX_16x32_STORE).
    ; Flow: scale 16 coefficient rows with IDTX_16x32, transpose and round
    ; via .transpose_2x8x8_round, then add to the destination and store.
4169*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [pw_2896x8]
4170*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [pw_1697x16]
4171*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [pw_16384]
    ; Load and scale [cq+64*0] .. [cq+64*15] into m0-m15. The last call
    ; overwrites m15, which is its own multiplier, only in its final load.
4172*c0909341SAndroid Build Coastguard Worker    IDTX_16x32            0,  1,  2,  3
4173*c0909341SAndroid Build Coastguard Worker    IDTX_16x32            4,  5,  6,  7
4174*c0909341SAndroid Build Coastguard Worker    IDTX_16x32            8,  9, 10, 11
4175*c0909341SAndroid Build Coastguard Worker    IDTX_16x32           12, 13, 14, 15
4176*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [pw_8192] ; pmulhrsw factor used by the transpose helper
4177*c0909341SAndroid Build Coastguard Worker    call .transpose_2x8x8_round ; helper defined right after this body; presumably leaves the transposed rows in m0-m15 - confirm
4178*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*2] ; IDTX_16x32_STORE scales r3 by 4 and 8
4179*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3] ; IDTX_16x32_STORE scales r4 by 8
4180*c0909341SAndroid Build Coastguard Worker    pxor                m18, m18 ; zero register required by IDTX_16x32_STORE
    ; Eight store steps; each pairs m<n> with m<n+8> and, except the last,
    ; advances dstq by one line.
4181*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      0,  8
4182*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      1,  9
4183*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      2, 10
4184*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      3, 11
4185*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      4, 12
4186*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      5, 13
4187*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      6, 14
4188*c0909341SAndroid Build Coastguard Worker    IDTX_16x32_STORE      7, 15
4189*c0909341SAndroid Build Coastguard Worker    RET
; .transpose_2x8x8_round
; Transposes two independent groups of eight registers, m0-m7 and m8-m15,
; treating each 128-bit lane as an 8x8 matrix of 16-bit words, and scales
; every element with pmulhrsw by m16 along the way.
;   In:    m0-m7, m8-m15 = rows; m16 = pmulhrsw multiplier
;   Out:   m0-m7, m8-m15 = transposed rows, in register order
;   Clobb: m17 (scratch)
; The multiply is placed between the dword and qword interleave stages; it is
; element-wise, so its position does not affect the result. The caller visible
; in this chunk loads pw_2048 into m16 (presumably the word value 2048, which
; would make the multiply a rounded shift right by 4 -- confirm against the
; constant's definition).
4190*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4191*c0909341SAndroid Build Coastguard Worker.transpose_2x8x8_round:
; --- first group (m0-m7): interleave words of adjacent row pairs ---
4192*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m4, m5
4193*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5
4194*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0, m1
4195*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
4196*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6, m7
4197*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7
4198*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m2, m3
4199*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
; --- interleave dwords: each register now holds four rows of two columns ---
4200*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2
4201*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
4202*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6
4203*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6
4204*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m5, m7
4205*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m7
4206*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m17, m1
4207*c0909341SAndroid Build Coastguard Worker    punpckhdq           m17, m1
; --- scale all eight intermediate registers by m16 ---
4208*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
; --- interleave qwords: m0-m7 end up holding columns 0-7 ---
4209*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
4210*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
4211*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
4212*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
4213*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m7
4214*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7
4215*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m6, m17
4216*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m17
; --- second group (m8-m15): same sequence with every register number + 8,
;     again using m17 as the scratch register ---
4217*c0909341SAndroid Build Coastguard Worker    punpckhwd           m17, m12, m13
4218*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13
4219*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m8, m9
4220*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9
4221*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m14, m15
4222*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15
4223*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m10, m11
4224*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11
4225*c0909341SAndroid Build Coastguard Worker    punpckhdq           m11, m8, m10
4226*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m10
4227*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m12, m14
4228*c0909341SAndroid Build Coastguard Worker    punpckhdq           m12, m14
4229*c0909341SAndroid Build Coastguard Worker    punpckhdq           m14, m13, m15
4230*c0909341SAndroid Build Coastguard Worker    punpckldq           m13, m15
4231*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m17, m9
4232*c0909341SAndroid Build Coastguard Worker    punpckhdq           m17, m9
4233*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
4234*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m9, m8, m10
4235*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m10
4236*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m10, m11, m12
4237*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m11, m12
4238*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m12, m13, m15
4239*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m13, m15
4240*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m14, m17
4241*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m17
4242*c0909341SAndroid Build Coastguard Worker    ret
4243*c0909341SAndroid Build Coastguard Worker
; IDTX_32x16 dst1, dst2, dst3, dst4
; Loads and scales four output registers m%1..m%4 from the coefficient buffer.
; %1 and %3 are used both as register numbers and as indices into cq
; (64-byte loads at cq+32*(%N) and cq+32*(%N+16)).
;   In:    m12 = multiplier applied to every loaded vector (pmulhrsw)
;          m14 = qword permutation indices for vpermi2q
;          m16 = qword permutation indices for vpermt2q
;          m17 = multiplier for the final scaling step
;   Out:   m%1..m%4, each = 2*y + pmulhrsw(y, m17) with saturating adds,
;          where y is the permuted, pre-scaled input
;   Clobb: m18-m21
; NOTE(review): the visible caller loads pw_2896x8 into m12 and pw_1697x16 into
; m17; their exact numeric meaning is defined elsewhere in the file.
4244*c0909341SAndroid Build Coastguard Worker%macro IDTX_32x16 4 ; dst[1-4]
; Pre-scale the four source vectors by m12, then double them (saturating).
4245*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m12, [cq+32*(%1+ 0)]
4246*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m12, [cq+32*(%1+16)]
4247*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%4, m12, [cq+32*(%3+ 0)]
4248*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m12, [cq+32*(%3+16)]
4249*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m%2, m18, m%4, m19
; vpermi2q overwrites its index operand, so m14 is copied into the destination
; first; vpermt2q keeps the index (m16) and overwrites the first table operand.
4250*c0909341SAndroid Build Coastguard Worker    mova                m%1, m14
4251*c0909341SAndroid Build Coastguard Worker    vpermi2q            m%1, m%2, m18
4252*c0909341SAndroid Build Coastguard Worker    vpermt2q            m%2, m16, m18
; When %3 is 14 the destination is m14 itself, which already holds the indices,
; so no copy is needed. m14 is consumed by that invocation, so it has to be the
; last one that relies on m14 as the index register.
4253*c0909341SAndroid Build Coastguard Worker%if %3 != 14
4254*c0909341SAndroid Build Coastguard Worker    mova                m%3, m14
4255*c0909341SAndroid Build Coastguard Worker%endif
4256*c0909341SAndroid Build Coastguard Worker    vpermi2q            m%3, m%4, m19
4257*c0909341SAndroid Build Coastguard Worker    vpermt2q            m%4, m16, m19
; Final scaling: out = 2*y + pmulhrsw(y, m17).
4258*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m17, m%1
4259*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m17, m%2
4260*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m17, m%3
4261*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m17, m%4
4262*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m%1, m%2, m%3, m%4
4263*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m18
4264*c0909341SAndroid Build Coastguard Worker    paddsw              m%2, m19
4265*c0909341SAndroid Build Coastguard Worker    paddsw              m%3, m20
4266*c0909341SAndroid Build Coastguard Worker    paddsw              m%4, m21
4267*c0909341SAndroid Build Coastguard Worker%endmacro
4268*c0909341SAndroid Build Coastguard Worker
; IDTX_32x16_STORE src1, src2 [, flag = 0]
; Adds the word residuals in m%1 / m%2 to two destination rows (the row at dstq
; and the row 8 strides below it), saturates to unsigned bytes and writes both
; rows back.
;   In:    m%1, m%2 = residuals for the low / high unpacked byte halves
;          m20      = must be zero (the caller visible in this chunk clears it
;                     with pxor before the first invocation)
;          %3       = optional flag, default 0; the original parameter comment
;                     labels it "32x32"
;   Out:   dstq advanced by one stride, except when %3 == 0 and %1 == 7
;   Clobb: m18, m19
; When %3 == 0 the macro also stores m20 (zero) to the two 64-byte slots
; cq+64*(%1*2) and cq+64*(%1*2+1), clearing that part of the coefficient buffer.
4269*c0909341SAndroid Build Coastguard Worker%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
; Gather row 0 into the low 256 bits and row 8 into the high 256 bits of m19.
4270*c0909341SAndroid Build Coastguard Worker    mova               ym19, [dstq+strideq*0]
4271*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, [dstq+strideq*8], 1
4272*c0909341SAndroid Build Coastguard Worker%if %3 == 0
4273*c0909341SAndroid Build Coastguard Worker    mova   [cq+64*(%1*2+0)], m20
4274*c0909341SAndroid Build Coastguard Worker    mova   [cq+64*(%1*2+1)], m20
4275*c0909341SAndroid Build Coastguard Worker%endif
; Interleaving with the zero register widens the pixel bytes to words.
4276*c0909341SAndroid Build Coastguard Worker    punpcklbw           m18, m19, m20
4277*c0909341SAndroid Build Coastguard Worker    punpckhbw           m19, m20
4278*c0909341SAndroid Build Coastguard Worker    paddw               m18, m%1
4279*c0909341SAndroid Build Coastguard Worker    paddw               m19, m%2
4280*c0909341SAndroid Build Coastguard Worker    packuswb            m18, m19
4281*c0909341SAndroid Build Coastguard Worker    mova          [dstq+strideq*0], ym18
4282*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+strideq*8], m18, 1
; Skip the pointer bump after the final row pair of the default (flag 0) case.
4283*c0909341SAndroid Build Coastguard Worker%if %3 || %1 != 7
4284*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4285*c0909341SAndroid Build Coastguard Worker%endif
4286*c0909341SAndroid Build Coastguard Worker%endmacro
4287*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_32x16_8bpc(dst, stride, c)
; Declared via cglobal with 3 arguments, 3 general-purpose registers and
; 22 vector registers (m0-m21).
; Flow: load the scaling constants and permutation indices, build m0-m15 with
; four IDTX_32x16 invocations, run the shared transpose/round helper that
; belongs to the 16x32 identity function, then add the result to the
; destination with eight IDTX_32x16_STORE invocations. Those stores also clear
; the coefficient buffer, because the flag argument is left at its default 0.
4288*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
4289*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [pw_2896x8]
; Unaligned load starting 7 bytes into permB; m14 is the qword permutation index.
4290*c0909341SAndroid Build Coastguard Worker    movu                m14, [permB+7]
4291*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [pw_1697x16]
; Second index set: every qword of m14 shifted right by 4 bits.
4292*c0909341SAndroid Build Coastguard Worker    psrlq               m16, m14, 4
4293*c0909341SAndroid Build Coastguard Worker    IDTX_32x16            0,  1,  2,  3
4294*c0909341SAndroid Build Coastguard Worker    IDTX_32x16            4,  5,  6,  7
4295*c0909341SAndroid Build Coastguard Worker    IDTX_32x16            8,  9, 10, 11
; Last invocation: its third destination is m14, which is consumed as the index.
4296*c0909341SAndroid Build Coastguard Worker    IDTX_32x16           12, 13, 14, 15
; m16 is no longer needed as a permutation index; reuse it as the multiplier
; expected by .transpose_2x8x8_round.
4297*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [pw_2048]
4298*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
; IDTX_32x16_STORE needs m20 == 0 for byte widening and coefficient clearing.
4299*c0909341SAndroid Build Coastguard Worker    pxor                m20, m20
4300*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      0,  8
4301*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      1,  9
4302*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      2, 10
4303*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      3, 11
4304*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      4, 12
4305*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      5, 13
4306*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      6, 14
4307*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      7, 15
4308*c0909341SAndroid Build Coastguard Worker    RET
4309*c0909341SAndroid Build Coastguard Worker
; IDCT_32x32_END src, mem, stride1, stride2
; Final butterfly and store step for one pair of output rows.
;   %1 = register number of the value that is added to / subtracted from the
;        other operand
;   %2 = index of the other operand: for %2 < 8 it is register m%2, otherwise
;        it is loaded from the coefficient buffer at cq+64*(%2*2-16)
;   %3 = offset added to dstq for the row receiving the sum
;   %4 = offset added to r3 for the row receiving the difference
;   In:    m12 = pmulhrsw multiplier applied to the sum and the difference
;          m13 = qword permutation indices for vpermq
;   Clobb: m8-m11, and m0 for %2 >= 8
; For %2 >= 8 the two 64-byte slots cq+64*(%2*2-16) and cq+64*(%2*2-15) are
; overwritten with m0. m0 is zeroed only when %2 == 8, so invocations with
; %2 > 8 rely on m0 still being zero from that earlier invocation.
; After %2 == 3, 7 and 11, dstq is advanced by r5 and r3 is moved back by r5.
4310*c0909341SAndroid Build Coastguard Worker%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
; Load the two destination rows, zero-extending bytes to words.
4311*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m10, [dstq+%3]
4312*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m11, [r3  +%4]
4313*c0909341SAndroid Build Coastguard Worker%if %2 < 8
4314*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m%2, m%1
4315*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m%2, m%1
4316*c0909341SAndroid Build Coastguard Worker%else
4317*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+64*(%2*2-16)]
4318*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m9, m%1
4319*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m%1
4320*c0909341SAndroid Build Coastguard Worker%endif
4321*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12
4322*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m12
4323*c0909341SAndroid Build Coastguard Worker%if %2 >= 8
4324*c0909341SAndroid Build Coastguard Worker%if %2 == 8
4325*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
4326*c0909341SAndroid Build Coastguard Worker%endif
4327*c0909341SAndroid Build Coastguard Worker    mova  [cq+64*(%2*2-16)], m0
4328*c0909341SAndroid Build Coastguard Worker    mova  [cq+64*(%2*2-15)], m0
4329*c0909341SAndroid Build Coastguard Worker%endif
; Add the residuals to the pixels, saturate to bytes, reorder qwords with m13,
; then write the low 256 bits to the dstq row and the high 256 bits to the r3 row.
4330*c0909341SAndroid Build Coastguard Worker    paddw                m8, m10
4331*c0909341SAndroid Build Coastguard Worker    paddw                m9, m11
4332*c0909341SAndroid Build Coastguard Worker    packuswb             m8, m9
4333*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m13, m8
4334*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%3], ym8
4335*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r3  +%4], m8, 1
4336*c0909341SAndroid Build Coastguard Worker%if %2 == 3 || %2 == 7 || %2 == 11
4337*c0909341SAndroid Build Coastguard Worker    add                dstq, r5
4338*c0909341SAndroid Build Coastguard Worker    sub                  r3, r5
4339*c0909341SAndroid Build Coastguard Worker%endif
4340*c0909341SAndroid Build Coastguard Worker%endmacro
4341*c0909341SAndroid Build Coastguard Worker
4342*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
4343*c0909341SAndroid Build Coastguard Worker%undef cmp
4344*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
4345*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4346*c0909341SAndroid Build Coastguard Worker    jz .dconly
4347*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      30
4348*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
4349*c0909341SAndroid Build Coastguard Worker    jb .fast
4350*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*20]
4351*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*12]
4352*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 4]
4353*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*28]
4354*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 8]
4355*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*24]
4356*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0]
4357*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*16]
4358*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4359*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64* 2]
4360*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*30]
4361*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64*18]
4362*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64*14]
4363*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64*10]
4364*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64*22]
4365*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64*26]
4366*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64* 6]
4367*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4368*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
4369*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m15
4370*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m16
4371*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m17
4372*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m18
4373*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m19
4374*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m20
4375*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m21
4376*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64* 1]
4377*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*31]
4378*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64*17]
4379*c0909341SAndroid Build Coastguard Worker    mova                m29, [cq+64*15]
4380*c0909341SAndroid Build Coastguard Worker    mova                m26, [cq+64* 9]
4381*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64*23]
4382*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64*25]
4383*c0909341SAndroid Build Coastguard Worker    mova                m25, [cq+64* 7]
4384*c0909341SAndroid Build Coastguard Worker    mova                m24, [cq+64* 5]
4385*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64*27]
4386*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64*21]
4387*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+64*11]
4388*c0909341SAndroid Build Coastguard Worker    mova                m28, [cq+64*13]
4389*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64*19]
4390*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64*29]
4391*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+64* 3]
4392*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf
4393*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_8192)]
4394*c0909341SAndroid Build Coastguard Worker    psubsw              m13, m0, m29 ; 31
4395*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m29     ;  0
4396*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m1, m28 ; 30
4397*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m28     ;  1
4398*c0909341SAndroid Build Coastguard Worker    psubsw              m28, m2, m27 ; 29
4399*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m27     ;  2
4400*c0909341SAndroid Build Coastguard Worker    psubsw              m27, m3, m26 ; 28
4401*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m26     ;  3
4402*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m4, m25 ; 27
4403*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m25     ;  4
4404*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m5, m24 ; 26
4405*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m24     ;  5
4406*c0909341SAndroid Build Coastguard Worker    psubsw              m24, m6, m23 ; 25
4407*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m23     ;  6
4408*c0909341SAndroid Build Coastguard Worker    psubsw              m23, m7, m22 ; 24
4409*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m22     ;  7
4410*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
4411*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
4412*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
4413*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
4414*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
4415*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
4416*c0909341SAndroid Build Coastguard Worker    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
4417*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
4418*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
4419*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
4420*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
4421*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m23, m24
4422*c0909341SAndroid Build Coastguard Worker    punpcklwd           m23, m24
4423*c0909341SAndroid Build Coastguard Worker    punpckhwd           m24, m25, m26
4424*c0909341SAndroid Build Coastguard Worker    punpcklwd           m25, m26
4425*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
4426*c0909341SAndroid Build Coastguard Worker    punpckhwd           m26, m27, m28
4427*c0909341SAndroid Build Coastguard Worker    punpcklwd           m27, m28
4428*c0909341SAndroid Build Coastguard Worker    punpckhwd           m28, m29, m13
4429*c0909341SAndroid Build Coastguard Worker    punpcklwd           m29, m13
4430*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
4431*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
4432*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
4433*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
4434*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
4435*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
4436*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
4437*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
4438*c0909341SAndroid Build Coastguard Worker    punpckldq           m22, m5      ; e4 f4 g4 h4 e5 f5 g5 h5
4439*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m4, m8, m22
4440*c0909341SAndroid Build Coastguard Worker    punpckhdq           m13, m23, m25
4441*c0909341SAndroid Build Coastguard Worker    punpckldq           m23, m25
4442*c0909341SAndroid Build Coastguard Worker    punpckhdq           m25, m27, m29
4443*c0909341SAndroid Build Coastguard Worker    punpckldq           m27, m29
4444*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m13, m23, m25, m27
4445*c0909341SAndroid Build Coastguard Worker    punpckhdq            m9, m3, m24
4446*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m24
4447*c0909341SAndroid Build Coastguard Worker    punpckhdq           m24, m26, m28
4448*c0909341SAndroid Build Coastguard Worker    punpckldq           m26, m28
4449*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m5, m23, m27 ; d00 d08 d16 d24
4450*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m27      ; d01 d09 d17 d25
4451*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m13, m25 ; d03 d11 d19 d27
4452*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m13, m25      ; d02 d10 d18 d26
4453*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m3, m26  ; d05 d13 d21 d29
4454*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m26      ; d04 d12 d20 d28
4455*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m9, m24  ; d07 d15 d23 d31
4456*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m9, m24      ; d06 d14 d22 d30
4457*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m25, m3, m26
4458*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m23
4459*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m27
4460*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m25
4461*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m26
4462*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
4463*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m22      ; a04 a12 a20 a28
4464*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
4465*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m4       ; a00 a08 a16 a24
4466*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
4467*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m2       ; a02 a10 a18 a26
4468*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
4469*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m1       ; a06 a14 a22 a30
4470*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 0]
4471*c0909341SAndroid Build Coastguard Worker    mova                m11, [cq+64* 2]
4472*c0909341SAndroid Build Coastguard Worker    mova                m12, [cq+64* 4]
4473*c0909341SAndroid Build Coastguard Worker    mova                m29, [cq+64* 6]
4474*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+64* 8]
4475*c0909341SAndroid Build Coastguard Worker    mova                m26, [cq+64*10]
4476*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*12]
4477*c0909341SAndroid Build Coastguard Worker    mova                m28, [cq+64*14]
4478*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2, m21  ; 23
4479*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m21      ;  8
4480*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m11, m20 ; 22
4481*c0909341SAndroid Build Coastguard Worker    paddsw              m11, m20      ;  9
4482*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m12, m19 ; 21
4483*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m19      ; 10
4484*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m29, m18 ; 20
4485*c0909341SAndroid Build Coastguard Worker    paddsw              m29, m18      ; 11
4486*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m27, m17 ; 19
4487*c0909341SAndroid Build Coastguard Worker    paddsw              m27, m17      ; 12
4488*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m26, m16 ; 18
4489*c0909341SAndroid Build Coastguard Worker    paddsw              m26, m16      ; 13
4490*c0909341SAndroid Build Coastguard Worker    paddsw              m16, m4, m15  ; 14
4491*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m15      ; 17
4492*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m6, m10
4493*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m28, m14 ; 16
4494*c0909341SAndroid Build Coastguard Worker    paddsw              m28, m14      ; 15
4495*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m7, m10
4496*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m6, m4
4497*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m4
4498*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m17, m18
4499*c0909341SAndroid Build Coastguard Worker    punpcklwd           m17, m18
4500*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m19, m20
4501*c0909341SAndroid Build Coastguard Worker    punpcklwd           m19, m20
4502*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m21, m1
4503*c0909341SAndroid Build Coastguard Worker    punpcklwd           m21, m1
4504*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
4505*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m11      ; i0 j0 i1 j1 i2 j2 i3 j3
4506*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
4507*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
4508*c0909341SAndroid Build Coastguard Worker    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
4509*c0909341SAndroid Build Coastguard Worker    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
4510*c0909341SAndroid Build Coastguard Worker    punpckhwd           m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
4511*c0909341SAndroid Build Coastguard Worker    punpcklwd           m16, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
4512*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m10
4513*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m10
4514*c0909341SAndroid Build Coastguard Worker    punpckhdq           m28, m2, m12  ; i2 j2 k2 l2 i3 j3 k3 l3
4515*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m12      ; i0 j0 k0 l0 i1 j1 k1 l1
4516*c0909341SAndroid Build Coastguard Worker    punpckhdq           m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
4517*c0909341SAndroid Build Coastguard Worker    punpckldq           m27, m16      ; m0 n0 o0 p0 m1 n1 o1 p1
4518*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m28, m2, m12, m27
4519*c0909341SAndroid Build Coastguard Worker    punpckhdq           m16, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
4520*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
4521*c0909341SAndroid Build Coastguard Worker    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
4522*c0909341SAndroid Build Coastguard Worker    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
4523*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m16, m1, m11, m29
4524*c0909341SAndroid Build Coastguard Worker    punpckhdq           m26, m19, m21
4525*c0909341SAndroid Build Coastguard Worker    punpckldq           m19, m21
4526*c0909341SAndroid Build Coastguard Worker    punpckhdq           m21, m6, m4
4527*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m4
4528*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m26, m19, m21, m6
4529*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m18, m20
4530*c0909341SAndroid Build Coastguard Worker    punpckldq           m18, m20
4531*c0909341SAndroid Build Coastguard Worker    punpckhdq           m20, m7, m17
4532*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m17
4533*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m4, m18, m20, m7
4534*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m17, m28, m12 ; b02 b10 b18 b26
4535*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m28, m12      ; b03 b11 b19 b27
4536*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m12, m2, m27  ; b01 b09 b17 b25
4537*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m27      ; b00 b08 b16 b24
4538*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m1, m29  ; b05 b13 b21 b29
4539*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m29      ; b04 b12 b20 b28
4540*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m29, m16, m11 ; b07 b15 b23 b31
4541*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m11      ; b06 b14 b22 b30
4542*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m12
4543*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m28
4544*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m27
4545*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m29
4546*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
4547*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m20, m26      ; c02 c10 c18 c26
4548*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m7, m19  ; c01 c09 c17 c25
4549*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m19      ; c00 c08 c16 c24
4550*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m28, m6, m18  ; c05 c13 c21 c29
4551*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m18      ; c04 c12 c20 c28
4552*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m29, m21, m4  ; c07 c15 c23 c31
4553*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m21, m4       ; c06 c14 c22 c30
4554*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m9, m10
4555*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m0, m2, q3232   ; a16 a24 b16 b24
4556*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym2, 1          ; a00 a08 b00 b08
4557*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m7, m5, q3232   ; c16 c24 d16 d24
4558*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m7, ym5, 1          ; c00 c08 d00 d08
4559*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m8, m1, q3232   ; a20 a28 b20 b28
4560*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m8, ym1, 1      ; a04 a12 b04 b12
4561*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m6, m3, q3232   ; c20 c28 d20 d28
4562*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, ym3, 1          ; c04 c12 d04 d12
4563*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m6, q3131   ; 12
4564*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m6, q2020       ;  4
4565*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m4, m2, q3131   ; 24
4566*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m2, q2020       ; 16
4567*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m7, q3131   ;  8
4568*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m7, q2020       ;  0
4569*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m5, m8, q3131   ; 28
4570*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m8, q2020       ; 20
4571*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4572*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m14, m17, q3232 ; a18 a26 b18 b26
4573*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m14, ym17, 1         ; a02 a10 b02 b10
4574*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m20, m13, q3232 ; c18 c26 d18 d26
4575*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, ym13, 1         ; c02 c10 d02 d10
4576*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m13, m21, m19, q3232 ; c22 c30 d22 d30
4577*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m21, ym19, 1         ; c06 c14 d06 d14
4578*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m15, m16, q3232 ; a22 a30 b22 b30
4579*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m15, ym16, 1         ; a06 a14 b06 b14
4580*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m14, m20, q3131 ; 10
4581*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m20, q2020      ;  2
4582*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m18, m17, q3131 ; 26
4583*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m17, q2020      ; 18
4584*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m15, m21, q3131 ; 14
4585*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m21, q2020      ;  6
4586*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m19, m13, q3131 ; 30
4587*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m13, q2020      ; 22
4588*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4589*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
4590*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m15
4591*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m16
4592*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m17
4593*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m18
4594*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m19
4595*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m20
4596*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m21
4597*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64* 1]
4598*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 3]
4599*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64* 5]
4600*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64* 7]
4601*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64* 9]
4602*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*11]
4603*c0909341SAndroid Build Coastguard Worker    mova                m13, [cq+64*13]
4604*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64*15]
4605*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m22, m15, q3232 ; a17 a25 b17 b25
4606*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, ym15, 1         ; a01 a09 b01 b09
4607*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m23, m16, q3232 ; a19 a27 b19 b27
4608*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m23, ym16, 1         ; a03 a11 b03 b11
4609*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m24, m17, q3232 ; a21 a29 b21 b29
4610*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, ym17, 1         ; a05 a13 b05 b13
4611*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m25, m19, q3232 ; a23 a31 b23 b31
4612*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m25, ym19, 1         ; a07 a15 b07 b15
4613*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m8, m26, ym20, 1    ; c01 c09 d01 d09
4614*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m20, q3232      ; c17 c25 d17 d25
4615*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, m27, ym21, 1    ; c03 c11 d03 d11
4616*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m21, q3232      ; c19 c27 d19 d27
4617*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m11, m28, ym13, 1    ; c05 c13 d05 d13
4618*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m13, q3232      ; c21 c29 d21 d29
4619*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m12, m29, ym18, 1    ; c07 c15 d07 d15
4620*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m18, q3232      ; c23 c31 d23 d31
4621*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m14, m26, q3131 ; 25
4622*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m26, q2020      ; 17
4623*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m15, m27, q3131 ; 27
4624*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m27, q2020      ; 19
4625*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m16, m28, q3131 ; 29
4626*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m28, q2020      ; 21
4627*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m17, m29, q3131 ; 31
4628*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m29, q2020      ; 23
4629*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m22, m8, q3131  ;  9
4630*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m8, q2020       ;  1
4631*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m23, m9, q3131  ; 11
4632*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m9, q2020       ;  3
4633*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m24, m11, q3131 ; 13
4634*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m11, q2020      ;  5
4635*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m25, m12, q3131 ; 15
4636*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m12, q2020      ;  7
4637*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf
4638*c0909341SAndroid Build Coastguard Worker    jmp .end
4639*c0909341SAndroid Build Coastguard Worker.fast: ; bottom/right halves are zero
4640*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(dup16_perm)]
4641*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m9,       [cq+64* 0]
4642*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m6,       [cq+64* 8]
4643*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m14,  [cq+64* 2]
4644*c0909341SAndroid Build Coastguard Worker    vpermb              ym0, ym14, [cq+64*14]
4645*c0909341SAndroid Build Coastguard Worker    vpermb              ym5, ym14, [cq+64*10]
4646*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m14,  [cq+64* 6]
4647*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m14,  [cq+64* 4]
4648*c0909341SAndroid Build Coastguard Worker    vpermb              ym3, ym14, [cq+64*12]
4649*c0909341SAndroid Build Coastguard Worker    pslld                m9, 16
4650*c0909341SAndroid Build Coastguard Worker    pslld                m6, 16
4651*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast
4652*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m14,  [cq+64* 1]
4653*c0909341SAndroid Build Coastguard Worker    vpermb             ym17, ym14, [cq+64*15]
4654*c0909341SAndroid Build Coastguard Worker    vpermb             ym20, ym14, [cq+64* 9]
4655*c0909341SAndroid Build Coastguard Worker    vpermb              m15, m14,  [cq+64* 7]
4656*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m14,  [cq+64* 5]
4657*c0909341SAndroid Build Coastguard Worker    vpermb             ym16, ym14, [cq+64*11]
4658*c0909341SAndroid Build Coastguard Worker    vpermb             ym19, ym14, [cq+64*13]
4659*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m14,  [cq+64* 3]
4660*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4661*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_8192)]
4662*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
4663*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m14, m2, q2020 ;  1
4664*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m14, m2, q3131 ;  5
4665*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m17, m9, q2020 ;  3
4666*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m17, m9, q3131 ;  7
4667*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m5, m15, q2020 ; 10
4668*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m5, m15, q3131 ; 14
4669*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m1, m18, q2020 ;  2
4670*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m1, m18, q3131 ;  6
4671*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m0, m3, q3131  ;  4
4672*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q2020      ;  0
4673*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m21, m4, q3131 ; 12
4674*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m21, m4, q2020 ;  8
4675*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m20, m6, q2020 ;  9
4676*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m20, m6, q3131 ; 13
4677*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m19, m7, q2020 ; 11
4678*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m19, m7, q3131 ; 15
4679*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
4680*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
4681*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m15
4682*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m16
4683*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m17
4684*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m18
4685*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m19
4686*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m20
4687*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m21
4688*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf_fast
4689*c0909341SAndroid Build Coastguard Worker.end:
4690*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
4691*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2048)]
4692*c0909341SAndroid Build Coastguard Worker    movshdup            m13, [o(permD)]
4693*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r4*8]
4694*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq+r4] ; stride*4
4695*c0909341SAndroid Build Coastguard Worker    add                  r3, r5           ; dst+stride*28
4696*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       29,  0, strideq*0, r4
4697*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       28,  1, strideq*1, strideq*2
4698*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       27,  2, strideq*2, strideq*1
4699*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       26,  3, r4       , strideq*0
4700*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       25,  4, strideq*0, r4
4701*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       24,  5, strideq*1, strideq*2
4702*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       23,  6, strideq*2, strideq*1
4703*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       22,  7, r4       , strideq*0
4704*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       21,  8, strideq*0, r4
4705*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       20,  9, strideq*1, strideq*2
4706*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       19, 10, strideq*2, strideq*1
4707*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       18, 11, r4       , strideq*0
4708*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       17, 12, strideq*0, r4
4709*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       16, 13, strideq*1, strideq*2
4710*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       15, 14, strideq*2, strideq*1
4711*c0909341SAndroid Build Coastguard Worker    IDCT_32x32_END       14, 15, r4       , strideq*0
4712*c0909341SAndroid Build Coastguard Worker    RET
4713*c0909341SAndroid Build Coastguard Worker.dconly:
4714*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
4715*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
4716*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
4717*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
4718*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_oddhalf_fast3 -- reduced-input entry into the odd-half butterfly
; network; it finishes by jumping to .main4.
;   In:      m22, m23 are the only coefficient registers read here.
;            m10 seeds every vpdpwssd accumulator before the >>12
;            (presumably the rounding bias set up by the caller -- TODO confirm).
;   Writes:  m8, m9, m12, m14-m18, m20-m29 before the jump to .main4.
4719*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
    ; Scale the two inputs by broadcast constants; each input register is
    ; used twice, yielding the four terms named in the per-line comments.
4720*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m21, [o(pw_4091x8)]
4721*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_201x8)]
4722*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m24, [o(pw_m601x8)]
4723*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_4052x8)]
4724*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m22 ; t31a
4725*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m8  ; t16a
4726*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m23 ; t23a
4727*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m12 ; t24a
4728*c0909341SAndroid Build Coastguard Worker
    ; Interleave the (t16a, t31a) words and take dot products against the
    ; pw_m4017_799 and pw_799_4017 pairs. Low and high halves are handled
    ; separately, shifted right by 12 and re-packed into m15 and m8.
4729*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m22, m21
4730*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m22, m21
4731*c0909341SAndroid Build Coastguard Worker    mova                m15, m10
4732*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m15, m9, [o(pw_m4017_799)] {bcstd}
4733*c0909341SAndroid Build Coastguard Worker    mova                m17, m10
4734*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m8, [o(pw_m4017_799)] {bcstd}
4735*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m15, m17
4736*c0909341SAndroid Build Coastguard Worker    packssdw            m15, m17
4737*c0909341SAndroid Build Coastguard Worker    mova                m17, m10
4738*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m8, [o(pw_799_4017)] {bcstd}
4739*c0909341SAndroid Build Coastguard Worker    mova                 m8, m10
4740*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m8, m9, [o(pw_799_4017)] {bcstd}
4741*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m17, m8
4742*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m17
4743*c0909341SAndroid Build Coastguard Worker
    ; Same pattern for the (t23a, t24a) words, using the pw_m3406_m2276 and
    ; pw_m2276_3406 pairs; results land in m20 and m16.
4744*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m24, m23
4745*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m24, m23
4746*c0909341SAndroid Build Coastguard Worker    mova                m20, m10
4747*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m20, m9, [o(pw_m3406_m2276)] {bcstd}
4748*c0909341SAndroid Build Coastguard Worker    mova                m17, m10
4749*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m16, [o(pw_m3406_m2276)] {bcstd}
4750*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m20, m17
4751*c0909341SAndroid Build Coastguard Worker    packssdw            m20, m17
4752*c0909341SAndroid Build Coastguard Worker    mova                m17, m10
4753*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m17, m16, [o(pw_m2276_3406)] {bcstd}
4754*c0909341SAndroid Build Coastguard Worker    mova                m16, m10
4755*c0909341SAndroid Build Coastguard Worker    vpdpwssd            m16, m9, [o(pw_m2276_3406)] {bcstd}
4756*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m17, m16
4757*c0909341SAndroid Build Coastguard Worker    packssdw            m16, m17
4758*c0909341SAndroid Build Coastguard Worker
    ; Duplicate the eight values into the remaining registers that .main4
    ; reads (m14, m17, m18, m25-m29), then join the common tail.
4759*c0909341SAndroid Build Coastguard Worker    mova                m17, m21
4760*c0909341SAndroid Build Coastguard Worker    mova                m27, m15
4761*c0909341SAndroid Build Coastguard Worker    mova                m25, m20
4762*c0909341SAndroid Build Coastguard Worker    mova                m29, m8
4763*c0909341SAndroid Build Coastguard Worker    mova                m18, m22
4764*c0909341SAndroid Build Coastguard Worker    mova                m14, m24
4765*c0909341SAndroid Build Coastguard Worker    mova                m28, m16
4766*c0909341SAndroid Build Coastguard Worker    mova                m26, m23
4767*c0909341SAndroid Build Coastguard Worker    jmp .main4
; .main_oddhalf_fast2 -- reduced-input entry into the odd-half butterfly
; network; it finishes by jumping to .main3.
;   In:      m22, m23, m24, m25 are the only coefficient registers read here.
;   Writes:  m8, m9, m11, m12, m14-m16, m18-m29 before the jump to .main3.
4768*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    ; Broadcast the eight word constants (two per input register).
4769*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m21, [o(pw_4091x8)]
4770*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_201x8)]
4771*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [o(pw_m1380x8)]
4772*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_3857x8)]
4773*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [o(pw_3973x8)]
4774*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_995x8)]
4775*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m28, [o(pw_m601x8)]
4776*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_4052x8)]
    ; Each input is multiplied twice: first into the constant's register
    ; (the input is still intact), then in place.
4777*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m22 ; t31a
4778*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m8  ; t16a
4779*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m25 ; t19a
4780*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m9 ; t28a
4781*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m24 ; t27a
4782*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m11 ; t20a
4783*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m28, m23 ; t23a
4784*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m12 ; t24a
    ; Plain copies stand in for the .main2 add/sub stage: they match what
    ; .main2 would leave in these registers if the second operand of each
    ; of its add/sub pairs were zero.
4785*c0909341SAndroid Build Coastguard Worker    mova                m15, m21
4786*c0909341SAndroid Build Coastguard Worker    mova                 m8, m22
4787*c0909341SAndroid Build Coastguard Worker    mova                m14, m18
4788*c0909341SAndroid Build Coastguard Worker    mova                m27, m25
4789*c0909341SAndroid Build Coastguard Worker    mova                m29, m19
4790*c0909341SAndroid Build Coastguard Worker    mova                m26, m24
4791*c0909341SAndroid Build Coastguard Worker    mova                m16, m28
4792*c0909341SAndroid Build Coastguard Worker    mova                m20, m23
4793*c0909341SAndroid Build Coastguard Worker    jmp .main3
4794*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_oddhalf_fast -- reduced-input entry into the odd-half butterfly
; network; it finishes by jumping to .main2.
;   In:      m22-m29 hold the eight coefficient inputs.
;   Out:     sixteen terms t16a..t31a (see per-line comments) spread over
;            m14-m29, then control continues at .main2.
;   Scratch: m8, m9, m11, m12 are reloaded with constants several times.
; Constant broadcasts are interleaved with the multiplies; the scratch
; registers are only reloaded after their previous constant has been used,
; so the instruction order must be kept.
4795*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; bottom half is zero
4796*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m21, [o(pw_4091x8)]
4797*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_201x8)]
4798*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_m2751x8)]
4799*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_3035x8)]
4800*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [o(pw_3703x8)]
4801*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1751x8)]
4802*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [o(pw_m1380x8)]
4803*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_3857x8)]
4804*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m22 ; t31a
4805*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [o(pw_3973x8)]
4806*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m8  ; t16a
4807*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_995x8)]
4808*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m29 ; t30a
4809*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [o(pw_m2106x8)]
4810*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m9  ; t17a
4811*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_3513x8)]
4812*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m26 ; t29a
4813*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_3290x8)]
4814*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m26, m11 ; t18a
4815*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2440x8)]
4816*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m25 ; t19a
4817*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m20, [o(pw_m601x8)]
4818*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m12 ; t28a
4819*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_4052x8)]
4820*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m24 ; t27a
4821*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m8  ; t20a
4822*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m27 ; t21a
4823*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m9  ; t26a
4824*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m28 ; t25a
4825*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m28, m11 ; t22a
4826*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m23 ; t23a
4827*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m12 ; t24a
4828*c0909341SAndroid Build Coastguard Worker    jmp .main2
4829*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_oddhalf -- full odd-half butterfly network, returning with `ret`.
;   In:      m14-m29 hold the sixteen inputs (paired as in the
;            ITX_MULSUB_2W lines below).
;            m10 is passed as the fifth argument of every ITX_MULSUB_2W
;            (presumably its rounding constant -- the macro is defined
;            elsewhere, TODO confirm).
;   Out:     m14-m29, each holding the t-value named by the last comment
;            that mentions it.
;   Scratch: m8, m9, m11, m12.
; The reduced entry points join at .main2, .main3 and .main4; every
; add/sub pair below overwrites one of its sources, so statement order is
; significant.
4830*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf
    ; Stage 1: eight rotations, one per input register pair.
4831*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        22, 21,  8,  9, 10,  201, 4091 ; t16a, t31a
4832*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        14, 29,  8,  9, 10, 3035, 2751 ; t17a, t30a
4833*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        26, 17,  8,  9, 10, 1751, 3703 ; t18a, t29a
4834*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        18, 25,  8,  9, 10, 3857, 1380 ; t19a, t28a
4835*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        24, 19,  8,  9, 10,  995, 3973 ; t20a, t27a
4836*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        16, 27,  8,  9, 10, 3513, 2106 ; t21a, t26a
4837*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        28, 15,  8,  9, 10, 2440, 3290 ; t22a, t25a
4838*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        20, 23,  8,  9, 10, 4052,  601 ; t23a, t24a
    ; Stage 2 (.main_oddhalf_fast joins here): saturating add/sub of
    ; neighbouring stage-1 terms.
4839*c0909341SAndroid Build Coastguard Worker.main2:
4840*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m22, m14 ; t17
4841*c0909341SAndroid Build Coastguard Worker    paddsw              m22, m14      ; t16
4842*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m18, m26 ; t19
4843*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m26      ; t18
4844*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m24, m16 ; t21
4845*c0909341SAndroid Build Coastguard Worker    paddsw              m24, m16      ; t20
4846*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m20, m28 ; t22
4847*c0909341SAndroid Build Coastguard Worker    paddsw              m28, m20      ; t23
4848*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m23, m15 ; t25
4849*c0909341SAndroid Build Coastguard Worker    paddsw              m23, m15      ; t24
4850*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m21, m29 ; t30
4851*c0909341SAndroid Build Coastguard Worker    paddsw              m21, m29      ; t31
4852*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m19, m27 ; t26
4853*c0909341SAndroid Build Coastguard Worker    paddsw              m19, m27      ; t27
4854*c0909341SAndroid Build Coastguard Worker    paddsw              m27, m25, m17 ; t28
4855*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m17      ; t29
    ; Stage 3 (.main_oddhalf_fast2 joins here): four rotations followed by
    ; another round of saturating add/sub.
4856*c0909341SAndroid Build Coastguard Worker.main3:
4857*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        15,  8,  9, 17, 10,   799, 4017 ; t17a, t30a
4858*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        25, 18,  9, 17, 10, m4017,  799 ; t18a, t29a
4859*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        29, 26,  9, 17, 10,  3406, 2276 ; t21a, t26a
4860*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        20, 16,  9, 17, 10, m2276, 3406 ; t22a, t25a
4861*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m21, m27 ; t28a
4862*c0909341SAndroid Build Coastguard Worker    paddsw              m21, m27      ; t31a
4863*c0909341SAndroid Build Coastguard Worker    psubsw              m27, m15, m25 ; t18
4864*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m25      ; t17
4865*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m20, m29 ; t21
4866*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m29      ; t22
4867*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m8, m18  ; t29
4868*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m18      ; t30
4869*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m22, m14 ; t19a
4870*c0909341SAndroid Build Coastguard Worker    paddsw              m22, m14      ; t16a
4871*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m28, m24 ; t20a
4872*c0909341SAndroid Build Coastguard Worker    paddsw              m24, m28      ; t23a
4873*c0909341SAndroid Build Coastguard Worker    paddsw              m28, m16, m26 ; t25
4874*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m26      ; t26
4875*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m23, m19 ; t27a
4876*c0909341SAndroid Build Coastguard Worker    paddsw              m23, m19      ; t24a
    ; Stage 4 (.main_oddhalf_fast3 joins here): rotations whose coefficient
    ; pairs are preloaded into m11/m12 and passed by register number, one
    ; more add/sub round, then the final rotations using the
    ; pw_2896_2896 / pw_m2896_2896 pairs.
4877*c0909341SAndroid Build Coastguard Worker.main4:
4878*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m3784_1567)]
4879*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1567_3784)]
4880*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        29, 27,  9, 19, 10, 11, 12 ; t18a, t29a
4881*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        17, 18,  9, 19, 10, 11, 12 ; t19,  t28
    ; m12 keeps pw_m3784_1567; only m11 is replaced, and the two constant
    ; arguments are passed in swapped order for the next two rotations.
4882*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m1567_m3784)]
4883*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        16, 25,  9, 19, 10, 12, 11 ; t21a, t26a
4884*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        26, 14,  9, 19, 10, 12, 11 ; t20,  t27
4885*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m2896_2896)]
4886*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2896_2896)]
4887*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m27, m25 ; t26
4888*c0909341SAndroid Build Coastguard Worker    paddsw              m27, m25      ; t29
4889*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m17, m26 ; t20a
4890*c0909341SAndroid Build Coastguard Worker    paddsw              m17, m26      ; t19a
4891*c0909341SAndroid Build Coastguard Worker    paddsw              m26, m18, m14 ; t28a
4892*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m14      ; t27a
4893*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m22, m24 ; t16
4894*c0909341SAndroid Build Coastguard Worker    psubsw              m22, m24      ; t23
4895*c0909341SAndroid Build Coastguard Worker    psubsw              m24, m29, m16 ; t21
4896*c0909341SAndroid Build Coastguard Worker    paddsw              m16, m29      ; t18
4897*c0909341SAndroid Build Coastguard Worker    paddsw              m29, m21, m23 ; t31
4898*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m23      ; t24
4899*c0909341SAndroid Build Coastguard Worker    psubsw              m23, m15, m20 ; t22a
4900*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m20      ; t17a
4901*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m8, m28  ; t25a
4902*c0909341SAndroid Build Coastguard Worker    paddsw              m28, m8       ; t30a
4903*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        18, 25,  8,  9, 10, 11, 12 ; t20,  t27
4904*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        19, 24,  8,  9, 10, 11, 12 ; t21a, t26a
4905*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        21, 22,  8,  9, 10, 11, 12 ; t23a, t24a
4906*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        20, 23,  8,  9, 10, 11, 12 ; t22,  t25
4907*c0909341SAndroid Build Coastguard Worker    ret
4908*c0909341SAndroid Build Coastguard Worker
; IDTX_32x32 a, b -- load helper for the 32x32 identity transform.
; For each of the two register numbers n = %1 and n = %2:
;   - loads 32 bytes from [cq+64*n] into ymN and 32 bytes from
;     [cq+64*(n+16)] into a scratch register (m17 for %1, m18 for %2);
;   - merges the pair into mN with vpermt2q, using m21 as the index vector.
; Expects: m21 to be set up by the caller before the macro is used.
; Clobbers: m17, m18.
4909*c0909341SAndroid Build Coastguard Worker%macro IDTX_32x32 2 ; dst[1-2]
4910*c0909341SAndroid Build Coastguard Worker    vmovdqa32           ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
4911*c0909341SAndroid Build Coastguard Worker    vmovdqa32           ym17, [cq+64*(%1+16)] ; reduces code size due to
4912*c0909341SAndroid Build Coastguard Worker    vmovdqa32           ym%2, [cq+64*(%2+ 0)] ; compressed displacements
4913*c0909341SAndroid Build Coastguard Worker    vmovdqa32           ym18, [cq+64*(%2+16)]
4914*c0909341SAndroid Build Coastguard Worker    vpermt2q             m%1, m21, m17
4915*c0909341SAndroid Build Coastguard Worker    vpermt2q             m%2, m21, m18
4916*c0909341SAndroid Build Coastguard Worker%endmacro
4917*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_32x32_8bpc(dst, stride, c)
; Processes the coefficient buffer at c in passes of sixteen registers,
; handing each pass to the shared transpose/round helper and the
; IDTX_32x16_STORE macro (both defined elsewhere), then clears 2048 bytes
; of the coefficient buffer.
; Register roles set up here:
;   m21 = vector loaded from [permB+7], used as the vpermt2q index vector
;         inside IDTX_32x32
;   m16 = pw_8192 broadcast (presumably the rounding multiplier consumed by
;         the called helper -- TODO confirm)
;   m20 = all zeros, the value stored by .zero_loop
4918*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
4919*c0909341SAndroid Build Coastguard Worker    movu                 m21, [permB+7]
4920*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m16, [pw_8192]
4921*c0909341SAndroid Build Coastguard Worker    pxor                 m20, m20
4922*c0909341SAndroid Build Coastguard Worker.loop:
    ; Load m0-m15 from the coefficient buffer (m17/m18 are scratch).
4923*c0909341SAndroid Build Coastguard Worker    IDTX_32x32            0,  1
4924*c0909341SAndroid Build Coastguard Worker    IDTX_32x32            2,  3
4925*c0909341SAndroid Build Coastguard Worker    IDTX_32x32            4,  5
4926*c0909341SAndroid Build Coastguard Worker    IDTX_32x32            6,  7
4927*c0909341SAndroid Build Coastguard Worker    IDTX_32x32            8,  9
4928*c0909341SAndroid Build Coastguard Worker    IDTX_32x32           10, 11
4929*c0909341SAndroid Build Coastguard Worker    IDTX_32x32           12, 13
4930*c0909341SAndroid Build Coastguard Worker    IDTX_32x32           14, 15
4931*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    ; Registers are passed to the store macro in pairs (n, n+8).
4932*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      0,  8, 1
4933*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      1,  9, 1
4934*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      2, 10, 1
4935*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      3, 11, 1
4936*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      4, 12, 1
4937*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      5, 13, 1
4938*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      6, 14, 1
4939*c0909341SAndroid Build Coastguard Worker    IDTX_32x16_STORE      7, 15, 1
4940*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*8]
    ; btc toggles bit 5 of cq and puts its previous value in CF, so the
    ; loop repeats while that bit was clear. Assuming cq enters with bit 5
    ; clear (TODO confirm buffer alignment), the body runs twice -- at cq
    ; and at cq+32 -- and cq is back at its starting value afterwards.
4941*c0909341SAndroid Build Coastguard Worker    btc                  cq, 5
4942*c0909341SAndroid Build Coastguard Worker    jnc .loop
    ; r0 serves as the iteration counter from here on (presumably the
    ; register behind dstq, which is not referenced again -- TODO confirm).
    ; 8 iterations x 4 stores x 64 bytes = 2048 bytes zeroed.
4943*c0909341SAndroid Build Coastguard Worker    mov                 r0d, 8
4944*c0909341SAndroid Build Coastguard Worker.zero_loop:
4945*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*0], m20
4946*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*1], m20
4947*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*2], m20
4948*c0909341SAndroid Build Coastguard Worker    mova          [cq+64*3], m20
4949*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*4
4950*c0909341SAndroid Build Coastguard Worker    dec                 r0d
4951*c0909341SAndroid Build Coastguard Worker    jg .zero_loop
4952*c0909341SAndroid Build Coastguard Worker    RET
4953*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_16x64_8bpc(dst, stride, c, eob)
;
; Declared through cglobal with 4 arguments, 7 general-purpose registers
; and 0 vector registers; the vector-register reservation
; (WIN64_SPILL_XMM 30) is deferred until after the DC-only check.
;
; Paths, selected on eob:
;   eob == 0   -> .dconly: computes (c[0]*181 + 640) >> 10 in r6d, stores
;                 eob (zero here) over c[0], sets r3d to 64 and tail-jumps
;                 to the 16x8 function's .dconly3.
;   eob <  151 -> .fast ("right half is zero"): reads only 32-byte halves
;                 of the coefficient rows.
;   otherwise  -> full path: reads all sixteen 64-byte coefficient rows.
;
; Both non-DC paths produce rows 0..15 (plus 16..31 on the full path) in
; registers, run the shared even/odd helper stages, park eight helper
; results in cq+64*8..15, call .main_oddhalf / .main_oddhalf_fast and then
; join at .end. .end uses IDCT_16x64_END sixteen times to add the results
; to the destination pixels and to zero cq+64*0..15.
;
; r5 = o_base; presumably the base register behind the o(...) constant
; addressing used throughout - confirm against the o() macro definition.
;-----------------------------------------------------------------------
4954*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
4955*c0909341SAndroid Build Coastguard Worker%undef cmp
4956*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
4957*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4958*c0909341SAndroid Build Coastguard Worker    jz .dconly
4959*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      30
4960*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
4961*c0909341SAndroid Build Coastguard Worker    jb .fast
; Full path: even-numbered coefficient rows feed the 32x8 .main helper,
; odd-numbered rows feed the 32x16 .main_oddhalf helper.
4962*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*10]
4963*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 6]
4964*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 2]
4965*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64*14]
4966*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 4]
4967*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*12]
4968*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0]
4969*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64* 8]
4970*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
4971*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64* 1]
4972*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64*15]
4973*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64* 9]
4974*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64* 7]
4975*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 5]
4976*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64*11]
4977*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64*13]
4978*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64* 3]
4979*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
4980*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_8192)]
; TRANSPOSE_8x4_ROUND a, b, c, d: word/dword interleave of four registers
; followed by pmulhrsw with m9. Assuming pw_8192 holds 8192 in every word,
; that multiply is a rounded shift: (x + 2) >> 2. Clobbers m8.
4981*c0909341SAndroid Build Coastguard Worker%macro TRANSPOSE_8x4_ROUND 4
4982*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
4983*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%3, m%4      ; c0 d0 c1 d1 c2 d2 c3 d3
4984*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
4985*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m%2      ; a0 b0 a1 b1 a2 b2 a3 b3
4986*c0909341SAndroid Build Coastguard Worker    punpckhdq           m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
4987*c0909341SAndroid Build Coastguard Worker    punpckldq           m%1, m%3      ; a0 b0 c0 d0 a1 b1 c1 d1
4988*c0909341SAndroid Build Coastguard Worker    punpckldq           m%3, m%4, m8  ; a4 b4 c4 d4 a5 b5 c5 d5
4989*c0909341SAndroid Build Coastguard Worker    punpckhdq           m%4, m8       ; a6 b6 c6 d6 a7 b7 c7 d7
4990*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
4991*c0909341SAndroid Build Coastguard Worker%endmacro
4992*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_ROUND   0,  1,  2,  3
4993*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_ROUND   4,  5,  6,  7
4994*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_ROUND  14, 15, 16, 17
4995*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_ROUND  18, 19, 20, 21
; 128-bit lane shuffles finish the transpose. Afterwards each register
; holds a pair of rows, as annotated on the right (m22..m29 = rows 0..15,
; m4/m20/m5/m16/m3/m19/m0/m17 = rows 16..31).
4996*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m26, m0, ym4, 1     ; a0  a4  b0  b4
4997*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m4, q3232      ; a8  a12 b8  b12
4998*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m27, m1, ym5, 1     ; a1  a5  b1  b5
4999*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m5, q3232      ; a9  a13 b9  b13
5000*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m28, m2, ym6, 1     ; a2  a6  b2  b6
5001*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m6, q3232      ; a10 a14 b10 b14
5002*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m29, m3, ym7, 1     ; a3  a7  b3  b7
5003*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m3, m7, q3232  ; a11 a15 b11 b15
5004*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m4, m14, ym18, 1   ; c0  c4  d0  d4
5005*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m18, q3232     ; c8  c12 d8  d12
5006*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, m15, ym19, 1   ; c1  c5  d1  d5
5007*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m19, q3232     ; c9  c13 d9  d13
5008*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, m16, ym20, 1   ; c2  c6  d2  d6
5009*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m20, q3232     ; c10 c14 d10 d14
5010*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m7, m17, ym21, 1   ; c3  c7  d3  d7
5011*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m21, q3232     ; c11 c15 d11 d15
5012*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m26, m4, q2020 ;  0  1
5013*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m4, q3131      ;  8  9
5014*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m27, m5, q2020 ;  2  3
5015*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m5, q3131      ; 10 11
5016*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m28, m6, q2020 ;  4  5
5017*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m6, q3131      ; 12 13
5018*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m29, m7, q2020 ;  6  7
5019*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m7, q3131      ; 14 15
5020*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m0, m14, q2020 ; 16 17
5021*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m0, m14, q3131 ; 24 25
5022*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m1, m15, q2020 ; 18 19
5023*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m1, m15, q3131 ; 26 27
5024*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m2, m16, q2020 ; 20 21
5025*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m2, m16, q3131 ; 28 29
5026*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m8, m17, q2020 ; 22 23
5027*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m8, m17, q3131 ; 30 31
5028*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
; Row pairs 16..31 are spilled to cq+64*0..7 so their odd halves can be
; reloaded for .main_oddhalf after the helper calls below have reused the
; registers.
5029*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m4
5030*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m5
5031*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m3
5032*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m0
5033*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m24, m24 ;  4
5034*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m0       ; 28
5035*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m5       ; 20
5036*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m28, m28 ; 12
5037*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m26, m26 ;  8
5038*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m3       ; 24
5039*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m6, m22  ; __  0
5040*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m4       ; __ 16
5041*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast3
5042*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m20
5043*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m16
5044*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m19
5045*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m17
5046*c0909341SAndroid Build Coastguard Worker    punpcklwd           m21, m23, m23 ;  2
5047*c0909341SAndroid Build Coastguard Worker    punpcklwd           m17, m17      ; 30
5048*c0909341SAndroid Build Coastguard Worker    punpcklwd           m20, m20      ; 18
5049*c0909341SAndroid Build Coastguard Worker    punpcklwd           m15, m29, m29 ; 14
5050*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m27, m27 ; 10
5051*c0909341SAndroid Build Coastguard Worker    punpcklwd           m16, m16      ; 22
5052*c0909341SAndroid Build Coastguard Worker    punpcklwd           m19, m19      ; 26
5053*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m25, m25 ;  6
5054*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
; m14..m21 are parked in cq+64*8..15; IDCT_16x64_END reloads them from
; there for its invocations with a first argument of 8..15.
5055*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m14
5056*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m15
5057*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m16
5058*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m17
5059*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m18
5060*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m19
5061*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m20
5062*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m21
; Reload the spilled row pairs 16..31; the punpckhwd below keeps the
; odd-numbered row of every pair as input for .main_oddhalf.
5063*c0909341SAndroid Build Coastguard Worker    mova                m21, [cq+64* 7]
5064*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64* 0]
5065*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64* 3]
5066*c0909341SAndroid Build Coastguard Worker    mova                m18, [cq+64* 4]
5067*c0909341SAndroid Build Coastguard Worker    mova                m19, [cq+64* 5]
5068*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 2]
5069*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64* 1]
5070*c0909341SAndroid Build Coastguard Worker    mova                m20, [cq+64* 6]
5071*c0909341SAndroid Build Coastguard Worker    REPX   {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
5072*c0909341SAndroid Build Coastguard Worker                             m24, m19, m16, m27, m28, m15, m20, m23
5073*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf
5074*c0909341SAndroid Build Coastguard Worker    jmp .end
5075*c0909341SAndroid Build Coastguard Worker.fast: ; right half is zero
; Each register is assembled from the low 32 bytes of two coefficient rows
; and then permuted with int16_perm (held in m2) before the shared 16x16
; .main2 helper runs.
5076*c0909341SAndroid Build Coastguard Worker    mova                ym8, [cq+64*15]
5077*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m8, [cq+64* 1], 1
5078*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(int16_perm)]
5079*c0909341SAndroid Build Coastguard Worker    mova                ym9, [cq+64* 8]
5080*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, [cq+64* 0], 1
5081*c0909341SAndroid Build Coastguard Worker    mova                ym0, [cq+64* 7]
5082*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, [cq+64* 9], 1
5083*c0909341SAndroid Build Coastguard Worker    mova                ym7, [cq+64*14]
5084*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m7, [cq+64* 2], 1
5085*c0909341SAndroid Build Coastguard Worker    mova                ym1, [cq+64* 3]
5086*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, [cq+64*13], 1
5087*c0909341SAndroid Build Coastguard Worker    mova                ym3, [cq+64* 6]
5088*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, [cq+64*10], 1
5089*c0909341SAndroid Build Coastguard Worker    mova                ym5, [cq+64*11]
5090*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, [cq+64* 5], 1
5091*c0909341SAndroid Build Coastguard Worker    mova                ym6, [cq+64*12]
5092*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m6, [cq+64* 4], 1
5093*c0909341SAndroid Build Coastguard Worker    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
5094*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main2
5095*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m8, [o(int_shuf3)]
5096*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4      m9, [o(int_shuf4)]
5097*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_8192)]
5098*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m8
5099*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m9
5100*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m8
5101*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9
5102*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
5103*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m8
5104*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m9
5105*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m8
5106*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m9
5107*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
5108*c0909341SAndroid Build Coastguard Worker    punpckhdq           m28, m0, m1
5109*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1
5110*c0909341SAndroid Build Coastguard Worker    punpckhdq           m27, m2, m3
5111*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3
5112*c0909341SAndroid Build Coastguard Worker    punpckhdq           m22, m4, m5
5113*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5
5114*c0909341SAndroid Build Coastguard Worker    punpckhdq           m23, m6, m7
5115*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7
5116*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m14, m0, ym2, 1
5117*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m0, m2, q3232
5118*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m2, m4, ym6, 1
5119*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m6, q3232
5120*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m14, m2, q2020 ;  0  2
5121*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m2, q3131      ;  4  6
5122*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m15, m4, q2020 ;  8 10
5123*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m4, q3131      ; 12 14
5124*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
5125*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m14, m14 ;  4
5126*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m15, m15 ; 12
5127*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m18, m18 ;  8
5128*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m21      ; __  0
5129*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast4
5130*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m21      ;  2
5131*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m15      ; 14
5132*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m18      ; 10
5133*c0909341SAndroid Build Coastguard Worker    punpckhwd           m14, m14      ;  6
5134*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
; The odd rows parked in m28/m27/m22/m23 are lane-shuffled only now, after
; the two helper calls above; NOTE(review): this relies on those helpers
; leaving m22, m23, m27 and m28 untouched - confirm against their bodies.
5135*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, m28, ym27, 1
5136*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m27, q3232
5137*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m27, m22, ym23, 1
5138*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m23, q3232
5139*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m24, m27, q2020 ;  1  3
5140*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m27, q3131      ;  5  7
5141*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m28, m22, q2020 ;  9 11
5142*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m22, q3131      ; 13 15
5143*c0909341SAndroid Build Coastguard Worker    punpcklwd           m22, m23, m23 ;  1
5144*c0909341SAndroid Build Coastguard Worker    punpckhwd           m29, m28, m28 ; 15
5145*c0909341SAndroid Build Coastguard Worker    punpcklwd           m26, m27, m27 ;  9
5146*c0909341SAndroid Build Coastguard Worker    punpckhwd           m25, m24, m24 ;  7
5147*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m14
5148*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m15
5149*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m16
5150*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m17
5151*c0909341SAndroid Build Coastguard Worker    punpcklwd           m24, m24      ;  5
5152*c0909341SAndroid Build Coastguard Worker    punpckhwd           m27, m27      ; 11
5153*c0909341SAndroid Build Coastguard Worker    punpcklwd           m28, m28      ; 13
5154*c0909341SAndroid Build Coastguard Worker    punpckhwd           m23, m23      ;  3
5155*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m18
5156*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m19
5157*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m20
5158*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m21
5159*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf_fast
5160*c0909341SAndroid Build Coastguard Worker.end:
; Output stage. Register roles from here on:
;   r3  = stride*3 (the eob argument register is reused)
;   r4  = stride*4
;   r6  = dst + stride*60, walks upwards from the last 4-row group
;   m10 = end_16x32p permutation, m13 = m10 byte-shifted right by one
;   m11 = pw_2048 broadcast; assuming 2048 in every word, pmulhrsw with it
;         is a rounded shift: (x + 8) >> 4
;   m12 = zero, used to clear the coefficient rows
5161*c0909341SAndroid Build Coastguard Worker    imul                 r6, strideq, 60
5162*c0909341SAndroid Build Coastguard Worker    mova                m10, [o(end_16x32p)]
5163*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2048)]
5164*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
5165*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
5166*c0909341SAndroid Build Coastguard Worker    add                  r6, dstq         ; dst+stride*60
5167*c0909341SAndroid Build Coastguard Worker    psrldq              m13, m10, 1
5168*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq+r3] ; stride*4
; IDCT_16x64_END idct32, idct64, tmp
;   Handles two rows addressed from dstq and two rows addressed from r6.
;   Even invocations use row offsets 0,1 (dstq) and 3,2 (r6); odd ones use
;   3,2 (dstq) and 0,1 (r6). Every even invocation except the first moves
;   dstq down and r6 up by four rows first.
;   For idct32 < 8 both m%1 and m%2 are rounded directly; otherwise the
;   value parked at cq+64*idct32 is reloaded and combined with m%2 as a
;   saturating sum and difference before rounding.
;   Pixels are loaded into m29 and m<tmp>, permuted with m10, added to
;   the rounded results, packed with unsigned saturation, permuted with
;   m13 and stored back. cq+64*idct32 is zeroed along the way.
;   Clobbers m8, m9, m29 and m<tmp>.
5169*c0909341SAndroid Build Coastguard Worker%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
5170*c0909341SAndroid Build Coastguard Worker%if %1 & 1
5171*c0909341SAndroid Build Coastguard Worker    %define %%s0 r3
5172*c0909341SAndroid Build Coastguard Worker    %define %%s1 strideq*2
5173*c0909341SAndroid Build Coastguard Worker    %define %%s2 strideq*1
5174*c0909341SAndroid Build Coastguard Worker    %define %%s3 strideq*0
5175*c0909341SAndroid Build Coastguard Worker%else
5176*c0909341SAndroid Build Coastguard Worker    %define %%s0 strideq*0
5177*c0909341SAndroid Build Coastguard Worker    %define %%s1 strideq*1
5178*c0909341SAndroid Build Coastguard Worker    %define %%s2 strideq*2
5179*c0909341SAndroid Build Coastguard Worker    %define %%s3 r3
5180*c0909341SAndroid Build Coastguard Worker%if %1
5181*c0909341SAndroid Build Coastguard Worker    add                dstq, r4
5182*c0909341SAndroid Build Coastguard Worker    sub                  r6, r4
5183*c0909341SAndroid Build Coastguard Worker%endif
5184*c0909341SAndroid Build Coastguard Worker%endif
5185*c0909341SAndroid Build Coastguard Worker%if %1 < 8
5186*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11, m%1
5187*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11, m%2
5188*c0909341SAndroid Build Coastguard Worker%else
5189*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+64*%1]
5190*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m9, m%2 ; out  0+n,  1+n
5191*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m%2     ; out 63-n, 62-n
5192*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m11
5193*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m11
5194*c0909341SAndroid Build Coastguard Worker%endif
5195*c0909341SAndroid Build Coastguard Worker    mova               xm29, [dstq+%%s0]
5196*c0909341SAndroid Build Coastguard Worker    vinserti128        ym29, [dstq+%%s1], 1
5197*c0909341SAndroid Build Coastguard Worker    mova               xm%3, [r6  +%%s3]
5198*c0909341SAndroid Build Coastguard Worker    vinserti128        ym%3, [r6  +%%s2], 1
5199*c0909341SAndroid Build Coastguard Worker    vpermb              m29, m10, m29
5200*c0909341SAndroid Build Coastguard Worker    vpermb              m%3, m10, m%3
5201*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*%1], m12
5202*c0909341SAndroid Build Coastguard Worker    paddw               m29, m8
5203*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m9
5204*c0909341SAndroid Build Coastguard Worker    packuswb            m29, m%3
5205*c0909341SAndroid Build Coastguard Worker    vpermd              m29, m13, m29
5206*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%%s0], xm29
5207*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+%%s1], ym29, 1
5208*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6  +%%s2], m29, 2
5209*c0909341SAndroid Build Coastguard Worker    vextracti32x4 [r6  +%%s3], m29, 3
5210*c0909341SAndroid Build Coastguard Worker%endmacro
; The first two invocations cannot use m28 as scratch before it has been
; consumed as the idct64 operand: invocation 0 uses m0 (already read) and
; invocation 1 uses m28 itself, after which m28 is free for the rest.
5211*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        0, 29,  0
5212*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        1, 28, 28
5213*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        2, 27, 28
5214*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        3, 26, 28
5215*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        4, 25, 28
5216*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        5, 24, 28
5217*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        6, 23, 28
5218*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        7, 22, 28
5219*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        8, 21, 28
5220*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END        9, 20, 28
5221*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END       10, 19, 28
5222*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END       11, 18, 28
5223*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END       12, 17, 28
5224*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END       13, 16, 28
5225*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END       14, 15, 28
5226*c0909341SAndroid Build Coastguard Worker    IDCT_16x64_END       15, 14, 28
5227*c0909341SAndroid Build Coastguard Worker    RET
5228*c0909341SAndroid Build Coastguard Worker.dconly:
; DC-only shortcut, reached with eob == 0. eobd is therefore zero, so the
; store below clears the DC coefficient and the `or` leaves r3d (the eob
; register) equal to 64, presumably the row count expected by .dconly3.
; r6d ends up as (dc*181 + 128 + 512) >> 10.
5229*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
5230*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
5231*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
5232*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
5233*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+512
5234*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+2
5235*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
;-----------------------------------------------------------------------
; .main_oddhalf_fast - reduced entry into the odd-half stage, used when
; only eight input registers are populated.
;
; In:    m22, m29, m26, m25, m24, m27, m28, m23 (one input each)
; Out:   each input is multiplied (pmulhrsw) by its broadcast constant; the
;        product is left both in m8/m21/m14/m17/m18/m19/m16/m15 and,
;        duplicated, in m22/m29/m26/m25/m24/m27/m28/m20. Those are the
;        registers .main_oddhalf fills with its difference and sum
;        results, so its butterfly stage is skipped here. Note the last
;        copy goes to m20, not back to m23.
; Exit:  jumps to .main_oddhalf2.
;-----------------------------------------------------------------------
5236*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5237*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
5238*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_101_4095x8)]
5239*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m21, [o(pw_m1474_3822x8)]
5240*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_897_3996x8)]
5241*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [o(pw_m700_4036x8)]
5242*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [o(pw_501_4065x8)]
5243*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [o(pw_m1092_3948x8)]
5244*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m16, [o(pw_1285_3889x8)]
5245*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_m301_4085x8)]
5246*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m22 ; t32a t63a
5247*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m29 ; t35a t60a
5248*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m26 ; t36a t59a
5249*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m25 ; t39a t56a
5250*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m24 ; t40a t55a
5251*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m27 ; t43a t52a
5252*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m28 ; t44a t51a
5253*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m23 ; t47a t48a
; With the partner terms absent, the sum and the difference of each
; butterfly equal the single product, so each product is just duplicated.
5254*c0909341SAndroid Build Coastguard Worker    mova                m22, m8
5255*c0909341SAndroid Build Coastguard Worker    mova                m29, m21
5256*c0909341SAndroid Build Coastguard Worker    mova                m26, m14
5257*c0909341SAndroid Build Coastguard Worker    mova                m25, m17
5258*c0909341SAndroid Build Coastguard Worker    mova                m24, m18
5259*c0909341SAndroid Build Coastguard Worker    mova                m27, m19
5260*c0909341SAndroid Build Coastguard Worker    mova                m28, m16
5261*c0909341SAndroid Build Coastguard Worker    mova                m20, m15
5262*c0909341SAndroid Build Coastguard Worker    jmp .main_oddhalf2
;-----------------------------------------------------------------------
; .main_oddhalf - full entry into the odd-half stage (16 inputs).
;
; In:    m22, m21, m14, m29, m26, m17, m18, m25,
;        m24, m19, m16, m27, m28, m15, m20, m23 (one input each, in the
;        order they are multiplied below)
; Step 1: every input is multiplied in place (pmulhrsw) by a broadcast
;        constant. m8, m9, m11 and m12 rotate as constant registers, and
;        each reload is issued right after the multiply that consumed the
;        previous value.
; Step 2: eight saturating butterflies. The difference goes to the register
;        named on the psubsw line and the sum stays in the first operand
;        of the paddsw line (see the tNN annotations).
; Clobb: m8, m9, m11, m12.
; Exit:  falls through into .main_oddhalf2.
;-----------------------------------------------------------------------
5263*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5264*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf
5265*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_101_4095x8)]
5266*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m2824_2967x8)]
5267*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1660_3745x8)]
5268*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m1474_3822x8)]
5269*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m8       ; t32a t63a
5270*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_897_3996x8)]
5271*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m9       ; t33a t62a
5272*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m2191_3461x8)]
5273*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m11      ; t34a t61a
5274*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2359_3349x8)]
5275*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m12      ; t35a t60a
5276*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m700_4036x8)]
5277*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m26, m8       ; t36a t59a
5278*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_501_4065x8)]
5279*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m9       ; t37a t58a
5280*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m2520_3229x8)]
5281*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m11      ; t38a t57a
5282*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2019_3564x8)]
5283*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m12      ; t39a t56a
5284*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m1092_3948x8)]
5285*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m8       ; t40a t55a
5286*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_1285_3889x8)]
5287*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m9       ; t41a t54a
5288*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m1842_3659x8)]
5289*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m11      ; t42a t53a
5290*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2675_3102x8)]
5291*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m12      ; t43a t52a
5292*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m301_4085x8)]
5293*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m28, m8       ; t44a t51a
5294*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m9       ; t45a t50a
5295*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m11      ; t46a t49a
5296*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m12      ; t47a t48a
; Butterfly stage: saturating difference into a fresh register, then the
; saturating sum in place.
5297*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m22, m21 ; t33  t62
5298*c0909341SAndroid Build Coastguard Worker    paddsw              m22, m21      ; t32  t63
5299*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m29, m14 ; t34  t61
5300*c0909341SAndroid Build Coastguard Worker    paddsw              m29, m14      ; t35  t60
5301*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m26, m17 ; t37  t58
5302*c0909341SAndroid Build Coastguard Worker    paddsw              m26, m17      ; t36  t59
5303*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m25, m18 ; t38  t57
5304*c0909341SAndroid Build Coastguard Worker    paddsw              m25, m18      ; t39  t56
5305*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m24, m19 ; t41  t54
5306*c0909341SAndroid Build Coastguard Worker    paddsw              m24, m19      ; t40  t55
5307*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m27, m16 ; t42  t53
5308*c0909341SAndroid Build Coastguard Worker    paddsw              m27, m16      ; t43  t52
5309*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m28, m15 ; t45  t50
5310*c0909341SAndroid Build Coastguard Worker    paddsw              m28, m15      ; t44  t51
5311*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m23, m20 ; t46  t49
5312*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m23      ; t47  t48
; .main_oddhalf2: remaining butterfly stages of the odd half (the t32..t63
; terms named in the per-line comments), followed by the final add/subtract
; against m0-m7.
;   Reached by fall-through from the code above.
;   NOTE(review): probably also a jump target for a reduced-input variant of
;   the routine above - no such jump is visible in this part of the file.
;   In:  m0-m7 (combined with the final t-values at the end), m8, m14-m29
;        holding the t-values named in the comments of the preceding stage,
;        m10 (passed unchanged as the 4th argument of every ITX_MUL2X_PACK).
;   Out: m0-m7  = out0..out15  (two outputs per register, see comments)
;        m22-m29 = out48..out63
;   m9, m11, m12 and m13 are overwritten with constants/temporaries here.
;   ITX_MUL2X_PACK is defined elsewhere in the file; its exact semantics are
;   not restated here.
5313*c0909341SAndroid Build Coastguard Worker.main_oddhalf2:
5314*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 9, 23, 10,   401, 4076, 5 ; t33a t62a
5315*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       21, 9, 23, 10, m4076,  401, 5 ; t34a t61a
5316*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14, 9, 23, 10,  3166, 2598, 5 ; t37a t58a
5317*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
5318*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       18, 9, 23, 10,  1931, 3612, 5 ; t41a t54a
5319*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
5320*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 9, 23, 10,  3920, 1189, 5 ; t45a t50a
5321*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
; m11 (and later m12) hold the broadcast coefficient pair used by the
; register-operand forms of ITX_MUL2X_PACK below; they are reloaded before
; each group of rotations.
5322*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m4017_799)]
5323*c0909341SAndroid Build Coastguard Worker    psubsw              m23, m25, m26 ; t36a t59a
5324*c0909341SAndroid Build Coastguard Worker    paddsw              m25, m26      ; t39a t56a
5325*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m24, m27 ; t43a t52a
5326*c0909341SAndroid Build Coastguard Worker    paddsw              m27, m24      ; t40a t55a
5327*c0909341SAndroid Build Coastguard Worker    psubsw              m24, m20, m28 ; t44a t51a
5328*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m28      ; t47a t48a
5329*c0909341SAndroid Build Coastguard Worker    psubsw              m28, m8, m21  ; t34  t61
5330*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m21      ; t33  t62
5331*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m17, m14 ; t37  t58
5332*c0909341SAndroid Build Coastguard Worker    paddsw              m17, m14      ; t38  t57
5333*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m18, m19 ; t42  t53
5334*c0909341SAndroid Build Coastguard Worker    paddsw              m18, m19      ; t41  t54
5335*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m15, m16 ; t45  t50
5336*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m16      ; t46  t49
5337*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m22, m29 ; t35a t60a
5338*c0909341SAndroid Build Coastguard Worker    paddsw              m22, m29      ; t32a t63a
5339*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 9, 29, 10, 799_4017, 11,    20 ; t35  t60
5340*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       28, 9, 29, 10, 799_4017, 11,    20 ; t34a t61a
5341*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       23, 9, 29, 10, 11, m799_m4017,  36 ; t36  t59
5342*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       21, 9, 29, 10, 11, m799_m4017,  36 ; t37a t58a
5343*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m2276_3406)]
5344*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       26, 9, 29, 10, 3406_2276, 11,   20 ; t43  t52
5345*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14, 9, 29, 10, 3406_2276, 11,   20 ; t42a t53a
5346*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       24, 9, 29, 10, 11, m3406_m2276, 36 ; t44  t51
5347*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
5348*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1567_3784)]
5349*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m3784_1567)]
5350*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m22, m25 ; t39  t56
5351*c0909341SAndroid Build Coastguard Worker    paddsw              m22, m25      ; t32  t63
5352*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m20, m27 ; t40  t55
5353*c0909341SAndroid Build Coastguard Worker    paddsw              m20, m27      ; t47  t48
5354*c0909341SAndroid Build Coastguard Worker    psubsw              m27, m8, m17  ; t38a t57a
5355*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m17      ; t33a t62a
5356*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m15, m18 ; t41a t54a
5357*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m18      ; t46a t49a
5358*c0909341SAndroid Build Coastguard Worker    paddsw              m18, m16, m23 ; t35a t60a
5359*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m23      ; t36a t59a
5360*c0909341SAndroid Build Coastguard Worker    psubsw              m23, m24, m26 ; t43a t52a
5361*c0909341SAndroid Build Coastguard Worker    paddsw              m24, m26      ; t44a t51a
5362*c0909341SAndroid Build Coastguard Worker    paddsw              m26, m28, m21 ; t34  t61
5363*c0909341SAndroid Build Coastguard Worker    psubsw              m28, m21      ; t37  t58
5364*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m19, m14 ; t42  t53
5365*c0909341SAndroid Build Coastguard Worker    paddsw              m19, m14      ; t45  t50
5366*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       29, 9, 14, 10, 11, 12, 4 ; t39a t56a
5367*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       27, 9, 14, 10, 11, 12, 4 ; t38  t57
5368*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       16, 9, 14, 10, 11, 12, 4 ; t36  t59
5369*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       28, 9, 14, 10, 11, 12, 4 ; t37a t58a
5370*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m1567_m3784)]
5371*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       25, 9, 14, 10, 12, 11, 4 ; t40a t55a
5372*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 9, 14, 10, 12, 11, 4 ; t41  t54
5373*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       23, 9, 14, 10, 12, 11, 4 ; t43  t52
5374*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       21, 9, 14, 10, 12, 11, 4 ; t42a t53a
; m13 = deint_shuf byte-shuffle control, applied with pshufb to the eight
; registers that do not go through the last group of rotations below.
5375*c0909341SAndroid Build Coastguard Worker    vbroadcasti32x4     m13, [o(deint_shuf)]
5376*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2896_2896)]
5377*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m2896_2896)]
5378*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m22, m20 ; t32a t63a
5379*c0909341SAndroid Build Coastguard Worker    psubsw              m22, m20      ; t47a t48a
5380*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m8, m15  ; t46  t49
5381*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m15      ; t33  t62
5382*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m18, m24 ; t35  t60
5383*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m24      ; t44  t51
5384*c0909341SAndroid Build Coastguard Worker    psubsw              m24, m26, m19 ; t45a t50a
5385*c0909341SAndroid Build Coastguard Worker    paddsw              m26, m19      ; t34a t61a
5386*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m13}, m14, m8, m15, m26
5387*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m29, m25 ; t40  t55
5388*c0909341SAndroid Build Coastguard Worker    paddsw              m25, m29      ; t39  t56
5389*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m27, m17 ; t41a t54a
5390*c0909341SAndroid Build Coastguard Worker    paddsw              m27, m17      ; t38a t57a
5391*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m16, m23 ; t43a t52a
5392*c0909341SAndroid Build Coastguard Worker    paddsw              m16, m23      ; t36a t59a
5393*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m28, m21 ; t42  t53
5394*c0909341SAndroid Build Coastguard Worker    paddsw              m28, m21      ; t37  t58
5395*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m13}, m25, m27, m16, m28
; From here m13 is no longer the shuffle control: it is passed to
; ITX_MUL2X_PACK as a working register and ends up holding "t48 t49a".
; Each pair of ITX_MUL2X_PACK calls is followed by two packssdw that narrow
; and pair up the results as named in the comments.
5396*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       22, 13, 21, 10, 11, 12, 8 ; t47  t48
5397*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       20, 23, 22, 10, 11, 12, 8 ; t46a t49a
5398*c0909341SAndroid Build Coastguard Worker    packssdw            m21, m22      ; t47  t46a
5399*c0909341SAndroid Build Coastguard Worker    packssdw            m13, m23      ; t48  t49a
5400*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       18, 22, 20, 10, 11, 12, 8 ; t44a t51a
5401*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       24, 23, 18, 10, 11, 12, 8 ; t45  t50
5402*c0909341SAndroid Build Coastguard Worker    packssdw            m20, m18      ; t44a t45
5403*c0909341SAndroid Build Coastguard Worker    packssdw            m22, m23      ; t51a t50
5404*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       19, 24, 18, 10, 11, 12, 8 ; t40a t55a
5405*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       29, 23, 19, 10, 11, 12, 8 ; t41  t54
5406*c0909341SAndroid Build Coastguard Worker    packssdw            m18, m19      ; t40a t41
5407*c0909341SAndroid Build Coastguard Worker    packssdw            m24, m23      ; t55a t54
5408*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       17, 23, 19, 10, 11, 12, 8 ; t43  t52
5409*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        9, 29, 17, 10, 11, 12, 8 ; t42a t53a
5410*c0909341SAndroid Build Coastguard Worker    packssdw            m19, m17      ; t43  t42a
5411*c0909341SAndroid Build Coastguard Worker    packssdw            m23, m29      ; t52  t53a
; Regroup the 64-bit halves of the pshufb'ed registers so that each register
; holds the pair of t-values named in the comment.
5412*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m17, m25, m27 ; t39  t38a
5413*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m27      ; t56  t57a
5414*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m15, m26 ; t60  t61a
5415*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m26      ; t35  t34a
5416*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m16, m28 ; t59a t58
5417*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m28      ; t36a t37
5418*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m28, m14, m8  ; t63a t62
5419*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m8       ; t32a t33
; Final stage: saturating sum/difference of m0-m7 with the t48..t63 halves.
; Differences go to m22-m29 (out48..out63), sums stay in m0-m7 (out0..out15).
; m14-m21 are left holding the t32..t47 pairs and are not combined here.
5420*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m0, m28  ; out63 out62
5421*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m28      ; out0  out1
5422*c0909341SAndroid Build Coastguard Worker    psubsw              m28, m1, m27  ; out60 out61
5423*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m27      ; out3  out2
5424*c0909341SAndroid Build Coastguard Worker    psubsw              m27, m2, m26  ; out59 out58
5425*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m26      ; out4  out5
5426*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m3, m25  ; out56 out57
5427*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m25      ; out7  out6
5428*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m4, m24  ; out55 out54
5429*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m24      ; out8  out9
5430*c0909341SAndroid Build Coastguard Worker    psubsw              m24, m5, m23  ; out52 out53
5431*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m23      ; out11 out10
5432*c0909341SAndroid Build Coastguard Worker    psubsw              m23, m6, m22  ; out51 out50
5433*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m22      ; out12 out13
5434*c0909341SAndroid Build Coastguard Worker    psubsw              m22, m7, m13  ; out48 out49
5435*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m13      ; out15 out14
5436*c0909341SAndroid Build Coastguard Worker    ret
5437*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_64x16_8bpc(dst, stride, c, eob)
;   Inverse DCT/DCT transform of a 64x16 block whose result is added to the
;   8-bit pixels at dst (names taken from the cglobal line below).
;   cglobal arguments "4, 7, 0": 4 named arguments, 7 general-purpose
;   registers; the vector register count is declared later, in .normal.
;   This first part handles eob == 0: only the first coefficient is used and
;   one rounded constant is added to every pixel of 16 rows. Any other eob
;   value branches to .normal.
;   NOTE(review): .dconly and .dconly2 look like entry points shared with
;   other transform sizes (r6d = DC value, r3d = row count on entry); no such
;   jump is visible in this part of the file - confirm before changing them.
5438*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
; Drop any macro named cmp so the later "cmp eobd, 151" assembles as the
; plain instruction (the macro itself is defined outside this part).
5439*c0909341SAndroid Build Coastguard Worker%undef cmp
; r5 = o_base; presumably the base register behind the o() constant
; references used further down (o() is defined elsewhere).
5440*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5441*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5442*c0909341SAndroid Build Coastguard Worker    jnz .normal
; DC-only: r6d = first coefficient, sign-extended from 16 bits.
5443*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
; eobd is known to be 0 here, so this is a 32-bit store of zero that clears
; the coefficient just read.
5444*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
; eob is the 4th argument (r3) and is 0, so this sets r3d = 16, the number
; of rows processed by .dconly_loop.
5445*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
5446*c0909341SAndroid Build Coastguard Worker.dconly:
; r6d = (r6d*181 + 128 + 512) >> 10   (181/256 is approximately 1/sqrt(2))
5447*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
5448*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+512
5449*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+2
5450*c0909341SAndroid Build Coastguard Worker.dconly2:
; r6d = (r6d*181 + 128 + 2048) >> 12
5451*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
5452*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+2048
5453*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+4
; m2 = 0 (for byte->word zero extension), m3 = low 16 bits of r6d in every
; word lane.
5454*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5455*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m3, r6d
5456*c0909341SAndroid Build Coastguard Worker.dconly_loop:
; One iteration per row: load one full vector of pixels, widen the bytes to
; words by interleaving with zero, add the DC value, then pack back with
; unsigned saturation (clamps to 0..255) and store.
5457*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq]
5458*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2
5459*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
5460*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
5461*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
5462*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
5463*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
5464*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
; r3d counts the remaining rows; loop while it is still positive.
5465*c0909341SAndroid Build Coastguard Worker    dec                 r3d
5466*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
5467*c0909341SAndroid Build Coastguard Worker    RET
; .normal: full transform path, taken when eob != 0.
;   Pass 1 is assembled from helper routines that belong to other functions
;   (idct_16x16 .main_fast*, 16x32 .main_oddhalf_fast*, 16x64 .main_oddhalf*)
;   and is finished by .pass1_end. .pass2 is then run twice, and the results
;   are added to dst by the IDCT_64x16_END macro, which also zeroes the
;   coefficient buffer.
;   The coefficient buffer itself is reused as scratch space: rows 0-7 receive
;   m14-m21 between helper calls, rows 8-15 receive m4-m7 / m26-m29 at .end.
5468*c0909341SAndroid Build Coastguard Worker.normal:
; x86inc macro; declares that 31 vector registers are used from here on.
5469*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      31
; m19 = byte-permutation indices (dup16_perm). Coefficient rows 0-7 (64 bytes
; each) are loaded raw into m22-m29.
5470*c0909341SAndroid Build Coastguard Worker    mova                m19, [o(dup16_perm)]
5471*c0909341SAndroid Build Coastguard Worker    mova                m24, [cq+64* 2]
5472*c0909341SAndroid Build Coastguard Worker    mova                m28, [cq+64* 6]
5473*c0909341SAndroid Build Coastguard Worker    mova                m26, [cq+64* 4]
5474*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64* 0]
5475*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+64* 1]
5476*c0909341SAndroid Build Coastguard Worker    mova                m29, [cq+64* 7]
5477*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+64* 5]
5478*c0909341SAndroid Build Coastguard Worker    mova                m25, [cq+64* 3]
; First permutation of each row (indices m19) goes to a separate register;
; the trailing numbers are the transform input indices these feed.
5479*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m19, m24        ;  4
5480*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m19, m28        ; 12
5481*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m19, m26        ;  8
5482*c0909341SAndroid Build Coastguard Worker    vpermb               m9, m19, m22        ; __  0
5483*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m19, m23        ;  2
5484*c0909341SAndroid Build Coastguard Worker    vpermb              m15, m19, m29        ; 14
5485*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m19, m27        ; 10
5486*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m19, m25        ;  6
; Move input 0 into the upper word of every dword, leaving the lower word
; zero (the "__  0" layout noted above).
5487*c0909341SAndroid Build Coastguard Worker    pslld                m9, 16
; m30 = m19 with the pb_32 constant OR'ed into every dword (embedded
; broadcast). Presumably this sets bit 5 of each byte index so that vpermb
; picks the other 32 bytes of each row - confirm against pb_32's definition.
; The raw rows in m22-m29 are then re-permuted in place with m30.
5488*c0909341SAndroid Build Coastguard Worker    vpord               m30, m19, [o(pb_32)] {1to16}
5489*c0909341SAndroid Build Coastguard Worker    REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
; eob < 151 (unsigned compare): take the path that never reads coefficient
; rows 8-15.
5490*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 151
5491*c0909341SAndroid Build Coastguard Worker    jb .fast
; Full path: bring in rows 14, 10, 12 and 8 for the 16x16 helper.
5492*c0909341SAndroid Build Coastguard Worker    vpermb               m0, m19, [cq+64*14] ; 28
5493*c0909341SAndroid Build Coastguard Worker    vpermb               m5, m19, [cq+64*10] ; 20
5494*c0909341SAndroid Build Coastguard Worker    vpermb               m3, m19, [cq+64*12] ; 24
5495*c0909341SAndroid Build Coastguard Worker    vpermb               m6, m19, [cq+64* 8] ; __ 16
5496*c0909341SAndroid Build Coastguard Worker    pslld                m6, 16
5497*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast
; Rows 15, 9, 11 and 13 for the 16x32 odd-half helper. The last vpermb
; overwrites m19 itself, so the m19 index vector is gone after this point;
; later permutes use m30.
5498*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m19, [cq+64*15] ; 30
5499*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m19, [cq+64* 9] ; 18
5500*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m19, [cq+64*11] ; 22
5501*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m19, [cq+64*13] ; 26
5502*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
; Park m14-m21 in coefficient rows 0-7 (already consumed above); they are
; reloaded by .pass1_end.
5503*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
5504*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m15
5505*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m16
5506*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m17
5507*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m18
5508*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m19
5509*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m20
5510*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m21
; Rows 8-15 are still untouched in memory: load them with the m30
; permutation into m14-m21 as additional inputs for the 16x64 odd half.
5511*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m30, [cq+64*15]
5512*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m30, [cq+64* 8]
5513*c0909341SAndroid Build Coastguard Worker    vpermb              m17, m30, [cq+64*11]
5514*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m30, [cq+64*12]
5515*c0909341SAndroid Build Coastguard Worker    vpermb              m19, m30, [cq+64*13]
5516*c0909341SAndroid Build Coastguard Worker    vpermb              m16, m30, [cq+64*10]
5517*c0909341SAndroid Build Coastguard Worker    vpermb              m15, m30, [cq+64* 9]
5518*c0909341SAndroid Build Coastguard Worker    vpermb              m20, m30, [cq+64*14]
5519*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
5520*c0909341SAndroid Build Coastguard Worker    jmp .end
; Same sequence using the reduced-input variants of the three helpers; only
; the registers prepared from rows 0-7 above are supplied.
5521*c0909341SAndroid Build Coastguard Worker.fast: ; bottom half is zero
5522*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast2
5523*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
5524*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
5525*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m15
5526*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m16
5527*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m17
5528*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m18
5529*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m19
5530*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m20
5531*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m21
5532*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
5533*c0909341SAndroid Build Coastguard Worker.end:
; Park m4-m7 and m26-m29 (as left by the odd-half helper) in coefficient
; rows 8-15; .pass1_end reuses those register numbers.
5534*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m4
5535*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m5
5536*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m6
5537*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m7
5538*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m26
5539*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m27
5540*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m28
5541*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m29
; m13 = pass-1 rounding multiplier for pmulhrsw. If pw_8192 holds 8192 as
; its name suggests, pmulhrsw by it computes (x + 2) >> 2.
5542*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_8192)]
5543*c0909341SAndroid Build Coastguard Worker    call .pass1_end
5544*c0909341SAndroid Build Coastguard Worker    call .pass2
; Save the first .pass2 results m0-m7 unscaled in coefficient rows 0-7;
; IDCT_64x16_END scales them when it reads them back.
5545*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m0
5546*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m1
5547*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m2
5548*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m3
5549*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m4
5550*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m5
5551*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m6
5552*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m7
; Prepare the inputs of the second transpose + .pass2 run from the pass-1
; values that were not consumed by the first run, scaling each by m13:
;   m0-m3   <- coefficient rows 8-11 (parked at .end)
;   m4-m7   <- m22-m25
;   m14-m17 <- m26-m29
;   m18-m21 <- coefficient rows 12-15 (parked at .end)
; Interleaved with that, the first .pass2 results m14-m21 are scaled by m30
; and moved to m22-m29. The instruction order matters: every source register
; is read before it is overwritten.
5553*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13, [cq+64* 8]
5554*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13, [cq+64* 9]
5555*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13, [cq+64*10]
5556*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13, [cq+64*11]
; m30 = final rounding multiplier. If pw_2048 holds 2048 as its name
; suggests, pmulhrsw by it computes (x + 8) >> 4.
5557*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m30, [o(pw_2048)]
5558*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m13, m22
5559*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m13, m23
5560*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m13, m24
5561*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m13, m25
5562*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m30, m14
5563*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m13, m26
5564*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m30, m15
5565*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m13, m27
5566*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m30, m16
5567*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m13, m28
5568*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m30, m17
5569*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m13, m29
5570*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m26, m30, m18
5571*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m13, [cq+64*12]
5572*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m30, m19
5573*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m13, [cq+64*13]
5574*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m28, m30, m20
5575*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m13, [cq+64*14]
5576*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m30, m21
5577*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m13, [cq+64*15]
5578*c0909341SAndroid Build Coastguard Worker    call .transpose_round
5579*c0909341SAndroid Build Coastguard Worker    call .pass2
; m10 = 0 (used both for byte->word unpacking and for clearing the
; coefficient buffer), r3 = 3*stride for addressing the 4th row of a group.
5580*c0909341SAndroid Build Coastguard Worker    pxor                m10, m10
5581*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
; IDCT_64x16_END row, hi, lo, dst_offset
;   Adds one output row to the vector of pixels at [dstq+%4] and stores it
;   back with unsigned saturation.
;   %1 row index 0..15: coefficient row %1 is overwritten with zero; for
;      rows < 8 it is first read back and scaled by m30 into m%3.
;   %2 register added to the punpckhbw (high) half; scaled by m30 here.
;   %3 register added to the punpcklbw (low) half; for rows >= 8 it must
;      already be scaled (done above for m22-m29).
;   %4 byte offset from dstq.
;   Needs m10 = 0 and m30 = rounding multiplier; clobbers m8 and m9.
;   After rows 3, 7 and 11 dstq is advanced by four rows.
5582*c0909341SAndroid Build Coastguard Worker%macro IDCT_64x16_END 4
5583*c0909341SAndroid Build Coastguard Worker    mova                 m9, [dstq+%4]
5584*c0909341SAndroid Build Coastguard Worker%if %1 < 8
5585*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%3, m30, [cq+64*%1]
5586*c0909341SAndroid Build Coastguard Worker%endif
5587*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m30
5588*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*%1], m10
5589*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m9, m10
5590*c0909341SAndroid Build Coastguard Worker    punpckhbw            m9, m10
5591*c0909341SAndroid Build Coastguard Worker    paddw                m8, m%3
5592*c0909341SAndroid Build Coastguard Worker    paddw                m9, m%2
5593*c0909341SAndroid Build Coastguard Worker    packuswb             m8, m9
5594*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%4], m8
5595*c0909341SAndroid Build Coastguard Worker%if %1 == 3 || %1 == 7 || %1 == 11
5596*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
5597*c0909341SAndroid Build Coastguard Worker%endif
5598*c0909341SAndroid Build Coastguard Worker%endmacro
; Rows 0-7: high half from m0-m7 (second .pass2 run), low half reloaded from
; the coefficient buffer through the temporary m11.
5599*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        0,  0, 11, strideq*0
5600*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        1,  1, 11, strideq*1
5601*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        2,  2, 11, strideq*2
5602*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        3,  3, 11, r3
5603*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        4,  4, 11, strideq*0
5604*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        5,  5, 11, strideq*1
5605*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        6,  6, 11, strideq*2
5606*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        7,  7, 11, r3
; Rows 8-15: high half from m14-m21 (second .pass2 run), low half from the
; pre-scaled m22-m29 (first .pass2 run).
5607*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        8, 14, 22, strideq*0
5608*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END        9, 15, 23, strideq*1
5609*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END       10, 16, 24, strideq*2
5610*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END       11, 17, 25, r3
5611*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END       12, 18, 26, strideq*0
5612*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END       13, 19, 27, strideq*1
5613*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END       14, 20, 28, strideq*2
5614*c0909341SAndroid Build Coastguard Worker    IDCT_64x16_END       15, 21, 29, r3
5615*c0909341SAndroid Build Coastguard Worker    RET
; .pass1_end: last pass-1 butterflies for outputs 16..47, then falls through
; into .transpose_round.
;   In:  coefficient rows 0-7 (values parked there by the caller), m14-m21,
;        m0-m3 and m22-m25 (still unscaled), m13 = rounding multiplier.
;   The saturating sums/differences below produce out16..out47 as named in
;   the comments. Scaled by m13 here: m0-m3, m4-m7, m14-m17, and the incoming
;   m22-m25 (result written to m18-m21).
;   Left unscaled for the caller: m22-m25 (out24..out31) and m26-m29
;   (out40..out47).
5616*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5617*c0909341SAndroid Build Coastguard Worker.pass1_end:
5618*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64* 0]
5619*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64* 1]
5620*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64* 2]
5621*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64* 3]
5622*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+64* 4]
5623*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+64* 5]
5624*c0909341SAndroid Build Coastguard Worker    mova                m11, [cq+64* 6]
5625*c0909341SAndroid Build Coastguard Worker    mova                m12, [cq+64* 7]
5626*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m4, m21  ; out47 out46
5627*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m21      ; out16 out17
5628*c0909341SAndroid Build Coastguard Worker    psubsw              m28, m5, m20  ; out44 out45
5629*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m20      ; out19 out18
5630*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m13}, m0, m1, m2, m3
5631*c0909341SAndroid Build Coastguard Worker    psubsw              m27, m6, m19  ; out43 out42
5632*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m19      ; out20 out21
5633*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m7, m18  ; out40 out41
5634*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m18      ; out23 out22
; m18-m21 are free now: fill them with the scaled copies of the incoming
; m22-m25 before those registers are reused for out24..out31 below.
5635*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m13, m22
5636*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m13, m23
5637*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m13, m24
5638*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m13, m25
5639*c0909341SAndroid Build Coastguard Worker    paddsw              m25, m12, m14 ; out31 out30
5640*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m12, m14 ; out32 out33
5641*c0909341SAndroid Build Coastguard Worker    paddsw              m24, m11, m15 ; out28 out29
5642*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m11, m15 ; out35 out34
5643*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m13}, m4, m5, m6, m7
5644*c0909341SAndroid Build Coastguard Worker    paddsw              m23, m9, m16  ; out27 out26
5645*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m9, m16  ; out36 out37
5646*c0909341SAndroid Build Coastguard Worker    paddsw              m22, m8, m17  ; out24 out25
5647*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m8, m17  ; out39 out38
5648*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m13}, m14, m15, m16, m17
; .transpose_round: transposes four groups of four registers (m0-m3, m18-m21,
; m4-m7, m14-m17) and then regroups their 256-bit halves. No scaling happens
; after this label; the inputs are expected to be scaled already, either by
; .pass1_end above or by the caller.
;   Out: m8, m0, m4, m9, m5, m1, m6, m11, m2, m3, m18, m15, m19, m16, m20, m17
;        (contents as named in the comments of the shuffle lines below).
5649*c0909341SAndroid Build Coastguard Worker.transpose_round:
; TRANSPOSE_8x4_PACKED a, b, c, d
;   Word-interleave network over registers m%1..m%4; the results (labelled
;   0..3 in the comments) end up in m%1, m%2, m%3, m%4 respectively.
;   Clobbers m8.
5650*c0909341SAndroid Build Coastguard Worker%macro TRANSPOSE_8x4_PACKED 4
5651*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
5652*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m%3      ; a0 e0 a1 e1 a2 e2 a3 e3
5653*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
5654*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%2, m%4      ; c0 g0 c1 g1 c2 g2 c3 g3
5655*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
5656*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m%2      ; a0 c0 e0 g0 a1 c1 e1 g1
5657*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%2, m8, m%3  ; b2 d2 f2 h2 b3 d3 f3 h3
5658*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m%3      ; b0 d0 f0 h0 b1 d1 f1 h1
5659*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%3, m%4, m%2 ; 2
5660*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%4, m%2      ; 3
5661*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%2, m%1, m8  ; 1
5662*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m8       ; 0
5663*c0909341SAndroid Build Coastguard Worker%endmacro
5664*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_PACKED  0,  1,  2,  3
5665*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_PACKED 18, 19, 20, 21
5666*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_PACKED  4,  5,  6,  7
5667*c0909341SAndroid Build Coastguard Worker    TRANSPOSE_8x4_PACKED 14, 15, 16, 17
; Pair up registers from two transposed groups: vshufi32x4 with q3232 takes
; the upper 256 bits of both sources, vinserti32x8 ..., 1 keeps the lower
; 256 bits of the first source and puts the lower 256 bits of the second
; source on top.
5668*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m0, m4, q3232   ; a02 a03 b02 b03
5669*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym4, 1          ; a00 a01 b00 b01
5670*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m1, m5, q3232   ; a12 a13 b12 b13
5671*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, m1, ym5, 1      ; a10 a11 b10 b11
5672*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m2, m6, q3232   ; a22 a23 b22 b23
5673*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m2, ym6, 1      ; a20 a21 b20 b21
5674*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m3, m7, q3232   ; a32 a33 b32 b33
5675*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m11, m3, ym7, 1      ; a30 a31 b30 b31
5676*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m14, m18, q3232 ; c02 c03 d02 d03
5677*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, m14, ym18, 1    ; c00 c01 d00 d01
5678*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m15, m19, q3232 ; c12 c13 d12 d13
5679*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m15, ym19, 1         ; c10 c11 d10 d11
5680*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m16, m20, q3232 ; c22 c23 d22 d23
5681*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, ym20, 1         ; c20 c21 d20 d21
5682*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m17, m21, q3232 ; c32 c33 d32 d33
5683*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, ym21, 1         ; c30 c31 d30 d31
5684*c0909341SAndroid Build Coastguard Worker    ret
5685*c0909341SAndroid Build Coastguard Worker.pass2:
5686*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m7, m5, m19, q3131  ; 14
5687*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m5, m19, q2020      ; 10
5688*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m6, m20, q3131  ; 15
5689*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m6, m20, q2020  ; 11
5690*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m4, m18, q3131  ; 13
5691*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m4, m18, q2020  ;  9
5692*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m6, m8, m2, q3131   ; 12
5693*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m4, m8, m2, q2020   ;  8
5694*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m3, q3131   ;  4
5695*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q2020       ;  0
5696*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m1, m16, q3131  ;  6
5697*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m16, q2020      ;  2
5698*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m9, m15, q3131  ;  5
5699*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m9, m15, q2020  ;  1
5700*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m11, m17, q2020 ;  3
5701*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m11, m17, q3131 ;  7
5702*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
5703*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
5704*c0909341SAndroid Build Coastguard Worker
5705*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
5706*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
5707*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5708*c0909341SAndroid Build Coastguard Worker    jz .dconly
5709*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 9, 30, 64*32, dst, stride, c, eob
5710*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m23, [o(pw_2896x8)]
5711*c0909341SAndroid Build Coastguard Worker%undef cmp
5712*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
5713*c0909341SAndroid Build Coastguard Worker    jb .fast
5714*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m23, [cq+64*20]
5715*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m23, [cq+64*12]
5716*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m23, [cq+64* 4]
5717*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m23, [cq+64*28]
5718*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m23, [cq+64* 8]
5719*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m23, [cq+64*24]
5720*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m23, [cq+64* 0]
5721*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m23, [cq+64*16]
5722*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
5723*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m23, [cq+64* 2]
5724*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m23, [cq+64*30]
5725*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m23, [cq+64*18]
5726*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m23, [cq+64*14]
5727*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m23, [cq+64*10]
5728*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m23, [cq+64*22]
5729*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m23, [cq+64*26]
5730*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m23, [cq+64* 6]
5731*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
5732*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
5733*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m15
5734*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m16
5735*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m17
5736*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m18
5737*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m19
5738*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m20
5739*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m21
5740*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m23, [cq+64* 1]
5741*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m23, [cq+64*31]
5742*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m23, [cq+64*17]
5743*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m23, [cq+64*15]
5744*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m26, m23, [cq+64* 9]
5745*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m23, [cq+64*23]
5746*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m23, [cq+64*25]
5747*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m23, [cq+64* 7]
5748*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m23, [cq+64* 5]
5749*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m23, [cq+64*27]
5750*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m23, [cq+64*21]
5751*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m23, [cq+64*11]
5752*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m28, m23, [cq+64*13]
5753*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m23, [cq+64*19]
5754*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m23, [cq+64*29]
5755*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23,      [cq+64* 3]
5756*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
5757*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_16384)]
5758*c0909341SAndroid Build Coastguard Worker    psubsw              m13, m0, m29 ; 31
5759*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m29     ;  0
5760*c0909341SAndroid Build Coastguard Worker    psubsw              m29, m1, m28 ; 30
5761*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m28     ;  1
5762*c0909341SAndroid Build Coastguard Worker    psubsw              m28, m2, m27 ; 29
5763*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m27     ;  2
5764*c0909341SAndroid Build Coastguard Worker    psubsw              m27, m3, m26 ; 28
5765*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m26     ;  3
5766*c0909341SAndroid Build Coastguard Worker    psubsw              m26, m4, m25 ; 27
5767*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m25     ;  4
5768*c0909341SAndroid Build Coastguard Worker    psubsw              m25, m5, m24 ; 26
5769*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m24     ;  5
5770*c0909341SAndroid Build Coastguard Worker    psubsw              m24, m6, m23 ; 25
5771*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m23     ;  6
5772*c0909341SAndroid Build Coastguard Worker    psubsw              m23, m7, m22 ; 24
5773*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m22     ;  7
5774*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
5775*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
5776*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
5777*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
5778*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
5779*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
5780*c0909341SAndroid Build Coastguard Worker    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
5781*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
5782*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
5783*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
5784*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
5785*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m23, m24
5786*c0909341SAndroid Build Coastguard Worker    punpcklwd           m23, m24
5787*c0909341SAndroid Build Coastguard Worker    punpckhwd           m24, m25, m26
5788*c0909341SAndroid Build Coastguard Worker    punpcklwd           m25, m26
5789*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
5790*c0909341SAndroid Build Coastguard Worker    punpckhwd           m26, m27, m28
5791*c0909341SAndroid Build Coastguard Worker    punpcklwd           m27, m28
5792*c0909341SAndroid Build Coastguard Worker    punpckhwd           m28, m29, m13
5793*c0909341SAndroid Build Coastguard Worker    punpcklwd           m29, m13
5794*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
5795*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
5796*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
5797*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
5798*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
5799*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m7, m0, m2, m4
5800*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
5801*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
5802*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
5803*c0909341SAndroid Build Coastguard Worker    punpckldq           m22, m5      ; e4 f4 g4 h4 e5 f5 g5 h5
5804*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m6, m8, m1, m22
5805*c0909341SAndroid Build Coastguard Worker    punpckhdq           m13, m23, m25
5806*c0909341SAndroid Build Coastguard Worker    punpckldq           m23, m25
5807*c0909341SAndroid Build Coastguard Worker    punpckhdq           m25, m27, m29
5808*c0909341SAndroid Build Coastguard Worker    punpckldq           m27, m29
5809*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m13, m23, m25, m27
5810*c0909341SAndroid Build Coastguard Worker    punpckhdq            m9, m3, m24
5811*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m24
5812*c0909341SAndroid Build Coastguard Worker    punpckhdq           m24, m26, m28
5813*c0909341SAndroid Build Coastguard Worker    punpckldq           m26, m28
5814*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m9, m3, m24, m26
5815*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m23, m27 ; d01 d09 d17 d25
5816*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m23, m27      ; d00 d08 d16 d24
5817*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m27, m13, m25 ; d02 d10 d18 d26
5818*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m13, m25      ; d03 d11 d19 d27
5819*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m25, m3, m26  ; d04 d12 d20 d28
5820*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m26      ; d05 d13 d21 d29
5821*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m26, m9, m24  ; d06 d14 d22 d30
5822*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m9, m24      ; d07 d15 d23 d31
5823*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m23
5824*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m27
5825*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m25
5826*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m26
5827*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
5828*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m22      ; a04 a12 a20 a28
5829*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
5830*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m4       ; a00 a08 a16 a24
5831*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
5832*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m7, m2       ; a02 a10 a18 a26
5833*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
5834*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m1       ; a06 a14 a22 a30
5835*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m0
5836*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m7
5837*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m8
5838*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m6
5839*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64* 0]
5840*c0909341SAndroid Build Coastguard Worker    mova                m11, [cq+64* 2]
5841*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+64* 4]
5842*c0909341SAndroid Build Coastguard Worker    mova                m29, [cq+64* 6]
5843*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+64* 8]
5844*c0909341SAndroid Build Coastguard Worker    mova                m26, [cq+64*10]
5845*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*12]
5846*c0909341SAndroid Build Coastguard Worker    mova                m28, [cq+64*14]
5847*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2, m21  ; 23
5848*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m21      ;  8
5849*c0909341SAndroid Build Coastguard Worker    psubsw              m21, m11, m20 ; 22
5850*c0909341SAndroid Build Coastguard Worker    paddsw              m11, m20      ;  9
5851*c0909341SAndroid Build Coastguard Worker    psubsw              m20, m8, m19  ; 21
5852*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m19      ; 10
5853*c0909341SAndroid Build Coastguard Worker    psubsw              m19, m29, m18 ; 20
5854*c0909341SAndroid Build Coastguard Worker    paddsw              m29, m18      ; 11
5855*c0909341SAndroid Build Coastguard Worker    psubsw              m18, m27, m17 ; 19
5856*c0909341SAndroid Build Coastguard Worker    paddsw              m27, m17      ; 12
5857*c0909341SAndroid Build Coastguard Worker    psubsw              m17, m26, m16 ; 18
5858*c0909341SAndroid Build Coastguard Worker    paddsw              m26, m16      ; 13
5859*c0909341SAndroid Build Coastguard Worker    psubsw              m16, m4, m15  ; 17
5860*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m15      ; 14
5861*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m28, m14 ; 16
5862*c0909341SAndroid Build Coastguard Worker    paddsw              m28, m14      ; 15
5863*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m16
5864*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m16
5865*c0909341SAndroid Build Coastguard Worker    punpckhwd           m16, m17, m18
5866*c0909341SAndroid Build Coastguard Worker    punpcklwd           m17, m18
5867*c0909341SAndroid Build Coastguard Worker    punpckhwd           m18, m19, m20
5868*c0909341SAndroid Build Coastguard Worker    punpcklwd           m19, m20
5869*c0909341SAndroid Build Coastguard Worker    punpckhwd           m20, m21, m1
5870*c0909341SAndroid Build Coastguard Worker    punpcklwd           m21, m1
5871*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
5872*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m11      ; i0 j0 i1 j1 i2 j2 i3 j3
5873*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m8, m29  ; k4 l4 k5 l5 k6 l6 k7 l7
5874*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
5875*c0909341SAndroid Build Coastguard Worker    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
5876*c0909341SAndroid Build Coastguard Worker    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
5877*c0909341SAndroid Build Coastguard Worker    punpckhwd           m26, m4, m28  ; o4 p4 o5 p5 o6 p6 o7 p7
5878*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
5879*c0909341SAndroid Build Coastguard Worker    punpckhdq           m28, m2, m8   ; i2 j2 k2 l2 i3 j3 k3 l3
5880*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m8       ; i0 j0 k0 l0 i1 j1 k1 l1
5881*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m27, m4  ; m2 n2 o2 p2 m3 n3 o3 p3
5882*c0909341SAndroid Build Coastguard Worker    punpckldq           m27, m4       ; m0 n0 o0 p0 m1 n1 o1 p1
5883*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m28, m2, m8, m27
5884*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
5885*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
5886*c0909341SAndroid Build Coastguard Worker    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
5887*c0909341SAndroid Build Coastguard Worker    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
5888*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m4, m1, m11, m29
5889*c0909341SAndroid Build Coastguard Worker    punpckhdq           m26, m19, m21
5890*c0909341SAndroid Build Coastguard Worker    punpckldq           m19, m21
5891*c0909341SAndroid Build Coastguard Worker    punpckhdq           m21, m15, m16
5892*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m16
5893*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m26, m19, m21, m15
5894*c0909341SAndroid Build Coastguard Worker    punpckhdq           m16, m18, m20
5895*c0909341SAndroid Build Coastguard Worker    punpckldq           m18, m20
5896*c0909341SAndroid Build Coastguard Worker    punpckhdq           m20, m14, m17
5897*c0909341SAndroid Build Coastguard Worker    punpckldq           m14, m17
5898*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m16, m18, m20, m14
5899*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m28, m8  ; b03 b11 b19 b27
5900*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m28, m8       ; b02 b10 b18 b26
5901*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m8, m2, m27  ; b01 b09 b17 b25
5902*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m27      ; b00 b08 b16 b24
5903*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m27, m1, m29  ; b04 b12 b20 b28
5904*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m29      ; b05 b13 b21 b29
5905*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m29, m4, m11  ; b06 b14 b22 b30
5906*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m11      ; b07 b15 b23 b31
5907*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m2
5908*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m28
5909*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m27
5910*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m29
5911*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
5912*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m20, m26      ; c02 c10 c18 c26
5913*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m26, m14, m19 ; c01 c09 c17 c25
5914*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m19      ; c00 c08 c16 c24
5915*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m28, m15, m18 ; c05 c13 c21 c29
5916*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m18      ; c04 c12 c20 c28
5917*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m29, m21, m16 ; c07 c15 c23 c31
5918*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m21, m16      ; c06 c14 c22 c30
5919*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m14
5920*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m20
5921*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m15
5922*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m21
5923*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m22, m8, q3232  ; a17 a25 b17 b25
5924*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, ym8, 1          ; a01 a09 b01 b09
5925*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m23, m17, q3232 ; a19 a27 b19 b27
5926*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m23, ym17, 1         ; a03 a11 b03 b11
5927*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m24, m1, q3232  ; a21 a29 b21 b29
5928*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, ym1, 1          ; a05 a13 b05 b13
5929*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m25, m4, q3232  ; a23 a31 b23 b31
5930*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m25, ym4, 1          ; a07 a15 b07 b15
5931*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m19, m26, ym5, 1     ; c01 c09 d01 d09
5932*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m5, q3232       ; c17 c25 d17 d25
5933*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, m27, ym13, 1    ; c03 c11 d03 d11
5934*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m13, q3232      ; c19 c27 d19 d27
5935*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m21, m28, ym3, 1     ; c05 c13 d05 d13
5936*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m3, q3232       ; c21 c29 d21 d29
5937*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, m29, ym9, 1     ; c07 c15 d07 d15
5938*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m9, q3232       ; c23 c31 d23 d31
5939*c0909341SAndroid Build Coastguard Worker    mov                  r4, rsp
5940*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m22, m19, q2020 ;  1
5941*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m17, m29, q3131 ; 31
5942*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m14, m26, q2020 ; 17
5943*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m25, m18, q3131 ; 15
5944*c0909341SAndroid Build Coastguard Worker    call .main_part1
5945*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m25, m18, q2020 ;  7
5946*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m14, m26, q3131 ; 25
5947*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m17, m29, q2020 ; 23
5948*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m22, m19, q3131 ;  9
5949*c0909341SAndroid Build Coastguard Worker    call .main_part1
5950*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m24, m21, q2020 ;  5
5951*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m15, m27, q3131 ; 27
5952*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m16, m28, q2020 ; 21
5953*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m23, m20, q3131 ; 11
5954*c0909341SAndroid Build Coastguard Worker    call .main_part1
5955*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m23, m20, q2020 ;  3
5956*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m16, m28, q3131 ; 29
5957*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m15, m27, q2020 ; 19
5958*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m24, m21, q3131 ; 13
5959*c0909341SAndroid Build Coastguard Worker    call .main_part1
5960*c0909341SAndroid Build Coastguard Worker    call .main_part2
5961*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 1] ; a0
5962*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64* 0] ; b0
5963*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 2] ; c0
5964*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64* 3] ; d0
5965*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64* 5] ; a4
5966*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+64* 4] ; b4
5967*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64* 6] ; c4
5968*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 7] ; d4
5969*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m0, m15, q3232  ; a16 a24 b16 b24
5970*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m0, ym15, 1         ; a00 a08 b00 b08
5971*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m3, m16, q3232  ; c16 c24 d16 d24
5972*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m3, ym16, 1         ; c00 c08 d00 d08
5973*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m14, m8, q3232  ; a20 a28 b20 b28
5974*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m14, ym8, 1          ; a04 a12 b04 b12
5975*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m17, m1, q3232  ; c20 c28 d20 d28
5976*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m17, ym1, 1          ; c04 c12 d04 d12
5977*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m0, m3, q3131   ;  8
5978*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q2020       ;  0
5979*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m2, m15, q3131  ; 24
5980*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m15, q2020      ; 16
5981*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m14, m17, q3131 ; 12
5982*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m17, q2020      ;  4
5983*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m16, m8, q3131  ; 28
5984*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m8, q2020       ; 20
5985*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
5986*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+64* 8]
5987*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+64*12]
5988*c0909341SAndroid Build Coastguard Worker    mova                m11, [cq+64*10]
5989*c0909341SAndroid Build Coastguard Worker    mova                m12, [cq+64*14]
5990*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
5991*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m15
5992*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m16
5993*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m17
5994*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m18
5995*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m19
5996*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m20
5997*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m21
5998*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64* 9]
5999*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+64*13]
6000*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+64*11]
6001*c0909341SAndroid Build Coastguard Worker    mova                m24, [cq+64*15]
6002*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m22, m8, q3232  ; a18 a26 b18 b26
6003*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, ym8, 1          ; a02 a10 b02 b10
6004*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m8, m9, m27, q3232  ; c18 c26 d18 d26
6005*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m9, ym27, 1         ; c02 c10 d02 d10
6006*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m23, m11, q3232 ; a22 a30 b22 b30
6007*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m23, ym11, 1         ; a06 a14 b06 b14
6008*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m11, m12, m24, q3232 ; c22 c30 d22 d30
6009*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m12, ym24, 1         ; c06 c14 d06 d14
6010*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m26, m8, q3131  ; 26
6011*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m8, q2020       ; 18
6012*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m22, m9, q3131  ; 10
6013*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m9, q2020       ;  2
6014*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m27, m11, q3131 ; 30
6015*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m11, q2020      ; 22
6016*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m23, m12, q3131 ; 14
6017*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m12, q2020      ;  6
6018*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6019*c0909341SAndroid Build Coastguard Worker    jmp .end
6020*c0909341SAndroid Build Coastguard Worker.fast: ; bottom/right halves are zero
    ; Reduced-input path. Only the first 32 bytes (ym) or 16 bytes (xm) of
    ; each 64-byte coefficient row are loaded, and every load is scaled by
    ; m23 through pmulhrsw.
    ; NOTE(review): m23 is initialised above this chunk; presumably it holds
    ; pw_2896x8 as in the 64x32 entry point further down - confirm.
6021*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym9, ym23, [cq+64* 0]
6022*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym6, ym23, [cq+64* 8]
6023*c0909341SAndroid Build Coastguard Worker    mova                m14, [o(dup16_perm)]
6024*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym8, ym23, [cq+64* 2]
6025*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm23, [cq+64*14]
6026*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm23, [cq+64*10]
6027*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym1, ym23, [cq+64* 6]
6028*c0909341SAndroid Build Coastguard Worker    pmulhrsw            ym7, ym23, [cq+64* 4]
6029*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3, xm23, [cq+64*12]
    ; Widen the narrow loads to the register layout expected by the callee:
    ; rows 0 and 8 end up in the high word of each dword (zero-extend, then
    ; shift left by 16), the other rows are expanded through the dup16_perm
    ; byte permutation or by interleaving the low words with themselves.
6030*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m9, ym9
6031*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m6, ym6
6032*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m14, m8
6033*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm0, xm0
6034*c0909341SAndroid Build Coastguard Worker    vpermb              ym5, ym14, ym5
6035*c0909341SAndroid Build Coastguard Worker    vpermb               m1, m14, m1
6036*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m14, m7
6037*c0909341SAndroid Build Coastguard Worker    punpcklwd           xm3, xm3
6038*c0909341SAndroid Build Coastguard Worker    pslld                m9, 16
6039*c0909341SAndroid Build Coastguard Worker    pslld                m6, 16
6040*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast
    ; Same treatment for the odd-numbered coefficient rows. m23 is consumed
    ; last (row 3) because it is the scale factor for all other loads.
6041*c0909341SAndroid Build Coastguard Worker          vpmulhrsw    ym21, ym23, [cq+64* 1]
6042*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm17, xm23, [cq+64*15] ; force EVEX encoding, which
6043*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm20, xm23, [cq+64* 9] ; reduces code size due to
6044*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym15, ym23, [cq+64* 7] ; compressed displacements
6045*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym18, ym23, [cq+64* 5]
6046*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm16, xm23, [cq+64*11]
6047*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm19, xm23, [cq+64*13]
6048*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym23,       [cq+64* 3]
6049*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m14, m21
6050*c0909341SAndroid Build Coastguard Worker    punpcklwd          xm17, xm17
6051*c0909341SAndroid Build Coastguard Worker    vpermb             ym20, ym14, ym20
6052*c0909341SAndroid Build Coastguard Worker    vpermb              m15, m14, m15
6053*c0909341SAndroid Build Coastguard Worker    vpermb              m18, m14, m18
6054*c0909341SAndroid Build Coastguard Worker    vpermb             ym16, ym14, ym16
6055*c0909341SAndroid Build Coastguard Worker    punpcklwd          xm19, xm19
6056*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m14, m23
6057*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
6058*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_16384)]
6059*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
    ; Regroup 128-bit lanes of the transposed data. The number at the end of
    ; each line appears to be the index of the next-stage input carried by
    ; the destination register (cf. the in1/in15/... list at .main_part1).
6060*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m0, m3, q2020  ;  0
6061*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m0, m3, q3131  ;  4
6062*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m14, m2, q2020 ;  1
6063*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m2, q3131      ;  5
6064*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m19, m7, q3131 ; 15
6065*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m7, q2020      ; 11
6066*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m17, m9, q2020 ;  3
6067*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m9, q3131      ;  7
6068*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m20, m6, q2020 ;  9
6069*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m20, m6, q3131      ; 13
6070*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m1, m18, q2020 ;  2
6071*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m1, m18, q3131 ;  6
6072*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m5, m15, q2020 ; 10
6073*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m5, m15, q3131 ; 14
6074*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m21, m4, q3131 ; 12
6075*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m21, m4, q2020 ;  8
    ; Four .main_part1_fast calls, each taking its two inputs in m0 and m3
    ; (pairs 1/15, 7/9, 5/11, 3/13). Every call stores eight vectors at [r4]
    ; and advances r4 by 64*8, filling 64*32 bytes of stack starting at rsp.
6076*c0909341SAndroid Build Coastguard Worker    mov                  r4, rsp
6077*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
6078*c0909341SAndroid Build Coastguard Worker    mova                 m0, m17
6079*c0909341SAndroid Build Coastguard Worker    mova                 m3, m28
6080*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
6081*c0909341SAndroid Build Coastguard Worker    mova                 m0, m14
6082*c0909341SAndroid Build Coastguard Worker    mova                 m3, m19
6083*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
6084*c0909341SAndroid Build Coastguard Worker    mova                 m0, m27
6085*c0909341SAndroid Build Coastguard Worker    mova                 m3, m20
6086*c0909341SAndroid Build Coastguard Worker    call .main_part1_fast
6087*c0909341SAndroid Build Coastguard Worker    call .main_part2
    ; Inputs 0, 8 and 4 go to the 32x16 helper in m0, m1 and m14.
6088*c0909341SAndroid Build Coastguard Worker    mova                 m0, m16
6089*c0909341SAndroid Build Coastguard Worker    mova                 m1, m21
6090*c0909341SAndroid Build Coastguard Worker    mova                m14, m26
6091*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    ; Park m14-m21 in the even-numbered 64-byte slots of the coefficient
    ; buffer (m14 -> slot 0, m15 -> 2, ... m21 -> 14). IDCT_32x64_END reads
    ; them back from [cq+64*(mem*2-16)] when its mem argument is >= 8.
6092*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m21
6093*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
6094*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m17
6095*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m18
6096*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m19
6097*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m16
6098*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m15
6099*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m20
    ; NOTE(review): presumably consumes m22-m25 (inputs 2/6/10/14 from the
    ; shuffles above) - confirm against the 32x32 helper. Execution then
    ; falls through into .end.
6100*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
6101*c0909341SAndroid Build Coastguard Worker.end:
    ; Output-stage setup shared by both paths: precompute the stride
    ; multiples and the second row pointer used by the IDCT_32x64_END
    ; invocations that follow.
    ;   r4 = stride*3    r5 = stride*4    r6 = stride*33
    ;   r7 = stride*34   r8 = stride*35   r3 = dst + stride*28
    ;   m12 = pw_2048 (final rounding multiplier), m13 = qword permutation
    ; NOTE(review): o() appears to address constants relative to r5 (cf.
    ; `lea r5, [o_base]` in the 64x32 entry point), so both constant loads
    ; have to stay above the instruction that overwrites r5 - confirm.
6102*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
6103*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2048)]
6104*c0909341SAndroid Build Coastguard Worker    movshdup            m13, [o(permD)]
6105*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r4+strideq]   ; stride*4
6106*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r4*8]
6107*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq+r5*8] ; stride*33
6108*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r4+r5*8]      ; stride*35
6109*c0909341SAndroid Build Coastguard Worker    add                  r3, r5             ; dst+stride*28
6110*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r6+strideq]   ; stride*34
; IDCT_32x64_END src, mem, stride1, stride2, stride3, stride4
;
; Final butterfly stage plus add-to-destination for four 32-pixel rows.
;   src        index of the vector register holding one butterfly input;
;              it is also reused as a temporary and is clobbered
;   mem        0..15. For mem < 8 the other butterfly input is register
;              m<mem>; for mem >= 8 it is loaded from the coefficient buffer
;              at cq+64*(mem*2-16)
;   stride1-4  byte offsets of the four output rows, applied to dstq, r3,
;              r3 and dstq respectively
; Two further inputs are read from the stack at rsp+64*(31-mem) and
; rsp+64*mem. Each of the four results is multiplied by m12 with pmulhrsw
; (with m12 = pw_2048 this is (x + 8) >> 4), added to the zero-extended
; destination pixels, packed with unsigned saturation, reordered by the
; qword permutation in m13 and stored as a 32-byte row.
; For mem >= 8 the two 64-byte coefficient rows starting at
; cq+64*(mem*2-16) are cleared. m1 is zeroed once, when mem == 8, and has to
; stay zero for the later invocations, so src must never be 1 there.
; After mem == 3, 7 and 11, dstq moves forward and r3 moves back by r5.
; Clobbers m0, m8-m11 and m<src>; also m1 when mem >= 8.
6111*c0909341SAndroid Build Coastguard Worker%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
6112*c0909341SAndroid Build Coastguard Worker%if %2 < 8
6113*c0909341SAndroid Build Coastguard Worker    paddsw              m10, m%2, m%1
6114*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m%2, m%1
6115*c0909341SAndroid Build Coastguard Worker%else
6116*c0909341SAndroid Build Coastguard Worker    mova                m11, [cq+64*(%2*2-16)]
6117*c0909341SAndroid Build Coastguard Worker    paddsw              m10, m11, m%1
6118*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m%1
6119*c0909341SAndroid Build Coastguard Worker%endif
6120*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+64*(31-%2)]
6121*c0909341SAndroid Build Coastguard Worker    mova                m%1, [rsp+64*%2]
6122*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m10, m9
6123*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m9
6124*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m11, m%1
6125*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [dstq+%3]
6126*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m%1
6127*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%1, [r3  +%4]
6128*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m8, m10, m9, m11
6129*c0909341SAndroid Build Coastguard Worker    paddw                m8, m0
6130*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m0, [r3  +%5]
6131*c0909341SAndroid Build Coastguard Worker    paddw               m10, m%1
6132*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%1, [dstq+%6]
6133*c0909341SAndroid Build Coastguard Worker    paddw                m9, m0
6134*c0909341SAndroid Build Coastguard Worker    paddw               m11, m%1
6135*c0909341SAndroid Build Coastguard Worker%if %2 >= 8
6136*c0909341SAndroid Build Coastguard Worker%if %2 == 8
6137*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
6138*c0909341SAndroid Build Coastguard Worker%endif
6139*c0909341SAndroid Build Coastguard Worker    mova  [cq+64*(%2*2-16)], m1
6140*c0909341SAndroid Build Coastguard Worker    mova  [cq+64*(%2*2-15)], m1
6141*c0909341SAndroid Build Coastguard Worker%endif
6142*c0909341SAndroid Build Coastguard Worker    packuswb             m8, m10
6143*c0909341SAndroid Build Coastguard Worker    packuswb             m9, m11
6144*c0909341SAndroid Build Coastguard Worker    vpermq               m8, m13, m8
6145*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m13, m9
6146*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%3], ym8
6147*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [r3  +%4], m8, 1
6148*c0909341SAndroid Build Coastguard Worker    mova          [r3  +%5], ym9
6149*c0909341SAndroid Build Coastguard Worker    vextracti32x8 [dstq+%6], m9, 1
6150*c0909341SAndroid Build Coastguard Worker%if %2 == 3 || %2 == 7 || %2 == 11
6151*c0909341SAndroid Build Coastguard Worker    add                dstq, r5
6152*c0909341SAndroid Build Coastguard Worker    sub                  r3, r5
6153*c0909341SAndroid Build Coastguard Worker%endif
6154*c0909341SAndroid Build Coastguard Worker%endmacro
; Sixteen expansions, one per mem index n = 0..15, taking their register
; input from m29 down to m14. With r3 = dst+stride*28, r4 = stride*3,
; r5 = stride*4 and r6/r7/r8 = stride*33/34/35 (set up at .end), and with
; dstq/r3 stepped by four rows after every fourth expansion, the four offsets
; of expansion n address output rows n, 63-n, 31-n and 32+n.
6155*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       29,  0, strideq*0, r8,   r4       , r5*8
6156*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       28,  1, strideq*1, r7,   strideq*2, r6
6157*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       27,  2, strideq*2, r6,   strideq*1, r7
6158*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       26,  3, r4       , r5*8, strideq*0, r8
6159*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       25,  4, strideq*0, r8,   r4       , r5*8
6160*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       24,  5, strideq*1, r7,   strideq*2, r6
6161*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       23,  6, strideq*2, r6,   strideq*1, r7
6162*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       22,  7, r4       , r5*8, strideq*0, r8
6163*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       21,  8, strideq*0, r8,   r4       , r5*8
6164*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       20,  9, strideq*1, r7,   strideq*2, r6
6165*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       19, 10, strideq*2, r6,   strideq*1, r7
6166*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       18, 11, r4       , r5*8, strideq*0, r8
6167*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       17, 12, strideq*0, r8,   r4       , r5*8
6168*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       16, 13, strideq*1, r7,   strideq*2, r6
6169*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       15, 14, strideq*2, r6,   strideq*1, r7
6170*c0909341SAndroid Build Coastguard Worker    IDCT_32x64_END       14, 15, r4       , r5*8, strideq*0, r8
6171*c0909341SAndroid Build Coastguard Worker    RET
6172*c0909341SAndroid Build Coastguard Worker.dconly:
    ; DC-only shortcut: scale the first coefficient in scalar code and hand
    ; over to the shared 32-wide DC routine by tail jump.
    ; NOTE(review): the branch to this label is above this chunk; presumably
    ; it is only taken when eob == 0, in which case the store below clears
    ; the coefficient and `or r3d, 64` leaves r3d == 64 (assuming eobd is
    ; r3d) - confirm against the function entry.
6173*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
6174*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
6175*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
    ; r6d = (dc*181 + 128) >> 8, then r6d = (r6d*181 + 384) >> 9.
    ; 181/256 is approximately 1/sqrt(2); the second step folds one extra
    ; right shift (and its rounding term, 256) into the same operation.
6176*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
6177*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
6178*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8
6179*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
6180*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+256
6181*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+1
6182*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
6183*c0909341SAndroid Build Coastguard WorkerALIGN function_align ; bottom three-quarters are zero
6184*c0909341SAndroid Build Coastguard Workercglobal_label .main_part1_fast2
    ; Single-input variant of .main_part1: only m0 carries data.
    ; In:  m0 = input, m10 = dword accumulator seed for vpdpwssd (the 64x32
    ;      entry point loads pd_2048 into it), r4 = output pointer,
    ;      r5 = constant base (both consumed at .main_part1c)
    ; Out: m0-m7 prepared for .main_part1c, which stores them and returns
    ; Clobbers m8, m9 here, plus whatever .main_part1c clobbers.
6185*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(idct64_mul+4*0)]
6186*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(idct64_mul+4*1)]
6187*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m0     ; t63a
6188*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8     ; t32a
6189*c0909341SAndroid Build Coastguard Worker
    ; Interleave t32a/t63a words and form two rotated outputs as dword dot
    ; products against broadcast coefficient pairs 9 and 8, seeded with m10,
    ; shifted right by 12 and packed back to words with signed saturation.
    ; NOTE(review): these use the same two coefficients as the first
    ; ITX_MULSUB_2W at .main_part1b, so m1/m6 are presumably t33a/t62a.
6190*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0, m7
6191*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m0, m7
6192*c0909341SAndroid Build Coastguard Worker    mova                 m1, m10
6193*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m1, m4, [o(idct64_mul+4*9)] {bcstd}
6194*c0909341SAndroid Build Coastguard Worker    mova                 m9, m10
6195*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m6, [o(idct64_mul+4*9)] {bcstd}
6196*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m1, m9
6197*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m9
6198*c0909341SAndroid Build Coastguard Worker    mova                 m9, m10
6199*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m9, m6, [o(idct64_mul+4*8)] {bcstd}
6200*c0909341SAndroid Build Coastguard Worker    mova                 m6, m10
6201*c0909341SAndroid Build Coastguard Worker    vpdpwssd             m6, m4, [o(idct64_mul+4*8)] {bcstd}
6202*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m9, m6
6203*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m9
6204*c0909341SAndroid Build Coastguard Worker
    ; Duplicate the four values into the register slots .main_part1c expects
    ; (m4 = m0, m3 = m7, m5 = m1, m2 = m6).
6205*c0909341SAndroid Build Coastguard Worker    mova                 m4, m0
6206*c0909341SAndroid Build Coastguard Worker    mova                 m3, m7
6207*c0909341SAndroid Build Coastguard Worker    mova                 m5, m1
6208*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6
6209*c0909341SAndroid Build Coastguard Worker    jmp .main_part1c
6210*c0909341SAndroid Build Coastguard Workercglobal_label .main_part1_fast
    ; Two-input variant of .main_part1: only m0 and m3 carry data.
    ; In:  m0, m3 = inputs; m10, r4 and r5 as consumed from .main_part1b on
    ; Each input is multiplied by two idct64_mul entries (0/1 and 6/7). The
    ; four products are then copied into the registers that the full version
    ; would hold after its first butterfly stage (m8 = m0, m7 = m1, m6 = m3,
    ; m5 = m2) and execution continues at .main_part1b.
6211*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(idct64_mul+4*0)]
6212*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(idct64_mul+4*1)]
6213*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(idct64_mul+4*6)]
6214*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(idct64_mul+4*7)]
6215*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m0     ; t63a
6216*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8     ; t32a
6217*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3     ; t60a
6218*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m9     ; t35a
6219*c0909341SAndroid Build Coastguard Worker    mova                 m8, m0
6220*c0909341SAndroid Build Coastguard Worker    mova                 m7, m1
6221*c0909341SAndroid Build Coastguard Worker    mova                 m6, m3
6222*c0909341SAndroid Build Coastguard Worker    mova                 m5, m2
6223*c0909341SAndroid Build Coastguard Worker    jmp .main_part1b
6224*c0909341SAndroid Build Coastguard Workercglobal_label .main_part1
6225*c0909341SAndroid Build Coastguard Worker    ; idct64 steps 1-5:
6226*c0909341SAndroid Build Coastguard Worker    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
6227*c0909341SAndroid Build Coastguard Worker    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
6228*c0909341SAndroid Build Coastguard Worker    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
6229*c0909341SAndroid Build Coastguard Worker    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    ;
    ; In:  m0-m3 = the four inputs of one group listed above
    ;      m10   = passed to ITX_MULSUB_2W as its fifth argument (the 64x32
    ;              entry point loads pd_2048 into it)
    ;      r4    = output pointer
    ;      r5    = base for the o() constant addressing
    ; Out: eight vectors stored at [r4+64*0] .. [r4+64*7]; r4 advanced by
    ;      64*8 and r5 advanced by 4*13, so that the next call reads the
    ;      following 13-dword group of idct64_mul and writes the next slot.
    ; Clobbers m0-m9, m11, m12.
6230*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(idct64_mul+4*0)]
6231*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(idct64_mul+4*1)]
6232*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(idct64_mul+4*2)]
6233*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(idct64_mul+4*3)]
6234*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m0     ; t63a
6235*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(idct64_mul+4*4)]
6236*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8     ; t32a
6237*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(idct64_mul+4*5)]
6238*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m1     ; t62a
6239*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(idct64_mul+4*6)]
6240*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m9     ; t33a
6241*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(idct64_mul+4*7)]
6242*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m2     ; t61a
6243*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8     ; t34a
6244*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m3     ; t60a
6245*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m9     ; t35a
6246*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m0, m1 ; t33
6247*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1     ; t32
6248*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m7, m6 ; t62
6249*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m6     ; t63
6250*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m3, m2 ; t34
6251*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2     ; t35
6252*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m4, m5 ; t61
6253*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m4     ; t60
    ; .main_part1_fast joins here with m0-m3 and m5-m8 already populated.
6254*c0909341SAndroid Build Coastguard Worker.main_part1b:
6255*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(idct64_mul+4*8)]
6256*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(idct64_mul+4*9)]
6257*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
6258*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(idct64_mul+4*10)]
6259*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
6260*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m3 ; t35a
6261*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3     ; t32a
6262*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m7, m5 ; t60a
6263*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m5     ; t63a
6264*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m2 ; t34
6265*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2     ; t33
6266*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m8, m6 ; t61
6267*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m8     ; t62
    ; .main_part1_fast2 joins here with m0-m7 already populated.
6268*c0909341SAndroid Build Coastguard Worker.main_part1c:
6269*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(idct64_mul+4*11)]
6270*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(idct64_mul+4*12)]
    ; All thirteen constants of this group are loaded by now, so step the
    ; constant base to the next group.
6271*c0909341SAndroid Build Coastguard Worker    add                  r5, 4*13
6272*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         3, 4, 8, 9, 10, 11, 12 ; t35,  t60
6273*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
6274*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*0], m0
6275*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*7], m7
6276*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*1], m1
6277*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*6], m6
6278*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*3], m3
6279*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*4], m4
6280*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*2], m2
6281*c0909341SAndroid Build Coastguard Worker    mova          [r4+64*5], m5
6282*c0909341SAndroid Build Coastguard Worker    add                  r4, 64*8
6283*c0909341SAndroid Build Coastguard Worker    ret
6284*c0909341SAndroid Build Coastguard Workercglobal_label .main_part2
    ; Second half of the odd-index idct64 work: combines, in place, the 32
    ; vectors that four .main_part1* calls stored below r4.
    ; In:  r4  = pointer just past that 64*32-byte area (as left by the
    ;            fourth .main_part1* call)
    ;      r5  = constant base, still advanced by 4 * 4*13 = 16*13 bytes;
    ;            the constant loads compensate with -16*13 and r5 is then
    ;            restored by the sub below
    ;      m10 = passed to ITX_MULSUB_2W as its fifth argument
    ; Clobbers m0-m9, m11, m12, m17-m19, r4, r6.
6285*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1567_3784  -16*13)]
6286*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m3784_1567 -16*13)]
6287*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4+64*7]
6288*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [o(pw_m1567_m3784-16*13)]
6289*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m18, [o(pw_2896_2896  -16*13)]
6290*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m19, [o(pw_m2896_2896 -16*13)]
6291*c0909341SAndroid Build Coastguard Worker    sub                  r5, 16*13
    ; r4 walks upwards and r6 downwards, 64 bytes each per iteration.
    ; r6 - r4 starts at 64*7 and shrinks by 128 every pass, so the body runs
    ; four times and each pass rewrites eight of the 32 vectors.
6292*c0909341SAndroid Build Coastguard Worker.main_part2_loop:
6293*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r4-64*32] ; t32a
6294*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r6-64*24] ; t39a
6295*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r6-64*32] ; t63a
6296*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r4-64*24] ; t56a
6297*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r4-64*16] ; t40a
6298*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r6-64* 8] ; t47a
6299*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r6-64*16] ; t55a
6300*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r4-64* 8] ; t48a
6301*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m0, m1 ; t39
6302*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1     ; t32
6303*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2, m3 ; t56
6304*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3     ; t63
6305*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m5, m4 ; t40
6306*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m4     ; t47
6307*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m7, m6 ; t55
6308*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m6     ; t48
6309*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
6310*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
6311*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m2, m7 ; t48a
6312*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m7     ; t63a
6313*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m0, m5 ; t47a
6314*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m5     ; t32a
6315*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m8, m3 ; t55
6316*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m3     ; t56
6317*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m1, m4 ; t40
6318*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4     ; t39
6319*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         6, 7, 4, 9, 10, 18, 19 ; t47,  t48
6320*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
    ; Write the eight results back over the slots that were read above; note
    ; that the slot assignment differs from the load order.
6321*c0909341SAndroid Build Coastguard Worker    mova         [r6-64* 8], m2
6322*c0909341SAndroid Build Coastguard Worker    mova         [r4-64*32], m0
6323*c0909341SAndroid Build Coastguard Worker    mova         [r4-64* 8], m8
6324*c0909341SAndroid Build Coastguard Worker    mova         [r6-64*32], m1
6325*c0909341SAndroid Build Coastguard Worker    mova         [r6-64*24], m6
6326*c0909341SAndroid Build Coastguard Worker    mova         [r4-64*16], m7
6327*c0909341SAndroid Build Coastguard Worker    mova         [r4-64*24], m5
6328*c0909341SAndroid Build Coastguard Worker    mova         [r6-64*16], m3
6329*c0909341SAndroid Build Coastguard Worker    add                  r4, 64
6330*c0909341SAndroid Build Coastguard Worker    sub                  r6, 64
6331*c0909341SAndroid Build Coastguard Worker    cmp                  r4, r6
6332*c0909341SAndroid Build Coastguard Worker    jb .main_part2_loop
6333*c0909341SAndroid Build Coastguard Worker    ret
6334*c0909341SAndroid Build Coastguard Worker
6335*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
6336*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
6337*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
6338*c0909341SAndroid Build Coastguard Worker    jz .dconly
6339*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 7, 30, 64*32, dst, stride, c, eob
6340*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m23, [o(pw_2896x8)]
6341*c0909341SAndroid Build Coastguard Worker%undef cmp
6342*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
6343*c0909341SAndroid Build Coastguard Worker    jb .fast
6344*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m23, [cq+64* 1]
6345*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m23, [cq+64*31]
6346*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m23, [cq+64*17]
6347*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m23, [cq+64*15]
6348*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
6349*c0909341SAndroid Build Coastguard Worker    mov                  r4, rsp
6350*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6351*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m23, [cq+64* 7]
6352*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m23, [cq+64*25]
6353*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m23, [cq+64*23]
6354*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m23, [cq+64* 9]
6355*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6356*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m23, [cq+64* 5]
6357*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m23, [cq+64*27]
6358*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m23, [cq+64*21]
6359*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m23, [cq+64*11]
6360*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6361*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m23, [cq+64* 3]
6362*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m23, [cq+64*29]
6363*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m23, [cq+64*19]
6364*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m23, [cq+64*13]
6365*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
6366*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
6367*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m23, [cq+64*24]
6368*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m23, [cq+64* 8]
6369*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m23, [cq+64*16]
6370*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m23, [cq+64* 0]
6371*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m23, [cq+64* 4]
6372*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m23, [cq+64*28]
6373*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m23, [cq+64*20]
6374*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m23, [cq+64*12]
6375*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
6376*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m23, [cq+64* 2]
6377*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m23, [cq+64*30]
6378*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m26, m23, [cq+64*18]
6379*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m23, [cq+64*14]
6380*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m23, [cq+64*10]
6381*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m23, [cq+64*22]
6382*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m28, m23, [cq+64*26]
6383*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23,      [cq+64* 6]
6384*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
6385*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m15
6386*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m16
6387*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m17
6388*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m18
6389*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m19
6390*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m20
6391*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m21
6392*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6393*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_16384)]
6394*c0909341SAndroid Build Coastguard Worker    call .pass1_end_part1
6395*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*16], m1
6396*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*17], m3
6397*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*18], m5
6398*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*19], m7
6399*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*24], m23
6400*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*25], m25
6401*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*26], m27
6402*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*27], m29
6403*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m13, m0 ; a0
6404*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m13, m2 ; a2
6405*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m13, m4 ; a4
6406*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m13, m6 ; a6
6407*c0909341SAndroid Build Coastguard Worker    REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
6408*c0909341SAndroid Build Coastguard Worker    call .pass1_end_part2
6409*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*20], m15
6410*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*21], m17
6411*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*22], m19
6412*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*23], m21
6413*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*28], m1
6414*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*29], m3
6415*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*30], m5
6416*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*31], m7
6417*c0909341SAndroid Build Coastguard Worker    REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
6418*c0909341SAndroid Build Coastguard Worker    REPX {pmulhrsw x, m13}, m0, m2, m4, m6     ; g0 g2 g4 g6
6419*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m3, m23, ym14, 1 ; a00 a01 c00 c01
6420*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m23, m14, q3232   ; a02 a03 c02 c03
6421*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m15, m22, ym0, 1  ; e00 e01 g00 g01
6422*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m22, m0, q3232    ; e02 e03 g02 g03
6423*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m1, m27, ym18, 1 ; a40 a41 c40 c41
6424*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m27, m18, q3232   ; a42 a43 c42 c43
6425*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m18, m26, ym4, 1  ; e40 e41 g40 g41
6426*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m26, m4, q3232    ; e42 e43 g42 g43
6427*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m14, m25, ym16, 1 ; a20 a21 c20 c21
6428*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m25, m16, q3232   ; a22 a23 c22 c23
6429*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m17, m24, ym2, 1  ; e20 e21 g20 g21
6430*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m24, m2, q3232    ; e22 e23 g22 g23
6431*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m19, m29, ym20, 1 ; a60 a61 c60 c61
6432*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m29, m20, q3232   ; a62 a63 c62 c63
6433*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m20, m28, ym6, 1  ; e60 e61 g60 g61
6434*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m28, m6, q3232    ; e62 e63 g62 g63
6435*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m2, m3, m15, q3131  ;  8
6436*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m0, m3, m15, q2020  ;  0
6437*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m6, m23, m22, q3131 ; 24
6438*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m4, m23, m22, q2020 ; 16
6439*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m3, m1, m18, q3131  ; 12
6440*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m1, m18, q2020      ;  4
6441*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m7, m27, m26, q3131 ; 28
6442*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m5, m27, m26, q2020 ; 20
6443*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
6444*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m16, m14, m17, q3131 ; 10
6445*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m14, m17, q2020      ;  2
6446*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m17, m19, m20, q3131 ; 14
6447*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m15, m19, m20, q2020 ;  6
6448*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m20, m25, m24, q3131 ; 26
6449*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m18, m25, m24, q2020 ; 18
6450*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m21, m29, m28, q3131 ; 30
6451*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m19, m29, m28, q2020 ; 22
6452*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
6453*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m22, m13, [cq+64*16] ; a1
6454*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m23, m13, [cq+64*20] ; c1
6455*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m24, m13, [cq+64*24] ; e1
6456*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m25, m13, [cq+64*28] ; g1
6457*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m26, m13, [cq+64*17] ; a3
6458*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m27, m13, [cq+64*21] ; c3
6459*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m28, m13, [cq+64*25] ; e3
6460*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m29, m13, [cq+64*29] ; g3
6461*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 8], m14
6462*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 9], m15
6463*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*10], m16
6464*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*11], m17
6465*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*12], m18
6466*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*13], m19
6467*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*14], m20
6468*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*15], m21
6469*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m14, m13, [cq+64*18] ; a5
6470*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m15, m13, [cq+64*22] ; c5
6471*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m16, m13, [cq+64*26] ; e5
6472*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m17, m13, [cq+64*30] ; g5
6473*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m18, m13, [cq+64*19] ; a7
6474*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m19, m13, [cq+64*23] ; c7
6475*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m20, m13, [cq+64*27] ; e7
6476*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m21, m13, [cq+64*31] ; g7
6477*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m8, m22, ym23, 1 ; a10 a11 c10 c11
6478*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m22, m23, q3232   ; a12 a13 c12 c13
6479*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m9, m24, ym25, 1 ; e10 e11 g10 g11
6480*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m24, m25, q3232   ; e12 e13 g12 g13
6481*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m23, m26, ym27, 1 ; a30 a31 c30 c31
6482*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m26, m27, q3232   ; a32 a33 c32 c33
6483*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m11, m28, ym29, 1 ; e30 e31 g30 g31
6484*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m28, m29, q3232   ; e32 e33 g32 g33
6485*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 0], m0
6486*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 1], m1
6487*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 2], m2
6488*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 3], m3
6489*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 4], m4
6490*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 5], m5
6491*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 6], m6
6492*c0909341SAndroid Build Coastguard Worker    mova        [cq+64* 7], m7
6493*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m12, m14, ym15, 1 ; a50 a51 c50 c51
6494*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m14, m15, q3232   ; a52 a53 c52 c53
6495*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m13, m16, ym17, 1 ; e50 e51 g50 g51
6496*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m16, m17, q3232   ; e52 e53 g52 g53
6497*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m25, m18, ym19, 1 ; a70 a71 c70 c71
6498*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m18, m19, q3232   ; a72 a73 c72 c73
6499*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m17, m20, ym21, 1 ; e70 e71 g70 g71
6500*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m20, m21, q3232   ; e72 e73 g72 g73
6501*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m27, m23, m11, q3131 ; 11 m27
6502*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m23, m11, q2020      ;  3 m23
6503*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m19, m26, m28, q3131 ; 27 m19
6504*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m15, m26, m28, q2020 ; 19 m15
6505*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m29, m25, m17, q3131 ; 15 m29
6506*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m25, m17, q2020      ;  7 m25
6507*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m21, m18, m20, q3131 ; 31 m21
6508*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m17, m18, m20, q2020 ; 23 m17
6509*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m20, m14, m16, q3131 ; 29 m20
6510*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m16, m14, m16, q2020 ; 21 m16
6511*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m18, m22, m24, q3131 ; 25 m18
6512*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m14, m22, m24, q2020 ; 17 m14
6513*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m26, m8, m9, q3131   ;  9 m26
6514*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m22, m8, m9, q2020   ;  1 m22
6515*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m28, m12, m13, q3131 ; 13 m28
6516*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m24, m12, m13, q2020 ;  5 m24
6517*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
6518*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       m13, [o(pw_16384)]
6519*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m0, m13, [r4-64*21]
6520*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m1, m13, [r4-64*22]
6521*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m2, m13, [r4-64*23]
6522*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m3, m13, [r4-64*24]
6523*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m4, m13, [r4-64*25]
6524*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m5, m13, [r4-64*26]
6525*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m6, m13, [r4-64*27]
6526*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m7, m13, [r4-64*28]
6527*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*16], m14
6528*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*17], m15
6529*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*18], m16
6530*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*19], m17
6531*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*20], m18
6532*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*21], m19
6533*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*22], m20
6534*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*23], m21
6535*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m14, m13, [r4-64*12]
6536*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m15, m13, [r4-64*11]
6537*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m16, m13, [r4-64*10]
6538*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m17, m13, [r4-64* 9]
6539*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m18, m13, [r4-64* 8]
6540*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m19, m13, [r4-64* 7]
6541*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m20, m13, [r4-64* 6]
6542*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m21, m13, [r4-64* 5]
6543*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*24], m22
6544*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*25], m23
6545*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*26], m24
6546*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*27], m25
6547*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*28], m26
6548*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*29], m27
6549*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*30], m28
6550*c0909341SAndroid Build Coastguard Worker    mova        [cq+64*31], m29
6551*c0909341SAndroid Build Coastguard Worker    call .transpose_2x8x8_lo
6552*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*12], m1
6553*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*11], m3
6554*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*10], m5
6555*c0909341SAndroid Build Coastguard Worker    mova        [r4-64* 9], m7
6556*c0909341SAndroid Build Coastguard Worker    mova        [r4-64* 8], m15
6557*c0909341SAndroid Build Coastguard Worker    mova        [r4-64* 7], m17
6558*c0909341SAndroid Build Coastguard Worker    mova        [r4-64* 6], m19
6559*c0909341SAndroid Build Coastguard Worker    mova        [r4-64* 5], m21
6560*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m22, m0, ym14, 1     ; f00 f01 h00 h01
6561*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m23, m0, m14, q3232  ; f02 f03 h02 h03
6562*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m24, m2, ym16, 1     ; f20 f21 h20 h21
6563*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m25, m2, m16, q3232  ; f22 f23 h22 h23
6564*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m26, m4, ym18, 1     ; f40 f41 h40 h41
6565*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m27, m4, m18, q3232  ; f42 f43 h42 h43
6566*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m28, m6, ym20, 1     ; f60 f61 h60 h61
6567*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m29, m6, m20, q3232  ; f62 f63 h62 h63
6568*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m0, m13, [r4-64*20]
6569*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m1, m13, [r4-64*19]
6570*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m2, m13, [r4-64*18]
6571*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m3, m13, [r4-64*17]
6572*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m4, m13, [r4-64*16]
6573*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m5, m13, [r4-64*15]
6574*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m6, m13, [r4-64*14]
6575*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m7, m13, [r4-64*13]
6576*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m14, m13, [r4-64*29]
6577*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m15, m13, [r4-64*30]
6578*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m16, m13, [r4-64*31]
6579*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m17, m13, [r4-64*32]
6580*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m18, m13, [r4-64*33]
6581*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m19, m13, [r4-64*34]
6582*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m20, m13, [r4-64*35]
6583*c0909341SAndroid Build Coastguard Worker    pmulhrsw           m21, m13, [r4-64*36]
6584*c0909341SAndroid Build Coastguard Worker    call .transpose_2x8x8_lo
6585*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*20], m1
6586*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*19], m3
6587*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*18], m5
6588*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*17], m7
6589*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*16], m15
6590*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*15], m17
6591*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*14], m19
6592*c0909341SAndroid Build Coastguard Worker    mova       [r4-64*13], m21
6593*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m1, m4, ym18, 1     ; b40 b41 d40 d41
6594*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m5, m4, m18, q3232  ; b42 b43 d42 d43
6595*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m4, m0, m14, q3232  ; b02 b03 d02 d03
6596*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m0, ym14, 1         ; b00 b01 d00 d01
6597*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m14, m2, ym16, 1     ; b20 b21 d20 d21
6598*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m18, m2, m16, q3232  ; b22 b23 d22 d23
6599*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m15, m6, ym20, 1     ; b60 b61 d60 d61
6600*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m19, m6, m20, q3232  ; b62 b63 d62 d63
6601*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m2, m0, m22, q3131  ;  8
6602*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m0, m22, q2020      ;  0
6603*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m3, m1, m26, q3131  ; 12
6604*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m1, m26, q2020      ;  4
6605*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m6, m4, m23, q3131  ; 24
6606*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m4, m23, q2020      ; 16
6607*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m7, m5, m27, q3131  ; 28
6608*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m5, m27, q2020      ; 20
6609*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
6610*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m16, m14, m24, q3131 ; 10
6611*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m14, m24, q2020      ;  2
6612*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m17, m15, m28, q3131 ; 14
6613*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m15, m28, q2020      ;  6
6614*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m20, m18, m25, q3131 ; 26
6615*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m18, m25, q2020      ; 18
6616*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m21, m19, m29, q3131 ; 30
6617*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m19, m29, q2020      ; 22
6618*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
6619*c0909341SAndroid Build Coastguard Worker    mova               m22, [r4-64*20]
6620*c0909341SAndroid Build Coastguard Worker    mova               m26, [r4-64*16]
6621*c0909341SAndroid Build Coastguard Worker    mova               m23, [r4-64*19]
6622*c0909341SAndroid Build Coastguard Worker    mova               m27, [r4-64*15]
6623*c0909341SAndroid Build Coastguard Worker    mova               m24, [r4-64*18]
6624*c0909341SAndroid Build Coastguard Worker    mova               m28, [r4-64*14]
6625*c0909341SAndroid Build Coastguard Worker    mova               m25, [r4-64*17]
6626*c0909341SAndroid Build Coastguard Worker    mova               m29, [r4-64*13]
6627*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*20], m14
6628*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*19], m15
6629*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*18], m16
6630*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*17], m17
6631*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*16], m18
6632*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*15], m19
6633*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*14], m20
6634*c0909341SAndroid Build Coastguard Worker    mova        [r4-64*13], m21
6635*c0909341SAndroid Build Coastguard Worker    mova               m19, [r4-64*12]
6636*c0909341SAndroid Build Coastguard Worker    mova               m11, [r4-64* 8]
6637*c0909341SAndroid Build Coastguard Worker    mova               m20, [r4-64*11]
6638*c0909341SAndroid Build Coastguard Worker    mova               m12, [r4-64* 7]
6639*c0909341SAndroid Build Coastguard Worker    mova               m21, [r4-64*10]
6640*c0909341SAndroid Build Coastguard Worker    mova                m8, [r4-64* 6]
6641*c0909341SAndroid Build Coastguard Worker    mova                m9, [r4-64* 9]
6642*c0909341SAndroid Build Coastguard Worker    mova               m18, [r4-64* 5]
6643*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m14, m22, m26, q3232 ; b12 b13 d12 d13
6644*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m22, ym26, 1         ; b10 b11 d10 d11
6645*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m15, m23, m27, q3232 ; b32 b33 d32 d33
6646*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m23, ym27, 1         ; b30 b31 d30 d31
6647*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m16, m24, m28, q3232 ; b52 b53 d52 d53
6648*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m24, ym28, 1         ; b50 b51 d50 d51
6649*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m17, m25, m29, q3232 ; b72 b73 d72 d73
6650*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m25, ym29, 1         ; b70 b71 d70 d71
6651*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m27, m19, ym11, 1    ; f10 f11 h10 h11
6652*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m19, m11, q3232      ; f12 f13 h12 h13
6653*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m28, m20, ym12, 1    ; f30 f31 h30 h31
6654*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m20, m12, q3232      ; f32 f33 h32 h33
6655*c0909341SAndroid Build Coastguard Worker    vinserti32x8       m29, m21, ym8, 1     ; f50 f51 h50 h51
6656*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m21, m8, q3232       ; f52 f53 h52 h53
6657*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m8, m9, ym18, 1     ; f70 f71 h70 h71
6658*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m9, m18, q3232      ; f72 f73 h72 h73
6659*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m26, m22, m27, q3131 ;  9
6660*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m22, m27, q2020      ;  1
6661*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m27, m23, m28, q3131 ; 11
6662*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m23, m28, q2020      ;  3
6663*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m28, m24, m29, q3131 ; 13
6664*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m24, m29, q2020      ;  5
6665*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m29, m25, m8, q3131  ; 15
6666*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m25, m8, q2020       ;  7
6667*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m18, m14, m19, q3131 ; 25
6668*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m14, m19, q2020      ; 17
6669*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m19, m15, m20, q3131 ; 27
6670*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m15, m20, q2020      ; 19
6671*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m20, m16, m21, q3131 ; 29
6672*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m16, m21, q2020      ; 21
6673*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m21, m17, m9, q3131  ; 31
6674*c0909341SAndroid Build Coastguard Worker    vshufi32x4         m17, m9, q2020       ; 23
6675*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
6676*c0909341SAndroid Build Coastguard Worker    jmp .end
6677*c0909341SAndroid Build Coastguard Worker.fast: ; bottom/right halves are zero
6678*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw     ym8, ym23, [cq+64* 4]
6679*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw     xm1, xm23, [cq+64*12]
6680*c0909341SAndroid Build Coastguard Worker    mova                m28, [o(dup16_perm)]
6681*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw     ym7, ym23, [cq+64* 8]
6682*c0909341SAndroid Build Coastguard Worker          vpmulhrsw    ym22, ym23, [cq+64* 0]
6683*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m28, m8
6684*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym28, ym1
6685*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m28, m7
6686*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m9, ym22
6687*c0909341SAndroid Build Coastguard Worker    pslld                m9, 16
6688*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast2
6689*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym21, ym23, [cq+64* 2]
6690*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm15, xm23, [cq+64*14]
6691*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm18, xm23, [cq+64*10]
6692*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym14, ym23, [cq+64* 6]
6693*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m28, m21
6694*c0909341SAndroid Build Coastguard Worker    punpcklwd          xm15, xm15
6695*c0909341SAndroid Build Coastguard Worker    vpermb             ym18, ym28, ym18
6696*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m28, m14
6697*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
6698*c0909341SAndroid Build Coastguard Worker          vpmulhrsw    ym22, ym23, [cq+64* 1]
6699*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm29, xm23, [cq+64*15]
6700*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm26, xm23, [cq+64* 9]
6701*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym25, ym23, [cq+64* 7]
6702*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym24, ym23, [cq+64* 5]
6703*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    xm27, xm23, [cq+64*11]
6704*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw     xm8, xm23, [cq+64*13]
6705*c0909341SAndroid Build Coastguard Worker    {evex}vpmulhrsw    ym23,       [cq+64* 3]
6706*c0909341SAndroid Build Coastguard Worker    vpermb              m22, m28, m22
6707*c0909341SAndroid Build Coastguard Worker    punpcklwd          xm29, xm29
6708*c0909341SAndroid Build Coastguard Worker    vpermb             ym26, ym28, ym26
6709*c0909341SAndroid Build Coastguard Worker    vpermb              m25, m28, m25
6710*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
6711*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m15
6712*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m16
6713*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m17
6714*c0909341SAndroid Build Coastguard Worker    REPX {vpermb x, m28, x}, m24, m27, m23
6715*c0909341SAndroid Build Coastguard Worker    punpcklwd          xm28, xm8, xm8
6716*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m18
6717*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m19
6718*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m20
6719*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m21
6720*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
6721*c0909341SAndroid Build Coastguard Worker    mov                  r4, rsp
6722*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_16384)]
6723*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*16], m4
6724*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*17], m5
6725*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*18], m6
6726*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*19], m7
6727*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*28], m26
6728*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*29], m27
6729*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*30], m28
6730*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*31], m29
6731*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
6732*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*20], m22
6733*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*21], m23
6734*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*22], m24
6735*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*23], m25
6736*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*24], m26
6737*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*25], m27
6738*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*26], m28
6739*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*27], m29
6740*c0909341SAndroid Build Coastguard Worker    call .pass2_fast
6741*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 8], m14
6742*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 9], m15
6743*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*10], m16
6744*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*11], m17
6745*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*12], m18
6746*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*13], m19
6747*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*14], m20
6748*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*15], m21
6749*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
6750*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m0
6751*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m1
6752*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m2
6753*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m3
6754*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m4
6755*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m5
6756*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m6
6757*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m7
6758*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13, [r4+64*16]
6759*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13, [r4+64*17]
6760*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13, [r4+64*18]
6761*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13, [r4+64*19]
6762*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m13, [r4+64*20]
6763*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m13, [r4+64*21]
6764*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m13, [r4+64*22]
6765*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m13, [r4+64*23]
6766*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*16], m14
6767*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*17], m15
6768*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*18], m16
6769*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*19], m17
6770*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*20], m18
6771*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*21], m19
6772*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*22], m20
6773*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*23], m21
6774*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m13, [r4+64*24]
6775*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m13, [r4+64*25]
6776*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m13, [r4+64*26]
6777*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m13, [r4+64*27]
6778*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m13, [r4+64*28]
6779*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m13, [r4+64*29]
6780*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m13, [r4+64*30]
6781*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m13, [r4+64*31]
6782*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*24], m22
6783*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*25], m23
6784*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*26], m24
6785*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*27], m25
6786*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*28], m26
6787*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*29], m27
6788*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*30], m28
6789*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*31], m29
6790*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
6791*c0909341SAndroid Build Coastguard Worker    call .pass2_fast
6792*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*16], m14
6793*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*17], m15
6794*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*18], m16
6795*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*19], m17
6796*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*20], m18
6797*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*21], m19
6798*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*22], m20
6799*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*23], m21
6800*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
; .end: write-out stage. Combines intermediate rows held in cq, in registers
; and (for the second half) on the stack, rounds them, adds them to the 8-bit
; destination pixels and stores the result, 64 bytes per row.
; Setup performed here:
;   m13 = pw_2048 (pmulhrsw by 2048 is a rounded arithmetic shift right by 4)
;   m12 = 0       (zero source for byte->word unpacking and for clearing cq)
;   r5  = stride*3, r6 = stride*4, r3 = dst + stride*28
6801*c0909341SAndroid Build Coastguard Worker.end:
6802*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_2048)]
6803*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3]
6804*c0909341SAndroid Build Coastguard Worker    pxor                m12, m12
6805*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r5*8]
6806*c0909341SAndroid Build Coastguard Worker    lea                  r6, [strideq+r5] ; stride*4
6807*c0909341SAndroid Build Coastguard Worker    add                  r3, r6           ; dst+stride*28
; IDCT_64x32_END src16, src32, mem, off_lo, off_hi
;   Produces two destination rows: [dstq+off_lo] and [r3+off_hi].
;   Inputs : cq slots `mem` and `31-mem`, registers m<src16> and m<src32>.
;            When mem >= 8, m<src16> is first reloaded from the stack slot
;            [rsp+64*(src16+16)].
;   Sums (cq pair -> m8, register pair -> m9) go to the dstq row; differences
;   (m11, m<src16>) go to the r3 row. Both cq slots are zeroed afterwards.
;   After mem == 3, 7 and 11 dstq advances by four rows and r3 retreats by four.
;   Clobbers m8-m11, m<src16> and m<src32>.
6808*c0909341SAndroid Build Coastguard Worker%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
6809*c0909341SAndroid Build Coastguard Worker    mova                m11, [cq+64*(   %3)] ;  0
6810*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+64*(31-%3)] ; 31
6811*c0909341SAndroid Build Coastguard Worker%if %3 >= 8
6812*c0909341SAndroid Build Coastguard Worker    mova                m%1, [rsp+64*(%1+16)]
6813*c0909341SAndroid Build Coastguard Worker%endif
    ; m10 = 64 destination bytes of the "low" row
6814*c0909341SAndroid Build Coastguard Worker    mova                m10, [dstq+%4]
    ; saturating sum/difference of each input pair
6815*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m11, m9
6816*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m9
6817*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m%1, m%2
6818*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m%2
    ; widen destination bytes to words (m<src32> is dead from here on and is
    ; reused as a temporary)
6819*c0909341SAndroid Build Coastguard Worker    punpcklbw           m%2, m10, m12
6820*c0909341SAndroid Build Coastguard Worker    punpckhbw           m10, m12
    ; round the sums and add the destination pixels
6821*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m13
6822*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m13
6823*c0909341SAndroid Build Coastguard Worker    paddw                m8, m%2
6824*c0909341SAndroid Build Coastguard Worker    paddw                m9, m10
    ; same treatment for the differences, against the mirrored row at r3
6825*c0909341SAndroid Build Coastguard Worker    mova                m10, [r3+%5]
6826*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m13
6827*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m13
    ; clear the two coefficient slots that were just consumed
6828*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*(   %3)], m12
6829*c0909341SAndroid Build Coastguard Worker    mova    [cq+64*(31-%3)], m12
6830*c0909341SAndroid Build Coastguard Worker    punpcklbw           m%2, m10, m12
6831*c0909341SAndroid Build Coastguard Worker    punpckhbw           m10, m12
    ; pack back to bytes with unsigned saturation and store both rows
6832*c0909341SAndroid Build Coastguard Worker    packuswb             m8, m9
6833*c0909341SAndroid Build Coastguard Worker    paddw               m11, m%2
6834*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m10
6835*c0909341SAndroid Build Coastguard Worker    packuswb            m11, m%1
6836*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%4], m8
6837*c0909341SAndroid Build Coastguard Worker    mova          [r3  +%5], m11
6838*c0909341SAndroid Build Coastguard Worker%if %3 == 3 || %3 == 7 || %3 == 11
6839*c0909341SAndroid Build Coastguard Worker    add                dstq, r6
6840*c0909341SAndroid Build Coastguard Worker    sub                  r3, r6
6841*c0909341SAndroid Build Coastguard Worker%endif
6842*c0909341SAndroid Build Coastguard Worker%endmacro
    ; Row numbers below are relative to dstq as it was on entry to .end.
    ; rows 0-3 and 31-28
6843*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        0, 29,  0, strideq*0, r5
6844*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        1, 28,  1, strideq*1, strideq*2
6845*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        2, 27,  2, strideq*2, strideq*1
6846*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        3, 26,  3, r5       , strideq*0
    ; rows 4-7 and 27-24
6847*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        4, 25,  4, strideq*0, r5
6848*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        5, 24,  5, strideq*1, strideq*2
6849*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        6, 23,  6, strideq*2, strideq*1
6850*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        7, 22,  7, r5       , strideq*0
    ; rows 8-11 and 23-20 (m0-m7 are reloaded from the stack from here on)
6851*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        0, 21,  8, strideq*0, r5
6852*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        1, 20,  9, strideq*1, strideq*2
6853*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        2, 19, 10, strideq*2, strideq*1
6854*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        3, 18, 11, r5       , strideq*0
    ; rows 12-15 and 19-16
6855*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        4, 17, 12, strideq*0, r5
6856*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        5, 16, 13, strideq*1, strideq*2
6857*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        6, 15, 14, strideq*2, strideq*1
6858*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_END        7, 14, 15, r5       , strideq*0
6859*c0909341SAndroid Build Coastguard Worker    RET
6860*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .dconly: scalar path that works from the single 16-bit value at [cq].
; The value is scaled twice by 181/256 (~0.7071) with rounding, and control is
; then handed to the shared .dconly2 tail of the 64x16 function.
; NOTE(review): the jump into .dconly is outside this view; presumably it is
; taken only when eob == 0 (as in the visible 64x64 entry), in which case the
; store of eobd below clears the coefficient -- confirm.
6861*c0909341SAndroid Build Coastguard Worker.dconly:
6862*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]      ; r6d = sign-extended word at cq
6863*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd           ; overwrite it with eobd (see note above)
6864*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32             ; presumably the row count expected by .dconly2 -- confirm
6865*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
6866*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128
6867*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8              ; r6d = (r6d*181 + 128) >> 8
6868*c0909341SAndroid Build Coastguard Worker    imul                r6d, 181
6869*c0909341SAndroid Build Coastguard Worker    add                 r6d, 128+256
6870*c0909341SAndroid Build Coastguard Worker    sar                 r6d, 8+1            ; r6d = (r6d*181 + 384) >> 9
6871*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
6872*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .pass1_end_part1: butterfly stage for inputs m0-m7 / m22-m29 and the scratch
; slots addressed relative to r4. It falls through into .transpose_2x8x8_hi and
; .transpose_8x8 and returns via the `ret` at the end of .transpose_8x8.
6873*c0909341SAndroid Build Coastguard Worker.pass1_end_part1:
; IDCT_64x32_PASS1_END src16, src32, src64
;   When src16 != src64, m<src16> is first loaded from cq slot src16.
;   m9 and m11 are loaded from two r4-relative scratch slots. The macro forms
;   the saturating sum and difference of m<src16>/m<src32>, then combines each
;   with m9/m11; the two branches differ only in which of each sum/difference
;   pair lands where. Two results stay in m<src16> and m<src32>; the other two
;   (m8, m9) are written back to the same scratch slots.
;   Clobbers m8, m9 and m11.
6874*c0909341SAndroid Build Coastguard Worker%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
6875*c0909341SAndroid Build Coastguard Worker%if %1 != %3
6876*c0909341SAndroid Build Coastguard Worker    mova                m%1, [cq+64*%1]
6877*c0909341SAndroid Build Coastguard Worker%endif
6878*c0909341SAndroid Build Coastguard Worker    mova                 m9, [r4+64*(%3-36)] ; idct64 32+n
6879*c0909341SAndroid Build Coastguard Worker    mova                m11, [r4+64*(-5-%3)] ; idct64 63-n
6880*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m%1, m%2        ; idct32 31-n
6881*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%2             ; idct32  0+n
6882*c0909341SAndroid Build Coastguard Worker%if %1 == %3
6883*c0909341SAndroid Build Coastguard Worker    psubsw              m%2, m8, m9   ; out 32+n e
6884*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m9       ; out 31-n d
6885*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m%1, m11 ; out 63-n h
6886*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m11      ; out  0+n a
6887*c0909341SAndroid Build Coastguard Worker%else
6888*c0909341SAndroid Build Coastguard Worker    paddsw              m%2, m8, m9   ; out 23-n c
6889*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m9       ; out 40+n f
6890*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m%1, m11 ; out  8+n b
6891*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m11      ; out 55-n g
6892*c0909341SAndroid Build Coastguard Worker%endif
    ; write the two results that are not kept in registers back to the slots
    ; they were combined with
6893*c0909341SAndroid Build Coastguard Worker    mova   [r4+64*(%3-36)], m8
6894*c0909341SAndroid Build Coastguard Worker    mova   [r4+64*(-5-%3)], m9
6895*c0909341SAndroid Build Coastguard Worker%endmacro
    ; src16 == src64 for all eight invocations, so m0-m7 are used as they
    ; arrive in registers (no reload from cq)
6896*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  0, 29,  0
6897*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  1, 28,  1
6898*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  2, 27,  2
6899*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  3, 26,  3
6900*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  4, 25,  4
6901*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  5, 24,  5
6902*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  6, 23,  6
6903*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  7, 22,  7
; .transpose_2x8x8_hi: word interleave (wd -> dq -> qdq) of m22-m29 within each
; 128-bit lane. The registers are consumed in reverse order: in the lane
; comments a=m29, b=m28, ... h=m22. m8 is scratch. Falls through into
; .transpose_8x8, which handles m0-m7.
6904*c0909341SAndroid Build Coastguard Worker.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
6905*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
6906*c0909341SAndroid Build Coastguard Worker    punpckhwd           m25, m24      ; e4 f4 e5 f5 e6 f6 e7 f7
6907*c0909341SAndroid Build Coastguard Worker    punpcklwd           m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
6908*c0909341SAndroid Build Coastguard Worker    punpckhwd           m23, m22      ; g4 h4 g5 h5 g6 h6 g7 h7
6909*c0909341SAndroid Build Coastguard Worker    punpcklwd           m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
6910*c0909341SAndroid Build Coastguard Worker    punpckhwd           m29, m28      ; a4 b4 a5 b5 a6 b6 a7 b7
6911*c0909341SAndroid Build Coastguard Worker    punpcklwd           m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
6912*c0909341SAndroid Build Coastguard Worker    punpckhwd           m27, m26      ; c4 d4 c5 d5 c6 d6 c7 d7
6913*c0909341SAndroid Build Coastguard Worker    punpckldq           m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
6914*c0909341SAndroid Build Coastguard Worker    punpckhdq           m29, m27      ; a6 b6 c6 d6 a7 b7 c7 d7
6915*c0909341SAndroid Build Coastguard Worker    punpckldq           m27, m8, m24  ; e0 f0 g0 h0 e1 f1 g1 h1
6916*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m24      ; e2 f2 g2 h2 e3 f3 g3 h3
6917*c0909341SAndroid Build Coastguard Worker    punpckhdq           m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
6918*c0909341SAndroid Build Coastguard Worker    punpckldq           m22, m28      ; a0 b0 c0 d0 a1 b1 c1 d1
6919*c0909341SAndroid Build Coastguard Worker    punpckldq           m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
6920*c0909341SAndroid Build Coastguard Worker    punpckhdq           m25, m23      ; e6 f6 g6 h6 e7 f7 g7 h7
6921*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m23, m22, m27 ;  1 23
6922*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m22, m27      ;  0 22
6923*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m27, m26, m28 ;  5 27
6924*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m26, m28      ;  4 26
6925*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m28, m29, m25 ;  6 28
6926*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m29, m25      ;  7 29
6927*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m25, m24, m8  ;  3 25
6928*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m24, m8       ;  2 24
; .transpose_8x8: the same three-step interleave (words, dwords, qwords)
; applied to m0-m7 in natural register order, per 128-bit lane. m8 is scratch.
; Also callable on its own; results are left in m0-m7.
6929*c0909341SAndroid Build Coastguard Worker.transpose_8x8:
6930*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m4, m5
6931*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5
6932*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0, m1
6933*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
6934*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6, m7
6935*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7
6936*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m2, m3
6937*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
6938*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2
6939*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
6940*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6
6941*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6
6942*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m5, m7
6943*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m7
6944*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m8, m1
6945*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m1
6946*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
6947*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
6948*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
6949*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
6950*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m7
6951*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7
6952*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m6, m8
6953*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m8
6954*c0909341SAndroid Build Coastguard Worker    ret
; .pass1_end_part2: second set of IDCT_64x32_PASS1_END butterflies. Here
; src16 != src64 for every invocation, so the macro reloads m0-m7 from cq
; slots 0-7 before combining them with m21..m14 and the r4-relative scratch
; slots for src64 = 8..15. Falls through into .transpose_2x8x8_lo.
6955*c0909341SAndroid Build Coastguard Worker.pass1_end_part2:
6956*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  0, 21,  8
6957*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  1, 20,  9
6958*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  2, 19, 10
6959*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  3, 18, 11
6960*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  4, 17, 12
6961*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  5, 16, 13
6962*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  6, 15, 14
6963*c0909341SAndroid Build Coastguard Worker    IDCT_64x32_PASS1_END  7, 14, 15
; .transpose_2x8x8_lo: two word interleaves (wd -> dq -> qdq), per 128-bit
; lane, with m8 as scratch for both.
;   First half : m0-m7 consumed in reverse register order; same instruction
;                sequence as .transpose_2x8x8_hi with register numbers
;                lowered by 22.
;   Second half: m14-m21 in natural order; same instruction sequence as
;                .transpose_8x8 with register numbers raised by 14.
6964*c0909341SAndroid Build Coastguard Worker.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
6965*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m3, m2
6966*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
6967*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
6968*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
6969*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7, m6
6970*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m6
6971*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m5, m4
6972*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m4
6973*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m7, m5
6974*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m5
6975*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m8, m2
6976*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m2
6977*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m0, m6
6978*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m6
6979*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m3, m1
6980*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m1
6981*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m5
6982*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m5
6983*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m4, m6
6984*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m6
6985*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7, m3
6986*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m3
6987*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m2, m8
6988*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m8
    ; second half: m14-m21 (m8 is free again at this point)
6989*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m18, m19
6990*c0909341SAndroid Build Coastguard Worker    punpcklwd           m18, m19
6991*c0909341SAndroid Build Coastguard Worker    punpckhwd           m19, m14, m15
6992*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15
6993*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m20, m21
6994*c0909341SAndroid Build Coastguard Worker    punpcklwd           m20, m21
6995*c0909341SAndroid Build Coastguard Worker    punpckhwd           m21, m16, m17
6996*c0909341SAndroid Build Coastguard Worker    punpcklwd           m16, m17
6997*c0909341SAndroid Build Coastguard Worker    punpckhdq           m17, m14, m16
6998*c0909341SAndroid Build Coastguard Worker    punpckldq           m14, m16
6999*c0909341SAndroid Build Coastguard Worker    punpckldq           m16, m18, m20
7000*c0909341SAndroid Build Coastguard Worker    punpckhdq           m18, m20
7001*c0909341SAndroid Build Coastguard Worker    punpckhdq           m20, m19, m21
7002*c0909341SAndroid Build Coastguard Worker    punpckldq           m19, m21
7003*c0909341SAndroid Build Coastguard Worker    punpckldq           m21, m8, m15
7004*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m15
7005*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m14, m16
7006*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m16
7007*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m16, m17, m18
7008*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m17, m18
7009*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m18, m19, m21
7010*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m19, m21
7011*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m21, m20, m8
7012*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m20, m8
7013*c0909341SAndroid Build Coastguard Worker    ret
; .pass2_fast: regroups 128-bit lanes from sixteen source registers (m0-m6,
; m8, m9, m11, m15-m20) into the sixteen registers annotated 0-15 below, then
; tail-jumps into the 32x16 .main_oddhalf_fast routine. Because this is a jmp
; and callers reach .pass2_fast with `call`, the target's return presumably
; goes straight back to that caller.
; Each source pair is split in two: with x86inc's qXXXX immediates, q2020 picks
; lanes 0 and 2 of each source and q3131 picks lanes 1 and 3.
; Instruction order matters: several destinations (m15, m1, m3, m17) are still
; needed as sources by the instruction that follows or by later ones, so the
; q3131 half is always written to a register that is no longer a pending source.
7014*c0909341SAndroid Build Coastguard Worker.pass2_fast:
7015*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m9, m15, q3131  ;  5
7016*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m9, m15, q2020  ;  1
7017*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m1, m16, q3131  ;  6
7018*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m1, m16, q2020  ;  2
7019*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m0, m3, q3131   ;  4
7020*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m3, q2020       ;  0
7021*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m8, m2, q3131   ; 12
7022*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m8, m2, q2020   ;  8
7023*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m11, m17, q3131 ;  7
7024*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m11, m17, q2020 ;  3
7025*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m5, m19, q3131  ; 14
7026*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m5, m19, q2020  ; 10
7027*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m6, m20, q3131  ; 15
7028*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m6, m20, q2020  ; 11
7029*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m4, m18, q3131  ; 13
7030*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m4, m18, q2020  ;  9
7031*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
7032*c0909341SAndroid Build Coastguard Worker
7033*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
7034*c0909341SAndroid Build Coastguard Worker    lea                  r5, [o_base]
7035*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
7036*c0909341SAndroid Build Coastguard Worker    jz .dconly
7037*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 7, 30, 64*96, dst, stride, c, eob
7038*c0909341SAndroid Build Coastguard Worker%undef cmp
7039*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 136
7040*c0909341SAndroid Build Coastguard Worker    jb .fast
7041*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 1]
7042*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*31]
7043*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*17]
7044*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*15]
7045*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
7046*c0909341SAndroid Build Coastguard Worker    mov                  r4, rsp
7047*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7048*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 7]
7049*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*25]
7050*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*23]
7051*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64* 9]
7052*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7053*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 5]
7054*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*27]
7055*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*21]
7056*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*11]
7057*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7058*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 3]
7059*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*29]
7060*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*19]
7061*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*13]
7062*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7063*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
7064*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 0]
7065*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64* 8]
7066*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*16]
7067*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*24]
7068*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+64* 4]
7069*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+64*12]
7070*c0909341SAndroid Build Coastguard Worker    mova                m16, [cq+64*20]
7071*c0909341SAndroid Build Coastguard Worker    mova                m17, [cq+64*28]
7072*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
7073*c0909341SAndroid Build Coastguard Worker    mova                m22, [cq+64* 2]
7074*c0909341SAndroid Build Coastguard Worker    mova                m29, [cq+64*30]
7075*c0909341SAndroid Build Coastguard Worker    mova                m26, [cq+64*18]
7076*c0909341SAndroid Build Coastguard Worker    mova                m25, [cq+64*14]
7077*c0909341SAndroid Build Coastguard Worker    mova                m24, [cq+64*10]
7078*c0909341SAndroid Build Coastguard Worker    mova                m27, [cq+64*22]
7079*c0909341SAndroid Build Coastguard Worker    mova                m28, [cq+64*26]
7080*c0909341SAndroid Build Coastguard Worker    mova                m23, [cq+64* 6]
7081*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
7082*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m15
7083*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m16
7084*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m17
7085*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m18
7086*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m19
7087*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m20
7088*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m21
7089*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
7090*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_8192)]
7091*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
7092*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*36], m1
7093*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*37], m3
7094*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*38], m5
7095*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*39], m7
7096*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*44], m23
7097*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*45], m25
7098*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*46], m27
7099*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*47], m29
7100*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m13, m0 ; a0
7101*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m13, m2 ; a2
7102*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m13, m4 ; a4
7103*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m13, m6 ; a6
7104*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
7105*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r4-64*4]
7106*c0909341SAndroid Build Coastguard Worker    add                  r4, 64*28
7107*c0909341SAndroid Build Coastguard Worker    call .pass2_end
7108*c0909341SAndroid Build Coastguard Worker    mov                  r4, rsp
7109*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r4+64*23]
7110*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4+64*22]
7111*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r4+64*21]
7112*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r4+64*20]
7113*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r4+64*19]
7114*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r4+64*18]
7115*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4+64*17]
7116*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r4+64*16]
7117*c0909341SAndroid Build Coastguard Worker    mova                m22, [r4+64*15]
7118*c0909341SAndroid Build Coastguard Worker    mova                m23, [r4+64*14]
7119*c0909341SAndroid Build Coastguard Worker    mova                m24, [r4+64*13]
7120*c0909341SAndroid Build Coastguard Worker    mova                m25, [r4+64*12]
7121*c0909341SAndroid Build Coastguard Worker    mova                m26, [r4+64*11]
7122*c0909341SAndroid Build Coastguard Worker    mova                m27, [r4+64*10]
7123*c0909341SAndroid Build Coastguard Worker    mova                m28, [r4+64* 9]
7124*c0909341SAndroid Build Coastguard Worker    mova                m29, [r4+64* 8]
7125*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
7126*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_8192)]
7127*c0909341SAndroid Build Coastguard Worker    mova         [r4+64* 8], m1
7128*c0909341SAndroid Build Coastguard Worker    mova         [r4+64* 9], m3
7129*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*10], m5
7130*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*11], m7
7131*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*16], m23
7132*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*17], m25
7133*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*18], m27
7134*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*19], m29
7135*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m13, m0 ; b0
7136*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m13, m2 ; b2
7137*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m13, m4 ; b4
7138*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m13, m6 ; b6
7139*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r4+64*31]
7140*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4+64*30]
7141*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r4+64*29]
7142*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r4+64*28]
7143*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r4+64*27]
7144*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r4+64*26]
7145*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r4+64*25]
7146*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r4+64*24]
7147*c0909341SAndroid Build Coastguard Worker    mova                m14, [r4+64* 7]
7148*c0909341SAndroid Build Coastguard Worker    mova                m15, [r4+64* 6]
7149*c0909341SAndroid Build Coastguard Worker    mova                m16, [r4+64* 5]
7150*c0909341SAndroid Build Coastguard Worker    mova                m17, [r4+64* 4]
7151*c0909341SAndroid Build Coastguard Worker    mova                m18, [r4+64* 3]
7152*c0909341SAndroid Build Coastguard Worker    mova                m19, [r4+64* 2]
7153*c0909341SAndroid Build Coastguard Worker    mova                m20, [r4+64* 1]
7154*c0909341SAndroid Build Coastguard Worker    mova                m21, [r4+64* 0]
7155*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
7156*c0909341SAndroid Build Coastguard Worker    mov                  r6, cq
7157*c0909341SAndroid Build Coastguard Worker    call .pass2_end
7158*c0909341SAndroid Build Coastguard Worker    jmp .end
7159*c0909341SAndroid Build Coastguard Worker.fast: ; bottom/right halves are zero
7160*c0909341SAndroid Build Coastguard Worker    mova                m28, [o(dup16_perm)]
7161*c0909341SAndroid Build Coastguard Worker    pmovzxwd             m9,       [cq+64* 0]
7162*c0909341SAndroid Build Coastguard Worker    vpermb               m8, m28,  [cq+64* 4]
7163*c0909341SAndroid Build Coastguard Worker    vpermb              ym1, ym28, [cq+64*12]
7164*c0909341SAndroid Build Coastguard Worker    vpermb               m7, m28,  [cq+64* 8]
7165*c0909341SAndroid Build Coastguard Worker    pslld                m9, 16
7166*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main_fast2
7167*c0909341SAndroid Build Coastguard Worker    vpermb              m21, m28,  [cq+64* 2]
7168*c0909341SAndroid Build Coastguard Worker    vpermb             ym15, ym28, [cq+64*14]
7169*c0909341SAndroid Build Coastguard Worker    vpermb             ym18, ym28, [cq+64*10]
7170*c0909341SAndroid Build Coastguard Worker    vpermb              m14, m28,  [cq+64* 6]
7171*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
7172*c0909341SAndroid Build Coastguard Worker    vpermb              m22, m28,  [cq+64* 1]
7173*c0909341SAndroid Build Coastguard Worker    vpermb             ym29, ym28, [cq+64*15]
7174*c0909341SAndroid Build Coastguard Worker    vpermb             ym26, ym28, [cq+64* 9]
7175*c0909341SAndroid Build Coastguard Worker    vpermb              m25, m28,  [cq+64* 7]
7176*c0909341SAndroid Build Coastguard Worker    vpermb              m24, m28,  [cq+64* 5]
7177*c0909341SAndroid Build Coastguard Worker    vpermb             ym27, ym28, [cq+64*11]
7178*c0909341SAndroid Build Coastguard Worker    vpermb              m23, m28,  [cq+64* 3]
7179*c0909341SAndroid Build Coastguard Worker    vpermb             ym28, ym28, [cq+64*13]
7180*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m14
7181*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 1], m15
7182*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 2], m16
7183*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 3], m17
7184*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 4], m18
7185*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 5], m19
7186*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 6], m20
7187*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 7], m21
7188*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
7189*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_8192)]
7190*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*16], m4
7191*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*17], m5
7192*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*18], m6
7193*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*19], m7
7194*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*28], m26
7195*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*29], m27
7196*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*30], m28
7197*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*31], m29
7198*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
7199*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*20], m22
7200*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*21], m23
7201*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*22], m24
7202*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*23], m25
7203*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*24], m26
7204*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*25], m27
7205*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*26], m28
7206*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*27], m29
7207*c0909341SAndroid Build Coastguard Worker    lea                  r4, [rsp+64*64]
7208*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rsp+64*32]
7209*c0909341SAndroid Build Coastguard Worker    call .pass2_fast
7210*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13, [cq+64*16]
7211*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m13, [cq+64*17]
7212*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13, [cq+64*18]
7213*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m13, [cq+64*19]
7214*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m13, [cq+64*20]
7215*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m13, [cq+64*21]
7216*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m13, [cq+64*22]
7217*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m13, [cq+64*23]
7218*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m13, [cq+64*24]
7219*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m13, [cq+64*25]
7220*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m13, [cq+64*26]
7221*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m13, [cq+64*27]
7222*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m13, [cq+64*28]
7223*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m13, [cq+64*29]
7224*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m13, [cq+64*30]
7225*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m13, [cq+64*31]
7226*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
7227*c0909341SAndroid Build Coastguard Worker    mov                  r4, rsp
7228*c0909341SAndroid Build Coastguard Worker    mov                  r3, cq
7229*c0909341SAndroid Build Coastguard Worker    call .pass2_fast
7230*c0909341SAndroid Build Coastguard Worker.end:
7231*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m17, [o(pw_2048)]
7232*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*8]
7233*c0909341SAndroid Build Coastguard Worker    mov                  r3, dstq
7234*c0909341SAndroid Build Coastguard Worker    pxor                m16, m16
7235*c0909341SAndroid Build Coastguard Worker    sub                  r4, 64*5 ; rsp+64*31
7236*c0909341SAndroid Build Coastguard Worker    mov                  r6, rsp
7237*c0909341SAndroid Build Coastguard Worker.end_loop:
7238*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r6+64*32] ; idct16 0+n  lo
7239*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r6+64*48] ; idct32 31-n lo
7240*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64* 0] ; idct16 0+n  hi
7241*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64*16] ; idct32 31-n hi
7242*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r4+64*64] ; idct64 63-n lo
7243*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r4+64* 0] ; idct64 63-n hi
7244*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r6+64*64] ; idct64 32+n lo
7245*c0909341SAndroid Build Coastguard Worker    mova                 m8, [r6+64* 0] ; idct64 32+n hi
7246*c0909341SAndroid Build Coastguard Worker    sub                  r3, strideq
7247*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2, m7     ; idct32  0+n lo
7248*c0909341SAndroid Build Coastguard Worker    mova                m12, [dstq+r5*0]
7249*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m7         ; idct32 31-n lo
7250*c0909341SAndroid Build Coastguard Worker    mova                m15, [r3  +r5*8]
7251*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m6, m0     ; idct32  0+n hi
7252*c0909341SAndroid Build Coastguard Worker    mova                m13, [r3  +r5*4]
7253*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m0         ; idct32 31-n hi
7254*c0909341SAndroid Build Coastguard Worker    mova                m14, [dstq+r5*4]
7255*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3, m4     ; out  0+n lo
7256*c0909341SAndroid Build Coastguard Worker    add                  r6, 64
7257*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m4         ; out 63-n lo
7258*c0909341SAndroid Build Coastguard Worker    sub                  r4, 64
7259*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m7, m1     ; out  0+n hi
7260*c0909341SAndroid Build Coastguard Worker    mova         [cq+64* 0], m16
7261*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m1         ; out 63-n hi
7262*c0909341SAndroid Build Coastguard Worker    mova         [cq+64*16], m16
7263*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m5     ; out 31-n lo
7264*c0909341SAndroid Build Coastguard Worker    add                  cq, 64
7265*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m5         ; out 32+n lo
7266*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m6, m8     ; out 31-n hi
7267*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m8         ; out 32+n hi
7268*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m17
7269*c0909341SAndroid Build Coastguard Worker    punpcklbw            m8, m12, m16
7270*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m17
7271*c0909341SAndroid Build Coastguard Worker    punpckhbw           m12, m16
7272*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m17
7273*c0909341SAndroid Build Coastguard Worker    punpcklbw           m11, m15, m16
7274*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m17
7275*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m16
7276*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m17
7277*c0909341SAndroid Build Coastguard Worker    punpcklbw            m9, m13, m16
7278*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m17
7279*c0909341SAndroid Build Coastguard Worker    punpckhbw           m13, m16
7280*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m17
7281*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m14, m16
7282*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m17
7283*c0909341SAndroid Build Coastguard Worker    punpckhbw           m14, m16
7284*c0909341SAndroid Build Coastguard Worker    paddw                m0, m8
7285*c0909341SAndroid Build Coastguard Worker    paddw                m4, m12
7286*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m4
7287*c0909341SAndroid Build Coastguard Worker    paddw                m3, m11
7288*c0909341SAndroid Build Coastguard Worker    paddw                m7, m15
7289*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m7
7290*c0909341SAndroid Build Coastguard Worker    paddw                m1, m9
7291*c0909341SAndroid Build Coastguard Worker    paddw                m5, m13
7292*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m5
7293*c0909341SAndroid Build Coastguard Worker    paddw                m2, m10
7294*c0909341SAndroid Build Coastguard Worker    paddw                m6, m14
7295*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m6
7296*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r5*0], m0
7297*c0909341SAndroid Build Coastguard Worker    mova        [r3  +r5*8], m3
7298*c0909341SAndroid Build Coastguard Worker    mova        [r3  +r5*4], m1
7299*c0909341SAndroid Build Coastguard Worker    mova        [dstq+r5*4], m2
7300*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
7301*c0909341SAndroid Build Coastguard Worker    cmp                  r6, r4
7302*c0909341SAndroid Build Coastguard Worker    jb .end_loop
7303*c0909341SAndroid Build Coastguard Worker    RET
; .dconly: DC-only tail of the 64x64 inverse transform.
; Loads the 16-bit coefficient at [cq] sign-extended into r6d, overwrites that
; slot with eobd, ORs 64 into r3d and tail-jumps into the .dconly code of the
; 64x16 function, which does the remaining work.
; NOTE(review): the branch that reaches this label is outside this view;
; presumably it is taken only when eob == 0, so the store clears the
; coefficient and r3d ends up holding 64 for the shared code - confirm.
7304*c0909341SAndroid Build Coastguard Worker.dconly:
7305*c0909341SAndroid Build Coastguard Worker    movsx               r6d, word [cq]
7306*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
7307*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
7308*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
7309*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .pass2_end: shared second-pass helper. Entered via call; it leaves through a
; tail jump to the 32x64 function's .main_part2, so that routine's return goes
; straight back to the caller of .pass2_end.
;
; Inputs, as used by the code below:
;   m0-m7, m14-m29  intermediate rows. The twelve registers named in the REPX
;                   line are scaled by m13 here; m23/m25/m27/m29 are consumed
;                   as-is (the call site visible in this file multiplies them
;                   by pw_8192 before calling).
;   m13             pmulhrsw multiplier on entry. It is reused as a scratch
;                   register below and reloaded with pw_8192 afterwards.
;   r4              scratch base. Slots 64*12-15 and 64*20-23 are written
;                   here; slots 64*8-11 and 64*16-19 must already hold data,
;                   since they are only read.
;   r6              output base. 32 rows of 64 bytes are stored at 64*0-31.
; NOTE(review): the register contracts of the called main_oddhalf_fast,
; main_part1 and main_part2 routines are defined elsewhere - confirm there.
7310*c0909341SAndroid Build Coastguard Worker.pass2_end:
    ; Scale the listed rows by m13.
7311*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
    ; Spill m1/m3/m5/m7; they are reloaded (and scaled) from r4 further down.
7312*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*20], m1
7313*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*21], m3
7314*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*22], m5
7315*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*23], m7
    ; Pair up registers: vinserti32x8 joins the two low 256-bit halves,
    ; vshufi32x4 q3232 joins the two high 256-bit halves.
7316*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m1, m23, ym14, 1    ; a00 a01 c00 c01
7317*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m23, m14, q3232 ; a02 a03 c02 c03
7318*c0909341SAndroid Build Coastguard Worker    vinserti32x8         m5, m22, ym0, 1     ; e00 e01 g00 g01
7319*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m22, m0, q3232  ; e02 e03 g02 g03
    ; Spill m15/m17/m19/m21 before those registers are reused below.
7320*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*12], m15
7321*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*13], m17
7322*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*14], m19
7323*c0909341SAndroid Build Coastguard Worker    mova         [r4+64*15], m21
7324*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m15, m27, ym18, 1    ; a40 a41 c40 c41
7325*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m27, m18, q3232 ; a42 a43 c42 c43
7326*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, m26, ym4, 1     ; e40 e41 g40 g41
7327*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m26, m4, q3232  ; e42 e43 g42 g43
7328*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, m25, ym16, 1    ; a20 a21 c20 c21
7329*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m25, m16, q3232 ; a22 a23 c22 c23
7330*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m25, m24, ym2, 1     ; e20 e21 g20 g21
7331*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m24, m2, q3232  ; e22 e23 g22 g23
7332*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m23, m29, ym20, 1    ; a60 a61 c60 c61
7333*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m20, q3232      ; a62 a63 c62 c63
    ; m13 (the multiplier) is overwritten here and used as scratch until the
    ; pw_8192 reload below. m28 must be read by this line before the next
    ; line rewrites it.
7334*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m13, m28, m6, q3232  ; e62 e63 g62 g63
7335*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m28, ym6, 1          ; e60 e61 g60 g61
    ; Build the rows annotated 0/8/16/24 and 4/12/20/28 ahead of the 32x16
    ; odd-half routine.
7336*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m1, m5, q2020   ;  0
7337*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m5, q3131       ;  8
7338*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m3, m14, q2020  ; 16
7339*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m14, q3131      ; 24
7340*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m15, m18, q2020 ;  4
7341*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m18, q3131      ; 12
7342*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m17, m19, q2020 ; 20
7343*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m19, q3131      ; 28
7344*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    ; Build the rows annotated 2, 6, 10, 14, 18, 22, 26 and 30 in m22-m29.
7345*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m22, m25, q3131 ; 10
7346*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m25, q2020      ;  2
7347*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m23, m28, q3131 ; 14
7348*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m28, q2020      ;  6
7349*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m26, m27, q3131 ; 26
7350*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m27, q2020      ; 18
7351*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m29, m13, q2020 ; 22
7352*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m13, q3131      ; 30
    ; Store m0-m7 and m14-m21 to output slots 0-15.
7353*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 0], m0
7354*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 1], m1
7355*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 2], m2
7356*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 3], m3
7357*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 4], m4
7358*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 5], m5
7359*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 6], m6
7360*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 7], m7
7361*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 8], m14
7362*c0909341SAndroid Build Coastguard Worker    mova         [r6+64* 9], m15
7363*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*10], m16
7364*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*11], m17
7365*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*12], m18
7366*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*13], m19
7367*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*14], m20
7368*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*15], m21
7369*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    ; Reload the multiplier, which was overwritten by the shuffle above.
7370*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_8192)]
    ; Store m29 down to m14 into output slots 16-31 (descending register
    ; order).
7371*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*16], m29
7372*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*17], m28
7373*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*18], m27
7374*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*19], m26
7375*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*20], m25
7376*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*21], m24
7377*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*22], m23
7378*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*23], m22
7379*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*24], m21
7380*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*25], m20
7381*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*26], m19
7382*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*27], m18
7383*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*28], m17
7384*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*29], m16
7385*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*30], m15
7386*c0909341SAndroid Build Coastguard Worker    mova         [r6+64*31], m14
    ; Odd rows: reload from the r4 scratch slots, applying the m13 scaling as
    ; part of the load, then regroup lanes as in the even-row path above.
7387*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m13, [r4+64* 8] ;  1  9 17 25
7388*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m16, m13, [r4+64*12]
7389*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m17, m13, [r4+64*16]
7390*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m18, m13, [r4+64*20]
7391*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m19, m13, [r4+64*11] ;  7 15 23 31
7392*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m20, m13, [r4+64*15]
7393*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m21, m13, [r4+64*19]
7394*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m22, m13, [r4+64*23]
7395*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m14, m15, ym16, 1 ; a1  a9  c1  c9
7396*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m16, q3232   ; a17 a25 c17 c25
7397*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m16, m17, ym18, 1 ; e1  e9  g1  g9
7398*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m18, q3232   ; e17 e25 g17 g25
7399*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m23, m13, [r4+64*10] ;  5 13 21 29
7400*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m24, m13, [r4+64*14]
7401*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m25, m13, [r4+64*18]
7402*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m26, m13, [r4+64*22]
7403*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m18, m19, ym20, 1 ; a7  a15 c7  c15
7404*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m20, q3232   ; a23 a31 c23 c31
7405*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m20, m21, ym22, 1 ; e7  e15 g7  g15
7406*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m21, m22, q3232   ; e23 e31 g23 g31
7407*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m27, m13, [r4+64* 9] ;  3 11 19 27
7408*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m28, m13, [r4+64*13]
7409*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m29, m13, [r4+64*17]
    ; Last use of the multiplier: m13 itself receives the scaled row.
7410*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13,      [r4+64*21]
    ; Four main_part1 calls follow; m0-m3 are loaded with a fresh group of
    ; four rows before each one (row numbers per the trailing annotations).
7411*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m14, m16, q2020 ;  1
7412*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m19, m21, q3131 ; 31
7413*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m15, m17, q2020 ; 17
7414*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m18, m20, q3131 ; 15
7415*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7416*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m18, m20, q2020 ;  7
7417*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m15, m17, q3131 ; 25
7418*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m19, m21, q2020 ; 23
7419*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m14, m16, q3131 ;  9
7420*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7421*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m22, m23, ym24, 1 ; a5  a13 c5  c13
7422*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m24, q3232   ; a21 a29 c21 c29
7423*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m24, m25, ym26, 1 ; e5  e13 g5  g13
7424*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m26, q3232   ; e21 e29 g21 g29
7425*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m26, m27, ym28, 1 ; a3  a11 c3  c11
7426*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m28, q3232   ; a19 a27 c19 c27
7427*c0909341SAndroid Build Coastguard Worker    vinserti32x8        m28, m29, ym13, 1 ; e3  e11 g3  g11
7428*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m13, q3232   ; e19 e27 g19 g27
7429*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m22, m24, q2020 ;  5
7430*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m27, m29, q3131 ; 27
7431*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m23, m25, q2020 ; 21
7432*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m26, m28, q3131 ; 11
7433*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
7434*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m26, m28, q2020 ;  3
7435*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m1, m23, m25, q3131 ; 29
7436*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m2, m27, m29, q2020 ; 19
7437*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m22, m24, q3131 ; 13
7438*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    ; Tail call: main_part2 returns directly to the caller of .pass2_end.
7439*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
7440*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .pass2_fast: second-pass helper used by the .fast path. Entered via call and
; returns with ret.
;
; Inputs, as used by the code below:
;   m0-m6, m8, m9, m11, m15-m20  source registers for the 128-bit lane
;                                shuffles that build the rows annotated 0-15.
;   r3                           output base. 32 rows of 64 bytes are stored
;                                at 64*0-31.
; NOTE(review): both visible callers set r4 right before the call, but r4 is
; not referenced in this routine; presumably main_part1_fast / main_part2 use
; it - confirm in the 32x64 function.
7441*c0909341SAndroid Build Coastguard Worker.pass2_fast:
    ; Gather rows 0-15 (numbers per the trailing annotations). The order of
    ; these shuffles matters: m0, m3 and m15-m19 are both sources and
    ; destinations, and each is overwritten only after its last read as a
    ; source.
7442*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m23, m1, m16, q3131  ;  6
7443*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m22, m1, m16, q2020  ;  2
7444*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m14, m0, m3, q3131   ;  4
7445*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m26, m0, m3, q2020   ;  0
7446*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m28, m9, m15, q3131  ;  5
7447*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m0, m9, m15, q2020  ;  1
7448*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m16, m11, m17, q3131 ;  7
7449*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m29, m11, m17, q2020 ;  3
7450*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m15, m8, m2, q3131   ; 12
7451*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m27, m8, m2, q2020   ;  8
7452*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m25, m5, m19, q3131  ; 14
7453*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m24, m5, m19, q2020  ; 10
7454*c0909341SAndroid Build Coastguard Worker    vshufi32x4           m3, m6, m20, q3131  ; 15
7455*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m19, m6, m20, q2020  ; 11
7456*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m17, m4, m18, q3131  ; 13
7457*c0909341SAndroid Build Coastguard Worker    vshufi32x4          m18, m4, m18, q2020  ;  9
    ; Four main_part1_fast calls, with m0/m3 reloaded before each one. Going
    ; by the row annotations above, the pairs are (1, 15), (7, 9), (5, 11)
    ; and (3, 13).
7458*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7459*c0909341SAndroid Build Coastguard Worker    mova                 m0, m16
7460*c0909341SAndroid Build Coastguard Worker    mova                 m3, m18
7461*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7462*c0909341SAndroid Build Coastguard Worker    mova                 m0, m28
7463*c0909341SAndroid Build Coastguard Worker    mova                 m3, m19
7464*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7465*c0909341SAndroid Build Coastguard Worker    mova                 m0, m29
7466*c0909341SAndroid Build Coastguard Worker    mova                 m3, m17
7467*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
7468*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
    ; Move rows 0 and 8 (held in m26/m27 since the shuffles above) into
    ; m0/m1 for the 32x16 odd-half routine.
    ; NOTE(review): presumably that routine also consumes m14/m15 (rows 4 and
    ; 12) - confirm against its definition.
7469*c0909341SAndroid Build Coastguard Worker    mova                 m0, m26
7470*c0909341SAndroid Build Coastguard Worker    mova                 m1, m27
7471*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    ; Store m0-m7 and m14-m21 to output slots 0-15.
7472*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 0], m0
7473*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 1], m1
7474*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 2], m2
7475*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 3], m3
7476*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 4], m4
7477*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 5], m5
7478*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 6], m6
7479*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 7], m7
7480*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 8], m14
7481*c0909341SAndroid Build Coastguard Worker    mova         [r3+64* 9], m15
7482*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*10], m16
7483*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*11], m17
7484*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*12], m18
7485*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*13], m19
7486*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*14], m20
7487*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*15], m21
    ; NOTE(review): presumably this consumes m22-m25 (rows 2, 6, 10 and 14
    ; from the shuffles above) - confirm against the 32x32 function.
7488*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
    ; Store m29 down to m14 into output slots 16-31 (descending register
    ; order).
7489*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*16], m29
7490*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*17], m28
7491*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*18], m27
7492*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*19], m26
7493*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*20], m25
7494*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*21], m24
7495*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*22], m23
7496*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*23], m22
7497*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*24], m21
7498*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*25], m20
7499*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*26], m19
7500*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*27], m18
7501*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*28], m17
7502*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*29], m16
7503*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*30], m15
7504*c0909341SAndroid Build Coastguard Worker    mova         [r3+64*31], m14
7505*c0909341SAndroid Build Coastguard Worker    ret
7506*c0909341SAndroid Build Coastguard Worker
7507*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
7508