; Copyright © 2020-2023, VideoLAN and dav1d authors
; Copyright © 2020-2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; 64-byte permutation tables (byte indices)
const \
dup16_perm,  db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
             db  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
             db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
             db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
const \
int8_permA,  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
int8_permB:  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
int16_perm:  db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
             db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
             db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
             db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
idtx_16x4p:  db  0,  1,  4,  5, 16, 17, 20, 21,  2,  3,  6,  7, 18, 19, 22, 23
             db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
             db  8,  9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
             db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
idct_8x32p:  db 60, 61,  4,  5, 32, 33,  0,  1, 28, 29, 36, 37, 56, 57,  8,  9
             db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
             db 62, 63,  2,  3,  6,  7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
             db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
idct_16x32p: db  6,  7, 58, 59, 38, 39, 26, 27, 32, 33,  0,  1, 30, 31, 34, 35
             db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
             db 62, 63,  2,  3, 48, 49, 16, 17, 56, 57,  8,  9, 14, 15, 50, 51
             db 54, 55, 10, 11, 60, 61,  4,  5, 12, 13, 52, 53, 28, 29, 36, 37
end_16x32p:  db  0, 32,  1, 48,  2, 36,  3, 52, 16, 40, 17, 56, 18, 44, 19, 60
             db  4, 33,  5, 49,  6, 37,  7, 53, 20, 41, 21, 57, 22, 45, 23, 61
             db  8, 35,  9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
             db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62

; packed 4-bit qword shuffle indices
permA:       dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
             dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
             dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
             dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
permB:       dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
             dq 0xc824352d56128751, 0xd906171e74301e15
             dq 0x6271604b03472d62, 0x735342782165b426
             dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
permC:       dq 0x9d409d041551c2e0, 0xbf62bf263773a486
             dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
             dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
             dq 0x5115049dd9045b79, 0x733726bffb263d1f
; NOTE: the upper dword of every permD entry doubles as a word-pair/dword
; constant; see the pw_*/pd_m1 aliases defined below.
permD:       dq 0x0cda098800041504, 0x0edb09b2028c3726
             dq 0x0f11fa9c01150415, 0x0988f326039d2637
             dq 0x05640f1108269d8c, 0x05290edb0aaebfae
             dq 0x0005000509378c9d, 0xffffffff0bbfaebf

pd_0to15:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
gather8a:    dd  0,  2,  1,  3,  8, 10,  9, 11
gather8b:    dd  0,  1,  4,  5,  8,  9, 12, 13
gather8c:    dd  0,  4,  2,  6, 12,  8, 14, 10
gather8d:    dd  0, 19,  1, 18,  2, 17,  3, 16

int_shuf1:   db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
int_shuf2:   db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
int_shuf3:   db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
int_shuf4:   db  8,  9,  0,  1, 12, 13,  4,  5, 10, 11,  2,  3, 14, 15,  6,  7
deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
int_mshift:  db 12, 20,  0,  0, 44, 52,  0,  0

pb_32:           times 4 db 32
pw_2048:         times 2 dw 2048
pw_4096:         times 2 dw 4096
pw_8192:         times 2 dw 8192
pw_16384:        times 2 dw 16384
pw_1697x16:      times 2 dw 1697*16
pw_1697x8:       times 2 dw 1697*8
pw_2896x8:       times 2 dw 2896*8
pd_2048:         dd  2048

; constants that live inside permD (upper dword of each qword entry)
%define pw_5          (permD+52)
%define pd_m1         (permD+60)
%define pw_3803_1321  (permD+44)
%define pw_2482_3803  (permD+12)
%define pw_2440_3290  (permD+ 4)
%define pw_m3290_2440 (permD+28)
%define pw_3857_1380  (permD+36)
%define pw_m1380_3857 (permD+20)

pw_8192_m8192:   dw   8192,  -8192
pw_m8192_8192:   dw  -8192,   8192
pw_16384_m16384: dw  16384, -16384
pw_m16384_16384: dw -16384,  16384

pw_m1321_2482:   dw  -1321,  2482
pw_m3344_3344:   dw  -3344,  3344
pw_2482_3344:    dw   2482,  3344
pw_m3803_3344:   dw  -3803,  3344
pd_3344:         dd   3344
pw_m1321_m3344:  dw  -1321, -3344
pw_2896_m2896:   dw   2896, -2896

pw_1567_m3784:   dw   1567, -3784
pw_3784_m1567:   dw   3784, -1567
pw_4017_m799:    dw   4017,  -799
pw_2276_m3406:   dw   2276, -3406
pw_m799_m4017:   dw   -799, -4017
pw_m3406_m2276:  dw  -3406, -2276

; COEF_PAIR a, b[, neg]
; Emits the labelled word pairs pw_a_b = {a, b} and pw_mb_a = {-b, a};
; when the optional third argument is nonzero, also pw_ma_mb = {-a, -b}.
%macro COEF_PAIR 2-3 0
pw_%1_%2:   dw  %1,  %2
pw_m%2_%1:  dw -%2,  %1
%if %3
pw_m%1_m%2: dw -%1, -%2
%endif
%endmacro

COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784, 1
COEF_PAIR 3784, 1567
COEF_PAIR  201, 4091
COEF_PAIR  995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3035, 2751
COEF_PAIR 3513, 2106
COEF_PAIR 4052,  601
COEF_PAIR 3166, 2598, 1
COEF_PAIR 3920, 1189, 1
COEF_PAIR 2276, 3406
COEF_PAIR 4017,  799

; COEF_X8 c0[, c1, ...]
; For every argument, emits the value multiplied by 8 as a duplicated word
; pair (unlabelled; the caller supplies the label).
%macro COEF_X8 1-*
%rep %0
    dw %1*8, %1*8
    %rotate 1
%endrep
%endmacro

pw_m2276x8: COEF_X8 -2276
pw_3406x8:  COEF_X8  3406
pw_4017x8:  COEF_X8  4017
pw_799x8:   COEF_X8   799
pw_3784x8:  COEF_X8  3784
pw_1567x8:  COEF_X8  1567

pw_4076x8:  COEF_X8  4076
pw_401x8:   COEF_X8   401
pw_m2598x8: COEF_X8 -2598
pw_3166x8:  COEF_X8  3166
pw_3612x8:  COEF_X8  3612
pw_1931x8:  COEF_X8  1931
pw_m1189x8: COEF_X8 -1189
pw_3920x8:  COEF_X8  3920

pw_4091x8:  COEF_X8  4091
pw_201x8:   COEF_X8   201
pw_m2751x8: COEF_X8 -2751
pw_3035x8:  COEF_X8  3035
pw_3703x8:  COEF_X8  3703
pw_1751x8:  COEF_X8  1751
pw_m1380x8: COEF_X8 -1380
pw_3857x8:  COEF_X8  3857
pw_3973x8:  COEF_X8  3973
pw_995x8:   COEF_X8   995
pw_m2106x8: COEF_X8 -2106
pw_3513x8:  COEF_X8  3513
pw_3290x8:  COEF_X8  3290
pw_2440x8:  COEF_X8  2440
pw_m601x8:  COEF_X8  -601
pw_4052x8:  COEF_X8  4052

pw_401_4076x8:   dw   401*8, 4076*8
pw_m2598_3166x8: dw -2598*8, 3166*8
pw_1931_3612x8:  dw  1931*8, 3612*8
pw_m1189_3920x8: dw -1189*8, 3920*8
pw_799_4017x8:   dw   799*8, 4017*8
pw_m2276_3406x8: dw -2276*8, 3406*8

pw_201_4091x8:   dw   201*8, 4091*8
pw_m601_4052x8:  dw  -601*8, 4052*8
pw_995_3973x8:   dw   995*8, 3973*8
pw_m1380_3857x8: dw -1380*8, 3857*8
pw_1751_3703x8:  dw  1751*8, 3703*8
pw_m2106_3513x8: dw -2106*8, 3513*8
pw_2440_3290x8:  dw  2440*8, 3290*8
pw_m2751_3035x8: dw -2751*8, 3035*8

pw_101_4095x8:   dw   101*8, 4095*8
pw_m2824_2967x8: dw -2824*8, 2967*8
pw_1660_3745x8:  dw  1660*8, 3745*8
pw_m1474_3822x8: dw -1474*8, 3822*8
pw_897_3996x8:   dw   897*8, 3996*8
pw_m2191_3461x8: dw -2191*8, 3461*8
pw_2359_3349x8:  dw  2359*8, 3349*8
pw_m700_4036x8:  dw  -700*8, 4036*8
pw_501_4065x8:   dw   501*8, 4065*8
pw_m2520_3229x8: dw -2520*8, 3229*8
pw_2019_3564x8:  dw  2019*8, 3564*8
pw_m1092_3948x8: dw -1092*8, 3948*8
pw_1285_3889x8:  dw  1285*8, 3889*8
pw_m1842_3659x8: dw -1842*8, 3659*8
pw_2675_3102x8:  dw  2675*8, 3102*8
pw_m301_4085x8:  dw  -301*8, 4085*8

; One contiguous table: the COEF_PAIR invocations and raw dw rows below are
; deliberately placed between the COEF_X8 rows, so their order must be kept.
idct64_mul: COEF_X8  4095,   101,  2967, -2824,  3745,  1660,  3822, -1474
COEF_PAIR  401, 4076, 1
COEF_PAIR  799, 4017
            COEF_X8  -700,  4036,  2359,  3349, -2191,  3461,   897,  3996
dw    -2598, -3166,  3166, -2598,  2598,  3166, -4017,  -799,   799, -4017
            COEF_X8  4065,   501,  3229, -2520,  3564,  2019,  3948, -1092
COEF_PAIR 1931, 3612, 1
COEF_PAIR 3406, 2276
            COEF_X8  -301,  4085,  2675,  3102, -1842,  3659,  1285,  3889
dw    -1189, -3920,  3920, -1189,  1189,  3920, -2276, -3406,  3406, -2276

SECTION .text

; o(x) addresses constant x relative to r5, which the entry points load with
; o_base (see "lea baseq, [o_base]" in INV_TXFM_FN).
%define o_base int8_permA+64*18
%define o(x) (r5 - (o_base) + (x))
; m(x) expands to the mangled, prefixed and cpu-suffixed symbol name of x
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
;        16 = special_mul1, 32 = special_mul2
;
; Computes two rounded dot products of the word pairs in m%1 (accumulators are
; seeded with the rnd register m%4) and, depending on flags, shifts/packs the
; results back into m%1. Coefficient arguments below 32 are treated as register
; numbers; otherwise they name pw_* constants that are broadcast from memory.
%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
    mova            m%2, m%4
%if %7 & 16
    ; coef1 is a full constant name suffix (pw_%5)
    vpdpwssd        m%2, m%1, [o(pw_%5)] {bcstd}
    mova            m%3, m%4
%if %7 & 32
    vpdpwssd        m%3, m%1, [o(pw_%6)] {bcstd}
%else
    vpdpwssd        m%3, m%1, m%6
%endif
%elif %7 & 32
    ; coef1 in a register, coef2 is a full constant name suffix (pw_%6)
    vpdpwssd        m%2, m%1, m%5
    mova            m%3, m%4
    vpdpwssd        m%3, m%1, [o(pw_%6)] {bcstd}
%elif %6 < 32
    ; both coefficients are already in registers
    vpdpwssd        m%2, m%1, m%5
    mova            m%3, m%4
    vpdpwssd        m%3, m%1, m%6
%elif %7 & 1
    vpdpwssd        m%2, m%1, [o(pw_%5_%6)] {bcstd}
    mova            m%3, m%4
    vpdpwssd        m%3, m%1, [o(pw_m%6_%5)] {bcstd}
%else
    vpdpwssd        m%2, m%1, [o(pw_m%6_%5)] {bcstd}
    mova            m%3, m%4
    vpdpwssd        m%3, m%1, [o(pw_%5_%6)] {bcstd}
%endif
%if %7 & 2
    psrld           m%2, 12
    pslld           m%3, 4
    vpshrdd         m%1, m%3, m%2, 16
%elif %7 & 4
    ; compared to using shifts (as above) this has better throughput,
    ; but worse latency and requires setting up the opmask/index
    ; registers, so only use this method for the larger transforms
    pslld           m%1, m%2, 4
    vpmultishiftqb  m%1{k7}, m13, m%3
%else
    psrad           m%2, 12
    psrad           m%3, 12
%if %7 & 8 == 0
    packssdw        m%1, m%3, m%2
%endif
%endif
%endmacro

; flags: same as ITX_MUL2X_PACK
;
; Builds two coefficient registers (m%4, m%5) by broadcasting the coef[3-4]
; pair and then overwriting the lanes selected by opmask k1 with the coef[1-2]
; pair, and forwards them to ITX_MUL2X_PACK as register coefficients.
%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
%if %11 & 1
    vpbroadcastd    m%4, [o(pw_%9_%10)]
    vpbroadcastd    m%4{k1}, [o(pw_%7_%8)]
    vpbroadcastd    m%5, [o(pw_m%10_%9)]
    vpbroadcastd    m%5{k1}, [o(pw_m%8_%7)]
%else
    vpbroadcastd    m%4, [o(pw_m%10_%9)]
    vpbroadcastd    m%4{k1}, [o(pw_m%8_%7)]
    vpbroadcastd    m%5, [o(pw_%9_%10)]
    vpbroadcastd    m%5{k1}, [o(pw_%7_%8)]
%endif
    ITX_MUL2X_PACK   %1, %2, %3, %6, %4, %5, %11
%endmacro

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
;
; coef2 (%7) below 32 means both coefficients are register numbers; otherwise
; the pw_m%7_%6 / pw_%6_%7 constants are broadcast from memory.
%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
    punpcklwd       m%3, m%2, m%1
    punpckhwd       m%2, m%1
%if %7 < 32
    mova            m%1, m%5
    vpdpwssd        m%1, m%3, m%7
    mova            m%4, m%5
    vpdpwssd        m%4, m%2, m%7
%else
    mova            m%1, m%5
    vpdpwssd        m%1, m%3, [o(pw_m%7_%6)] {bcstd}
    mova            m%4, m%5
    vpdpwssd        m%4, m%2, [o(pw_m%7_%6)] {bcstd}
%endif
    psrad           m%1, 12
    psrad           m%4, 12
    packssdw        m%1, m%4
    mova            m%4, m%5
%if %7 < 32
    vpdpwssd        m%4, m%2, m%6
    mova            m%2, m%5
    vpdpwssd        m%2, m%3, m%6
%else
    vpdpwssd        m%4, m%2, [o(pw_%6_%7)] {bcstd}
    mova            m%2, m%5
    vpdpwssd        m%2, m%3, [o(pw_%6_%7)] {bcstd}
%endif
    psrad           m%4, 12
    psrad           m%2, 12
%if %0 == 8
    ; optional 8th argument: write the second result to a separate register
    packssdw        m%8, m%2, m%4
%else
    packssdw        m%2, m%4
%endif
%endmacro

; Emits the wrapped statement with the mN names mapped to xmm registers, then
; invokes the RESET_MM_PERMUTATION definition captured beforehand.
; NOTE(review): exact semantics of the x86inc helpers used here are defined in
; x86inc.asm, not in this file.
%macro WRAP_XMM 1+
    %xdefine %%reset RESET_MM_PERMUTATION
    INIT_XMM cpuname
    DEFINE_MMREGS xmm
    AVX512_MM_PERMUTATION
    %1
    %%reset
%endmacro

; Emits the wrapped statement in YMM mode and switches back to ZMM mode.
%macro WRAP_YMM 1+
    INIT_YMM cpuname
    %1
    INIT_ZMM cpuname
%endmacro

; Final stage of the 4x4 transforms: optionally scales m0/m1 by pw_<rnd> using
; pmulhrsw (skipped when rnd is 0), adds the result to four 4-pixel rows of
; dst, and stores them back with unsigned saturation. Clobbers r2, m2, m3.
; Each row argument selects a destination row 0-3: bit 1 picks the
; dst+stride*2 base (r2), bit 0 adds one stride.
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
    vpbroadcastd    m2, [o(pw_%5)]
    pmulhrsw        m0, m2
    pmulhrsw        m1, m2
%endif
    lea             r2, [dstq+strideq*2]
%assign %%i 1
%rep 4
    %if %1 & 2
        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
    %else
        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    %endif
    %assign %%i %%i + 1
    %rotate 1
%endrep
    movd            m2, [%%row_adr1]
    pinsrd          m2, [%%row_adr2], 1
    movd            m3, [%%row_adr3]
    pinsrd          m3, [%%row_adr4], 1
    pmovzxbw        m2, m2
    pmovzxbw        m3, m3
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    movd            [%%row_adr1], m0
    pextrd          [%%row_adr2], m0, 1
    pextrd          [%%row_adr3], m0, 2
    pextrd          [%%row_adr4], m0, 3
    ret
%endmacro

; Declares the public entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc.
; It loads the constant base into baseq and the address of the second
; transform's .pass2 label into tx2q. For dct_dct the macro falls through
; (eob == 0) into code supplied by the caller of this macro.
%macro INV_TXFM_FN 3 ; type1, type2, size
cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
    %define %%p1 m(i%1_%3_internal_8bpc)
    lea             baseq, [o_base]
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea             tx2q, [m(i%2_%3_internal_8bpc).pass2]
%ifidn %1_%2, dct_dct
    test            eobd, eobd
    jnz %%p1
%else
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro

; 4x4 entry point. For dct_dct it also emits the eob == 0 fast path: the first
; coefficient is broadcast, multiplied twice by pw_2896x8 and handed to the
; shared store code at iadst_4x4_internal_8bpc.end2.
%macro INV_TXFM_4X4_FN 2 ; type1, type2
    INV_TXFM_FN     %1, %2, 4x4
%ifidn %1_%2, dct_dct
    vpbroadcastw    m0, [cq]
    vpbroadcastd    m1, [o(pw_2896x8)]
    pmulhrsw        m0, m1
    mov             [cq], eobd ; eobd is 0 on this path, so this clears the coefficient
    pmulhrsw        m0, m1
    mova            m1, m0
    jmp m(iadst_4x4_internal_8bpc).end2
%endif
%endmacro

; 4-point inverse DCT on two packed rows in m0/m1. Clobbers m2-m4.
%macro IDCT4_1D_PACKED 0
    vpbroadcastd    m4, [o(pd_2048)]
    punpckhwd       m2, m1, m0
    punpcklwd       m1, m0
    ITX_MUL2X_PACK   2, 0, 3, 4, 1567, 3784
    ITX_MUL2X_PACK   1, 0, 3, 4, 2896, 2896
    paddsw          m0, m1, m2 ; out0 out1
    psubsw          m1, m2     ; out3 out2
%endmacro

; 4-point inverse ADST on two packed rows in m0/m1. Clobbers m2-m5.
; Defines a .main2 local label that skips the input interleave.
%macro IADST4_1D_PACKED 0
    punpcklwd       m4, m1, m0 ; in2 in0
    punpckhwd       m5, m1, m0 ; in3 in1
.main2:
    vpbroadcastd    m3, [o(pd_2048)]
    mova            m0, m3
    vpdpwssd        m0, m4, [o(pw_3803_1321)] {bcstd}
    mova            m2, m3
    vpdpwssd        m2, m4, [o(pw_m1321_2482)] {bcstd}
    mova            m1, m3
    vpdpwssd        m1, m4, [o(pw_m3344_3344)] {bcstd}
    vpdpwssd        m3, m4, [o(pw_2482_3803)] {bcstd}
    vpdpwssd        m0, m5, [o(pw_2482_3344)] {bcstd}
    vpdpwssd        m2, m5, [o(pw_m3803_3344)] {bcstd}
    vpdpwssd        m1, m5, [o(pd_3344)] {bcstd}
    vpdpwssd        m3, m5, [o(pw_m1321_m3344)] {bcstd}
    REPX {psrad x, 12}, m0, m2, m1, m3
    packssdw        m0, m2 ; out0 out1
    packssdw        m1, m3 ; out2 out3
%endmacro

INIT_XMM avx512icl
INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
INV_TXFM_4X4_FN dct, identity

; First pass: load the 4x4 coefficients from cq, run the 1-D DCT, reorder the
; result and jump to the second transform's .pass2 via tx2q.
cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova            m0, [cq+16*0]
    mova            m1, [cq+16*1]
    IDCT4_1D_PACKED
    mova            m2, [o(deint_shuf)]
    shufps          m3, m0, m1, q1331
    shufps          m0, m0, m1, q0220
    pshufb          m0, m2
    pshufb          m1, m3, m2
    jmp             tx2q
.pass2:
    ; second pass: 1-D DCT, clear the 32-byte coefficient buffer, add to dst
    IDCT4_1D_PACKED
    pxor            ymm16, ymm16
    mova            [cq], ymm16
    ITX4_END         0, 1, 3, 2

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

; First pass: load the 4x4 coefficients from cq, run the 1-D ADST (.main),
; transpose with word unpacks and jump to the second transform via tx2q.
cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova            m0, [cq+16*0]
    mova            m1, [cq+16*1]
    call .main
    punpckhwd       m3, m0, m1
    punpcklwd       m0, m1
    punpckhwd       m1, m0, m3
    punpcklwd       m0, m3
    jmp             tx2q
.pass2:
    call .main
.end:
    ; clear the 32-byte coefficient buffer
    pxor            ymm16, ymm16
    mova            [cq], ymm16
.end2:
    ; also the join point for the dct_dct fast path in INV_TXFM_4X4_FN
    ITX4_END         0, 1, 2, 3
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova            m0, [cq+16*0]
    mova            m1, [cq+16*1]
517*c0909341SAndroid Build Coastguard Worker call m(iadst_4x4_internal_8bpc).main 518*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m1, m0 519*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0 520*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 521*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 522*c0909341SAndroid Build Coastguard Worker jmp tx2q 523*c0909341SAndroid Build Coastguard Worker.pass2: 524*c0909341SAndroid Build Coastguard Worker call m(iadst_4x4_internal_8bpc).main 525*c0909341SAndroid Build Coastguard Worker.end: 526*c0909341SAndroid Build Coastguard Worker pxor ymm16, ymm16 527*c0909341SAndroid Build Coastguard Worker mova [cq], ymm16 528*c0909341SAndroid Build Coastguard Worker.end2: 529*c0909341SAndroid Build Coastguard Worker ITX4_END 3, 2, 1, 0 530*c0909341SAndroid Build Coastguard Worker 531*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, dct 532*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, adst 533*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, flipadst 534*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, identity 535*c0909341SAndroid Build Coastguard Worker 536*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 537*c0909341SAndroid Build Coastguard Worker mova m0, [cq+16*0] 538*c0909341SAndroid Build Coastguard Worker mova m1, [cq+16*1] 539*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [o(pw_1697x8)] 540*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3, m0 541*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m1 542*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 543*c0909341SAndroid Build Coastguard Worker paddsw m1, m3 544*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m0, m1 545*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 546*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m2 547*c0909341SAndroid Build 
Coastguard Worker punpcklwd m0, m2 548*c0909341SAndroid Build Coastguard Worker jmp tx2q 549*c0909341SAndroid Build Coastguard Worker.pass2: 550*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [o(pw_1697x8)] 551*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3, m0 552*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m1 553*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 554*c0909341SAndroid Build Coastguard Worker paddsw m1, m3 555*c0909341SAndroid Build Coastguard Worker jmp m(iadst_4x4_internal_8bpc).end 556*c0909341SAndroid Build Coastguard Worker 557*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X8_FN 2 ; type1, type2 558*c0909341SAndroid Build Coastguard Worker INV_TXFM_FN %1, %2, 4x8 559*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct 560*c0909341SAndroid Build Coastguard Worker movd xmm1, [o(pw_2896x8)] 561*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xmm1, [cq] 562*c0909341SAndroid Build Coastguard Worker movd xmm2, [o(pw_2048)] 563*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xmm1 564*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xmm1 565*c0909341SAndroid Build Coastguard Worker pmulhrsw xmm0, xmm2 566*c0909341SAndroid Build Coastguard Worker vpbroadcastw ym0, xmm0 567*c0909341SAndroid Build Coastguard Worker mova ym1, ym0 568*c0909341SAndroid Build Coastguard Worker jmp m(iadst_4x8_internal_8bpc).end3 569*c0909341SAndroid Build Coastguard Worker%endif 570*c0909341SAndroid Build Coastguard Worker%endmacro 571*c0909341SAndroid Build Coastguard Worker 572*c0909341SAndroid Build Coastguard Worker%macro IDCT8_1D_PACKED 0 573*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m3, m0 ; in7 in1 574*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m1, m2 ; in3 in5 575*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m1 ; in6 in2 576*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m0 ; in4 in0 577*c0909341SAndroid Build Coastguard Worker.main2: 
578*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [o(pd_2048)] 579*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a 580*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a 581*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 582*c0909341SAndroid Build Coastguard Worker psubsw m0, m5, m4 ; t5a t6a (interleaved) 583*c0909341SAndroid Build Coastguard Worker paddsw m4, m5 ; t4 t7 (interleaved) 584*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 585*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 586*c0909341SAndroid Build Coastguard Worker%if mmsize > 16 587*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m1, [o(deint_shuf)] 588*c0909341SAndroid Build Coastguard Worker pshufb m4, m1 589*c0909341SAndroid Build Coastguard Worker%else 590*c0909341SAndroid Build Coastguard Worker pshufb m4, [o(deint_shuf)] 591*c0909341SAndroid Build Coastguard Worker%endif 592*c0909341SAndroid Build Coastguard Worker psubsw m1, m2, m3 ; tmp3 tmp2 593*c0909341SAndroid Build Coastguard Worker paddsw m3, m2 ; tmp0 tmp1 594*c0909341SAndroid Build Coastguard Worker punpckhqdq m2, m4, m0 ; t7 t6 595*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m0 ; t4 t5 596*c0909341SAndroid Build Coastguard Worker paddsw m0, m3, m2 ; out0 out1 597*c0909341SAndroid Build Coastguard Worker psubsw m3, m2 ; out7 out6 598*c0909341SAndroid Build Coastguard Worker psubsw m2, m1, m4 ; out4 out5 599*c0909341SAndroid Build Coastguard Worker paddsw m1, m4 ; out3 out2 600*c0909341SAndroid Build Coastguard Worker%endmacro 601*c0909341SAndroid Build Coastguard Worker 602*c0909341SAndroid Build Coastguard Worker%macro IADST8_1D_PACKED 1 ; pass 603*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [o(pd_2048)] 604*c0909341SAndroid Build Coastguard Worker%if %1 == 1 605*c0909341SAndroid 
Build Coastguard Worker ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a 606*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a 607*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a 608*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a 609*c0909341SAndroid Build Coastguard Worker psubsw m4, m0, m2 ; t5 t4 610*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 ; t1 t0 611*c0909341SAndroid Build Coastguard Worker psubsw m5, m1, m3 ; t6 t7 612*c0909341SAndroid Build Coastguard Worker paddsw m1, m3 ; t2 t3 613*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a 614*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a 615*c0909341SAndroid Build Coastguard Worker%if mmsize > 16 616*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [o(deint_shuf)] 617*c0909341SAndroid Build Coastguard Worker%else 618*c0909341SAndroid Build Coastguard Worker mova m2, [o(deint_shuf)] 619*c0909341SAndroid Build Coastguard Worker%endif 620*c0909341SAndroid Build Coastguard Worker vprord m1, 16 621*c0909341SAndroid Build Coastguard Worker psubsw m3, m0, m1 ; t3 t2 622*c0909341SAndroid Build Coastguard Worker paddsw m0, m1 ; -out7 out0 623*c0909341SAndroid Build Coastguard Worker psubsw m1, m4, m5 ; t7 t6 624*c0909341SAndroid Build Coastguard Worker paddsw m4, m5 ; out6 -out1 625*c0909341SAndroid Build Coastguard Worker pshufb m0, m2 626*c0909341SAndroid Build Coastguard Worker pshufb m4, m2 627*c0909341SAndroid Build Coastguard Worker mova m2, m6 628*c0909341SAndroid Build Coastguard Worker vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} 629*c0909341SAndroid Build Coastguard Worker mova m5, m6 630*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} 631*c0909341SAndroid Build Coastguard Worker psrad m2, 12 632*c0909341SAndroid Build Coastguard Worker 
psrad m5, 12 633*c0909341SAndroid Build Coastguard Worker packssdw m2, m5 ; out4 -out5 634*c0909341SAndroid Build Coastguard Worker mova m5, m6 635*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} 636*c0909341SAndroid Build Coastguard Worker mova m3, m6 637*c0909341SAndroid Build Coastguard Worker vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} 638*c0909341SAndroid Build Coastguard Worker psrad m5, 12 639*c0909341SAndroid Build Coastguard Worker psrad m3, 12 640*c0909341SAndroid Build Coastguard Worker packssdw m1, m3, m5 ; out2 -out3 641*c0909341SAndroid Build Coastguard Worker%else 642*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m4, m3 ; 0 7 643*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m5, m2 ; 2 5 644*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5 ; 4 3 645*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m4 ; 6 1 646*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a 647*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a 648*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a 649*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a 650*c0909341SAndroid Build Coastguard Worker psubsw m4, m0, m2 ; t4 t5 651*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 ; t0 t1 652*c0909341SAndroid Build Coastguard Worker psubsw m5, m1, m3 ; t6 t7 653*c0909341SAndroid Build Coastguard Worker paddsw m1, m3 ; t2 t3 654*c0909341SAndroid Build Coastguard Worker shufps m2, m5, m4, q1032 655*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m2 656*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m2 657*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a 658*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a 659*c0909341SAndroid Build Coastguard Worker psubsw m2, m0, m1 ; t2 
t3 660*c0909341SAndroid Build Coastguard Worker paddsw m0, m1 ; out0 -out7 661*c0909341SAndroid Build Coastguard Worker psubsw m1, m4, m5 ; t6 t7 662*c0909341SAndroid Build Coastguard Worker paddsw m4, m5 ; -out1 out6 663*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [o(pw_2896x8)] 664*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m2, m1 ; t3 t7 665*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m1 ; t2 t6 666*c0909341SAndroid Build Coastguard Worker paddsw m1, m2, m3 ; t2+t3 t6+t7 667*c0909341SAndroid Build Coastguard Worker psubsw m2, m3 ; t2-t3 t6-t7 668*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m4, m0 ; out6 -out7 669*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m4 ; out0 -out1 670*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m5 ; out4 -out5 671*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q1032 672*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 ; out2 -out3 673*c0909341SAndroid Build Coastguard Worker%endif 674*c0909341SAndroid Build Coastguard Worker%endmacro 675*c0909341SAndroid Build Coastguard Worker 676*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl 677*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, dct 678*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, identity 679*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, adst 680*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, flipadst 681*c0909341SAndroid Build Coastguard Worker 682*c0909341SAndroid Build Coastguard Workercglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 683*c0909341SAndroid Build Coastguard Worker vpermq m0, [cq+32*0], q3120 684*c0909341SAndroid Build Coastguard Worker vpermq m1, [cq+32*1], q3120 685*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_2896x8)] 686*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m2 687*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 688*c0909341SAndroid Build Coastguard 
Worker IDCT4_1D_PACKED 689*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m2, [o(deint_shuf)] 690*c0909341SAndroid Build Coastguard Worker shufps m3, m0, m1, q1331 691*c0909341SAndroid Build Coastguard Worker shufps m0, m0, m1, q0220 692*c0909341SAndroid Build Coastguard Worker pshufb m0, m2 693*c0909341SAndroid Build Coastguard Worker pshufb m1, m3, m2 694*c0909341SAndroid Build Coastguard Worker jmp tx2q 695*c0909341SAndroid Build Coastguard Worker.pass2: 696*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 1 697*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m1, 1 698*c0909341SAndroid Build Coastguard Worker call .main 699*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [o(pw_2048)] 700*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, m0, xm2, 1 701*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, m1, xm3, 1 702*c0909341SAndroid Build Coastguard Worker pshufd m1, m1, q1032 703*c0909341SAndroid Build Coastguard Worker jmp m(iadst_4x8_internal_8bpc).end2 704*c0909341SAndroid Build Coastguard WorkerALIGN function_align 705*c0909341SAndroid Build Coastguard Worker.main: 706*c0909341SAndroid Build Coastguard Worker WRAP_XMM IDCT8_1D_PACKED 707*c0909341SAndroid Build Coastguard Worker ret 708*c0909341SAndroid Build Coastguard Worker 709*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, dct 710*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, adst 711*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, flipadst 712*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, identity 713*c0909341SAndroid Build Coastguard Worker 714*c0909341SAndroid Build Coastguard Workercglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 715*c0909341SAndroid Build Coastguard Worker vpermq m0, [cq+32*0], q3120 716*c0909341SAndroid Build Coastguard Worker vpermq m1, [cq+32*1], q3120 717*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_2896x8)] 
718*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m2 719*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 720*c0909341SAndroid Build Coastguard Worker call m(iadst_8x4_internal_8bpc).main 721*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m0, m1 722*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 723*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m3 724*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m3 725*c0909341SAndroid Build Coastguard Worker jmp tx2q 726*c0909341SAndroid Build Coastguard Worker.pass2: 727*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 1 728*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m1, 1 729*c0909341SAndroid Build Coastguard Worker pshufd xm4, xm0, q1032 730*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm1, q1032 731*c0909341SAndroid Build Coastguard Worker call .main_pass2 732*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [o(pw_2048)] 733*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, xm2, 1 734*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, xm3, 1 735*c0909341SAndroid Build Coastguard Worker pxor m5, m5 736*c0909341SAndroid Build Coastguard Worker psubw m5, m4 737*c0909341SAndroid Build Coastguard Worker.end: 738*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m5 739*c0909341SAndroid Build Coastguard Worker.end2: 740*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 741*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m4 742*c0909341SAndroid Build Coastguard Worker.end3: 743*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, strided 744*c0909341SAndroid Build Coastguard Worker pmulld m5, m3, [o(pd_0to15)] 745*c0909341SAndroid Build Coastguard Worker kxnorb k1, k1, k1 746*c0909341SAndroid Build Coastguard Worker kmovb k2, k1 747*c0909341SAndroid Build Coastguard Worker vpgatherdd m3{k1}, [dstq+m5] 748*c0909341SAndroid Build Coastguard Worker pxor m4, m4 749*c0909341SAndroid Build 
Coastguard Worker mova [cq], zmm20 750*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 751*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 752*c0909341SAndroid Build Coastguard Worker paddw m0, m2 753*c0909341SAndroid Build Coastguard Worker paddw m1, m3 754*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 755*c0909341SAndroid Build Coastguard Worker vpscatterdd [dstq+m5]{k2}, m0 756*c0909341SAndroid Build Coastguard Worker RET 757*c0909341SAndroid Build Coastguard WorkerALIGN function_align 758*c0909341SAndroid Build Coastguard Worker.main_pass1: 759*c0909341SAndroid Build Coastguard Worker punpckhwd xm0, xm4, xm3 ; 0 7 760*c0909341SAndroid Build Coastguard Worker punpckhwd xm1, xm5, xm2 ; 2 5 761*c0909341SAndroid Build Coastguard Worker punpcklwd xm2, xm5 ; 4 3 762*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm4 ; 6 1 763*c0909341SAndroid Build Coastguard Worker WRAP_XMM IADST8_1D_PACKED 1 764*c0909341SAndroid Build Coastguard Worker punpcklqdq xm3, xm4, xm0 ; out6 -out7 765*c0909341SAndroid Build Coastguard Worker punpckhqdq xm0, xm4 ; out0 -out1 766*c0909341SAndroid Build Coastguard Worker ret 767*c0909341SAndroid Build Coastguard WorkerALIGN function_align 768*c0909341SAndroid Build Coastguard Worker.main_pass2: 769*c0909341SAndroid Build Coastguard Worker WRAP_XMM IADST8_1D_PACKED 2 770*c0909341SAndroid Build Coastguard Worker ret 771*c0909341SAndroid Build Coastguard Worker 772*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, dct 773*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, adst 774*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, flipadst 775*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, identity 776*c0909341SAndroid Build Coastguard Worker 777*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 778*c0909341SAndroid Build Coastguard Worker vpermq m0, [cq+32*0], q3120 
779*c0909341SAndroid Build Coastguard Worker vpermq m1, [cq+32*1], q3120 780*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_2896x8)] 781*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m2 782*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 783*c0909341SAndroid Build Coastguard Worker call m(iadst_8x4_internal_8bpc).main 784*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m1, m0 785*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0 786*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m3 787*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m3 788*c0909341SAndroid Build Coastguard Worker jmp tx2q 789*c0909341SAndroid Build Coastguard Worker.pass2: 790*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm2, m0, 1 791*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm3, m1, 1 792*c0909341SAndroid Build Coastguard Worker pshufd xm4, xm0, q1032 793*c0909341SAndroid Build Coastguard Worker pshufd xm5, xm1, q1032 794*c0909341SAndroid Build Coastguard Worker call m(iadst_4x8_internal_8bpc).main_pass2 795*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [o(pw_2048)] 796*c0909341SAndroid Build Coastguard Worker vinserti32x4 m3, xm1, 1 797*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, xm0, 1 798*c0909341SAndroid Build Coastguard Worker pxor m4, m4 799*c0909341SAndroid Build Coastguard Worker psubw m4, m5 800*c0909341SAndroid Build Coastguard Worker pshufd m0, m3, q1032 801*c0909341SAndroid Build Coastguard Worker pshufd m1, m2, q1032 802*c0909341SAndroid Build Coastguard Worker jmp m(iadst_4x8_internal_8bpc).end 803*c0909341SAndroid Build Coastguard Worker 804*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl 805*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, dct 806*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, adst 807*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, flipadst 808*c0909341SAndroid Build Coastguard 
INV_TXFM_4X8_FN identity, identity

; ---------------------------------------------------------------------------
; iidentity_4x8_internal_8bpc
; First pass: scales the 64 bytes at cq by pw_2896x8, permutes bytes with
; int8_permB, computes x + pmulhrsw(x, pw_1697x8) with a saturating add and
; leaves the upper 256 bits in ym1 before jumping through tx2q.
; .pass2: loads pw_4096 into ym4 and continues at the ADST 4x8 routine's
; .end2, which multiplies m0/m1 by m4.
; ---------------------------------------------------------------------------
cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd    m0, [o(pw_2896x8)]
    pmulhrsw        m0, [cq]
    mova            m1, [o(int8_permB)]
    vpbroadcastd    m2, [o(pw_1697x8)]
    vpermb          m0, m1, m0
    pmulhrsw        m2, m0
    paddsw          m0, m2
    vextracti32x8   ym1, m0, 1
    jmp             tx2q
.pass2:
    vpbroadcastd    ym4, [o(pw_4096)]
    jmp             m(iadst_4x8_internal_8bpc).end2

; ---------------------------------------------------------------------------
; INV_TXFM_4X16_FN type1, type2
; Instantiates INV_TXFM_FN for the 4x16 size. For the dct_dct pair it also
; emits a scalar path working on the first coefficient only:
;   r6d = ((dc * 181 + 128 + 256) >> 9)
;   r6d = ((r6d * 181 + 128 + 2048) >> 12)
; The result is broadcast as a word into m0/m1 and control continues at the
; ADST 4x16 routine's .end3.
; NOTE(review): `mov [cq], eobd` overwrites the first dword of the buffer
; with eob; presumably eob is 0 on this path so this clears the DC
; coefficient -- confirm in INV_TXFM_FN.
; ---------------------------------------------------------------------------
%macro INV_TXFM_4X16_FN 2 ; type1, type2
    INV_TXFM_FN     %1, %2, 4x16
%ifidn %1_%2, dct_dct
    movsx           r6d, word [cq]
    mov             [cq], eobd
    imul            r6d, 181
    add             r6d, 128+256
    sar             r6d, 8+1
    imul            r6d, 181
    add             r6d, 128+2048
    sar             r6d, 8+4
    vpbroadcastw    m0, r6d
    mova            m1, m0
    jmp             m(iadst_4x16_internal_8bpc).end3
%endif
%endmacro

; ---------------------------------------------------------------------------
; IDCT16_1D_PACKED
; One 16-point inverse-DCT stage on packed 16-bit words.
; In:      m0-m7 (interleaved into m0, m1, m3, m5-m9 by the first eight
;          instructions)
; Out:     m0-m7, in the order given by the final per-line comments
;          (m0 = out0 out1 ... m7 = out15 out14)
; Scratch: m8-m13 and mask register k7
; Labels .main2 (declared with cglobal_label), .main3, .main4 and .main5 are
; entry points into later stages of the same computation.
; ---------------------------------------------------------------------------
%macro IDCT16_1D_PACKED 0
    punpckhwd       m8, m7, m0 ; dct16 in15 in1
    punpcklwd       m9, m4, m0 ; dct4 in2 in0
    punpckhwd       m0, m3, m4 ; dct16 in7 in9
    punpcklwd       m7, m1     ; dct8 in7 in1
    punpckhwd       m1, m6     ; dct16 in3 in13
    punpcklwd       m3, m5     ; dct8 in3 in5
    punpckhwd       m5, m2     ; dct16 in11 in5
    punpcklwd       m6, m2     ; dct4 in3 in1
cglobal_label .main2
    vpbroadcastd    m10, [o(pd_2048)]
.main3:
    vpbroadcastq    m13, [o(int_mshift)]
    vpcmpub         k7, m13, m10, 6 ; 0x33...
    ITX_MUL2X_PACK  8, 2, 4, 10, 401, 4076, 5 ; t8a t15a
    ITX_MUL2X_PACK  0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a
    ITX_MUL2X_PACK  5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
    ITX_MUL2X_PACK  1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
    ITX_MUL2X_PACK  7, 2, 4, 10, 799, 4017, 5 ; t4a t7a
    ITX_MUL2X_PACK  3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a
.main4:
    psubsw          m2, m8, m0 ; t9 t14
    paddsw          m8, m0     ; t8 t15
    psubsw          m4, m1, m5 ; t10 t13
    paddsw          m1, m5     ; t11 t12
    ITX_MUL2X_PACK  6, 0, 5, 10, 1567, 3784 ; t3 t2
    psubsw          m0, m8, m1 ; t11a t12a
    paddsw          m8, m1     ; t8a t15a
    psubsw          m1, m7, m3 ; t5a t6a
    paddsw          m7, m3     ; t4 t7
.main5:
    ITX_MUL2X_PACK  2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a
    ITX_MUL2X_PACK  4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
%if mmsize > 16
    vbroadcasti32x4 m5, [o(deint_shuf)]
%else
    mova            m5, [o(deint_shuf)]
%endif
    vpbroadcastd    m11, [o(pw_m2896_2896)]
    vpbroadcastd    m12, [o(pw_2896_2896)]
    paddsw          m3, m2, m4 ; t9 t14
    psubsw          m2, m4     ; t10 t13
    pshufb          m8, m5
    pshufb          m7, m5
    pshufb          m3, m5
    ITX_MUL2X_PACK  9, 4, 5, 10, 11, 12 ; t0 t1
    ITX_MUL2X_PACK  1, 4, 5, 10, 12, 11 ; t5 t6
    ITX_MUL2X_PACK  0, 4, 5, 10, 11, 12, 8 ; t11 t12
    ITX_MUL2X_PACK  2, 0, 11, 10, 11, 12, 8 ; t10a t13a
    punpckhqdq      m2, m7, m1 ; t7 t6
    punpcklqdq      m7, m1     ; t4 t5
    psubsw          m1, m9, m6 ; dct4 out3 out2
    paddsw          m9, m6     ; dct4 out0 out1
    packssdw        m5, m11    ; t12 t13a
    packssdw        m4, m0     ; t11 t10a
    punpckhqdq      m0, m8, m3 ; t15a t14
    punpcklqdq      m8, m3     ; t8a t9
    psubsw          m3, m9, m2 ; dct8 out7 out6
    paddsw          m9, m2     ; dct8 out0 out1
    psubsw          m2, m1, m7 ; dct8 out4 out5
    paddsw          m1, m7     ; dct8 out3 out2
    psubsw          m7, m9, m0 ; out15 out14
    paddsw          m0, m9     ; out0 out1
    psubsw          m6, m1, m5 ; out12 out13
    paddsw          m1, m5     ; out3 out2
    psubsw          m5, m2, m4 ; out11 out10
    paddsw          m2, m4     ; out4 out5
    psubsw          m4, m3, m8 ; out8 out9
    paddsw          m3, m8     ; out7 out6
%endmacro

INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst

; ---------------------------------------------------------------------------
; idct_4x16_internal_8bpc
; First pass: assembles two zmm registers from four 32-byte loads at cq,
; permutes bytes with int16_perm, applies two ITX_MUL2X_PACK rotations,
; combines them with saturating add/sub, interleaves dwords and scales by
; pw_16384 before jumping through tx2q.
; .pass2: splits m0/m1 into eight xmm lanes (xm0-xm7), calls .main (16-point
; IDCT at xmm width), re-joins the lanes into m0/m1, loads pw_2048 into m5
; and continues at the ADST 4x16 routine's .end2.
; ---------------------------------------------------------------------------
cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova            ym1, [cq+32*2]
    vinserti32x8    m1, [cq+32*0], 1
    mova            m0, [o(int16_perm)]
    mova            ym2, [cq+32*3]
    vinserti32x8    m2, [cq+32*1], 1
    vpbroadcastd    m4, [o(pd_2048)]
    vpermb          m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
    vpermb          m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
    ITX_MUL2X_PACK  1, 0, 3, 4, 2896, 2896, 2
    ITX_MUL2X_PACK  2, 0, 3, 4, 1567, 3784, 2
    vpbroadcastd    m4, [o(pw_16384)]
    psubsw          m3, m1, m2
    paddsw          m1, m2     ; out0 out1
    vprord          m3, 16     ; out2 out3
    punpckldq       m0, m1, m3
    punpckhdq       m1, m3
    pmulhrsw        m0, m4
    pmulhrsw        m1, m4
    jmp             tx2q
.pass2:
    vextracti32x4   xm2, ym0, 1
    vextracti32x4   xm3, ym1, 1
    vextracti32x4   xm4, m0, 2
    vextracti32x4   xm5, m1, 2
    vextracti32x4   xm6, m0, 3
    vextracti32x4   xm7, m1, 3
    call            .main
    vinserti32x4    ym0, xm2, 1
    vinserti32x4    ym1, xm3, 1
    vinserti32x4    ym4, xm6, 1
    vinserti32x4    ym5, xm7, 1
    vinserti32x8    m0, ym4, 1
    vinserti32x8    m1, ym5, 1
    vpbroadcastd    m5, [o(pw_2048)]
    pshufd          m1, m1, q1032
    jmp             m(iadst_4x16_internal_8bpc).end2
ALIGN function_align
.main:
    WRAP_XMM IDCT16_1D_PACKED
    ret

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova            m1, [o(permB)]
    vpermq          m0, m1, [cq+64*0]
    vpermq          m1, m1, [cq+64*1]
    call            m(iadst_16x4_internal_8bpc).main
    vpbroadcastd    m3, [o(pw_16384)]
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmulhrsw        m2, m3
    pmulhrsw        m0, m3
    punpckhwd       m1, m0, m2
    punpcklwd       m0, m2
    jmp             tx2q
.pass2:
    call            .main
    vpbroadcastd    m5, [o(pw_2048)]
    psrlq           m10, 4
    psubw           m6, m8, m5
.end:
    vpbroadcastd    m7, [o(pw_2896x8)]
    paddsw          ym1, ym2, ym4
    psubsw          ym2, ym4
    vinserti32x8    m1, ym2, 1
    pmulhrsw        m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10
    psrlq           m0, m10, 4
    vpermi2q        m0, m1, m3  ; 0 1 4 5 8 9 c d
    vpermt2q        m1, m10, m3 ; 2 3 6 7 a b e f
    punpcklqdq      m5, m6
.end2:
    pmulhrsw        m0, m5
994*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m5 995*c0909341SAndroid Build Coastguard Worker.end3: 996*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, strided 997*c0909341SAndroid Build Coastguard Worker pmulld m5, m3, [o(pd_0to15)] 998*c0909341SAndroid Build Coastguard Worker kxnorw k1, k1, k1 999*c0909341SAndroid Build Coastguard Worker kmovw k2, k1 1000*c0909341SAndroid Build Coastguard Worker vpgatherdd m3{k1}, [dstq+m5] 1001*c0909341SAndroid Build Coastguard Worker pxor m4, m4 1002*c0909341SAndroid Build Coastguard Worker mova [cq+64*0], m4 1003*c0909341SAndroid Build Coastguard Worker mova [cq+64*1], m4 1004*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m4 1005*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m4 1006*c0909341SAndroid Build Coastguard Worker paddw m0, m2 1007*c0909341SAndroid Build Coastguard Worker paddw m1, m3 1008*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1009*c0909341SAndroid Build Coastguard Worker vpscatterdd [dstq+m5]{k2}, m0 1010*c0909341SAndroid Build Coastguard Worker RET 1011*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1012*c0909341SAndroid Build Coastguard Worker.main: 1013*c0909341SAndroid Build Coastguard Worker movu m3, [o(permB+1)] 1014*c0909341SAndroid Build Coastguard Worker psrlq m10, m3, 4 1015*c0909341SAndroid Build Coastguard Worker.main2: 1016*c0909341SAndroid Build Coastguard Worker vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 1017*c0909341SAndroid Build Coastguard Worker vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 1018*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pd_2048)] 1019*c0909341SAndroid Build Coastguard Worker vpbroadcastq ym13, [o(int_mshift)] 1020*c0909341SAndroid Build Coastguard Worker kxnorb k1, k1, k1 1021*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m3, m0 ; in12 in3 in14 in1 1022*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m3 ; in0 in15 in2 in13 1023*c0909341SAndroid 
Build Coastguard Worker kshiftrb k1, k1, 4 1024*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 1025*c0909341SAndroid Build Coastguard Worker vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 1026*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl 1027*c0909341SAndroid Build Coastguard Worker vpcmpub k7, m13, m9, 6 ; 0x33... 1028*c0909341SAndroid Build Coastguard Worker pxor m8, m8 1029*c0909341SAndroid Build Coastguard Worker ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5 1030*c0909341SAndroid Build Coastguard Worker ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5 1031*c0909341SAndroid Build Coastguard Worker ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5 1032*c0909341SAndroid Build Coastguard Worker ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5 1033*c0909341SAndroid Build Coastguard Worker psubsw m2, m0, m3 ; t9a t8a t11a t10a 1034*c0909341SAndroid Build Coastguard Worker paddsw m0, m3 ; t1a t0a t3a t2a 1035*c0909341SAndroid Build Coastguard Worker psubsw m3, m1, m4 ; t13a t12a t15a t14a 1036*c0909341SAndroid Build Coastguard Worker paddsw m4, m1 ; t5a t4a t7a t6a 1037*c0909341SAndroid Build Coastguard Worker ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5 1038*c0909341SAndroid Build Coastguard Worker psubw m7, m8, m7 1039*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4 1040*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [o(pw_3784_m1567)] 1041*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6{k1}, [o(pw_m3784_1567)] 1042*c0909341SAndroid Build Coastguard Worker psubsw m1, m0, m4 ; t5 t4 t7 t6 1043*c0909341SAndroid Build Coastguard Worker paddsw m0, m4 ; t1 t0 t3 t2 1044*c0909341SAndroid Build Coastguard Worker psubsw m4, m2, m3 ; t13a t12a t15a t14a 1045*c0909341SAndroid Build Coastguard Worker paddsw m2, m3 ; t9a t8a t11a t10a 1046*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a 
t5a t7a t6a 1047*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14 1048*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m5, [o(deint_shuf)] 1049*c0909341SAndroid Build Coastguard Worker pshufb m0, m5 1050*c0909341SAndroid Build Coastguard Worker pshufb m2, m5 1051*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a 1052*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a 1053*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14 1054*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13 1055*c0909341SAndroid Build Coastguard Worker pshufd m2, m2, q1032 ; t6a t7a t14 t15 1056*c0909341SAndroid Build Coastguard Worker psubsw m4, m0, m3 ; t3a t2a t11 t10 1057*c0909341SAndroid Build Coastguard Worker paddsw m0, m3 ; -out15 out0 out14 -out1 1058*c0909341SAndroid Build Coastguard Worker paddsw m3, m1, m2 ; out12 -out3 -out13 out2 1059*c0909341SAndroid Build Coastguard Worker psubsw m1, m2 ; t7 t6 t15a t14a 1060*c0909341SAndroid Build Coastguard Worker punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a 1061*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m1 ; t3a t7 t11 t15a 1062*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl 1063*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1 1064*c0909341SAndroid Build Coastguard Worker ret 1065*c0909341SAndroid Build Coastguard Worker 1066*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, dct 1067*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, adst 1068*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, flipadst 1069*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, identity 1070*c0909341SAndroid Build Coastguard Worker 1071*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x16_internal_8bpc, 0, 6, 
0, dst, stride, c, eob, tx2 1072*c0909341SAndroid Build Coastguard Worker mova m1, [o(permB)] 1073*c0909341SAndroid Build Coastguard Worker vpermq m0, m1, [cq+64*0] 1074*c0909341SAndroid Build Coastguard Worker vpermq m1, m1, [cq+64*1] 1075*c0909341SAndroid Build Coastguard Worker call m(iadst_16x4_internal_8bpc).main 1076*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [o(pw_16384)] 1077*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m1, m0 1078*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0 1079*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3 1080*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 1081*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 1082*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 1083*c0909341SAndroid Build Coastguard Worker jmp tx2q 1084*c0909341SAndroid Build Coastguard Worker.pass2: 1085*c0909341SAndroid Build Coastguard Worker call m(iadst_4x16_internal_8bpc).main 1086*c0909341SAndroid Build Coastguard Worker vpbroadcastd m6, [o(pw_2048)] 1087*c0909341SAndroid Build Coastguard Worker psrlq m10, 12 1088*c0909341SAndroid Build Coastguard Worker psubw m5, m8, m6 1089*c0909341SAndroid Build Coastguard Worker jmp m(iadst_4x16_internal_8bpc).end 1090*c0909341SAndroid Build Coastguard Worker 1091*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, dct 1092*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, adst 1093*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, flipadst 1094*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, identity 1095*c0909341SAndroid Build Coastguard Worker 1096*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1097*c0909341SAndroid Build Coastguard Worker mova m2, [o(int16_perm)] 1098*c0909341SAndroid Build Coastguard Worker vpermb m1, m2, [cq+64*0] 1099*c0909341SAndroid Build Coastguard Worker vpermb m2, m2, [cq+64*1] 
1100*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [o(pw_1697x8)] 1101*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [o(pd_m1)] 1102*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is 1103*c0909341SAndroid Build Coastguard Worker vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal 1104*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m2 ; it still works, but if the input is -1 the 1105*c0909341SAndroid Build Coastguard Worker vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes 1106*c0909341SAndroid Build Coastguard Worker vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless 1107*c0909341SAndroid Build Coastguard Worker vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here. 1108*c0909341SAndroid Build Coastguard Worker punpckldq m0, m1, m2 1109*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m2 1110*c0909341SAndroid Build Coastguard Worker jmp tx2q 1111*c0909341SAndroid Build Coastguard Worker.pass2: 1112*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [o(pw_1697x16)] 1113*c0909341SAndroid Build Coastguard Worker vpbroadcastd m5, [o(pw_2048)] 1114*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3, m0 1115*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m1 1116*c0909341SAndroid Build Coastguard Worker paddsw m0, m0 1117*c0909341SAndroid Build Coastguard Worker paddsw m1, m1 1118*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 1119*c0909341SAndroid Build Coastguard Worker paddsw m1, m3 1120*c0909341SAndroid Build Coastguard Worker jmp m(iadst_4x16_internal_8bpc).end2 1121*c0909341SAndroid Build Coastguard Worker 1122*c0909341SAndroid Build Coastguard Worker%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3] 1123*c0909341SAndroid Build Coastguard Worker movq xm%3, [dstq ] 1124*c0909341SAndroid Build Coastguard Worker movhps xm%3, [dstq+%5] 1125*c0909341SAndroid Build Coastguard 
Worker movq xm%4, [dstq+%6] 1126*c0909341SAndroid Build Coastguard Worker movhps xm%4, [dstq+%7] 1127*c0909341SAndroid Build Coastguard Worker pmovzxbw m%3, xm%3 1128*c0909341SAndroid Build Coastguard Worker pmovzxbw m%4, xm%4 1129*c0909341SAndroid Build Coastguard Worker%ifnum %1 1130*c0909341SAndroid Build Coastguard Worker paddw m%3, m%1 1131*c0909341SAndroid Build Coastguard Worker%else 1132*c0909341SAndroid Build Coastguard Worker paddw m%3, %1 1133*c0909341SAndroid Build Coastguard Worker%endif 1134*c0909341SAndroid Build Coastguard Worker%ifnum %2 1135*c0909341SAndroid Build Coastguard Worker paddw m%4, m%2 1136*c0909341SAndroid Build Coastguard Worker%else 1137*c0909341SAndroid Build Coastguard Worker paddw m%4, %2 1138*c0909341SAndroid Build Coastguard Worker%endif 1139*c0909341SAndroid Build Coastguard Worker packuswb m%3, m%4 1140*c0909341SAndroid Build Coastguard Worker vextracti32x4 xm%4, m%3, 1 1141*c0909341SAndroid Build Coastguard Worker movq [dstq ], xm%3 1142*c0909341SAndroid Build Coastguard Worker movhps [dstq+%6], xm%3 1143*c0909341SAndroid Build Coastguard Worker movq [dstq+%5], xm%4 1144*c0909341SAndroid Build Coastguard Worker movhps [dstq+%7], xm%4 1145*c0909341SAndroid Build Coastguard Worker%endmacro 1146*c0909341SAndroid Build Coastguard Worker 1147*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X4_FN 2 ; type1, type2 1148*c0909341SAndroid Build Coastguard Worker INV_TXFM_FN %1, %2, 8x4 1149*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct 1150*c0909341SAndroid Build Coastguard Worker movd xm1, [o(pw_2896x8)] 1151*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm1, [cq] 1152*c0909341SAndroid Build Coastguard Worker movd xm2, [o(pw_2048)] 1153*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm1 1154*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm1 1155*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm2 1156*c0909341SAndroid Build Coastguard Worker vpbroadcastw m0, xm0 
1157*c0909341SAndroid Build Coastguard Worker mova m1, m0 1158*c0909341SAndroid Build Coastguard Worker jmp m(iadst_8x4_internal_8bpc).end3 1159*c0909341SAndroid Build Coastguard Worker%endif 1160*c0909341SAndroid Build Coastguard Worker%endmacro 1161*c0909341SAndroid Build Coastguard Worker 1162*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl 1163*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, dct 1164*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, adst 1165*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, flipadst 1166*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, identity 1167*c0909341SAndroid Build Coastguard Worker 1168*c0909341SAndroid Build Coastguard Workercglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1169*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm3, [o(pw_2896x8)] 1170*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm3, [cq+16*0] 1171*c0909341SAndroid Build Coastguard Worker pmulhrsw xm1, xm3, [cq+16*1] 1172*c0909341SAndroid Build Coastguard Worker pmulhrsw xm2, xm3, [cq+16*2] 1173*c0909341SAndroid Build Coastguard Worker pmulhrsw xm3, [cq+16*3] 1174*c0909341SAndroid Build Coastguard Worker call m(idct_4x8_internal_8bpc).main 1175*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m4, [o(deint_shuf)] 1176*c0909341SAndroid Build Coastguard Worker vinserti32x4 m3, m1, xm3, 1 1177*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, m0, xm2, 1 1178*c0909341SAndroid Build Coastguard Worker shufps m0, m1, m3, q0220 1179*c0909341SAndroid Build Coastguard Worker shufps m1, m3, q1331 1180*c0909341SAndroid Build Coastguard Worker pshufb m0, m4 1181*c0909341SAndroid Build Coastguard Worker pshufb m1, m4 1182*c0909341SAndroid Build Coastguard Worker jmp tx2q 1183*c0909341SAndroid Build Coastguard Worker.pass2: 1184*c0909341SAndroid Build Coastguard Worker IDCT4_1D_PACKED 1185*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 
1186*c0909341SAndroid Build Coastguard Worker vpermq m1, m1, q2031 1187*c0909341SAndroid Build Coastguard Worker jmp m(iadst_8x4_internal_8bpc).end2 1188*c0909341SAndroid Build Coastguard Worker 1189*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, dct 1190*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, adst 1191*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, flipadst 1192*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, identity 1193*c0909341SAndroid Build Coastguard Worker 1194*c0909341SAndroid Build Coastguard Workercglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1195*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [o(pw_2896x8)] 1196*c0909341SAndroid Build Coastguard Worker pshufd xm4, [cq+16*0], q1032 1197*c0909341SAndroid Build Coastguard Worker pmulhrsw xm3, xm0, [cq+16*3] 1198*c0909341SAndroid Build Coastguard Worker pshufd xm5, [cq+16*1], q1032 1199*c0909341SAndroid Build Coastguard Worker pmulhrsw xm2, xm0, [cq+16*2] 1200*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm0 1201*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm0 1202*c0909341SAndroid Build Coastguard Worker call m(iadst_4x8_internal_8bpc).main_pass1 1203*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, xm2, 1 1204*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, xm3, 1 1205*c0909341SAndroid Build Coastguard Worker pxor m3, m3 1206*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m0, m1 1207*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 1208*c0909341SAndroid Build Coastguard Worker psubsw m3, m2 1209*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m3 1210*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m3 1211*c0909341SAndroid Build Coastguard Worker jmp tx2q 1212*c0909341SAndroid Build Coastguard Worker.pass2: 1213*c0909341SAndroid Build Coastguard Worker call .main 1214*c0909341SAndroid Build Coastguard Worker.end: 1215*c0909341SAndroid 
Build Coastguard Worker vpermq m0, m0, q3120 1216*c0909341SAndroid Build Coastguard Worker vpermq m1, m1, q3120 1217*c0909341SAndroid Build Coastguard Worker.end2: 1218*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_2048)] 1219*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m2 1220*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m2 1221*c0909341SAndroid Build Coastguard Worker.end3: 1222*c0909341SAndroid Build Coastguard Worker pxor m2, m2 1223*c0909341SAndroid Build Coastguard Worker mova [cq], zmm18 1224*c0909341SAndroid Build Coastguard Worker lea r6, [strideq*3] 1225*c0909341SAndroid Build Coastguard Worker WRITE_8X4 0, 1, 4, 5 1226*c0909341SAndroid Build Coastguard Worker RET 1227*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1228*c0909341SAndroid Build Coastguard Worker.main: 1229*c0909341SAndroid Build Coastguard Worker IADST4_1D_PACKED 1230*c0909341SAndroid Build Coastguard Worker ret 1231*c0909341SAndroid Build Coastguard Worker 1232*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, dct 1233*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, adst 1234*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, flipadst 1235*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, identity 1236*c0909341SAndroid Build Coastguard Worker 1237*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1238*c0909341SAndroid Build Coastguard Worker vpbroadcastd xm0, [o(pw_2896x8)] 1239*c0909341SAndroid Build Coastguard Worker pshufd xm4, [cq+16*0], q1032 1240*c0909341SAndroid Build Coastguard Worker pmulhrsw xm3, xm0, [cq+16*3] 1241*c0909341SAndroid Build Coastguard Worker pshufd xm5, [cq+16*1], q1032 1242*c0909341SAndroid Build Coastguard Worker pmulhrsw xm2, xm0, [cq+16*2] 1243*c0909341SAndroid Build Coastguard Worker pmulhrsw xm4, xm0 1244*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm0 1245*c0909341SAndroid 
Build Coastguard Worker call m(iadst_4x8_internal_8bpc).main_pass1 1246*c0909341SAndroid Build Coastguard Worker vinserti32x4 m3, m3, xm1, 1 1247*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, m2, xm0, 1 1248*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m3, m2 1249*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m2 1250*c0909341SAndroid Build Coastguard Worker pxor m0, m0 1251*c0909341SAndroid Build Coastguard Worker psubsw m0, m1 1252*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m3 1253*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m3 1254*c0909341SAndroid Build Coastguard Worker jmp tx2q 1255*c0909341SAndroid Build Coastguard Worker.pass2: 1256*c0909341SAndroid Build Coastguard Worker call m(iadst_8x4_internal_8bpc).main 1257*c0909341SAndroid Build Coastguard Worker mova m2, m1 1258*c0909341SAndroid Build Coastguard Worker vpermq m1, m0, q2031 1259*c0909341SAndroid Build Coastguard Worker vpermq m0, m2, q2031 1260*c0909341SAndroid Build Coastguard Worker jmp m(iadst_8x4_internal_8bpc).end2 1261*c0909341SAndroid Build Coastguard Worker 1262*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, dct 1263*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, adst 1264*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, flipadst 1265*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, identity 1266*c0909341SAndroid Build Coastguard Worker 1267*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1268*c0909341SAndroid Build Coastguard Worker mova xm2, [cq+16*0] 1269*c0909341SAndroid Build Coastguard Worker mova xm0, [cq+16*1] 1270*c0909341SAndroid Build Coastguard Worker vinserti32x4 m2, [cq+16*2], 1 1271*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, [cq+16*3], 1 1272*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [o(pw_2896x8)] 1273*c0909341SAndroid Build Coastguard Worker punpcklwd m1, 
m2, m0 1274*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m0 1275*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m3 1276*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3 1277*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m2 1278*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2 1279*c0909341SAndroid Build Coastguard Worker paddsw m0, m0 1280*c0909341SAndroid Build Coastguard Worker paddsw m1, m1 1281*c0909341SAndroid Build Coastguard Worker jmp tx2q 1282*c0909341SAndroid Build Coastguard Worker.pass2: 1283*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [o(pw_1697x8)] 1284*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m3, m0 1285*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m1 1286*c0909341SAndroid Build Coastguard Worker paddsw m0, m2 1287*c0909341SAndroid Build Coastguard Worker paddsw m1, m3 1288*c0909341SAndroid Build Coastguard Worker jmp m(iadst_8x4_internal_8bpc).end 1289*c0909341SAndroid Build Coastguard Worker 1290*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X8_FN 2 ; type1, type2 1291*c0909341SAndroid Build Coastguard Worker INV_TXFM_FN %1, %2, 8x8 1292*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct 1293*c0909341SAndroid Build Coastguard WorkerINIT_ZMM avx512icl 1294*c0909341SAndroid Build Coastguard Worker movsx r6d, word [cq] 1295*c0909341SAndroid Build Coastguard Worker mov [cq], eobd 1296*c0909341SAndroid Build Coastguard Worker.dconly: 1297*c0909341SAndroid Build Coastguard Worker imul r6d, 181 1298*c0909341SAndroid Build Coastguard Worker add r6d, 128+256 1299*c0909341SAndroid Build Coastguard Worker sar r6d, 8+1 1300*c0909341SAndroid Build Coastguard Worker.dconly2: 1301*c0909341SAndroid Build Coastguard Worker vpbroadcastd ym2, strided 1302*c0909341SAndroid Build Coastguard Worker imul r6d, 181 1303*c0909341SAndroid Build Coastguard Worker pmulld ym5, ym2, [o(pd_0to15)] 1304*c0909341SAndroid Build Coastguard Worker kxnorb k1, k1, k1 
1305*c0909341SAndroid Build Coastguard Worker add r6d, 128+2048 1306*c0909341SAndroid Build Coastguard Worker sar r6d, 8+4 1307*c0909341SAndroid Build Coastguard Worker pxor m3, m3 1308*c0909341SAndroid Build Coastguard Worker vpbroadcastw m4, r6d 1309*c0909341SAndroid Build Coastguard Worker.dconly_loop: 1310*c0909341SAndroid Build Coastguard Worker kmovb k2, k1 1311*c0909341SAndroid Build Coastguard Worker vpgatherdq m2{k1}, [dstq+ym5] 1312*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m2, m3 1313*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2, m3 1314*c0909341SAndroid Build Coastguard Worker paddw m0, m4 1315*c0909341SAndroid Build Coastguard Worker paddw m1, m4 1316*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 1317*c0909341SAndroid Build Coastguard Worker kmovb k1, k2 1318*c0909341SAndroid Build Coastguard Worker vpscatterdq [dstq+ym5]{k2}, m0 1319*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*8] 1320*c0909341SAndroid Build Coastguard Worker sub r3d, 8 1321*c0909341SAndroid Build Coastguard Worker jg .dconly_loop 1322*c0909341SAndroid Build Coastguard Worker RET 1323*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx512icl 1324*c0909341SAndroid Build Coastguard Worker%endif 1325*c0909341SAndroid Build Coastguard Worker%endmacro 1326*c0909341SAndroid Build Coastguard Worker 1327*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, dct 1328*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, identity 1329*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, adst 1330*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, flipadst 1331*c0909341SAndroid Build Coastguard Worker 1332*c0909341SAndroid Build Coastguard Workercglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 1333*c0909341SAndroid Build Coastguard Worker vpermq m0, [cq+32*0], q3120 ; 0 1 1334*c0909341SAndroid Build Coastguard Worker vpermq m3, [cq+32*3], q3120 ; 6 7 1335*c0909341SAndroid Build Coastguard 
Worker vpermq m2, [cq+32*2], q3120 ; 4 5 1336*c0909341SAndroid Build Coastguard Worker vpermq m1, [cq+32*1], q3120 ; 2 3 1337*c0909341SAndroid Build Coastguard Worker call .main 1338*c0909341SAndroid Build Coastguard Worker shufps m4, m0, m1, q0220 1339*c0909341SAndroid Build Coastguard Worker shufps m5, m0, m1, q1331 1340*c0909341SAndroid Build Coastguard Worker shufps m1, m2, m3, q0220 1341*c0909341SAndroid Build Coastguard Worker shufps m3, m2, m3, q1331 1342*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m0, [o(deint_shuf)] 1343*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_16384)] 1344*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m0}, m4, m5, m1, m3 1345*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m2}, m4, m5, m1, m3 1346*c0909341SAndroid Build Coastguard Worker vinserti32x4 m0, m4, xm1, 1 1347*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m4, m1, 0x03 1348*c0909341SAndroid Build Coastguard Worker vinserti32x4 m1, m5, xm3, 1 1349*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m5, m3, 0x03 1350*c0909341SAndroid Build Coastguard Worker jmp tx2q 1351*c0909341SAndroid Build Coastguard Worker.pass2: 1352*c0909341SAndroid Build Coastguard Worker call .main 1353*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [o(pw_2048)] 1354*c0909341SAndroid Build Coastguard Worker vpermq m0, m0, q3120 1355*c0909341SAndroid Build Coastguard Worker vpermq m1, m1, q2031 1356*c0909341SAndroid Build Coastguard Worker vpermq m2, m2, q3120 1357*c0909341SAndroid Build Coastguard Worker vpermq m3, m3, q2031 1358*c0909341SAndroid Build Coastguard Worker jmp m(iadst_8x8_internal_8bpc).end2 1359*c0909341SAndroid Build Coastguard WorkerALIGN function_align 1360*c0909341SAndroid Build Coastguard Workercglobal_label .main 1361*c0909341SAndroid Build Coastguard Worker IDCT8_1D_PACKED 1362*c0909341SAndroid Build Coastguard Worker ret 1363*c0909341SAndroid Build Coastguard Worker 1364*c0909341SAndroid Build 
; Instantiate INV_TXFM_8X8_FN (defined earlier in this file) for type1 = adst
; combined with each listed type2.
INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

;-----------------------------------------------------------------------------
; iadst_8x8_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:    loads four 32-byte rows from cq (qword-permuted), runs
;             .main_pass1, interleaves the results word-wise, scales them with
;             pw_16384_m16384 and jumps to the address held in tx2q
;             (presumably a .pass2 label of one of the 8x8 functions).
;   .pass2:   runs .main_pass2 and builds the multiplier in m4.
;   .end, .end2, .end3, .end4:
;             shared tail, also entered from iflipadst_8x8 (.end3) and
;             iidentity_8x8 (.end). Multiplies m0-m3 by m4, clears the four
;             32-byte rows at cq and invokes WRITE_8X4 twice, advancing dstq
;             by 4*stride in between.
;   .main_pass1 / .main_pass2:
;             wrappers around IADST8_1D_PACKED 1 / 2; .main_pass2 is exported
;             through cglobal_label.
;-----------------------------------------------------------------------------
cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call .main_pass1
    vpbroadcastd         m5, [o(pw_16384_m16384)]
    ; word interleave of the four result registers
    punpcklwd            m4, m0, m1
    punpckhwd            m0, m1
    punpcklwd            m1, m2, m3
    punpckhwd            m2, m3
    punpcklwd            m3, m4, m0
    punpckhwd            m4, m0
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    REPX   {pmulhrsw x, m5}, m3, m4, m0, m1
    vshufi32x4           m2, m3, m0, 0x03
    vinserti32x4         m0, m3, xm0, 1
    vshufi32x4           m3, m4, m1, 0x03
    vinserti32x4         m1, m4, xm1, 1
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call .main_pass2
    vpbroadcastd         m5, [o(pw_2048)]
    vpbroadcastd        xm4, [o(pw_4096)]
    psubw                m4, m5 ; lower half = 2048, upper half = -2048
.end:
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
.end2:
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
.end3:
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
.end4:
    pxor                 m4, m4
    ; clear the four 32-byte rows at cq
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
    lea                  r6, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             2, 3, 4, 5
    RET
ALIGN function_align
.main_pass1:
    punpckhwd            m0, m4, m3 ; 0 7
    punpckhwd            m1, m5, m2 ; 2 5
    punpcklwd            m2, m5     ; 4 3
    punpcklwd            m3, m4     ; 6 1
    IADST8_1D_PACKED 1
    punpcklqdq           m3, m4, m0 ; out6 -out7
    punpckhqdq           m0, m4     ; out0 -out1
    ret
ALIGN function_align
cglobal_label .main_pass2
    IADST8_1D_PACKED 2
    ret

; Instantiate INV_TXFM_8X8_FN for type1 = flipadst with each listed type2.
INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

;-----------------------------------------------------------------------------
; iflipadst_8x8_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:    same loads as iadst_8x8 and the same .main_pass1 helper, but the
;             results are interleaved in the opposite register order and
;             scaled with pw_m16384_16384 before jumping to tx2q.
;   .pass2:   reuses iadst_8x8's .main_pass2, builds the multiplier in m4,
;             permutes the rows with q2031 while swapping registers, and
;             finishes in iadst_8x8's .end3 tail.
;-----------------------------------------------------------------------------
cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call m(iadst_8x8_internal_8bpc).main_pass1
    vpbroadcastd         m5, [o(pw_m16384_16384)]
    punpckhwd            m4, m3, m2
    punpcklwd            m3, m2
    punpckhwd            m2, m1, m0
    punpcklwd            m1, m0
    punpckhwd            m0, m4, m3
    punpcklwd            m4, m3
    punpckhwd            m3, m2, m1
    punpcklwd            m2, m1
    REPX   {pmulhrsw x, m5}, m0, m4, m3, m2
    vinserti32x4         m1, m0, xm3, 1
    vshufi32x4           m3, m0, m3, 0x03
    vinserti32x4         m0, m4, xm2, 1
    vshufi32x4           m2, m4, m2, 0x03
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call m(iadst_8x8_internal_8bpc).main_pass2
    vpbroadcastd         m4, [o(pw_2048)]
    vpbroadcastd        xm5, [o(pw_4096)]
    psubw                m4, m5 ; lower half = -2048, upper half = 2048
    vpermq               m5, m3, q2031
    vpermq               m3, m0, q2031
    vpermq               m0, m2, q2031
    vpermq               m2, m1, q2031
    ; m0/m1 are scaled here; .end3 scales m2/m3
    pmulhrsw             m1, m0, m4
    pmulhrsw             m0, m5, m4
    jmp m(iadst_8x8_internal_8bpc).end3

; Instantiate INV_TXFM_8X8_FN for type1 = identity with each listed type2.
INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

;-----------------------------------------------------------------------------
; iidentity_8x8_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:    loads eight 16-byte rows from cq (rows n and n+4 share one
;             register), interleaves them at word and dword granularity and
;             jumps to tx2q. No multiplication is applied here.
;   .pass2:   loads pw_4096 as the multiplier and finishes in iadst_8x8's
;             .end tail.
;-----------------------------------------------------------------------------
cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm3, [cq+16*0]
    mova                xm2, [cq+16*1]
    vinserti32x4         m3, [cq+16*4], 1
    vinserti32x4         m2, [cq+16*5], 1
    mova                xm4, [cq+16*2]
    mova                xm0, [cq+16*3]
    vinserti32x4         m4, [cq+16*6], 1
    vinserti32x4         m0, [cq+16*7], 1
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_4096)]
    jmp m(iadst_8x8_internal_8bpc).end

;-----------------------------------------------------------------------------
; INV_TXFM_8X16_FN type1, type2
;   Expands INV_TXFM_FN for the 8x16 size. For the dct_dct combination it also
;   emits a DC-only path: the first coefficient is loaded sign-extended into
;   r6d, eobd is stored to [cq], r3d gets bit 4 set, and
;   r6d = (r6d*181 + 128) >> 8 before jumping to the 8x8 .dconly code.
;   NOTE(review): presumably this path is only reached with eob == 0, so the
;   store of eobd clears the DC coefficient - confirm against INV_TXFM_FN.
;-----------------------------------------------------------------------------
%macro INV_TXFM_8X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x16
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 16
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro

;-----------------------------------------------------------------------------
; ITX_8X16_LOAD_COEFS
;   Loads eight consecutive 32-byte rows into m0-m7 (row n -> m<n>), each
;   multiplied by pw_2896x8 with pmulhrsw.
;   Side effect: cq is advanced by 32*4, which is why rows are addressed with
;   offsets -3*32 .. +3*32 after the add. Clobbers m0-m7.
;-----------------------------------------------------------------------------
%macro ITX_8X16_LOAD_COEFS 0
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m0, m4, [cq+32*0]
    add                  cq, 32*4
    pmulhrsw             m7, m4, [cq+32*3]
    pmulhrsw             m1, m4, [cq-32*3]
    pmulhrsw             m6, m4, [cq+32*2]
    pmulhrsw             m2, m4, [cq-32*2]
    pmulhrsw             m5, m4, [cq+32*1]
    pmulhrsw             m3, m4, [cq-32*1]
    pmulhrsw             m4, [cq+32*0]
%endmacro

INIT_ZMM avx512icl
INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

;-----------------------------------------------------------------------------
; idct_8x16_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:    loads four 64-byte blocks from cq permuted by permB, scales them
;             by pw_2896x8, runs idct_16x8's .main, scales by pw_16384 and
;             interleaves the result before jumping to tx2q.
;   .pass2:   rearranges the inputs, calls .main2 and merges the outputs with
;             vpermt2q using indices derived from gather8a.
;   .end, .end2, .end3, .end4:
;             shared tail, also entered from iadst_8x16 (.end3) and
;             iidentity_8x16 (.end). Multiplies m0-m3 by m7 (pw_2048 when
;             entered at .end), gathers qwords from dstq/r3 at offsets
;             ym8*stride, adds the results with unsigned-saturating packing,
;             scatters them back and clears the four 64-byte blocks at cq.
;             Callers entering at these labels must have set ym8 and r3.
;   .main_fast2 / .main_fast / .main:
;             exported through cglobal_label. The fast variants jump to
;             .main5 / .main4, which are not defined in the code visible here
;             (presumably inside IDCT16_1D_PACKED).
;-----------------------------------------------------------------------------
cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m3, [o(permB)]
    vpermq               m0, m3, [cq+64*0]
    vpbroadcastd         m4, [o(pw_2896x8)]
    vpermq               m1, m3, [cq+64*1]
    vpermq               m2, m3, [cq+64*2]
    vpermq               m3, m3, [cq+64*3]
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    call m(idct_16x8_internal_8bpc).main
    vpbroadcastd         m5, [o(pw_16384)]
    punpckhwd            m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd            m0, m2     ; a0 e0 a1 e1 a2 e2 a3 e3
    punpckhwd            m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
    punpcklwd            m1, m3     ; d0 h0 d1 h1 d2 h2 d3 h3
    REPX   {pmulhrsw x, m5}, m4, m0, m2, m1
    punpckhwd            m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
    punpcklwd            m0, m4     ; a0 b0 e0 f0 a1 b1 e1 f1
    punpckhwd            m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
    punpcklwd            m2, m1     ; c0 d0 g0 h0 c1 d1 g1 h1
    punpckhdq            m1, m0, m2 ;  1  5  9 13
    punpckldq            m0, m2     ;  0  4  8 12
    punpckldq            m2, m3, m4 ;  2  6 10 14
    punpckhdq            m3, m4     ;  3  7 11 15
    jmp                tx2q
.pass2:
    vprord               m5, [o(int16_perm)], 16
    vshufi32x4           m2, m2, q1320     ;  2 10 14  6
    vshufi32x4           m4, m1, m3, q2310 ;  1  5 15 11
    vshufi32x4           m1, m3, q0132     ;  9 13  7  3
    vpermb               m9, m5, m0
    vpermb               m7, m5, m2
    vpermb               m8, m5, m4
    vpermb               m0, m5, m1
    vextracti32x8       ym6, m9, 1
    vextracti32x8       ym3, m7, 1
    vextracti32x8       ym5, m8, 1
    vextracti32x8       ym1, m0, 1
    call .main2
    mova                ym8, [o(gather8a)]
    lea                  r3, [dstq+strideq*4]
    pmovzxdq             m9, ym8
    pshufd              ym8, ym8, q1230
    vpermt2q             m0, m9, m4
    vpermt2q             m1, m9, m5
    vpermt2q             m2, m9, m6
    vpermt2q             m3, m9, m7
.end:
    vpbroadcastd         m7, [o(pw_2048)]
.end2:
    pmulhrsw             m0, m7
    pmulhrsw             m1, m7
.end3:
    pmulhrsw             m2, m7
    pmulhrsw             m3, m7
.end4:
    vpbroadcastd        ym6, strided
    kxnorb               k1, k1, k1
    pxor                 m4, m4
    pmulld              ym8, ym6 ; indices * stride = byte offsets
    ; gather/scatter clear their mask register, so k1/k2 are re-copied from
    ; each other between uses
    kmovb                k2, k1
    vpgatherdq       m6{k1}, [dstq+ym8]
    kmovb                k1, k2
    vpgatherdq       m7{k2}, [r3+ym8]
    mova          [cq+64*0], m4
    mova          [cq+64*1], m4
    kmovb                k2, k1
    mova          [cq+64*2], m4
    mova          [cq+64*3], m4
    punpcklbw            m5, m6, m4
    punpckhbw            m6, m4
    paddw                m0, m5
    paddw                m1, m6
    packuswb             m0, m1
    vpscatterdq [dstq+ym8]{k1}, m0
    punpcklbw            m6, m7, m4
    punpckhbw            m7, m4
    paddw                m2, m6
    paddw                m3, m7
    packuswb             m2, m3
    vpscatterdq [r3+ym8]{k2}, m2
    RET
ALIGN function_align
cglobal_label .main_fast2 ; bottom three-quarters are zero
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    vpbroadcastd        ym3, [o(pw_401_4076x8)]
    vpbroadcastd        ym5, [o(pw_799_4017x8)]
    vpbroadcastd        ym4, [o(pw_m1189_3920x8)]
    pxor                ym6, ym6
    punpckhwd           ym2, ym0, ym0
    pmulhrsw            ym2, ym3      ; t8a  t15a
    punpcklwd           ym7, ym1, ym1
    pmulhrsw            ym7, ym5      ; t4a  t7a
    punpckhwd           ym1, ym1
    pmulhrsw            ym4, ym1      ; t11a t12a
    vpcmpub              k7, ym13, ym10, 6
    punpcklwd           ym9, ym6, ym0
    psubsw              ym0, ym2, ym4 ; t11a t12a
    paddsw              ym8, ym2, ym4 ; t8a  t15a
    mova                ym1, ym7
    jmp .main5
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    pxor                ym6, ym6
    punpckhwd           ym8, ym0, ym0
    punpckhwd           ym4, ym3, ym3
    punpckhwd           ym5, ym2, ym2
    punpcklwd           ym7, ym1, ym1
    punpckhwd           ym1, ym1
    punpcklwd           ym3, ym3
    punpcklwd           ym9, ym6, ym0
    punpcklwd           ym6, ym2
    vpbroadcastd        ym2, [o(pw_401_4076x8)]
    vpbroadcastd        ym0, [o(pw_m2598_3166x8)]
    vpbroadcastd       ym11, [o(pw_1931_3612x8)]
    vpbroadcastd       ym12, [o(pw_m1189_3920x8)]
    pmulhrsw            ym8, ym2  ; t8a  t15a
    vpbroadcastd        ym2, [o(pw_799_4017x8)]
    pmulhrsw            ym0, ym4  ; t9a  t14a
    vpbroadcastd        ym4, [o(pw_m2276_3406x8)]
    pmulhrsw            ym5, ym11 ; t10a t13a
    pmulhrsw            ym1, ym12 ; t11a t12a
    pmulhrsw            ym7, ym2  ; t4a  t7a
    pmulhrsw            ym3, ym4  ; t5a  t6a
    vpcmpub              k7, ym13, ym10, 6
    jmp .main4
ALIGN function_align
cglobal_label .main
    WRAP_YMM IDCT16_1D_PACKED
    ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity

;-----------------------------------------------------------------------------
; iadst_8x16_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:       runs iadst_16x8's .main_pass1, then arranges the outputs
;                (int_shuf1) and falls into .pass1_end.
;   .pass1_end:  shared with iflipadst_8x16; scales by m7, interleaves at
;                dword/qword granularity and jumps to tx2q.
;   .pass2:      runs .main_pass2, sets m6 = pw_2048 and m7 = m8 - m6 (m8 is
;                zeroed in .main2), then falls into .pass2_end.
;   .pass2_end:  shared with iflipadst_8x16; computes the remaining outputs
;                with pw_2896x8, permutes rows with indices taken from m10,
;                scales m0/m1 by m6 and finishes in idct_8x16's .end3 tail
;                (which scales m2/m3 by m7).
;   .main_pass1: loads and scales the coefficients (pw_2896x8), permutes them
;                and jumps to .main.
;   .main_pass2: permutes m0-m3 with indices derived from permC, then falls
;                into .main.
;   .main / .main2: the 16-point ADST butterflies; .main2 is exported through
;                cglobal_label. Leaves k1, k7, m8-m10 and m13 modified.
;-----------------------------------------------------------------------------
cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf1)]
    vpbroadcastd         m7, [o(pw_16384_m16384)]
    punpckhwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpcklwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m6     ; e0 f0 e1 f1 e2 f2 e3 f3
.pass1_end:
    REPX   {pmulhrsw x, m7}, m3, m5, m4, m2
    punpckldq            m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m5     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckhdq            m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m2, m4     ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m5
    punpckhqdq           m3, m5
    jmp                tx2q
.pass2:
    call .main_pass2
    vpbroadcastd         m6, [o(pw_2048)]
    psrlq               m10, 4
    psubw                m7, m8, m6
.pass2_end:
    vpbroadcastd         m5, [o(pw_2896x8)]
    paddsw               m1, m2, m4
    psubsw               m2, m4
    pmulhrsw             m1, m5      ; -out7   out4   out6  -out5
    pmulhrsw             m5, m2      ;  out8  -out11 -out9   out10
    mova                ym8, [o(gather8c)]
    lea                  r3, [dstq+strideq]
    psrlq                m2, m10, 4
    vpermi2q             m2, m0, m3  ;  1  3 13 15
    vpermt2q             m0, m10, m3 ;  0  2 12 14
    psrlq                m3, m10, 8
    vpermi2q             m3, m1, m5  ;  5  7  9 11
    psrlq               m10, 12
    vpermt2q             m1, m10, m5 ;  4  6  8 10
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp m(idct_8x16_internal_8bpc).end3
ALIGN function_align
.main_pass1:
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m5, m2, [cq+64*0]
    pmulhrsw             m3, m2, [cq+64*3]
    pmulhrsw             m1, m2, [cq+64*1]
    pmulhrsw             m2, [cq+64*2]
    movu                 m4, [o(permA+3)]
    psrlq               m10, m4, 4
    mova                 m6, m4
    vpermi2q             m4, m5, m3  ; in0  in12 in2  in14
    vpermt2q             m5, m10, m3 ; in15 in3  in13 in1
    vpermi2q             m6, m1, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m2 ; in11 in7  in9  in5
    jmp .main
ALIGN function_align
.main_pass2:
    mova                 m4, [o(permC)]
    psrlq                m5, m4, 4
    vpermi2q             m4, m0, m2  ; in0  in12 in2  in14
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3  ; in15 in3  in13 in1
    psrlq               m10, m6, 4
    vpermi2q             m6, m0, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m3 ; in11 in7  in9  in5
.main:
    punpcklwd            m0, m4, m5  ; in0  in15 in2  in13
    punpckhwd            m4, m5      ; in12 in3  in14 in1
    punpcklwd            m5, m6, m1  ; in4  in11 in6  in9
    punpckhwd            m6, m1      ; in8  in7  in10 in5
cglobal_label .main2
    vpbroadcastd         m9, [o(pd_2048)]
    vpbroadcastq        m13, [o(int_mshift)]
    kxnorb               k1, k1, k1
    vpcmpub              k7, m13, m9, 6 ; 0x33...
    pxor                 m8, m8
    ITX_MUL4X_PACK        0, 1, 2, 3, 7, 9,  201, 4091,  995, 3973, 5
    ITX_MUL4X_PACK        6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK        4, 1, 2, 3, 7, 9, 3857, 1380, 4052,  601, 5
    ITX_MUL4X_PACK        5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
    psubsw               m2, m0, m6 ; t9a  t8a  t11a t10a
    paddsw               m0, m6     ; t1a  t0a  t3a  t2a
    psubsw               m3, m5, m4 ; t13a t12a t15a t14a
    paddsw               m5, m4     ; t5a  t4a  t7a  t6a
    ITX_MUL4X_PACK        2, 4, 1, 6, 7, 9,  799, 4017, 3406, 2276, 5
    psubw                m7, m8, m7
    ITX_MUL2X_PACK        3, 4, 1, 9, 7, 6, 4
    vpbroadcastd         m6, [o(pw_3784_m1567)]
    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
    psubsw               m1, m0, m5 ; t5   t4   t7   t6
    paddsw               m0, m5     ; t1   t0   t3   t2
    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
    vbroadcasti32x4      m5, [o(deint_shuf)]
    pshufb               m0, m5
    pshufb               m2, m5
    vshufi32x4           m3, m0, m2, q3232 ; t3   t2   t11a t10a
    vinserti32x8         m0, ym2, 1        ; t1   t0   t9a  t8a
    vshufi32x4           m2, m1, m4, q3232 ; t6a  t7a  t14  t15
    vinserti32x8         m1, ym4, 1        ; t5a  t4a  t13  t12
    pshufd               m2, m2, q1032     ; t7a  t6a  t15  t14
    psubsw               m4, m0, m3        ; t3a  t2a  t11  t10
    paddsw               m0, m3            ; -out15  out0   out14 -out1
    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
    psubsw               m1, m2            ; t7   t6   t15a t14a
    punpckhqdq           m2, m4, m1        ; t2a  t6   t10  t14a
    punpcklqdq           m4, m1            ; t3a  t7   t11  t15a
    ret

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

;-----------------------------------------------------------------------------
; iflipadst_8x16_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:    like iadst_8x16, but uses int_shuf2 / pw_m16384_16384 and swaps
;             the low/high interleaves and the m1/m2 roles before joining
;             iadst_8x16's .pass1_end.
;   .pass2:   reuses iadst_8x16's .main_pass2, sets m7 = pw_2048 and
;             m6 = m8 - m7 (the opposite assignment to iadst_8x16), shifts the
;             permutation indices in m10 by 36 bits and joins .pass2_end.
;-----------------------------------------------------------------------------
cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf2)]
    vpbroadcastd         m7, [o(pw_m16384_16384)]
    punpcklwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
    jmp m(iadst_8x16_internal_8bpc).pass1_end
.pass2:
    call m(iadst_8x16_internal_8bpc).main_pass2
    vpbroadcastd         m7, [o(pw_2048)]
    psrlq               m10, 36
    psubw                m6, m8, m7
    jmp m(iadst_8x16_internal_8bpc).pass2_end

INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

;-----------------------------------------------------------------------------
; iidentity_8x16_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:    loads four 64-byte blocks from cq through a byte permute
;             (int16_perm), interleaves them, scales by pw_2896x8 and jumps to
;             tx2q.
;   .pass2:   computes x = 2*x + pmulhrsw(x, pw_1697x16) for m0-m3 with
;             saturating adds, sets up ym8 (gather8b) and r3 for the shared
;             store code and finishes in idct_8x16's .end tail.
;-----------------------------------------------------------------------------
cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [o(int16_perm)]
    vpermb               m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
    vpermb               m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
    vpermb               m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
    vpermb               m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
    vpbroadcastd         m5, [o(pw_2896x8)]
    punpckldq            m1, m3, m2        ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m2            ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m2, m4, m0        ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m4, m0            ; e2 f2 g2 h2 e3 f3 g3 h3
    REPX   {pmulhrsw x, m5}, m1, m2, m3, m4
    punpcklqdq           m0, m1, m2        ; a0 b0 c0 d0 e0 f0 g0 h0
    punpckhqdq           m1, m2            ; a1 b1 c1 d1 e1 f1 g1 h1
    punpcklqdq           m2, m3, m4        ; a2 b2 c2 d2 e2 f2 g2 h2
    punpckhqdq           m3, m4            ; a3 b3 c3 d3 e3 f3 g3 h3
    jmp                tx2q
.pass2:
    vpbroadcastd         m7, [o(pw_1697x16)]
    mova                ym8, [o(gather8b)]
    lea                  r3, [dstq+strideq*2]
    pmulhrsw             m4, m7, m0
    pmulhrsw             m5, m7, m1
    pmulhrsw             m6, m7, m2
    pmulhrsw             m7, m3
    REPX      {paddsw x, x}, m0, m1, m2, m3
    paddsw               m0, m4
    paddsw               m1, m5
    paddsw               m2, m6
    paddsw               m3, m7
    jmp m(idct_8x16_internal_8bpc).end

;-----------------------------------------------------------------------------
; WRITE_16X2 coef1, coef2, tmp1, tmp2, offset1, offset2
;   Zero-extends the bytes at dstq+offset1 / dstq+offset2 to words, adds
;   coef1 / coef2, packs back to bytes with unsigned saturation and stores
;   16 bytes to each of the two offsets.
;   coef1/coef2: a register number (expanded to m<n>) or any other operand
;                accepted by paddw (used verbatim).
;   tmp1/tmp2:   register numbers used as scratch; both are clobbered.
;-----------------------------------------------------------------------------
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    pmovzxbw            m%3, [dstq+%5]
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
    pmovzxbw            m%4, [dstq+%6]
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vpermq              m%3, m%3, q3120 ; undo the per-lane interleave of packuswb
    mova          [dstq+%5], xm%3
    vextracti32x4 [dstq+%6], m%3, 1
%endmacro

;-----------------------------------------------------------------------------
; INV_TXFM_16X4_FN type1, type2
;   Expands INV_TXFM_FN for the 16x4 size. For the dct_dct combination it also
;   emits a DC-only path that loads the first coefficient sign-extended into
;   r6d, stores eobd to [cq] and jumps to the 16x8 .dconly2 code.
;   NOTE(review): presumably only reached with eob == 0 - confirm against
;   INV_TXFM_FN.
;-----------------------------------------------------------------------------
%macro INV_TXFM_16X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x4
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
%endif
%endmacro

INIT_ZMM avx512icl
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

;-----------------------------------------------------------------------------
; idct_16x4_internal_8bpc
;   Named arguments: dst, stride, c, eob, tx2.
;   Entry:    loads eight 16-byte rows from cq into xm0-xm7, runs
;             idct_4x16's .main, merges the results into m0/m1, scales by
;             pw_16384, interleaves word-wise and jumps to tx2q.
;   .pass2:   runs IDCT4_1D_PACKED, loads permA into m2 and finishes in
;             iadst_16x4's .end tail.
;-----------------------------------------------------------------------------
cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm0, [cq+16*0]
    mova                xm1, [cq+16*1]
    mova                xm2, [cq+16*2]
    mova                xm3, [cq+16*3]
    mova                xm4, [cq+16*4]
    mova                xm5, [cq+16*5]
    mova                xm6, [cq+16*6]
    mova                xm7, [cq+16*7]
    call m(idct_4x16_internal_8bpc).main
    vpbroadcastd         m8, [o(pw_16384)]
    vinserti32x4        ym1, xm3, 1 ; 3 2 7 6
    vinserti32x4        ym5, xm7, 1 ; b a f e
    vinserti32x4        ym0, xm2, 1 ; 0 1 4 5
    vinserti32x4        ym4, xm6, 1 ; 8 9 c d
    vinserti32x8         m1, ym5, 1 ; 3 2 7 6 b a f e
    vinserti32x8         m0, ym4, 1 ; 0 1 4 5 8 9 c d
    pmulhrsw             m1, m8
    pmulhrsw             m0, m8
    pshufd               m1, m1, q1032
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    mova                 m2, [o(permA)]
    jmp m(iadst_16x4_internal_8bpc).end

INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

; iadst_16x4_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: loads 128 bytes of coefficients from cq into m0/m1, prepares the
;        permutation registers m3 (odd dwords of permB, duplicated) and m10
;        (m3 shifted right by 4 bits per qword) and calls
;        m(iadst_4x16_internal_8bpc).main2.
; .pass1_end (also entered from iflipadst_16x4_internal_8bpc):
;        expects the sign/scale pair in m6 and qword permutation indices in
;        m0 and m10. Forms the remaining outputs with two vpdpwssd dot
;        products accumulated onto m9, permutes and interleaves the result
;        into m0/m1, scales by m6 and jumps to the address in tx2q.
;        NOTE(review): m2, m4 and m9 are presumably left by .main2 (m9 as
;        the rounding term for the >>12) - confirm in that routine.
; .pass2: runs .main, then falls into the store tail with permA+1 in m2.
; .end:  scales m0/m1 by pw_2048 with pmulhrsw.
; .end2: permutes the m0:m1 pair into m2 and m3 using the indices in m2.
; .end3: adds m2/m3 to four 16-byte rows at dst, stores them with unsigned
;        saturation and clears 128 bytes at cq.
; .main: IADST4_1D_PACKED as a callable subroutine.
cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_16384_m16384)]
    psrlq                m0, m10, 4
    psrlq               m10, 8
.pass1_end:
    punpcklwd           ym5, ym4, ym2
    punpckhwd           ym4, ym2
    vinserti32x8         m5, ym4, 1
    mova                 m1, m9
    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
    mova                 m4, m9
    vpdpwssd             m4, m5, [o(pw_2896_2896)] {1to16}
    psrad                m1, 12
    psrad                m4, 12
    packssdw             m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5
    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp                tx2q
.pass2:
    call .main
    movu                 m2, [o(permA+1)]
.end:
    vpbroadcastd         m3, [o(pw_2048)]
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
.end2:
    ; vpermi2q only reads the low four bits of each qword index, so shifting
    ; the index vector right by 4 exposes a second set of indices.
    psrlq                m3, m2, 4
    vpermi2q             m2, m0, m1
    vpermi2q             m3, m0, m1
.end3:
    lea                  r3, [dstq+strideq*2]
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [r3  +strideq*0], 2
    vinserti32x4         m1, [r3  +strideq*1], 3
    pxor                 m4, m4
    mova          [cq+64*0], m4     ; clear the coefficient buffer
    mova          [cq+64*1], m4
    punpcklbw            m0, m1, m4 ; pixels: bytes -> words
    punpckhbw            m1, m4
    paddw                m0, m2
    paddw                m1, m3
    packuswb             m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; iflipadst_16x4_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: same loads and call as iadst_16x4_internal_8bpc, but with
;        pw_m16384_16384 in m6 and permutation indices taken from different
;        bit offsets of m10 (12 and 16 instead of 4 and 8), then continues
;        at the shared .pass1_end of the adst routine.
; .pass2: calls the adst .main, selects permA+2 as the output permutation
;        and finishes in the adst store tail (.end).
cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_m16384_16384)]
    psrlq                m0, m10, 12
    psrlq               m10, 16
    jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
    call m(iadst_16x4_internal_8bpc).main
    movu                 m2, [o(permA+2)]
    jmp m(iadst_16x4_internal_8bpc).end

INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

; iidentity_16x4_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: loads 128 bytes of coefficients from cq, splits them into even and
;        odd dwords (m0/m1), adds to each value its pmulhrsw product with
;        pw_1697x16 after a further pmulhrsw by pw_16384 (saturating add),
;        permutes bytes with idtx_16x4p and jumps to the address in tx2q.
; .pass2: adds to m0/m1 their pmulhrsw product with pw_1697x8 (saturating),
;        selects permA+1 as the output permutation and finishes in the
;        store tail at m(iadst_16x4_internal_8bpc).end.
cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m1, [cq+64*0]
    mova                 m2, [cq+64*1]
    vpbroadcastd         m3, [o(pw_1697x16)]
    vpbroadcastd         m4, [o(pw_16384)]
    mova                 m5, [o(idtx_16x4p)]
    shufps               m0, m1, m2, q2020
    shufps               m1, m2, q3131
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddsw               m0, m2
    paddsw               m1, m3
    vpermb               m0, m5, m0
    vpermb               m1, m5, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    movu                 m2, [o(permA+1)]
    jmp m(iadst_16x4_internal_8bpc).end

; INV_TXFM_16X8_FN type1, type2
;
; Expands INV_TXFM_FN for the 16x8 block size. For the dct_dct pair it also
; emits the DC-only path, whose labels are entered from other block sizes
; through m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2 / .dconly3:
;   .dconly   r6d = (r6d*181 + 128) >> 8
;   .dconly2  r6d = (r6d*181 + 128 + 256) >> 9
;   .dconly3  r6d = (r6d*181 + 128 + 2048) >> 12, broadcast as a word to m3
;   .dconly_loop  adds m3 to four 16-byte rows at dst per iteration with
;             unsigned saturation, advances dst by four rows and repeats
;             while r3d, decremented by 4, stays above zero
; (181/256 is approximately 0.7071.)
; Register use on this path: r6d = DC value, r3d = remaining rows,
; r2 = 3*stride (this overwrites the coefficient pointer, which is no longer
; read afterwards).
; NOTE(review): `or r3d, 8` yields a row count of 8 only if r3d is 0 on
; arrival; presumably r3d is eob and INV_TXFM_FN only falls through here when
; eob == 0, which would also make `mov [cq], eobd` clear the DC coefficient -
; confirm against INV_TXFM_FN.
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x8
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 8
.dconly:
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
.dconly2:
    imul                r6d, 181
    add                 r6d, 128+256
    sar                 r6d, 8+1
.dconly3:
    imul                r6d, 181
    lea                  r2, [strideq*3]
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    pxor                 m2, m2
    vpbroadcastw         m3, r6d
.dconly_loop:
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [dstq+strideq*2], 2
    vinserti32x4         m1, [dstq+r2       ], 3
    punpcklbw            m0, m1, m2 ; pixels: bytes -> words
    punpckhbw            m1, m2
    paddw                m0, m3
    paddw                m1, m3
    packuswb             m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                 r3d, 4
    jg .dconly_loop
    RET
%endif
%endmacro

; ITX_16X8_LOAD_COEFS shuf_odd
;
; Loads eight 32-byte coefficient rows from cq into m0-m7 through vpermq and
; scales all of them with pmulhrsw by pw_2896x8 (clobbers m8). Even rows
; (m0, m2, m4, m6) use the qword order q3120, odd rows (m1, m3, m5, m7) use
; q<shuf_odd>. cq is advanced by 32*4 bytes as a side effect.
; NOTE(review): the instruction forms assume a 256-bit register width is
; active where the macro is expanded; it is not expanded in this part of the
; file.
%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpermq               m0, [cq+32*0], q3120
    add                  cq, 32*4
    vpermq               m7, [cq+32*3], q%1
    vpermq               m1, [cq-32*3], q%1
    vpermq               m6, [cq+32*2], q3120
    vpermq               m2, [cq-32*2], q3120
    vpermq               m5, [cq+32*1], q%1
    vpermq               m3, [cq-32*1], q%1
    vpermq               m4, [cq+32*0], q3120
    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

; idct_16x8_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: loads 256 bytes of coefficients from cq with a q3120 qword permute,
;        scales them with pmulhrsw by pw_2896x8, splits each register into
;        its two 256-bit halves and calls m(idct_8x16_internal_8bpc).main.
;        The results are regrouped, scaled by pw_16384 and interleaved into
;        m2-m5 before jumping to the address in tx2q.
; .pass2: regroups m2-m5 into m0-m3, calls .main and permutes the result
;        into m0, m4, m1, m5 using index vectors derived from permC.
; .end:  scales m0, m4, m1, m5 with pmulhrsw by m6 (pw_2048 on the
;        fall-through path; other routines jump here with their own m6).
; .end2: adds the four registers to eight 16-byte rows at dst, stores them
;        with unsigned saturation and clears 256 bytes at cq.
;        m0/m4 supply rows 0, 1, 4, 5 and m1/m5 supply rows 2, 3, 6, 7.
; .main: IDCT8_1D_PACKED as a callable subroutine (declared with
;        cglobal_label).
cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd         m1, [o(pw_2896x8)]
    vpermq               m0, [cq+64*0], q3120
    vpermq               m2, [cq+64*1], q3120
    vpermq               m4, [cq+64*2], q3120
    vpermq               m6, [cq+64*3], q3120
    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6
    vextracti32x8       ym1, m0, 1
    vextracti32x8       ym3, m2, 1
    vextracti32x8       ym5, m4, 1
    vextracti32x8       ym7, m6, 1
    call m(idct_8x16_internal_8bpc).main
    vbroadcasti32x4      m8, [o(int_shuf1)]
    vbroadcasti32x4      m9, [o(int_shuf2)]
    vinserti32x8         m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
    vinserti32x8         m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
    vinserti32x8         m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
    vinserti32x8         m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
    vpbroadcastd         m2, [o(pw_16384)]
    pshufb               m0, m8     ; a0 b0 a1 b1 a2 b2 a3 b3
    pshufb               m1, m9     ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
    pshufb               m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
    REPX   {pmulhrsw x, m2}, m0, m1, m6, m7
    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
    jmp                tx2q
.pass2:
    vshufi32x4           m0, m2, m4, q2020 ; 0 1
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020 ; 2 3
    vshufi32x4           m3, m5, q3131     ; 6 7
    call .main
    movshdup             m4, [o(permC)]
    psrlq                m6, m4, 4
    vpermq               m5, m4, q1032
    vpermi2q             m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3
    vpermt2q             m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3
    vpermt2q             m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1
    vpbroadcastd         m6, [o(pw_2048)]
.end:
    REPX   {pmulhrsw x, m6}, m0, m4, m1, m5
.end2:
    lea                  r3, [dstq+strideq*4]
    lea                  r4, [strideq*3]
    mova                xm3, [dstq+strideq*0]
    mova                xm6, [dstq+strideq*2]
    vinserti32x4        ym3, [dstq+strideq*1], 1
    vinserti32x4        ym6, [dstq+r4       ], 1
    vinserti32x4         m3, [r3  +strideq*0], 2
    vinserti32x4         m6, [r3  +strideq*2], 2
    vinserti32x4         m3, [r3  +strideq*1], 3
    vinserti32x4         m6, [r3  +r4       ], 3
    pxor                 m7, m7
    mova          [cq+64*0], m7     ; clear the coefficient buffer
    mova          [cq+64*1], m7
    mova          [cq+64*2], m7
    mova          [cq+64*3], m7
    punpcklbw            m2, m3, m7 ; rows 0 1 4 5: bytes -> words
    punpckhbw            m3, m7
    paddw                m0, m2
    paddw                m4, m3
    packuswb             m0, m4
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    punpcklbw            m3, m6, m7 ; rows 2 3 6 7: bytes -> words
    punpckhbw            m6, m7
    paddw                m1, m3
    paddw                m5, m6
    packuswb             m1, m5
    mova          [dstq+strideq*2], xm1
    vextracti32x4 [dstq+r4       ], ym1, 1
    vextracti32x4 [r3  +strideq*2], m1, 2
    vextracti32x4 [r3  +r4       ], m1, 3
    RET
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; iadst_16x8_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: calls m(iadst_8x16_internal_8bpc).main_pass1, loads the sign/scale
;        pair pw_16384_m16384 into m7 and shifts the index register m10.
; .pass1_end (also entered from iflipadst_16x8_internal_8bpc):
;        expects the sign/scale pair in m7 and permutation indices in m10.
;        Forms the remaining outputs with four vpdpwssd dot products
;        accumulated onto m9 and shifted right by 12, permutes and
;        interleaves everything into m2-m5, scales by m7 and jumps to the
;        address in tx2q.
;        NOTE(review): m0-m4, m9 and m10 are presumably set up by the
;        main_pass1 routine that is called, which is outside this excerpt.
; .pass2: regroups m2-m5, calls .main_pass2, scales m0/m1 by m6, permutes
;        the result with indices derived from permC and finishes at
;        m(idct_16x8_internal_8bpc).end2.
; .main_pass1: loads 256 bytes from cq scaled by pw_2896x8, swaps dwords
;        between register pairs under mask k1, permutes bytes with
;        int16_perm and runs IADST8_1D_PACKED 1. It is not referenced within
;        this excerpt; presumably it is called from elsewhere in the file.
; .main_pass2: runs IADST8_1D_PACKED 2, then packs m6 and its dword negation
;        into m6 and scales m2/m3 by it; m0/m1 are left for the caller to
;        scale.
cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd         m7, [o(pw_16384_m16384)]
    psrlq               m10, 4
.pass1_end:
    punpcklwd            m5, m4, m2
    punpckhwd            m4, m2
    mova                 m1, m9
    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
    mova                 m6, m9
    vpdpwssd             m6, m5, [o(pw_2896_2896)] {1to16}
    mova                 m2, m9
    vpdpwssd             m2, m4, [o(pw_m2896_2896)] {1to16}
    vpdpwssd             m9, m4, [o(pw_2896_2896)] {1to16}
    psrad                m1, 12
    psrad                m6, 12
    packssdw             m1, m6 ; out8 -out7 -out9 out6
    psrad                m2, 12
    psrad                m9, 12
    packssdw             m2, m9 ; -out11 out4 out10 -out5
    psrlq                m4, m10, 4
    vpermi2q             m4, m0, m2
    vpermt2q             m0, m10, m2
    psrlq                m5, m10, 8
    vpermi2q             m5, m1, m3
    psrlq               m10, 12
    vpermt2q             m1, m10, m3
    punpcklwd            m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
    punpckhwd            m4, m5     ; b0 d0 b1 d1 b2 d2 b3 d3
    punpcklwd            m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
    punpckhwd            m1, m0     ; j0 l0 j1 l1 j2 l2 j3 l3
    punpcklwd            m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhwd            m3, m4     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpcklwd            m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhwd            m5, m1     ; i2 j2 k2 l2 i3 j3 k3 l3
    REPX   {pmulhrsw x, m7}, m2, m3, m4, m5
    jmp                tx2q
.pass2:
    vshufi32x4           m0, m2, m4, q2020
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020
    vshufi32x4           m3, m5, q3131     ; 6 7
    pshufd               m4, m0, q1032     ; 1 0
    pshufd               m5, m1, q1032     ; 3 2
    call .main_pass2
    movshdup             m4, [o(permC)]
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    psrlq                m6, m4, 4
    mova                 m5, m4
    vpermi2q             m4, m0, m2
    vpermt2q             m0, m6, m2
    vpermi2q             m5, m1, m3
    vpermt2q             m1, m6, m3
    jmp m(idct_16x8_internal_8bpc).end2
ALIGN function_align
.main_pass1:
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m3, m4, [cq+64*0]
    pmulhrsw             m1, m4, [cq+64*3]
    pmulhrsw             m2, m4, [cq+64*1]
    pmulhrsw             m4, [cq+64*2]
    mova                 m5, [o(int16_perm)]
    kxnorb               k1, k1, k1 ; k1 = 0x00ff: selects the low eight dwords
    vpblendmd        m0{k1}, m1, m3 ; 0 7
    vmovdqa32        m3{k1}, m1     ; 6 1
    vpblendmd        m1{k1}, m4, m2 ; 2 5
    vmovdqa32        m2{k1}, m4     ; 4 3
    REPX  {vpermb x, m5, x}, m0, m1, m2, m3
    IADST8_1D_PACKED 1
    ret
ALIGN function_align
cglobal_label .main_pass2
    IADST8_1D_PACKED 2
    pxor                 m5, m5
    psubd                m5, m6     ; m5 = -m6 (per dword)
    packssdw             m6, m5
    pmulhrsw             m2, m6
    pmulhrsw             m3, m6
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; iflipadst_16x8_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: same call as iadst_16x8_internal_8bpc, but with pw_m16384_16384 in
;        m7 and m10 shifted by 20 instead of 4 bits, then continues at the
;        shared .pass1_end of the adst routine.
; .pass2: same regrouping and .main_pass2 call as the adst routine; the
;        scaled results are then assigned to the output registers with
;        different permC-derived indices (shifted by 8 and 12 bits) and with
;        the roles of m0/m1 and m2/m3 exchanged, before finishing at
;        m(idct_16x8_internal_8bpc).end2.
cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd         m7, [o(pw_m16384_16384)]
    psrlq               m10, 20
    jmp m(iadst_16x8_internal_8bpc).pass1_end
.pass2:
    vshufi32x4           m0, m2, m4, q2020
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020
    vshufi32x4           m3, m5, q3131     ; 6 7
    pshufd               m4, m0, q1032     ; 1 0
    pshufd               m5, m1, q1032     ; 3 2
    call m(iadst_16x8_internal_8bpc).main_pass2
    movshdup             m4, [o(permC)]
    pmulhrsw             m5, m6, m0
    pmulhrsw             m0, m6, m1
    psrlq                m1, m4, 12
    psrlq                m4, 8
    mova                 m7, m4
    vpermi2q             m4, m0, m3
    vpermt2q             m0, m1, m3
    vpermi2q             m1, m5, m2
    vpermt2q             m5, m7, m2
    jmp m(idct_16x8_internal_8bpc).end2

INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; iidentity_16x8_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: loads 256 bytes of coefficients from cq scaled by pw_2896x8,
;        regroups dwords into m2-m5, adds to each value its pmulhrsw product
;        with pw_1697x16 after a further pmulhrsw by pw_16384 (saturating
;        add), permutes bytes with int8_permA and jumps to the address in
;        tx2q.
; .pass2: reorders qwords of m2-m5 with permB into m0, m4, m1, m5, loads
;        pw_4096 into m6 and finishes at m(idct_16x8_internal_8bpc).end,
;        which applies m6 as the final pmulhrsw factor. No other arithmetic
;        is performed in this pass.
cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd         m0, [o(pw_2896x8)]
    pmulhrsw             m3, m0, [cq+64*0]
    pmulhrsw             m4, m0, [cq+64*1]
    pmulhrsw             m5, m0, [cq+64*2]
    pmulhrsw             m0, [cq+64*3]
    vpbroadcastd         m7, [o(pw_1697x16)]
    vpbroadcastd         m8, [o(pw_16384)]
    shufps               m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
    shufps               m3, m4, q3131     ; a2 a3 a6 a7 e2 e3 e6 e7
    shufps               m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
    shufps               m5, m0, q3131     ; i2 i3 i6 i7 m2 m3 m6 m7
    mova                 m9, [o(int8_permA)]
    pmulhrsw             m0, m7, m2
    pmulhrsw             m1, m7, m3
    pmulhrsw             m6, m7, m4
    pmulhrsw             m7, m5
    REPX   {pmulhrsw x, m8}, m0, m1, m6, m7
    paddsw               m2, m0
    paddsw               m3, m1
    paddsw               m4, m6
    paddsw               m5, m7
    REPX  {vpermb x, m9, x}, m2, m3, m4, m5
    jmp                tx2q
.pass2:
    mova                 m7, [o(permB)]
    vpbroadcastd         m6, [o(pw_4096)]
    vpermq               m0, m7, m2
    vpermq               m4, m7, m4
    vpermq               m1, m7, m3
    vpermq               m5, m7, m5
    jmp m(idct_16x8_internal_8bpc).end

; INV_TXFM_16X16_FN type1, type2
;
; Expands INV_TXFM_FN for the 16x16 block size. For the dct_dct pair it also
; emits a short path that sign-extends the first coefficient word into r6d,
; stores eobd to [cq], ORs 16 into r3d, computes
; r6d = (r6d*181 + 128 + 512) >> 10 and continues at .dconly3 of the 16x8
; dct_dct function.
; NOTE(review): as in the 16x8 variant, r3d is presumably a zero eob on this
; path, making it a row count of 16 - confirm against INV_TXFM_FN.
%macro INV_TXFM_16X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x16
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 16
    imul                r6d, 181
    add                 r6d, 128+512
    sar                 r6d, 8+2
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst

; idct_16x16_internal_8bpc
;
; Declared arguments: dst, stride, c, eob, tx2.
; Entry: loads 512 bytes of coefficients from cq, reordering the qwords of
;        each 64-byte row with permB, calls .main, shuffles the results with
;        int_shuf1/int_shuf2, scales them with pmulhrsw by pw_8192,
;        interleaves dwords into m0-m7 and jumps to the address in tx2q.
; .pass2: begins by regrouping 128-bit lanes across m0-m9.
; NOTE(review): .pass2 and the .main subroutine continue beyond the end of
; this excerpt; the visible statements are carried over unchanged.
cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m7, [o(permB)]
    vpermq               m0, m7, [cq+64*0]
    vpermq               m1, m7, [cq+64*1]
    vpermq               m2, m7, [cq+64*2]
    vpermq               m3, m7, [cq+64*3]
    vpermq               m4, m7, [cq+64*4]
    vpermq               m5, m7, [cq+64*5]
    vpermq               m6, m7, [cq+64*6]
    vpermq               m7, m7, [cq+64*7]
    call .main
    vbroadcasti32x4     m12, [o(int_shuf1)]
    vbroadcasti32x4     m11, [o(int_shuf2)]
    vpbroadcastd        m13, [o(pw_8192)]
    pshufb               m0, m12
    pshufb               m8, m1, m11
    pshufb               m2, m12
    pshufb               m9, m3, m11
    pshufb               m4, m12
    pshufb              m10, m5, m11
    pshufb               m6, m12
    pshufb              m11, m7, m11
    REPX  {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
    punpckhdq            m1, m0, m8
    punpckldq            m0, m8
    punpckhdq            m3, m2, m9
    punpckldq            m2, m9
    punpckhdq            m5, m4, m10
    punpckldq            m4, m10
    punpckhdq            m7, m6, m11
    punpckldq            m6, m11
    jmp                tx2q
.pass2:
    vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
    vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
    vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
    vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
    vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
    vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
    vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
    vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
    vshufi32x4           m2, m0, m4, q3131 ; 4 5
    vshufi32x4           m0, m4, q2020     ; 0 1
Build Coastguard Worker vshufi32x4 m4, m6, m8, q2020 ; 8 9 2387*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m8, q3131 ; 12 13 2388*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m5, q3131 ; 6 7 2389*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m5, q2020 ; 2 3 2390*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m7, m9, q2020 ; 10 11 2391*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m9, q3131 ; 14 15 2392*c0909341SAndroid Build Coastguard Worker call .main 2393*c0909341SAndroid Build Coastguard Worker mova m8, [o(permD)] 2394*c0909341SAndroid Build Coastguard Worker psrlq m12, m8, 4 2395*c0909341SAndroid Build Coastguard Worker psrlq m9, m8, 8 2396*c0909341SAndroid Build Coastguard Worker psrlq m13, m8, 12 2397*c0909341SAndroid Build Coastguard Worker mova m10, m8 2398*c0909341SAndroid Build Coastguard Worker vpermi2q m8, m0, m2 ; 0 1 4 5 2399*c0909341SAndroid Build Coastguard Worker vpermt2q m0, m12, m2 2400*c0909341SAndroid Build Coastguard Worker mova m11, m9 2401*c0909341SAndroid Build Coastguard Worker vpermi2q m9, m1, m3 ; 2 3 6 7 2402*c0909341SAndroid Build Coastguard Worker vpermt2q m1, m13, m3 2403*c0909341SAndroid Build Coastguard Worker vpermi2q m10, m4, m6 ; 8 9 12 13 2404*c0909341SAndroid Build Coastguard Worker vpermt2q m4, m12, m6 2405*c0909341SAndroid Build Coastguard Worker vpermi2q m11, m5, m7 ; 10 11 14 15 2406*c0909341SAndroid Build Coastguard Worker vpermt2q m5, m13, m7 2407*c0909341SAndroid Build Coastguard Worker.end: 2408*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_2048)] 2409*c0909341SAndroid Build Coastguard Worker.end2: 2410*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m0, m1, m4, m5 2411*c0909341SAndroid Build Coastguard Worker.end3: 2412*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m8, m9, m10, m11 2413*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 2414*c0909341SAndroid Build Coastguard Worker lea r4, 
[dstq+strideq*4] 2415*c0909341SAndroid Build Coastguard Worker lea r5, [dstq+strideq*8] 2416*c0909341SAndroid Build Coastguard Worker lea r6, [r4 +strideq*8] 2417*c0909341SAndroid Build Coastguard Worker mova xm3, [dstq+strideq*0] 2418*c0909341SAndroid Build Coastguard Worker mova xm6, [dstq+strideq*2] 2419*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym3, [dstq+strideq*1], 1 2420*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym6, [dstq+r3 ], 1 2421*c0909341SAndroid Build Coastguard Worker vinserti32x4 m3, [r4+strideq*0], 2 2422*c0909341SAndroid Build Coastguard Worker vinserti32x4 m6, [r4+strideq*2], 2 2423*c0909341SAndroid Build Coastguard Worker vinserti32x4 m3, [r4+strideq*1], 3 2424*c0909341SAndroid Build Coastguard Worker vinserti32x4 m6, [r4+r3 ], 3 2425*c0909341SAndroid Build Coastguard Worker mova xm12, [r5+strideq*0] 2426*c0909341SAndroid Build Coastguard Worker mova xm13, [r5+strideq*2] 2427*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym12, [r5+strideq*1], 1 2428*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym13, [r5+r3 ], 1 2429*c0909341SAndroid Build Coastguard Worker vinserti32x4 m12, [r6+strideq*0], 2 2430*c0909341SAndroid Build Coastguard Worker vinserti32x4 m13, [r6+strideq*2], 2 2431*c0909341SAndroid Build Coastguard Worker vinserti32x4 m12, [r6+strideq*1], 3 2432*c0909341SAndroid Build Coastguard Worker vinserti32x4 m13, [r6+r3 ], 3 2433*c0909341SAndroid Build Coastguard Worker pxor m7, m7 2434*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 2435*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m3, m7 2436*c0909341SAndroid Build Coastguard Worker punpckhbw m3, m7 2437*c0909341SAndroid Build Coastguard Worker paddw m0, m2 2438*c0909341SAndroid Build Coastguard Worker paddw m8, m3 2439*c0909341SAndroid Build Coastguard Worker packuswb m0, m8 2440*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m6, m7 2441*c0909341SAndroid Build Coastguard Worker punpckhbw 
m6, m7 2442*c0909341SAndroid Build Coastguard Worker paddw m1, m2 2443*c0909341SAndroid Build Coastguard Worker paddw m9, m6 2444*c0909341SAndroid Build Coastguard Worker packuswb m1, m9 2445*c0909341SAndroid Build Coastguard Worker punpcklbw m2, m12, m7 2446*c0909341SAndroid Build Coastguard Worker punpckhbw m12, m7 2447*c0909341SAndroid Build Coastguard Worker paddw m2, m4 2448*c0909341SAndroid Build Coastguard Worker paddw m10, m12 2449*c0909341SAndroid Build Coastguard Worker packuswb m2, m10 2450*c0909341SAndroid Build Coastguard Worker punpcklbw m3, m13, m7 2451*c0909341SAndroid Build Coastguard Worker punpckhbw m13, m7 2452*c0909341SAndroid Build Coastguard Worker paddw m3, m5 2453*c0909341SAndroid Build Coastguard Worker paddw m11, m13 2454*c0909341SAndroid Build Coastguard Worker packuswb m3, m11 2455*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm0 2456*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], ym0, 1 2457*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*2], xm1 2458*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r3 ], ym1, 1 2459*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r4+strideq*0], m0, 2 2460*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r4+strideq*1], m0, 3 2461*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r4+strideq*2], m1, 2 2462*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r4+r3 ], m1, 3 2463*c0909341SAndroid Build Coastguard Worker mova [r5+strideq*0], xm2 2464*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r5+strideq*1], ym2, 1 2465*c0909341SAndroid Build Coastguard Worker mova [r5+strideq*2], xm3 2466*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r5+r3 ], ym3, 1 2467*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r6+strideq*0], m2, 2 2468*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r6+strideq*1], m2, 3 2469*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r6+strideq*2], m3, 2 
2470*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r6+r3 ], m3, 3 2471*c0909341SAndroid Build Coastguard Worker RET 2472*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2473*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast2 ; bottom three-quarters are zero 2474*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [o(pd_2048)] 2475*c0909341SAndroid Build Coastguard Worker vpbroadcastq m13, [o(int_mshift)] 2476*c0909341SAndroid Build Coastguard Worker vpcmpub k7, m13, m10, 6 2477*c0909341SAndroid Build Coastguard Worker.main_fast4: 2478*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_401_4076x8)] 2479*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [o(pw_m1189_3920x8)] 2480*c0909341SAndroid Build Coastguard Worker vpbroadcastd m3, [o(pw_799_4017x8)] 2481*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m8 ; t8a t15a 2482*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m1 ; t11a t12a 2483*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m3 ; t4a t7a 2484*c0909341SAndroid Build Coastguard Worker pxor m6, m6 2485*c0909341SAndroid Build Coastguard Worker psubsw m0, m2, m4 ; t11a t12a 2486*c0909341SAndroid Build Coastguard Worker paddsw m8, m2, m4 ; t8a t15a 2487*c0909341SAndroid Build Coastguard Worker mova m1, m7 2488*c0909341SAndroid Build Coastguard Worker jmp .main5 2489*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2490*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast ; bottom half is zero 2491*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [o(pd_2048)] 2492*c0909341SAndroid Build Coastguard Worker.main_fast3: 2493*c0909341SAndroid Build Coastguard Worker vpbroadcastq m13, [o(int_mshift)] 2494*c0909341SAndroid Build Coastguard Worker vpcmpub k7, m13, m10, 6 2495*c0909341SAndroid Build Coastguard Worker.main_fast5: 2496*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_401_4076x8)] 2497*c0909341SAndroid Build Coastguard 
Worker vpbroadcastd m4, [o(pw_m2598_3166x8)] 2498*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_1931_3612x8)] 2499*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m1189_3920x8)] 2500*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m2 ; t8a t15a 2501*c0909341SAndroid Build Coastguard Worker vpbroadcastd m2, [o(pw_799_4017x8)] 2502*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m4 ; t9a t14a 2503*c0909341SAndroid Build Coastguard Worker vpbroadcastd m4, [o(pw_m2276_3406x8)] 2504*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m11 ; t10a t13a 2505*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m12 ; t11a t12a 2506*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m2 ; t4a t7a 2507*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m4 ; t5a t6a 2508*c0909341SAndroid Build Coastguard Worker jmp .main4 2509*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2510*c0909341SAndroid Build Coastguard Workercglobal_label .main 2511*c0909341SAndroid Build Coastguard Worker IDCT16_1D_PACKED 2512*c0909341SAndroid Build Coastguard Worker ret 2513*c0909341SAndroid Build Coastguard Worker 2514*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, dct 2515*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, adst 2516*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, flipadst 2517*c0909341SAndroid Build Coastguard Worker 2518*c0909341SAndroid Build Coastguard Workercglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2519*c0909341SAndroid Build Coastguard Worker call .main_pass1 2520*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [o(pw_8192_m8192)] 2521*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 2522*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 2523*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 2524*c0909341SAndroid 
Build Coastguard Worker punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 2525*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 2526*c0909341SAndroid Build Coastguard Worker punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 2527*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 2528*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 2529*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 2530*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 2531*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 2532*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 2533*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 2534*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 2535*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 2536*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 2537*c0909341SAndroid Build Coastguard Worker.pass1_end: 2538*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 2539*c0909341SAndroid Build Coastguard Worker jmp tx2q 2540*c0909341SAndroid Build Coastguard Worker.pass2: 2541*c0909341SAndroid Build Coastguard Worker call .main_pass2 2542*c0909341SAndroid Build Coastguard Worker mova m10, [o(permD)] 2543*c0909341SAndroid Build Coastguard Worker psrlq m8, m10, 8 2544*c0909341SAndroid Build Coastguard Worker psrlq m12, m10, 12 2545*c0909341SAndroid Build Coastguard Worker psrlq m13, m10, 4 2546*c0909341SAndroid Build Coastguard Worker mova m9, m8 2547*c0909341SAndroid Build Coastguard Worker vpermi2q m8, m0, m2 ; 0 1 4 5 2548*c0909341SAndroid Build Coastguard Worker vpermt2q m0, m12, m2 2549*c0909341SAndroid Build Coastguard Worker 
vpermi2q m9, m1, m3 ; 2 3 6 7 2550*c0909341SAndroid Build Coastguard Worker vpermt2q m1, m12, m3 2551*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_2048)] 2552*c0909341SAndroid Build Coastguard Worker mov r3d, 0xff00ff00 2553*c0909341SAndroid Build Coastguard Worker mova m11, m10 2554*c0909341SAndroid Build Coastguard Worker vpermi2q m10, m4, m6 ; 8 9 12 13 2555*c0909341SAndroid Build Coastguard Worker vpermt2q m4, m13, m6 2556*c0909341SAndroid Build Coastguard Worker kmovd k1, r3d 2557*c0909341SAndroid Build Coastguard Worker vpermi2q m11, m5, m7 ; 10 11 14 15 2558*c0909341SAndroid Build Coastguard Worker vpermt2q m5, m13, m7 2559*c0909341SAndroid Build Coastguard Worker pxor m7, m7 2560*c0909341SAndroid Build Coastguard Worker vpsubw m12{k1}, m7, m12 2561*c0909341SAndroid Build Coastguard Worker jmp m(idct_16x16_internal_8bpc).end2 2562*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2563*c0909341SAndroid Build Coastguard Worker.main_pass1: 2564*c0909341SAndroid Build Coastguard Worker mova m4, [o(permB)] 2565*c0909341SAndroid Build Coastguard Worker psrlq m3, m4, 4 2566*c0909341SAndroid Build Coastguard Worker vpermq m0, m4, [cq+64*0] 2567*c0909341SAndroid Build Coastguard Worker vpermq m7, m3, [cq+64*7] 2568*c0909341SAndroid Build Coastguard Worker vpermq m6, m4, [cq+64*6] 2569*c0909341SAndroid Build Coastguard Worker vpermq m1, m3, [cq+64*1] 2570*c0909341SAndroid Build Coastguard Worker vpermq m2, m4, [cq+64*2] 2571*c0909341SAndroid Build Coastguard Worker vpermq m5, m3, [cq+64*5] 2572*c0909341SAndroid Build Coastguard Worker vpermq m4, m4, [cq+64*4] 2573*c0909341SAndroid Build Coastguard Worker vpermq m3, m3, [cq+64*3] 2574*c0909341SAndroid Build Coastguard Worker call .main 2575*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [o(pw_2896_2896)] 2576*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m2896_2896)] 2577*c0909341SAndroid Build Coastguard Worker mova m2, m10 2578*c0909341SAndroid Build 
Coastguard Worker vpdpwssd m2, m5, m13 ; -out5 2579*c0909341SAndroid Build Coastguard Worker mova m8, m10 2580*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m11, m13 ; out4 2581*c0909341SAndroid Build Coastguard Worker mova m9, m10 2582*c0909341SAndroid Build Coastguard Worker vpdpwssd m9, m5, m12 ; out10 2583*c0909341SAndroid Build Coastguard Worker mova m5, m10 2584*c0909341SAndroid Build Coastguard Worker vpdpwssd m5, m11, m12 ; -out11 2585*c0909341SAndroid Build Coastguard Worker mova m11, m10 2586*c0909341SAndroid Build Coastguard Worker vpdpwssd m11, m3, m13 ; -out7 2587*c0909341SAndroid Build Coastguard Worker mova m14, m10 2588*c0909341SAndroid Build Coastguard Worker vpdpwssd m14, m4, m13 ; out6 2589*c0909341SAndroid Build Coastguard Worker mova m13, m10 2590*c0909341SAndroid Build Coastguard Worker vpdpwssd m13, m3, m12 ; out8 2591*c0909341SAndroid Build Coastguard Worker vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 2592*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 2593*c0909341SAndroid Build Coastguard Worker packssdw m2, m8 ; -out5 out4 2594*c0909341SAndroid Build Coastguard Worker packssdw m5, m9, m5 ; out10 -out11 2595*c0909341SAndroid Build Coastguard Worker packssdw m3, m11, m14 ; -out7 out6 2596*c0909341SAndroid Build Coastguard Worker packssdw m4, m13, m10 ; out8 -out9 2597*c0909341SAndroid Build Coastguard Worker ret 2598*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2599*c0909341SAndroid Build Coastguard Worker.main_pass2: 2600*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc 2601*c0909341SAndroid Build Coastguard Worker vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 2602*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec 2603*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 2604*c0909341SAndroid Build Coastguard Worker vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me 
2605*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 2606*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee 2607*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 2608*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m0, m4, q3131 ; 4 5 2609*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m4, q2020 ; 0 1 2610*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m6, m8, q2020 ; 8 9 2611*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m8, q3131 ; 12 13 2612*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m5, q3131 ; 6 7 2613*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m5, q2020 ; 2 3 2614*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m7, m9, q2020 ; 10 11 2615*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m9, q3131 ; 14 15 2616*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2b 2617*c0909341SAndroid Build Coastguard Worker REPX {pshufd x, x, q1032}, m1, m3, m5, m7 2618*c0909341SAndroid Build Coastguard Worker call .main 2619*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_2896x8)] 2620*c0909341SAndroid Build Coastguard Worker pshufb m2, m11, m12 2621*c0909341SAndroid Build Coastguard Worker pshufb m5, m12 2622*c0909341SAndroid Build Coastguard Worker pshufb m3, m12 2623*c0909341SAndroid Build Coastguard Worker pshufb m4, m12 2624*c0909341SAndroid Build Coastguard Worker punpcklqdq m9, m5, m2 ; t15a t7 2625*c0909341SAndroid Build Coastguard Worker punpckhqdq m5, m2 ; t14a t6 2626*c0909341SAndroid Build Coastguard Worker shufps m2, m3, m4, q1032 ; t2a t10 2627*c0909341SAndroid Build Coastguard Worker shufps m3, m4, q3210 ; t3a t11 2628*c0909341SAndroid Build Coastguard Worker psubsw m4, m2, m3 ; out8 -out9 2629*c0909341SAndroid Build Coastguard Worker paddsw m3, m2 ; -out7 out6 2630*c0909341SAndroid Build Coastguard Worker paddsw m2, m5, m9 ; -out5 out4 2631*c0909341SAndroid 
Build Coastguard Worker psubsw m5, m9 ; out10 -out11 2632*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m8}, m2, m3, m4, m5 2633*c0909341SAndroid Build Coastguard Worker ret 2634*c0909341SAndroid Build Coastguard WorkerALIGN function_align 2635*c0909341SAndroid Build Coastguard Worker.main: 2636*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [o(pd_2048)] 2637*c0909341SAndroid Build Coastguard Worker vpbroadcastq m13, [o(int_mshift)] 2638*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m7, m0 ; in14 in1 2639*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m7 ; in0 in15 2640*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m6, m1 ; in12 in3 2641*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m6 ; in2 in13 2642*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m5, m2 ; in10 in5 2643*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m5 ; in4 in11 2644*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m4, m3 ; in8 in7 2645*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m4 ; in6 in9 2646*c0909341SAndroid Build Coastguard Worker vpcmpub k7, m13, m10, 6 ; 0x33... 
2647*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 2648*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 2649*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 2650*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 2651*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 2652*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 2653*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 2654*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 2655*c0909341SAndroid Build Coastguard Worker psubsw m4, m0, m5 ; t9a t8a 2656*c0909341SAndroid Build Coastguard Worker paddsw m0, m5 ; t1a t0a 2657*c0909341SAndroid Build Coastguard Worker psubsw m5, m1, m6 ; t11a t10a 2658*c0909341SAndroid Build Coastguard Worker paddsw m1, m6 ; t3a t2a 2659*c0909341SAndroid Build Coastguard Worker psubsw m6, m2, m7 ; t13a t12a 2660*c0909341SAndroid Build Coastguard Worker paddsw m2, m7 ; t5a t4a 2661*c0909341SAndroid Build Coastguard Worker psubsw m7, m3, m8 ; t15a t14a 2662*c0909341SAndroid Build Coastguard Worker paddsw m3, m8 ; t7a t6a 2663*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 2664*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 2665*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 2666*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 2667*c0909341SAndroid Build Coastguard Worker psubsw m8, m1, m3 ; t7 t6 2668*c0909341SAndroid Build Coastguard Worker paddsw m1, m3 ; t3 t2 2669*c0909341SAndroid Build Coastguard Worker psubsw m3, m0, m2 ; t5 t4 2670*c0909341SAndroid 
Build Coastguard Worker paddsw m0, m2 ; t1 t0 2671*c0909341SAndroid Build Coastguard Worker psubsw m2, m5, m7 ; t14a t15a 2672*c0909341SAndroid Build Coastguard Worker paddsw m7, m5 ; t10a t11a 2673*c0909341SAndroid Build Coastguard Worker psubsw m5, m4, m6 ; t12a t13a 2674*c0909341SAndroid Build Coastguard Worker paddsw m4, m6 ; t8a t9a 2675*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a 2676*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a 2677*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 2678*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 2679*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m12, [o(deint_shuf)] 2680*c0909341SAndroid Build Coastguard Worker paddsw m6, m4, m7 ; -out1 out14 2681*c0909341SAndroid Build Coastguard Worker psubsw m4, m7 ; t10 t11 2682*c0909341SAndroid Build Coastguard Worker psubsw m11, m3, m8 ; t7 t6 2683*c0909341SAndroid Build Coastguard Worker paddsw m8, m3 ; out12 -out3 2684*c0909341SAndroid Build Coastguard Worker psubsw m3, m0, m1 ; t3a t2a 2685*c0909341SAndroid Build Coastguard Worker paddsw m0, m1 ; -out15 out0 2686*c0909341SAndroid Build Coastguard Worker paddsw m1, m2, m5 ; -out13 out2 2687*c0909341SAndroid Build Coastguard Worker psubsw m5, m2 ; t15a t14a 2688*c0909341SAndroid Build Coastguard Worker pshufb m0, m12 2689*c0909341SAndroid Build Coastguard Worker pshufb m6, m12 2690*c0909341SAndroid Build Coastguard Worker pshufb m8, m12 2691*c0909341SAndroid Build Coastguard Worker pshufb m1, m12 2692*c0909341SAndroid Build Coastguard Worker shufps m7, m6, m0, q1032 ; out14 -out15 2693*c0909341SAndroid Build Coastguard Worker shufps m0, m6, m0, q3210 ; -out1 out0 2694*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m8, m1 ; out12 -out13 2695*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m8, m1 ; 
-out3 out2 2696*c0909341SAndroid Build Coastguard Worker ret 2697*c0909341SAndroid Build Coastguard Worker 2698*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, dct 2699*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, adst 2700*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, flipadst 2701*c0909341SAndroid Build Coastguard Worker 2702*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 2703*c0909341SAndroid Build Coastguard Worker call m(iadst_16x16_internal_8bpc).main_pass1 2704*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [o(pw_m8192_8192)] 2705*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 2706*c0909341SAndroid Build Coastguard Worker punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 2707*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 2708*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 2709*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 2710*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 2711*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 2712*c0909341SAndroid Build Coastguard Worker punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 2713*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 2714*c0909341SAndroid Build Coastguard Worker punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 2715*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 2716*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 2717*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 2718*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 2719*c0909341SAndroid Build 
Coastguard Worker punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 2720*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 2721*c0909341SAndroid Build Coastguard Worker jmp m(iadst_16x16_internal_8bpc).pass1_end 2722*c0909341SAndroid Build Coastguard Worker.pass2: 2723*c0909341SAndroid Build Coastguard Worker call m(iadst_16x16_internal_8bpc).main_pass2 2724*c0909341SAndroid Build Coastguard Worker mova m10, [o(permD)] 2725*c0909341SAndroid Build Coastguard Worker psrlq m8, m10, 8 2726*c0909341SAndroid Build Coastguard Worker psrlq m12, m10, 12 2727*c0909341SAndroid Build Coastguard Worker psrlq m13, m10, 4 2728*c0909341SAndroid Build Coastguard Worker mova m9, m8 2729*c0909341SAndroid Build Coastguard Worker vpermi2q m8, m7, m5 ; 0 1 4 5 2730*c0909341SAndroid Build Coastguard Worker vpermt2q m7, m12, m5 2731*c0909341SAndroid Build Coastguard Worker vpermi2q m9, m6, m4 ; 2 3 6 7 2732*c0909341SAndroid Build Coastguard Worker vpermt2q m6, m12, m4 2733*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_2048)] 2734*c0909341SAndroid Build Coastguard Worker mov r3d, 0x00ff00ff 2735*c0909341SAndroid Build Coastguard Worker mova m11, m10 2736*c0909341SAndroid Build Coastguard Worker vpermi2q m10, m3, m1 ; 8 9 12 13 2737*c0909341SAndroid Build Coastguard Worker vpermt2q m3, m13, m1 2738*c0909341SAndroid Build Coastguard Worker kmovd k1, r3d 2739*c0909341SAndroid Build Coastguard Worker vpermi2q m11, m2, m0 ; 10 11 14 15 2740*c0909341SAndroid Build Coastguard Worker vpermt2q m2, m13, m0 2741*c0909341SAndroid Build Coastguard Worker pxor m0, m0 2742*c0909341SAndroid Build Coastguard Worker vpsubw m12{k1}, m0, m12 2743*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m7, m12 2744*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m6, m12 2745*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m3, m12 2746*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m2, m12 2747*c0909341SAndroid Build Coastguard Worker jmp 
m(idct_16x16_internal_8bpc).end3

INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity

;-----------------------------------------------------------------------------
; iidentity_16x16_internal_8bpc: both passes of the 16x16 identity transform.
;
; Pass 1 loads eight 64-byte coefficient vectors from cq through an int16_perm
; byte permute, forms t = pmulhrsw(x, pw_1697x16) >> 1, replaces x with
; pavgw(x, t), interleaves neighbouring registers dword-wise and tail-jumps to
; the second pass through tx2q.
;
; Pass 2 (.pass2) computes 2*x + pmulhrsw(x, pw_1697x16) with saturating adds
; (roughly 2*sqrt(2)*x if the constant is 1697*16 as its name suggests --
; TODO confirm), regroups qwords using indices derived from permD and
; tail-jumps to the shared store code at m(idct_16x16_internal_8bpc).end.
;-----------------------------------------------------------------------------
cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                m8, [o(int16_perm)]
    vpermb              m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
    vpermb              m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
    vpbroadcastd        m0, [o(pw_1697x16)]
    vpermb              m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
    vpermb              m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
    vpermb              m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
    vpermb              m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
    vpermb              m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
    vpermb              m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
    ; m9-m15 and m0 receive the scaled copies; m0 (the constant) is consumed
    ; last so that it can double as the eighth temporary.
    pmulhrsw            m9, m0, m1
    pmulhrsw            m10, m0, m2
    pmulhrsw            m11, m0, m3
    pmulhrsw            m12, m0, m4
    pmulhrsw            m13, m0, m5
    pmulhrsw            m14, m0, m6
    pmulhrsw            m15, m0, m7
    pmulhrsw            m0, m8
    REPX                {psraw x, 1}, m9, m10, m11, m12
    pavgw               m1, m9
    pavgw               m2, m10
    pavgw               m3, m11
    pavgw               m4, m12
    REPX                {psraw x, 1}, m13, m14, m15, m0
    pavgw               m5, m13
    pavgw               m6, m14
    pavgw               m7, m15
    pavgw               m8, m0
    punpckldq           m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq           m1, m2     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq           m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq           m3, m4     ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq           m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq           m5, m6     ; i2 j2 k2 l2 i3 j3 k3 l3
    punpckldq           m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
    punpckhdq           m7, m8     ; m2 n2 o2 p2 m3 n3 o3 p3
    jmp                 tx2q
ALIGN function_align
.pass2:
    vpbroadcastd        m11, [o(pw_1697x16)]
    pmulhrsw            m12, m11, m0
    pmulhrsw            m13, m11, m1
    pmulhrsw            m14, m11, m2
    pmulhrsw            m15, m11, m3
    pmulhrsw            m8, m11, m4
    pmulhrsw            m9, m11, m5
    pmulhrsw            m10, m11, m6
    pmulhrsw            m11, m7
    REPX                {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    paddsw              m0, m12
    paddsw              m1, m13
    paddsw              m2, m14
    paddsw              m3, m15
    paddsw              m8, m4          ; sums for m4/m5 land in m8/m9 ...
    movu                m4, [o(permD+2)] ; ... freeing m4 for the permute indices
    paddsw              m9, m5
    paddsw              m6, m10
    paddsw              m7, m11
    psrlq               m12, m4, 4      ; second index set, taken from the same table
    mova                m5, m4
    mova                m10, m4
    mova                m11, m4
    vpermi2q            m4, m0, m2      ; 8 9 12 13
    vpermt2q            m0, m12, m2     ; 0 1 4 5
    vpermi2q            m5, m1, m3      ; 10 11 14 15
    vpermt2q            m1, m12, m3     ; 2 3 6 7
    vpermi2q            m10, m8, m6
    vpermt2q            m8, m12, m6
    vpermi2q            m11, m9, m7
    vpermt2q            m9, m12, m7
    jmp                 m(idct_16x16_internal_8bpc).end

; Duplicate every word of m%3 (low words of each 128-bit lane into m%1, high
; words into m%2) and scale them by the broadcast dword constants
; pw_%5_%6x8 and pw_%7_%8x8 respectively using pmulhrsw.
; m%4 is clobbered. m%1 is written before the second read of m%3, so m%1 must
; not alias m%3; m%2 may.
%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
    vpbroadcastd        m%4, [o(pw_%5_%6x8)]
    punpcklwd           m%1, m%3, m%3
    pmulhrsw            m%1, m%4
    vpbroadcastd        m%4, [o(pw_%7_%8x8)]
    punpckhwd           m%2, m%3, m%3
    pmulhrsw            m%2, m%4
%endmacro

;-----------------------------------------------------------------------------
; inv_txfm_add_dct_dct_8x32_8bpc(dst, stride, c, eob)
;
; Dispatches on eob:
;   eob == 0   -> .dconly
;   eob <  107 -> .fast (only part of the coefficients is loaded)
;   otherwise  -> full path, all eight 64-byte coefficient vectors are loaded
; The .fast and full paths meet at .end, which zeroes the 8*64 coefficient
; bytes at cq and adds the result to dst using qword gathers/scatters.
;
; Exported helper labels (called from other transform sizes as well):
;   .main_fast2 / .main_fast / .main  odd half of the 32-point DCT (ymm code)
;   .main_end                         final butterflies against m0-m7
;-----------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
%undef cmp
    lea                 r5, [o_base]
    test                eobd, eobd
    jz                  .dconly
    cmp                 eobd, 107
    jb                  .fast
    mova                m5, [cq+64*5]
    mova                m3, [cq+64*3]
    mova                m1, [cq+64*1]
    mova                m7, [cq+64*7]
    mova                m2, [cq+64*2]
    mova                m6, [cq+64*6]
    mova                m0, [cq+64*0]
    mova                m4, [cq+64*4]
    call                m(inv_txfm_add_dct_dct_32x8_8bpc).main
    mova                m8, [o(idct_8x32p)]
    vpbroadcastd        m9, [o(pw_8192)]
    REPX                {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
    punpckldq           m8, m0, m1 ; ab
    punpckhdq           m0, m1
    punpckldq           m1, m2, m3 ; cd
    punpckhdq           m2, m3
    punpckldq           m3, m4, m5 ; ef
    punpckhdq           m4, m5
    punpckldq           m5, m6, m7 ; gh
    punpckhdq           m6, m7
    REPX                {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
    ; Row indices held by each register; the four lists together cover every
    ; row 0..31 exactly once.
    punpcklqdq          m18, m8, m1 ; 30  2  6 26 31  1 23  9
    punpckhqdq          m14, m8, m1 ; 16  0 12 20  3 29 11 21
    punpcklqdq          m21, m0, m2 ; 14 18 22 10 27  5 19 13
    punpckhqdq          m15, m0, m2 ; 28  4 24  8  7 25 15 17
    punpcklqdq          m20, m3, m5
    punpckhqdq          m16, m3, m5
    punpcklqdq          m19, m4, m6
    punpckhqdq          m17, m4, m6
    vinserti32x4        ym8, ym18, xm20, 1
    vshufi32x4          ym1, ym18, ym20, 0x03
    vinserti32x4        ym9, ym14, xm16, 1
    vshufi32x4          ym3, ym14, ym16, 0x03
    vinserti32x4        ym0, ym21, xm19, 1
    vshufi32x4          ym5, ym21, ym19, 0x03
    vinserti32x4        ym7, ym15, xm17, 1
    vshufi32x4          ym6, ym15, ym17, 0x03
    call                m(idct_8x16_internal_8bpc).main2
    psrlq               m12, [o(permB)], 60
    vpermt2q            m14, m12, m16
    vpermt2q            m21, m12, m19
    vpermt2q            m15, m12, m17
    vpermi2q            m12, m18, m20
    vextracti32x8       ym16, m14, 1
    vextracti32x8       ym19, m21, 1
    vextracti32x8       ym17, m15, 1
    vextracti32x8       ym20, m12, 1
    call                .main2
    jmp                 .end
.fast: ; right half is zero
    mova                m0, [o(int16_perm)]
    mova                ym2, [cq+64*4]
    vinserti32x8        m2, [cq+64*0], 1
    mova                ym3, [cq+64*6]
    vinserti32x8        m3, [cq+64*2], 1
    mova                ym4, [cq+64*3]
    vinserti32x8        m4, [cq+64*5], 1
    mova                ym5, [cq+64*7]
    vinserti32x8        m5, [cq+64*1], 1
    REPX                {vpermb x, m0, x}, m2, m3, m4, m5
    call                m(idct_16x8_internal_8bpc).main2
    vbroadcasti32x4     m4, [o(int_shuf3)]
    vbroadcasti32x4     m5, [o(int_shuf4)]
    pshufb              m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3
    pshufb              m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3
    pshufb              m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3
    pshufb              m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3
    vpbroadcastd        m4, [o(pw_8192)]
    psrlq               m5, [o(permB)], 60
    punpckldq           m6, m2, m3  ; e0 f0 g0 h0 e2 f2 g2 h2
    punpckhdq           m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
    punpckldq           m2, m0, m1  ; a0 b0 c0 d0 a2 b2 c2 d2
    punpckhdq           m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
    REPX                {pmulhrsw x, m4}, m6, m17, m2, m16
    vinserti32x4        ym0, ym2, xm6, 1       ;  0  2
    vshufi32x4          ym1, ym2, ym6, 0x03    ;  4  6
    vinserti32x4        ym14, ym16, xm17, 1    ;  1  3
    vshufi32x4          ym15, ym16, ym17, 0x03 ;  5  7
    vpermt2q            m2, m5, m6             ;  8 10
    vpermt2q            m16, m5, m17           ;  9 11
    vextracti32x8       ym3, m2, 1             ; 12 14
    vextracti32x8       ym17, m16, 1           ; 13 15
    call                m(idct_8x16_internal_8bpc).main_fast
    call                .main_fast
.end:
    ; ym8 = stride * gather8d: dword offsets used by every gather/scatter below.
    vpbroadcastd        ym8, strided
    pmulld              ym8, [o(gather8d)]
    call                .main_end           ; also leaves pw_2048 in m10
    lea                 r3, [dstq+strideq*4]
    kxnorb              k1, k1, k1          ; all-ones 8-bit mask
    lea                 r4, [dstq+strideq*8]
    pxor                m9, m9
    lea                 r1, [r3+strideq*8]  ; r1 = dst + stride*12
    ; Gathers and scatters clear their mask register as they complete, so a
    ; fresh all-ones copy is bounced between k1 and k2 before each one.
    kmovb               k2, k1
    vpgatherdq          m12{k1}, [r0+ym8]
    kmovb               k1, k2
    vpgatherdq          m13{k2}, [r3+ym8]
    kmovb               k2, k1
    vpgatherdq          m14{k1}, [r4+ym8]
    kmovb               k1, k2
    vpgatherdq          m15{k2}, [r1+ym8]
    REPX                {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX                {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear coefficients
    ; Widen the gathered dst bytes to words, add the residual, pack back with
    ; unsigned saturation and scatter to the same addresses.
    punpcklbw           m11, m12, m9
    punpckhbw           m12, m9
    paddw               m0, m11
    paddw               m1, m12
    packuswb            m0, m1
    kmovb               k2, k1
    vpscatterdq         [r0+ym8]{k1}, m0
    punpcklbw           m12, m13, m9
    punpckhbw           m13, m9
    paddw               m2, m12
    paddw               m3, m13
    packuswb            m2, m3
    kmovb               k1, k2
    vpscatterdq         [r3+ym8]{k2}, m2
    punpcklbw           m13, m14, m9
    punpckhbw           m14, m9
    paddw               m4, m13
    paddw               m5, m14
    packuswb            m4, m5
    kmovb               k2, k1
    vpscatterdq         [r4+ym8]{k1}, m4
    punpcklbw           m14, m15, m9
    punpckhbw           m15, m9
    paddw               m6, m14
    paddw               m7, m15
    packuswb            m6, m7
    vpscatterdq         [r1+ym8]{k2}, m6
    RET
.dconly:
    movsx               r6d, word [cq]
    mov                 [cq], eobd          ; eob is 0 on this path: clears the DC coefficient
    ; NOTE(review): r3d presumably aliases eobd (zero here), which would make
    ; this "r3d = 32", the row count consumed by .dconly2 -- confirm.
    or                  r3d, 32
    imul                r6d, 181
    add                 r6d, 128+512
    sar                 r6d, 8+2            ; r6d = (dc*181 + 640) >> 10
    jmp                 m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
INIT_YMM avx512icl
ALIGN function_align
cglobal_label .main_fast2 ; bottom three-quarters are zero
    ITX_UNPACK_MULHRSW  12, 14, 14, 8, 201, 4091, m601, 4052  ; t16a, t31a, t23a, t24a
    ITX_UNPACK_MULHRSW  21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
    mova                m11, m12
    mova                m17, m20
    mova                m15, m21
    mova                m16, m14
    jmp                 .main4
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    ITX_UNPACK_MULHRSW  12, 14, 14, 8, 201, 4091, m601, 4052   ; t16a, t31a, t23a, t24a
    ITX_UNPACK_MULHRSW  21, 15, 15, 8, 995, 3973, m1380, 3857  ; t20a, t27a, t19a, t28a
    ITX_UNPACK_MULHRSW  20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
    ITX_UNPACK_MULHRSW  19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
    jmp                 .main3
ALIGN function_align
cglobal_label .main
    punpcklwd           m12, m21, m14 ; in31 in1
    punpckhwd           m14, m21      ; in3 in29
    punpcklwd           m21, m20, m15 ; in27 in5
    punpckhwd           m15, m20      ; in7 in25
    punpcklwd           m20, m19, m16 ; in23 in9
    punpckhwd           m16, m19      ; in11 in21
    punpcklwd           m19, m18, m17 ; in19 in13
    punpckhwd           m17, m18      ; in15 in17
.main2:
    ITX_MUL2X_PACK      12, 8, 9, 10, 201, 4091, 5  ; t16a, t31a
    ITX_MUL2X_PACK      14, 8, 9, 10, 4052, 601, 5  ; t23a, t24a
    ITX_MUL2X_PACK      21, 8, 9, 10, 995, 3973, 5  ; t20a, t27a
    ITX_MUL2X_PACK      15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
    ITX_MUL2X_PACK      20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
    ITX_MUL2X_PACK      16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
    ITX_MUL2X_PACK      19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
    ITX_MUL2X_PACK      17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
.main3:
    psubsw              m11, m12, m17 ; t17 t30
    paddsw              m12, m17      ; t16 t31
    psubsw              m17, m15, m20 ; t18 t29
    paddsw              m20, m15      ; t19 t28
    psubsw              m15, m21, m16 ; t21 t26
    paddsw              m21, m16      ; t20 t27
    psubsw              m16, m14, m19 ; t22 t25
    paddsw              m14, m19      ; t23 t24
.main4:
    ITX_MUL2X_PACK      11, 18, 19, 10, 799, 4017, 5   ; t17a t30a
    ITX_MUL2X_PACK      17, 18, 19, 10, m4017, 799, 5  ; t18a t29a
    ITX_MUL2X_PACK      15, 18, 19, 10, 3406, 2276, 5  ; t21a t26a
    ITX_MUL2X_PACK      16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
    vpbroadcastd        m8, [o(pw_m3784_1567)]
    psubsw              m19, m12, m20 ; t19a t28a
    paddsw              m20, m12      ; t16a t31a
    psubsw              m12, m14, m21 ; t20a t27a
    paddsw              m14, m21      ; t23a t24a
    psubsw              m21, m11, m17 ; t18 t29
    paddsw              m11, m17      ; t17 t30
    psubsw              m17, m16, m15 ; t21 t26
    paddsw              m16, m15      ; t22 t25
    ITX_MUL2X_PACK      21, 18, 15, 10, 1567_3784, 8, 20   ; t18a t29a
    ITX_MUL2X_PACK      19, 18, 15, 10, 1567_3784, 8, 20   ; t19 t28
    ITX_MUL2X_PACK      12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27
    ITX_MUL2X_PACK      17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
    vbroadcasti32x4     m18, [o(deint_shuf)]
    vpbroadcastd        m8, [o(pw_m2896_2896)]
    vpbroadcastd        m9, [o(pw_2896_2896)]
    psubsw              m15, m20, m14 ; t23 t24
    paddsw              m20, m14      ; t16 t31
    psubsw              m14, m11, m16 ; t22a t25a
    paddsw              m11, m16      ; t17a t30a
    psubsw              m16, m21, m17 ; t21 t26
    paddsw              m21, m17      ; t18 t29
    psubsw              m17, m19, m12 ; t20a t27a
    paddsw              m19, m12      ; t19a t28a
    REPX                {pshufb x, m18}, m20, m11, m21, m19
    ITX_MUL2X_PACK      15, 18, 12, 10, 8, 9, 8 ; t23a t22a
    ITX_MUL2X_PACK      14, 13, 15, 10, 8, 9, 8 ; t22 t25
    packssdw            m18, m13      ; t23a t22
    packssdw            m12, m15      ; t24a t25
    ITX_MUL2X_PACK      16, 13, 15, 10, 8, 9, 8 ; t21a t26a
    ITX_MUL2X_PACK      17, 16, 14, 10, 8, 9, 8 ; t20 t27
    packssdw            m16, m13      ; t20 t21a
    packssdw            m14, m15      ; t27 t26a
    punpcklqdq          m13, m19, m21 ; t19a t18
    punpckhqdq          m19, m21      ; t28a t29
    punpcklqdq          m21, m20, m11 ; t16 t17a
    punpckhqdq          m20, m11      ; t31 t30a
INIT_ZMM avx512icl
    mova                m15, [o(permA)] ; qword permute indices for the caller's merge step
    ret
; Merges register pairs with the indices in m15, then applies the last
; butterfly stage: sums stay in m0/m2/m4/m6, differences go to m7/m5/m3/m1.
; Leaves pw_2048 in m10.
cglobal_label .main_end
    vpbroadcastd        m10, [o(pw_2048)]
    vpermt2q            m0, m15, m1   ; t0 t1 t2 t3
    vpermt2q            m20, m15, m19 ; t31 t30a t29 t28a
    vpermt2q            m2, m15, m3   ; t4 t5 t6 t7
    vpermt2q            m14, m15, m12 ; t27 t26a t25 t24a
    vpermt2q            m4, m15, m5   ; t8 t9 t10 t11
    vpermt2q            m18, m15, m16 ; t23a t22 t21a t20
    vpermt2q            m6, m15, m7   ; t12 t13 t14 t15
    vpermt2q            m13, m15, m21 ; t19a t18 t17a t16
    psubsw              m7, m0, m20   ; out31 out30 out29 out28
    paddsw              m0, m20       ; out0 out1 out2 out3
    psubsw              m5, m2, m14   ; out27 out26 out25 out24
    paddsw              m2, m14       ; out4 out5 out6 out7
    psubsw              m3, m4, m18   ; out23 out22 out21 out20
    paddsw              m4, m18       ; out8 out9 out10 out11
    psubsw              m1, m6, m13   ; out19 out18 out17 out16
    paddsw              m6, m13       ; out12 out13 out14 out15
    vzeroupper
    ret

; Load two 16-byte coefficient groups, cq+16*%2 and cq+16*%3, into ym%1:
; the low 128-bit lane receives the low 8 bytes of each group, the high lane
; the high 8 bytes of each group (first group in the low qword of the lane).
; Clobbers ym8, so %1 must not be 8.
%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
    vbroadcasti32x4     ym%1, [cq+16*%2]
    vbroadcasti32x4     ym8, [cq+16*%3]
    shufpd              ym%1, ym8, 0x0c
%endmacro

;-----------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x8_8bpc(dst, stride, c, eob)
;
; Dispatches on eob:
;   eob == 0   -> .dconly
;   eob <  107 -> .fast (groups 16..31 are not loaded; m5-m7 are zeroed)
;   otherwise  -> full path, all 32 16-byte coefficient groups are loaded
; Coefficient memory is cleared as it is consumed (cq+64*0..3 up front,
; cq+64*4..7 on the full path only). .pass2 finishes the first pass,
; transposes, runs the 8-point DCT in .main and adds the result to eight
; 32-byte rows of dst.
;
; Additional labels: .dconly2 / .dconly3 / .dconly_loop form the DC-only
; fill loop; .main / .main2 / .main3 are the 8-point DCT on m0-m7.
;-----------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
%undef cmp
    test                eobd, eobd
    jz                  .dconly
    lea                 r5, [o_base]
    LOAD_PACKED_16X2    0, 0, 2    ; in0 in2
    LOAD_PACKED_16X2    1, 4, 6    ; in4 in6
    LOAD_PACKED_16X2    2, 8, 10   ; in8 in10
    LOAD_PACKED_16X2    3, 12, 14  ; in12 in14
    LOAD_PACKED_16X2    14, 1, 3   ; in1 in3
    LOAD_PACKED_16X2    15, 5, 7   ; in5 in7
    LOAD_PACKED_16X2    16, 9, 11  ; in9 in11
    LOAD_PACKED_16X2    17, 13, 15 ; in13 in15
    pxor                m4, m4
    REPX                {mova [cq+64*x], m4}, 0, 1, 2, 3
    cmp                 eobd, 107
    jb                  .fast
    LOAD_PACKED_16X2    4, 16, 18  ; in16 in18
    LOAD_PACKED_16X2    5, 20, 22  ; in20 in22
    LOAD_PACKED_16X2    6, 24, 26  ; in24 in26
    LOAD_PACKED_16X2    7, 28, 30  ; in28 in30
    call                m(idct_8x16_internal_8bpc).main
    LOAD_PACKED_16X2    18, 19, 17 ; in19 in17
    LOAD_PACKED_16X2    19, 23, 21 ; in23 in21
    LOAD_PACKED_16X2    20, 27, 25 ; in27 in25
    LOAD_PACKED_16X2    21, 31, 29 ; in31 in29
    pxor                m8, m8
    REPX                {mova [cq+64*x], m8}, 4, 5, 6, 7
    call                m(inv_txfm_add_dct_dct_8x32_8bpc).main
    jmp                 .pass2
.fast: ; bottom half is zero
    mova                ym5, ym4    ; m4 is still zero from the pxor above
    mova                ym6, ym4
    mova                ym7, ym4
    call                m(idct_8x16_internal_8bpc).main
    call                m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
.pass2:
    ; NOTE(review): several tNN lane labels on the vpermt2q lines below do not
    ; agree with the outNN labels on the butterflies that consume them (e.g.
    ; m14 is labelled t23a.. but feeds out27..). Treat the labels as
    ; approximate and confirm against permA before relying on them.
    vpbroadcastd        m10, [o(pw_8192)]
    vpermt2q            m0, m15, m4   ; t0 t1 t9 t8
    vpermt2q            m20, m15, m18 ; t31 t30a t23a t22
    vpermt2q            m3, m15, m7   ; t7 t6 t14 t15
    vpermt2q            m12, m15, m21 ; t25 t24a t17a t16
    vpermt2q            m2, m15, m6   ; t4 t5 t13 t12
    vpermt2q            m14, m15, m13 ; t23a t22 t21a t20
    vpermt2q            m1, m15, m5   ; t3 t2 t10 t11
    vpermt2q            m19, m15, m16 ; t27 t26a t19a t18
    psubsw              m8, m0, m20   ; out31 out30 out22 out23
    paddsw              m0, m20       ; out0 out1 out9 out8
    paddsw              m6, m3, m12   ; out7 out6 out14 out15
    psubsw              m3, m12       ; out24 out25 out17 out16
    psubsw              m5, m2, m14   ; out27 out26 out18 out19
    paddsw              m4, m2, m14   ; out4 out5 out13 out12
    psubsw              m7, m1, m19   ; out28 out29 out21 out20
    paddsw              m2, m1, m19   ; out3 out2 out10 out11
    vzeroupper
    vshufi32x4          m1, m0, m3, q1221 ; out1 out9 out17 out25
    vshufi32x4          m0, m3, q0330     ; out0 out8 out16 out24
    vshufi32x4          m3, m2, m5, q0330 ; out3 out11 out19 out27
    vshufi32x4          m2, m5, q1221     ; out2 out10 out18 out26
    vshufi32x4          m5, m4, m7, q1221 ; out5 out13 out21 out29
    vshufi32x4          m4, m7, q0330     ; out4 out12 out20 out28
    vshufi32x4          m7, m6, m8, q0330 ; out7 out15 out23 out31
    vshufi32x4          m6, m8, q1221     ; out6 out14 out22 out30
    REPX                {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    call                m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
    call                .main
    vpbroadcastd        m8, [o(pw_2048)]
    REPX                {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    ; Add to dst: each zmm holds one 32-pixel row as words. Rows are widened
    ; with pmovzxbw, summed, packed pairwise with unsigned saturation,
    ; reordered with the qword indices in m12 and stored as two 32-byte rows.
    lea                 r2, [strideq*3]
    lea                 r3, [dstq+strideq*4]
    movshdup            m12, [o(permD)]
    pmovzxbw            m8, [dstq+strideq*0]
    pmovzxbw            m9, [dstq+strideq*1]
    pmovzxbw            m10, [dstq+strideq*2]
    pmovzxbw            m11, [dstq+r2]
    paddw               m0, m8
    paddw               m1, m9
    paddw               m2, m10
    paddw               m3, m11
    pmovzxbw            m8, [r3+strideq*0]
    pmovzxbw            m9, [r3+strideq*1]
    pmovzxbw            m10, [r3+strideq*2]
    pmovzxbw            m11, [r3+r2]
    paddw               m4, m8
    paddw               m5, m9
    paddw               m6, m10
    paddw               m7, m11
    packuswb            m0, m1
    packuswb            m2, m3
    vpermq              m0, m12, m0
    vpermq              m2, m12, m2
    mova                [dstq+strideq*0], ym0
    vextracti32x8       [dstq+strideq*1], m0, 1
    mova                [dstq+strideq*2], ym2
    vextracti32x8       [dstq+r2], m2, 1
    packuswb            m4, m5
    packuswb            m6, m7
    vpermq              m4, m12, m4
    vpermq              m6, m12, m6
    mova                [r3+strideq*0], ym4
    vextracti32x8       [r3+strideq*1], m4, 1
    mova                [r3+strideq*2], ym6
    vextracti32x8       [r3+r2], m6, 1
    RET
.dconly:
    movsx               r6d, word [cq]
    mov                 [cq], eobd          ; eob is 0 on this path: clears the DC coefficient
    ; NOTE(review): r3d presumably aliases eobd (zero here), which would make
    ; this "r3d = 8", the row count for .dconly_loop -- confirm.
    or                  r3d, 8
.dconly2:
    imul                r6d, 181
    add                 r6d, 128+512
    sar                 r6d, 8+2            ; r6d = (r6d*181 + 640) >> 10
.dconly3:
    imul                r6d, 181
    add                 r6d, 128+2048
    sar                 r6d, 8+4            ; r6d = (r6d*181 + 2176) >> 12
    pxor                m2, m2
    vpbroadcastw        m3, r6d
.dconly_loop:
    ; Two 32-byte rows per iteration: widen to words, add the DC value, pack
    ; back with unsigned saturation. r3d counts the remaining rows.
    mova                ym1, [dstq+strideq*0]
    vinserti32x8        m1, [dstq+strideq*1], 1
    punpcklbw           m0, m1, m2
    punpckhbw           m1, m2
    paddw               m0, m3
    paddw               m1, m3
    packuswb            m0, m1
    mova                [dstq+strideq*0], ym0
    vextracti32x8       [dstq+strideq*1], m0, 1
    lea                 dstq, [dstq+strideq*2]
    sub                 r3d, 2
    jg                  .dconly_loop
    RET
ALIGN function_align
; 8-point DCT on m0-m7 (inputs and outputs in the same registers).
; Uses m8-m12 as temporaries/constants; .main2 expects pd_2048 in m10 and
; .main3 additionally expects pw_2896_2896 / pw_m2896_2896 in m11 / m12.
cglobal_label .main
    vpbroadcastd        m10, [o(pd_2048)]
.main2:
    ITX_MULSUB_2W       5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
    ITX_MULSUB_2W       1, 7, 8, 9, 10, 799, 4017  ; t4a, t7a
    ITX_MULSUB_2W       2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
    vpbroadcastd        m11, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    ITX_MULSUB_2W       0, 4, 8, 9, 10, 11, 12     ; t1, t0
.main3:
    paddsw              m8, m1, m5 ; t4
    psubsw              m1, m5     ; t5a
    paddsw              m9, m7, m3 ; t7
    psubsw              m7, m3     ; t6a
    ITX_MULSUB_2W       7, 1, 3, 5, 10, 11, 12     ; t5, t6
    psubsw              m5, m0, m2 ; dct4 out2
    paddsw              m2, m0     ; dct4 out1
    paddsw              m0, m4, m6 ; dct4 out0
    psubsw              m4, m6     ; dct4 out3
    psubsw              m6, m2, m1 ; out6
    paddsw              m1, m2     ; out1
    paddsw              m2, m5, m7 ; out2
    psubsw              m5, m7     ; out5
    psubsw              m7, m0, m9 ; out7
    paddsw              m0, m9     ; out0
    paddsw              m3, m4, m8 ; out3
    psubsw              m4, m8     ; out4
    ret

cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
    vpbroadcastd        m7, [pw_5]
    paddsw              m0, m7, [cq+64*0]
    paddsw              m1, m7, [cq+64*1]
    vpbroadcastd        ym9, strided
    paddsw              m2, m7, [cq+64*2]
    paddsw              m3, m7, [cq+64*3]
    paddsw              m4, m7, [cq+64*4]
    paddsw              m5, m7, [cq+64*5]
    paddsw              m6, m7, [cq+64*6]
    paddsw              m7, [cq+64*7]
    pmulld              ym14, ym9, [pd_0to15]
    lea                 r3, [dstq+strideq*1]
    lea                 r4, [dstq+strideq*2]
    kxnorb              k1, k1, k1
    pxor                m13, m13
    add                 r1, r4 ; dstq+strideq*3
    kmovb               k2, k1
    vpgatherdq          m9{k1}, [r0+ym14*4]
    kmovb               k1, k2
    vpgatherdq          m10{k2}, [r3+ym14*4]
    kmovb               k2, k1
call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 3282*c0909341SAndroid Build Coastguard Worker REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 3283*c0909341SAndroid Build Coastguard Worker vpgatherdq m11{k1}, [r4+ym14*4] 3284*c0909341SAndroid Build Coastguard Worker kmovb k1, k2 3285*c0909341SAndroid Build Coastguard Worker vpgatherdq m12{k2}, [r1+ym14*4] 3286*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 3287*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m9, m13 ; 0 8 16 24 3288*c0909341SAndroid Build Coastguard Worker punpckhbw m9, m13 ; 4 12 20 28 3289*c0909341SAndroid Build Coastguard Worker paddw m0, m8 3290*c0909341SAndroid Build Coastguard Worker paddw m4, m9 3291*c0909341SAndroid Build Coastguard Worker packuswb m0, m4 3292*c0909341SAndroid Build Coastguard Worker kmovb k2, k1 3293*c0909341SAndroid Build Coastguard Worker vpscatterdq [r0+ym14*4]{k1}, m0 3294*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m10, m13 ; 1 9 17 25 3295*c0909341SAndroid Build Coastguard Worker punpckhbw m10, m13 ; 5 13 21 29 3296*c0909341SAndroid Build Coastguard Worker paddw m1, m8 3297*c0909341SAndroid Build Coastguard Worker paddw m5, m10 3298*c0909341SAndroid Build Coastguard Worker packuswb m1, m5 3299*c0909341SAndroid Build Coastguard Worker kmovb k1, k2 3300*c0909341SAndroid Build Coastguard Worker vpscatterdq [r3+ym14*4]{k2}, m1 3301*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m11, m13 ; 2 10 18 26 3302*c0909341SAndroid Build Coastguard Worker punpckhbw m11, m13 ; 6 14 22 30 3303*c0909341SAndroid Build Coastguard Worker paddw m2, m8 3304*c0909341SAndroid Build Coastguard Worker paddw m6, m11 3305*c0909341SAndroid Build Coastguard Worker packuswb m2, m6 3306*c0909341SAndroid Build Coastguard Worker kmovb k2, k1 3307*c0909341SAndroid Build Coastguard Worker vpscatterdq [r4+ym14*4]{k1}, m2 3308*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m12, m13 ; 3 11 19 27 3309*c0909341SAndroid 
Build Coastguard Worker punpckhbw m12, m13 ; 7 15 23 31 3310*c0909341SAndroid Build Coastguard Worker paddw m3, m8 3311*c0909341SAndroid Build Coastguard Worker paddw m7, m12 3312*c0909341SAndroid Build Coastguard Worker packuswb m3, m7 3313*c0909341SAndroid Build Coastguard Worker vpscatterdq [r1+ym14*4]{k2}, m3 3314*c0909341SAndroid Build Coastguard Worker RET 3315*c0909341SAndroid Build Coastguard Worker 3316*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c 3317*c0909341SAndroid Build Coastguard Worker vpbroadcastd m0, [pw_4096] 3318*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m0, [cq+64*0] 3319*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m0, [cq+64*4] 3320*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m0, [cq+64*1] 3321*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m0, [cq+64*5] 3322*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m0, [cq+64*2] 3323*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m0, [cq+64*6] 3324*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m0, [cq+64*3] 3325*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, [cq+64*7] 3326*c0909341SAndroid Build Coastguard Worker mova m13, [int8_permA] 3327*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 3328*c0909341SAndroid Build Coastguard Worker lea r4, [dstq+strideq*4] 3329*c0909341SAndroid Build Coastguard Worker punpckldq m1, m3, m4 3330*c0909341SAndroid Build Coastguard Worker punpckhdq m3, m4 3331*c0909341SAndroid Build Coastguard Worker punpckldq m4, m6, m5 3332*c0909341SAndroid Build Coastguard Worker punpckhdq m6, m5 3333*c0909341SAndroid Build Coastguard Worker punpckldq m5, m7, m2 3334*c0909341SAndroid Build Coastguard Worker punpckhdq m7, m2 3335*c0909341SAndroid Build Coastguard Worker punpckldq m2, m8, m0 3336*c0909341SAndroid Build Coastguard Worker punpckhdq m8, m0 3337*c0909341SAndroid Build Coastguard Worker mova ym9, [dstq+strideq*0] 
3338*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, [dstq+strideq*2], 1 3339*c0909341SAndroid Build Coastguard Worker mova ym10, [dstq+strideq*1] 3340*c0909341SAndroid Build Coastguard Worker vinserti32x8 m10, [dstq+r3 ], 1 3341*c0909341SAndroid Build Coastguard Worker mova ym11, [r4+strideq*0] 3342*c0909341SAndroid Build Coastguard Worker vinserti32x8 m11, [r4+strideq*2], 1 3343*c0909341SAndroid Build Coastguard Worker mova ym12, [r4+strideq*1] 3344*c0909341SAndroid Build Coastguard Worker vinserti32x8 m12, [r4+r3 ], 1 3345*c0909341SAndroid Build Coastguard Worker REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 3346*c0909341SAndroid Build Coastguard Worker pxor m13, m13 3347*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 3348*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m1, m4 ; a0 a2 c0 c2 3349*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m4 ; b0 b2 d0 d2 3350*c0909341SAndroid Build Coastguard Worker punpcklqdq m4, m5, m2 ; a1 a3 c1 c3 3351*c0909341SAndroid Build Coastguard Worker punpckhqdq m5, m2 ; b1 b3 d1 d3 3352*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m3, m6 ; e0 e2 g0 g2 3353*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m6 ; f0 f2 h0 h2 3354*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m7, m8 ; e1 e3 g1 g3 3355*c0909341SAndroid Build Coastguard Worker punpckhqdq m7, m8 ; f1 f3 h1 h3 3356*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m9, m13 3357*c0909341SAndroid Build Coastguard Worker punpckhbw m9, m13 3358*c0909341SAndroid Build Coastguard Worker paddw m0, m8 3359*c0909341SAndroid Build Coastguard Worker paddw m4, m9 3360*c0909341SAndroid Build Coastguard Worker packuswb m0, m4 3361*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym0 3362*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*2], m0, 1 3363*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m10, m13 3364*c0909341SAndroid 
Build Coastguard Worker punpckhbw m10, m13 3365*c0909341SAndroid Build Coastguard Worker paddw m1, m8 3366*c0909341SAndroid Build Coastguard Worker paddw m5, m10 3367*c0909341SAndroid Build Coastguard Worker packuswb m1, m5 3368*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*1], ym1 3369*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+r3 ], m1, 1 3370*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m11, m13 3371*c0909341SAndroid Build Coastguard Worker punpckhbw m11, m13 3372*c0909341SAndroid Build Coastguard Worker paddw m2, m8 3373*c0909341SAndroid Build Coastguard Worker paddw m6, m11 3374*c0909341SAndroid Build Coastguard Worker packuswb m2, m6 3375*c0909341SAndroid Build Coastguard Worker mova [r4+strideq*0], ym2 3376*c0909341SAndroid Build Coastguard Worker vextracti32x8 [r4+strideq*2], m2, 1 3377*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m12, m13 3378*c0909341SAndroid Build Coastguard Worker punpckhbw m12, m13 3379*c0909341SAndroid Build Coastguard Worker paddw m3, m8 3380*c0909341SAndroid Build Coastguard Worker paddw m7, m12 3381*c0909341SAndroid Build Coastguard Worker packuswb m3, m7 3382*c0909341SAndroid Build Coastguard Worker mova [r4+strideq*1], ym3 3383*c0909341SAndroid Build Coastguard Worker vextracti32x8 [r4+r3 ], m3, 1 3384*c0909341SAndroid Build Coastguard Worker RET 3385*c0909341SAndroid Build Coastguard Worker 3386*c0909341SAndroid Build Coastguard Worker%macro IDCT_16x32_END 3 ; src[1-2], row 3387*c0909341SAndroid Build Coastguard Worker mova xm8, [dstq+strideq*0] 3388*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym8, [dstq+strideq*1], 1 3389*c0909341SAndroid Build Coastguard Worker mova xm9, [dstq+r3 ] 3390*c0909341SAndroid Build Coastguard Worker vinserti32x4 ym9, [dstq+strideq*2], 1 3391*c0909341SAndroid Build Coastguard Worker pmulhrsw m%1, m10 3392*c0909341SAndroid Build Coastguard Worker pmulhrsw m%2, m10 3393*c0909341SAndroid Build Coastguard Worker vpermb m8, m11, m8 
3394*c0909341SAndroid Build Coastguard Worker vpermb m9, m11, m9 3395*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%3*2+0)], m13 3396*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%3*2+1)], m13 3397*c0909341SAndroid Build Coastguard Worker paddw m8, m%1 3398*c0909341SAndroid Build Coastguard Worker paddw m9, m%2 3399*c0909341SAndroid Build Coastguard Worker packuswb m8, m9 3400*c0909341SAndroid Build Coastguard Worker vpermd m8, m12, m8 3401*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], xm8 3402*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*1], ym8, 1 3403*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+strideq*2], m8, 2 3404*c0909341SAndroid Build Coastguard Worker vextracti32x4 [dstq+r3 ], m8, 3 3405*c0909341SAndroid Build Coastguard Worker%if %1 != 20 3406*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 3407*c0909341SAndroid Build Coastguard Worker%endif 3408*c0909341SAndroid Build Coastguard Worker%endmacro 3409*c0909341SAndroid Build Coastguard Worker 3410*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob 3411*c0909341SAndroid Build Coastguard Worker%undef cmp 3412*c0909341SAndroid Build Coastguard Worker lea r5, [o_base] 3413*c0909341SAndroid Build Coastguard Worker test eobd, eobd 3414*c0909341SAndroid Build Coastguard Worker jz .dconly 3415*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [o(pw_2896x8)] 3416*c0909341SAndroid Build Coastguard Worker cmp eobd, 151 3417*c0909341SAndroid Build Coastguard Worker jb .fast 3418*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m15, [cq+64*10] 3419*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m15, [cq+64* 6] 3420*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m15, [cq+64* 2] 3421*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m15, [cq+64*14] 3422*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m15, [cq+64* 4] 
3423*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m15, [cq+64*12] 3424*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m15, [cq+64* 0] 3425*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m15, [cq+64* 8] 3426*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main 3427*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m15, [cq+64* 1] 3428*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m15, [cq+64*15] 3429*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m15, [cq+64* 9] 3430*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m15, [cq+64* 7] 3431*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m15, [cq+64* 5] 3432*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m15, [cq+64*11] 3433*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m15, [cq+64*13] 3434*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, [cq+64* 3] 3435*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 3436*c0909341SAndroid Build Coastguard Worker mova m8, [o(idct_16x32p)] 3437*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_16384)] 3438*c0909341SAndroid Build Coastguard Worker REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ 3439*c0909341SAndroid Build Coastguard Worker m14, m15, m16, m17, m18, m19, m20, m21 3440*c0909341SAndroid Build Coastguard Worker punpckldq m8, m0, m1 3441*c0909341SAndroid Build Coastguard Worker punpckhdq m0, m1 3442*c0909341SAndroid Build Coastguard Worker punpckldq m1, m2, m3 3443*c0909341SAndroid Build Coastguard Worker punpckhdq m2, m3 3444*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m9}, m8, m0, m1, m2 3445*c0909341SAndroid Build Coastguard Worker punpckldq m3, m4, m5 3446*c0909341SAndroid Build Coastguard Worker punpckhdq m4, m5 3447*c0909341SAndroid Build Coastguard Worker punpckldq m5, m6, m7 3448*c0909341SAndroid Build Coastguard Worker punpckhdq m6, m7 3449*c0909341SAndroid Build Coastguard 
Worker REPX {pmulhrsw x, m9}, m3, m4, m5, m6 3450*c0909341SAndroid Build Coastguard Worker punpckldq m7, m14, m15 3451*c0909341SAndroid Build Coastguard Worker punpckhdq m14, m15 3452*c0909341SAndroid Build Coastguard Worker punpckldq m15, m16, m17 3453*c0909341SAndroid Build Coastguard Worker punpckhdq m16, m17 3454*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m9}, m7, m14, m15, m16 3455*c0909341SAndroid Build Coastguard Worker punpckldq m17, m18, m19 3456*c0909341SAndroid Build Coastguard Worker punpckhdq m18, m19 3457*c0909341SAndroid Build Coastguard Worker punpckldq m19, m20, m21 3458*c0909341SAndroid Build Coastguard Worker punpckhdq m20, m21 3459*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m9}, m17, m18, m19, m20 3460*c0909341SAndroid Build Coastguard Worker punpcklqdq m21, m8, m1 3461*c0909341SAndroid Build Coastguard Worker punpckhqdq m8, m1 3462*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m0, m2 3463*c0909341SAndroid Build Coastguard Worker punpckhqdq m0, m2 3464*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m3, m5 3465*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m5 3466*c0909341SAndroid Build Coastguard Worker punpcklqdq m5, m4, m6 3467*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m6 3468*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m7, m15 3469*c0909341SAndroid Build Coastguard Worker punpckhqdq m7, m15 3470*c0909341SAndroid Build Coastguard Worker punpcklqdq m15, m14, m16 3471*c0909341SAndroid Build Coastguard Worker punpckhqdq m14, m16 3472*c0909341SAndroid Build Coastguard Worker punpcklqdq m16, m17, m19 3473*c0909341SAndroid Build Coastguard Worker punpckhqdq m17, m19 3474*c0909341SAndroid Build Coastguard Worker punpcklqdq m19, m18, m20 3475*c0909341SAndroid Build Coastguard Worker punpckhqdq m18, m20 3476*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, m21, ym2, 1 3477*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m2, q3232 
3478*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, m8, ym3, 1 3479*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m3, q3232 3480*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, m1, ym5, 1 3481*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m5, q3232 3482*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, m0, ym4, 1 3483*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m4, q3232 3484*c0909341SAndroid Build Coastguard Worker vinserti32x8 m4, m6, ym16, 1 3485*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m16, q3232 3486*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, m7, ym17, 1 3487*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m17, q3232 3488*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, m15, ym19, 1 3489*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m19, q3232 3490*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, m14, ym18, 1 3491*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m18, q3232 3492*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m21, m6, q3131 ; 27 5 3493*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m6, q2020 ; 31 1 3494*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m8, m7, q2020 ; 24 8 3495*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m7, q3131 ; 30 2 3496*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m1, m15, q2020 ; 28 4 3497*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m15, q3131 ; 6 26 3498*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m0, m14, q2020 ; 7 25 3499*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m14, q3131 ; 14 18 3500*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m20, m4, q2020 ; 3 29 3501*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m4, q3131 ; 23 9 3502*c0909341SAndroid Build Coastguard Worker vshufi32x4 m9, m3, m17, q2020 ; 16 0 3503*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m17, q3131 ; 
12 20 3504*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m5, m19, q2020 ; 15 17 3505*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m19, q3131 ; 22 10 3506*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m2, m16, q2020 ; 19 13 3507*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m2, m16, q3131 ; 11 21 3508*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main3 3509*c0909341SAndroid Build Coastguard Worker call .main_oddhalf 3510*c0909341SAndroid Build Coastguard Worker jmp .pass2 3511*c0909341SAndroid Build Coastguard Worker.fast: ; right half is zero 3512*c0909341SAndroid Build Coastguard Worker mova ym8, [cq+64*15] 3513*c0909341SAndroid Build Coastguard Worker vinserti32x8 m8, [cq+64* 1], 1 3514*c0909341SAndroid Build Coastguard Worker mova m2, [o(int16_perm)] 3515*c0909341SAndroid Build Coastguard Worker mova ym9, [cq+64* 8] 3516*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, [cq+64* 0], 1 3517*c0909341SAndroid Build Coastguard Worker mova ym0, [cq+64* 7] 3518*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [cq+64* 9], 1 3519*c0909341SAndroid Build Coastguard Worker mova ym7, [cq+64*14] 3520*c0909341SAndroid Build Coastguard Worker vinserti32x8 m7, [cq+64* 2], 1 3521*c0909341SAndroid Build Coastguard Worker mova ym1, [cq+64* 3] 3522*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [cq+64*13], 1 3523*c0909341SAndroid Build Coastguard Worker mova ym3, [cq+64* 6] 3524*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [cq+64*10], 1 3525*c0909341SAndroid Build Coastguard Worker mova ym5, [cq+64*11] 3526*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [cq+64* 5], 1 3527*c0909341SAndroid Build Coastguard Worker mova ym6, [cq+64*12] 3528*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, [cq+64* 4], 1 3529*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 3530*c0909341SAndroid Build Coastguard Worker REPX 
{vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 3531*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main2 3532*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [o(int_shuf3)] 3533*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m9, [o(int_shuf4)] 3534*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_16384)] 3535*c0909341SAndroid Build Coastguard Worker pshufb m0, m8 3536*c0909341SAndroid Build Coastguard Worker pshufb m1, m9 3537*c0909341SAndroid Build Coastguard Worker pshufb m2, m8 3538*c0909341SAndroid Build Coastguard Worker pshufb m3, m9 3539*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m11}, m0, m1, m2, m3 3540*c0909341SAndroid Build Coastguard Worker pshufb m4, m8 3541*c0909341SAndroid Build Coastguard Worker pshufb m5, m9 3542*c0909341SAndroid Build Coastguard Worker pshufb m6, m8 3543*c0909341SAndroid Build Coastguard Worker pshufb m7, m9 3544*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m11}, m4, m5, m6, m7 3545*c0909341SAndroid Build Coastguard Worker punpckhdq m17, m0, m1 3546*c0909341SAndroid Build Coastguard Worker punpckldq m0, m1 3547*c0909341SAndroid Build Coastguard Worker punpckhdq m16, m2, m3 3548*c0909341SAndroid Build Coastguard Worker punpckldq m2, m3 3549*c0909341SAndroid Build Coastguard Worker punpckhdq m18, m4, m5 3550*c0909341SAndroid Build Coastguard Worker punpckldq m4, m5 3551*c0909341SAndroid Build Coastguard Worker punpckhdq m5, m6, m7 3552*c0909341SAndroid Build Coastguard Worker punpckldq m6, m7 3553*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m0, ym2, 1 3554*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m0, m2, q3232 3555*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, m4, ym6, 1 3556*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m6, q3232 3557*c0909341SAndroid Build Coastguard Worker vinserti32x8 m15, m17, ym16, 1 3558*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m16, q3232 
3559*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, m18, ym5, 1 3560*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m5, q3232 3561*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m1, m2, q2020 ; 0 2 3562*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m2, q3131 ; 4 6 3563*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m3, m4, q2020 ; 8 10 3564*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m4, q3131 ; 12 14 3565*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m15, m16, q2020 ; 1 3 3566*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m16, q3131 ; 5 7 3567*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m17, m18, q2020 ; 9 11 3568*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m18, q3131 ; 13 15 3569*c0909341SAndroid Build Coastguard Worker pxor m6, m6 3570*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m0, m0 3571*c0909341SAndroid Build Coastguard Worker punpcklwd m9, m6, m0 3572*c0909341SAndroid Build Coastguard Worker punpckhwd m0, m3, m3 3573*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m2, m2 3574*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m1, m1 3575*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m1 3576*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 3577*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m2 3578*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main_fast5 3579*c0909341SAndroid Build Coastguard Worker punpcklwd m21, m14, m14 3580*c0909341SAndroid Build Coastguard Worker punpckhwd m14, m14 3581*c0909341SAndroid Build Coastguard Worker punpcklwd m18, m15, m15 3582*c0909341SAndroid Build Coastguard Worker punpckhwd m15, m15 3583*c0909341SAndroid Build Coastguard Worker punpcklwd m20, m16, m16 3584*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m16 3585*c0909341SAndroid Build Coastguard Worker punpcklwd m19, m17, m17 3586*c0909341SAndroid Build Coastguard Worker punpckhwd 
m17, m17 3587*c0909341SAndroid Build Coastguard Worker call .main_oddhalf_fast 3588*c0909341SAndroid Build Coastguard Worker.pass2: 3589*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [o(pw_2048)] 3590*c0909341SAndroid Build Coastguard Worker mova m11, [o(end_16x32p)] 3591*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 3592*c0909341SAndroid Build Coastguard Worker pxor m13, m13 3593*c0909341SAndroid Build Coastguard Worker psrld m12, m11, 8 3594*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 0, 1, 0 3595*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 2, 3, 1 3596*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 4, 5, 2 3597*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 6, 7, 3 3598*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 14, 15, 4 3599*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 16, 17, 5 3600*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 18, 19, 6 3601*c0909341SAndroid Build Coastguard Worker IDCT_16x32_END 20, 21, 7 3602*c0909341SAndroid Build Coastguard Worker RET 3603*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3604*c0909341SAndroid Build Coastguard Worker.dconly: 3605*c0909341SAndroid Build Coastguard Worker movsx r6d, word [cq] 3606*c0909341SAndroid Build Coastguard Worker mov [cq], eobd 3607*c0909341SAndroid Build Coastguard Worker or r3d, 32 3608*c0909341SAndroid Build Coastguard Worker jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly 3609*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3610*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero 3611*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_201_4091x8)] 3612*c0909341SAndroid Build Coastguard Worker vpbroadcastd m20, [o(pw_m1380_3857x8)] 3613*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_995_3973x8)] 3614*c0909341SAndroid Build Coastguard Worker vpbroadcastd m16, [o(pw_m601_4052x8)] 
3615*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m8 ; t16a, t31a 3616*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m15 ; t19a, t28a 3617*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m9 ; t20a, t27a 3618*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m16 ; t23a, t24a 3619*c0909341SAndroid Build Coastguard Worker mova m8, m21 3620*c0909341SAndroid Build Coastguard Worker mova m17, m20 3621*c0909341SAndroid Build Coastguard Worker mova m15, m18 3622*c0909341SAndroid Build Coastguard Worker mova m16, m14 3623*c0909341SAndroid Build Coastguard Worker jmp .main3 3624*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3625*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; bottom half is zero 3626*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_201_4091x8)] 3627*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_m2751_3035x8)] 3628*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_1751_3703x8)] 3629*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m1380_3857x8)] 3630*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m8 ; t16a, t31a 3631*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_995_3973x8)] 3632*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m9 ; t17a, t30a 3633*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_m2106_3513x8)] 3634*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m11 ; t18a, t29a 3635*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2440_3290x8)] 3636*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m12 ; t19a, t28a 3637*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m601_4052x8)] 3638*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m8 ; t20a, t27a 3639*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m9 ; t21a, t26a 3640*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m11 ; t22a, t25a 3641*c0909341SAndroid Build 
Coastguard Worker pmulhrsw m14, m12 ; t23a, t24a 3642*c0909341SAndroid Build Coastguard Worker jmp .main2 3643*c0909341SAndroid Build Coastguard WorkerALIGN function_align 3644*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf 3645*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a 3646*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a 3647*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a 3648*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a 3649*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a 3650*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a 3651*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a 3652*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a 3653*c0909341SAndroid Build Coastguard Worker.main2: 3654*c0909341SAndroid Build Coastguard Worker psubsw m8, m21, m17 ; t17 t30 3655*c0909341SAndroid Build Coastguard Worker paddsw m21, m17 ; t16 t31 3656*c0909341SAndroid Build Coastguard Worker psubsw m17, m15, m20 ; t18 t29 3657*c0909341SAndroid Build Coastguard Worker paddsw m20, m15 ; t19 t28 3658*c0909341SAndroid Build Coastguard Worker psubsw m15, m18, m16 ; t21 t26 3659*c0909341SAndroid Build Coastguard Worker paddsw m18, m16 ; t20 t27 3660*c0909341SAndroid Build Coastguard Worker psubsw m16, m14, m19 ; t22 t25 3661*c0909341SAndroid Build Coastguard Worker paddsw m14, m19 ; t23 t24 3662*c0909341SAndroid Build Coastguard Worker.main3: 3663*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a 3664*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a 3665*c0909341SAndroid Build Coastguard 
Worker ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a 3666*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a 3667*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_m3784_1567)] 3668*c0909341SAndroid Build Coastguard Worker psubsw m19, m21, m20 ; t19a t28a 3669*c0909341SAndroid Build Coastguard Worker paddsw m21, m20 ; t16a t31a 3670*c0909341SAndroid Build Coastguard Worker psubsw m20, m14, m18 ; t20a t27a 3671*c0909341SAndroid Build Coastguard Worker paddsw m14, m18 ; t23a t24a 3672*c0909341SAndroid Build Coastguard Worker psubsw m18, m8, m17 ; t18 t29 3673*c0909341SAndroid Build Coastguard Worker paddsw m8, m17 ; t17 t30 3674*c0909341SAndroid Build Coastguard Worker psubsw m17, m16, m15 ; t21 t26 3675*c0909341SAndroid Build Coastguard Worker paddsw m15, m16 ; t22 t25 3676*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a 3677*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 3678*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27 3679*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a 3680*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m9, [o(deint_shuf)] 3681*c0909341SAndroid Build Coastguard Worker psubsw m16, m21, m14 ; t23 t24 3682*c0909341SAndroid Build Coastguard Worker paddsw m14, m21 ; t16 t31 3683*c0909341SAndroid Build Coastguard Worker psubsw m21, m8, m15 ; t22a t25a 3684*c0909341SAndroid Build Coastguard Worker paddsw m15, m8 ; t17a t30a 3685*c0909341SAndroid Build Coastguard Worker psubsw m8, m18, m17 ; t21 t26 3686*c0909341SAndroid Build Coastguard Worker paddsw m18, m17 ; t18 t29 3687*c0909341SAndroid Build Coastguard Worker paddsw m17, m19, m20 ; t19a t28a 3688*c0909341SAndroid Build Coastguard Worker psubsw m19, m20 ; t20a t27a 3689*c0909341SAndroid Build Coastguard 
; Rotate the (m16, m21) word pairs with the pw_m2896_2896 / pw_2896_2896
; constants: every vpdpwssd accumulates onto a copy of m10, and the dword
; sums are shifted right by 12 and re-packed to words.
    vpbroadcastd     m11, [o(pw_m2896_2896)]
    vpbroadcastd     m12, [o(pw_2896_2896)]
    REPX             {pshufb x, m9}, m14, m15, m18, m17
    mova             m9, m10
    vpdpwssd         m9, m16, m11
    mova             m20, m10
    vpdpwssd         m20, m21, m11
    psrad            m9, 12
    psrad            m20, 12
    packssdw         m9, m20 ; t23a t22
    mova             m20, m10
    vpdpwssd         m20, m16, m12
    mova             m16, m10
    vpdpwssd         m16, m21, m12
    psrad            m20, 12
    psrad            m16, 12
    packssdw         m16, m20, m16 ; t24a t25
    ITX_MUL2X_PACK   8, 21, 20, 10, 11, 12, 8 ; t21a t26a
    ITX_MUL2X_PACK   19, 8, 11, 10, 11, 12, 8 ; t20 t27
    packssdw         m11, m20 ; t27 t26a
    packssdw         m8, m21 ; t20 t21a
    punpcklqdq       m20, m14, m15 ; t16 t17a
    punpckhqdq       m14, m15 ; t31 t30a
    punpckhqdq       m15, m17, m18 ; t28a t29
    punpcklqdq       m17, m18 ; t19a t18
    ; Final butterflies: saturating sums land in m0-m7, saturating
    ; differences in m14-m21.
    psubsw           m21, m0, m14 ; out31 out30
    paddsw           m0, m14 ; out0 out1
    psubsw           m14, m7, m20 ; out16 out17
    paddsw           m7, m20 ; out15 out14
    psubsw           m20, m1, m15 ; out28 out29
    paddsw           m1, m15 ; out3 out2
    psubsw           m15, m6, m17 ; out19 out18
    paddsw           m6, m17 ; out12 out13
    psubsw           m17, m4, m9 ; out23 out22
    paddsw           m4, m9 ; out8 out9
    psubsw           m18, m3, m16 ; out24 out25
    paddsw           m3, m16 ; out7 out6
    psubsw           m16, m5, m8 ; out20 out21
    paddsw           m5, m8 ; out11 out10
    psubsw           m19, m2, m11 ; out27 out26
    paddsw           m2, m11 ; out4 out5
    ret

;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x16_8bpc(dst, stride, c, eob)
;
; Inverse transform (DCT in both passes, going by the function name) whose
; result is added to the 8-bit pixels at dst. The write-back at the end of
; the main path touches 16 rows of 32 bytes each.
;
;   eob == 0  -> .dconly : only the first coefficient word is read
;   eob < 151 -> .fast   : coefficient rows cq+64*8..15 are not read
;   otherwise            : all sixteen 64-byte coefficient rows are read
;
; Every coefficient row that is loaded is overwritten with zeros.
; r2-r5 (which include the c/eob argument registers and the o_base
; pointer) are recycled as row pointers once they are no longer needed.
;-----------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
%undef cmp
    lea              r5, [o_base]
    test             eobd, eobd
    jz               .dconly
    ; Pass 1 input: rows 0-7, qword-shuffled through permB, then scaled
    ; by pmulhrsw with pw_2896x8.
    mova             m21, [o(permB)]
    vpermq           m1, m21, [cq+64* 0] ; 0 1
    vpermq           m14, m21, [cq+64* 1] ; 2 3
    vpermq           m20, m21, [cq+64* 2] ; 4 5
    vpermq           m15, m21, [cq+64* 3] ; 6 7
    vpbroadcastd     m8, [o(pw_2896x8)]
    vpermq           m2, m21, [cq+64* 4] ; 8 9
    vpermq           m16, m21, [cq+64* 5] ; 10 11
    vpermq           m3, m21, [cq+64* 6] ; 12 13
    vpermq           m17, m21, [cq+64* 7] ; 14 15
    REPX             {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
    pxor             m12, m12
    REPX             {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear what was read
    cmp              eobd, 151
    jb               .fast
    ; Full path: rows 8-15 as well (m21 is both the permutation and the
    ; destination of the last load, so it must come last).
    vpermq           m9, m21, [cq+64* 8] ; 16 17
    vpermq           m19, m21, [cq+64* 9] ; 18 19
    vpermq           m4, m21, [cq+64*10] ; 20 21
    vpermq           m5, m21, [cq+64*11] ; 22 23
    vpermq           m6, m21, [cq+64*12] ; 24 25
    vpermq           m18, m21, [cq+64*13] ; 26 27
    vpermq           m7, m21, [cq+64*14] ; 28 29
    vpermq           m21, m21, [cq+64*15] ; 30 31
    REPX             {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
    REPX             {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
    ; Interleave the inputs into the word pairs the butterfly helpers take.
    punpcklwd        m8, m21, m14 ; 30 2
    punpckhwd        m21, m1 ; 31 1
    punpcklwd        m0, m17, m19 ; 14 18
    punpckhwd        m17, m9 ; 15 17
    punpcklwd        m9, m1 ; 16 0
    punpckhwd        m14, m7 ; 3 29
    punpcklwd        m1, m15, m18 ; 6 26
    punpckhwd        m15, m6 ; 7 25
    punpcklwd        m6, m2 ; 24 8
    punpckhwd        m19, m3 ; 19 13
    punpcklwd        m3, m4 ; 12 20
    punpckhwd        m18, m20 ; 27 5
    punpcklwd        m7, m20 ; 28 4
    punpckhwd        m20, m5, m2 ; 23 9
    punpcklwd        m5, m16 ; 22 10
    punpckhwd        m16, m4 ; 11 21
    call             m(idct_16x16_internal_8bpc).main2
    call             m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    jmp              .pass2
.fast: ; bottom half zero
    ; Only rows 0-7 are present, so inputs are paired with themselves or
    ; with the zero register m12 instead of with rows 8-15.
    punpcklwd        m8, m14, m14 ; 2
    punpcklwd        m0, m17, m17 ; 14
    punpcklwd        m5, m16, m16 ; 10
    punpcklwd        m9, m12, m1 ; __ 0
    punpckhwd        m21, m1, m1 ; 1
    punpcklwd        m1, m15, m15 ; 6
    punpcklwd        m7, m20, m20 ; 4
    punpckhwd        m19, m3, m3 ; 13
    punpcklwd        m3, m3 ; 12
    punpcklwd        m6, m12, m2 ; __ 8
    punpckhwd        m18, m20, m20 ; 5
    punpckhwd        m20, m2, m2 ; 9
    call             m(idct_16x16_internal_8bpc).main_fast
    punpckhwd        m15, m15 ; 7
    punpckhwd        m14, m14 ; 3
    punpckhwd        m16, m16 ; 11
    punpckhwd        m17, m17 ; 15
    call             m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
.pass2:
    vpbroadcastd     m9, [o(pw_16384)] ; multiplier consumed by .transpose_round
    call             .transpose_round
    ; Regroup 128-bit lanes (q2020 = even lanes, q3131 = odd lanes of the
    ; two sources) into the register order pass 2 expects.
    vshufi32x4       m16, m14, m2, q3131 ; 5
    vshufi32x4       m14, m2, q2020 ; 1
    vshufi32x4       m2, m0, m3, q3131 ; 4
    vshufi32x4       m0, m3, q2020 ; 0
    vshufi32x4       m3, m1, m18, q3131 ; 6
    vshufi32x4       m1, m18, q2020 ; 2
    vshufi32x4       m18, m20, m6, q2020 ; 9
    vshufi32x4       m20, m6, q3131 ; 13
    vshufi32x4       m6, m21, m4, q3131 ; 12
    vshufi32x4       m4, m21, m4, q2020 ; 8
    vshufi32x4       m21, m19, m7, q3131 ; 15
    vshufi32x4       m19, m7, q2020 ; 11
    vshufi32x4       m7, m5, m15, q3131 ; 14
    vshufi32x4       m5, m15, q2020 ; 10
    vshufi32x4       m15, m17, m9, q2020 ; 3
    vshufi32x4       m17, m9, q3131 ; 7
    call             m(inv_txfm_add_dct_dct_32x8_8bpc).main2
    call             .main_oddhalf
    ; Write-back, four rows at a time: scale the residual by pmulhrsw with
    ; pw_2048, add the zero-extended destination pixels, pack back to bytes
    ; with unsigned saturation. Loads for the next row group are interleaved
    ; with the arithmetic of the current one.
    vpbroadcastd     m12, [o(pw_2048)]
    movshdup         m13, [o(permD)]
    lea              r2, [strideq*3]
    pmovzxbw         m8, [dstq+strideq*0]
    pmovzxbw         m9, [dstq+strideq*1]
    pmovzxbw         m10, [dstq+strideq*2]
    pmovzxbw         m11, [dstq+r2 ]
    REPX             {pmulhrsw x, m12}, m0, m1, m2, m3
    lea              r3, [dstq+strideq*4] ; rows 4-7
    paddw            m0, m8
    paddw            m1, m9
    paddw            m2, m10
    paddw            m3, m11
    pmovzxbw         m8, [r3+strideq*0]
    pmovzxbw         m9, [r3+strideq*1]
    pmovzxbw         m10, [r3+strideq*2]
    pmovzxbw         m11, [r3+r2 ]
    REPX             {pmulhrsw x, m12}, m4, m5, m6, m7
    lea              r4, [dstq+strideq*8] ; rows 8-11
    packuswb         m0, m1
    paddw            m4, m8
    paddw            m5, m9
    packuswb         m2, m3
    paddw            m6, m10
    paddw            m7, m11
    pmovzxbw         m8, [r4+strideq*0]
    pmovzxbw         m9, [r4+strideq*1]
    pmovzxbw         m10, [r4+strideq*2]
    pmovzxbw         m11, [r4+r2 ]
    REPX             {pmulhrsw x, m12}, m14, m15, m16, m17
    lea              r5, [r3+strideq*8] ; rows 12-15; o_base is no longer needed
    packuswb         m4, m5
    paddw            m14, m8
    paddw            m15, m9
    packuswb         m6, m7
    paddw            m16, m10
    paddw            m17, m11
    pmovzxbw         m8, [r5+strideq*0]
    pmovzxbw         m9, [r5+strideq*1]
    pmovzxbw         m10, [r5+strideq*2]
    pmovzxbw         m11, [r5+r2 ]
    REPX             {pmulhrsw x, m12}, m18, m19, m20, m21
    packuswb         m14, m15
    paddw            m18, m8
    paddw            m19, m9
    packuswb         m16, m17
    paddw            m20, m10
    paddw            m21, m11
    packuswb         m18, m19
    packuswb         m20, m21
    ; packuswb works per 128-bit lane, so each packed register holds two
    ; rows interleaved by lane; the qword permute with m13 is applied
    ; before the low/high 256-bit halves are stored as separate rows.
    REPX             {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
    mova             [dstq+strideq*0], ym0
    vextracti32x8    [dstq+strideq*1], m0, 1
    mova             [dstq+strideq*2], ym2
    vextracti32x8    [dstq+r2 ], m2, 1
    mova             [r3+strideq*0], ym4
    vextracti32x8    [r3+strideq*1], m4, 1
    mova             [r3+strideq*2], ym6
    vextracti32x8    [r3+r2 ], m6, 1
    mova             [r4+strideq*0], ym14
    vextracti32x8    [r4+strideq*1], m14, 1
    mova             [r4+strideq*2], ym16
    vextracti32x8    [r4+r2 ], m16, 1
    mova             [r5+strideq*0], ym18
    vextracti32x8    [r5+strideq*1], m18, 1
    mova             [r5+strideq*2], ym20
    vextracti32x8    [r5+r2 ], m20, 1
    RET
ALIGN function_align
.dconly:
    ; DC-only path (reached with eob == 0).
    movsx            r6d, word [cq]
    mov              [cq], eobd ; eobd is zero here, so this clears the coefficient
    or               r3d, 16 ; r3 is the eob register (zero), so r3d becomes 16;
                             ; presumably the row count .dconly3 expects - confirm there
    imul             r6d, 181
    add              r6d, 128
    sar              r6d, 8 ; r6d = (dc*181 + 128) >> 8
    imul             r6d, 181
    add              r6d, 128+256
    sar              r6d, 8+1 ; second *181 scale with an extra rounded >>1 folded in
    jmp              m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
; The .main_oddhalf* entry points below are declared with cglobal_label
; (presumably so that other transform sizes can call them). On return
; out0-out7 are in m0-m7 and out8-out15 in m14-m21. m10 is read as a
; rounding term throughout; only .main_oddhalf_fast loads it itself.
ALIGN function_align
cglobal_label .main_oddhalf_fast3 ; bottom seven-eighths are zero
    vpbroadcastd     m8, [o(pw_2896x8)]
    vpbroadcastd     m4, [o(pw_4076x8)]
    vpbroadcastd     m3, [o(pw_401x8)]
    pmulhrsw         m8, m0 ; t0
    pmulhrsw         m4, m14 ; t15a
    pmulhrsw         m3, m14 ; t8a
    punpcklwd        m9, m3, m4
    punpckhwd        m5, m3, m4
    mova             m2, m10
    vpdpwssd         m2, m9, [o(pw_m3784_1567)] {bcstd}
    mova             m1, m10
    vpdpwssd         m1, m5, [o(pw_m3784_1567)] {bcstd}
    mova             m6, m10
    vpdpwssd         m6, m5, [o(pw_1567_3784)] {bcstd}
    mova             m5, m10
    vpdpwssd         m5, m9, [o(pw_1567_3784)] {bcstd}
    vpbroadcastd     m11, [o(pw_2896_2896)]
    vpbroadcastd     m12, [o(pw_m2896_2896)]
    psubsw           m21, m8, m4 ; out15
    paddsw           m0, m8, m4 ; out0
    psubsw           m14, m8, m3 ; out8
    paddsw           m7, m8, m3 ; out7
    REPX             {psrad x, 12}, m2, m1, m6, m5
    packssdw         m2, m1 ; t9a
    packssdw         m5, m6 ; t14a
    ITX_MULSUB_2W    4, 3, 16, 17, 10, 11, 12 ; t11, t12
    psubsw           m20, m8, m5 ; out14
    paddsw           m1, m8, m5 ; out1
    psubsw           m15, m8, m2 ; out9
    paddsw           m6, m8, m2 ; out6
    ITX_MULSUB_2W    5, 2, 16, 17, 10, 11, 12 ; t10a, t13a
    psubsw           m18, m8, m3 ; out12
    paddsw           m3, m8 ; out3
    psubsw           m17, m8, m4 ; out11
    paddsw           m4, m8 ; out4
    psubsw           m19, m8, m2 ; out13
    paddsw           m2, m8 ; out2
    psubsw           m16, m8, m5 ; out10
    paddsw           m5, m8 ; out5
    ret
cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    vpbroadcastd     m9, [o(pw_2896x8)]
    vpbroadcastd     m2, [o(pw_4017x8)]
    vpbroadcastd     m3, [o(pw_799x8)]
    vpbroadcastd     m18, [o(pw_4076x8)]
    vpbroadcastd     m19, [o(pw_401x8)]
    vpbroadcastd     m20, [o(pw_m1189x8)]
    vpbroadcastd     m16, [o(pw_3920x8)]
    pmulhrsw         m9, m0 ; t0
    pmulhrsw         m2, m1 ; t7a
    pmulhrsw         m1, m3 ; t4a
    pmulhrsw         m18, m14 ; t15a
    pmulhrsw         m14, m19 ; t8a
    pmulhrsw         m20, m15 ; t11a
    pmulhrsw         m15, m16 ; t12a
    psubsw           m7, m9, m2 ; idct8 out7
    paddsw           m0, m9, m2 ; idct8 out0
    psubsw           m4, m9, m1 ; idct8 out4
    paddsw           m3, m9, m1 ; idct8 out3
    ITX_MULSUB_2W    2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
    ; Duplicate the single-input terms into the registers .main3 reads.
    mova             m21, m18
    mova             m19, m14
    mova             m16, m15
    mova             m8, m20
    psubsw           m6, m9, m1 ; idct8 out6
    paddsw           m1, m9 ; idct8 out1
    psubsw           m5, m9, m2 ; idct8 out5
    paddsw           m2, m9 ; idct8 out2
    jmp              .main3
ALIGN function_align
cglobal_label .main_oddhalf_fast ; bottom half is zero
    vpbroadcastd     m5, [o(pw_m2276x8)]
    vpbroadcastd     m11, [o(pw_3406x8)]
    vpbroadcastd     m7, [o(pw_4017x8)]
    vpbroadcastd     m12, [o(pw_799x8)]
    vpbroadcastd     m6, [o(pw_3784x8)]
    vpbroadcastd     m10, [o(pw_1567x8)]
    vpbroadcastd     m4, [o(pw_2896x8)]
    pmulhrsw         m5, m3 ; t5a
    pmulhrsw         m3, m11 ; t6a
    pmulhrsw         m7, m1 ; t7a
    pmulhrsw         m1, m12 ; t4a
    pmulhrsw         m6, m2 ; t3
    pmulhrsw         m2, m10 ; t2
    pmulhrsw         m4, m0 ; t0
    vpbroadcastd     m11, [o(pw_2896_2896)]
    vpbroadcastd     m12, [o(pw_m2896_2896)]
    vpbroadcastd     m10, [o(pd_2048)] ; m10 was scratch above; reload it before the call
    mova             m0, m4 ; t1
    call             m(inv_txfm_add_dct_dct_32x8_8bpc).main3
    vpbroadcastd     m21, [o(pw_4076x8)]
    vpbroadcastd     m8, [o(pw_401x8)]
    vpbroadcastd     m18, [o(pw_m2598x8)]
    vpbroadcastd     m9, [o(pw_3166x8)]
    vpbroadcastd     m19, [o(pw_3612x8)]
    vpbroadcastd     m11, [o(pw_1931x8)]
    vpbroadcastd     m20, [o(pw_m1189x8)]
    vpbroadcastd     m12, [o(pw_3920x8)]
    pmulhrsw         m21, m14 ; t15a
    pmulhrsw         m14, m8 ; t8a
    pmulhrsw         m18, m17 ; t9a
    pmulhrsw         m17, m9 ; t14a
    pmulhrsw         m19, m16 ; t13a
    pmulhrsw         m16, m11 ; t10a
    pmulhrsw         m20, m15 ; t11a
    pmulhrsw         m15, m12 ; t12a
    jmp              .main2
ALIGN function_align
cglobal_label .main_oddhalf
    ITX_MULSUB_2W    14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a
    ITX_MULSUB_2W    18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a
    ITX_MULSUB_2W    16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
    ITX_MULSUB_2W    20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
.main2:
    paddsw           m8, m20, m16 ; t11
    psubsw           m20, m16 ; t10
    paddsw           m16, m15, m19 ; t12
    psubsw           m15, m19 ; t13
    psubsw           m19, m14, m18 ; t9
    paddsw           m14, m18 ; t8
    psubsw           m18, m21, m17 ; t14
    paddsw           m21, m17 ; t15
.main3:
    vpbroadcastd     m11, [o(pw_1567_3784)]
    vpbroadcastd     m12, [o(pw_m3784_1567)]
    ITX_MULSUB_2W    18, 19, 9, 17, 10, 11, 12 ; t9a, t14a
    vpbroadcastd     m11, [o(pw_m1567_m3784)]
    ITX_MULSUB_2W    15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
    vpbroadcastd     m11, [o(pw_2896_2896)]
    vpbroadcastd     m12, [o(pw_m2896_2896)]
    psubsw           m17, m14, m8 ; t11a
    paddsw           m8, m14 ; t8a
    paddsw           m14, m18, m15 ; t9
    psubsw           m18, m15 ; t10
    psubsw           m15, m19, m20 ; t13
    paddsw           m19, m20 ; t14
    paddsw           m20, m21, m16 ; t15a
    psubsw           m16, m21, m16 ; t12a
    ITX_MULSUB_2W    15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
    ITX_MULSUB_2W    16, 17, 9, 21, 10, 11, 12 ; t11, t12
    ; Combine with the even half in m0-m7.
    psubsw           m21, m0, m20 ; out15
    paddsw           m0, m20 ; out0
    psubsw           m20, m1, m19 ; out14
    paddsw           m1, m19 ; out1
    psubsw           m19, m2, m18 ; out13
    paddsw           m2, m18 ; out2
    psubsw           m18, m3, m17 ; out12
    paddsw           m3, m17 ; out3
    psubsw           m17, m4, m16 ; out11
    paddsw           m4, m16 ; out4
    psubsw           m16, m5, m15 ; out10
    paddsw           m5, m15 ; out5
    psubsw           m15, m6, m14 ; out9
    paddsw           m6, m14 ; out6
    psubsw           m14, m7, m8 ; out8
    paddsw           m7, m8 ; out7
    ret
; Transposes the pass-1 result held in m0-m7 and m14-m21 and multiplies
; every word by m9 (pmulhrsw) along the way. m9 is an input only: it is
; overwritten with output data near the end, as is the scratch register m8.
.transpose_round:
    punpcklwd        m8, m0, m2
    punpckhwd        m0, m2
    punpcklwd        m2, m1, m3
    punpckhwd        m1, m3
    punpcklwd        m3, m4, m6
    punpckhwd        m4, m6
    punpcklwd        m6, m5, m7
    punpckhwd        m5, m7
    punpcklwd        m7, m14, m16
    punpckhwd        m14, m16
    punpcklwd        m16, m15, m17
    punpckhwd        m15, m17
    punpcklwd        m17, m19, m21
    punpckhwd        m19, m21
    punpckhwd        m21, m18, m20
    punpcklwd        m18, m20
    punpcklwd        m20, m8, m1
    punpckhwd        m8, m1
    punpcklwd        m1, m0, m2
    punpckhwd        m0, m2
    punpcklwd        m2, m3, m5
    punpckhwd        m3, m5
    punpcklwd        m5, m4, m6
    punpckhwd        m4, m6
    REPX             {pmulhrsw x, m9}, m20, m8, m1, m0
    punpcklwd        m6, m7, m15
    punpckhwd        m7, m15
    punpcklwd        m15, m14, m16
    punpckhwd        m14, m16
    REPX             {pmulhrsw x, m9}, m2, m3, m5, m4
    punpckhwd        m16, m18, m19
    punpcklwd        m18, m19
    punpcklwd        m19, m21, m17
    punpckhwd        m21, m17
    REPX             {pmulhrsw x, m9}, m6, m7, m15, m14
    punpcklwd        m17, m8, m0 ; a2 a6 aa ae
    punpckhwd        m8, m0 ; a3 a7 ab af
    punpcklwd        m0, m20, m1 ; a0 a4 a8 ac
    punpckhwd        m20, m1 ; a1 a5 a9 ad
    REPX             {pmulhrsw x, m9}, m16, m18, m19, m21
    punpcklwd        m1, m2, m5 ; b0 b4 b8 bc
    punpckhwd        m2, m5 ; b1 b5 b9 bd
    punpcklwd        m5, m3, m4 ; b2 b6 ba be
    punpckhwd        m3, m4 ; b3 b7 bb bf
    punpcklwd        m4, m6, m15 ; c0 c4 c8 cc
    punpckhwd        m6, m15 ; c1 c5 c9 cd
    punpcklwd        m15, m7, m14 ; c2 c6 ca ce
    punpckhwd        m7, m14 ; c3 c7 cb cf
    punpcklwd        m14, m18, m19 ; d0 d4 d8 dc
    punpckhwd        m18, m19 ; d1 d5 d9 dd
    punpcklwd        m9, m16, m21 ; d2 d6 da de
    punpckhwd        m16, m21 ; d3 d7 db df
    vshufi32x4       m21, m0, m1, q3232 ; a8 ac b8 bc
    vinserti32x8     m0, ym1, 1 ; a0 a4 b0 b4
    vinserti32x8     m1, m17, ym5, 1 ; a2 a6 b2 b6
    vshufi32x4       m5, m17, m5, q3232 ; aa ae ba be
    vinserti32x8     m17, m8, ym3, 1 ; a3 a7 b3 b7
    vshufi32x4       m19, m8, m3, q3232 ; ab af bb bf
    vinserti32x8     m3, m4, ym14, 1 ; c0 c4 d0 d4
    vshufi32x4       m4, m14, q3232 ; c8 cc d8 dc
    vinserti32x8     m14, m20, ym2, 1 ; a1 a5 b1 b5
    vshufi32x4       m20, m2, q3232 ; a9 ad b9 bd
    vinserti32x8     m2, m6, ym18, 1 ; c1 c5 d1 d5
    vshufi32x4       m6, m18, q3232 ; c9 cd d9 dd
    vinserti32x8     m18, m15, ym9, 1 ; c2 c6 d2 d6
    vshufi32x4       m15, m9, q3232 ; ca ce da de
    vinserti32x8     m9, m7, ym16, 1 ; c3 c7 d3 d7
    vshufi32x4       m7, m16, q3232 ; cb cf db df
    ret

; IDTX_16x32 a, b, c, d
; For each of the four row indices n: load the 64-byte coefficient row
; cq+64*n, multiply it by m15, then add (with signed saturation) that
; value multiplied by m16 and then by m17. All multiplies are pmulhrsw.
; Results stay in m<n>. Expects m15/m16/m17 to be set by the caller;
; clobbers m18-m21.
%macro IDTX_16x32 4 ; src/dst[1-4]
    pmulhrsw         m%1, m15, [cq+64*%1]
    pmulhrsw         m%2, m15, [cq+64*%2]
    pmulhrsw         m%3, m15, [cq+64*%3]
    pmulhrsw         m%4, m15, [cq+64*%4]
    pmulhrsw         m18, m16, m%1
    pmulhrsw         m19, m16, m%2
    pmulhrsw         m20, m16, m%3
    pmulhrsw         m21, m16, m%4
    REPX             {pmulhrsw x, m17}, m18, m19, m20, m21
    paddsw           m%1, m18
    paddsw           m%2, m19
    paddsw           m%3, m20
    paddsw           m%4, m21
%endmacro

; IDTX_16x32_STORE lo, hi
; Gathers four 16-byte destination rows (dstq + r3*0, r3*4, r3*8, r4*8)
; into one register, adds m<lo> to the low-unpacked and m<hi> to the
; high-unpacked pixel words, packs with unsigned saturation and scatters
; the rows back. Also stores m18 over coefficient rows 2*lo and 2*lo+1.
; Expects m18 == 0 (it is both the zero-extension source and the value
; written to cq). Clobbers m16 and m17. Advances dstq by one row except
; on the last invocation (lo == 7).
%macro IDTX_16x32_STORE 2 ; src[1-2]
    mova             xm17, [dstq+r3*0]
    vinserti128      ym17, [dstq+r3*4], 1
    vinserti32x4     m17, [dstq+r3*8], 2
    vinserti32x4     m17, [dstq+r4*8], 3
    mova             [cq+64*(%1*2+0)], m18
    mova             [cq+64*(%1*2+1)], m18
    punpcklbw        m16, m17, m18
    punpckhbw        m17, m18
    paddw            m16, m%1
    paddw            m17, m%2
    packuswb         m16, m17
    mova             [dstq+r3*0], xm16
    vextracti128     [dstq+r3*4], ym16, 1
    vextracti32x4    [dstq+r3*8], m16, 2
    vextracti32x4    [dstq+r4*8], m16, 3
%if %1 != 7
    add              dstq, strideq
%endif
%endmacro

; inv_txfm_add_identity_identity_16x32_8bpc(dst, stride, c)
; Scales all sixteen 64-byte coefficient rows with IDTX_16x32, transposes
; them with rounding, then adds the result to the destination and clears
; the coefficient buffer via IDTX_16x32_STORE. With r3 = stride*2 and
; r4 = stride*3, each store touches rows n, n+8, n+16 and n+24.
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
    vpbroadcastd     m15, [pw_2896x8]
    vpbroadcastd     m16, [pw_1697x16]
    vpbroadcastd     m17, [pw_16384]
    IDTX_16x32       0, 1, 2, 3
    IDTX_16x32       4, 5, 6, 7
    IDTX_16x32       8, 9, 10, 11
    IDTX_16x32       12, 13, 14, 15
    vpbroadcastd     m16, [pw_8192] ; multiplier used inside the transpose
    call             .transpose_2x8x8_round
    lea              r3, [strideq*2]
    lea              r4, [strideq*3]
    pxor             m18, m18 ; zero register required by IDTX_16x32_STORE
    IDTX_16x32_STORE 0, 8
    IDTX_16x32_STORE 1, 9
    IDTX_16x32_STORE 2, 10
    IDTX_16x32_STORE 3, 11
    IDTX_16x32_STORE 4, 12
    IDTX_16x32_STORE 5, 13
    IDTX_16x32_STORE 6, 14
    IDTX_16x32_STORE 7, 15
    RET
ALIGN function_align
.transpose_2x8x8_round:
    ; First group: m0-m7, with m17 as scratch; multiplied by m16 midway.
    punpckhwd        m17, m4, m5
    punpcklwd        m4, m5
    punpckhwd        m5, m0, m1
    punpcklwd        m0, m1
    punpckhwd        m1, m6, m7
    punpcklwd        m6, m7
    punpckhwd        m7, m2, m3
    punpcklwd        m2, m3
    punpckhdq        m3, m0, m2
    punpckldq        m0, m2
    punpckldq        m2, m4, m6
    punpckhdq        m4, m6
    punpckhdq        m6, m5, m7
    punpckldq        m5, m7
    punpckldq        m7, m17, m1
    punpckhdq        m17, m1
    REPX             {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
    punpckhqdq       m1, m0, m2
    punpcklqdq       m0, m2
    punpcklqdq       m2, m3, m4
    punpckhqdq       m3, m4
    punpcklqdq       m4, m5, m7
    punpckhqdq       m5, m7
    punpckhqdq       m7, m6, m17
    punpcklqdq       m6, m17
    ; Second group: m8-m15, same pattern.
    punpckhwd        m17, m12, m13
    punpcklwd        m12, m13
    punpckhwd        m13, m8, m9
    punpcklwd        m8, m9
    punpckhwd        m9, m14, m15
    punpcklwd        m14, m15
    punpckhwd        m15, m10, m11
    punpcklwd        m10, m11
    punpckhdq        m11, m8, m10
    punpckldq        m8, m10
    punpckldq        m10, m12, m14
    punpckhdq        m12, m14
    punpckhdq        m14, m13, m15
    punpckldq        m13, m15
    punpckldq        m15, m17, m9
    punpckhdq        m17, m9
    REPX             {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
    punpckhqdq       m9, m8, m10
    punpcklqdq       m8, m10
punpcklqdq m10, m11, m12 4237*c0909341SAndroid Build Coastguard Worker punpckhqdq m11, m12 4238*c0909341SAndroid Build Coastguard Worker punpcklqdq m12, m13, m15 4239*c0909341SAndroid Build Coastguard Worker punpckhqdq m13, m15 4240*c0909341SAndroid Build Coastguard Worker punpckhqdq m15, m14, m17 4241*c0909341SAndroid Build Coastguard Worker punpcklqdq m14, m17 4242*c0909341SAndroid Build Coastguard Worker ret 4243*c0909341SAndroid Build Coastguard Worker 4244*c0909341SAndroid Build Coastguard Worker%macro IDTX_32x16 4 ; dst[1-4] 4245*c0909341SAndroid Build Coastguard Worker pmulhrsw m%2, m12, [cq+32*(%1+ 0)] 4246*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m12, [cq+32*(%1+16)] 4247*c0909341SAndroid Build Coastguard Worker pmulhrsw m%4, m12, [cq+32*(%3+ 0)] 4248*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m12, [cq+32*(%3+16)] 4249*c0909341SAndroid Build Coastguard Worker REPX {paddsw x, x}, m%2, m18, m%4, m19 4250*c0909341SAndroid Build Coastguard Worker mova m%1, m14 4251*c0909341SAndroid Build Coastguard Worker vpermi2q m%1, m%2, m18 4252*c0909341SAndroid Build Coastguard Worker vpermt2q m%2, m16, m18 4253*c0909341SAndroid Build Coastguard Worker%if %3 != 14 4254*c0909341SAndroid Build Coastguard Worker mova m%3, m14 4255*c0909341SAndroid Build Coastguard Worker%endif 4256*c0909341SAndroid Build Coastguard Worker vpermi2q m%3, m%4, m19 4257*c0909341SAndroid Build Coastguard Worker vpermt2q m%4, m16, m19 4258*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m17, m%1 4259*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m17, m%2 4260*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m17, m%3 4261*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m17, m%4 4262*c0909341SAndroid Build Coastguard Worker REPX {paddsw x, x}, m%1, m%2, m%3, m%4 4263*c0909341SAndroid Build Coastguard Worker paddsw m%1, m18 4264*c0909341SAndroid Build Coastguard Worker paddsw m%2, m19 4265*c0909341SAndroid Build Coastguard Worker paddsw m%3, m20 
4266*c0909341SAndroid Build Coastguard Worker paddsw m%4, m21 4267*c0909341SAndroid Build Coastguard Worker%endmacro 4268*c0909341SAndroid Build Coastguard Worker 4269*c0909341SAndroid Build Coastguard Worker%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 4270*c0909341SAndroid Build Coastguard Worker mova ym19, [dstq+strideq*0] 4271*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, [dstq+strideq*8], 1 4272*c0909341SAndroid Build Coastguard Worker%if %3 == 0 4273*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%1*2+0)], m20 4274*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%1*2+1)], m20 4275*c0909341SAndroid Build Coastguard Worker%endif 4276*c0909341SAndroid Build Coastguard Worker punpcklbw m18, m19, m20 4277*c0909341SAndroid Build Coastguard Worker punpckhbw m19, m20 4278*c0909341SAndroid Build Coastguard Worker paddw m18, m%1 4279*c0909341SAndroid Build Coastguard Worker paddw m19, m%2 4280*c0909341SAndroid Build Coastguard Worker packuswb m18, m19 4281*c0909341SAndroid Build Coastguard Worker mova [dstq+strideq*0], ym18 4282*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+strideq*8], m18, 1 4283*c0909341SAndroid Build Coastguard Worker%if %3 || %1 != 7 4284*c0909341SAndroid Build Coastguard Worker add dstq, strideq 4285*c0909341SAndroid Build Coastguard Worker%endif 4286*c0909341SAndroid Build Coastguard Worker%endmacro 4287*c0909341SAndroid Build Coastguard Worker 4288*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c 4289*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [pw_2896x8] 4290*c0909341SAndroid Build Coastguard Worker movu m14, [permB+7] 4291*c0909341SAndroid Build Coastguard Worker vpbroadcastd m17, [pw_1697x16] 4292*c0909341SAndroid Build Coastguard Worker psrlq m16, m14, 4 4293*c0909341SAndroid Build Coastguard Worker IDTX_32x16 0, 1, 2, 3 4294*c0909341SAndroid Build Coastguard Worker IDTX_32x16 4, 5, 6, 7 4295*c0909341SAndroid Build 
Coastguard Worker IDTX_32x16 8, 9, 10, 11 4296*c0909341SAndroid Build Coastguard Worker IDTX_32x16 12, 13, 14, 15 4297*c0909341SAndroid Build Coastguard Worker vpbroadcastd m16, [pw_2048] 4298*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round 4299*c0909341SAndroid Build Coastguard Worker pxor m20, m20 4300*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 0, 8 4301*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 1, 9 4302*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 2, 10 4303*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 3, 11 4304*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 4, 12 4305*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 5, 13 4306*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 6, 14 4307*c0909341SAndroid Build Coastguard Worker IDTX_32x16_STORE 7, 15 4308*c0909341SAndroid Build Coastguard Worker RET 4309*c0909341SAndroid Build Coastguard Worker 4310*c0909341SAndroid Build Coastguard Worker%macro IDCT_32x32_END 4 ; src, mem, stride[1-2] 4311*c0909341SAndroid Build Coastguard Worker pmovzxbw m10, [dstq+%3] 4312*c0909341SAndroid Build Coastguard Worker pmovzxbw m11, [r3 +%4] 4313*c0909341SAndroid Build Coastguard Worker%if %2 < 8 4314*c0909341SAndroid Build Coastguard Worker paddsw m8, m%2, m%1 4315*c0909341SAndroid Build Coastguard Worker psubsw m9, m%2, m%1 4316*c0909341SAndroid Build Coastguard Worker%else 4317*c0909341SAndroid Build Coastguard Worker mova m9, [cq+64*(%2*2-16)] 4318*c0909341SAndroid Build Coastguard Worker paddsw m8, m9, m%1 4319*c0909341SAndroid Build Coastguard Worker psubsw m9, m%1 4320*c0909341SAndroid Build Coastguard Worker%endif 4321*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m12 4322*c0909341SAndroid Build Coastguard Worker pmulhrsw m9, m12 4323*c0909341SAndroid Build Coastguard Worker%if %2 >= 8 4324*c0909341SAndroid Build Coastguard Worker%if %2 == 8 4325*c0909341SAndroid 
Build Coastguard Worker pxor m0, m0 4326*c0909341SAndroid Build Coastguard Worker%endif 4327*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%2*2-16)], m0 4328*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%2*2-15)], m0 4329*c0909341SAndroid Build Coastguard Worker%endif 4330*c0909341SAndroid Build Coastguard Worker paddw m8, m10 4331*c0909341SAndroid Build Coastguard Worker paddw m9, m11 4332*c0909341SAndroid Build Coastguard Worker packuswb m8, m9 4333*c0909341SAndroid Build Coastguard Worker vpermq m8, m13, m8 4334*c0909341SAndroid Build Coastguard Worker mova [dstq+%3], ym8 4335*c0909341SAndroid Build Coastguard Worker vextracti32x8 [r3 +%4], m8, 1 4336*c0909341SAndroid Build Coastguard Worker%if %2 == 3 || %2 == 7 || %2 == 11 4337*c0909341SAndroid Build Coastguard Worker add dstq, r5 4338*c0909341SAndroid Build Coastguard Worker sub r3, r5 4339*c0909341SAndroid Build Coastguard Worker%endif 4340*c0909341SAndroid Build Coastguard Worker%endmacro 4341*c0909341SAndroid Build Coastguard Worker 4342*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob 4343*c0909341SAndroid Build Coastguard Worker%undef cmp 4344*c0909341SAndroid Build Coastguard Worker lea r5, [o_base] 4345*c0909341SAndroid Build Coastguard Worker test eobd, eobd 4346*c0909341SAndroid Build Coastguard Worker jz .dconly 4347*c0909341SAndroid Build Coastguard Worker WIN64_SPILL_XMM 30 4348*c0909341SAndroid Build Coastguard Worker cmp eobd, 136 4349*c0909341SAndroid Build Coastguard Worker jb .fast 4350*c0909341SAndroid Build Coastguard Worker mova m5, [cq+64*20] 4351*c0909341SAndroid Build Coastguard Worker mova m3, [cq+64*12] 4352*c0909341SAndroid Build Coastguard Worker mova m1, [cq+64* 4] 4353*c0909341SAndroid Build Coastguard Worker mova m7, [cq+64*28] 4354*c0909341SAndroid Build Coastguard Worker mova m2, [cq+64* 8] 4355*c0909341SAndroid Build Coastguard Worker mova m6, [cq+64*24] 4356*c0909341SAndroid Build Coastguard Worker 
mova m0, [cq+64* 0] 4357*c0909341SAndroid Build Coastguard Worker mova m4, [cq+64*16] 4358*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main 4359*c0909341SAndroid Build Coastguard Worker mova m14, [cq+64* 2] 4360*c0909341SAndroid Build Coastguard Worker mova m21, [cq+64*30] 4361*c0909341SAndroid Build Coastguard Worker mova m18, [cq+64*18] 4362*c0909341SAndroid Build Coastguard Worker mova m17, [cq+64*14] 4363*c0909341SAndroid Build Coastguard Worker mova m16, [cq+64*10] 4364*c0909341SAndroid Build Coastguard Worker mova m19, [cq+64*22] 4365*c0909341SAndroid Build Coastguard Worker mova m20, [cq+64*26] 4366*c0909341SAndroid Build Coastguard Worker mova m15, [cq+64* 6] 4367*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 4368*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 4369*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m15 4370*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m16 4371*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m17 4372*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m18 4373*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m19 4374*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m20 4375*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m21 4376*c0909341SAndroid Build Coastguard Worker mova m22, [cq+64* 1] 4377*c0909341SAndroid Build Coastguard Worker mova m21, [cq+64*31] 4378*c0909341SAndroid Build Coastguard Worker mova m14, [cq+64*17] 4379*c0909341SAndroid Build Coastguard Worker mova m29, [cq+64*15] 4380*c0909341SAndroid Build Coastguard Worker mova m26, [cq+64* 9] 4381*c0909341SAndroid Build Coastguard Worker mova m17, [cq+64*23] 4382*c0909341SAndroid Build Coastguard Worker mova m18, [cq+64*25] 4383*c0909341SAndroid Build Coastguard Worker mova m25, [cq+64* 7] 4384*c0909341SAndroid Build Coastguard Worker mova m24, [cq+64* 5] 4385*c0909341SAndroid Build Coastguard Worker 
mova m19, [cq+64*27] 4386*c0909341SAndroid Build Coastguard Worker mova m16, [cq+64*21] 4387*c0909341SAndroid Build Coastguard Worker mova m27, [cq+64*11] 4388*c0909341SAndroid Build Coastguard Worker mova m28, [cq+64*13] 4389*c0909341SAndroid Build Coastguard Worker mova m15, [cq+64*19] 4390*c0909341SAndroid Build Coastguard Worker mova m20, [cq+64*29] 4391*c0909341SAndroid Build Coastguard Worker mova m23, [cq+64* 3] 4392*c0909341SAndroid Build Coastguard Worker call .main_oddhalf 4393*c0909341SAndroid Build Coastguard Worker vpbroadcastd m10, [o(pw_8192)] 4394*c0909341SAndroid Build Coastguard Worker psubsw m13, m0, m29 ; 31 4395*c0909341SAndroid Build Coastguard Worker paddsw m0, m29 ; 0 4396*c0909341SAndroid Build Coastguard Worker psubsw m29, m1, m28 ; 30 4397*c0909341SAndroid Build Coastguard Worker paddsw m1, m28 ; 1 4398*c0909341SAndroid Build Coastguard Worker psubsw m28, m2, m27 ; 29 4399*c0909341SAndroid Build Coastguard Worker paddsw m2, m27 ; 2 4400*c0909341SAndroid Build Coastguard Worker psubsw m27, m3, m26 ; 28 4401*c0909341SAndroid Build Coastguard Worker paddsw m3, m26 ; 3 4402*c0909341SAndroid Build Coastguard Worker psubsw m26, m4, m25 ; 27 4403*c0909341SAndroid Build Coastguard Worker paddsw m4, m25 ; 4 4404*c0909341SAndroid Build Coastguard Worker psubsw m25, m5, m24 ; 26 4405*c0909341SAndroid Build Coastguard Worker paddsw m5, m24 ; 5 4406*c0909341SAndroid Build Coastguard Worker psubsw m24, m6, m23 ; 25 4407*c0909341SAndroid Build Coastguard Worker paddsw m6, m23 ; 6 4408*c0909341SAndroid Build Coastguard Worker psubsw m23, m7, m22 ; 24 4409*c0909341SAndroid Build Coastguard Worker paddsw m7, m22 ; 7 4410*c0909341SAndroid Build Coastguard Worker pxor m9, m9 4411*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 4412*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 4413*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 
4414*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 4415*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 4416*c0909341SAndroid Build Coastguard Worker punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 4417*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 4418*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 4419*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 4420*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 4421*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m23, m24 4422*c0909341SAndroid Build Coastguard Worker punpcklwd m23, m24 4423*c0909341SAndroid Build Coastguard Worker punpckhwd m24, m25, m26 4424*c0909341SAndroid Build Coastguard Worker punpcklwd m25, m26 4425*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 4426*c0909341SAndroid Build Coastguard Worker punpckhwd m26, m27, m28 4427*c0909341SAndroid Build Coastguard Worker punpcklwd m27, m28 4428*c0909341SAndroid Build Coastguard Worker punpckhwd m28, m29, m13 4429*c0909341SAndroid Build Coastguard Worker punpcklwd m29, m13 4430*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 4431*c0909341SAndroid Build Coastguard Worker punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 4432*c0909341SAndroid Build Coastguard Worker punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 4433*c0909341SAndroid Build Coastguard Worker punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 4434*c0909341SAndroid Build Coastguard Worker punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 4435*c0909341SAndroid Build Coastguard Worker punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 4436*c0909341SAndroid Build Coastguard Worker punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 4437*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 
4438*c0909341SAndroid Build Coastguard Worker punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 4439*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m0, m4, m8, m22 4440*c0909341SAndroid Build Coastguard Worker punpckhdq m13, m23, m25 4441*c0909341SAndroid Build Coastguard Worker punpckldq m23, m25 4442*c0909341SAndroid Build Coastguard Worker punpckhdq m25, m27, m29 4443*c0909341SAndroid Build Coastguard Worker punpckldq m27, m29 4444*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m13, m23, m25, m27 4445*c0909341SAndroid Build Coastguard Worker punpckhdq m9, m3, m24 4446*c0909341SAndroid Build Coastguard Worker punpckldq m3, m24 4447*c0909341SAndroid Build Coastguard Worker punpckhdq m24, m26, m28 4448*c0909341SAndroid Build Coastguard Worker punpckldq m26, m28 4449*c0909341SAndroid Build Coastguard Worker punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 4450*c0909341SAndroid Build Coastguard Worker punpckhqdq m23, m27 ; d01 d09 d17 d25 4451*c0909341SAndroid Build Coastguard Worker punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 4452*c0909341SAndroid Build Coastguard Worker punpcklqdq m13, m25 ; d02 d10 d18 d26 4453*c0909341SAndroid Build Coastguard Worker punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 4454*c0909341SAndroid Build Coastguard Worker punpcklqdq m3, m26 ; d04 d12 d20 d28 4455*c0909341SAndroid Build Coastguard Worker punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 4456*c0909341SAndroid Build Coastguard Worker punpcklqdq m9, m24 ; d06 d14 d22 d30 4457*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m25, m3, m26 4458*c0909341SAndroid Build Coastguard Worker mova [cq+64* 9], m23 4459*c0909341SAndroid Build Coastguard Worker mova [cq+64*11], m27 4460*c0909341SAndroid Build Coastguard Worker mova [cq+64*13], m25 4461*c0909341SAndroid Build Coastguard Worker mova [cq+64*15], m26 4462*c0909341SAndroid Build Coastguard Worker punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 4463*c0909341SAndroid Build Coastguard Worker punpcklqdq m8, m22 ; a04 
a12 a20 a28 4464*c0909341SAndroid Build Coastguard Worker punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 4465*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m4 ; a00 a08 a16 a24 4466*c0909341SAndroid Build Coastguard Worker punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 4467*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m2 ; a02 a10 a18 a26 4468*c0909341SAndroid Build Coastguard Worker punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 4469*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m1 ; a06 a14 a22 a30 4470*c0909341SAndroid Build Coastguard Worker mova m2, [cq+64* 0] 4471*c0909341SAndroid Build Coastguard Worker mova m11, [cq+64* 2] 4472*c0909341SAndroid Build Coastguard Worker mova m12, [cq+64* 4] 4473*c0909341SAndroid Build Coastguard Worker mova m29, [cq+64* 6] 4474*c0909341SAndroid Build Coastguard Worker mova m27, [cq+64* 8] 4475*c0909341SAndroid Build Coastguard Worker mova m26, [cq+64*10] 4476*c0909341SAndroid Build Coastguard Worker mova m4, [cq+64*12] 4477*c0909341SAndroid Build Coastguard Worker mova m28, [cq+64*14] 4478*c0909341SAndroid Build Coastguard Worker psubsw m1, m2, m21 ; 23 4479*c0909341SAndroid Build Coastguard Worker paddsw m2, m21 ; 8 4480*c0909341SAndroid Build Coastguard Worker psubsw m21, m11, m20 ; 22 4481*c0909341SAndroid Build Coastguard Worker paddsw m11, m20 ; 9 4482*c0909341SAndroid Build Coastguard Worker psubsw m20, m12, m19 ; 21 4483*c0909341SAndroid Build Coastguard Worker paddsw m12, m19 ; 10 4484*c0909341SAndroid Build Coastguard Worker psubsw m19, m29, m18 ; 20 4485*c0909341SAndroid Build Coastguard Worker paddsw m29, m18 ; 11 4486*c0909341SAndroid Build Coastguard Worker psubsw m18, m27, m17 ; 19 4487*c0909341SAndroid Build Coastguard Worker paddsw m27, m17 ; 12 4488*c0909341SAndroid Build Coastguard Worker psubsw m17, m26, m16 ; 18 4489*c0909341SAndroid Build Coastguard Worker paddsw m26, m16 ; 13 4490*c0909341SAndroid Build Coastguard Worker paddsw m16, m4, m15 ; 14 4491*c0909341SAndroid Build Coastguard Worker 
psubsw m4, m15 ; 17 4492*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m6, m10 4493*c0909341SAndroid Build Coastguard Worker psubsw m6, m28, m14 ; 16 4494*c0909341SAndroid Build Coastguard Worker paddsw m28, m14 ; 15 4495*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m7, m10 4496*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m6, m4 4497*c0909341SAndroid Build Coastguard Worker punpckhwd m6, m4 4498*c0909341SAndroid Build Coastguard Worker punpckhwd m4, m17, m18 4499*c0909341SAndroid Build Coastguard Worker punpcklwd m17, m18 4500*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m19, m20 4501*c0909341SAndroid Build Coastguard Worker punpcklwd m19, m20 4502*c0909341SAndroid Build Coastguard Worker punpckhwd m20, m21, m1 4503*c0909341SAndroid Build Coastguard Worker punpcklwd m21, m1 4504*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 4505*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 4506*c0909341SAndroid Build Coastguard Worker punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 4507*c0909341SAndroid Build Coastguard Worker punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 4508*c0909341SAndroid Build Coastguard Worker punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 4509*c0909341SAndroid Build Coastguard Worker punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 4510*c0909341SAndroid Build Coastguard Worker punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 4511*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 4512*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m10 4513*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m10 4514*c0909341SAndroid Build Coastguard Worker punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 4515*c0909341SAndroid Build Coastguard Worker punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 4516*c0909341SAndroid Build Coastguard Worker punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 
4517*c0909341SAndroid Build Coastguard Worker punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 4518*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m28, m2, m12, m27 4519*c0909341SAndroid Build Coastguard Worker punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 4520*c0909341SAndroid Build Coastguard Worker punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 4521*c0909341SAndroid Build Coastguard Worker punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 4522*c0909341SAndroid Build Coastguard Worker punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 4523*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m16, m1, m11, m29 4524*c0909341SAndroid Build Coastguard Worker punpckhdq m26, m19, m21 4525*c0909341SAndroid Build Coastguard Worker punpckldq m19, m21 4526*c0909341SAndroid Build Coastguard Worker punpckhdq m21, m6, m4 4527*c0909341SAndroid Build Coastguard Worker punpckldq m6, m4 4528*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m26, m19, m21, m6 4529*c0909341SAndroid Build Coastguard Worker punpckhdq m4, m18, m20 4530*c0909341SAndroid Build Coastguard Worker punpckldq m18, m20 4531*c0909341SAndroid Build Coastguard Worker punpckhdq m20, m7, m17 4532*c0909341SAndroid Build Coastguard Worker punpckldq m7, m17 4533*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m10}, m4, m18, m20, m7 4534*c0909341SAndroid Build Coastguard Worker punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 4535*c0909341SAndroid Build Coastguard Worker punpckhqdq m28, m12 ; b03 b11 b19 b27 4536*c0909341SAndroid Build Coastguard Worker punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 4537*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m27 ; b00 b08 b16 b24 4538*c0909341SAndroid Build Coastguard Worker punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 4539*c0909341SAndroid Build Coastguard Worker punpcklqdq m1, m29 ; b04 b12 b20 b28 4540*c0909341SAndroid Build Coastguard Worker punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 4541*c0909341SAndroid Build Coastguard Worker 
punpcklqdq m16, m11 ; b06 b14 b22 b30 4542*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m12 4543*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m28 4544*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m27 4545*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m29 4546*c0909341SAndroid Build Coastguard Worker punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 4547*c0909341SAndroid Build Coastguard Worker punpcklqdq m20, m26 ; c02 c10 c18 c26 4548*c0909341SAndroid Build Coastguard Worker punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 4549*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m19 ; c00 c08 c16 c24 4550*c0909341SAndroid Build Coastguard Worker punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 4551*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m18 ; c04 c12 c20 c28 4552*c0909341SAndroid Build Coastguard Worker punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 4553*c0909341SAndroid Build Coastguard Worker punpcklqdq m21, m4 ; c06 c14 c22 c30 4554*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m9, m10 4555*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 4556*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 4557*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 4558*c0909341SAndroid Build Coastguard Worker vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 4559*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 4560*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 4561*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 4562*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 4563*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m6, q3131 ; 12 4564*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m6, q2020 ; 4 4565*c0909341SAndroid Build Coastguard Worker vshufi32x4 
m6, m4, m2, q3131 ; 24 4566*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m2, q2020 ; 16 4567*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m0, m7, q3131 ; 8 4568*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m7, q2020 ; 0 4569*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m5, m8, q3131 ; 28 4570*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m8, q2020 ; 20 4571*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main 4572*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 4573*c0909341SAndroid Build Coastguard Worker vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 4574*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 4575*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 4576*c0909341SAndroid Build Coastguard Worker vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30 4577*c0909341SAndroid Build Coastguard Worker vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 4578*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 4579*c0909341SAndroid Build Coastguard Worker vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 4580*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m14, m20, q3131 ; 10 4581*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m20, q2020 ; 2 4582*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m18, m17, q3131 ; 26 4583*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m17, q2020 ; 18 4584*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m15, m21, q3131 ; 14 4585*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m21, q2020 ; 6 4586*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m19, m13, q3131 ; 30 4587*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m13, q2020 ; 22 4588*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 
4589*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 4590*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m15 4591*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m16 4592*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m17 4593*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m18 4594*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m19 4595*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m20 4596*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m21 4597*c0909341SAndroid Build Coastguard Worker mova m15, [cq+64* 1] 4598*c0909341SAndroid Build Coastguard Worker mova m16, [cq+64* 3] 4599*c0909341SAndroid Build Coastguard Worker mova m17, [cq+64* 5] 4600*c0909341SAndroid Build Coastguard Worker mova m19, [cq+64* 7] 4601*c0909341SAndroid Build Coastguard Worker mova m20, [cq+64* 9] 4602*c0909341SAndroid Build Coastguard Worker mova m21, [cq+64*11] 4603*c0909341SAndroid Build Coastguard Worker mova m13, [cq+64*13] 4604*c0909341SAndroid Build Coastguard Worker mova m18, [cq+64*15] 4605*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 4606*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 4607*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 4608*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 4609*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 4610*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 4611*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 4612*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 4613*c0909341SAndroid Build Coastguard Worker vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 4614*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m20, q3232 ; c17 c25 d17 
d25 4615*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 4616*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 4617*c0909341SAndroid Build Coastguard Worker vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 4618*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 4619*c0909341SAndroid Build Coastguard Worker vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 4620*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 4621*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m14, m26, q3131 ; 25 4622*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m26, q2020 ; 17 4623*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m15, m27, q3131 ; 27 4624*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m27, q2020 ; 19 4625*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m16, m28, q3131 ; 29 4626*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m28, q2020 ; 21 4627*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m17, m29, q3131 ; 31 4628*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m29, q2020 ; 23 4629*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m22, m8, q3131 ; 9 4630*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m8, q2020 ; 1 4631*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m23, m9, q3131 ; 11 4632*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m9, q2020 ; 3 4633*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m24, m11, q3131 ; 13 4634*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m11, q2020 ; 5 4635*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m25, m12, q3131 ; 15 4636*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m12, q2020 ; 7 4637*c0909341SAndroid Build Coastguard Worker call .main_oddhalf 4638*c0909341SAndroid Build Coastguard Worker jmp .end 4639*c0909341SAndroid Build Coastguard 
Worker.fast: ; bottom/right halves are zero 4640*c0909341SAndroid Build Coastguard Worker mova m14, [o(dup16_perm)] 4641*c0909341SAndroid Build Coastguard Worker pmovzxwd m9, [cq+64* 0] 4642*c0909341SAndroid Build Coastguard Worker pmovzxwd m6, [cq+64* 8] 4643*c0909341SAndroid Build Coastguard Worker vpermb m8, m14, [cq+64* 2] 4644*c0909341SAndroid Build Coastguard Worker vpermb ym0, ym14, [cq+64*14] 4645*c0909341SAndroid Build Coastguard Worker vpermb ym5, ym14, [cq+64*10] 4646*c0909341SAndroid Build Coastguard Worker vpermb m1, m14, [cq+64* 6] 4647*c0909341SAndroid Build Coastguard Worker vpermb m7, m14, [cq+64* 4] 4648*c0909341SAndroid Build Coastguard Worker vpermb ym3, ym14, [cq+64*12] 4649*c0909341SAndroid Build Coastguard Worker pslld m9, 16 4650*c0909341SAndroid Build Coastguard Worker pslld m6, 16 4651*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main_fast 4652*c0909341SAndroid Build Coastguard Worker vpermb m21, m14, [cq+64* 1] 4653*c0909341SAndroid Build Coastguard Worker vpermb ym17, ym14, [cq+64*15] 4654*c0909341SAndroid Build Coastguard Worker vpermb ym20, ym14, [cq+64* 9] 4655*c0909341SAndroid Build Coastguard Worker vpermb m15, m14, [cq+64* 7] 4656*c0909341SAndroid Build Coastguard Worker vpermb m18, m14, [cq+64* 5] 4657*c0909341SAndroid Build Coastguard Worker vpermb ym16, ym14, [cq+64*11] 4658*c0909341SAndroid Build Coastguard Worker vpermb ym19, ym14, [cq+64*13] 4659*c0909341SAndroid Build Coastguard Worker vpermb m14, m14, [cq+64* 3] 4660*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 4661*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_8192)] 4662*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round 4663*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m14, m2, q2020 ; 1 4664*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m14, m2, q3131 ; 5 4665*c0909341SAndroid Build Coastguard Worker 
vshufi32x4 m23, m17, m9, q2020 ; 3 4666*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m17, m9, q3131 ; 7 4667*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m5, m15, q2020 ; 10 4668*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m5, m15, q3131 ; 14 4669*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m1, m18, q2020 ; 2 4670*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m1, m18, q3131 ; 6 4671*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m0, m3, q3131 ; 4 4672*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m3, q2020 ; 0 4673*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m21, m4, q3131 ; 12 4674*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m21, m4, q2020 ; 8 4675*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m20, m6, q2020 ; 9 4676*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m20, m6, q3131 ; 13 4677*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m19, m7, q2020 ; 11 4678*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m19, m7, q3131 ; 15 4679*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 4680*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 4681*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m15 4682*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m16 4683*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m17 4684*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m18 4685*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m19 4686*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m20 4687*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m21 4688*c0909341SAndroid Build Coastguard Worker call .main_oddhalf_fast 4689*c0909341SAndroid Build Coastguard Worker.end: 4690*c0909341SAndroid Build Coastguard Worker lea r4, [strideq*3] 4691*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_2048)] 
4692*c0909341SAndroid Build Coastguard Worker movshdup m13, [o(permD)] 4693*c0909341SAndroid Build Coastguard Worker lea r3, [dstq+r4*8] 4694*c0909341SAndroid Build Coastguard Worker lea r5, [strideq+r4] ; stride*4 4695*c0909341SAndroid Build Coastguard Worker add r3, r5 ; dst+stride*28 4696*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 29, 0, strideq*0, r4 4697*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 28, 1, strideq*1, strideq*2 4698*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 27, 2, strideq*2, strideq*1 4699*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 26, 3, r4 , strideq*0 4700*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 25, 4, strideq*0, r4 4701*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 24, 5, strideq*1, strideq*2 4702*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 23, 6, strideq*2, strideq*1 4703*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 22, 7, r4 , strideq*0 4704*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 21, 8, strideq*0, r4 4705*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 20, 9, strideq*1, strideq*2 4706*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 19, 10, strideq*2, strideq*1 4707*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 18, 11, r4 , strideq*0 4708*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 17, 12, strideq*0, r4 4709*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 16, 13, strideq*1, strideq*2 4710*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 15, 14, strideq*2, strideq*1 4711*c0909341SAndroid Build Coastguard Worker IDCT_32x32_END 14, 15, r4 , strideq*0 4712*c0909341SAndroid Build Coastguard Worker RET 4713*c0909341SAndroid Build Coastguard Worker.dconly: 4714*c0909341SAndroid Build Coastguard Worker movsx r6d, word [cq] 4715*c0909341SAndroid Build Coastguard Worker mov [cq], eobd 4716*c0909341SAndroid Build Coastguard Worker or r3d, 32 4717*c0909341SAndroid Build Coastguard 
Worker jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 4718*c0909341SAndroid Build Coastguard WorkerALIGN function_align 4719*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero 4720*c0909341SAndroid Build Coastguard Worker vpbroadcastd m21, [o(pw_4091x8)] 4721*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_201x8)] 4722*c0909341SAndroid Build Coastguard Worker vpbroadcastd m24, [o(pw_m601x8)] 4723*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_4052x8)] 4724*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m22 ; t31a 4725*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m8 ; t16a 4726*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m23 ; t23a 4727*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m12 ; t24a 4728*c0909341SAndroid Build Coastguard Worker 4729*c0909341SAndroid Build Coastguard Worker punpcklwd m9, m22, m21 4730*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m22, m21 4731*c0909341SAndroid Build Coastguard Worker mova m15, m10 4732*c0909341SAndroid Build Coastguard Worker vpdpwssd m15, m9, [o(pw_m4017_799)] {bcstd} 4733*c0909341SAndroid Build Coastguard Worker mova m17, m10 4734*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m8, [o(pw_m4017_799)] {bcstd} 4735*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 12}, m15, m17 4736*c0909341SAndroid Build Coastguard Worker packssdw m15, m17 4737*c0909341SAndroid Build Coastguard Worker mova m17, m10 4738*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m8, [o(pw_799_4017)] {bcstd} 4739*c0909341SAndroid Build Coastguard Worker mova m8, m10 4740*c0909341SAndroid Build Coastguard Worker vpdpwssd m8, m9, [o(pw_799_4017)] {bcstd} 4741*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 12}, m17, m8 4742*c0909341SAndroid Build Coastguard Worker packssdw m8, m17 4743*c0909341SAndroid Build Coastguard Worker 4744*c0909341SAndroid Build Coastguard Worker punpcklwd m9, m24, m23 
4745*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m24, m23 4746*c0909341SAndroid Build Coastguard Worker mova m20, m10 4747*c0909341SAndroid Build Coastguard Worker vpdpwssd m20, m9, [o(pw_m3406_m2276)] {bcstd} 4748*c0909341SAndroid Build Coastguard Worker mova m17, m10 4749*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m16, [o(pw_m3406_m2276)] {bcstd} 4750*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 12}, m20, m17 4751*c0909341SAndroid Build Coastguard Worker packssdw m20, m17 4752*c0909341SAndroid Build Coastguard Worker mova m17, m10 4753*c0909341SAndroid Build Coastguard Worker vpdpwssd m17, m16, [o(pw_m2276_3406)] {bcstd} 4754*c0909341SAndroid Build Coastguard Worker mova m16, m10 4755*c0909341SAndroid Build Coastguard Worker vpdpwssd m16, m9, [o(pw_m2276_3406)] {bcstd} 4756*c0909341SAndroid Build Coastguard Worker REPX {psrad x, 12}, m17, m16 4757*c0909341SAndroid Build Coastguard Worker packssdw m16, m17 4758*c0909341SAndroid Build Coastguard Worker 4759*c0909341SAndroid Build Coastguard Worker mova m17, m21 4760*c0909341SAndroid Build Coastguard Worker mova m27, m15 4761*c0909341SAndroid Build Coastguard Worker mova m25, m20 4762*c0909341SAndroid Build Coastguard Worker mova m29, m8 4763*c0909341SAndroid Build Coastguard Worker mova m18, m22 4764*c0909341SAndroid Build Coastguard Worker mova m14, m24 4765*c0909341SAndroid Build Coastguard Worker mova m28, m16 4766*c0909341SAndroid Build Coastguard Worker mova m26, m23 4767*c0909341SAndroid Build Coastguard Worker jmp .main4 4768*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero 4769*c0909341SAndroid Build Coastguard Worker vpbroadcastd m21, [o(pw_4091x8)] 4770*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_201x8)] 4771*c0909341SAndroid Build Coastguard Worker vpbroadcastd m18, [o(pw_m1380x8)] 4772*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_3857x8)] 4773*c0909341SAndroid Build 
Coastguard Worker vpbroadcastd m19, [o(pw_3973x8)] 4774*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_995x8)] 4775*c0909341SAndroid Build Coastguard Worker vpbroadcastd m28, [o(pw_m601x8)] 4776*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_4052x8)] 4777*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m22 ; t31a 4778*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m8 ; t16a 4779*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m25 ; t19a 4780*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m9 ; t28a 4781*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m24 ; t27a 4782*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m11 ; t20a 4783*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m23 ; t23a 4784*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m12 ; t24a 4785*c0909341SAndroid Build Coastguard Worker mova m15, m21 4786*c0909341SAndroid Build Coastguard Worker mova m8, m22 4787*c0909341SAndroid Build Coastguard Worker mova m14, m18 4788*c0909341SAndroid Build Coastguard Worker mova m27, m25 4789*c0909341SAndroid Build Coastguard Worker mova m29, m19 4790*c0909341SAndroid Build Coastguard Worker mova m26, m24 4791*c0909341SAndroid Build Coastguard Worker mova m16, m28 4792*c0909341SAndroid Build Coastguard Worker mova m20, m23 4793*c0909341SAndroid Build Coastguard Worker jmp .main3 4794*c0909341SAndroid Build Coastguard WorkerALIGN function_align 4795*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; bottom half is zero 4796*c0909341SAndroid Build Coastguard Worker vpbroadcastd m21, [o(pw_4091x8)] 4797*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_201x8)] 4798*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [o(pw_m2751x8)] 4799*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_3035x8)] 4800*c0909341SAndroid Build Coastguard Worker vpbroadcastd m17, [o(pw_3703x8)] 4801*c0909341SAndroid Build Coastguard Worker 
vpbroadcastd m11, [o(pw_1751x8)] 4802*c0909341SAndroid Build Coastguard Worker vpbroadcastd m18, [o(pw_m1380x8)] 4803*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_3857x8)] 4804*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m22 ; t31a 4805*c0909341SAndroid Build Coastguard Worker vpbroadcastd m19, [o(pw_3973x8)] 4806*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m8 ; t16a 4807*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_995x8)] 4808*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m29 ; t30a 4809*c0909341SAndroid Build Coastguard Worker vpbroadcastd m16, [o(pw_m2106x8)] 4810*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m9 ; t17a 4811*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_3513x8)] 4812*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m26 ; t29a 4813*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [o(pw_3290x8)] 4814*c0909341SAndroid Build Coastguard Worker pmulhrsw m26, m11 ; t18a 4815*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2440x8)] 4816*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m25 ; t19a 4817*c0909341SAndroid Build Coastguard Worker vpbroadcastd m20, [o(pw_m601x8)] 4818*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m12 ; t28a 4819*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_4052x8)] 4820*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m24 ; t27a 4821*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m8 ; t20a 4822*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m27 ; t21a 4823*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m9 ; t26a 4824*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m28 ; t25a 4825*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m11 ; t22a 4826*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m23 ; t23a 4827*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m12 ; t24a 4828*c0909341SAndroid Build Coastguard Worker 
jmp .main2 4829*c0909341SAndroid Build Coastguard WorkerALIGN function_align 4830*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf 4831*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a 4832*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a 4833*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a 4834*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a 4835*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a 4836*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a 4837*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a 4838*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a 4839*c0909341SAndroid Build Coastguard Worker.main2: 4840*c0909341SAndroid Build Coastguard Worker psubsw m8, m22, m14 ; t17 4841*c0909341SAndroid Build Coastguard Worker paddsw m22, m14 ; t16 4842*c0909341SAndroid Build Coastguard Worker paddsw m14, m18, m26 ; t19 4843*c0909341SAndroid Build Coastguard Worker psubsw m18, m26 ; t18 4844*c0909341SAndroid Build Coastguard Worker psubsw m26, m24, m16 ; t21 4845*c0909341SAndroid Build Coastguard Worker paddsw m24, m16 ; t20 4846*c0909341SAndroid Build Coastguard Worker psubsw m16, m20, m28 ; t22 4847*c0909341SAndroid Build Coastguard Worker paddsw m28, m20 ; t23 4848*c0909341SAndroid Build Coastguard Worker psubsw m20, m23, m15 ; t25 4849*c0909341SAndroid Build Coastguard Worker paddsw m23, m15 ; t24 4850*c0909341SAndroid Build Coastguard Worker psubsw m15, m21, m29 ; t30 4851*c0909341SAndroid Build Coastguard Worker paddsw m21, m29 ; t31 4852*c0909341SAndroid Build Coastguard Worker psubsw m29, m19, m27 ; t26 4853*c0909341SAndroid Build Coastguard Worker paddsw m19, m27 ; 
t27 4854*c0909341SAndroid Build Coastguard Worker paddsw m27, m25, m17 ; t28 4855*c0909341SAndroid Build Coastguard Worker psubsw m25, m17 ; t29 4856*c0909341SAndroid Build Coastguard Worker.main3: 4857*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a 4858*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a 4859*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a 4860*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a 4861*c0909341SAndroid Build Coastguard Worker psubsw m17, m21, m27 ; t28a 4862*c0909341SAndroid Build Coastguard Worker paddsw m21, m27 ; t31a 4863*c0909341SAndroid Build Coastguard Worker psubsw m27, m15, m25 ; t18 4864*c0909341SAndroid Build Coastguard Worker paddsw m15, m25 ; t17 4865*c0909341SAndroid Build Coastguard Worker psubsw m25, m20, m29 ; t21 4866*c0909341SAndroid Build Coastguard Worker paddsw m20, m29 ; t22 4867*c0909341SAndroid Build Coastguard Worker psubsw m29, m8, m18 ; t29 4868*c0909341SAndroid Build Coastguard Worker paddsw m8, m18 ; t30 4869*c0909341SAndroid Build Coastguard Worker psubsw m18, m22, m14 ; t19a 4870*c0909341SAndroid Build Coastguard Worker paddsw m22, m14 ; t16a 4871*c0909341SAndroid Build Coastguard Worker psubsw m14, m28, m24 ; t20a 4872*c0909341SAndroid Build Coastguard Worker paddsw m24, m28 ; t23a 4873*c0909341SAndroid Build Coastguard Worker paddsw m28, m16, m26 ; t25 4874*c0909341SAndroid Build Coastguard Worker psubsw m16, m26 ; t26 4875*c0909341SAndroid Build Coastguard Worker psubsw m26, m23, m19 ; t27a 4876*c0909341SAndroid Build Coastguard Worker paddsw m23, m19 ; t24a 4877*c0909341SAndroid Build Coastguard Worker.main4: 4878*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m3784_1567)] 4879*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_1567_3784)] 4880*c0909341SAndroid Build Coastguard 
Worker ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a 4881*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 4882*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_m1567_m3784)] 4883*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a 4884*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 4885*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m2896_2896)] 4886*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2896_2896)] 4887*c0909341SAndroid Build Coastguard Worker psubsw m19, m27, m25 ; t26 4888*c0909341SAndroid Build Coastguard Worker paddsw m27, m25 ; t29 4889*c0909341SAndroid Build Coastguard Worker psubsw m25, m17, m26 ; t20a 4890*c0909341SAndroid Build Coastguard Worker paddsw m17, m26 ; t19a 4891*c0909341SAndroid Build Coastguard Worker paddsw m26, m18, m14 ; t28a 4892*c0909341SAndroid Build Coastguard Worker psubsw m18, m14 ; t27a 4893*c0909341SAndroid Build Coastguard Worker paddsw m14, m22, m24 ; t16 4894*c0909341SAndroid Build Coastguard Worker psubsw m22, m24 ; t23 4895*c0909341SAndroid Build Coastguard Worker psubsw m24, m29, m16 ; t21 4896*c0909341SAndroid Build Coastguard Worker paddsw m16, m29 ; t18 4897*c0909341SAndroid Build Coastguard Worker paddsw m29, m21, m23 ; t31 4898*c0909341SAndroid Build Coastguard Worker psubsw m21, m23 ; t24 4899*c0909341SAndroid Build Coastguard Worker psubsw m23, m15, m20 ; t22a 4900*c0909341SAndroid Build Coastguard Worker paddsw m15, m20 ; t17a 4901*c0909341SAndroid Build Coastguard Worker psubsw m20, m8, m28 ; t25a 4902*c0909341SAndroid Build Coastguard Worker paddsw m28, m8 ; t30a 4903*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 4904*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a 4905*c0909341SAndroid Build Coastguard Worker ITX_MULSUB_2W 21, 22, 
8, 9, 10, 11, 12 ; t23a, t24a
    ITX_MULSUB_2W        20, 23, 8, 9, 10, 11, 12 ; t22, t25
    ret

; IDTX_32x32 dst1, dst2
;
; For each of the two register numbers N given, load the 32-byte pieces at
; coefficient offsets 64*N and 64*(N+16) and merge them into mN with
; vpermt2q, using the qword indices held in m21.
;
; In:    cq  = coefficient pointer
;        m21 = vpermt2q index vector (set up by the caller)
; Out:   m%1, m%2
; Clobb: m17, m18
%macro IDTX_32x32 2 ; dst[1-2]
    vmovdqa32          ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
    vmovdqa32          ym17, [cq+64*(%1+16)] ; reduces code size due to
    vmovdqa32          ym%2, [cq+64*(%2+ 0)] ; compressed displacements
    vmovdqa32          ym18, [cq+64*(%2+16)]
    vpermt2q           m%1, m21, m17
    vpermt2q           m%2, m21, m18
%endmacro

; Identity/identity 32x32 inverse transform + add, 8 bpc.
;
; Args (x86inc cglobal: 3 args, 3 GPRs, 22 vector regs): dst, stride, c
;
; Register roles set up here:
;   m21 = vpermt2q indices for IDTX_32x32 (unaligned load from permB+7)
;   m16 = pw_8192 broadcast; not referenced again in this function, so it is
;         presumably consumed by .transpose_2x8x8_round - TODO confirm
;   m20 = zero; used by .zero_loop below (and possibly by IDTX_32x16_STORE,
;         which is defined outside this view - TODO confirm)
;
; The main loop runs twice: the first pass reads the 32-byte pieces at
; cq+64*n, the second pass the pieces at cq+32+64*n. Afterwards the whole
; 32x32 int16 coefficient buffer (2048 bytes) is cleared.
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
    movu                m21, [permB+7]
    vpbroadcastd        m16, [pw_8192]
    pxor                m20, m20
.loop:
    IDTX_32x32            0,  1
    IDTX_32x32            2,  3
    IDTX_32x32            4,  5
    IDTX_32x32            6,  7
    IDTX_32x32            8,  9
    IDTX_32x32           10, 11
    IDTX_32x32           12, 13
    IDTX_32x32           14, 15
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    IDTX_32x16_STORE      0,  8, 1
    IDTX_32x16_STORE      1,  9, 1
    IDTX_32x16_STORE      2, 10, 1
    IDTX_32x16_STORE      3, 11, 1
    IDTX_32x16_STORE      4, 12, 1
    IDTX_32x16_STORE      5, 13, 1
    IDTX_32x16_STORE      6, 14, 1
    IDTX_32x16_STORE      7, 15, 1
    lea                dstq, [dstq+strideq*8]  ; advance dst by 8 rows
    ; btc copies bit 5 of cq into CF and then flips it: with the bit clear on
    ; entry, the first pass leaves CF=0 (loop again with cq+32) and the second
    ; pass leaves CF=1 with cq restored to its original value.
    ; NOTE(review): relies on bit 5 of cq being clear on entry, i.e. a
    ; 64-byte-aligned coefficient buffer - confirm against callers.
    btc                  cq, 5
    jnc .loop
    mov                 r0d, 8                  ; dst is dead; reuse r0 as counter
.zero_loop:
    ; 8 iterations x 4 stores x 64 bytes = 2048 bytes of coefficients zeroed
    mova          [cq+64*0], m20
    mova          [cq+64*1], m20
    mova          [cq+64*2], m20
    mova          [cq+64*3], m20
    add                  cq, 64*4
    dec                 r0d
    jg .zero_loop
    RET

cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly
    WIN64_SPILL_XMM      30
    cmp                eobd, 151
    jb .fast
    mova                 m5, [cq+64*10]
    mova                 m3, [cq+64* 6]
4964*c0909341SAndroid Build Coastguard Worker mova m1, [cq+64* 2] 4965*c0909341SAndroid Build Coastguard Worker mova m7, [cq+64*14] 4966*c0909341SAndroid Build Coastguard Worker mova m2, [cq+64* 4] 4967*c0909341SAndroid Build Coastguard Worker mova m6, [cq+64*12] 4968*c0909341SAndroid Build Coastguard Worker mova m0, [cq+64* 0] 4969*c0909341SAndroid Build Coastguard Worker mova m4, [cq+64* 8] 4970*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main 4971*c0909341SAndroid Build Coastguard Worker mova m14, [cq+64* 1] 4972*c0909341SAndroid Build Coastguard Worker mova m21, [cq+64*15] 4973*c0909341SAndroid Build Coastguard Worker mova m18, [cq+64* 9] 4974*c0909341SAndroid Build Coastguard Worker mova m17, [cq+64* 7] 4975*c0909341SAndroid Build Coastguard Worker mova m16, [cq+64* 5] 4976*c0909341SAndroid Build Coastguard Worker mova m19, [cq+64*11] 4977*c0909341SAndroid Build Coastguard Worker mova m20, [cq+64*13] 4978*c0909341SAndroid Build Coastguard Worker mova m15, [cq+64* 3] 4979*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 4980*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_8192)] 4981*c0909341SAndroid Build Coastguard Worker%macro TRANSPOSE_8x4_ROUND 4 4982*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7 4983*c0909341SAndroid Build Coastguard Worker punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3 4984*c0909341SAndroid Build Coastguard Worker punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7 4985*c0909341SAndroid Build Coastguard Worker punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3 4986*c0909341SAndroid Build Coastguard Worker punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3 4987*c0909341SAndroid Build Coastguard Worker punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1 4988*c0909341SAndroid Build Coastguard Worker punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5 4989*c0909341SAndroid Build Coastguard Worker punpckhdq m%4, m8 ; a6 b6 
c6 d6 a7 b7 c7 d7 4990*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4 4991*c0909341SAndroid Build Coastguard Worker%endmacro 4992*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_ROUND 0, 1, 2, 3 4993*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_ROUND 4, 5, 6, 7 4994*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_ROUND 14, 15, 16, 17 4995*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_ROUND 18, 19, 20, 21 4996*c0909341SAndroid Build Coastguard Worker vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4 4997*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12 4998*c0909341SAndroid Build Coastguard Worker vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5 4999*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13 5000*c0909341SAndroid Build Coastguard Worker vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6 5001*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14 5002*c0909341SAndroid Build Coastguard Worker vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7 5003*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15 5004*c0909341SAndroid Build Coastguard Worker vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4 5005*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12 5006*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5 5007*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13 5008*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6 5009*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14 5010*c0909341SAndroid Build Coastguard Worker vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7 5011*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15 5012*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m26, m4, q2020 ; 0 1 
5013*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m4, q3131 ; 8 9 5014*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m27, m5, q2020 ; 2 3 5015*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m5, q3131 ; 10 11 5016*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m28, m6, q2020 ; 4 5 5017*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m6, q3131 ; 12 13 5018*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m29, m7, q2020 ; 6 7 5019*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m7, q3131 ; 14 15 5020*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m0, m14, q2020 ; 16 17 5021*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m0, m14, q3131 ; 24 25 5022*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m1, m15, q2020 ; 18 19 5023*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m1, m15, q3131 ; 26 27 5024*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m2, m16, q2020 ; 20 21 5025*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m2, m16, q3131 ; 28 29 5026*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m8, m17, q2020 ; 22 23 5027*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m8, m17, q3131 ; 30 31 5028*c0909341SAndroid Build Coastguard Worker pxor m6, m6 5029*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m4 5030*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m5 5031*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m3 5032*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m0 5033*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m24, m24 ; 4 5034*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m0 ; 28 5035*c0909341SAndroid Build Coastguard Worker punpcklwd m5, m5 ; 20 5036*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m28, m28 ; 12 5037*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m26, m26 ; 8 5038*c0909341SAndroid Build Coastguard Worker punpcklwd m3, m3 ; 24 
5039*c0909341SAndroid Build Coastguard Worker punpcklwd m9, m6, m22 ; __ 0 5040*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m4 ; __ 16 5041*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main_fast3 5042*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m20 5043*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m16 5044*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m19 5045*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m17 5046*c0909341SAndroid Build Coastguard Worker punpcklwd m21, m23, m23 ; 2 5047*c0909341SAndroid Build Coastguard Worker punpcklwd m17, m17 ; 30 5048*c0909341SAndroid Build Coastguard Worker punpcklwd m20, m20 ; 18 5049*c0909341SAndroid Build Coastguard Worker punpcklwd m15, m29, m29 ; 14 5050*c0909341SAndroid Build Coastguard Worker punpcklwd m18, m27, m27 ; 10 5051*c0909341SAndroid Build Coastguard Worker punpcklwd m16, m16 ; 22 5052*c0909341SAndroid Build Coastguard Worker punpcklwd m19, m19 ; 26 5053*c0909341SAndroid Build Coastguard Worker punpcklwd m14, m25, m25 ; 6 5054*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 5055*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m14 5056*c0909341SAndroid Build Coastguard Worker mova [cq+64* 9], m15 5057*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m16 5058*c0909341SAndroid Build Coastguard Worker mova [cq+64*11], m17 5059*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m18 5060*c0909341SAndroid Build Coastguard Worker mova [cq+64*13], m19 5061*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m20 5062*c0909341SAndroid Build Coastguard Worker mova [cq+64*15], m21 5063*c0909341SAndroid Build Coastguard Worker mova m21, [cq+64* 7] 5064*c0909341SAndroid Build Coastguard Worker mova m14, [cq+64* 0] 5065*c0909341SAndroid Build Coastguard Worker mova m17, [cq+64* 3] 5066*c0909341SAndroid Build Coastguard Worker mova m18, [cq+64* 4] 
5067*c0909341SAndroid Build Coastguard Worker mova m19, [cq+64* 5] 5068*c0909341SAndroid Build Coastguard Worker mova m16, [cq+64* 2] 5069*c0909341SAndroid Build Coastguard Worker mova m15, [cq+64* 1] 5070*c0909341SAndroid Build Coastguard Worker mova m20, [cq+64* 6] 5071*c0909341SAndroid Build Coastguard Worker REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ 5072*c0909341SAndroid Build Coastguard Worker m24, m19, m16, m27, m28, m15, m20, m23 5073*c0909341SAndroid Build Coastguard Worker call .main_oddhalf 5074*c0909341SAndroid Build Coastguard Worker jmp .end 5075*c0909341SAndroid Build Coastguard Worker.fast: ; right half is zero 5076*c0909341SAndroid Build Coastguard Worker mova ym8, [cq+64*15] 5077*c0909341SAndroid Build Coastguard Worker vinserti32x8 m8, [cq+64* 1], 1 5078*c0909341SAndroid Build Coastguard Worker mova m2, [o(int16_perm)] 5079*c0909341SAndroid Build Coastguard Worker mova ym9, [cq+64* 8] 5080*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, [cq+64* 0], 1 5081*c0909341SAndroid Build Coastguard Worker mova ym0, [cq+64* 7] 5082*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, [cq+64* 9], 1 5083*c0909341SAndroid Build Coastguard Worker mova ym7, [cq+64*14] 5084*c0909341SAndroid Build Coastguard Worker vinserti32x8 m7, [cq+64* 2], 1 5085*c0909341SAndroid Build Coastguard Worker mova ym1, [cq+64* 3] 5086*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, [cq+64*13], 1 5087*c0909341SAndroid Build Coastguard Worker mova ym3, [cq+64* 6] 5088*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, [cq+64*10], 1 5089*c0909341SAndroid Build Coastguard Worker mova ym5, [cq+64*11] 5090*c0909341SAndroid Build Coastguard Worker vinserti32x8 m5, [cq+64* 5], 1 5091*c0909341SAndroid Build Coastguard Worker mova ym6, [cq+64*12] 5092*c0909341SAndroid Build Coastguard Worker vinserti32x8 m6, [cq+64* 4], 1 5093*c0909341SAndroid Build Coastguard Worker REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 
5094*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main2 5095*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m8, [o(int_shuf3)] 5096*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m9, [o(int_shuf4)] 5097*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_8192)] 5098*c0909341SAndroid Build Coastguard Worker pshufb m0, m8 5099*c0909341SAndroid Build Coastguard Worker pshufb m1, m9 5100*c0909341SAndroid Build Coastguard Worker pshufb m2, m8 5101*c0909341SAndroid Build Coastguard Worker pshufb m3, m9 5102*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m11}, m0, m1, m2, m3 5103*c0909341SAndroid Build Coastguard Worker pshufb m4, m8 5104*c0909341SAndroid Build Coastguard Worker pshufb m5, m9 5105*c0909341SAndroid Build Coastguard Worker pshufb m6, m8 5106*c0909341SAndroid Build Coastguard Worker pshufb m7, m9 5107*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m11}, m4, m5, m6, m7 5108*c0909341SAndroid Build Coastguard Worker punpckhdq m28, m0, m1 5109*c0909341SAndroid Build Coastguard Worker punpckldq m0, m1 5110*c0909341SAndroid Build Coastguard Worker punpckhdq m27, m2, m3 5111*c0909341SAndroid Build Coastguard Worker punpckldq m2, m3 5112*c0909341SAndroid Build Coastguard Worker punpckhdq m22, m4, m5 5113*c0909341SAndroid Build Coastguard Worker punpckldq m4, m5 5114*c0909341SAndroid Build Coastguard Worker punpckhdq m23, m6, m7 5115*c0909341SAndroid Build Coastguard Worker punpckldq m6, m7 5116*c0909341SAndroid Build Coastguard Worker vinserti32x8 m14, m0, ym2, 1 5117*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m0, m2, q3232 5118*c0909341SAndroid Build Coastguard Worker vinserti32x8 m2, m4, ym6, 1 5119*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m6, q3232 5120*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m14, m2, q2020 ; 0 2 5121*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m2, q3131 ; 4 6 5122*c0909341SAndroid Build Coastguard 
Worker vshufi32x4 m18, m15, m4, q2020 ; 8 10 5123*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m4, q3131 ; 12 14 5124*c0909341SAndroid Build Coastguard Worker pxor m9, m9 5125*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m14, m14 ; 4 5126*c0909341SAndroid Build Coastguard Worker punpcklwd m1, m15, m15 ; 12 5127*c0909341SAndroid Build Coastguard Worker punpcklwd m7, m18, m18 ; 8 5128*c0909341SAndroid Build Coastguard Worker punpcklwd m9, m21 ; __ 0 5129*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main_fast4 5130*c0909341SAndroid Build Coastguard Worker punpckhwd m21, m21 ; 2 5131*c0909341SAndroid Build Coastguard Worker punpckhwd m15, m15 ; 14 5132*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m18 ; 10 5133*c0909341SAndroid Build Coastguard Worker punpckhwd m14, m14 ; 6 5134*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 5135*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, m28, ym27, 1 5136*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m27, q3232 5137*c0909341SAndroid Build Coastguard Worker vinserti32x8 m27, m22, ym23, 1 5138*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m23, q3232 5139*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m24, m27, q2020 ; 1 3 5140*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m27, q3131 ; 5 7 5141*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m28, m22, q2020 ; 9 11 5142*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m22, q3131 ; 13 15 5143*c0909341SAndroid Build Coastguard Worker punpcklwd m22, m23, m23 ; 1 5144*c0909341SAndroid Build Coastguard Worker punpckhwd m29, m28, m28 ; 15 5145*c0909341SAndroid Build Coastguard Worker punpcklwd m26, m27, m27 ; 9 5146*c0909341SAndroid Build Coastguard Worker punpckhwd m25, m24, m24 ; 7 5147*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m14 5148*c0909341SAndroid Build Coastguard Worker mova [cq+64* 
9], m15 5149*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m16 5150*c0909341SAndroid Build Coastguard Worker mova [cq+64*11], m17 5151*c0909341SAndroid Build Coastguard Worker punpcklwd m24, m24 ; 5 5152*c0909341SAndroid Build Coastguard Worker punpckhwd m27, m27 ; 11 5153*c0909341SAndroid Build Coastguard Worker punpcklwd m28, m28 ; 13 5154*c0909341SAndroid Build Coastguard Worker punpckhwd m23, m23 ; 3 5155*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m18 5156*c0909341SAndroid Build Coastguard Worker mova [cq+64*13], m19 5157*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m20 5158*c0909341SAndroid Build Coastguard Worker mova [cq+64*15], m21 5159*c0909341SAndroid Build Coastguard Worker call .main_oddhalf_fast 5160*c0909341SAndroid Build Coastguard Worker.end: 5161*c0909341SAndroid Build Coastguard Worker imul r6, strideq, 60 5162*c0909341SAndroid Build Coastguard Worker mova m10, [o(end_16x32p)] 5163*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2048)] 5164*c0909341SAndroid Build Coastguard Worker lea r3, [strideq*3] 5165*c0909341SAndroid Build Coastguard Worker pxor m12, m12 5166*c0909341SAndroid Build Coastguard Worker add r6, dstq ; dst+stride*60 5167*c0909341SAndroid Build Coastguard Worker psrldq m13, m10, 1 5168*c0909341SAndroid Build Coastguard Worker lea r4, [strideq+r3] ; stride*4 5169*c0909341SAndroid Build Coastguard Worker%macro IDCT_16x64_END 3 ; idct32, idct64, tmp 5170*c0909341SAndroid Build Coastguard Worker%if %1 & 1 5171*c0909341SAndroid Build Coastguard Worker %define %%s0 r3 5172*c0909341SAndroid Build Coastguard Worker %define %%s1 strideq*2 5173*c0909341SAndroid Build Coastguard Worker %define %%s2 strideq*1 5174*c0909341SAndroid Build Coastguard Worker %define %%s3 strideq*0 5175*c0909341SAndroid Build Coastguard Worker%else 5176*c0909341SAndroid Build Coastguard Worker %define %%s0 strideq*0 5177*c0909341SAndroid Build Coastguard Worker %define %%s1 strideq*1 5178*c0909341SAndroid 
Build Coastguard Worker %define %%s2 strideq*2 5179*c0909341SAndroid Build Coastguard Worker %define %%s3 r3 5180*c0909341SAndroid Build Coastguard Worker%if %1 5181*c0909341SAndroid Build Coastguard Worker add dstq, r4 5182*c0909341SAndroid Build Coastguard Worker sub r6, r4 5183*c0909341SAndroid Build Coastguard Worker%endif 5184*c0909341SAndroid Build Coastguard Worker%endif 5185*c0909341SAndroid Build Coastguard Worker%if %1 < 8 5186*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m11, m%1 5187*c0909341SAndroid Build Coastguard Worker pmulhrsw m9, m11, m%2 5188*c0909341SAndroid Build Coastguard Worker%else 5189*c0909341SAndroid Build Coastguard Worker mova m9, [cq+64*%1] 5190*c0909341SAndroid Build Coastguard Worker paddsw m8, m9, m%2 ; out 0+n, 1+n 5191*c0909341SAndroid Build Coastguard Worker psubsw m9, m%2 ; out 63-n, 62-n 5192*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m11 5193*c0909341SAndroid Build Coastguard Worker pmulhrsw m9, m11 5194*c0909341SAndroid Build Coastguard Worker%endif 5195*c0909341SAndroid Build Coastguard Worker mova xm29, [dstq+%%s0] 5196*c0909341SAndroid Build Coastguard Worker vinserti128 ym29, [dstq+%%s1], 1 5197*c0909341SAndroid Build Coastguard Worker mova xm%3, [r6 +%%s3] 5198*c0909341SAndroid Build Coastguard Worker vinserti128 ym%3, [r6 +%%s2], 1 5199*c0909341SAndroid Build Coastguard Worker vpermb m29, m10, m29 5200*c0909341SAndroid Build Coastguard Worker vpermb m%3, m10, m%3 5201*c0909341SAndroid Build Coastguard Worker mova [cq+64*%1], m12 5202*c0909341SAndroid Build Coastguard Worker paddw m29, m8 5203*c0909341SAndroid Build Coastguard Worker paddw m%3, m9 5204*c0909341SAndroid Build Coastguard Worker packuswb m29, m%3 5205*c0909341SAndroid Build Coastguard Worker vpermd m29, m13, m29 5206*c0909341SAndroid Build Coastguard Worker mova [dstq+%%s0], xm29 5207*c0909341SAndroid Build Coastguard Worker vextracti128 [dstq+%%s1], ym29, 1 5208*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r6 +%%s2], m29, 2 
5209*c0909341SAndroid Build Coastguard Worker vextracti32x4 [r6 +%%s3], m29, 3 5210*c0909341SAndroid Build Coastguard Worker%endmacro 5211*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 0, 29, 0 5212*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 1, 28, 28 5213*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 2, 27, 28 5214*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 3, 26, 28 5215*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 4, 25, 28 5216*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 5, 24, 28 5217*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 6, 23, 28 5218*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 7, 22, 28 5219*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 8, 21, 28 5220*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 9, 20, 28 5221*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 10, 19, 28 5222*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 11, 18, 28 5223*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 12, 17, 28 5224*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 13, 16, 28 5225*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 14, 15, 28 5226*c0909341SAndroid Build Coastguard Worker IDCT_16x64_END 15, 14, 28 5227*c0909341SAndroid Build Coastguard Worker RET 5228*c0909341SAndroid Build Coastguard Worker.dconly: 5229*c0909341SAndroid Build Coastguard Worker movsx r6d, word [cq] 5230*c0909341SAndroid Build Coastguard Worker mov [cq], eobd 5231*c0909341SAndroid Build Coastguard Worker or r3d, 64 5232*c0909341SAndroid Build Coastguard Worker imul r6d, 181 5233*c0909341SAndroid Build Coastguard Worker add r6d, 128+512 5234*c0909341SAndroid Build Coastguard Worker sar r6d, 8+2 5235*c0909341SAndroid Build Coastguard Worker jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 5236*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5237*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; bottom 
three-quarters are zero 5238*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_101_4095x8)] 5239*c0909341SAndroid Build Coastguard Worker vpbroadcastd m21, [o(pw_m1474_3822x8)] 5240*c0909341SAndroid Build Coastguard Worker vpbroadcastd m14, [o(pw_897_3996x8)] 5241*c0909341SAndroid Build Coastguard Worker vpbroadcastd m17, [o(pw_m700_4036x8)] 5242*c0909341SAndroid Build Coastguard Worker vpbroadcastd m18, [o(pw_501_4065x8)] 5243*c0909341SAndroid Build Coastguard Worker vpbroadcastd m19, [o(pw_m1092_3948x8)] 5244*c0909341SAndroid Build Coastguard Worker vpbroadcastd m16, [o(pw_1285_3889x8)] 5245*c0909341SAndroid Build Coastguard Worker vpbroadcastd m15, [o(pw_m301_4085x8)] 5246*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m22 ; t32a t63a 5247*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m29 ; t35a t60a 5248*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m26 ; t36a t59a 5249*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m25 ; t39a t56 5250*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m24 ; t40a t55a 5251*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m27 ; t43a t52a 5252*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m28 ; t44a t51a 5253*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m23 ; t47a t48a 5254*c0909341SAndroid Build Coastguard Worker mova m22, m8 5255*c0909341SAndroid Build Coastguard Worker mova m29, m21 5256*c0909341SAndroid Build Coastguard Worker mova m26, m14 5257*c0909341SAndroid Build Coastguard Worker mova m25, m17 5258*c0909341SAndroid Build Coastguard Worker mova m24, m18 5259*c0909341SAndroid Build Coastguard Worker mova m27, m19 5260*c0909341SAndroid Build Coastguard Worker mova m28, m16 5261*c0909341SAndroid Build Coastguard Worker mova m20, m15 5262*c0909341SAndroid Build Coastguard Worker jmp .main_oddhalf2 5263*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5264*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf 
5265*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_101_4095x8)] 5266*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_m2824_2967x8)] 5267*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_1660_3745x8)] 5268*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m1474_3822x8)] 5269*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m8 ; t32a t63a 5270*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_897_3996x8)] 5271*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m9 ; t33a t62a 5272*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_m2191_3461x8)] 5273*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m11 ; t34a t61a 5274*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2359_3349x8)] 5275*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m12 ; t35a t60a 5276*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m700_4036x8)] 5277*c0909341SAndroid Build Coastguard Worker pmulhrsw m26, m8 ; t36a t59a 5278*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_501_4065x8)] 5279*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m9 ; t37a t58a 5280*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_m2520_3229x8)] 5281*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m11 ; t38a t57a 5282*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2019_3564x8)] 5283*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m12 ; t39a t56a 5284*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m1092_3948x8)] 5285*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m8 ; t40a t55a 5286*c0909341SAndroid Build Coastguard Worker vpbroadcastd m8, [o(pw_1285_3889x8)] 5287*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m9 ; t41a t54a 5288*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_m1842_3659x8)] 5289*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m11 ; t42a 
t53a 5290*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2675_3102x8)] 5291*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m12 ; t43a t52a 5292*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m301_4085x8)] 5293*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m8 ; t44a t51a 5294*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m9 ; t45a t50a 5295*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m11 ; t46a t49a 5296*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m12 ; t47a t48a 5297*c0909341SAndroid Build Coastguard Worker psubsw m8, m22, m21 ; t33 t62 5298*c0909341SAndroid Build Coastguard Worker paddsw m22, m21 ; t32 t63 5299*c0909341SAndroid Build Coastguard Worker psubsw m21, m29, m14 ; t34 t61 5300*c0909341SAndroid Build Coastguard Worker paddsw m29, m14 ; t35 t60 5301*c0909341SAndroid Build Coastguard Worker psubsw m14, m26, m17 ; t37 t58 5302*c0909341SAndroid Build Coastguard Worker paddsw m26, m17 ; t36 t59 5303*c0909341SAndroid Build Coastguard Worker psubsw m17, m25, m18 ; t38 t57 5304*c0909341SAndroid Build Coastguard Worker paddsw m25, m18 ; t39 t56 5305*c0909341SAndroid Build Coastguard Worker psubsw m18, m24, m19 ; t41 t54 5306*c0909341SAndroid Build Coastguard Worker paddsw m24, m19 ; t40 t55 5307*c0909341SAndroid Build Coastguard Worker psubsw m19, m27, m16 ; t42 t53 5308*c0909341SAndroid Build Coastguard Worker paddsw m27, m16 ; t43 t52 5309*c0909341SAndroid Build Coastguard Worker psubsw m16, m28, m15 ; t45 t50 5310*c0909341SAndroid Build Coastguard Worker paddsw m28, m15 ; t44 t51 5311*c0909341SAndroid Build Coastguard Worker psubsw m15, m23, m20 ; t46 t49 5312*c0909341SAndroid Build Coastguard Worker paddsw m20, m23 ; t47 t48 5313*c0909341SAndroid Build Coastguard Worker.main_oddhalf2: 5314*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a 5315*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a 
t61a 5316*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a 5317*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a 5318*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a 5319*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a 5320*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a 5321*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a 5322*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_m4017_799)] 5323*c0909341SAndroid Build Coastguard Worker psubsw m23, m25, m26 ; t36a t59a 5324*c0909341SAndroid Build Coastguard Worker paddsw m25, m26 ; t39a t56a 5325*c0909341SAndroid Build Coastguard Worker psubsw m26, m24, m27 ; t43a t52a 5326*c0909341SAndroid Build Coastguard Worker paddsw m27, m24 ; t40a t55a 5327*c0909341SAndroid Build Coastguard Worker psubsw m24, m20, m28 ; t44a t51a 5328*c0909341SAndroid Build Coastguard Worker paddsw m20, m28 ; t47a t48a 5329*c0909341SAndroid Build Coastguard Worker psubsw m28, m8, m21 ; t34 t61 5330*c0909341SAndroid Build Coastguard Worker paddsw m8, m21 ; t33 t62 5331*c0909341SAndroid Build Coastguard Worker psubsw m21, m17, m14 ; t37 t58 5332*c0909341SAndroid Build Coastguard Worker paddsw m17, m14 ; t38 t57 5333*c0909341SAndroid Build Coastguard Worker psubsw m14, m18, m19 ; t42 t53 5334*c0909341SAndroid Build Coastguard Worker paddsw m18, m19 ; t41 t54 5335*c0909341SAndroid Build Coastguard Worker psubsw m19, m15, m16 ; t45 t50 5336*c0909341SAndroid Build Coastguard Worker paddsw m15, m16 ; t46 t49 5337*c0909341SAndroid Build Coastguard Worker psubsw m16, m22, m29 ; t35a t60a 5338*c0909341SAndroid Build Coastguard Worker paddsw m22, m29 ; t32a t63a 5339*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 
20 ; t35 t60 5340*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a 5341*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59 5342*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a 5343*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_m2276_3406)] 5344*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52 5345*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a 5346*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51 5347*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a 5348*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_1567_3784)] 5349*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m3784_1567)] 5350*c0909341SAndroid Build Coastguard Worker psubsw m29, m22, m25 ; t39 t56 5351*c0909341SAndroid Build Coastguard Worker paddsw m22, m25 ; t32 t63 5352*c0909341SAndroid Build Coastguard Worker psubsw m25, m20, m27 ; t40 t55 5353*c0909341SAndroid Build Coastguard Worker paddsw m20, m27 ; t47 t48 5354*c0909341SAndroid Build Coastguard Worker psubsw m27, m8, m17 ; t38a t57a 5355*c0909341SAndroid Build Coastguard Worker paddsw m8, m17 ; t33a t62a 5356*c0909341SAndroid Build Coastguard Worker psubsw m17, m15, m18 ; t41a t54a 5357*c0909341SAndroid Build Coastguard Worker paddsw m15, m18 ; t46a t49a 5358*c0909341SAndroid Build Coastguard Worker paddsw m18, m16, m23 ; t35a t60a 5359*c0909341SAndroid Build Coastguard Worker psubsw m16, m23 ; t36a t59a 5360*c0909341SAndroid Build Coastguard Worker psubsw m23, m24, m26 ; t43a t52a 5361*c0909341SAndroid Build Coastguard Worker paddsw m24, m26 ; t44a t51a 5362*c0909341SAndroid Build Coastguard Worker paddsw m26, m28, m21 ; t34 t61 
5363*c0909341SAndroid Build Coastguard Worker psubsw m28, m21 ; t37 t58 5364*c0909341SAndroid Build Coastguard Worker psubsw m21, m19, m14 ; t42 t53 5365*c0909341SAndroid Build Coastguard Worker paddsw m19, m14 ; t45 t50 5366*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a 5367*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57 5368*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59 5369*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a 5370*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_m1567_m3784)] 5371*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a 5372*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54 5373*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52 5374*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a 5375*c0909341SAndroid Build Coastguard Worker vbroadcasti32x4 m13, [o(deint_shuf)] 5376*c0909341SAndroid Build Coastguard Worker vpbroadcastd m11, [o(pw_2896_2896)] 5377*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_m2896_2896)] 5378*c0909341SAndroid Build Coastguard Worker paddsw m14, m22, m20 ; t32a t63a 5379*c0909341SAndroid Build Coastguard Worker psubsw m22, m20 ; t47a t48a 5380*c0909341SAndroid Build Coastguard Worker psubsw m20, m8, m15 ; t46 t49 5381*c0909341SAndroid Build Coastguard Worker paddsw m8, m15 ; t33 t62 5382*c0909341SAndroid Build Coastguard Worker paddsw m15, m18, m24 ; t35 t60 5383*c0909341SAndroid Build Coastguard Worker psubsw m18, m24 ; t44 t51 5384*c0909341SAndroid Build Coastguard Worker psubsw m24, m26, m19 ; t45a t50a 5385*c0909341SAndroid Build Coastguard Worker paddsw m26, m19 ; t34a t61a 5386*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m13}, 
m14, m8, m15, m26 5387*c0909341SAndroid Build Coastguard Worker psubsw m19, m29, m25 ; t40 t55 5388*c0909341SAndroid Build Coastguard Worker paddsw m25, m29 ; t39 t56 5389*c0909341SAndroid Build Coastguard Worker psubsw m29, m27, m17 ; t41a t54a 5390*c0909341SAndroid Build Coastguard Worker paddsw m27, m17 ; t38a t57a 5391*c0909341SAndroid Build Coastguard Worker psubsw m17, m16, m23 ; t43a t52a 5392*c0909341SAndroid Build Coastguard Worker paddsw m16, m23 ; t36a t59a 5393*c0909341SAndroid Build Coastguard Worker psubsw m9, m28, m21 ; t42 t53 5394*c0909341SAndroid Build Coastguard Worker paddsw m28, m21 ; t37 t58 5395*c0909341SAndroid Build Coastguard Worker REPX {pshufb x, m13}, m25, m27, m16, m28 5396*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48 5397*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a 5398*c0909341SAndroid Build Coastguard Worker packssdw m21, m22 ; t47 t46a 5399*c0909341SAndroid Build Coastguard Worker packssdw m13, m23 ; t48 t49a 5400*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a 5401*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50 5402*c0909341SAndroid Build Coastguard Worker packssdw m20, m18 ; t44a t45 5403*c0909341SAndroid Build Coastguard Worker packssdw m22, m23 ; t51a t50 5404*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a 5405*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54 5406*c0909341SAndroid Build Coastguard Worker packssdw m18, m19 ; t40a t41 5407*c0909341SAndroid Build Coastguard Worker packssdw m24, m23 ; t55a t54 5408*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52 5409*c0909341SAndroid Build Coastguard Worker ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a 5410*c0909341SAndroid Build Coastguard Worker packssdw 
m19, m17 ; t43 t42a 5411*c0909341SAndroid Build Coastguard Worker packssdw m23, m29 ; t52 t53a 5412*c0909341SAndroid Build Coastguard Worker punpcklqdq m17, m25, m27 ; t39 t38a 5413*c0909341SAndroid Build Coastguard Worker punpckhqdq m25, m27 ; t56 t57a 5414*c0909341SAndroid Build Coastguard Worker punpckhqdq m27, m15, m26 ; t60 t61a 5415*c0909341SAndroid Build Coastguard Worker punpcklqdq m15, m26 ; t35 t34a 5416*c0909341SAndroid Build Coastguard Worker punpckhqdq m26, m16, m28 ; t59a t58 5417*c0909341SAndroid Build Coastguard Worker punpcklqdq m16, m28 ; t36a t37 5418*c0909341SAndroid Build Coastguard Worker punpckhqdq m28, m14, m8 ; t63a t62 5419*c0909341SAndroid Build Coastguard Worker punpcklqdq m14, m8 ; t32a t33 5420*c0909341SAndroid Build Coastguard Worker psubsw m29, m0, m28 ; out63 out62 5421*c0909341SAndroid Build Coastguard Worker paddsw m0, m28 ; out0 out1 5422*c0909341SAndroid Build Coastguard Worker psubsw m28, m1, m27 ; out60 out61 5423*c0909341SAndroid Build Coastguard Worker paddsw m1, m27 ; out3 out2 5424*c0909341SAndroid Build Coastguard Worker psubsw m27, m2, m26 ; out59 out58 5425*c0909341SAndroid Build Coastguard Worker paddsw m2, m26 ; out4 out5 5426*c0909341SAndroid Build Coastguard Worker psubsw m26, m3, m25 ; out56 out57 5427*c0909341SAndroid Build Coastguard Worker paddsw m3, m25 ; out7 out6 5428*c0909341SAndroid Build Coastguard Worker psubsw m25, m4, m24 ; out55 out54 5429*c0909341SAndroid Build Coastguard Worker paddsw m4, m24 ; out8 out9 5430*c0909341SAndroid Build Coastguard Worker psubsw m24, m5, m23 ; out52 out53 5431*c0909341SAndroid Build Coastguard Worker paddsw m5, m23 ; out11 out10 5432*c0909341SAndroid Build Coastguard Worker psubsw m23, m6, m22 ; out51 out50 5433*c0909341SAndroid Build Coastguard Worker paddsw m6, m22 ; out12 out13 5434*c0909341SAndroid Build Coastguard Worker psubsw m22, m7, m13 ; out48 out49 5435*c0909341SAndroid Build Coastguard Worker paddsw m7, m13 ; out15 out14 5436*c0909341SAndroid Build Coastguard 
Worker ret 5437*c0909341SAndroid Build Coastguard Worker 5438*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob 5439*c0909341SAndroid Build Coastguard Worker%undef cmp 5440*c0909341SAndroid Build Coastguard Worker lea r5, [o_base] 5441*c0909341SAndroid Build Coastguard Worker test eobd, eobd 5442*c0909341SAndroid Build Coastguard Worker jnz .normal 5443*c0909341SAndroid Build Coastguard Worker movsx r6d, word [cq] 5444*c0909341SAndroid Build Coastguard Worker mov [cq], eobd 5445*c0909341SAndroid Build Coastguard Worker or r3d, 16 5446*c0909341SAndroid Build Coastguard Worker.dconly: 5447*c0909341SAndroid Build Coastguard Worker imul r6d, 181 5448*c0909341SAndroid Build Coastguard Worker add r6d, 128+512 5449*c0909341SAndroid Build Coastguard Worker sar r6d, 8+2 5450*c0909341SAndroid Build Coastguard Worker.dconly2: 5451*c0909341SAndroid Build Coastguard Worker imul r6d, 181 5452*c0909341SAndroid Build Coastguard Worker add r6d, 128+2048 5453*c0909341SAndroid Build Coastguard Worker sar r6d, 8+4 5454*c0909341SAndroid Build Coastguard Worker pxor m2, m2 5455*c0909341SAndroid Build Coastguard Worker vpbroadcastw m3, r6d 5456*c0909341SAndroid Build Coastguard Worker.dconly_loop: 5457*c0909341SAndroid Build Coastguard Worker mova m1, [dstq] 5458*c0909341SAndroid Build Coastguard Worker punpcklbw m0, m1, m2 5459*c0909341SAndroid Build Coastguard Worker punpckhbw m1, m2 5460*c0909341SAndroid Build Coastguard Worker paddw m0, m3 5461*c0909341SAndroid Build Coastguard Worker paddw m1, m3 5462*c0909341SAndroid Build Coastguard Worker packuswb m0, m1 5463*c0909341SAndroid Build Coastguard Worker mova [dstq], m0 5464*c0909341SAndroid Build Coastguard Worker add dstq, strideq 5465*c0909341SAndroid Build Coastguard Worker dec r3d 5466*c0909341SAndroid Build Coastguard Worker jg .dconly_loop 5467*c0909341SAndroid Build Coastguard Worker RET 5468*c0909341SAndroid Build Coastguard Worker.normal: 5469*c0909341SAndroid Build 
Coastguard Worker WIN64_SPILL_XMM 31 5470*c0909341SAndroid Build Coastguard Worker mova m19, [o(dup16_perm)] 5471*c0909341SAndroid Build Coastguard Worker mova m24, [cq+64* 2] 5472*c0909341SAndroid Build Coastguard Worker mova m28, [cq+64* 6] 5473*c0909341SAndroid Build Coastguard Worker mova m26, [cq+64* 4] 5474*c0909341SAndroid Build Coastguard Worker mova m22, [cq+64* 0] 5475*c0909341SAndroid Build Coastguard Worker mova m23, [cq+64* 1] 5476*c0909341SAndroid Build Coastguard Worker mova m29, [cq+64* 7] 5477*c0909341SAndroid Build Coastguard Worker mova m27, [cq+64* 5] 5478*c0909341SAndroid Build Coastguard Worker mova m25, [cq+64* 3] 5479*c0909341SAndroid Build Coastguard Worker vpermb m8, m19, m24 ; 4 5480*c0909341SAndroid Build Coastguard Worker vpermb m1, m19, m28 ; 12 5481*c0909341SAndroid Build Coastguard Worker vpermb m7, m19, m26 ; 8 5482*c0909341SAndroid Build Coastguard Worker vpermb m9, m19, m22 ; __ 0 5483*c0909341SAndroid Build Coastguard Worker vpermb m21, m19, m23 ; 2 5484*c0909341SAndroid Build Coastguard Worker vpermb m15, m19, m29 ; 14 5485*c0909341SAndroid Build Coastguard Worker vpermb m18, m19, m27 ; 10 5486*c0909341SAndroid Build Coastguard Worker vpermb m14, m19, m25 ; 6 5487*c0909341SAndroid Build Coastguard Worker pslld m9, 16 5488*c0909341SAndroid Build Coastguard Worker vpord m30, m19, [o(pb_32)] {1to16} 5489*c0909341SAndroid Build Coastguard Worker REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23 5490*c0909341SAndroid Build Coastguard Worker cmp eobd, 151 5491*c0909341SAndroid Build Coastguard Worker jb .fast 5492*c0909341SAndroid Build Coastguard Worker vpermb m0, m19, [cq+64*14] ; 28 5493*c0909341SAndroid Build Coastguard Worker vpermb m5, m19, [cq+64*10] ; 20 5494*c0909341SAndroid Build Coastguard Worker vpermb m3, m19, [cq+64*12] ; 24 5495*c0909341SAndroid Build Coastguard Worker vpermb m6, m19, [cq+64* 8] ; __ 16 5496*c0909341SAndroid Build Coastguard Worker pslld m6, 16 5497*c0909341SAndroid Build Coastguard Worker 
call m(idct_16x16_internal_8bpc).main_fast 5498*c0909341SAndroid Build Coastguard Worker vpermb m17, m19, [cq+64*15] ; 30 5499*c0909341SAndroid Build Coastguard Worker vpermb m20, m19, [cq+64* 9] ; 18 5500*c0909341SAndroid Build Coastguard Worker vpermb m16, m19, [cq+64*11] ; 22 5501*c0909341SAndroid Build Coastguard Worker vpermb m19, m19, [cq+64*13] ; 26 5502*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 5503*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 5504*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m15 5505*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m16 5506*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m17 5507*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m18 5508*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m19 5509*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m20 5510*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m21 5511*c0909341SAndroid Build Coastguard Worker vpermb m21, m30, [cq+64*15] 5512*c0909341SAndroid Build Coastguard Worker vpermb m14, m30, [cq+64* 8] 5513*c0909341SAndroid Build Coastguard Worker vpermb m17, m30, [cq+64*11] 5514*c0909341SAndroid Build Coastguard Worker vpermb m18, m30, [cq+64*12] 5515*c0909341SAndroid Build Coastguard Worker vpermb m19, m30, [cq+64*13] 5516*c0909341SAndroid Build Coastguard Worker vpermb m16, m30, [cq+64*10] 5517*c0909341SAndroid Build Coastguard Worker vpermb m15, m30, [cq+64* 9] 5518*c0909341SAndroid Build Coastguard Worker vpermb m20, m30, [cq+64*14] 5519*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf 5520*c0909341SAndroid Build Coastguard Worker jmp .end 5521*c0909341SAndroid Build Coastguard Worker.fast: ; bottom half is zero 5522*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main_fast2 5523*c0909341SAndroid Build Coastguard Worker call 
m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 5524*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 5525*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m15 5526*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m16 5527*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m17 5528*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m18 5529*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m19 5530*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m20 5531*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m21 5532*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 5533*c0909341SAndroid Build Coastguard Worker.end: 5534*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m4 5535*c0909341SAndroid Build Coastguard Worker mova [cq+64* 9], m5 5536*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m6 5537*c0909341SAndroid Build Coastguard Worker mova [cq+64*11], m7 5538*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m26 5539*c0909341SAndroid Build Coastguard Worker mova [cq+64*13], m27 5540*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m28 5541*c0909341SAndroid Build Coastguard Worker mova [cq+64*15], m29 5542*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [o(pw_8192)] 5543*c0909341SAndroid Build Coastguard Worker call .pass1_end 5544*c0909341SAndroid Build Coastguard Worker call .pass2 5545*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m0 5546*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m1 5547*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m2 5548*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m3 5549*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m4 5550*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m5 5551*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m6 5552*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], 
m7 5553*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m13, [cq+64* 8] 5554*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m13, [cq+64* 9] 5555*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m13, [cq+64*10] 5556*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m13, [cq+64*11] 5557*c0909341SAndroid Build Coastguard Worker vpbroadcastd m30, [o(pw_2048)] 5558*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m13, m22 5559*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m13, m23 5560*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m13, m24 5561*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m13, m25 5562*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m30, m14 5563*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m13, m26 5564*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m30, m15 5565*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m13, m27 5566*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m30, m16 5567*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m13, m28 5568*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m30, m17 5569*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m13, m29 5570*c0909341SAndroid Build Coastguard Worker pmulhrsw m26, m30, m18 5571*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m13, [cq+64*12] 5572*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m30, m19 5573*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m13, [cq+64*13] 5574*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m30, m20 5575*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m13, [cq+64*14] 5576*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m30, m21 5577*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m13, [cq+64*15] 5578*c0909341SAndroid Build Coastguard Worker call .transpose_round 5579*c0909341SAndroid Build Coastguard Worker call .pass2 5580*c0909341SAndroid Build Coastguard Worker pxor m10, m10 5581*c0909341SAndroid Build 
Coastguard Worker lea r3, [strideq*3] 5582*c0909341SAndroid Build Coastguard Worker%macro IDCT_64x16_END 4 5583*c0909341SAndroid Build Coastguard Worker mova m9, [dstq+%4] 5584*c0909341SAndroid Build Coastguard Worker%if %1 < 8 5585*c0909341SAndroid Build Coastguard Worker pmulhrsw m%3, m30, [cq+64*%1] 5586*c0909341SAndroid Build Coastguard Worker%endif 5587*c0909341SAndroid Build Coastguard Worker pmulhrsw m%2, m30 5588*c0909341SAndroid Build Coastguard Worker mova [cq+64*%1], m10 5589*c0909341SAndroid Build Coastguard Worker punpcklbw m8, m9, m10 5590*c0909341SAndroid Build Coastguard Worker punpckhbw m9, m10 5591*c0909341SAndroid Build Coastguard Worker paddw m8, m%3 5592*c0909341SAndroid Build Coastguard Worker paddw m9, m%2 5593*c0909341SAndroid Build Coastguard Worker packuswb m8, m9 5594*c0909341SAndroid Build Coastguard Worker mova [dstq+%4], m8 5595*c0909341SAndroid Build Coastguard Worker%if %1 == 3 || %1 == 7 || %1 == 11 5596*c0909341SAndroid Build Coastguard Worker lea dstq, [dstq+strideq*4] 5597*c0909341SAndroid Build Coastguard Worker%endif 5598*c0909341SAndroid Build Coastguard Worker%endmacro 5599*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 0, 0, 11, strideq*0 5600*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 1, 1, 11, strideq*1 5601*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 2, 2, 11, strideq*2 5602*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 3, 3, 11, r3 5603*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 4, 4, 11, strideq*0 5604*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 5, 5, 11, strideq*1 5605*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 6, 6, 11, strideq*2 5606*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 7, 7, 11, r3 5607*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 8, 14, 22, strideq*0 5608*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 9, 15, 23, strideq*1 5609*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 10, 16, 24, 
strideq*2 5610*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 11, 17, 25, r3 5611*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 12, 18, 26, strideq*0 5612*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 13, 19, 27, strideq*1 5613*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 14, 20, 28, strideq*2 5614*c0909341SAndroid Build Coastguard Worker IDCT_64x16_END 15, 21, 29, r3 5615*c0909341SAndroid Build Coastguard Worker RET 5616*c0909341SAndroid Build Coastguard WorkerALIGN function_align 5617*c0909341SAndroid Build Coastguard Worker.pass1_end: 5618*c0909341SAndroid Build Coastguard Worker mova m4, [cq+64* 0] 5619*c0909341SAndroid Build Coastguard Worker mova m5, [cq+64* 1] 5620*c0909341SAndroid Build Coastguard Worker mova m6, [cq+64* 2] 5621*c0909341SAndroid Build Coastguard Worker mova m7, [cq+64* 3] 5622*c0909341SAndroid Build Coastguard Worker mova m8, [cq+64* 4] 5623*c0909341SAndroid Build Coastguard Worker mova m9, [cq+64* 5] 5624*c0909341SAndroid Build Coastguard Worker mova m11, [cq+64* 6] 5625*c0909341SAndroid Build Coastguard Worker mova m12, [cq+64* 7] 5626*c0909341SAndroid Build Coastguard Worker psubsw m29, m4, m21 ; out47 out46 5627*c0909341SAndroid Build Coastguard Worker paddsw m4, m21 ; out16 out17 5628*c0909341SAndroid Build Coastguard Worker psubsw m28, m5, m20 ; out44 out45 5629*c0909341SAndroid Build Coastguard Worker paddsw m5, m20 ; out19 out18 5630*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m13}, m0, m1, m2, m3 5631*c0909341SAndroid Build Coastguard Worker psubsw m27, m6, m19 ; out43 out42 5632*c0909341SAndroid Build Coastguard Worker paddsw m6, m19 ; out20 out21 5633*c0909341SAndroid Build Coastguard Worker psubsw m26, m7, m18 ; out40 out41 5634*c0909341SAndroid Build Coastguard Worker paddsw m7, m18 ; out23 out22 5635*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m13, m22 5636*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m13, m23 5637*c0909341SAndroid Build Coastguard 
Worker pmulhrsw m20, m13, m24 5638*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m13, m25 5639*c0909341SAndroid Build Coastguard Worker paddsw m25, m12, m14 ; out31 out30 5640*c0909341SAndroid Build Coastguard Worker psubsw m14, m12, m14 ; out32 out33 5641*c0909341SAndroid Build Coastguard Worker paddsw m24, m11, m15 ; out28 out29 5642*c0909341SAndroid Build Coastguard Worker psubsw m15, m11, m15 ; out35 out34 5643*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m13}, m4, m5, m6, m7 5644*c0909341SAndroid Build Coastguard Worker paddsw m23, m9, m16 ; out27 out26 5645*c0909341SAndroid Build Coastguard Worker psubsw m16, m9, m16 ; out36 out37 5646*c0909341SAndroid Build Coastguard Worker paddsw m22, m8, m17 ; out24 out25 5647*c0909341SAndroid Build Coastguard Worker psubsw m17, m8, m17 ; out39 out38 5648*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m13}, m14, m15, m16, m17 5649*c0909341SAndroid Build Coastguard Worker.transpose_round: 5650*c0909341SAndroid Build Coastguard Worker%macro TRANSPOSE_8x4_PACKED 4 5651*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3 5652*c0909341SAndroid Build Coastguard Worker punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3 5653*c0909341SAndroid Build Coastguard Worker punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3 5654*c0909341SAndroid Build Coastguard Worker punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3 5655*c0909341SAndroid Build Coastguard Worker punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3 5656*c0909341SAndroid Build Coastguard Worker punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1 5657*c0909341SAndroid Build Coastguard Worker punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3 5658*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1 5659*c0909341SAndroid Build Coastguard Worker punpcklwd m%3, m%4, m%2 ; 2 5660*c0909341SAndroid Build Coastguard Worker punpckhwd m%4, m%2 ; 3 5661*c0909341SAndroid Build Coastguard Worker punpckhwd 
m%2, m%1, m8 ; 1 5662*c0909341SAndroid Build Coastguard Worker punpcklwd m%1, m8 ; 0 5663*c0909341SAndroid Build Coastguard Worker%endmacro 5664*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_PACKED 0, 1, 2, 3 5665*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_PACKED 18, 19, 20, 21 5666*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_PACKED 4, 5, 6, 7 5667*c0909341SAndroid Build Coastguard Worker TRANSPOSE_8x4_PACKED 14, 15, 16, 17 5668*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03 5669*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01 5670*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13 5671*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11 5672*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23 5673*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21 5674*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33 5675*c0909341SAndroid Build Coastguard Worker vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31 5676*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03 5677*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01 5678*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13 5679*c0909341SAndroid Build Coastguard Worker vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11 5680*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23 5681*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21 5682*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33 5683*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 5684*c0909341SAndroid Build Coastguard Worker ret 
 ; .pass2 -- lane regrouping + transform helper, called twice from the code above
 ; (presumably the 64x16 inverse DCT; its cglobal line lies outside this chunk).
 ; In:   m0-m6, m8, m9, m11, m15-m20 -- exactly the 16 registers written by the
 ;       vshufi32x4/vinserti32x8 sequence at the end of .transpose_round, which
 ;       runs immediately before both visible calls to .pass2.
 ; Body: each vshufi32x4 takes the even-numbered (q2020) or odd-numbered (q3131)
 ;       128-bit lanes of a register pair. The number in each trailing comment
 ;       is the index of the vector produced (presumably the input index expected
 ;       by the routines called at the end -- confirm against their definitions).
 ;       Resulting layout: 0-7  -> m0, m14, m1, m15, m2, m16, m3, m17
 ;                         8-15 -> m4, m18, m5, m19, m6, m20, m7, m21
 ; Out:  calls the 32x8 .main2 routine, then tail-jumps (jmp, not call) to the
 ;       32x16 .main_oddhalf routine, so that routine's ret returns directly to
 ;       the caller of .pass2.
 5685*c0909341SAndroid Build Coastguard Worker.pass2: 5686*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m5, m19, q3131 ; 14 5687*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m19, q2020 ; 10 5688*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m6, m20, q3131 ; 15 5689*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m6, m20, q2020 ; 11 5690*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m4, m18, q3131 ; 13 5691*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m4, m18, q2020 ; 9 5692*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m8, m2, q3131 ; 12 5693*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m8, m2, q2020 ; 8 5694*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m0, m3, q3131 ; 4 5695*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m3, q2020 ; 0 5696*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m16, q3131 ; 6 5697*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m16, q2020 ; 2 5698*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m9, m15, q3131 ; 5 5699*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m9, m15, q2020 ; 1 5700*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m11, m17, q2020 ; 3 5701*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m11, m17, q3131 ; 7 5702*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 5703*c0909341SAndroid Build Coastguard Worker jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 5704*c0909341SAndroid Build Coastguard Worker 5705*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob 5706*c0909341SAndroid Build Coastguard Worker lea r5, [o_base] 5707*c0909341SAndroid Build Coastguard Worker test eobd, eobd 5708*c0909341SAndroid Build Coastguard Worker jz .dconly 5709*c0909341SAndroid Build Coastguard Worker PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob 5710*c0909341SAndroid Build Coastguard
Worker vpbroadcastd m23, [o(pw_2896x8)] 5711*c0909341SAndroid Build Coastguard Worker%undef cmp 5712*c0909341SAndroid Build Coastguard Worker cmp eobd, 136 5713*c0909341SAndroid Build Coastguard Worker jb .fast 5714*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m23, [cq+64*20] 5715*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m23, [cq+64*12] 5716*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m23, [cq+64* 4] 5717*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m23, [cq+64*28] 5718*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m23, [cq+64* 8] 5719*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m23, [cq+64*24] 5720*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m23, [cq+64* 0] 5721*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m23, [cq+64*16] 5722*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main 5723*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m23, [cq+64* 2] 5724*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m23, [cq+64*30] 5725*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m23, [cq+64*18] 5726*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m23, [cq+64*14] 5727*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m23, [cq+64*10] 5728*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m23, [cq+64*22] 5729*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m23, [cq+64*26] 5730*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m23, [cq+64* 6] 5731*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 5732*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 5733*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m15 5734*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m16 5735*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m17 5736*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m18 5737*c0909341SAndroid Build Coastguard Worker mova 
[cq+64*10], m19 5738*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m20 5739*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m21 5740*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m23, [cq+64* 1] 5741*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m23, [cq+64*31] 5742*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m23, [cq+64*17] 5743*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m23, [cq+64*15] 5744*c0909341SAndroid Build Coastguard Worker pmulhrsw m26, m23, [cq+64* 9] 5745*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m23, [cq+64*23] 5746*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m23, [cq+64*25] 5747*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m23, [cq+64* 7] 5748*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m23, [cq+64* 5] 5749*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m23, [cq+64*27] 5750*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m23, [cq+64*21] 5751*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m23, [cq+64*11] 5752*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m23, [cq+64*13] 5753*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m23, [cq+64*19] 5754*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m23, [cq+64*29] 5755*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, [cq+64* 3] 5756*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 5757*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_16384)] 5758*c0909341SAndroid Build Coastguard Worker psubsw m13, m0, m29 ; 31 5759*c0909341SAndroid Build Coastguard Worker paddsw m0, m29 ; 0 5760*c0909341SAndroid Build Coastguard Worker psubsw m29, m1, m28 ; 30 5761*c0909341SAndroid Build Coastguard Worker paddsw m1, m28 ; 1 5762*c0909341SAndroid Build Coastguard Worker psubsw m28, m2, m27 ; 29 5763*c0909341SAndroid Build Coastguard Worker paddsw m2, m27 ; 2 5764*c0909341SAndroid Build Coastguard Worker psubsw 
m27, m3, m26 ; 28 5765*c0909341SAndroid Build Coastguard Worker paddsw m3, m26 ; 3 5766*c0909341SAndroid Build Coastguard Worker psubsw m26, m4, m25 ; 27 5767*c0909341SAndroid Build Coastguard Worker paddsw m4, m25 ; 4 5768*c0909341SAndroid Build Coastguard Worker psubsw m25, m5, m24 ; 26 5769*c0909341SAndroid Build Coastguard Worker paddsw m5, m24 ; 5 5770*c0909341SAndroid Build Coastguard Worker psubsw m24, m6, m23 ; 25 5771*c0909341SAndroid Build Coastguard Worker paddsw m6, m23 ; 6 5772*c0909341SAndroid Build Coastguard Worker psubsw m23, m7, m22 ; 24 5773*c0909341SAndroid Build Coastguard Worker paddsw m7, m22 ; 7 5774*c0909341SAndroid Build Coastguard Worker pxor m9, m9 5775*c0909341SAndroid Build Coastguard Worker punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 5776*c0909341SAndroid Build Coastguard Worker punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 5777*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 5778*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 5779*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 5780*c0909341SAndroid Build Coastguard Worker punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 5781*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 5782*c0909341SAndroid Build Coastguard Worker punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 5783*c0909341SAndroid Build Coastguard Worker punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 5784*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 5785*c0909341SAndroid Build Coastguard Worker punpckhwd m3, m23, m24 5786*c0909341SAndroid Build Coastguard Worker punpcklwd m23, m24 5787*c0909341SAndroid Build Coastguard Worker punpckhwd m24, m25, m26 5788*c0909341SAndroid Build Coastguard Worker punpcklwd m25, m26 5789*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 5790*c0909341SAndroid Build Coastguard Worker 
punpckhwd m26, m27, m28 5791*c0909341SAndroid Build Coastguard Worker punpcklwd m27, m28 5792*c0909341SAndroid Build Coastguard Worker punpckhwd m28, m29, m13 5793*c0909341SAndroid Build Coastguard Worker punpcklwd m29, m13 5794*c0909341SAndroid Build Coastguard Worker REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 5795*c0909341SAndroid Build Coastguard Worker punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 5796*c0909341SAndroid Build Coastguard Worker punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 5797*c0909341SAndroid Build Coastguard Worker punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 5798*c0909341SAndroid Build Coastguard Worker punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 5799*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m7, m0, m2, m4 5800*c0909341SAndroid Build Coastguard Worker punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 5801*c0909341SAndroid Build Coastguard Worker punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 5802*c0909341SAndroid Build Coastguard Worker punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 5803*c0909341SAndroid Build Coastguard Worker punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5 5804*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m6, m8, m1, m22 5805*c0909341SAndroid Build Coastguard Worker punpckhdq m13, m23, m25 5806*c0909341SAndroid Build Coastguard Worker punpckldq m23, m25 5807*c0909341SAndroid Build Coastguard Worker punpckhdq m25, m27, m29 5808*c0909341SAndroid Build Coastguard Worker punpckldq m27, m29 5809*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m13, m23, m25, m27 5810*c0909341SAndroid Build Coastguard Worker punpckhdq m9, m3, m24 5811*c0909341SAndroid Build Coastguard Worker punpckldq m3, m24 5812*c0909341SAndroid Build Coastguard Worker punpckhdq m24, m26, m28 5813*c0909341SAndroid Build Coastguard Worker punpckldq m26, m28 5814*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m9, m3, m24, m26 5815*c0909341SAndroid Build Coastguard Worker punpckhqdq m5, m23, m27 ; 
d01 d09 d17 d25 5816*c0909341SAndroid Build Coastguard Worker punpcklqdq m23, m27 ; d00 d08 d16 d24 5817*c0909341SAndroid Build Coastguard Worker punpcklqdq m27, m13, m25 ; d02 d10 d18 d26 5818*c0909341SAndroid Build Coastguard Worker punpckhqdq m13, m25 ; d03 d11 d19 d27 5819*c0909341SAndroid Build Coastguard Worker punpcklqdq m25, m3, m26 ; d04 d12 d20 d28 5820*c0909341SAndroid Build Coastguard Worker punpckhqdq m3, m26 ; d05 d13 d21 d29 5821*c0909341SAndroid Build Coastguard Worker punpcklqdq m26, m9, m24 ; d06 d14 d22 d30 5822*c0909341SAndroid Build Coastguard Worker punpckhqdq m9, m24 ; d07 d15 d23 d31 5823*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m23 5824*c0909341SAndroid Build Coastguard Worker mova [cq+64*13], m27 5825*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m25 5826*c0909341SAndroid Build Coastguard Worker mova [cq+64*15], m26 5827*c0909341SAndroid Build Coastguard Worker punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 5828*c0909341SAndroid Build Coastguard Worker punpcklqdq m8, m22 ; a04 a12 a20 a28 5829*c0909341SAndroid Build Coastguard Worker punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 5830*c0909341SAndroid Build Coastguard Worker punpcklqdq m0, m4 ; a00 a08 a16 a24 5831*c0909341SAndroid Build Coastguard Worker punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 5832*c0909341SAndroid Build Coastguard Worker punpcklqdq m7, m2 ; a02 a10 a18 a26 5833*c0909341SAndroid Build Coastguard Worker punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 5834*c0909341SAndroid Build Coastguard Worker punpcklqdq m6, m1 ; a06 a14 a22 a30 5835*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m0 5836*c0909341SAndroid Build Coastguard Worker mova [cq+64* 9], m7 5837*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m8 5838*c0909341SAndroid Build Coastguard Worker mova [cq+64*11], m6 5839*c0909341SAndroid Build Coastguard Worker mova m2, [cq+64* 0] 5840*c0909341SAndroid Build Coastguard Worker mova m11, [cq+64* 2] 5841*c0909341SAndroid Build Coastguard 
Worker mova m8, [cq+64* 4] 5842*c0909341SAndroid Build Coastguard Worker mova m29, [cq+64* 6] 5843*c0909341SAndroid Build Coastguard Worker mova m27, [cq+64* 8] 5844*c0909341SAndroid Build Coastguard Worker mova m26, [cq+64*10] 5845*c0909341SAndroid Build Coastguard Worker mova m4, [cq+64*12] 5846*c0909341SAndroid Build Coastguard Worker mova m28, [cq+64*14] 5847*c0909341SAndroid Build Coastguard Worker psubsw m1, m2, m21 ; 23 5848*c0909341SAndroid Build Coastguard Worker paddsw m2, m21 ; 8 5849*c0909341SAndroid Build Coastguard Worker psubsw m21, m11, m20 ; 22 5850*c0909341SAndroid Build Coastguard Worker paddsw m11, m20 ; 9 5851*c0909341SAndroid Build Coastguard Worker psubsw m20, m8, m19 ; 21 5852*c0909341SAndroid Build Coastguard Worker paddsw m8, m19 ; 10 5853*c0909341SAndroid Build Coastguard Worker psubsw m19, m29, m18 ; 20 5854*c0909341SAndroid Build Coastguard Worker paddsw m29, m18 ; 11 5855*c0909341SAndroid Build Coastguard Worker psubsw m18, m27, m17 ; 19 5856*c0909341SAndroid Build Coastguard Worker paddsw m27, m17 ; 12 5857*c0909341SAndroid Build Coastguard Worker psubsw m17, m26, m16 ; 18 5858*c0909341SAndroid Build Coastguard Worker paddsw m26, m16 ; 13 5859*c0909341SAndroid Build Coastguard Worker psubsw m16, m4, m15 ; 17 5860*c0909341SAndroid Build Coastguard Worker paddsw m4, m15 ; 14 5861*c0909341SAndroid Build Coastguard Worker psubsw m15, m28, m14 ; 16 5862*c0909341SAndroid Build Coastguard Worker paddsw m28, m14 ; 15 5863*c0909341SAndroid Build Coastguard Worker punpcklwd m14, m15, m16 5864*c0909341SAndroid Build Coastguard Worker punpckhwd m15, m16 5865*c0909341SAndroid Build Coastguard Worker punpckhwd m16, m17, m18 5866*c0909341SAndroid Build Coastguard Worker punpcklwd m17, m18 5867*c0909341SAndroid Build Coastguard Worker punpckhwd m18, m19, m20 5868*c0909341SAndroid Build Coastguard Worker punpcklwd m19, m20 5869*c0909341SAndroid Build Coastguard Worker punpckhwd m20, m21, m1 5870*c0909341SAndroid Build Coastguard Worker punpcklwd m21, 
m1 5871*c0909341SAndroid Build Coastguard Worker punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 5872*c0909341SAndroid Build Coastguard Worker punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3 5873*c0909341SAndroid Build Coastguard Worker punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 5874*c0909341SAndroid Build Coastguard Worker punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 5875*c0909341SAndroid Build Coastguard Worker punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 5876*c0909341SAndroid Build Coastguard Worker punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 5877*c0909341SAndroid Build Coastguard Worker punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 5878*c0909341SAndroid Build Coastguard Worker punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 5879*c0909341SAndroid Build Coastguard Worker punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 5880*c0909341SAndroid Build Coastguard Worker punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 5881*c0909341SAndroid Build Coastguard Worker punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3 5882*c0909341SAndroid Build Coastguard Worker punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1 5883*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m28, m2, m8, m27 5884*c0909341SAndroid Build Coastguard Worker punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 5885*c0909341SAndroid Build Coastguard Worker punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 5886*c0909341SAndroid Build Coastguard Worker punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 5887*c0909341SAndroid Build Coastguard Worker punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 5888*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m4, m1, m11, m29 5889*c0909341SAndroid Build Coastguard Worker punpckhdq m26, m19, m21 5890*c0909341SAndroid Build Coastguard Worker punpckldq m19, m21 5891*c0909341SAndroid Build Coastguard Worker punpckhdq m21, m15, m16 5892*c0909341SAndroid Build Coastguard Worker punpckldq m15, m16 5893*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, 
m26, m19, m21, m15 5894*c0909341SAndroid Build Coastguard Worker punpckhdq m16, m18, m20 5895*c0909341SAndroid Build Coastguard Worker punpckldq m18, m20 5896*c0909341SAndroid Build Coastguard Worker punpckhdq m20, m14, m17 5897*c0909341SAndroid Build Coastguard Worker punpckldq m14, m17 5898*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m16, m18, m20, m14 5899*c0909341SAndroid Build Coastguard Worker punpckhqdq m17, m28, m8 ; b03 b11 b19 b27 5900*c0909341SAndroid Build Coastguard Worker punpcklqdq m28, m8 ; b02 b10 b18 b26 5901*c0909341SAndroid Build Coastguard Worker punpckhqdq m8, m2, m27 ; b01 b09 b17 b25 5902*c0909341SAndroid Build Coastguard Worker punpcklqdq m2, m27 ; b00 b08 b16 b24 5903*c0909341SAndroid Build Coastguard Worker punpcklqdq m27, m1, m29 ; b04 b12 b20 b28 5904*c0909341SAndroid Build Coastguard Worker punpckhqdq m1, m29 ; b05 b13 b21 b29 5905*c0909341SAndroid Build Coastguard Worker punpcklqdq m29, m4, m11 ; b06 b14 b22 b30 5906*c0909341SAndroid Build Coastguard Worker punpckhqdq m4, m11 ; b07 b15 b23 b31 5907*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m2 5908*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m28 5909*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m27 5910*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m29 5911*c0909341SAndroid Build Coastguard Worker punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 5912*c0909341SAndroid Build Coastguard Worker punpcklqdq m20, m26 ; c02 c10 c18 c26 5913*c0909341SAndroid Build Coastguard Worker punpckhqdq m26, m14, m19 ; c01 c09 c17 c25 5914*c0909341SAndroid Build Coastguard Worker punpcklqdq m14, m19 ; c00 c08 c16 c24 5915*c0909341SAndroid Build Coastguard Worker punpckhqdq m28, m15, m18 ; c05 c13 c21 c29 5916*c0909341SAndroid Build Coastguard Worker punpcklqdq m15, m18 ; c04 c12 c20 c28 5917*c0909341SAndroid Build Coastguard Worker punpckhqdq m29, m21, m16 ; c07 c15 c23 c31 5918*c0909341SAndroid Build Coastguard Worker punpcklqdq m21, 
m16 ; c06 c14 c22 c30 5919*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m14 5920*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m20 5921*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m15 5922*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m21 5923*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25 5924*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09 5925*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27 5926*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11 5927*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29 5928*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13 5929*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31 5930*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15 5931*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09 5932*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25 5933*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11 5934*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27 5935*c0909341SAndroid Build Coastguard Worker vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13 5936*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29 5937*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15 5938*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31 5939*c0909341SAndroid Build Coastguard Worker mov r4, rsp 5940*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m22, m19, q2020 ; 1 5941*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m17, m29, q3131 ; 31 
5942*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m14, m26, q2020 ; 17 5943*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m25, m18, q3131 ; 15 5944*c0909341SAndroid Build Coastguard Worker call .main_part1 5945*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m25, m18, q2020 ; 7 5946*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m14, m26, q3131 ; 25 5947*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m17, m29, q2020 ; 23 5948*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m22, m19, q3131 ; 9 5949*c0909341SAndroid Build Coastguard Worker call .main_part1 5950*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m24, m21, q2020 ; 5 5951*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m15, m27, q3131 ; 27 5952*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m16, m28, q2020 ; 21 5953*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m23, m20, q3131 ; 11 5954*c0909341SAndroid Build Coastguard Worker call .main_part1 5955*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m23, m20, q2020 ; 3 5956*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m16, m28, q3131 ; 29 5957*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m15, m27, q2020 ; 19 5958*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m24, m21, q3131 ; 13 5959*c0909341SAndroid Build Coastguard Worker call .main_part1 5960*c0909341SAndroid Build Coastguard Worker call .main_part2 5961*c0909341SAndroid Build Coastguard Worker mova m0, [cq+64* 1] ; a0 5962*c0909341SAndroid Build Coastguard Worker mova m15, [cq+64* 0] ; b0 5963*c0909341SAndroid Build Coastguard Worker mova m3, [cq+64* 2] ; c0 5964*c0909341SAndroid Build Coastguard Worker mova m16, [cq+64* 3] ; d0 5965*c0909341SAndroid Build Coastguard Worker mova m14, [cq+64* 5] ; a4 5966*c0909341SAndroid Build Coastguard Worker mova m8, [cq+64* 4] ; b4 5967*c0909341SAndroid Build Coastguard Worker mova m17, [cq+64* 6] ; c4 5968*c0909341SAndroid Build Coastguard 
Worker mova m1, [cq+64* 7] ; d4 5969*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24 5970*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08 5971*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24 5972*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08 5973*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28 5974*c0909341SAndroid Build Coastguard Worker vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12 5975*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28 5976*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12 5977*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m0, m3, q3131 ; 8 5978*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m3, q2020 ; 0 5979*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m2, m15, q3131 ; 24 5980*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m15, q2020 ; 16 5981*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m14, m17, q3131 ; 12 5982*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m17, q2020 ; 4 5983*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m16, m8, q3131 ; 28 5984*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m8, q2020 ; 20 5985*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 5986*c0909341SAndroid Build Coastguard Worker mova m8, [cq+64* 8] 5987*c0909341SAndroid Build Coastguard Worker mova m9, [cq+64*12] 5988*c0909341SAndroid Build Coastguard Worker mova m11, [cq+64*10] 5989*c0909341SAndroid Build Coastguard Worker mova m12, [cq+64*14] 5990*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 5991*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m15 5992*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m16 5993*c0909341SAndroid Build 
Coastguard Worker mova [cq+64* 6], m17 5994*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m18 5995*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m19 5996*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m20 5997*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m21 5998*c0909341SAndroid Build Coastguard Worker mova m22, [cq+64* 9] 5999*c0909341SAndroid Build Coastguard Worker mova m27, [cq+64*13] 6000*c0909341SAndroid Build Coastguard Worker mova m23, [cq+64*11] 6001*c0909341SAndroid Build Coastguard Worker mova m24, [cq+64*15] 6002*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26 6003*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10 6004*c0909341SAndroid Build Coastguard Worker vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26 6005*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10 6006*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30 6007*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14 6008*c0909341SAndroid Build Coastguard Worker vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30 6009*c0909341SAndroid Build Coastguard Worker vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14 6010*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m26, m8, q3131 ; 26 6011*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m8, q2020 ; 18 6012*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m22, m9, q3131 ; 10 6013*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m9, q2020 ; 2 6014*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m27, m11, q3131 ; 30 6015*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m11, q2020 ; 22 6016*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m23, m12, q3131 ; 14 6017*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m12, q2020 ; 6 6018*c0909341SAndroid Build Coastguard Worker 
call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6019*c0909341SAndroid Build Coastguard Worker jmp .end 6020*c0909341SAndroid Build Coastguard Worker.fast: ; bottom/right halves are zero 6021*c0909341SAndroid Build Coastguard Worker pmulhrsw ym9, ym23, [cq+64* 0] 6022*c0909341SAndroid Build Coastguard Worker pmulhrsw ym6, ym23, [cq+64* 8] 6023*c0909341SAndroid Build Coastguard Worker mova m14, [o(dup16_perm)] 6024*c0909341SAndroid Build Coastguard Worker pmulhrsw ym8, ym23, [cq+64* 2] 6025*c0909341SAndroid Build Coastguard Worker pmulhrsw xm0, xm23, [cq+64*14] 6026*c0909341SAndroid Build Coastguard Worker pmulhrsw xm5, xm23, [cq+64*10] 6027*c0909341SAndroid Build Coastguard Worker pmulhrsw ym1, ym23, [cq+64* 6] 6028*c0909341SAndroid Build Coastguard Worker pmulhrsw ym7, ym23, [cq+64* 4] 6029*c0909341SAndroid Build Coastguard Worker pmulhrsw xm3, xm23, [cq+64*12] 6030*c0909341SAndroid Build Coastguard Worker pmovzxwd m9, ym9 6031*c0909341SAndroid Build Coastguard Worker pmovzxwd m6, ym6 6032*c0909341SAndroid Build Coastguard Worker vpermb m8, m14, m8 6033*c0909341SAndroid Build Coastguard Worker punpcklwd xm0, xm0 6034*c0909341SAndroid Build Coastguard Worker vpermb ym5, ym14, ym5 6035*c0909341SAndroid Build Coastguard Worker vpermb m1, m14, m1 6036*c0909341SAndroid Build Coastguard Worker vpermb m7, m14, m7 6037*c0909341SAndroid Build Coastguard Worker punpcklwd xm3, xm3 6038*c0909341SAndroid Build Coastguard Worker pslld m9, 16 6039*c0909341SAndroid Build Coastguard Worker pslld m6, 16 6040*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main_fast 6041*c0909341SAndroid Build Coastguard Worker vpmulhrsw ym21, ym23, [cq+64* 1] 6042*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which 6043*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to 6044*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym15, ym23, [cq+64* 7] 
; compressed displacements 6045*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym18, ym23, [cq+64* 5] 6046*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm16, xm23, [cq+64*11] 6047*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm19, xm23, [cq+64*13] 6048*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym23, [cq+64* 3] 6049*c0909341SAndroid Build Coastguard Worker vpermb m21, m14, m21 6050*c0909341SAndroid Build Coastguard Worker punpcklwd xm17, xm17 6051*c0909341SAndroid Build Coastguard Worker vpermb ym20, ym14, ym20 6052*c0909341SAndroid Build Coastguard Worker vpermb m15, m14, m15 6053*c0909341SAndroid Build Coastguard Worker vpermb m18, m14, m18 6054*c0909341SAndroid Build Coastguard Worker vpermb ym16, ym14, ym16 6055*c0909341SAndroid Build Coastguard Worker punpcklwd xm19, xm19 6056*c0909341SAndroid Build Coastguard Worker vpermb m14, m14, m23 6057*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast 6058*c0909341SAndroid Build Coastguard Worker vpbroadcastd m9, [o(pw_16384)] 6059*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round 6060*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m0, m3, q2020 ; 0 6061*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m0, m3, q3131 ; 4 6062*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m14, m2, q2020 ; 1 6063*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m2, q3131 ; 5 6064*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m19, m7, q3131 ; 15 6065*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m7, q2020 ; 11 6066*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m17, m9, q2020 ; 3 6067*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m9, q3131 ; 7 6068*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m20, m6, q2020 ; 9 6069*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m6, q3131 ; 13 6070*c0909341SAndroid 
Build Coastguard Worker vshufi32x4 m22, m1, m18, q2020 ; 2 6071*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m1, m18, q3131 ; 6 6072*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m5, m15, q2020 ; 10 6073*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m5, m15, q3131 ; 14 6074*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m21, m4, q3131 ; 12 6075*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m21, m4, q2020 ; 8 6076*c0909341SAndroid Build Coastguard Worker mov r4, rsp 6077*c0909341SAndroid Build Coastguard Worker call .main_part1_fast 6078*c0909341SAndroid Build Coastguard Worker mova m0, m17 6079*c0909341SAndroid Build Coastguard Worker mova m3, m28 6080*c0909341SAndroid Build Coastguard Worker call .main_part1_fast 6081*c0909341SAndroid Build Coastguard Worker mova m0, m14 6082*c0909341SAndroid Build Coastguard Worker mova m3, m19 6083*c0909341SAndroid Build Coastguard Worker call .main_part1_fast 6084*c0909341SAndroid Build Coastguard Worker mova m0, m27 6085*c0909341SAndroid Build Coastguard Worker mova m3, m20 6086*c0909341SAndroid Build Coastguard Worker call .main_part1_fast 6087*c0909341SAndroid Build Coastguard Worker call .main_part2 6088*c0909341SAndroid Build Coastguard Worker mova m0, m16 6089*c0909341SAndroid Build Coastguard Worker mova m1, m21 6090*c0909341SAndroid Build Coastguard Worker mova m14, m26 6091*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 6092*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m21 6093*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 6094*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m17 6095*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m18 6096*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m19 6097*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m16 6098*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m15 6099*c0909341SAndroid Build 
Coastguard Worker mova [cq+64*12], m20 6100*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 6101*c0909341SAndroid Build Coastguard Worker.end: 6102*c0909341SAndroid Build Coastguard Worker lea r4, [strideq*3] 6103*c0909341SAndroid Build Coastguard Worker vpbroadcastd m12, [o(pw_2048)] 6104*c0909341SAndroid Build Coastguard Worker movshdup m13, [o(permD)] 6105*c0909341SAndroid Build Coastguard Worker lea r5, [r4+strideq] ; stride*4 6106*c0909341SAndroid Build Coastguard Worker lea r3, [dstq+r4*8] 6107*c0909341SAndroid Build Coastguard Worker lea r6, [strideq+r5*8] ; stride*33 6108*c0909341SAndroid Build Coastguard Worker lea r8, [r4+r5*8] ; stride*35 6109*c0909341SAndroid Build Coastguard Worker add r3, r5 ; dst+stride*28 6110*c0909341SAndroid Build Coastguard Worker lea r7, [r6+strideq] ; stride*34 6111*c0909341SAndroid Build Coastguard Worker%macro IDCT_32x64_END 6 ; src, mem, stride[1-4] 6112*c0909341SAndroid Build Coastguard Worker%if %2 < 8 6113*c0909341SAndroid Build Coastguard Worker paddsw m10, m%2, m%1 6114*c0909341SAndroid Build Coastguard Worker psubsw m11, m%2, m%1 6115*c0909341SAndroid Build Coastguard Worker%else 6116*c0909341SAndroid Build Coastguard Worker mova m11, [cq+64*(%2*2-16)] 6117*c0909341SAndroid Build Coastguard Worker paddsw m10, m11, m%1 6118*c0909341SAndroid Build Coastguard Worker psubsw m11, m%1 6119*c0909341SAndroid Build Coastguard Worker%endif 6120*c0909341SAndroid Build Coastguard Worker mova m9, [rsp+64*(31-%2)] 6121*c0909341SAndroid Build Coastguard Worker mova m%1, [rsp+64*%2] 6122*c0909341SAndroid Build Coastguard Worker paddsw m8, m10, m9 6123*c0909341SAndroid Build Coastguard Worker psubsw m10, m9 6124*c0909341SAndroid Build Coastguard Worker paddsw m9, m11, m%1 6125*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [dstq+%3] 6126*c0909341SAndroid Build Coastguard Worker psubsw m11, m%1 6127*c0909341SAndroid Build Coastguard Worker pmovzxbw m%1, [r3 +%4] 
6128*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m12}, m8, m10, m9, m11 6129*c0909341SAndroid Build Coastguard Worker paddw m8, m0 6130*c0909341SAndroid Build Coastguard Worker pmovzxbw m0, [r3 +%5] 6131*c0909341SAndroid Build Coastguard Worker paddw m10, m%1 6132*c0909341SAndroid Build Coastguard Worker pmovzxbw m%1, [dstq+%6] 6133*c0909341SAndroid Build Coastguard Worker paddw m9, m0 6134*c0909341SAndroid Build Coastguard Worker paddw m11, m%1 6135*c0909341SAndroid Build Coastguard Worker%if %2 >= 8 6136*c0909341SAndroid Build Coastguard Worker%if %2 == 8 6137*c0909341SAndroid Build Coastguard Worker pxor m1, m1 6138*c0909341SAndroid Build Coastguard Worker%endif 6139*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%2*2-16)], m1 6140*c0909341SAndroid Build Coastguard Worker mova [cq+64*(%2*2-15)], m1 6141*c0909341SAndroid Build Coastguard Worker%endif 6142*c0909341SAndroid Build Coastguard Worker packuswb m8, m10 6143*c0909341SAndroid Build Coastguard Worker packuswb m9, m11 6144*c0909341SAndroid Build Coastguard Worker vpermq m8, m13, m8 6145*c0909341SAndroid Build Coastguard Worker vpermq m9, m13, m9 6146*c0909341SAndroid Build Coastguard Worker mova [dstq+%3], ym8 6147*c0909341SAndroid Build Coastguard Worker vextracti32x8 [r3 +%4], m8, 1 6148*c0909341SAndroid Build Coastguard Worker mova [r3 +%5], ym9 6149*c0909341SAndroid Build Coastguard Worker vextracti32x8 [dstq+%6], m9, 1 6150*c0909341SAndroid Build Coastguard Worker%if %2 == 3 || %2 == 7 || %2 == 11 6151*c0909341SAndroid Build Coastguard Worker add dstq, r5 6152*c0909341SAndroid Build Coastguard Worker sub r3, r5 6153*c0909341SAndroid Build Coastguard Worker%endif 6154*c0909341SAndroid Build Coastguard Worker%endmacro 6155*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8 6156*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6 6157*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 27, 2, strideq*2, 
r6, strideq*1, r7 6158*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8 6159*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8 6160*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6 6161*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7 6162*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8 6163*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8 6164*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6 6165*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7 6166*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8 6167*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8 6168*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6 6169*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7 6170*c0909341SAndroid Build Coastguard Worker IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8 6171*c0909341SAndroid Build Coastguard Worker RET 6172*c0909341SAndroid Build Coastguard Worker.dconly: 6173*c0909341SAndroid Build Coastguard Worker movsx r6d, word [cq] 6174*c0909341SAndroid Build Coastguard Worker mov [cq], eobd 6175*c0909341SAndroid Build Coastguard Worker or r3d, 64 6176*c0909341SAndroid Build Coastguard Worker imul r6d, 181 6177*c0909341SAndroid Build Coastguard Worker add r6d, 128 6178*c0909341SAndroid Build Coastguard Worker sar r6d, 8 6179*c0909341SAndroid Build Coastguard Worker imul r6d, 181 6180*c0909341SAndroid Build Coastguard Worker add r6d, 128+256 6181*c0909341SAndroid Build Coastguard Worker sar r6d, 8+1 6182*c0909341SAndroid Build Coastguard Worker jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 
;-----------------------------------------------------------------------------
; idct64 helper routines (steps 1-5 and the following cross-group butterflies)
;
; Conventions that can be read off the code below:
;   m0-m3  coefficient inputs of the .main_part1* entry points
;          (.main_part1 uses all four, .main_part1_fast uses m0 and m3,
;           .main_part1_fast2 uses m0 only)
;   m10    32-bit accumulator seed for vpdpwssd in .main_part1_fast2; it is
;          also handed to ITX_MULSUB_2W (the caller visible below loads
;          pd_2048 into it)
;   r4     output cursor: every .main_part1* call stores eight 64-byte
;          vectors at [r4+64*0..7] and then advances r4 by 64*8
;   r5     advanced by 4*13 on every .main_part1* call and rewound by 16*13
;          in .main_part2
;          NOTE(review): o() presumably addresses constants relative to r5,
;          so each call would step to the next group of 13 idct64_mul
;          entries - confirm against the o()/o_base definitions.
;   m4-m9, m11, m12 are overwritten by every entry point; .main_part2
;   additionally overwrites m17-m19 and r6.
;-----------------------------------------------------------------------------
ALIGN function_align ; bottom three-quarters are zero
cglobal_label .main_part1_fast2
    ; Only m0 is read; the 2-input rotations are done with vpdpwssd on
    ; interleaved (t32a, t63a) pairs instead of ITX_MULSUB_2W.
    vpbroadcastd         m7, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    pmulhrsw             m7, m0     ; t63a
    pmulhrsw             m0, m8     ; t32a

    punpcklwd            m4, m0, m7
    punpckhwd            m6, m0, m7
    mova                 m1, m10
    vpdpwssd             m1, m4, [o(idct64_mul+4*9)] {bcstd}
    mova                 m9, m10
    vpdpwssd             m9, m6, [o(idct64_mul+4*9)] {bcstd}
    REPX      {psrad x, 12}, m1, m9
    packssdw             m1, m9
    mova                 m9, m10
    vpdpwssd             m9, m6, [o(idct64_mul+4*8)] {bcstd}
    mova                 m6, m10
    vpdpwssd             m6, m4, [o(idct64_mul+4*8)] {bcstd}
    REPX      {psrad x, 12}, m9, m6
    packssdw             m6, m9

    ; Duplicate the four results into the registers .main_part1c expects.
    mova                 m4, m0
    mova                 m3, m7
    mova                 m5, m1
    mova                 m2, m6
    jmp .main_part1c
cglobal_label .main_part1_fast
    ; Only m0 and m3 are read; the first butterfly stage of .main_part1 is
    ; replaced by plain register copies before joining at .main_part1b.
    vpbroadcastd         m1, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    vpbroadcastd         m2, [o(idct64_mul+4*6)]
    vpbroadcastd         m9, [o(idct64_mul+4*7)]
    pmulhrsw             m1, m0     ; t63a
    pmulhrsw             m0, m8     ; t32a
    pmulhrsw             m2, m3     ; t60a
    pmulhrsw             m3, m9     ; t35a
    mova                 m8, m0
    mova                 m7, m1
    mova                 m6, m3
    mova                 m5, m2
    jmp .main_part1b
cglobal_label .main_part1
    ; idct64 steps 1-5:
    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    vpbroadcastd         m7, [o(idct64_mul+4*0)]
    vpbroadcastd         m8, [o(idct64_mul+4*1)]
    vpbroadcastd         m6, [o(idct64_mul+4*2)]
    vpbroadcastd         m9, [o(idct64_mul+4*3)]
    pmulhrsw             m7, m0     ; t63a
    vpbroadcastd         m5, [o(idct64_mul+4*4)]
    pmulhrsw             m0, m8     ; t32a
    vpbroadcastd         m8, [o(idct64_mul+4*5)]
    pmulhrsw             m6, m1     ; t62a
    vpbroadcastd         m4, [o(idct64_mul+4*6)]
    pmulhrsw             m1, m9     ; t33a
    vpbroadcastd         m9, [o(idct64_mul+4*7)]
    pmulhrsw             m5, m2     ; t61a
    pmulhrsw             m2, m8     ; t34a
    pmulhrsw             m4, m3     ; t60a
    pmulhrsw             m3, m9     ; t35a
    psubsw               m8, m0, m1 ; t33
    paddsw               m0, m1     ; t32
    psubsw               m1, m7, m6 ; t62
    paddsw               m7, m6     ; t63
    psubsw               m6, m3, m2 ; t34
    paddsw               m3, m2     ; t35
    psubsw               m2, m4, m5 ; t61
    paddsw               m5, m4     ; t60
.main_part1b:
    ; Join point for .main_part1_fast.
    vpbroadcastd        m11, [o(idct64_mul+4*8)]
    vpbroadcastd        m12, [o(idct64_mul+4*9)]
    ITX_MULSUB_2W         1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
    vpbroadcastd        m11, [o(idct64_mul+4*10)]
    ITX_MULSUB_2W         2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
    psubsw               m4, m0, m3 ; t35a
    paddsw               m0, m3     ; t32a
    psubsw               m3, m7, m5 ; t60a
    paddsw               m7, m5     ; t63a
    psubsw               m5, m1, m2 ; t34
    paddsw               m1, m2     ; t33
    psubsw               m2, m8, m6 ; t61
    paddsw               m6, m8     ; t62
.main_part1c:
    ; Join point for .main_part1_fast2.
    vpbroadcastd        m11, [o(idct64_mul+4*11)]
    vpbroadcastd        m12, [o(idct64_mul+4*12)]
    add                  r5, 4*13   ; 13 dword constants (indices 0-12) used
    ITX_MULSUB_2W         3, 4, 8, 9, 10, 11, 12 ; t35,  t60
    ITX_MULSUB_2W         2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
    mova          [r4+64*0], m0
    mova          [r4+64*7], m7
    mova          [r4+64*1], m1
    mova          [r4+64*6], m6
    mova          [r4+64*3], m3
    mova          [r4+64*4], m4
    mova          [r4+64*2], m2
    mova          [r4+64*5], m5
    add                  r4, 64*8   ; next group of eight output vectors
    ret
cglobal_label .main_part2
    ; Combines, in place, the 32 vectors stored below r4. The offsets
    ; (-64*32 .. -64*8) imply r4 points just past four groups of eight
    ; vectors, which matches the four .main_part1* calls made by the callers
    ; visible in this file. The "-16*13" in the constant addresses and the
    ; "sub r5, 16*13" cancel the four "add r5, 4*13" done by those calls.
    vpbroadcastd        m11, [o(pw_1567_3784  -16*13)]
    vpbroadcastd        m12, [o(pw_m3784_1567 -16*13)]
    lea                  r6, [r4+64*7]
    vpbroadcastd        m17, [o(pw_m1567_m3784-16*13)]
    vpbroadcastd        m18, [o(pw_2896_2896  -16*13)]
    vpbroadcastd        m19, [o(pw_m2896_2896 -16*13)]
    sub                  r5, 16*13
.main_part2_loop:
    ; r4 walks upwards and r6 downwards by one vector per iteration; they
    ; start 7 vectors apart, so the loop body runs four times.
    mova                 m0, [r4-64*32] ; t32a
    mova                 m1, [r6-64*24] ; t39a
    mova                 m2, [r6-64*32] ; t63a
    mova                 m3, [r4-64*24] ; t56a
    mova                 m4, [r4-64*16] ; t40a
    mova                 m5, [r6-64* 8] ; t47a
    mova                 m6, [r6-64*16] ; t55a
    mova                 m7, [r4-64* 8] ; t48a
    psubsw               m8, m0, m1 ; t39
    paddsw               m0, m1     ; t32
    psubsw               m1, m2, m3 ; t56
    paddsw               m2, m3     ; t63
    psubsw               m3, m5, m4 ; t40
    paddsw               m5, m4     ; t47
    psubsw               m4, m7, m6 ; t55
    paddsw               m7, m6     ; t48
    ITX_MULSUB_2W         1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
    ITX_MULSUB_2W         4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
    psubsw               m6, m2, m7 ; t48a
    paddsw               m2, m7     ; t63a
    psubsw               m7, m0, m5 ; t47a
    paddsw               m0, m5     ; t32a
    psubsw               m5, m8, m3 ; t55
    paddsw               m8, m3     ; t56
    psubsw               m3, m1, m4 ; t40
    paddsw               m1, m4     ; t39
    ITX_MULSUB_2W         6, 7, 4, 9, 10, 18, 19 ; t47,  t48
    ITX_MULSUB_2W         5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
    mova         [r6-64* 8], m2
    mova         [r4-64*32], m0
    mova         [r4-64* 8], m8
    mova         [r6-64*32], m1
    mova         [r6-64*24], m6
    mova         [r4-64*16], m7
    mova         [r4-64*24], m5
    mova         [r6-64*16], m3
    add                  r4, 64
    sub                  r6, 64
    cmp                  r4, r6
    jb .main_part2_loop
    ret

cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly                      ; eob == 0: take the DC-only path
    PROLOGUE              0, 7, 30, 64*32, dst, stride, c, eob
    vpbroadcastd        m23, [o(pw_2896x8)]
%undef cmp
    cmp                eobd, 136
    jb .fast                        ; eob < 136: take the reduced path
    pmulhrsw             m0, m23, [cq+64* 1]
    pmulhrsw             m1, m23, [cq+64*31]
    pmulhrsw             m2, m23, [cq+64*17]
    pmulhrsw             m3, m23, [cq+64*15]
    vpbroadcastd        m10, [o(pd_2048)]
Build Coastguard Worker mov r4, rsp 6350*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6351*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m23, [cq+64* 7] 6352*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m23, [cq+64*25] 6353*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m23, [cq+64*23] 6354*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m23, [cq+64* 9] 6355*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6356*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m23, [cq+64* 5] 6357*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m23, [cq+64*27] 6358*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m23, [cq+64*21] 6359*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m23, [cq+64*11] 6360*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6361*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m23, [cq+64* 3] 6362*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m23, [cq+64*29] 6363*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m23, [cq+64*19] 6364*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m23, [cq+64*13] 6365*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 6366*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 6367*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m23, [cq+64*24] 6368*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m23, [cq+64* 8] 6369*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m23, [cq+64*16] 6370*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m23, [cq+64* 0] 6371*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m23, [cq+64* 4] 6372*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m23, [cq+64*28] 6373*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m23, [cq+64*20] 6374*c0909341SAndroid Build Coastguard Worker 
pmulhrsw m15, m23, [cq+64*12] 6375*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 6376*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m23, [cq+64* 2] 6377*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m23, [cq+64*30] 6378*c0909341SAndroid Build Coastguard Worker pmulhrsw m26, m23, [cq+64*18] 6379*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m23, [cq+64*14] 6380*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m23, [cq+64*10] 6381*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m23, [cq+64*22] 6382*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m23, [cq+64*26] 6383*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, [cq+64* 6] 6384*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 6385*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m15 6386*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m16 6387*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m17 6388*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m18 6389*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m19 6390*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m20 6391*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m21 6392*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6393*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [o(pw_16384)] 6394*c0909341SAndroid Build Coastguard Worker call .pass1_end_part1 6395*c0909341SAndroid Build Coastguard Worker mova [cq+64*16], m1 6396*c0909341SAndroid Build Coastguard Worker mova [cq+64*17], m3 6397*c0909341SAndroid Build Coastguard Worker mova [cq+64*18], m5 6398*c0909341SAndroid Build Coastguard Worker mova [cq+64*19], m7 6399*c0909341SAndroid Build Coastguard Worker mova [cq+64*24], m23 6400*c0909341SAndroid Build Coastguard Worker mova [cq+64*25], m25 6401*c0909341SAndroid Build Coastguard Worker mova [cq+64*26], m27 
6402*c0909341SAndroid Build Coastguard Worker mova [cq+64*27], m29 6403*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m13, m0 ; a0 6404*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m13, m2 ; a2 6405*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m13, m4 ; a4 6406*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m13, m6 ; a6 6407*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6 6408*c0909341SAndroid Build Coastguard Worker call .pass1_end_part2 6409*c0909341SAndroid Build Coastguard Worker mova [cq+64*20], m15 6410*c0909341SAndroid Build Coastguard Worker mova [cq+64*21], m17 6411*c0909341SAndroid Build Coastguard Worker mova [cq+64*22], m19 6412*c0909341SAndroid Build Coastguard Worker mova [cq+64*23], m21 6413*c0909341SAndroid Build Coastguard Worker mova [cq+64*28], m1 6414*c0909341SAndroid Build Coastguard Worker mova [cq+64*29], m3 6415*c0909341SAndroid Build Coastguard Worker mova [cq+64*30], m5 6416*c0909341SAndroid Build Coastguard Worker mova [cq+64*31], m7 6417*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 6418*c0909341SAndroid Build Coastguard Worker REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 6419*c0909341SAndroid Build Coastguard Worker vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 6420*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 6421*c0909341SAndroid Build Coastguard Worker vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 6422*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 6423*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 6424*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 6425*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 6426*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m4, q3232 ; e42 e43 g42 
g43 6427*c0909341SAndroid Build Coastguard Worker vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 6428*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 6429*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 6430*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 6431*c0909341SAndroid Build Coastguard Worker vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 6432*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 6433*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 6434*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 6435*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m3, m15, q3131 ; 8 6436*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m3, m15, q2020 ; 0 6437*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m23, m22, q3131 ; 24 6438*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m23, m22, q2020 ; 16 6439*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m18, q3131 ; 12 6440*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m18, q2020 ; 4 6441*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m27, m26, q3131 ; 28 6442*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m27, m26, q2020 ; 20 6443*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main 6444*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m14, m17, q3131 ; 10 6445*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m17, q2020 ; 2 6446*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m19, m20, q3131 ; 14 6447*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m19, m20, q2020 ; 6 6448*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m25, m24, q3131 ; 26 6449*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m25, m24, q2020 ; 18 6450*c0909341SAndroid Build 
Coastguard Worker vshufi32x4 m21, m29, m28, q3131 ; 30 6451*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m29, m28, q2020 ; 22 6452*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 6453*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m13, [cq+64*16] ; a1 6454*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m13, [cq+64*20] ; c1 6455*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m13, [cq+64*24] ; e1 6456*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m13, [cq+64*28] ; g1 6457*c0909341SAndroid Build Coastguard Worker pmulhrsw m26, m13, [cq+64*17] ; a3 6458*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m13, [cq+64*21] ; c3 6459*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m13, [cq+64*25] ; e3 6460*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m13, [cq+64*29] ; g3 6461*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m14 6462*c0909341SAndroid Build Coastguard Worker mova [cq+64* 9], m15 6463*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m16 6464*c0909341SAndroid Build Coastguard Worker mova [cq+64*11], m17 6465*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m18 6466*c0909341SAndroid Build Coastguard Worker mova [cq+64*13], m19 6467*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m20 6468*c0909341SAndroid Build Coastguard Worker mova [cq+64*15], m21 6469*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m13, [cq+64*18] ; a5 6470*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m13, [cq+64*22] ; c5 6471*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m13, [cq+64*26] ; e5 6472*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m13, [cq+64*30] ; g5 6473*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m13, [cq+64*19] ; a7 6474*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m13, [cq+64*23] ; c7 6475*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m13, [cq+64*27] ; e7 
6476*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m13, [cq+64*31] ; g7 6477*c0909341SAndroid Build Coastguard Worker vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 6478*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 6479*c0909341SAndroid Build Coastguard Worker vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 6480*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 6481*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 6482*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 6483*c0909341SAndroid Build Coastguard Worker vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 6484*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 6485*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m0 6486*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m1 6487*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m2 6488*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m3 6489*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m4 6490*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m5 6491*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m6 6492*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m7 6493*c0909341SAndroid Build Coastguard Worker vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 6494*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 6495*c0909341SAndroid Build Coastguard Worker vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 6496*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 6497*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 6498*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 6499*c0909341SAndroid Build Coastguard Worker vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 
g71 6500*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 6501*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m23, m11, q3131 ; 11 m27 6502*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m11, q2020 ; 3 m23 6503*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m26, m28, q3131 ; 27 m19 6504*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m26, m28, q2020 ; 19 m15 6505*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m25, m17, q3131 ; 15 m29 6506*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m17, q2020 ; 7 m25 6507*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m18, m20, q3131 ; 31 m21 6508*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m18, m20, q2020 ; 23 m17 6509*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m14, m16, q3131 ; 29 m20 6510*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m14, m16, q2020 ; 21 m16 6511*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m22, m24, q3131 ; 25 m18 6512*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m22, m24, q2020 ; 17 m14 6513*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m8, m9, q3131 ; 9 m26 6514*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m8, m9, q2020 ; 1 m22 6515*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m12, m13, q3131 ; 13 m28 6516*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m12, m13, q2020 ; 5 m24 6517*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 6518*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [o(pw_16384)] 6519*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m13, [r4-64*21] 6520*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m13, [r4-64*22] 6521*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m13, [r4-64*23] 6522*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m13, [r4-64*24] 6523*c0909341SAndroid Build Coastguard Worker pmulhrsw 
m4, m13, [r4-64*25] 6524*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m13, [r4-64*26] 6525*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m13, [r4-64*27] 6526*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m13, [r4-64*28] 6527*c0909341SAndroid Build Coastguard Worker mova [cq+64*16], m14 6528*c0909341SAndroid Build Coastguard Worker mova [cq+64*17], m15 6529*c0909341SAndroid Build Coastguard Worker mova [cq+64*18], m16 6530*c0909341SAndroid Build Coastguard Worker mova [cq+64*19], m17 6531*c0909341SAndroid Build Coastguard Worker mova [cq+64*20], m18 6532*c0909341SAndroid Build Coastguard Worker mova [cq+64*21], m19 6533*c0909341SAndroid Build Coastguard Worker mova [cq+64*22], m20 6534*c0909341SAndroid Build Coastguard Worker mova [cq+64*23], m21 6535*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m13, [r4-64*12] 6536*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m13, [r4-64*11] 6537*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m13, [r4-64*10] 6538*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m13, [r4-64* 9] 6539*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m13, [r4-64* 8] 6540*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m13, [r4-64* 7] 6541*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m13, [r4-64* 6] 6542*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m13, [r4-64* 5] 6543*c0909341SAndroid Build Coastguard Worker mova [cq+64*24], m22 6544*c0909341SAndroid Build Coastguard Worker mova [cq+64*25], m23 6545*c0909341SAndroid Build Coastguard Worker mova [cq+64*26], m24 6546*c0909341SAndroid Build Coastguard Worker mova [cq+64*27], m25 6547*c0909341SAndroid Build Coastguard Worker mova [cq+64*28], m26 6548*c0909341SAndroid Build Coastguard Worker mova [cq+64*29], m27 6549*c0909341SAndroid Build Coastguard Worker mova [cq+64*30], m28 6550*c0909341SAndroid Build Coastguard Worker mova [cq+64*31], m29 6551*c0909341SAndroid Build Coastguard Worker call .transpose_2x8x8_lo 
6552*c0909341SAndroid Build Coastguard Worker mova [r4-64*12], m1 6553*c0909341SAndroid Build Coastguard Worker mova [r4-64*11], m3 6554*c0909341SAndroid Build Coastguard Worker mova [r4-64*10], m5 6555*c0909341SAndroid Build Coastguard Worker mova [r4-64* 9], m7 6556*c0909341SAndroid Build Coastguard Worker mova [r4-64* 8], m15 6557*c0909341SAndroid Build Coastguard Worker mova [r4-64* 7], m17 6558*c0909341SAndroid Build Coastguard Worker mova [r4-64* 6], m19 6559*c0909341SAndroid Build Coastguard Worker mova [r4-64* 5], m21 6560*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 6561*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 6562*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 6563*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 6564*c0909341SAndroid Build Coastguard Worker vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 6565*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 6566*c0909341SAndroid Build Coastguard Worker vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 6567*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 6568*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m13, [r4-64*20] 6569*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m13, [r4-64*19] 6570*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m13, [r4-64*18] 6571*c0909341SAndroid Build Coastguard Worker pmulhrsw m3, m13, [r4-64*17] 6572*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m13, [r4-64*16] 6573*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m13, [r4-64*15] 6574*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m13, [r4-64*14] 6575*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m13, [r4-64*13] 6576*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m13, [r4-64*29] 6577*c0909341SAndroid Build 
Coastguard Worker pmulhrsw m15, m13, [r4-64*30] 6578*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m13, [r4-64*31] 6579*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m13, [r4-64*32] 6580*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m13, [r4-64*33] 6581*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m13, [r4-64*34] 6582*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m13, [r4-64*35] 6583*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m13, [r4-64*36] 6584*c0909341SAndroid Build Coastguard Worker call .transpose_2x8x8_lo 6585*c0909341SAndroid Build Coastguard Worker mova [r4-64*20], m1 6586*c0909341SAndroid Build Coastguard Worker mova [r4-64*19], m3 6587*c0909341SAndroid Build Coastguard Worker mova [r4-64*18], m5 6588*c0909341SAndroid Build Coastguard Worker mova [r4-64*17], m7 6589*c0909341SAndroid Build Coastguard Worker mova [r4-64*16], m15 6590*c0909341SAndroid Build Coastguard Worker mova [r4-64*15], m17 6591*c0909341SAndroid Build Coastguard Worker mova [r4-64*14], m19 6592*c0909341SAndroid Build Coastguard Worker mova [r4-64*13], m21 6593*c0909341SAndroid Build Coastguard Worker vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 6594*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 6595*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 6596*c0909341SAndroid Build Coastguard Worker vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 6597*c0909341SAndroid Build Coastguard Worker vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21 6598*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 6599*c0909341SAndroid Build Coastguard Worker vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 6600*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 6601*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m0, m22, q3131 ; 8 6602*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, 
m22, q2020 ; 0 6603*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m1, m26, q3131 ; 12 6604*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m26, q2020 ; 4 6605*c0909341SAndroid Build Coastguard Worker vshufi32x4 m6, m4, m23, q3131 ; 24 6606*c0909341SAndroid Build Coastguard Worker vshufi32x4 m4, m23, q2020 ; 16 6607*c0909341SAndroid Build Coastguard Worker vshufi32x4 m7, m5, m27, q3131 ; 28 6608*c0909341SAndroid Build Coastguard Worker vshufi32x4 m5, m27, q2020 ; 20 6609*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x8_8bpc).main 6610*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m14, m24, q3131 ; 10 6611*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m24, q2020 ; 2 6612*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m15, m28, q3131 ; 14 6613*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m28, q2020 ; 6 6614*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m18, m25, q3131 ; 26 6615*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m25, q2020 ; 18 6616*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m19, m29, q3131 ; 30 6617*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m29, q2020 ; 22 6618*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf 6619*c0909341SAndroid Build Coastguard Worker mova m22, [r4-64*20] 6620*c0909341SAndroid Build Coastguard Worker mova m26, [r4-64*16] 6621*c0909341SAndroid Build Coastguard Worker mova m23, [r4-64*19] 6622*c0909341SAndroid Build Coastguard Worker mova m27, [r4-64*15] 6623*c0909341SAndroid Build Coastguard Worker mova m24, [r4-64*18] 6624*c0909341SAndroid Build Coastguard Worker mova m28, [r4-64*14] 6625*c0909341SAndroid Build Coastguard Worker mova m25, [r4-64*17] 6626*c0909341SAndroid Build Coastguard Worker mova m29, [r4-64*13] 6627*c0909341SAndroid Build Coastguard Worker mova [r4-64*20], m14 6628*c0909341SAndroid Build Coastguard Worker mova [r4-64*19], m15 
6629*c0909341SAndroid Build Coastguard Worker mova [r4-64*18], m16 6630*c0909341SAndroid Build Coastguard Worker mova [r4-64*17], m17 6631*c0909341SAndroid Build Coastguard Worker mova [r4-64*16], m18 6632*c0909341SAndroid Build Coastguard Worker mova [r4-64*15], m19 6633*c0909341SAndroid Build Coastguard Worker mova [r4-64*14], m20 6634*c0909341SAndroid Build Coastguard Worker mova [r4-64*13], m21 6635*c0909341SAndroid Build Coastguard Worker mova m19, [r4-64*12] 6636*c0909341SAndroid Build Coastguard Worker mova m11, [r4-64* 8] 6637*c0909341SAndroid Build Coastguard Worker mova m20, [r4-64*11] 6638*c0909341SAndroid Build Coastguard Worker mova m12, [r4-64* 7] 6639*c0909341SAndroid Build Coastguard Worker mova m21, [r4-64*10] 6640*c0909341SAndroid Build Coastguard Worker mova m8, [r4-64* 6] 6641*c0909341SAndroid Build Coastguard Worker mova m9, [r4-64* 9] 6642*c0909341SAndroid Build Coastguard Worker mova m18, [r4-64* 5] 6643*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 6644*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 6645*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 6646*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 6647*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 6648*c0909341SAndroid Build Coastguard Worker vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 6649*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 6650*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 6651*c0909341SAndroid Build Coastguard Worker vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 6652*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 6653*c0909341SAndroid Build Coastguard Worker vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 6654*c0909341SAndroid Build Coastguard 
Worker vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 6655*c0909341SAndroid Build Coastguard Worker vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 6656*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53 6657*c0909341SAndroid Build Coastguard Worker vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 6658*c0909341SAndroid Build Coastguard Worker vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 6659*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m22, m27, q3131 ; 9 6660*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m27, q2020 ; 1 6661*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m23, m28, q3131 ; 11 6662*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m28, q2020 ; 3 6663*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m24, m29, q3131 ; 13 6664*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, m29, q2020 ; 5 6665*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m25, m8, q3131 ; 15 6666*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m8, q2020 ; 7 6667*c0909341SAndroid Build Coastguard Worker vshufi32x4 m18, m14, m19, q3131 ; 25 6668*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m19, q2020 ; 17 6669*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m15, m20, q3131 ; 27 6670*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m20, q2020 ; 19 6671*c0909341SAndroid Build Coastguard Worker vshufi32x4 m20, m16, m21, q3131 ; 29 6672*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m21, q2020 ; 21 6673*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m17, m9, q3131 ; 31 6674*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m9, q2020 ; 23 6675*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf 6676*c0909341SAndroid Build Coastguard Worker jmp .end 6677*c0909341SAndroid Build Coastguard Worker.fast: ; bottom/right halves are zero 6678*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw 
ym8, ym23, [cq+64* 4] 6679*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm1, xm23, [cq+64*12] 6680*c0909341SAndroid Build Coastguard Worker mova m28, [o(dup16_perm)] 6681*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym7, ym23, [cq+64* 8] 6682*c0909341SAndroid Build Coastguard Worker vpmulhrsw ym22, ym23, [cq+64* 0] 6683*c0909341SAndroid Build Coastguard Worker vpermb m8, m28, m8 6684*c0909341SAndroid Build Coastguard Worker vpermb ym1, ym28, ym1 6685*c0909341SAndroid Build Coastguard Worker vpermb m7, m28, m7 6686*c0909341SAndroid Build Coastguard Worker pmovzxwd m9, ym22 6687*c0909341SAndroid Build Coastguard Worker pslld m9, 16 6688*c0909341SAndroid Build Coastguard Worker call m(idct_16x16_internal_8bpc).main_fast2 6689*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym21, ym23, [cq+64* 2] 6690*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm15, xm23, [cq+64*14] 6691*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm18, xm23, [cq+64*10] 6692*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym14, ym23, [cq+64* 6] 6693*c0909341SAndroid Build Coastguard Worker vpermb m21, m28, m21 6694*c0909341SAndroid Build Coastguard Worker punpcklwd xm15, xm15 6695*c0909341SAndroid Build Coastguard Worker vpermb ym18, ym28, ym18 6696*c0909341SAndroid Build Coastguard Worker vpermb m14, m28, m14 6697*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 6698*c0909341SAndroid Build Coastguard Worker vpmulhrsw ym22, ym23, [cq+64* 1] 6699*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm29, xm23, [cq+64*15] 6700*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm26, xm23, [cq+64* 9] 6701*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym25, ym23, [cq+64* 7] 6702*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym24, ym23, [cq+64* 5] 6703*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw xm27, xm23, [cq+64*11] 6704*c0909341SAndroid Build 
Coastguard Worker {evex}vpmulhrsw xm8, xm23, [cq+64*13] 6705*c0909341SAndroid Build Coastguard Worker {evex}vpmulhrsw ym23, [cq+64* 3] 6706*c0909341SAndroid Build Coastguard Worker vpermb m22, m28, m22 6707*c0909341SAndroid Build Coastguard Worker punpcklwd xm29, xm29 6708*c0909341SAndroid Build Coastguard Worker vpermb ym26, ym28, ym26 6709*c0909341SAndroid Build Coastguard Worker vpermb m25, m28, m25 6710*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m14 6711*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m15 6712*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m16 6713*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m17 6714*c0909341SAndroid Build Coastguard Worker REPX {vpermb x, m28, x}, m24, m27, m23 6715*c0909341SAndroid Build Coastguard Worker punpcklwd xm28, xm8, xm8 6716*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m18 6717*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m19 6718*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m20 6719*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m21 6720*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast 6721*c0909341SAndroid Build Coastguard Worker mov r4, rsp 6722*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [o(pw_16384)] 6723*c0909341SAndroid Build Coastguard Worker mova [r4+64*16], m4 6724*c0909341SAndroid Build Coastguard Worker mova [r4+64*17], m5 6725*c0909341SAndroid Build Coastguard Worker mova [r4+64*18], m6 6726*c0909341SAndroid Build Coastguard Worker mova [r4+64*19], m7 6727*c0909341SAndroid Build Coastguard Worker mova [r4+64*28], m26 6728*c0909341SAndroid Build Coastguard Worker mova [r4+64*29], m27 6729*c0909341SAndroid Build Coastguard Worker mova [r4+64*30], m28 6730*c0909341SAndroid Build Coastguard Worker mova [r4+64*31], m29 6731*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end 6732*c0909341SAndroid Build 
Coastguard Worker mova [r4+64*20], m22 6733*c0909341SAndroid Build Coastguard Worker mova [r4+64*21], m23 6734*c0909341SAndroid Build Coastguard Worker mova [r4+64*22], m24 6735*c0909341SAndroid Build Coastguard Worker mova [r4+64*23], m25 6736*c0909341SAndroid Build Coastguard Worker mova [r4+64*24], m26 6737*c0909341SAndroid Build Coastguard Worker mova [r4+64*25], m27 6738*c0909341SAndroid Build Coastguard Worker mova [r4+64*26], m28 6739*c0909341SAndroid Build Coastguard Worker mova [r4+64*27], m29 6740*c0909341SAndroid Build Coastguard Worker call .pass2_fast 6741*c0909341SAndroid Build Coastguard Worker mova [cq+64* 8], m14 6742*c0909341SAndroid Build Coastguard Worker mova [cq+64* 9], m15 6743*c0909341SAndroid Build Coastguard Worker mova [cq+64*10], m16 6744*c0909341SAndroid Build Coastguard Worker mova [cq+64*11], m17 6745*c0909341SAndroid Build Coastguard Worker mova [cq+64*12], m18 6746*c0909341SAndroid Build Coastguard Worker mova [cq+64*13], m19 6747*c0909341SAndroid Build Coastguard Worker mova [cq+64*14], m20 6748*c0909341SAndroid Build Coastguard Worker mova [cq+64*15], m21 6749*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6750*c0909341SAndroid Build Coastguard Worker mova [cq+64* 0], m0 6751*c0909341SAndroid Build Coastguard Worker mova [cq+64* 1], m1 6752*c0909341SAndroid Build Coastguard Worker mova [cq+64* 2], m2 6753*c0909341SAndroid Build Coastguard Worker mova [cq+64* 3], m3 6754*c0909341SAndroid Build Coastguard Worker mova [cq+64* 4], m4 6755*c0909341SAndroid Build Coastguard Worker mova [cq+64* 5], m5 6756*c0909341SAndroid Build Coastguard Worker mova [cq+64* 6], m6 6757*c0909341SAndroid Build Coastguard Worker mova [cq+64* 7], m7 6758*c0909341SAndroid Build Coastguard Worker pmulhrsw m0, m13, [r4+64*16] 6759*c0909341SAndroid Build Coastguard Worker pmulhrsw m1, m13, [r4+64*17] 6760*c0909341SAndroid Build Coastguard Worker pmulhrsw m2, m13, [r4+64*18] 6761*c0909341SAndroid Build 
Coastguard Worker pmulhrsw m3, m13, [r4+64*19] 6762*c0909341SAndroid Build Coastguard Worker pmulhrsw m4, m13, [r4+64*20] 6763*c0909341SAndroid Build Coastguard Worker pmulhrsw m5, m13, [r4+64*21] 6764*c0909341SAndroid Build Coastguard Worker pmulhrsw m6, m13, [r4+64*22] 6765*c0909341SAndroid Build Coastguard Worker pmulhrsw m7, m13, [r4+64*23] 6766*c0909341SAndroid Build Coastguard Worker mova [cq+64*16], m14 6767*c0909341SAndroid Build Coastguard Worker mova [cq+64*17], m15 6768*c0909341SAndroid Build Coastguard Worker mova [cq+64*18], m16 6769*c0909341SAndroid Build Coastguard Worker mova [cq+64*19], m17 6770*c0909341SAndroid Build Coastguard Worker mova [cq+64*20], m18 6771*c0909341SAndroid Build Coastguard Worker mova [cq+64*21], m19 6772*c0909341SAndroid Build Coastguard Worker mova [cq+64*22], m20 6773*c0909341SAndroid Build Coastguard Worker mova [cq+64*23], m21 6774*c0909341SAndroid Build Coastguard Worker pmulhrsw m14, m13, [r4+64*24] 6775*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m13, [r4+64*25] 6776*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m13, [r4+64*26] 6777*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m13, [r4+64*27] 6778*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m13, [r4+64*28] 6779*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m13, [r4+64*29] 6780*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m13, [r4+64*30] 6781*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m13, [r4+64*31] 6782*c0909341SAndroid Build Coastguard Worker mova [cq+64*24], m22 6783*c0909341SAndroid Build Coastguard Worker mova [cq+64*25], m23 6784*c0909341SAndroid Build Coastguard Worker mova [cq+64*26], m24 6785*c0909341SAndroid Build Coastguard Worker mova [cq+64*27], m25 6786*c0909341SAndroid Build Coastguard Worker mova [cq+64*28], m26 6787*c0909341SAndroid Build Coastguard Worker mova [cq+64*29], m27 6788*c0909341SAndroid Build Coastguard Worker mova [cq+64*30], m28 6789*c0909341SAndroid Build Coastguard 
Worker mova [cq+64*31], m29 6790*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round 6791*c0909341SAndroid Build Coastguard Worker call .pass2_fast 6792*c0909341SAndroid Build Coastguard Worker mova [r4+64*16], m14 6793*c0909341SAndroid Build Coastguard Worker mova [r4+64*17], m15 6794*c0909341SAndroid Build Coastguard Worker mova [r4+64*18], m16 6795*c0909341SAndroid Build Coastguard Worker mova [r4+64*19], m17 6796*c0909341SAndroid Build Coastguard Worker mova [r4+64*20], m18 6797*c0909341SAndroid Build Coastguard Worker mova [r4+64*21], m19 6798*c0909341SAndroid Build Coastguard Worker mova [r4+64*22], m20 6799*c0909341SAndroid Build Coastguard Worker mova [r4+64*23], m21 6800*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 6801*c0909341SAndroid Build Coastguard Worker.end: 6802*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [o(pw_2048)] 6803*c0909341SAndroid Build Coastguard Worker lea r5, [strideq*3] 6804*c0909341SAndroid Build Coastguard Worker pxor m12, m12 6805*c0909341SAndroid Build Coastguard Worker lea r3, [dstq+r5*8] 6806*c0909341SAndroid Build Coastguard Worker lea r6, [strideq+r5] ; stride*4 6807*c0909341SAndroid Build Coastguard Worker add r3, r6 ; dst+stride*28 6808*c0909341SAndroid Build Coastguard Worker%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi 6809*c0909341SAndroid Build Coastguard Worker mova m11, [cq+64*( %3)] ; 0 6810*c0909341SAndroid Build Coastguard Worker mova m9, [cq+64*(31-%3)] ; 31 6811*c0909341SAndroid Build Coastguard Worker%if %3 >= 8 6812*c0909341SAndroid Build Coastguard Worker mova m%1, [rsp+64*(%1+16)] 6813*c0909341SAndroid Build Coastguard Worker%endif 6814*c0909341SAndroid Build Coastguard Worker mova m10, [dstq+%4] 6815*c0909341SAndroid Build Coastguard Worker paddsw m8, m11, m9 6816*c0909341SAndroid Build Coastguard Worker psubsw m11, m9 6817*c0909341SAndroid Build Coastguard Worker paddsw m9, m%1, m%2 
6818*c0909341SAndroid Build Coastguard Worker psubsw m%1, m%2 6819*c0909341SAndroid Build Coastguard Worker punpcklbw m%2, m10, m12 6820*c0909341SAndroid Build Coastguard Worker punpckhbw m10, m12 6821*c0909341SAndroid Build Coastguard Worker pmulhrsw m8, m13 6822*c0909341SAndroid Build Coastguard Worker pmulhrsw m9, m13 6823*c0909341SAndroid Build Coastguard Worker paddw m8, m%2 6824*c0909341SAndroid Build Coastguard Worker paddw m9, m10 6825*c0909341SAndroid Build Coastguard Worker mova m10, [r3+%5] 6826*c0909341SAndroid Build Coastguard Worker pmulhrsw m11, m13 6827*c0909341SAndroid Build Coastguard Worker pmulhrsw m%1, m13 6828*c0909341SAndroid Build Coastguard Worker mova [cq+64*( %3)], m12 6829*c0909341SAndroid Build Coastguard Worker mova [cq+64*(31-%3)], m12 6830*c0909341SAndroid Build Coastguard Worker punpcklbw m%2, m10, m12 6831*c0909341SAndroid Build Coastguard Worker punpckhbw m10, m12 6832*c0909341SAndroid Build Coastguard Worker packuswb m8, m9 6833*c0909341SAndroid Build Coastguard Worker paddw m11, m%2 6834*c0909341SAndroid Build Coastguard Worker paddw m%1, m10 6835*c0909341SAndroid Build Coastguard Worker packuswb m11, m%1 6836*c0909341SAndroid Build Coastguard Worker mova [dstq+%4], m8 6837*c0909341SAndroid Build Coastguard Worker mova [r3 +%5], m11 6838*c0909341SAndroid Build Coastguard Worker%if %3 == 3 || %3 == 7 || %3 == 11 6839*c0909341SAndroid Build Coastguard Worker add dstq, r6 6840*c0909341SAndroid Build Coastguard Worker sub r3, r6 6841*c0909341SAndroid Build Coastguard Worker%endif 6842*c0909341SAndroid Build Coastguard Worker%endmacro 6843*c0909341SAndroid Build Coastguard Worker IDCT_64x32_END 0, 29, 0, strideq*0, r5 6844*c0909341SAndroid Build Coastguard Worker IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 6845*c0909341SAndroid Build Coastguard Worker IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 6846*c0909341SAndroid Build Coastguard Worker IDCT_64x32_END 3, 26, 3, r5 , strideq*0 6847*c0909341SAndroid Build Coastguard Worker 
; ---------------------------------------------------------------------------
; Tail of inv_txfm_add_dct_dct_64x32_8bpc. The function entry and the
; IDCT_64x32_END macro are defined earlier in the file; the lines below finish
; the final run of IDCT_64x32_END invocations, then define local helpers that
; are also entered from other transforms through
; m(inv_txfm_add_dct_dct_64x32_8bpc).<label>.
; ---------------------------------------------------------------------------
    IDCT_64x32_END  4, 25,  4, strideq*0, r5
    IDCT_64x32_END  5, 24,  5, strideq*1, strideq*2
    IDCT_64x32_END  6, 23,  6, strideq*2, strideq*1
    IDCT_64x32_END  7, 22,  7, r5       , strideq*0
    IDCT_64x32_END  0, 21,  8, strideq*0, r5
    IDCT_64x32_END  1, 20,  9, strideq*1, strideq*2
    IDCT_64x32_END  2, 19, 10, strideq*2, strideq*1
    IDCT_64x32_END  3, 18, 11, r5       , strideq*0
    IDCT_64x32_END  4, 17, 12, strideq*0, r5
    IDCT_64x32_END  5, 16, 13, strideq*1, strideq*2
    IDCT_64x32_END  6, 15, 14, strideq*2, strideq*1
    IDCT_64x32_END  7, 14, 15, r5       , strideq*0
    RET

; DC-only path: scale the single DC coefficient and hand over to the 64x16
; transform's shared .dconly2 code.
; NOTE(review): the branch that reaches this label is outside this excerpt;
; eobd is presumably 0 here, which makes the store below clear the coefficient
; and makes the OR load the constant 32 into r3d - confirm at the call site.
ALIGN function_align
.dconly:
    movsx           r6d, word [cq]      ; r6d = sign-extended 16-bit DC coefficient
    mov             [cq], eobd          ; overwrite the coefficient slot with eobd
    or              r3d, 32
    imul            r6d, 181            ; 181/256 ~= 1/sqrt(2) ...
    add             r6d, 128            ; ... with round-to-nearest
    sar             r6d, 8
    imul            r6d, 181            ; second 181/256 scaling; the extra +256 and
    add             r6d, 128+256        ; the 9-bit shift fold a rounded >>1 into the
    sar             r6d, 8+1            ; same step
    jmp             m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2

; First half of the pass-1 butterfly stage. Falls through into
; .transpose_2x8x8_hi and .transpose_8x8 and returns from there.
ALIGN function_align
.pass1_end_part1:
; IDCT_64x32_PASS1_END src16, src32, src64
;   %1 = register number holding (or receiving) the "idct16" term
;   %2 = register number holding the "idct32" term
;   %3 = index that selects the two idct64 scratch slots relative to r4
; When %1 != %3 the idct16 term is first reloaded from [cq+64*%1].
; m%1/m%2 are combined into "idct32 0+n" / "idct32 31-n", which are in turn
; combined with the two idct64 terms read from the r4 scratch area. Two of the
; four results stay in m%1 and m%2; the other two (m8, m9) are written back
; over the scratch slots they were derived from. Clobbers m8, m9 and m11.
%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
%if %1 != %3
    mova            m%1, [cq+64*%1]
%endif
    mova            m9, [r4+64*(%3-36)] ; idct64 32+n
    mova            m11, [r4+64*(-5-%3)] ; idct64 63-n
    psubsw          m8, m%1, m%2        ; idct32 31-n
    paddsw          m%1, m%2            ; idct32  0+n
%if %1 == %3
    psubsw          m%2, m8, m9         ; out 32+n e
    paddsw          m8, m9              ; out 31-n d
    psubsw          m9, m%1, m11        ; out 63-n h
    paddsw          m%1, m11            ; out  0+n a
%else
    paddsw          m%2, m8, m9         ; out 23-n c
    psubsw          m8, m9              ; out 40+n f
    paddsw          m9, m%1, m11        ; out  8+n b
    psubsw          m%1, m11            ; out 55-n g
%endif
    mova            [r4+64*(%3-36)], m8
    mova            [r4+64*(-5-%3)], m9
%endmacro
    ; %1 == %3 for all eight invocations: no reload from cq, a/d/e/h outputs.
    IDCT_64x32_PASS1_END 0, 29,  0
    IDCT_64x32_PASS1_END 1, 28,  1
    IDCT_64x32_PASS1_END 2, 27,  2
    IDCT_64x32_PASS1_END 3, 26,  3
    IDCT_64x32_PASS1_END 4, 25,  4
    IDCT_64x32_PASS1_END 5, 24,  5
    IDCT_64x32_PASS1_END 6, 23,  6
    IDCT_64x32_PASS1_END 7, 22,  7

; Word/dword/qword unpack network over m22-m29, consumed in descending
; register order ("inverted"), using m8 as scratch. The punpck* instructions
; operate independently on every 128-bit lane. The trailing number pairs in
; the comments are kept from the original source. Falls through into
; .transpose_8x8, which handles m0-m7.
.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
    punpcklwd       m8, m25, m24        ; e0 f0 e1 f1 e2 f2 e3 f3
    punpckhwd       m25, m24            ; e4 f4 e5 f5 e6 f6 e7 f7
    punpcklwd       m24, m23, m22       ; g0 h0 g1 h1 g2 h2 g3 h3
    punpckhwd       m23, m22            ; g4 h4 g5 h5 g6 h6 g7 h7
    punpcklwd       m22, m29, m28       ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd       m29, m28            ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd       m28, m27, m26       ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd       m27, m26            ; c4 d4 c5 d5 c6 d6 c7 d7
    punpckldq       m26, m29, m27       ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq       m29, m27            ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq       m27, m8, m24        ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq       m8, m24             ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckhdq       m24, m22, m28       ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq       m22, m28            ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckldq       m28, m25, m23       ; e4 f4 g4 h4 e5 f5 g5 h5
    punpckhdq       m25, m23            ; e6 f6 g6 h6 e7 f7 g7 h7
    punpckhqdq      m23, m22, m27       ; 1 23
    punpcklqdq      m22, m27            ; 0 22
    punpckhqdq      m27, m26, m28       ; 5 27
    punpcklqdq      m26, m28            ; 4 26
    punpcklqdq      m28, m29, m25       ; 6 28
    punpckhqdq      m29, m25            ; 7 29
    punpckhqdq      m25, m24, m8        ; 3 25
    punpcklqdq      m24, m8             ; 2 24

; Same three-stage unpack network applied to m0-m7 in ascending order
; (m8 is scratch); results are left in m0-m7.
.transpose_8x8:
    punpckhwd       m8, m4, m5
    punpcklwd       m4, m5
    punpckhwd       m5, m0, m1
    punpcklwd       m0, m1
    punpckhwd       m1, m6, m7
    punpcklwd       m6, m7
    punpckhwd       m7, m2, m3
    punpcklwd       m2, m3
    punpckhdq       m3, m0, m2
    punpckldq       m0, m2
    punpckldq       m2, m4, m6
    punpckhdq       m4, m6
    punpckhdq       m6, m5, m7
    punpckldq       m5, m7
    punpckldq       m7, m8, m1
    punpckhdq       m8, m1
    punpckhqdq      m1, m0, m2
    punpcklqdq      m0, m2
    punpcklqdq      m2, m3, m4
    punpckhqdq      m3, m4
    punpcklqdq      m4, m5, m7
    punpckhqdq      m5, m7
    punpckhqdq      m7, m6, m8
    punpcklqdq      m6, m8
    ret

; Second half of the pass-1 butterfly stage. Here %1 != %3, so every
; invocation reloads m0-m7 from [cq+64*0] .. [cq+64*7] and produces the
; b/c/f/g outputs. Falls through into .transpose_2x8x8_lo.
.pass1_end_part2:
    IDCT_64x32_PASS1_END 0, 21,  8
    IDCT_64x32_PASS1_END 1, 20,  9
    IDCT_64x32_PASS1_END 2, 19, 10
    IDCT_64x32_PASS1_END 3, 18, 11
    IDCT_64x32_PASS1_END 4, 17, 12
    IDCT_64x32_PASS1_END 5, 16, 13
    IDCT_64x32_PASS1_END 6, 15, 14
    IDCT_64x32_PASS1_END 7, 14, 15

; Unpack network over m0-m7 (consumed in descending order, "inverted")
; followed by the same network over m14-m21 in ascending order; m8 is scratch.
; The routine's closing `ret` is the first instruction after this listing.
.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
    punpcklwd       m8, m3, m2
    punpckhwd       m3, m2
    punpcklwd       m2, m1, m0
    punpckhwd       m1, m0
    punpcklwd       m0, m7, m6
    punpckhwd       m7, m6
    punpcklwd       m6, m5, m4
    punpckhwd       m5, m4
    punpckldq       m4, m7, m5
    punpckhdq       m7, m5
    punpckldq       m5, m8, m2
    punpckhdq       m8, m2
    punpckhdq       m2, m0, m6
    punpckldq       m0, m6
    punpckldq       m6, m3, m1
    punpckhdq       m3, m1
    punpckhqdq      m1, m0, m5
    punpcklqdq      m0, m5
    punpckhqdq      m5, m4, m6
    punpcklqdq      m4, m6
    punpcklqdq      m6, m7, m3
    punpckhqdq      m7, m3
    punpckhqdq      m3, m2, m8
    punpcklqdq      m2, m8
    punpckhwd       m8, m18, m19
    punpcklwd       m18, m19
    punpckhwd       m19, m14, m15
    punpcklwd       m14, m15
    punpckhwd       m15, m20, m21
    punpcklwd       m20, m21
    punpckhwd       m21, m16, m17
    punpcklwd       m16, m17
    punpckhdq       m17, m14, m16
    punpckldq       m14, m16
    punpckldq       m16, m18, m20
    punpckhdq       m18, m20
    punpckhdq       m20, m19, m21
    punpckldq       m19, m21
    punpckldq       m21, m8, m15
    punpckhdq       m8, m15
    punpckhqdq      m15, m14, m16
    punpcklqdq      m14, m16
    punpcklqdq      m16, m17, m18
    punpckhqdq      m17, m18
    punpcklqdq      m18, m19, m21
    punpckhqdq      m19, m21
    punpckhqdq      m21, m20, m8
    punpcklqdq      m20, m8
; Closing return of the 64x32 .transpose_2x8x8_lo helper, whose body ends
; immediately before this point.
    ret

; 64x32 pass-2 "fast" entry: regroups 128-bit lanes of register pairs with
; vshufi32x4 (q2020 keeps lanes 0 and 2 of each source, q3131 lanes 1 and 3)
; and tail-jumps into the 32x16 main_oddhalf_fast routine. The trailing
; numbers in the comments are kept from the original source.
.pass2_fast:
    vshufi32x4      m24, m9, m15, q3131 ;  5
    vshufi32x4      m22, m9, m15, q2020 ;  1
    vshufi32x4      m15, m1, m16, q3131 ;  6
    vshufi32x4      m14, m1, m16, q2020 ;  2
    vshufi32x4      m1, m0, m3, q3131   ;  4
    vshufi32x4      m0, m3, q2020       ;  0
    vshufi32x4      m3, m8, m2, q3131   ; 12
    vshufi32x4      m2, m8, m2, q2020   ;  8
    vshufi32x4      m25, m11, m17, q3131 ;  7
    vshufi32x4      m23, m11, m17, q2020 ;  3
    vshufi32x4      m17, m5, m19, q3131 ; 14
    vshufi32x4      m16, m5, m19, q2020 ; 10
    vshufi32x4      m29, m6, m20, q3131 ; 15
    vshufi32x4      m27, m6, m20, q2020 ; 11
    vshufi32x4      m28, m4, m18, q3131 ; 13
    vshufi32x4      m26, m4, m18, q2020 ;  9
    jmp             m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast

; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x64_8bpc(dst, stride, c, eob)
;
; Inverse DCT/DCT transform of a 64x64 block, added onto the 8-bit pixels at
; dst. Three paths are visible here:
;   eob == 0   -> .dconly (shared with the 64x16 transform)
;   eob <  136 -> .fast   ("bottom/right halves are zero")
;   otherwise  -> full path directly below
; All paths except .dconly finish in .end/.end_loop, which performs the last
; butterfly stage, adds the result to dst and zeroes coefficient rows.
; The .pass2_fast and .pass2_end labels called below are this function's own
; local labels; .pass2_fast is defined beyond the end of this listing.
; Uses 64*96 bytes of stack scratch (reserved by PROLOGUE) plus the
; coefficient buffer itself as temporary storage.
; ---------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
    lea             r5, [o_base]        ; base register for the o() constant references
    test            eobd, eobd
    jz              .dconly             ; no AC coefficients
    PROLOGUE        0, 7, 30, 64*96, dst, stride, c, eob
%undef cmp ; NOTE(review): the reason for dropping the cmp macro is not visible here
    cmp             eobd, 136
    jb              .fast
    ; --- full path, pass 1 -------------------------------------------------
    ; Odd coefficient rows go through 32x64 main_part1 four at a time, then
    ; main_part2. r4 starts at rsp and is the scratch pointer for those calls.
    mova            m0, [cq+64* 1]
    mova            m1, [cq+64*31]
    mova            m2, [cq+64*17]
    mova            m3, [cq+64*15]
    vpbroadcastd    m10, [o(pd_2048)]
    mov             r4, rsp
    call            m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    mova            m0, [cq+64* 7]
    mova            m1, [cq+64*25]
    mova            m2, [cq+64*23]
    mova            m3, [cq+64* 9]
    call            m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    mova            m0, [cq+64* 5]
    mova            m1, [cq+64*27]
    mova            m2, [cq+64*21]
    mova            m3, [cq+64*11]
    call            m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    mova            m0, [cq+64* 3]
    mova            m1, [cq+64*29]
    mova            m2, [cq+64*19]
    mova            m3, [cq+64*13]
    call            m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    call            m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
    ; Rows 0/8/16/24 and 4/12/20/28 feed the 32x16 routine.
    mova            m0, [cq+64* 0]
    mova            m1, [cq+64* 8]
    mova            m2, [cq+64*16]
    mova            m3, [cq+64*24]
    mova            m14, [cq+64* 4]
    mova            m15, [cq+64*12]
    mova            m16, [cq+64*20]
    mova            m17, [cq+64*28]
    call            m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    ; Remaining even rows feed the 32x32 routine; m14-m21 are parked in
    ; coefficient rows 0-7 first (pass1_end_part2 reloads them from there).
    mova            m22, [cq+64* 2]
    mova            m29, [cq+64*30]
    mova            m26, [cq+64*18]
    mova            m25, [cq+64*14]
    mova            m24, [cq+64*10]
    mova            m27, [cq+64*22]
    mova            m28, [cq+64*26]
    mova            m23, [cq+64* 6]
    mova            [cq+64* 0], m14
    mova            [cq+64* 1], m15
    mova            [cq+64* 2], m16
    mova            [cq+64* 3], m17
    mova            [cq+64* 4], m18
    mova            [cq+64* 5], m19
    mova            [cq+64* 6], m20
    mova            [cq+64* 7], m21
    call            m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    vpbroadcastd    m13, [o(pw_8192)]   ; pmulhrsw factor; with 8192 per word it yields (x+2)>>2
    call            m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
    ; Park the odd-numbered registers in scratch and scale the even ones of
    ; m0-m7 into m23/m25/m27/m29.
    mova            [r4+64*36], m1
    mova            [r4+64*37], m3
    mova            [r4+64*38], m5
    mova            [r4+64*39], m7
    mova            [r4+64*44], m23
    mova            [r4+64*45], m25
    mova            [r4+64*46], m27
    mova            [r4+64*47], m29
    pmulhrsw        m23, m13, m0        ; a0
    pmulhrsw        m25, m13, m2        ; a2
    pmulhrsw        m27, m13, m4        ; a4
    pmulhrsw        m29, m13, m6        ; a6
    call            m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
    ; --- full path, pass 2, first call -------------------------------------
    lea             r6, [r4-64*4]       ; r6 = output pointer for .pass2_end
    add             r4, 64*28
    call            .pass2_end
    ; Reload the values left in scratch, in descending slot order as expected
    ; by the "inverted" transpose helper.
    mov             r4, rsp
    mova            m0, [r4+64*23]
    mova            m1, [r4+64*22]
    mova            m2, [r4+64*21]
    mova            m3, [r4+64*20]
    mova            m4, [r4+64*19]
    mova            m5, [r4+64*18]
    mova            m6, [r4+64*17]
    mova            m7, [r4+64*16]
    mova            m22, [r4+64*15]
    mova            m23, [r4+64*14]
    mova            m24, [r4+64*13]
    mova            m25, [r4+64*12]
    mova            m26, [r4+64*11]
    mova            m27, [r4+64*10]
    mova            m28, [r4+64* 9]
    mova            m29, [r4+64* 8]
    call            m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
    vpbroadcastd    m13, [o(pw_8192)]
    mova            [r4+64* 8], m1
    mova            [r4+64* 9], m3
    mova            [r4+64*10], m5
    mova            [r4+64*11], m7
    mova            [r4+64*16], m23
    mova            [r4+64*17], m25
    mova            [r4+64*18], m27
    mova            [r4+64*19], m29
    pmulhrsw        m23, m13, m0        ; b0
    pmulhrsw        m25, m13, m2        ; b2
    pmulhrsw        m27, m13, m4        ; b4
    pmulhrsw        m29, m13, m6        ; b6
    mova            m0, [r4+64*31]
    mova            m1, [r4+64*30]
    mova            m2, [r4+64*29]
    mova            m3, [r4+64*28]
    mova            m4, [r4+64*27]
    mova            m5, [r4+64*26]
    mova            m6, [r4+64*25]
    mova            m7, [r4+64*24]
    mova            m14, [r4+64* 7]
    mova            m15, [r4+64* 6]
    mova            m16, [r4+64* 5]
    mova            m17, [r4+64* 4]
    mova            m18, [r4+64* 3]
    mova            m19, [r4+64* 2]
    mova            m20, [r4+64* 1]
    mova            m21, [r4+64* 0]
    call            m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
    ; --- full path, pass 2, second call: output goes to the coefficient buffer
    mov             r6, cq
    call            .pass2_end
    jmp             .end

; --- fast path -------------------------------------------------------------
; dup16_perm duplicates every 16-bit word, so each vpermb below loads a row
; with its words doubled; the ym forms touch only the low 256 bits. Row 0 is
; instead zero-extended to dwords and shifted into the high word of each dword.
.fast: ; bottom/right halves are zero
    mova            m28, [o(dup16_perm)]
    pmovzxwd        m9, [cq+64* 0]
    vpermb          m8, m28, [cq+64* 4]
    vpermb          ym1, ym28, [cq+64*12]
    vpermb          m7, m28, [cq+64* 8]
    pslld           m9, 16
    call            m(idct_16x16_internal_8bpc).main_fast2
    vpermb          m21, m28, [cq+64* 2]
    vpermb          ym15, ym28, [cq+64*14]
    vpermb          ym18, ym28, [cq+64*10]
    vpermb          m14, m28, [cq+64* 6]
    call            m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    vpermb          m22, m28, [cq+64* 1]
    vpermb          ym29, ym28, [cq+64*15]
    vpermb          ym26, ym28, [cq+64* 9]
    vpermb          m25, m28, [cq+64* 7]
    vpermb          m24, m28, [cq+64* 5]
    vpermb          ym27, ym28, [cq+64*11]
    vpermb          m23, m28, [cq+64* 3]
    vpermb          ym28, ym28, [cq+64*13] ; last use of the permutation: m28 is overwritten
    ; Spill m14-m21 into coefficient rows 0-7 before the next helper call.
    mova            [cq+64* 0], m14
    mova            [cq+64* 1], m15
    mova            [cq+64* 2], m16
    mova            [cq+64* 3], m17
    mova            [cq+64* 4], m18
    mova            [cq+64* 5], m19
    mova            [cq+64* 6], m20
    mova            [cq+64* 7], m21
    call            m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
    vpbroadcastd    m13, [o(pw_8192)]
    mova            [cq+64*16], m4
    mova            [cq+64*17], m5
    mova            [cq+64*18], m6
    mova            [cq+64*19], m7
    mova            [cq+64*28], m26
    mova            [cq+64*29], m27
    mova            [cq+64*30], m28
    mova            [cq+64*31], m29
    call            m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
    mova            [cq+64*20], m22
    mova            [cq+64*21], m23
    mova            [cq+64*22], m24
    mova            [cq+64*23], m25
    mova            [cq+64*24], m26
    mova            [cq+64*25], m27
    mova            [cq+64*26], m28
    mova            [cq+64*27], m29
    ; First pass-2 call works on stack scratch (r4 = rsp+64*64, r3 = rsp+64*32).
    lea             r4, [rsp+64*64]
    lea             r3, [rsp+64*32]
    call            .pass2_fast
    ; Reload coefficient rows 16-31 scaled by m13 for the second half.
    pmulhrsw        m0, m13, [cq+64*16]
    pmulhrsw        m1, m13, [cq+64*17]
    pmulhrsw        m2, m13, [cq+64*18]
    pmulhrsw        m3, m13, [cq+64*19]
    pmulhrsw        m4, m13, [cq+64*20]
    pmulhrsw        m5, m13, [cq+64*21]
    pmulhrsw        m6, m13, [cq+64*22]
    pmulhrsw        m7, m13, [cq+64*23]
    pmulhrsw        m14, m13, [cq+64*24]
    pmulhrsw        m15, m13, [cq+64*25]
    pmulhrsw        m16, m13, [cq+64*26]
    pmulhrsw        m17, m13, [cq+64*27]
    pmulhrsw        m18, m13, [cq+64*28]
    pmulhrsw        m19, m13, [cq+64*29]
    pmulhrsw        m20, m13, [cq+64*30]
    pmulhrsw        m21, m13, [cq+64*31]
    call            m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
    ; Second pass-2 call: r4 = rsp, r3 = coefficient buffer.
    mov             r4, rsp
    mov             r3, cq
    call            .pass2_fast

; --- final butterflies + add to destination -------------------------------
; Each iteration n produces output rows n, 31-n, 32+n and 63-n:
;   dstq advances one row down per iteration     -> rows n and 32+n (dstq+r5*4)
;   r3 starts at dst and moves one row up first  -> rows 63-n (r3+r5*8) and
;                                                   31-n (r3+r5*4)
; with r5 = stride*8. r6 walks up from rsp while r4 walks down from
; rsp+64*31; they cross after 16 iterations. "lo"/"hi" are the two word
; halves of a 64-pixel row, produced by punpck{l,h}bw against zero and
; merged again by packuswb.
.end:
    vpbroadcastd    m17, [o(pw_2048)]   ; pmulhrsw factor; with 2048 per word it yields (x+8)>>4
    lea             r5, [strideq*8]
    mov             r3, dstq
    pxor            m16, m16            ; zero: byte->word unpack partner and coefficient clear
    sub             r4, 64*5            ; rsp+64*31
    mov             r6, rsp
.end_loop:
    mova            m2, [r6+64*32]      ; idct16 0+n lo
    mova            m7, [r6+64*48]      ; idct32 31-n lo
    mova            m6, [cq+64* 0]      ; idct16 0+n hi
    mova            m0, [cq+64*16]      ; idct32 31-n hi
    mova            m4, [r4+64*64]      ; idct64 63-n lo
    mova            m1, [r4+64* 0]      ; idct64 63-n hi
    mova            m5, [r6+64*64]      ; idct64 32+n lo
    mova            m8, [r6+64* 0]      ; idct64 32+n hi
    sub             r3, strideq
    paddsw          m3, m2, m7          ; idct32  0+n lo
    mova            m12, [dstq+r5*0]
    psubsw          m2, m7              ; idct32 31-n lo
    mova            m15, [r3  +r5*8]
    paddsw          m7, m6, m0          ; idct32  0+n hi
    mova            m13, [r3  +r5*4]
    psubsw          m6, m0              ; idct32 31-n hi
    mova            m14, [dstq+r5*4]
    paddsw          m0, m3, m4          ; out  0+n lo
    add             r6, 64
    psubsw          m3, m4              ; out 63-n lo
    sub             r4, 64
    paddsw          m4, m7, m1          ; out  0+n hi
    mova            [cq+64* 0], m16     ; clear the coefficient rows just consumed
    psubsw          m7, m1              ; out 63-n hi
    mova            [cq+64*16], m16
    paddsw          m1, m2, m5          ; out 31-n lo
    add             cq, 64
    psubsw          m2, m5              ; out 32+n lo
    paddsw          m5, m6, m8          ; out 31-n hi
    psubsw          m6, m8              ; out 32+n hi
    ; Final rounding of the residual interleaved with widening the
    ; destination pixels to words.
    pmulhrsw        m0, m17
    punpcklbw       m8, m12, m16
    pmulhrsw        m4, m17
    punpckhbw       m12, m16
    pmulhrsw        m3, m17
    punpcklbw       m11, m15, m16
    pmulhrsw        m7, m17
    punpckhbw       m15, m16
    pmulhrsw        m1, m17
    punpcklbw       m9, m13, m16
    pmulhrsw        m5, m17
    punpckhbw       m13, m16
    pmulhrsw        m2, m17
    punpcklbw       m10, m14, m16
    pmulhrsw        m6, m17
    punpckhbw       m14, m16
    ; residual + pixel, then pack back to bytes with unsigned saturation
    paddw           m0, m8
    paddw           m4, m12
    packuswb        m0, m4
    paddw           m3, m11
    paddw           m7, m15
    packuswb        m3, m7
    paddw           m1, m9
    paddw           m5, m13
    packuswb        m1, m5
    paddw           m2, m10
    paddw           m6, m14
    packuswb        m2, m6
    mova            [dstq+r5*0], m0
    mova            [r3  +r5*8], m3
    mova            [r3  +r5*4], m1
    mova            [dstq+r5*4], m2
    add             dstq, strideq
    cmp             r6, r4
    jb              .end_loop
    RET

; DC-only path (eob == 0, tested at function entry): the store therefore
; writes a zero dword over the DC coefficient and the OR leaves 64 in r3d
; before the shared 64x16 .dconly code takes over.
.dconly:
    movsx           r6d, word [cq]      ; r6d = sign-extended 16-bit DC coefficient
    mov             [cq], eobd
    or              r3d, 64
    jmp             m(inv_txfm_add_dct_dct_64x16_8bpc).dconly

; Pass-2 helper taking r4 (scratch pointer) and r6 (output pointer).
; Only the opening instructions of the routine are listed here; it continues
; directly after the last store below.
ALIGN function_align
.pass2_end:
    REPX            {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
    mova            [r4+64*20], m1
    mova            [r4+64*21], m3
    mova            [r4+64*22], m5
    mova            [r4+64*23], m7
    vinserti32x8    m1, m23, ym14, 1    ; a00 a01 c00 c01
    vshufi32x4      m3, m23, m14, q3232 ; a02 a03 c02 c03
    vinserti32x8    m5, m22, ym0, 1     ; e00 e01 g00 g01
    vshufi32x4      m14, m22, m0, q3232 ; e02 e03 g02 g03
    mova            [r4+64*12], m15
    mova            [r4+64*13], m17
    mova            [r4+64*14], m19
7323*c0909341SAndroid Build Coastguard Worker mova [r4+64*15], m21 7324*c0909341SAndroid Build Coastguard Worker vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 7325*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 7326*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 7327*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 7328*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 7329*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 7330*c0909341SAndroid Build Coastguard Worker vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 7331*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 7332*c0909341SAndroid Build Coastguard Worker vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 7333*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 7334*c0909341SAndroid Build Coastguard Worker vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 7335*c0909341SAndroid Build Coastguard Worker vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 7336*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m1, m5, q2020 ; 0 7337*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m5, q3131 ; 8 7338*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m3, m14, q2020 ; 16 7339*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m14, q3131 ; 24 7340*c0909341SAndroid Build Coastguard Worker vshufi32x4 m14, m15, m18, q2020 ; 4 7341*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m18, q3131 ; 12 7342*c0909341SAndroid Build Coastguard Worker vshufi32x4 m16, m17, m19, q2020 ; 20 7343*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m19, q3131 ; 28 7344*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast 7345*c0909341SAndroid Build Coastguard Worker vshufi32x4 m24, 
m22, m25, q3131 ; 10 7346*c0909341SAndroid Build Coastguard Worker vshufi32x4 m22, m25, q2020 ; 2 7347*c0909341SAndroid Build Coastguard Worker vshufi32x4 m25, m23, m28, q3131 ; 14 7348*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m28, q2020 ; 6 7349*c0909341SAndroid Build Coastguard Worker vshufi32x4 m28, m26, m27, q3131 ; 26 7350*c0909341SAndroid Build Coastguard Worker vshufi32x4 m26, m27, q2020 ; 18 7351*c0909341SAndroid Build Coastguard Worker vshufi32x4 m27, m29, m13, q2020 ; 22 7352*c0909341SAndroid Build Coastguard Worker vshufi32x4 m29, m13, q3131 ; 30 7353*c0909341SAndroid Build Coastguard Worker mova [r6+64* 0], m0 7354*c0909341SAndroid Build Coastguard Worker mova [r6+64* 1], m1 7355*c0909341SAndroid Build Coastguard Worker mova [r6+64* 2], m2 7356*c0909341SAndroid Build Coastguard Worker mova [r6+64* 3], m3 7357*c0909341SAndroid Build Coastguard Worker mova [r6+64* 4], m4 7358*c0909341SAndroid Build Coastguard Worker mova [r6+64* 5], m5 7359*c0909341SAndroid Build Coastguard Worker mova [r6+64* 6], m6 7360*c0909341SAndroid Build Coastguard Worker mova [r6+64* 7], m7 7361*c0909341SAndroid Build Coastguard Worker mova [r6+64* 8], m14 7362*c0909341SAndroid Build Coastguard Worker mova [r6+64* 9], m15 7363*c0909341SAndroid Build Coastguard Worker mova [r6+64*10], m16 7364*c0909341SAndroid Build Coastguard Worker mova [r6+64*11], m17 7365*c0909341SAndroid Build Coastguard Worker mova [r6+64*12], m18 7366*c0909341SAndroid Build Coastguard Worker mova [r6+64*13], m19 7367*c0909341SAndroid Build Coastguard Worker mova [r6+64*14], m20 7368*c0909341SAndroid Build Coastguard Worker mova [r6+64*15], m21 7369*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast 7370*c0909341SAndroid Build Coastguard Worker vpbroadcastd m13, [o(pw_8192)] 7371*c0909341SAndroid Build Coastguard Worker mova [r6+64*16], m29 7372*c0909341SAndroid Build Coastguard Worker mova [r6+64*17], m28 7373*c0909341SAndroid Build Coastguard 
Worker mova [r6+64*18], m27 7374*c0909341SAndroid Build Coastguard Worker mova [r6+64*19], m26 7375*c0909341SAndroid Build Coastguard Worker mova [r6+64*20], m25 7376*c0909341SAndroid Build Coastguard Worker mova [r6+64*21], m24 7377*c0909341SAndroid Build Coastguard Worker mova [r6+64*22], m23 7378*c0909341SAndroid Build Coastguard Worker mova [r6+64*23], m22 7379*c0909341SAndroid Build Coastguard Worker mova [r6+64*24], m21 7380*c0909341SAndroid Build Coastguard Worker mova [r6+64*25], m20 7381*c0909341SAndroid Build Coastguard Worker mova [r6+64*26], m19 7382*c0909341SAndroid Build Coastguard Worker mova [r6+64*27], m18 7383*c0909341SAndroid Build Coastguard Worker mova [r6+64*28], m17 7384*c0909341SAndroid Build Coastguard Worker mova [r6+64*29], m16 7385*c0909341SAndroid Build Coastguard Worker mova [r6+64*30], m15 7386*c0909341SAndroid Build Coastguard Worker mova [r6+64*31], m14 7387*c0909341SAndroid Build Coastguard Worker pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 7388*c0909341SAndroid Build Coastguard Worker pmulhrsw m16, m13, [r4+64*12] 7389*c0909341SAndroid Build Coastguard Worker pmulhrsw m17, m13, [r4+64*16] 7390*c0909341SAndroid Build Coastguard Worker pmulhrsw m18, m13, [r4+64*20] 7391*c0909341SAndroid Build Coastguard Worker pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 7392*c0909341SAndroid Build Coastguard Worker pmulhrsw m20, m13, [r4+64*15] 7393*c0909341SAndroid Build Coastguard Worker pmulhrsw m21, m13, [r4+64*19] 7394*c0909341SAndroid Build Coastguard Worker pmulhrsw m22, m13, [r4+64*23] 7395*c0909341SAndroid Build Coastguard Worker vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 7396*c0909341SAndroid Build Coastguard Worker vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 7397*c0909341SAndroid Build Coastguard Worker vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 7398*c0909341SAndroid Build Coastguard Worker vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 7399*c0909341SAndroid Build Coastguard Worker pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 
7400*c0909341SAndroid Build Coastguard Worker pmulhrsw m24, m13, [r4+64*14] 7401*c0909341SAndroid Build Coastguard Worker pmulhrsw m25, m13, [r4+64*18] 7402*c0909341SAndroid Build Coastguard Worker pmulhrsw m26, m13, [r4+64*22] 7403*c0909341SAndroid Build Coastguard Worker vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 7404*c0909341SAndroid Build Coastguard Worker vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 7405*c0909341SAndroid Build Coastguard Worker vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 7406*c0909341SAndroid Build Coastguard Worker vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 7407*c0909341SAndroid Build Coastguard Worker pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 7408*c0909341SAndroid Build Coastguard Worker pmulhrsw m28, m13, [r4+64*13] 7409*c0909341SAndroid Build Coastguard Worker pmulhrsw m29, m13, [r4+64*17] 7410*c0909341SAndroid Build Coastguard Worker pmulhrsw m13, [r4+64*21] 7411*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m14, m16, q2020 ; 1 7412*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m19, m21, q3131 ; 31 7413*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m15, m17, q2020 ; 17 7414*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m18, m20, q3131 ; 15 7415*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7416*c0909341SAndroid Build Coastguard Worker vshufi32x4 m0, m18, m20, q2020 ; 7 7417*c0909341SAndroid Build Coastguard Worker vshufi32x4 m1, m15, m17, q3131 ; 25 7418*c0909341SAndroid Build Coastguard Worker vshufi32x4 m2, m19, m21, q2020 ; 23 7419*c0909341SAndroid Build Coastguard Worker vshufi32x4 m3, m14, m16, q3131 ; 9 7420*c0909341SAndroid Build Coastguard Worker call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 7421*c0909341SAndroid Build Coastguard Worker vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 7422*c0909341SAndroid Build Coastguard Worker vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 7423*c0909341SAndroid Build Coastguard Worker vinserti32x8 
m24, m25, ym26, 1 ; e5 e13 g5 g13
; ^ operand list of the vinserti32x8 whose mnemonic is the last token of the
;   preceding physical line; it is kept at column 0 so that the instruction
;   is not altered.  The statements down to the `jmp` below are the tail of
;   the routine that precedes .pass2_fast.
    vshufi32x4          m25, m26, q3232         ; e21 e29 g21 g29
    vinserti32x8        m26, m27, ym28, 1       ; a3 a11 c3 c11
    vshufi32x4          m27, m28, q3232         ; a19 a27 c19 c27
    vinserti32x8        m28, m29, ym13, 1       ; e3 e11 g3 g11
    vshufi32x4          m29, m13, q3232         ; e19 e27 g19 g27
    vshufi32x4           m0, m22, m24, q2020    ;  5
    vshufi32x4           m1, m27, m29, q3131    ; 27
    vshufi32x4           m2, m23, m25, q2020    ; 21
    vshufi32x4           m3, m26, m28, q3131    ; 11
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    vshufi32x4           m0, m26, m28, q2020    ;  3
    vshufi32x4           m1, m23, m25, q3131    ; 29
    vshufi32x4           m2, m27, m29, q2020    ; 19
    vshufi32x4           m3, m22, m24, q3131    ; 13
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2

;-----------------------------------------------------------------------
; .pass2_fast
;
; Local subroutine (returns with `ret`).  It regroups sixteen input
; vectors by 128-bit lane, runs them through helper routines that belong
; to the 32x64, 32x16 and 32x32 transforms, and writes 32 vectors of
; 64 bytes each to the buffer addressed by r3.
;
; In:    m0-m6, m8, m9, m11, m15-m20  source vectors (all are read by the
;                                     shuffles below before being reused)
;        r3                           base of the 32*64-byte store area
;                                     (set up by the caller; not visible
;                                     in this part of the file)
; Out:   [r3+64*0 .. r3+64*31]        see the two store groups below
; Clobb: m0, m1, m3, m14-m19, m22-m29 are written directly here.  The
;        called helpers are defined elsewhere and may clobber more
;        registers/memory -- check them before relying on anything else.
;
; NOTE(review): the "fast" suffix and the use of *_fast/*_fast2 helpers
; suggest this is the variant for inputs with fewer non-zero
; coefficients; confirm against the dispatch code that selects it.
;-----------------------------------------------------------------------
ALIGN function_align
.pass2_fast:
    ; Lane regrouping.  With x86inc's qXXXX immediates, q3131 takes
    ; 128-bit lanes 1 and 3 of each source and q2020 takes lanes 0 and 2.
    ; The trailing numbers are the original author's index labels for
    ; each result.
    ; The order of these sixteen shuffles is significant: several
    ; destinations (m0, m3, m15-m19) are themselves sources of this
    ; group and are only overwritten after their last read.
    vshufi32x4          m23, m1, m16, q3131     ;  6
    vshufi32x4          m22, m1, m16, q2020     ;  2
    vshufi32x4          m14, m0, m3, q3131      ;  4
    vshufi32x4          m26, m0, m3, q2020      ;  0
    vshufi32x4          m28, m9, m15, q3131     ;  5
    vshufi32x4           m0, m9, m15, q2020     ;  1
    vshufi32x4          m16, m11, m17, q3131    ;  7
    vshufi32x4          m29, m11, m17, q2020    ;  3
    vshufi32x4          m15, m8, m2, q3131      ; 12
    vshufi32x4          m27, m8, m2, q2020      ;  8
    vshufi32x4          m25, m5, m19, q3131     ; 14
    vshufi32x4          m24, m5, m19, q2020     ; 10
    vshufi32x4           m3, m6, m20, q3131     ; 15
    vshufi32x4          m19, m6, m20, q2020     ; 11
    vshufi32x4          m17, m4, m18, q3131     ; 13
    vshufi32x4          m18, m4, m18, q2020     ;  9

    ; Four runs of the 32x64 part-1 helper.  Only m0 and m3 are reloaded
    ; between calls, so the helper presumably takes its two inputs there
    ; (verify against main_part1_fast).  Pairs by label: (1, 15) straight
    ; from the shuffles above, then (7, 9), (5, 11), (3, 13).
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m16                ;  7
    mova                 m3, m18                ;  9
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m28                ;  5
    mova                 m3, m19                ; 11
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m29                ;  3
    mova                 m3, m17                ; 13
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2

    ; Vectors labelled 0 and 8 go to m0/m1 for the 32x16 helper.
    ; NOTE(review): m14/m15 (labels 4 and 12) are presumably consumed by
    ; the same helper and must have survived the calls above -- confirm.
    mova                 m0, m26                ;  0
    mova                 m1, m27                ;  8
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    ; First store group: slots 0-7 from m0-m7, slots 8-15 from m14-m21.
    mova          [r3+64* 0], m0
    mova          [r3+64* 1], m1
    mova          [r3+64* 2], m2
    mova          [r3+64* 3], m3
    mova          [r3+64* 4], m4
    mova          [r3+64* 5], m5
    mova          [r3+64* 6], m6
    mova          [r3+64* 7], m7
    mova          [r3+64* 8], m14
    mova          [r3+64* 9], m15
    mova          [r3+64*10], m16
    mova          [r3+64*11], m17
    mova          [r3+64*12], m18
    mova          [r3+64*13], m19
    mova          [r3+64*14], m20
    mova          [r3+64*15], m21

    ; NOTE(review): m22-m25 (labels 2, 6, 10, 14) are presumably the
    ; inputs of this helper -- confirm against its definition.
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
    ; Second store group: slots 16-31 receive m29 down to m14, i.e. the
    ; register numbers descend while the slot numbers ascend.
    mova          [r3+64*16], m29
    mova          [r3+64*17], m28
    mova          [r3+64*18], m27
    mova          [r3+64*19], m26
    mova          [r3+64*20], m25
    mova          [r3+64*21], m24
    mova          [r3+64*22], m23
    mova          [r3+64*23], m22
    mova          [r3+64*24], m21
    mova          [r3+64*25], m20
    mova          [r3+64*26], m19
    mova          [r3+64*27], m18
    mova          [r3+64*28], m17
    mova          [r3+64*29], m16
    mova          [r3+64*30], m15
    mova          [r3+64*31], m14
    ret

%endif ; ARCH_X86_64